diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore index b458815f1d1bf4..e8d2269bad9d00 100644 --- a/.get_maintainer.ignore +++ b/.get_maintainer.ignore @@ -1,5 +1,6 @@ Alan Cox Alan Cox +Alyssa Rosenzweig Christoph Hellwig Jeff Kirsher Marc Gonzalez diff --git a/.mailmap b/.mailmap index aa09e792017f77..8db24be50158d1 100644 --- a/.mailmap +++ b/.mailmap @@ -134,6 +134,7 @@ Ben M Cahill Ben Widawsky Ben Widawsky Ben Widawsky +Bence Csókás Benjamin Poirier Benjamin Tissoires Benjamin Tissoires @@ -623,6 +624,7 @@ Paulo Alcantara Paulo Alcantara Pavankumar Kondeti Peter A Jonsson +Peter Hilber Peter Oruba Peter Oruba Pierre-Louis Bossart diff --git a/CREDITS b/CREDITS index a687c3c35c4c23..e47df5e74abee1 100644 --- a/CREDITS +++ b/CREDITS @@ -3912,6 +3912,12 @@ S: C/ Federico Garcia Lorca 1 10-A S: Sevilla 41005 S: Spain +N: Björn Töpel +E: bjorn@kernel.org +D: AF_XDP +S: Gothenburg +S: Sweden + N: Linus Torvalds E: torvalds@linux-foundation.org D: Original kernel hacker diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl new file mode 100644 index 00000000000000..7b7c789a5cf59c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl @@ -0,0 +1,25 @@ +What: /sys/bus/event_source/devices/vpa_dtl/format +Date: February 2025 +Contact: Linux on PowerPC Developer List +Description: Read-only. Attribute group to describe the magic bits + that go into perf_event_attr.config for a particular pmu. + (See ABI/testing/sysfs-bus-event_source-devices-format). + + Each attribute under this group defines a bit range of the + perf_event_attr.config. Supported attributes are listed + below:: + + event = "config:0-7" - event ID + + For example:: + + dtl_cede = "event=0x1" + +What: /sys/bus/event_source/devices/vpa_dtl/events +Date: February 2025 +Contact: Linux on PowerPC Developer List +Description: (RO) Attribute group to describe performance monitoring events + for the Virtual Processor Dispatch Trace Log. Each attribute in + this group describes a single performance monitoring event + supported by vpa_dtl pmu. The name of the file is the name of + the event (See ABI/testing/sysfs-bus-event_source-devices-events). diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 87a058e14e7edd..4b21d5d2325136 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -553,6 +553,43 @@ Description: Integer > 0: representing full cycles Integer = 0: cycle_count info is not available +What: /sys/class/power_supply/<supply_name>/internal_resistance +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + Represents the battery's internal resistance, often referred + to as Equivalent Series Resistance (ESR). It is a dynamic + parameter that reflects the opposition to current flow within + the cell. It is not a fixed value but varies significantly + based on several operational conditions, including battery + state of charge (SoC), temperature, and whether the battery + is in a charging or discharging state. + + Access: Read + + Valid values: Represented in microohms + +What: /sys/class/power_supply/<supply_name>/state_of_health +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + The state_of_health parameter quantifies the overall condition + of a battery as a percentage, reflecting its ability to deliver + rated performance relative to its original specifications.
It is + dynamically computed using a combination of learned capacity + and impedance-based degradation indicators, both of which evolve + over the battery's lifecycle. + Note that the exact algorithms are kept secret by most battery + vendors and the value from different battery vendors cannot be + compared with each other as there is no vendor-agnostic definition + of "performance". Also, this usually cannot be used for any + calculations (i.e. this is not the factor between charge_full and + charge_full_design). + + Access: Read + + Valid values: 0 - 100 (percent) + **USB Properties** What: /sys/class/power_supply/<supply_name>/input_current_limit diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index e4ec5de9a5dd23..9bf7c8a267c587 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -274,15 +274,15 @@ What: /sys/devices/.../power/runtime_active_time Date: Jul 2010 Contact: Arjan van de Ven Description: - Reports the total time that the device has been active. - Used for runtime PM statistics. + Reports the total time that the device has been active, in + milliseconds. Used for runtime PM statistics. What: /sys/devices/.../power/runtime_suspended_time Date: Jul 2010 Contact: Arjan van de Ven Description: - Reports total time that the device has been suspended. - Used for runtime PM statistics. + Reports total time that the device has been suspended, in + milliseconds. Used for runtime PM statistics. What: /sys/devices/.../power/runtime_usage Date: Apr 2010 diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ab8cd337f43aad..8aed6d94c4cd0d 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -586,6 +586,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsa /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/vmscape Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 51c0bc4c2dc534..0e6c67ac585a08 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt + microcode= [X86] Control the behavior of the microcode loader. + Available options, comma separated: + + base_rev=X - with <X> a revision number in hex format: + Set the base microcode revision of each thread when in + debug mode. + + dis_ucode_ldr: disable the microcode loader + + force_minrev: Enable or disable the microcode minimal revision enforcement for the runtime microcode loader. @@ -3829,6 +3842,7 @@ srbds=off [X86,INTEL] ssbd=force-off [ARM64] tsx_async_abort=off [X86] + vmscape=off [X86] Exceptions: This does not have any effect on @@ -6154,7 +6168,7 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec. + mba, smba, bmec, abmc. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba @@ -6405,8 +6419,9 @@ rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging.
- full Mark read-only kernel memory and aliases as read-only - [arm64] + noalias Mark read-only kernel memory as read-only but retain + writable aliases in the direct map for regions outside + of the kernel image. [arm64] rockchip.usb_uart [EARLY] @@ -6428,6 +6443,9 @@ rootflags= [KNL] Set root filesystem mount option string + initramfs_options= [KNL] + Specify mount options for the initramfs mount. + rootfstype= [KNL] Set root filesystem type rootwait [KNL] Wait (indefinitely) for root device to show up. @@ -8041,6 +8059,16 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vmscape= [X86] Controls mitigation for VMscape attacks. + VMscape attacks can leak information from a userspace + hypervisor to a guest via speculative side-channels. + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier + (IBPB) mitigation (default) + force - force vulnerability detection even on + unaffected processors + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst index 67fd6932cef4ff..c4dd534f91edd1 100644 --- a/Documentation/admin-guide/laptops/lg-laptop.rst +++ b/Documentation/admin-guide/laptops/lg-laptop.rst @@ -48,8 +48,8 @@ This value is reset to 100 when the kernel boots. Fan mode -------- -Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables -the fan silent mode. +Writing 0/1/2 to /sys/devices/platform/lg-laptop/fan_mode sets fan mode to +Optimal/Silent/Performance respectively. USB charge diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst index cb376f335f402c..167f9281fbf57e 100644 --- a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst +++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst @@ -16,8 +16,8 @@ provides the following two features: - one 64-bit counter for Time Based Analysis (RX/TX data throughput and time spent in each low-power LTSSM state) and -- one 32-bit counter for Event Counting (error and non-error events for - a specified lane) +- one 32-bit counter per event for Event Counting (error and non-error + events for a specified lane) Note: There is no interrupt for counter overflow. diff --git a/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst new file mode 100644 index 00000000000000..46595b788d3acd --- /dev/null +++ b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst @@ -0,0 +1,110 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +================================================ +Fujitsu Uncore Performance Monitoring Unit (PMU) +================================================ + +This driver supports the Uncore MAC PMUs and the Uncore PCI PMUs found +in Fujitsu chips. +Each MAC PMU on these chips is exposed as an uncore perf PMU with device name +mac_iod<X>_mac<Y>_ch<Z>. +Each PCI PMU on these chips is exposed as an uncore perf PMU with device name +pci_iod<X>_pci<Y>. + +The driver provides a description of its available events and configuration +options in sysfs, see /sys/bus/event_source/devices/mac_iod<X>_mac<Y>_ch<Z>/ +and /sys/bus/event_source/devices/pci_iod<X>_pci<Y>/.
+This driver exports: +- formats, used by perf user space and other tools to configure events +- events, used by perf user space and other tools to create events + symbolically, e.g.: + perf stat -a -e mac_iod0_mac0_ch0/event=0x21/ ls + perf stat -a -e pci_iod0_pci0/event=0x24/ ls +- cpumask, used by perf user space and other tools to know on which CPUs + to open the events + +This driver supports the following events for MAC: +- cycles + This event counts MAC cycles at MAC frequency. +- read-count + This event counts the number of read requests to MAC. +- read-count-request + This event counts the number of read requests including retry to MAC. +- read-count-return + This event counts the number of responses to read requests to MAC. +- read-count-request-pftgt + This event counts the number of read requests including retry with PFTGT + flag. +- read-count-request-normal + This event counts the number of read requests including retry without PFTGT + flag. +- read-count-return-pftgt-hit + This event counts the number of responses to read requests which hit the + PFTGT buffer. +- read-count-return-pftgt-miss + This event counts the number of responses to read requests which miss the + PFTGT buffer. +- read-wait + This event counts outstanding read requests issued by DDR memory controller + per cycle. +- write-count + This event counts the number of write requests to MAC (including zero write, + full write, partial write, write cancel). +- write-count-write + This event counts the number of full write requests to MAC (not including + zero write). +- write-count-pwrite + This event counts the number of partial write requests to MAC. +- memory-read-count + This event counts the number of read requests from MAC to memory. +- memory-write-count + This event counts the number of full write requests from MAC to memory. +- memory-pwrite-count + This event counts the number of partial write requests from MAC to memory. +- ea-mac + This event counts energy consumption of MAC. +- ea-memory + This event counts energy consumption of memory. +- ea-memory-mac-write + This event counts the number of write requests from MAC to memory. +- ea-ha + This event counts energy consumption of HA. + + 'ea' is the abbreviation for 'Energy Analyzer'. + +Examples for use with perf:: + + perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls + +And, this driver supports the following events for PCI: +- pci-port0-cycles + This event counts PCI cycles at PCI frequency in port0. +- pci-port0-read-count + This event counts read transactions for data transfer in port0. +- pci-port0-read-count-bus + This event counts read transactions for bus usage in port0. +- pci-port0-write-count + This event counts write transactions for data transfer in port0. +- pci-port0-write-count-bus + This event counts write transactions for bus usage in port0. +- pci-port1-cycles + This event counts PCI cycles at PCI frequency in port1. +- pci-port1-read-count + This event counts read transactions for data transfer in port1. +- pci-port1-read-count-bus + This event counts read transactions for bus usage in port1. +- pci-port1-write-count + This event counts write transactions for data transfer in port1. +- pci-port1-write-count-bus + This event counts write transactions for bus usage in port1. +- ea-pci + This event counts energy consumption of PCI. + + 'ea' is the abbreviation for 'Energy Analyzer'. 
+ +Examples for use with perf:: + + perf stat -e pci_iod0_pci0/ea-pci/ ls + +Given that these are uncore PMUs, the driver does not support sampling; therefore +"perf record" will not work. Per-task perf sessions are not supported. diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index 48992a0b8e94f7..c4c2cbbf88cb82 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -18,9 +18,10 @@ HiSilicon SoC uncore PMU driver Each device PMU has separate registers for event counting, control and interrupt, and the PMU driver shall register perf PMU drivers like L3C, HHA and DDRC etc. The available events and configuration options shall -be described in the sysfs, see: +be described in the sysfs, see:: + +/sys/bus/event_source/devices/hisi_sccl{X}_<l3c|hha|ddrc>{Y} -/sys/bus/event_source/devices/hisi_sccl{X}_<l3c|hha|ddrc>{Y}. The "perf list" command shall list the available events from sysfs. Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU @@ -112,6 +113,50 @@ uring channel. It is 2 bits. Some important codes are as follows: - 2'b00: default value, count the events which sent to the both uring and uring_ext channel; +6. ch: The NoC PMU supports filtering the event counts of a certain transaction +channel with this option. The currently supported channels are as follows: + +- 3'b010: Request channel +- 3'b100: Snoop channel +- 3'b110: Response channel +- 3'b111: Data channel + +7. tt_en: The NoC PMU supports counting only transactions that have tracetag set +if this option is set. See the 2nd list for more information about tracetag. + +For HiSilicon uncore PMU v3 whose identifier is 0x40, some uncore PMUs are +further divided into parts for finer granularity of tracing; each part has its +own dedicated PMU, and all such PMUs together cover the monitoring job of events +on a particular uncore device. Such PMUs are described in sysfs with name format +slightly changed:: + +/sys/bus/event_source/devices/hisi_sccl{X}_<l3c|hha|ddrc>{Y}_{Z} + +Z is the sub-id, indicating different PMUs for parts of the hardware device. + +Usage of most PMUs with different sub-ids is identical. Specifically, the L3C PMU +provides an ``ext`` option to allow exploration of even finer-grained statistics +of the L3C PMU. The L3C PMU driver uses that as a hint of termination when delivering +the perf command to hardware: + +- ext=0: Default, could be used with event names. +- ext=1 and ext=2: Must be used with event codes, event names are not supported. + +An example perf command could be:: + + $# perf stat -a -e hisi_sccl0_l3c1_0/rd_spipe/ sleep 5 + +or:: + + $# perf stat -a -e hisi_sccl0_l3c1_0/event=0x1,ext=1/ sleep 5 + +As above, ``hisi_sccl0_l3c1_0`` locates the PMU of Super CPU CLuster 0, L3 cache 1, +pipe 0. + +The first command locates the first part of the L3C since ``ext=0`` is implied by +default. The second command issues the counting on another part of the L3C with +event ``0x1``. + Users could configure IDs to count data coming from a specific CCL/ICL by setting srcid_cmd & srcid_msk, and data destined for a specific CCL/ICL by setting tgtid_cmd & tgtid_msk.
A set bit in srcid_msk/tgtid_msk means the PMU will not diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 072b510385c417..47d9a3df6329bc 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -29,3 +29,4 @@ Performance monitor support cxl ampere_cspmu mrvl-pem-pmu + fujitsu_uncore_pmu diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index cacb9f0307dd5e..738d7b4dc33af1 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -274,10 +274,6 @@ are the following: The time it takes to switch the CPUs belonging to this policy from one P-state to another, in nanoseconds. - If unknown or if known to be so high that the scaling driver does not - work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`) - will be returned by reads from this attribute. - ``related_cpus`` List of all (online and offline) CPUs belonging to this policy. diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index a18328a5fb93be..c85cd327af284d 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -34,22 +34,6 @@ When mounting an XFS filesystem, the following options are accepted. to the file. Specifying a fixed ``allocsize`` value turns off the dynamic behaviour. - attr2 or noattr2 - The options enable/disable an "opportunistic" improvement to - be made in the way inline extended attributes are stored - on-disk. When the new form is used for the first time when - ``attr2`` is selected (either when setting or removing extended - attributes) the on-disk superblock feature bit field will be - updated to reflect this format being in use. - - The default behaviour is determined by the on-disk feature - bit indicating that ``attr2`` behaviour is active. If either - mount option is set, then that becomes the new default used - by the filesystem. - - CRC enabled filesystems always use the ``attr2`` format, and so - will reject the ``noattr2`` mount option if it is set. - discard or nodiscard (default) Enable/disable the issuing of commands to let the block device reclaim space freed by the filesystem. This is @@ -75,12 +59,6 @@ When mounting an XFS filesystem, the following options are accepted. across the entire filesystem rather than just on directories configured to use it. - ikeep or noikeep (default) - When ``ikeep`` is specified, XFS does not delete empty inode - clusters and keeps them around on disk. When ``noikeep`` is - specified, empty inode clusters are returned to the free - space pool. - inode32 or inode64 (default) When ``inode32`` is specified, it indicates that XFS limits inode creation to locations which will not result in inode @@ -253,9 +231,8 @@ latest version and try again. The deprecation will take place in two parts. Support for mounting V4 filesystems can now be disabled at kernel build time via Kconfig option. -The option will default to yes until September 2025, at which time it -will be changed to default to no. In September 2030, support will be -removed from the codebase entirely. +These options were changed to default to no in September 2025. In +September 2030, support will be removed from the codebase entirely. Note: Distributors may choose to withdraw V4 format support earlier than the dates listed above. 
@@ -268,8 +245,6 @@ Deprecated Mount Options ============================ ================ Mounting with V4 filesystem September 2030 Mounting ascii-ci filesystem September 2030 -ikeep/noikeep September 2025 -attr2/noattr2 September 2025 ============================ ================ @@ -285,6 +260,8 @@ Removed Mount Options osyncisdsync/osyncisosync v4.0 barrier v4.19 nobarrier v4.19 + ikeep/noikeep v6.18 + attr2/noattr2 v6.18 =========================== ======= sysctls @@ -312,9 +289,6 @@ The following sysctls are available for the XFS filesystem: removes unused preallocation from clean inodes and releases the unused space back to the free pool. - fs.xfs.speculative_cow_prealloc_lifetime - This is an alias for speculative_prealloc_lifetime. - fs.xfs.error_level (Min: 0 Default: 3 Max: 11) A volume knob for error reporting when internal errors occur. This will generate detailed messages & backtraces for filesystem @@ -341,17 +315,6 @@ The following sysctls are available for the XFS filesystem: This option is intended for debugging only. - fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1) - Controls whether symlinks are created with mode 0777 (default) - or whether their mode is affected by the umask (irix mode). - - fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1) - Controls files created in SGID directories. - If the group ID of the new file does not match the effective group - ID or one of the supplementary group IDs of the parent dir, the - ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl - is set. - fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) Setting this to "1" will cause the "sync" flag set by the **xfs_io(8)** chattr command on a directory to be @@ -387,24 +350,20 @@ The following sysctls are available for the XFS filesystem: Deprecated Sysctls ================== -=========================================== ================ - Name Removal Schedule -=========================================== ================ -fs.xfs.irix_sgid_inherit September 2025 -fs.xfs.irix_symlink_mode September 2025 -fs.xfs.speculative_cow_prealloc_lifetime September 2025 -=========================================== ================ - +None currently. Removed Sysctls =============== -============================= ======= - Name Removed -============================= ======= - fs.xfs.xfsbufd_centisec v4.0 - fs.xfs.age_buffer_centisecs v4.0 -============================= ======= +========================================== ======= + Name Removed +========================================== ======= + fs.xfs.xfsbufd_centisec v4.0 + fs.xfs.age_buffer_centisecs v4.0 + fs.xfs.irix_symlink_mode v6.18 + fs.xfs.irix_sgid_inherit v6.18 + fs.xfs.speculative_cow_prealloc_lifetime v6.18 +========================================== ======= Error handling ============== diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index 2f666a7c303cdf..e4f953839f7181 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -466,6 +466,17 @@ Before jumping into the kernel, the following conditions must be met: - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + For CPUs with SPE data source filtering (FEAT_SPE_FDS): + + - If EL3 is present: + + - MDCR_EL3.EnPMS3 (bit 42) must be initialised to 0b1. + + - If the kernel is entered at EL1 and EL2 is present: + + - HDFGRTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. 
+ - HDFGWTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. + For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS): - If the kernel is entered at EL1 and EL2 is present: diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index f58ada4d6cb2fd..a15df49568498f 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -441,6 +441,10 @@ HWCAP3_MTE_FAR HWCAP3_MTE_STORE_ONLY Functionality implied by ID_AA64PFR2_EL1.MTESTOREONLY == 0b0001. +HWCAP3_LSFE + Functionality implied by ID_AA64ISAR3_EL1.LSFE == 0b0001 + + 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index b18ef4064bc046..a7ec57060f64f5 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -200,6 +200,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V3 | #3312417 | ARM64_ERRATUM_3194386 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-V3AE | #3312417 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | MMU-500 | #841119,826419 | ARM_SMMU_MMU_500_CPRE_ERRATA| | | | #562869,1047329 | | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index 4cb38330e7046b..583f2ee9cb9775 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -81,17 +81,7 @@ The ZA matrix is square with each side having as many bytes as a streaming mode SVE vector. -3. Sharing of streaming and non-streaming mode SVE state ---------------------------------------------------------- - -It is implementation defined which if any parts of the SVE state are shared -between streaming and non-streaming modes. When switching between modes -via software interfaces such as ptrace if no register content is provided as -part of switching no state will be assumed to be shared and everything will -be zeroed. - - -4. System call behaviour +3. System call behaviour ------------------------- * On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the @@ -112,7 +102,7 @@ be zeroed. exceptions for execve() described in section 6. -5. Signal handling +4. Signal handling ------------------- * Signal handlers are invoked with PSTATE.SM=0, PSTATE.ZA=0, and TPIDR2_EL0=0. diff --git a/Documentation/arch/powerpc/index.rst b/Documentation/arch/powerpc/index.rst index 53fc9f89f3e420..1be2ee3f0361f7 100644 --- a/Documentation/arch/powerpc/index.rst +++ b/Documentation/arch/powerpc/index.rst @@ -37,6 +37,7 @@ powerpc vas-api vcpudispatch_stats vmemmap_dedup + vpa-dtl features diff --git a/Documentation/arch/powerpc/vpa-dtl.rst b/Documentation/arch/powerpc/vpa-dtl.rst new file mode 100644 index 00000000000000..58d0022f993ab1 --- /dev/null +++ b/Documentation/arch/powerpc/vpa-dtl.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _vpa-dtl: + +=================================== +DTL (Dispatch Trace Log) +=================================== + +Athira Rajeev, 19 April 2025 + +.. 
contents:: + :depth: 3 + + +Basic overview +============== + +The pseries Shared Processor Logical Partition (SPLPAR) machines can +retrieve a log of dispatch and preempt events from the hypervisor +using data from the Dispatch Trace Log (DTL) buffer. With this information, +users can retrieve when and why each dispatch & preempt has occurred. +The vpa-dtl PMU exposes the Virtual Processor Area (VPA) DTL counters +via perf. + +Infrastructure used +=================== + +The VPA DTL PMU counters do not interrupt on overflow or generate any +PMI interrupts. Therefore, an hrtimer is used to poll the DTL data. The timer +interval can be provided by the user via the sample_period field, in nanoseconds. +The vpa-dtl PMU has one hrtimer added per vpa-dtl PMU thread. DTL (Dispatch +Trace Log) contains information about dispatch/preempt, enqueue time, etc. +We directly copy the DTL buffer data as part of the auxiliary buffer and it +will be processed later. This avoids the time taken to create samples +in kernel space. The PMU driver collecting Dispatch Trace Log (DTL) +entries makes use of AUX support in the perf infrastructure. On the tools side, +this data is made available as PERF_RECORD_AUXTRACE records. + +To correlate each DTL entry with other events across CPUs, an auxtrace_queue +is created for each CPU. Each auxtrace queue has an array/list of auxtrace buffers. +All auxtrace queues are maintained in an auxtrace heap. The queues are sorted +based on timestamp. When the different PERF_RECORD_XX records are processed, +the timestamp of the perf record is compared with the timestamp of the top +element in the auxtrace heap so that DTL events can be correlated with other events. +The auxtrace queue is processed if the timestamp of the element from the heap is +lower than the timestamp of the entry in the perf record. Sometimes it could happen that +one buffer is only partially processed: if the timestamp of occurrence of +another event is greater than that of the currently processed element in the queue, +processing moves on to the next perf record, so the position in the buffer is tracked +to continue processing next time. The timestamp of the auxtrace heap is updated with the timestamp +of the last processed entry from the auxtrace buffer. + +This infrastructure ensures dispatch trace log entries can be correlated +and presented along with other events like sched. + +vpa-dtl PMU example usage +========================= + +.. code-block:: sh + + # ls /sys/devices/vpa_dtl/ + events format perf_event_mux_interval_ms power subsystem type uevent + + +To capture the DTL data using perf record: + +.. code-block:: sh + + # ./perf record -a -e sched:\*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1 + +The result can be interpreted using perf report. A snippet of perf report -D: + +.. code-block:: sh + + # ./perf report -D + +There are different PERF_RECORD_XX records. Among them, the records corresponding to +auxtrace buffers include: + +1. PERF_RECORD_AUX + Conveys that new data is available in the AUX area + +2. PERF_RECORD_AUXTRACE_INFO + Describes offset and size of auxtrace data in the buffers + +3. PERF_RECORD_AUXTRACE + This is the record that defines the auxtrace data which, here in the case of + the vpa-dtl PMU, is dispatch trace log data. + +Snippet from perf report -D showing the PERF_RECORD_AUXTRACE dump + +.. code-block:: sh + +0 0 0x39b10 [0x30]: PERF_RECORD_AUXTRACE size: 0x690 offset: 0 ref: 0 idx: 0 tid: -1 cpu: 0 +. +. ... VPA DTL PMU data: size 1680 bytes, entries is 35 +. 00000000: boot_tb: 21349649546353231, tb_freq: 512000000 +.
00000030: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:7064, ready_to_enqueue_time:187, waiting_to_ready_time:6611773 +. 00000060: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:146, ready_to_enqueue_time:0, waiting_to_ready_time:15359437 +. 00000090: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4868, ready_to_enqueue_time:232, waiting_to_ready_time:5100709 +. 000000c0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:179, ready_to_enqueue_time:0, waiting_to_ready_time:30714243 +. 000000f0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:197, ready_to_enqueue_time:0, waiting_to_ready_time:15350648 +. 00000120: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:213, ready_to_enqueue_time:0, waiting_to_ready_time:15353446 +. 00000150: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:212, ready_to_enqueue_time:0, waiting_to_ready_time:15355126 +. 00000180: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:6368, ready_to_enqueue_time:164, waiting_to_ready_time:5104665 + +The above is a representation of a DTL entry, which has the following format: + +struct dtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + u16 processor_id; + u32 enqueue_to_dispatch_time; + u32 ready_to_enqueue_time; + u32 waiting_to_ready_time; + u64 timebase; + u64 fault_addr; + u64 srr0; + u64 srr1; +}; + +The first two fields represent the dispatch reason and the preempt reason. The post- +processing of PERF_RECORD_AUXTRACE records will translate these into meaningful data +for the user to consume. + +Visualize the dispatch trace log entries with perf report +========================================================== + +.. code-block:: sh + + # ./perf record -a -e sched:*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1 + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.300 MB perf.data ] + + # ./perf report + # Samples: 321 of event 'vpa-dtl' + # Event count (approx.): 321 + # + # Children Self Command Shared Object Symbol + # ........ ........ ....... ................. .............................. + # + 100.00% 100.00% swapper [kernel.kallsyms] [k] plpar_hcall_norets_notrace + +Visualize the dispatch trace log entries with perf script +========================================================== + +..
code-block:: sh + + # ./perf script + migration/9 67 [009] 105373.359903: sched:sched_waking: comm=perf pid=13418 prio=120 target_cpu=009 + migration/9 67 [009] 105373.359904: sched:sched_migrate_task: comm=perf pid=13418 prio=120 orig_cpu=9 dest_cpu=10 + migration/9 67 [009] 105373.359907: sched:sched_stat_runtime: comm=migration/9 pid=67 runtime=4050 [ns] + migration/9 67 [009] 105373.359908: sched:sched_switch: prev_comm=migration/9 prev_pid=67 prev_prio=0 prev_state=S ==> next_comm=swapper/9 next_pid=0 next_prio=120 + :256 256 [016] 105373.359913: vpa-dtl: timebase: 21403600706628832 dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4854, ready_to_enqueue_time:139, waiting_to_ready_time:511842115 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms]) + :256 256 [017] 105373.360012: vpa-dtl: timebase: 21403600706679454 dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:236, ready_to_enqueue_time:0, waiting_to_ready_time:133864583 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms]) + perf 13418 [010] 105373.360048: sched:sched_stat_runtime: comm=perf pid=13418 runtime=139748 [ns] + perf 13418 [010] 105373.360052: sched:sched_waking: comm=migration/10 pid=72 prio=0 target_cpu=010 diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst index 2aa9be272d5de1..2f449c9b15bdd6 100644 --- a/Documentation/arch/riscv/hwprobe.rst +++ b/Documentation/arch/riscv/hwprobe.rst @@ -327,6 +327,15 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED`: Misaligned vector accesses are not supported at all and will generate a misaligned address fault. +* :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0`: A bitmask containing the + mips vendor extensions that are compatible with the + :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. + + * MIPS + + * :c:macro:`RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL`: The xmipsexectl vendor + extension is supported in the MIPS ISA extensions spec. + * :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0`: A bitmask containing the thead vendor extensions that are compatible with the :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior. diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index c12837e61bda53..86bec8ac2c4de7 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -141,6 +141,197 @@ Thread-related topology information in the kernel: +System topology enumeration +=========================== + +The topology on x86 systems can be discovered using a combination of vendor- +specific CPUID leaves which enumerate the processor topology and the cache +hierarchy. + +The CPUID leaves, in their preferred order of parsing for each x86 vendor, are as +follows: + +1) AMD + + 1) CPUID leaf 0x80000026 [Extended CPU Topology] (Core::X86::Cpuid::ExCpuTopology) + + The extended CPUID leaf 0x80000026 is the extension of the CPUID leaf 0xB + and provides the topology information of Core, Complex, CCD (Die), and + Socket in each level. + + Support for the leaf is discovered by checking if the maximum extended + CPUID level is >= 0x80000026 and then checking if `LogProcAtThisLevel` + in `EBX[15:0]` at a particular level (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + the level describes - Core, Complex, CCD (Die), or the Socket.
+ + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from `ExtendedLocalApicId` + in `EDX[31:0]` in order to get a unique Topology ID for the topology + level. CPUs with the same Topology ID share the resources at that level. + + CPUID leaf 0x80000026 also provides more information regarding the power + and efficiency rankings, and about the core type on AMD processors with + heterogeneous characteristics. + + If CPUID leaf 0x80000026 is supported, further parsing is not required. + + 2) CPUID leaf 0x0000000B [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnum) + + The extended CPUID leaf 0x0000000B is the predecessor of the extended + CPUID leaf 0x80000026 and only describes the core and the socket domains + of the processor topology. + + The support for the leaf is discovered by checking if the maximum supported + CPUID level is >= 0xB and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + that the level describes - Thread, or Processor (Socket). + + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from the `ExtendedLocalApicId` + in `EDX[31:0]` to get a unique Topology ID for that topology level. CPUs + sharing the Topology ID share the resources at that level. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x80000008 ECX [Size Identifiers] (Core::X86::Cpuid::SizeId) + + If neither the CPUID leaf 0x80000026 nor 0xB is supported, the number of + CPUs on the package is detected using the Size Identifier leaf + 0x80000008 ECX. + + The support for the leaf is discovered by checking if the supported + extended CPUID level is >= 0x80000008. + + The shift from the APIC ID for the Socket ID is calculated from the + `ApicIdSize` field in `ECX[15:12]` if it is non-zero. + + If `ApicIdSize` is reported to be zero, the shift is calculated as the + order of the `number of threads` calculated from the `NC` field in + `ECX[7:0]`, which describes the `number of threads - 1` on the package. + + Unless Extended APIC ID is supported, the APIC ID used to find the + Socket ID is from the `LocalApicId` field of CPUID leaf 0x00000001 + `EBX[31:24]`. + + The topology parsing continues to detect if Extended APIC ID is + supported or not. + + + 4) CPUID leaf 0x8000001E [Extended APIC ID, Core Identifiers, Node Identifiers] + (Core::X86::Cpuid::{ExtApicId,CoreId,NodeId}) + + The support for Extended APIC ID can be detected by checking for the + presence of `TopologyExtensions` in `ECX[22]` of CPUID leaf 0x80000001 + [Feature Identifiers] (Core::X86::Cpuid::FeatureExtIdEcx). + + If Topology Extensions is supported, the APIC ID from `ExtendedApicId` + from CPUID leaf 0x8000001E `EAX[31:0]` should be preferred over that from + the `LocalApicId` field of CPUID leaf 0x00000001 `EBX[31:24]` for topology + enumeration. + + On processors of Family 0x17 and above that do not support CPUID leaf + 0x80000026 or CPUID leaf 0xB, the shift from the APIC ID for the Core + ID is calculated using the order of `number of threads per core` + calculated using the `ThreadsPerCore` field in `EBX[15:8]`, which + describes `number of threads per core - 1`. + + On processors of Family 0x15, the Core ID from `EBX[7:0]` is used as the + `cu_id` (Compute Unit ID) to detect CPUs that share the compute units.
+ + + All AMD processors that support the `TopologyExtensions` feature store the + `NodeId` from the `ECX[7:0]` of CPUID leaf 0x8000001E + (Core::X86::Cpuid::NodeId) as the per-CPU `node_id`. On older processors, + the `node_id` was discovered using the MSR_FAM10H_NODE_ID MSR (MSR + 0xc001_100c). The presence of the NODE_ID MSR was detected by checking + `ECX[19]` of CPUID leaf 0x80000001 [Feature Identifiers] + (Core::X86::Cpuid::FeatureExtIdEcx). + + +2) Intel + + On Intel platforms, the CPUID leaves that enumerate the processor + topology are as follows: + + 1) CPUID leaf 0x1F (V2 Extended Topology Enumeration Leaf) + + The CPUID leaf 0x1F is the extension of the CPUID leaf 0xB and provides + the topology information of Core, Module, Tile, Die, DieGrp, and Socket + in each level. + + The support for the leaf is discovered by checking if the supported + CPUID level is >= 0x1F and then checking if `EBX[31:0]` at a particular + level (starting from 0) is non-zero. + + The `Domain Type` in `ECX[15:8]` of the sub-leaf provides the topology + domain that the level describes - Core, Module, Tile, Die, DieGrp, and + Socket. + + The kernel uses the value from `EAX[4:0]` to discover the number of + bits that need to be right shifted from the `x2APIC ID` in `EDX[31:0]` + to get a unique Topology ID for the topology level. CPUs with the same + Topology ID share the resources at that level. + + If CPUID leaf 0x1F is supported, further parsing is not required. + + + 2) CPUID leaf 0x0000000B (Extended Topology Enumeration Leaf) + + The extended CPUID leaf 0x0000000B is the predecessor of the V2 Extended + Topology Enumeration Leaf 0x1F and only describes the core and the + socket domains of the processor topology. + + The support for the leaf is discovered by checking if the supported CPUID + level is >= 0xB and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + CPUID leaf 0x0000000B shares the same layout as CPUID leaf 0x1F and + should be enumerated in a similar manner. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x00000004 (Deterministic Cache Parameters Leaf) + + On Intel processors that support neither CPUID leaf 0x1F nor CPUID leaf + 0xB, the shifts for the SMT domains are calculated using the number of + CPUs sharing the L1 cache. + + Processors that feature Hyper-Threading are detected using `EDX[28]` of + CPUID leaf 0x1 (Basic CPUID Information). + + The order of `Maximum number of addressable IDs for logical processors + sharing this cache` from `EAX[25:14]` of level-0 of CPUID 0x4 provides + the shifts from the APIC ID required to compute the Core ID. + + The APIC ID and Package information is computed using the data from + CPUID leaf 0x1. + + + 4) CPUID leaf 0x00000001 (Basic CPUID Information) + + The mask and shifts to derive the Physical Package (socket) ID are + computed using the `Maximum number of addressable IDs for logical + processors in this physical package` from `EBX[23:16]` of CPUID leaf + 0x1. + + The APIC ID on the legacy platforms is derived from the `Initial APIC + ID` field from `EBX[31:24]` of CPUID leaf 0x1. + + +3) Centaur and Zhaoxin + + Similar to Intel, Centaur and Zhaoxin use a combination of CPUID leaf + 0x00000004 (Deterministic Cache Parameters Leaf) and CPUID leaf 0x00000001 + (Basic CPUID Information) to derive the topology information.
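To make the enumeration above concrete, the following is a minimal user-space sketch of the leaf 0xB walk described for AMD and Intel (leaf 0x1F and leaf 0x80000026 follow the same pattern of level type, shift width, and APIC ID). This is an illustrative sketch, not kernel code: it assumes a GCC or clang toolchain providing ``__get_cpuid_count()`` from <cpuid.h> and a CPU that implements leaf 0xB::

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx, level = 0;

        /* Walk the sub-leaves until LogProcAtThisLevel (EBX[15:0]) reads zero. */
        while (__get_cpuid_count(0x0b, level, &eax, &ebx, &ecx, &edx) &&
               (ebx & 0xffff)) {
            unsigned int shift = eax & 0x1f;        /* EAX[4:0]: bits to shift out */
            unsigned int type = (ecx >> 8) & 0xff;  /* ECX[15:8]: 1 = SMT, 2 = Core */
            unsigned int x2apic_id = edx;           /* EDX[31:0]: this CPU's x2APIC ID */

            /* CPUs with the same topology ID share resources at this level. */
            printf("level %u: type %u, topology id %#x\n",
                   level, type, x2apic_id >> shift);
            level++;
        }
        return 0;
    }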
+ + + System topology examples ======================== diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index ae468b781d3118..e38941370b90c9 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -335,9 +335,26 @@ consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely also be KF_RET_NULL. +2.4.8 KF_RCU_PROTECTED flag +--------------------------- + +The KF_RCU_PROTECTED flag is used to indicate that the kfunc must be invoked in +an RCU critical section. This is assumed by default in non-sleepable programs, +and must be explicitly ensured by calling ``bpf_rcu_read_lock`` for sleepable +ones. + +If the kfunc returns a pointer value, this flag also enforces that the returned +pointer is RCU protected, and can only be used while the RCU critical section is +active. + +The flag is distinct from the ``KF_RCU`` flag, which only ensures that its +arguments are at least RCU protected pointers. This may transitively imply that +RCU protection is ensured, but it does not work in cases of kfuncs which require +RCU protection but do not take RCU protected arguments. + .. _KF_deprecated_flag: -2.4.8 KF_DEPRECATED flag +2.4.9 KF_DEPRECATED flag ------------------------ The KF_DEPRECATED flag is used for kfuncs which are scheduled to be diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst index 95e6f80a407e52..510d15bc697b86 100644 --- a/Documentation/bpf/verifier.rst +++ b/Documentation/bpf/verifier.rst @@ -347,270 +347,6 @@ However, only the value of register ``r1`` is important to successfully finish verification. The goal of the liveness tracking algorithm is to spot this fact and figure out that both states are actually equivalent. -Data structures -~~~~~~~~~~~~~~~ - -Liveness is tracked using the following data structures:: - - enum bpf_reg_liveness { - REG_LIVE_NONE = 0, - REG_LIVE_READ32 = 0x1, - REG_LIVE_READ64 = 0x2, - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, - REG_LIVE_DONE = 0x8, - }; - - struct bpf_reg_state { - ... - struct bpf_reg_state *parent; - ... - enum bpf_reg_liveness live; - ... - }; - - struct bpf_stack_state { - struct bpf_reg_state spilled_ptr; - ... - }; - - struct bpf_func_state { - struct bpf_reg_state regs[MAX_BPF_REG]; - ... - struct bpf_stack_state *stack; - } - - struct bpf_verifier_state { - struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; - ... - } - -* ``REG_LIVE_NONE`` is an initial value assigned to ``->live`` fields upon new - verifier state creation; - -* ``REG_LIVE_WRITTEN`` means that the value of the register (or stack slot) is - defined by some instruction verified between this verifier state's parent and - verifier state itself; - -* ``REG_LIVE_READ{32,64}`` means that the value of the register (or stack slot) - is read by a some child state of this verifier state; - -* ``REG_LIVE_DONE`` is a marker used by ``clean_verifier_state()`` to avoid - processing same verifier state multiple times and for some sanity checks; - -* ``->live`` field values are formed by combining ``enum bpf_reg_liveness`` - values using bitwise or. - -Register parentage chains -~~~~~~~~~~~~~~~~~~~~~~~~~ - -In order to propagate information between parent and child states, a *register -parentage chain* is established. Each register or stack slot is linked to a -corresponding register or stack slot in its parent state via a ``->parent`` -pointer. 
This link is established upon state creation in ``is_state_visited()`` -and might be modified by ``set_callee_state()`` called from -``__check_func_call()``. - -The rules for correspondence between registers / stack slots are as follows: - -* For the current stack frame, registers and stack slots of the new state are - linked to the registers and stack slots of the parent state with the same - indices. - -* For the outer stack frames, only callee saved registers (r6-r9) and stack - slots are linked to the registers and stack slots of the parent state with the - same indices. - -* When function call is processed a new ``struct bpf_func_state`` instance is - allocated, it encapsulates a new set of registers and stack slots. For this - new frame, parent links for r6-r9 and stack slots are set to nil, parent links - for r1-r5 are set to match caller r1-r5 parent links. - -This could be illustrated by the following diagram (arrows stand for -``->parent`` pointers):: - - ... ; Frame #0, some instructions - --- checkpoint #0 --- - 1 : r6 = 42 ; Frame #0 - --- checkpoint #1 --- - 2 : call foo() ; Frame #0 - ... ; Frame #1, instructions from foo() - --- checkpoint #2 --- - ... ; Frame #1, instructions from foo() - --- checkpoint #3 --- - exit ; Frame #1, return from foo() - 3 : r1 = r6 ; Frame #0 <- current state - - +-------------------------------+-------------------------------+ - | Frame #0 | Frame #1 | - Checkpoint +-------------------------------+-------------------------------+ - #0 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ ^ - | | | | - Checkpoint +-------------------------------+ - #1 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ - |_______|_______|_______________ - | | | - nil nil | | | nil nil - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #2 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ ^ ^ ^ - nil nil | | | | | - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #3 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ - nil nil | | - | | | | - Current +-------------------------------+ - state | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - \ - r6 read mark is propagated via these links - all the way up to checkpoint #1. - The checkpoint #1 contains a write mark for r6 - because of instruction (1), thus read propagation - does not reach checkpoint #0 (see section below). - -Liveness marks tracking -~~~~~~~~~~~~~~~~~~~~~~~ - -For each processed instruction, the verifier tracks read and written registers -and stack slots. The main idea of the algorithm is that read marks propagate -back along the state parentage chain until they hit a write mark, which 'screens -off' earlier states from the read. The information about reads is propagated by -function ``mark_reg_read()`` which could be summarized as follows:: - - mark_reg_read(struct bpf_reg_state *state, ...): - parent = state->parent - while parent: - if state->live & REG_LIVE_WRITTEN: - break - if parent->live & REG_LIVE_READ64: - break - parent->live |= REG_LIVE_READ64 - state = parent - parent = state->parent - -Notes: - -* The read marks are applied to the **parent** state while write marks are - applied to the **current** state. 
The write mark on a register or stack slot - means that it is updated by some instruction in the straight-line code leading - from the parent state to the current state. - -* Details about REG_LIVE_READ32 are omitted. - -* Function ``propagate_liveness()`` (see section :ref:`read_marks_for_cache_hits`) - might override the first parent link. Please refer to the comments in the - ``propagate_liveness()`` and ``mark_reg_read()`` source code for further - details. - -Because stack writes could have different sizes ``REG_LIVE_WRITTEN`` marks are -applied conservatively: stack slots are marked as written only if write size -corresponds to the size of the register, e.g. see function ``save_register_state()``. - -Consider the following example:: - - 0: (*u64)(r10 - 8) = 0 ; define 8 bytes of fp-8 - --- checkpoint #0 --- - 1: (*u32)(r10 - 8) = 1 ; redefine lower 4 bytes - 2: r1 = (*u32)(r10 - 8) ; read lower 4 bytes defined at (1) - 3: r2 = (*u32)(r10 - 4) ; read upper 4 bytes defined at (0) - -As stated above, the write at (1) does not count as ``REG_LIVE_WRITTEN``. Should -it be otherwise, the algorithm above wouldn't be able to propagate the read mark -from (3) to checkpoint #0. - -Once the ``BPF_EXIT`` instruction is reached ``update_branch_counts()`` is -called to update the ``->branches`` counter for each verifier state in a chain -of parent verifier states. When the ``->branches`` counter reaches zero the -verifier state becomes a valid entry in a set of cached verifier states. - -Each entry of the verifier states cache is post-processed by a function -``clean_live_states()``. This function marks all registers and stack slots -without ``REG_LIVE_READ{32,64}`` marks as ``NOT_INIT`` or ``STACK_INVALID``. -Registers/stack slots marked in this way are ignored in function ``stacksafe()`` -called from ``states_equal()`` when a state cache entry is considered for -equivalence with a current state. - -Now it is possible to explain how the example from the beginning of the section -works:: - - 0: call bpf_get_prandom_u32() - 1: r1 = 0 - 2: if r0 == 0 goto +1 - 3: r0 = 1 - --- checkpoint[0] --- - 4: r0 = r1 - 5: exit - -* At instruction #2 branching point is reached and state ``{ r0 == 0, r1 == 0, pc == 4 }`` - is pushed to states processing queue (pc stands for program counter). - -* At instruction #4: - - * ``checkpoint[0]`` states cache entry is created: ``{ r0 == 1, r1 == 0, pc == 4 }``; - * ``checkpoint[0].r0`` is marked as written; - * ``checkpoint[0].r1`` is marked as read; - -* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed - by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a - read mark and all other registers and stack slots are marked as ``NOT_INIT`` - or ``STACK_INVALID`` - -* The state ``{ r0 == 0, r1 == 0, pc == 4 }`` is popped from the states queue - and is compared against a cached state ``{ r1 == 0, pc == 4 }``, the states - are considered equivalent. - -.. _read_marks_for_cache_hits: - -Read marks propagation for cache hits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Another point is the handling of read marks when a previously verified state is -found in the states cache. Upon cache hit verifier must behave in the same way -as if the current state was verified to the program exit. This means that all -read marks, present on registers and stack slots of the cached state, must be -propagated over the parentage chain of the current state. Example below shows -why this is important. 
Function ``propagate_liveness()`` handles this case. - -Consider the following state parentage chain (S is a starting state, A-E are -derived states, -> arrows show which state is derived from which):: - - r1 read - <------------- A[r1] == 0 - C[r1] == 0 - S ---> A ---> B ---> exit E[r1] == 1 - | - ` ---> C ---> D - | - ` ---> E ^ - |___ suppose all these - ^ states are at insn #Y - | - suppose all these - states are at insn #X - -* Chain of states ``S -> A -> B -> exit`` is verified first. - -* While ``B -> exit`` is verified, register ``r1`` is read and this read mark is - propagated up to state ``A``. - -* When chain of states ``C -> D`` is verified the state ``D`` turns out to be - equivalent to state ``B``. - -* The read mark for ``r1`` has to be propagated to state ``C``, otherwise state - ``C`` might get mistakenly marked as equivalent to state ``E`` even though - values for register ``r1`` differ between ``C`` and ``E``. - Understanding eBPF verifier messages ==================================== diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index d84ededb66f92a..c5635ac3de5474 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -109,8 +109,7 @@ Then, the driver must fill in the following values: +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | the time it takes on this CPU to | | | switch between two frequencies in | -| | nanoseconds (if appropriate, else | -| | specify CPUFREQ_ETERNAL) | +| | nanoseconds | +-----------------------------------+--------------------------------------+ |policy->cur | The current operating frequency of | | | this CPU (if appropriate) | diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml index 5bd517befb6805..4accf4cbc6c710 100644 --- a/Documentation/devicetree/bindings/arm/cpus.yaml +++ b/Documentation/devicetree/bindings/arm/cpus.yaml @@ -353,6 +353,12 @@ properties: $ref: /schemas/types.yaml#/definitions/phandle description: Link to Mediatek Cache Coherent Interconnect + edac-enabled: + $ref: /schemas/types.yaml#/definitions/flag + description: + A72 CPUs support Error Detection And Correction (EDAC) on their L1 and + L2 caches. This flag marks this function as usable. 
+ qcom,saw: $ref: /schemas/types.yaml#/definitions/phandle description: @@ -399,6 +405,17 @@ properties: allOf: - $ref: /schemas/cpu.yaml# - $ref: /schemas/opp/opp-v1.yaml# + - if: + not: + properties: + compatible: + contains: + const: arm,cortex-a72 + then: + # Allow edac-enabled only for Cortex A72 + properties: + edac-enabled: false + - if: # If the enable-method property contains one of those values properties: diff --git a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml index 4e78933232b6b9..6f3a8578fe2a68 100644 --- a/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml +++ b/Documentation/devicetree/bindings/clock/marvell,pxa1908.yaml @@ -19,11 +19,14 @@ description: | properties: compatible: - enum: - - marvell,pxa1908-apbc - - marvell,pxa1908-apbcp - - marvell,pxa1908-mpmu - - marvell,pxa1908-apmu + oneOf: + - enum: + - marvell,pxa1908-apbc + - marvell,pxa1908-apbcp + - marvell,pxa1908-mpmu + - items: + - const: marvell,pxa1908-apmu + - const: syscon reg: maxItems: 1 @@ -31,6 +34,9 @@ properties: '#clock-cells': const: 1 + '#power-domain-cells': + const: 1 + required: - compatible - reg @@ -38,11 +44,23 @@ required: additionalProperties: false +if: + not: + properties: + compatible: + contains: + const: marvell,pxa1908-apmu + +then: + properties: + '#power-domain-cells': false + examples: # APMU block: - | clock-controller@d4282800 { - compatible = "marvell,pxa1908-apmu"; + compatible = "marvell,pxa1908-apmu", "syscon"; reg = <0xd4282800 0x400>; #clock-cells = <1>; + #power-domain-cells = <1>; }; diff --git a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml index f2f87f0f545bc5..6493a6968bb4b9 100644 --- a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml @@ -92,8 +92,12 @@ required: anyOf: - required: - qcom,powered-remotely + - num-channels + - qcom,num-ees - required: - qcom,controlled-remotely + - num-channels + - qcom,num-ees - required: - clocks - clock-names diff --git a/Documentation/devicetree/bindings/eeprom/at24.yaml b/Documentation/devicetree/bindings/eeprom/at24.yaml index 0ac68646c07779..50af7ccf6e21ab 100644 --- a/Documentation/devicetree/bindings/eeprom/at24.yaml +++ b/Documentation/devicetree/bindings/eeprom/at24.yaml @@ -143,6 +143,7 @@ properties: - const: atmel,24c128 - items: - enum: + - giantec,gt24c256c - puya,p24c256c - const: atmel,24c256 - items: diff --git a/Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml similarity index 94% rename from Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml index 7cb0134134ffa6..01ee61768527c8 100644 --- a/Documentation/devicetree/bindings/platform/acer,aspire1-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/acer,aspire1-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/acer,aspire1-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/acer,aspire1-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Acer Aspire 1 Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml 
similarity index 99% rename from Documentation/devicetree/bindings/mfd/google,cros-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml index 50f45709006690..3ab5737c9a8f3f 100644 --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/google,cros-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/google,cros-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: ChromeOS Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml b/Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml similarity index 98% rename from Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml rename to Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml index dc379f3ebf24ff..82d4b2dadbae4e 100644 --- a/Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/gateworks-gsc.yaml# +$id: http://devicetree.org/schemas/embedded-controller/gw,gsc.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Gateworks System Controller diff --git a/Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml b/Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml similarity index 97% rename from Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml index 4a03b0ee314900..cd9e65b6c2ea35 100644 --- a/Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/huawei,gaokun-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/huawei,gaokun3-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Huawei Matebook E Go Embedded Controller diff --git a/Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml similarity index 94% rename from Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml rename to Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml index 37207a97e06c69..a77e67f6cb82f9 100644 --- a/Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/mfd/kontron,sl28cpld.yaml# +$id: http://devicetree.org/schemas/embedded-controller/kontron,sl28cpld.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Kontron's sl28cpld board management controller @@ -16,7 +16,12 @@ description: | properties: compatible: - const: kontron,sl28cpld + oneOf: + - items: + - enum: + - kontron,sa67mcu + - const: kontron,sl28cpld + - const: kontron,sl28cpld reg: description: diff --git a/Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml 
b/Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml similarity index 95% rename from Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml rename to Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml index 3180ce1a22d445..a029b38e8dc0b1 100644 --- a/Documentation/devicetree/bindings/platform/lenovo,yoga-c630-ec.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/lenovo,yoga-c630-ec.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/lenovo,yoga-c630-ec.yaml# +$id: http://devicetree.org/schemas/embedded-controller/lenovo,yoga-c630-ec.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Lenovo Yoga C630 Embedded Controller. diff --git a/Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml b/Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml similarity index 92% rename from Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml rename to Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml index b33d26f15b2afa..9202cfca0b3518 100644 --- a/Documentation/devicetree/bindings/platform/microsoft,surface-sam.yaml +++ b/Documentation/devicetree/bindings/embedded-controller/microsoft,surface-sam.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/platform/microsoft,surface-sam.yaml# +$id: http://devicetree.org/schemas/embedded-controller/microsoft,surface-sam.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Surface System Aggregator Module (SAM, SSAM) diff --git a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml index 87e986386f32a4..b4d55bf6a28548 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-mmio.yaml @@ -22,6 +22,7 @@ properties: - brcm,bcm6345-gpio - ni,169445-nand-gpio - wd,mbl-gpio # Western Digital MyBook Live memory-mapped GPIO controller + - intel,ixp4xx-expansion-bus-mmio-gpio big-endian: true @@ -89,6 +90,20 @@ properties: description: If this property is present, the controller cannot drive the GPIO lines. 
+if: + properties: + compatible: + contains: + const: intel,ixp4xx-expansion-bus-mmio-gpio +then: + $ref: /schemas/memory-controllers/intel,ixp4xx-expansion-peripheral-props.yaml# + +patternProperties: + "^.+-hog(-[0-9]+)?$": + type: object + required: + - gpio-hog + required: - compatible - reg @@ -96,7 +111,7 @@ required: - '#gpio-cells' - gpio-controller -additionalProperties: false +unevaluatedProperties: false examples: - | @@ -126,3 +141,22 @@ examples: gpio-controller; #gpio-cells = <2>; }; + + bus@c4000000 { + compatible = "intel,ixp42x-expansion-bus-controller", "syscon"; + reg = <0xc4000000 0x30>; + native-endian; + #address-cells = <2>; + #size-cells = <1>; + ranges = <0 0x0 0x50000000 0x01000000>; + dma-ranges = <0 0x0 0x50000000 0x01000000>; + gpio@1,0 { + compatible = "intel,ixp4xx-expansion-bus-mmio-gpio"; + gpio-controller; + #gpio-cells = <2>; + big-endian; + reg = <1 0x00000000 0x2>; + reg-names = "dat"; + intel,ixp4xx-eb-write-enable = <1>; + }; + }; diff --git a/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml b/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml index b58e08c8ecd8a1..aaf97124803f42 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml +++ b/Documentation/devicetree/bindings/gpio/gpio-mxs.yaml @@ -18,9 +18,13 @@ description: | properties: compatible: - enum: - - fsl,imx23-pinctrl - - fsl,imx28-pinctrl + items: + - enum: + - fsl,imx23-pinctrl + - fsl,imx28-pinctrl + # These are over 10 years old devices; the driver uses simple-bus to probe + # child gpio devices. Keep it as is to stay compatible with existing dts files. + - const: simple-bus '#address-cells': const: 1 @@ -31,7 +35,65 @@ properties: maxItems: 1 patternProperties: - "gpio@[0-9]+$": + "^(?!gpio@)[^@]+@[0-9]+$": + type: object + properties: + fsl,pinmux-ids: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + An integer array. Each integer in the array specifies a pin + with a given mux function, with bank, pin and mux packed as below. + + [15..12] : bank number + [11..4] : pin number + [3..0] : mux selection + + This integer with the mux selection packed in is used as an entity by both + group and config nodes to identify a pin. The mux selection in the integer + takes effect only in a group node, and is ignored by the driver in a config + node, since a config node is only meant to set up pin configurations. + + Valid values for these integers are listed below. + + reg: + items: + - description: | + pin group index. NOTE: using the reg property here is arguably wrong, + but these bindings are over 10 years old. Just keep it as is. + + fsl,drive-strength: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + description: | + 0: MXS_DRIVE_4mA + 1: MXS_DRIVE_8mA + 2: MXS_DRIVE_12mA + 3: MXS_DRIVE_16mA + + fsl,voltage: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + description: | + 0: MXS_VOLTAGE_LOW - 1.8 V + 1: MXS_VOLTAGE_HIGH - 3.3 V + + fsl,pull-up: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + description: | + 0: MXS_PULL_DISABLE - Disable the internal pull-up + 1: MXS_PULL_ENABLE - Enable the internal pull-up + + Note that when enabling the pull-up, the internal pad keeper gets disabled. + Also, some pins don't have a pull-up; in that case, setting fsl,pull-up + will only disable the internal pad keeper.
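A worked example of the fsl,pinmux-ids packing described above may help; this is a sketch with a hypothetical group node name, the values simply follow the bank/pin/mux bit layout from the binding::

	/* Each entry packs bank (bits 15..12), pin (bits 11..4) and
	 * mux (bits 3..0):
	 *   bank 1, pin 24, mux 1: (1 << 12) | (24 << 4) | 1 = 0x1181
	 *   bank 1, pin 25, mux 1: (1 << 12) | (25 << 4) | 1 = 0x1191
	 */
	demo-grp@5 {
		reg = <5>;
		fsl,pinmux-ids = <0x1181 0x1191>;
		fsl,drive-strength = <0>;	/* MXS_DRIVE_4mA */
		fsl,voltage = <1>;		/* MXS_VOLTAGE_HIGH, 3.3 V */
		fsl,pull-up = <0>;		/* MXS_PULL_DISABLE */
	};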
+ + required: + - fsl,pinmux-ids + + additionalProperties: false + + "^gpio@[0-9]+$": type: object properties: compatible: @@ -80,7 +142,7 @@ examples: pinctrl@80018000 { #address-cells = <1>; #size-cells = <0>; - compatible = "fsl,imx28-pinctrl"; + compatible = "fsl,imx28-pinctrl", "simple-bus"; reg = <0x80018000 0x2000>; gpio@0 { @@ -132,4 +194,12 @@ examples: interrupt-controller; #interrupt-cells = <2>; }; + + lcdif-apx4@5 { + reg = <5>; + fsl,pinmux-ids = <0x1181 0x1191>; + fsl,drive-strength = <0>; + fsl,voltage = <0>; + fsl,pull-up = <0>; + }; }; diff --git a/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml b/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml index b032471831e7c7..02663d67eac751 100644 --- a/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. There are three flavors of the GPIO controller, one full featured input/output with interrupt support (kontron,sl28cpld-gpio), one diff --git a/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml index b68159600e2bd8..69852444df239e 100644 --- a/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml @@ -14,6 +14,7 @@ properties: oneOf: - enum: - loongson,ls2k-gpio + - loongson,ls2k0300-gpio - loongson,ls2k0500-gpio0 - loongson,ls2k0500-gpio1 - loongson,ls2k2000-gpio0 @@ -36,7 +37,7 @@ properties: ngpios: minimum: 1 - maximum: 64 + maximum: 128 "#gpio-cells": const: 2 @@ -49,6 +50,14 @@ properties: minItems: 1 maxItems: 64 + "#interrupt-cells": + const: 2 + + interrupt-controller: true + + resets: + maxItems: 1 + required: - compatible - reg @@ -58,6 +67,23 @@ required: - gpio-ranges - interrupts +allOf: + - if: + properties: + compatible: + contains: + const: loongson,ls2k0300-gpio + then: + required: + - "#interrupt-cells" + - interrupt-controller + - resets + else: + properties: + "#interrupt-cells": false + interrupt-controller: false + resets: false + additionalProperties: false examples: diff --git a/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml b/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml index 82a190a715f940..4d200f9dffd5fe 100644 --- a/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml +++ b/Documentation/devicetree/bindings/gpio/maxim,max31910.yaml @@ -95,9 +95,9 @@ examples: #gpio-cells = <2>; maxim,modesel-gpios = <&gpio2 23>; - maxim,fault-gpios = <&gpio2 24 GPIO_ACTIVE_LOW>; - maxim,db0-gpios = <&gpio2 25>; - maxim,db1-gpios = <&gpio2 26>; + maxim,fault-gpios = <&gpio2 24 GPIO_ACTIVE_LOW>; + maxim,db0-gpios = <&gpio2 25>; + maxim,db1-gpios = <&gpio2 26>; spi-max-frequency = <25000000>; }; diff --git a/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml b/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml new file mode 100644 index 00000000000000..c5c3fc4c816f7f --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/maxim,max7360-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim MAX7360 GPIO
controller + +maintainers: + - Kamel Bouhara + - Mathieu Dubois-Briand + +description: | + Maxim MAX7360 GPIO controller, in the MAX7360 chipset + https://www.analog.com/en/products/max7360.html + + The device provides two series of GPIOs, referred to here as GPIOs and GPOs. + + PORT0 to PORT7 pins can be used as GPIOs, with support for interrupts and + constant-current mode. These pins will also be used by the rotary encoder and + PWM functionalities. + + COL2 to COL7 pins can be used as GPOs; there is no input capability. COL pins + will be partitioned, with the first pins assigned to the keypad + functionality and the last ones used as GPOs. + +properties: + compatible: + enum: + - maxim,max7360-gpio + - maxim,max7360-gpo + + gpio-controller: true + + "#gpio-cells": + const: 2 + + interrupt-controller: true + + "#interrupt-cells": + const: 2 + + maxim,constant-current-disable: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Bit field, each bit disables constant-current output of the associated + GPIO, starting from the least significant bit for the first GPIO. + maximum: 0xff + +required: + - compatible + - gpio-controller + +allOf: + - if: + properties: + compatible: + contains: + enum: + - maxim,max7360-gpio + ngpios: false + then: + required: + - interrupt-controller + else: + properties: + interrupt-controller: false + maxim,constant-current-disable: false + +additionalProperties: false + +examples: + - | + gpio { + compatible = "maxim,max7360-gpio"; + + gpio-controller; + #gpio-cells = <2>; + maxim,constant-current-disable = <0x06>; + + interrupt-controller; + #interrupt-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml index 065f5761a93f61..2bd620a1099b9a 100644 --- a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml @@ -85,6 +85,7 @@ properties: - nvidia,tegra194-gpio-aon - nvidia,tegra234-gpio - nvidia,tegra234-gpio-aon + - nvidia,tegra256-gpio reg-names: items: @@ -155,6 +156,7 @@ allOf: - nvidia,tegra186-gpio - nvidia,tegra194-gpio - nvidia,tegra234-gpio + - nvidia,tegra256-gpio then: properties: interrupts: diff --git a/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml b/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml index ec0232e72c7122..83e0b2d14c9f8c 100644 --- a/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/spacemit,k1-gpio.yaml @@ -80,7 +80,7 @@ examples: gpio@d4019000 { compatible = "spacemit,k1-gpio"; reg = <0xd4019000 0x800>; - clocks =<&ccu 9>, <&ccu 61>; + clocks = <&ccu 9>, <&ccu 61>; clock-names = "core", "bus"; gpio-controller; #gpio-cells = <3>; diff --git a/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml b/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml index 0299d4a25086af..c994177de940af 100644 --- a/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/trivial-gpio.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/trivial-gpio.yaml# +$id: http://devicetree.org/schemas/gpio/trivial-gpio.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Trivial 2-cell GPIO controllers diff --git a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml index
ddb72857c84641..d6a7517f2a50c4 100644 --- a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml +++ b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml @@ -18,6 +18,13 @@ description: | Datasheets: https://www.analog.com/en/products/adm1294.html + The SQ24905C is also a hot-swap controller, compatible with the ADM1278; + its PMBUS_MFR_MODEL is MC09C + + Datasheets: + https://www.silergy.com/ + download/downloadFile?id=5669&type=product&ftype=note + properties: compatible: enum: @@ -30,6 +37,7 @@ - adi,adm1281 - adi,adm1293 - adi,adm1294 + - silergy,mc09c reg: maxItems: 1 @@ -96,6 +104,7 @@ allOf: - adi,adm1281 - adi,adm1293 - adi,adm1294 + - silergy,mc09c then: properties: adi,volt-curr-sample-average: diff --git a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml index 010333cb25c0e1..966b221b6caa6b 100644 --- a/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml +++ b/Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml @@ -11,11 +11,12 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. properties: compatible: enum: + - kontron,sa67mcu-hwmon - kontron,sl28cpld-fan reg: diff --git a/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml b/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml new file mode 100644 index 00000000000000..9419b481ff35b1 --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/lantiq,cputemp.yaml @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/hwmon/lantiq,cputemp.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Lantiq cpu temperature sensor + +maintainers: + - Florian Eckert + +properties: + compatible: + const: lantiq,cputemp + + reg: + maxItems: 1 + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + cputemp@103040 { + compatible = "lantiq,cputemp"; + reg = <0x103040 0x4>; + }; diff --git a/Documentation/devicetree/bindings/hwmon/lm75.yaml b/Documentation/devicetree/bindings/hwmon/lm75.yaml index c38255243f5729..0b9fda81e3ec50 100644 --- a/Documentation/devicetree/bindings/hwmon/lm75.yaml +++ b/Documentation/devicetree/bindings/hwmon/lm75.yaml @@ -28,6 +28,7 @@ properties: - maxim,max31725 - maxim,max31726 - maxim,mcp980x + - nxp,p3t1750 - nxp,p3t1755 - nxp,pct2075 - st,stds75 @@ -69,6 +70,7 @@ allOf: - ti,tmp100 - ti,tmp101 - ti,tmp112 + - ti,tmp75 then: properties: interrupts: false diff --git a/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt b/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt deleted file mode 100644 index 473b34c876dd32..00000000000000 --- a/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt +++ /dev/null @@ -1,10 +0,0 @@ -Lantiq cpu temperature sensor - -Requires node properties: -- compatible value : - "lantiq,cputemp" - -Example: - cputemp@0 { - compatible = "lantiq,cputemp"; - }; diff --git a/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml b/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml index 3dc7f15484d287..ae23a05375cb83 100644 --- a/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml +++ b/Documentation/devicetree/bindings/hwmon/pmbus/isil,isl68137.yaml @@ -54,6 +54,8 @@ properties: - renesas,raa228004 - renesas,raa228006 - 
renesas,raa228228 + - renesas,raa228244 + - renesas,raa228246 - renesas,raa229001 - renesas,raa229004 - renesas,raa229621 diff --git a/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml index 8b4ed5ee962fb5..a84cc3a4cfdcaf 100644 --- a/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml +++ b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml @@ -31,6 +31,15 @@ properties: it must be self resetting edge interrupts. maxItems: 1 + fan-shutdown-percent: + description: + Fan RPM in percent set during shutdown. This is used to keep the fan + running at a fixed RPM after the kernel has shut down, which is useful + on hardware that keeps heating itself even after shutdown, for example + due to some sort of management core. + minimum: 0 + maximum: 100 + fan-stop-to-start-percent: description: Minimum fan RPM in percent to start when stopped. diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml index fa68b99ef2e292..d3cde89366866b 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml @@ -32,6 +32,8 @@ properties: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina700 + - ti,ina780 reg: maxItems: 1 @@ -114,10 +116,42 @@ allOf: - ti,ina237 - ti,ina238 - ti,ina260 + - ti,ina700 + - ti,ina780 then: properties: ti,maximum-expected-current-microamp: false + - if: + properties: + compatible: + contains: + enum: + - silergy,sy24655 + - ti,ina209 + - ti,ina219 + - ti,ina220 + - ti,ina226 + - ti,ina230 + - ti,ina231 + - ti,ina260 + - ti,ina700 + - ti,ina780 + then: + properties: + ti,shunt-gain: false + + - if: + properties: + compatible: + contains: + enum: + - ti,ina700 + - ti,ina780 + then: + properties: + shunt-resistor: false + unevaluatedProperties: false examples: diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml index 4c89448eba0dc0..96b2e4969f78a1 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml @@ -20,6 +20,10 @@ properties: reg: maxItems: 1 + label: + description: + A descriptive name for this channel, like "ambient" or "psu". + "#thermal-sensor-cells": const: 1 @@ -45,6 +49,7 @@ examples: reg = <0x48>; interrupt-parent = <&gpio7>; interrupts = <16 IRQ_TYPE_LEVEL_LOW>; + label = "somelabel"; vcc-supply = <&supply>; #thermal-sensor-cells = <1>; }; diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml index fed3e1b8c43f67..500a965bdb7a84 100644 --- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml @@ -20,17 +20,22 @@ allOf: properties: compatible: - items: - - enum: - - apple,s5l8960x-i2c - - apple,t7000-i2c - - apple,s8000-i2c - - apple,t8010-i2c - - apple,t8015-i2c - - apple,t8103-i2c - - apple,t8112-i2c - - apple,t6000-i2c - - const: apple,i2c + oneOf: + - items: + - const: apple,t6020-i2c + - const: apple,t8103-i2c + - items: + - enum: + # Do not add additional SoCs to this list.
+ - apple,s5l8960x-i2c + - apple,t7000-i2c + - apple,s8000-i2c + - apple,t8010-i2c + - apple,t8015-i2c + - apple,t8103-i2c + - apple,t8112-i2c + - apple,t6000-i2c + - const: apple,i2c reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml index 7ae8c7b1d0067e..32269239bae467 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml @@ -33,11 +33,16 @@ properties: - samsung,exynos7870-hsi2c - tesla,fsd-hsi2c - const: samsung,exynos7-hsi2c + - items: + - enum: + - samsung,exynos8890-hsi2c + - const: samsung,exynos8895-hsi2c - items: - enum: - google,gs101-hsi2c - samsung,exynos2200-hsi2c - samsung,exynos850-hsi2c + - samsung,exynos990-hsi2c - const: samsung,exynosautov9-hsi2c - const: samsung,exynos5-hsi2c # Exynos5250 and Exynos5420 deprecated: true diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml index 23fe8ff76645e4..3562ce0c0f7e48 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml +++ b/Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml @@ -50,6 +50,12 @@ properties: - enum: - mediatek,mt6795-i2c - const: mediatek,mt8173-i2c + - items: + - enum: + - mediatek,mt6878-i2c + - mediatek,mt6991-i2c + - mediatek,mt8196-i2c + - const: mediatek,mt8188-i2c - items: - enum: - mediatek,mt6893-i2c diff --git a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml index 6b6f6762d122f9..32c3b69ccf3420 100644 --- a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml @@ -80,6 +80,11 @@ properties: support for 64 KiB transactions whereas earlier chips supported no more than 4 KiB per transactions. const: nvidia,tegra194-i2c + - description: | + Tegra256 has 8 generic I2C controllers. The controllers are similar to + the previous generations, but have a different parent clock and hence + the timing parameters are configured differently. 
+ const: nvidia,tegra256-i2c reg: maxItems: 1 @@ -186,6 +191,7 @@ allOf: contains: enum: - nvidia,tegra194-i2c + - nvidia,tegra256-i2c then: required: - resets diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index 73144473b9b24e..7456783d1f8ef5 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -25,6 +25,8 @@ properties: - items: - enum: + - qcom,qcm2290-cci + - qcom,sa8775p-cci - qcom,sc7280-cci - qcom,sc8280xp-cci - qcom,sdm670-cci @@ -44,11 +46,11 @@ properties: const: 0 clocks: - minItems: 3 + minItems: 2 maxItems: 6 clock-names: - minItems: 3 + minItems: 2 maxItems: 6 interrupts: @@ -113,6 +115,7 @@ allOf: then: properties: clocks: + minItems: 3 maxItems: 3 clock-names: items: @@ -120,6 +123,22 @@ allOf: - const: cci_ahb - const: cci + - if: + properties: + compatible: + contains: + enum: + - qcom,qcm2290-cci + then: + properties: + clocks: + minItems: 2 + maxItems: 2 + clock-names: + items: + - const: ahb + - const: cci + - if: properties: compatible: @@ -223,6 +242,7 @@ allOf: compatible: contains: enum: + - qcom,sa8775p-cci - qcom,sm8550-cci - qcom,sm8650-cci - qcom,x1e80100-cci diff --git a/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml b/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml index 6ba7d793504c8c..a2ddc680361769 100644 --- a/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml @@ -13,7 +13,6 @@ properties: compatible: oneOf: - enum: - - samsung,s3c2410-i2c - samsung,s3c2440-i2c # For s3c2440-like I2C used inside HDMIPHY block found on several SoCs: - samsung,s3c2440-hdmiphy-i2c @@ -93,7 +92,6 @@ allOf: compatible: contains: enum: - - samsung,s3c2410-i2c - samsung,s3c2440-i2c - samsung,s3c2440-hdmiphy-i2c then: diff --git a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml index 3d6aefb0d0f185..b7220fff22350f 100644 --- a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml @@ -9,6 +9,9 @@ title: I2C controller embedded in SpacemiT's K1 SoC maintainers: - Troy Mitchell +allOf: + - $ref: /schemas/i2c/i2c-controller.yaml# + properties: compatible: const: spacemit,k1-i2c @@ -53,7 +56,7 @@ examples: reg = <0xd4010800 0x38>; interrupt-parent = <&plic>; interrupts = <36>; - clocks =<&ccu 32>, <&ccu 84>; + clocks = <&ccu 32>, <&ccu 84>; clock-names = "func", "bus"; clock-frequency = <100000>; }; diff --git a/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml b/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml new file mode 100644 index 00000000000000..2498672d265488 --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/adi,i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices I3C Controller + +description: + FPGA-based I3C controller designed to interface with I3C and I2C peripherals, + implementing a subset of the I3C-basic specification. The IP core is tested + on arm, microblaze, and arm64 architectures. 
+ + https://analogdevicesinc.github.io/hdl/library/i3c_controller + +maintainers: + - Jorge Marques + +properties: + compatible: + const: adi,i3c-master-v1 + + reg: + maxItems: 1 + + clocks: + minItems: 1 + items: + - description: The AXI interconnect clock, drives the register map. + - description: + The secondary clock, drives the internal logic asynchronously to the + register map. The presence of this entry states that the IP Core was + synthesized with a second clock input, and the absence of this entry + indicates a topology where a single clock input drives all the + internal logic. + + clock-names: + minItems: 1 + items: + - const: axi + - const: i3c + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + +allOf: + - $ref: i3c.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + + i3c@44a00000 { + compatible = "adi,i3c-master-v1"; + reg = <0x44a00000 0x1000>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clkc 15>, <&clkc 15>; + clock-names = "axi", "i3c"; + #address-cells = <3>; + #size-cells = <0>; + + /* I3C and I2C devices */ + }; diff --git a/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml index fe2e9633c46f8b..a20d875086d463 100644 --- a/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml +++ b/Documentation/devicetree/bindings/i3c/renesas,i3c.yaml @@ -4,7 +4,7 @@ $id: http://devicetree.org/schemas/i3c/renesas,i3c.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Renesas RZ/G3S and RZ/G3E I3C Bus Interface +title: Renesas I3C Bus Interface maintainers: - Wolfram Sang @@ -12,10 +12,16 @@ maintainers: properties: compatible: - items: - - enum: - - renesas,r9a08g045-i3c # RZ/G3S - - renesas,r9a09g047-i3c # RZ/G3E + oneOf: + - items: + - enum: + - renesas,r9a08g045-i3c # RZ/G3S + - renesas,r9a09g047-i3c # RZ/G3E + - items: + - enum: + - renesas,r9a09g056-i3c # RZ/V2N + - renesas,r9a09g057-i3c # RZ/V2H(P) + - const: renesas,r9a09g047-i3c reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml index d5287a2bf866bb..d998a9d69b91f4 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2500-scu-ic.yaml @@ -5,7 +5,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/aspeed,ast2500-scu-ic.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Aspeed AST25XX and AST26XX SCU Interrupt Controller +title: Aspeed AST25XX, AST26XX, AST27XX SCU Interrupt Controller maintainers: - Eddie James @@ -16,6 +16,10 @@ properties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml index e8dfa6507f64d3..87df07beda5926 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. 
+ details see ../embedded-controller/kontron,sl28cpld.yaml. The following interrupts are available. All types and levels are fixed and handled by the board management controller. diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml index 129e32c4c77411..610c7986320897 100644 --- a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml +++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-kcs-bmc.yaml @@ -40,6 +40,9 @@ properties: - description: ODR register - description: STR register + clocks: + maxItems: 1 + aspeed,lpc-io-reg: $ref: /schemas/types.yaml#/definitions/uint32-array minItems: 1 diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.txt b/Documentation/devicetree/bindings/leds/ams,as3645a.txt deleted file mode 100644 index 4af2987b25e923..00000000000000 --- a/Documentation/devicetree/bindings/leds/ams,as3645a.txt +++ /dev/null @@ -1,85 +0,0 @@ -Analog devices AS3645A device tree bindings - -The AS3645A flash LED controller can drive two LEDs, one high current -flash LED and one indicator LED. The high current flash LED can be -used in torch mode as well. - -Ranges below noted as [a, b] are closed ranges between a and b, i.e. a -and b are included in the range. - -Please also see common.txt in the same directory. - - -Required properties -=================== - -compatible : Must be "ams,as3645a". -reg : The I2C address of the device. Typically 0x30. -#address-cells : 1 -#size-cells : 0 - - -Required properties of the flash child node (0) -=============================================== - -reg: 0 -flash-timeout-us: Flash timeout in microseconds. The value must be in - the range [100000, 850000] and divisible by 50000. -flash-max-microamp: Maximum flash current in microamperes. Has to be - in the range between [200000, 500000] and - divisible by 20000. -led-max-microamp: Maximum torch (assist) current in microamperes. The - value must be in the range between [20000, 160000] and - divisible by 20000. -ams,input-max-microamp: Maximum flash controller input current. The - value must be in the range [1250000, 2000000] - and divisible by 50000. - - -Optional properties of the flash child node -=========================================== - -function : See Documentation/devicetree/bindings/leds/common.txt. -color : See Documentation/devicetree/bindings/leds/common.txt. -label : See Documentation/devicetree/bindings/leds/common.txt (deprecated). - - -Required properties of the indicator child node (1) -=================================================== - -reg: 1 -led-max-microamp: Maximum indicator current. The allowed values are - 2500, 5000, 7500 and 10000. - -Optional properties of the indicator child node -=============================================== - -function : See Documentation/devicetree/bindings/leds/common.txt. -color : See Documentation/devicetree/bindings/leds/common.txt. -label : See Documentation/devicetree/bindings/leds/common.txt (deprecated). 
- - -Example -======= - -#include - - as3645a@30 { - #address-cells = <1>; - #size-cells = <0>; - reg = <0x30>; - compatible = "ams,as3645a"; - led@0 { - reg = <0x0>; - flash-timeout-us = <150000>; - flash-max-microamp = <320000>; - led-max-microamp = <60000>; - ams,input-max-microamp = <1750000>; - function = LED_FUNCTION_FLASH; - }; - led@1 { - reg = <0x1>; - led-max-microamp = <10000>; - function = LED_FUNCTION_INDICATOR; - }; - }; diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.yaml b/Documentation/devicetree/bindings/leds/ams,as3645a.yaml new file mode 100644 index 00000000000000..250a4b275d8a8a --- /dev/null +++ b/Documentation/devicetree/bindings/leds/ams,as3645a.yaml @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/ams,as3645a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices AS3645A LED Controller + +maintainers: + - Sakari Ailus + +description: + The AS3645A flash LED controller can drive two LEDs, one + high current flash LED and one indicator LED. The high + current flash LED can be used in torch mode as well. + +properties: + compatible: + const: ams,as3645a + + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + reg: + maxItems: 1 + + led@0: + description: led0 describes the 'flash' feature + type: object + $ref: common.yaml# + unevaluatedProperties: false + + properties: + reg: + const: 0 + + flash-timeout-us: + minimum: 100000 + maximum: 850000 + multipleOf: 50000 + + flash-max-microamp: + minimum: 200000 + maximum: 500000 + multipleOf: 20000 + + led-max-microamp: + minimum: 20000 + maximum: 160000 + multipleOf: 20000 + description: + Maximum current when in torch (assist) mode. + + ams,input-max-microamp: + minimum: 1250000 + maximum: 2000000 + multipleOf: 50000 + + required: + - reg + - flash-timeout-us + - flash-max-microamp + - led-max-microamp + - ams,input-max-microamp + + led@1: + description: led1 describes the 'indicator' feature + type: object + $ref: common.yaml# + unevaluatedProperties: false + + properties: + reg: + const: 1 + + led-max-microamp: + enum: + - 2500 + - 5000 + - 7500 + - 10000 + description: + Maximum indicator current. + + required: + - reg + - led-max-microamp + +required: + - compatible + - reg + - "#size-cells" + - "#address-cells" + +additionalProperties: false + +examples: + - | + #include + + i2c{ + #address-cells = <1>; + #size-cells = <0>; + + led-controller@30 { + compatible = "ams,as3645a"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x30>; + + led@0 { + reg = <0>; + flash-timeout-us = <150000>; + flash-max-microamp = <320000>; + led-max-microamp = <60000>; + ams,input-max-microamp = <1750000>; + function = LED_FUNCTION_FLASH; + }; + + led@1 { + reg = <1>; + led-max-microamp = <10000>; + function = LED_FUNCTION_INDICATOR; + }; + }; + }; +... 
diff --git a/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml b/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml index f5554da6bc6c73..8fc5af8f27f9eb 100644 --- a/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml +++ b/Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml @@ -23,11 +23,7 @@ properties: compatible: const: led-backlight - leds: - description: A list of LED nodes - $ref: /schemas/types.yaml#/definitions/phandle-array - items: - maxItems: 1 + leds: true required: - compatible diff --git a/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml b/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml index 3c0431c51159e5..906735acfbaf94 100644 --- a/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml +++ b/Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml @@ -42,7 +42,6 @@ properties: description: GPIO attached to the SDB pin. audio-gain-db: - $ref: /schemas/types.yaml#/definitions/uint32 default: 0 description: Audio gain selection for external analog modulation input. enum: [0, 3, 6, 9, 12, 15, 18, 21] diff --git a/Documentation/devicetree/bindings/leds/leds-consumer.yaml b/Documentation/devicetree/bindings/leds/leds-consumer.yaml new file mode 100644 index 00000000000000..fe6a0faa1d3b8f --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-consumer.yaml @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/leds-consumer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Common leds consumer + +maintainers: + - Aleksandrs Vinarskis + +description: + Some LEDs defined in DT are required by other DT consumers; for example, + a v4l2 subnode may require a privacy or flash LED. Unlike the trigger-source + approach, which is typically used as a 'soft' binding, referencing LED + devices by phandle makes things simpler when a 'hard' binding is desired. + + This document describes the LED properties that consumers may define. + +select: true + +properties: + leds: + oneOf: + - type: object + - $ref: /schemas/types.yaml#/definitions/phandle-array + description: + A list of LED device(s) required by a particular consumer. + items: + maxItems: 1 + + led-names: + description: + A list of device name(s). Used to map LED devices to their respective + functions, when a consumer requires more than one LED. + +additionalProperties: true + +examples: + - | + #include + #include + + leds { + compatible = "gpio-leds"; + + privacy_led: privacy-led { + color = ; + default-state = "off"; + function = LED_FUNCTION_INDICATOR; + gpios = <&tlmm 110 GPIO_ACTIVE_HIGH>; + }; + }; + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + v4l2_node: camera@36 { + reg = <0x36>; + + leds = <&privacy_led>; + led-names = "privacy"; + }; + }; + +...
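To make the multi-LED mapping described above concrete, here is a sketch of a hypothetical consumer needing two LEDs; both phandle labels are assumed to exist as LED child nodes elsewhere in the tree, and led-names maps each phandle in leds to its function, in matching order::

	demo-consumer {
		/* &privacy_led and &flash_led are assumed LED child nodes,
		 * e.g. gpio-leds entries, defined elsewhere in the tree.
		 */
		leds = <&privacy_led>, <&flash_led>;
		led-names = "privacy", "flash";
	};

Note that individual consumer bindings may restrict this further; the video-interface-devices binding below, for instance, caps leds at a single "privacy" entry.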
diff --git a/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml b/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml index 8ed059a5a724f6..5c9cfa39396b0b 100644 --- a/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml +++ b/Documentation/devicetree/bindings/leds/leds-group-multicolor.yaml @@ -17,10 +17,7 @@ properties: compatible: const: leds-group-multicolor - leds: - description: - An aray of monochromatic leds - $ref: /schemas/types.yaml#/definitions/phandle-array + leds: true required: - leds diff --git a/Documentation/devicetree/bindings/media/video-interface-devices.yaml b/Documentation/devicetree/bindings/media/video-interface-devices.yaml index cf7712ad297c01..3ad1590b04966f 100644 --- a/Documentation/devicetree/bindings/media/video-interface-devices.yaml +++ b/Documentation/devicetree/bindings/media/video-interface-devices.yaml @@ -17,6 +17,14 @@ properties: An array of phandles, each referring to a flash LED, a sub-node of the LED driver device node. + leds: + minItems: 1 + maxItems: 1 + + led-names: + enum: + - privacy + lens-focus: $ref: /schemas/types.yaml#/definitions/phandle description: diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml new file mode 100644 index 00000000000000..479288567d0b0c --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/xlnx,versal-net-ddrmc5.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx Versal NET Memory Controller + +maintainers: + - Shubhrajyoti Datta + +description: + The integrated DDR Memory Controllers (DDRMCs) support both DDR5 and LPDDR5 + compact and extended memory interfaces. The Versal NET DDR memory controller + has optional ECC support, which corrects single-bit ECC errors and detects + double-bit ECC errors. It also has support for reporting other errors like + MMCM (Mixed-Mode Clock Manager) errors and general software errors. + +properties: + compatible: + const: xlnx,versal-net-ddrmc5 + + amd,rproc: + $ref: /schemas/types.yaml#/definitions/phandle + description: + phandle to the remoteproc_r5 rproc node through which the APU interacts + with the remote processor. The APU primarily communicates with the RPU + for accessing the DDRMC address space and getting error notifications. + +required: + - compatible + - amd,rproc + +additionalProperties: false + +examples: + - | + memory-controller { + compatible = "xlnx,versal-net-ddrmc5"; + amd,rproc = <&remoteproc_r5>; + }; diff --git a/Documentation/devicetree/bindings/mfd/act8945a.txt b/Documentation/devicetree/bindings/mfd/act8945a.txt deleted file mode 100644 index 5ca75d888b4a5b..00000000000000 --- a/Documentation/devicetree/bindings/mfd/act8945a.txt +++ /dev/null @@ -1,82 +0,0 @@ -Device-Tree bindings for Active-semi ACT8945A MFD driver - -Required properties: - - compatible: "active-semi,act8945a".
- - reg: the I2C slave address for the ACT8945A chip -The chip exposes two subdevices: - - a regulators: see ../regulator/act8945a-regulator.txt - - a charger: see ../power/act8945a-charger.txt -Example: - pmic@5b { - compatible = "active-semi,act8945a"; - reg = <0x5b>; - - active-semi,vsel-high; - - regulators { - vdd_1v35_reg: REG_DCDC1 { - regulator-name = "VDD_1V35"; - regulator-min-microvolt = <1350000>; - regulator-max-microvolt = <1350000>; - regulator-always-on; - }; - - vdd_1v2_reg: REG_DCDC2 { - regulator-name = "VDD_1V2"; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1300000>; - regulator-always-on; - }; - - vdd_3v3_reg: REG_DCDC3 { - regulator-name = "VDD_3V3"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_fuse_reg: REG_LDO1 { - regulator-name = "VDD_FUSE"; - regulator-min-microvolt = <2500000>; - regulator-max-microvolt = <2500000>; - regulator-always-on; - }; - - vdd_3v3_lp_reg: REG_LDO2 { - regulator-name = "VDD_3V3_LP"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_led_reg: REG_LDO3 { - regulator-name = "VDD_LED"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - regulator-always-on; - }; - - vdd_sdhc_1v8_reg: REG_LDO4 { - regulator-name = "VDD_SDHC_1V8"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - regulator-always-on; - }; - }; - - charger { - compatible = "active-semi,act8945a-charger"; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_charger_chglev &pinctrl_charger_lbo &pinctrl_charger_irq>; - interrupt-parent = <&pioA>; - interrupts = <45 IRQ_TYPE_LEVEL_LOW>; - - active-semi,chglev-gpios = <&pioA 12 GPIO_ACTIVE_HIGH>; - active-semi,lbo-gpios = <&pioA 72 GPIO_ACTIVE_LOW>; - active-semi,input-voltage-threshold-microvolt = <6600>; - active-semi,precondition-timeout = <40>; - active-semi,total-timeout = <3>; - }; - }; diff --git a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml index 5eccd10d95ce5d..da1887d7a8fe55 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml @@ -48,8 +48,34 @@ properties: patternProperties: '^p2a-control@[0-9a-f]+$': - description: See Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt + description: > + PCI-to-AHB Bridge Control + + The bridge is available on platforms with the VGA enabled on the Aspeed + device. In this case, the host has access to a 64KiB window into all of + the BMC's memory. The BMC can disable this bridge. If the bridge is + enabled, the host has read access to all the regions of memory; whether + it also has write access is governed by a register controlled by the + BMC.
type: object + additionalProperties: false + + properties: + compatible: + enum: + - aspeed,ast2400-p2a-ctrl + - aspeed,ast2500-p2a-ctrl + reg: + maxItems: 1 + + memory-region: + maxItems: 1 + description: + A reserved_memory region to be used for the PCI to AHB mapping + + required: + - compatible + - reg '^pinctrl(@[0-9a-f]+)?$': type: object @@ -75,6 +101,10 @@ patternProperties: - aspeed,ast2500-scu-ic - aspeed,ast2600-scu-ic0 - aspeed,ast2600-scu-ic1 + - aspeed,ast2700-scu-ic0 + - aspeed,ast2700-scu-ic1 + - aspeed,ast2700-scu-ic2 + - aspeed,ast2700-scu-ic3 '^silicon-id@[0-9a-f]+$': description: Unique hardware silicon identifiers within the SoC @@ -123,6 +153,11 @@ examples: #size-cells = <1>; ranges = <0x0 0x1e6e2000 0x1000>; + p2a-control@2c { + compatible = "aspeed,ast2400-p2a-ctrl"; + reg = <0x2c 0x4>; + }; + silicon-id@7c { compatible = "aspeed,ast2500-silicon-id", "aspeed,silicon-id"; reg = <0x7c 0x4>, <0x150 0x8>; diff --git a/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml b/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml index d88854e60b7f95..f329223cec071d 100644 --- a/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml +++ b/Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml @@ -137,6 +137,9 @@ patternProperties: reg: maxItems: 1 + clocks: + maxItems: 1 + interrupts: maxItems: 1 diff --git a/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml new file mode 100644 index 00000000000000..d2886f2686a8d3 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/fsl,mc13xxx.yaml @@ -0,0 +1,288 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/fsl,mc13xxx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale MC13xxx Power Management Integrated Circuits (PMIC) + +maintainers: + - Alexander Kurz + +description: > + The MC13xxx PMIC series consists of the three models MC13783, MC13892 + and MC34708 and provides regulators and other features like RTC, ADC, + LED, touchscreen, codec and input buttons. + + Links to datasheets: + https://www.nxp.com/docs/en/data-sheet/MC13783.pdf + https://www.nxp.com/docs/en/data-sheet/MC13892.pdf + https://www.nxp.com/docs/en/data-sheet/MC34708.pdf + +properties: + compatible: + enum: + - fsl,mc13783 + - fsl,mc13892 + - fsl,mc34708 + + reg: + description: I2C slave address or SPI chip select number. + maxItems: 1 + + spi-max-frequency: true + + spi-cs-high: true + + system-power-controller: true + + interrupts: + maxItems: 1 + + buttons: + type: object + properties: + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + patternProperties: + "^onkey@[0-2]$": + $ref: /schemas/input/input.yaml# + unevaluatedProperties: false + type: object + + properties: + reg: + description: | + One of + MC13783 BUTTON IDs: + 0: ONOFD1 + 1: ONOFD2 + 2: ONOFD3 + + MC13892 BUTTON IDs: + 0: PWRON1 + 1: PWRON2 + 2: PWRON3 + + MC34708 BUTTON IDs: + 0: PWRON1 + 1: PWRON2 + maximum: 2 + + debounce-delay-ms: + enum: [0, 30, 150, 750] + default: 30 + description: + Sets the debouncing delay in milliseconds. + + active-low: + description: Set active when pin is pulled low. + + linux,code: true + + fsl,enable-reset: + description: + Setting of the global reset option.
+ type: boolean + + unevaluatedProperties: false + + leds: + type: object + $ref: /schemas/leds/common.yaml# + + properties: + reg: + description: | + One of + MC13783 LED IDs + 0: Main display + 1: AUX display + 2: Keypad + 3: Red 1 + 4: Green 1 + 5: Blue 1 + 6: Red 2 + 7: Green 2 + 8: Blue 2 + 9: Red 3 + 10: Green 3 + 11: Blue 3 + + MC13892 LED IDs + 0: Main display + 1: AUX display + 2: Keypad + 3: Red + 4: Green + 5: Blue + + MC34708 LED IDs + 0: Charger Red + 1: Charger Green + maxItems: 1 + + led-control: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Setting for LED-Control register array length depends on model, + mc13783: 6, mc13892: 4, mc34708: 1 + + regulators: + type: object + + additionalProperties: + type: object + + description: | + List of child nodes specifying the regulators, depending on chip variant. + Each child node is defined using the standard binding for regulators and + the optional regulator properties defined below. + + fsl,mc13xxx-uses-adc: + type: boolean + description: Indicate the ADC is being used + + fsl,mc13xxx-uses-codec: + type: boolean + description: Indicate the Audio Codec is being used + + fsl,mc13xxx-uses-rtc: + type: boolean + description: Indicate the RTC is being used + + fsl,mc13xxx-uses-touch: + type: boolean + description: Indicate the touchscreen controller is being used + +required: + - compatible + - reg + +allOf: + - if: + properties: + compatible: + contains: + const: fsl,mc13783 + then: + properties: + leds: + properties: + led-control: + minItems: 6 + maxItems: 6 + regulators: + patternProperties: + "^gpo[1-4]|pwgt[12]spi|sw[12][ab]|sw3|vaudio|vcam|vdig|vesim|vgen|viohi|violo|vmmc[12]|vrf[12]|vrfbg|vrfcp|vrfdig|vrfref|vsim|vvib$": + type: object + $ref: /schemas/regulator/regulator.yaml# + + unevaluatedProperties: false + + - if: + properties: + compatible: + contains: + const: fsl,mc13892 + then: + properties: + leds: + properties: + led-control: + minItems: 4 + maxItems: 4 + regulators: + patternProperties: + "^gpo[1-4]|pwgt[12]spi|sw[1-4]|swbst|vaudio|vcam|vcoincell|vdig|vgen[1-3]|viohi|vpll|vsd|vusb|vusb2|vvideo$": + type: object + $ref: /schemas/regulator/regulator.yaml# + + unevaluatedProperties: false + + - if: + properties: + compatible: + contains: + const: fsl,mc34708 + then: + properties: + buttons: + patternProperties: + "^onkey@[0-2]$": + properties: + reg: + maximum: 1 + leds: + properties: + led-control: + minItems: 1 + maxItems: 1 + +additionalProperties: false + +examples: + - | + #include + #include + #include + + spi { + #address-cells = <1>; + #size-cells = <0>; + + pmic: mc13892@0 { + compatible = "fsl,mc13892"; + reg = <0>; + spi-max-frequency = <1000000>; + spi-cs-high; + interrupt-parent = <&gpio0>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + fsl,mc13xxx-uses-rtc; + fsl,mc13xxx-uses-adc; + + buttons { + #address-cells = <1>; + #size-cells = <0>; + + onkey@0 { + reg = <0>; + debounce-delay-ms = <30>; + active-low; + fsl,enable-reset; + }; + }; + + leds { + #address-cells = <1>; + #size-cells = <0>; + led-control = <0x000 0x000 0x0e0 0x000>; + + sysled@3 { + reg = <3>; + label = "system:red:live"; + linux,default-trigger = "heartbeat"; + }; + }; + + regulators { + sw1_reg: sw1 { + regulator-min-microvolt = <600000>; + regulator-max-microvolt = <1375000>; + regulator-boot-on; + regulator-always-on; + }; + + sw2_reg: sw2 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1850000>; + regulator-boot-on; + regulator-always-on; + }; + }; + }; + }; diff --git 
a/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml b/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml new file mode 100644 index 00000000000000..3fc920c8639d0f --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/maxim,max7360.yaml @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/maxim,max7360.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim MAX7360 Keypad, Rotary encoder, PWM and GPIO controller + +maintainers: + - Kamel Bouhara + - Mathieu Dubois-Briand + +description: | + Maxim MAX7360 device, with the following functions: + - keypad controller + - rotary controller + - GPIO and GPO controller + - PWM controller + + https://www.analog.com/en/products/max7360.html + +allOf: + - $ref: /schemas/input/matrix-keymap.yaml# + - $ref: /schemas/input/input.yaml# + +properties: + compatible: + enum: + - maxim,max7360 + + reg: + maxItems: 1 + + interrupts: + maxItems: 2 + + interrupt-names: + items: + - const: inti + - const: intk + + keypad-debounce-delay-ms: + description: Keypad debounce delay in ms + minimum: 9 + maximum: 40 + default: 9 + + rotary-debounce-delay-ms: + description: Rotary encoder debounce delay in ms + minimum: 0 + maximum: 15 + default: 0 + + linux,axis: + $ref: /schemas/input/rotary-encoder.yaml#/properties/linux,axis + + rotary-encoder,relative-axis: + $ref: /schemas/types.yaml#/definitions/flag + description: + Register a relative axis rather than an absolute one. + + rotary-encoder,steps: + $ref: /schemas/types.yaml#/definitions/uint32 + default: 24 + description: + Number of steps in a full turnaround of the + encoder. Only relevant for absolute axis. Defaults to 24, which is a + typical value for such devices. + + rotary-encoder,rollover: + $ref: /schemas/types.yaml#/definitions/flag + description: + Automatic rollover when the rotary value becomes + greater than the specified steps or smaller than 0. For absolute axis only. + + "#pwm-cells": + const: 3 + + gpio: + $ref: /schemas/gpio/maxim,max7360-gpio.yaml# + description: + PORT0 to PORT7 general purpose input/output pins configuration. + + gpo: + $ref: /schemas/gpio/maxim,max7360-gpio.yaml# + description: > + COL2 to COL7 general purpose output pins configuration. Allows unused + keypad columns to be used as outputs. + + The MAX7360 has 8 column lines and 6 of them can be used as GPOs. GPIO + numbers used for this gpio-controller node correspond to the column + numbers: values 0 and 1 are never valid, values from 2 to 7 might be valid + depending on the value of the keypad,num-columns property. + +patternProperties: + '-pins$': + type: object + description: + Pinctrl node's client devices use subnodes for the desired pin + configuration. Client device subnodes use the standard properties below. + $ref: /schemas/pinctrl/pincfg-node.yaml + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: '^(PORT[0-7]|ROTARY)$' + minItems: 1 + maxItems: 8 + + function: + description: + Specify the alternative function to be configured for the specified + pins.
+ enum: [gpio, pwm, rotary] + + additionalProperties: false + +required: + - compatible + - reg + - interrupts + - interrupt-names + - linux,keymap + - linux,axis + - "#pwm-cells" + - gpio + - gpo + +unevaluatedProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + io-expander@38 { + compatible = "maxim,max7360"; + reg = <0x38>; + + interrupt-parent = <&gpio1>; + interrupts = <23 IRQ_TYPE_LEVEL_LOW>, + <24 IRQ_TYPE_LEVEL_LOW>; + interrupt-names = "inti", "intk"; + + keypad,num-rows = <8>; + keypad,num-columns = <4>; + linux,keymap = < + MATRIX_KEY(0x00, 0x00, KEY_F5) + MATRIX_KEY(0x01, 0x00, KEY_F4) + MATRIX_KEY(0x02, 0x01, KEY_F6) + >; + keypad-debounce-delay-ms = <10>; + autorepeat; + + rotary-debounce-delay-ms = <2>; + linux,axis = <0>; /* REL_X */ + rotary-encoder,relative-axis; + + #pwm-cells = <3>; + + max7360_gpio: gpio { + compatible = "maxim,max7360-gpio"; + + gpio-controller; + #gpio-cells = <2>; + maxim,constant-current-disable = <0x06>; + + interrupt-controller; + #interrupt-cells = <0x2>; + }; + + max7360_gpo: gpo { + compatible = "maxim,max7360-gpo"; + + gpio-controller; + #gpio-cells = <2>; + }; + + backlight_pins: backlight-pins { + pins = "PORT2"; + function = "pwm"; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mfd/mc13xxx.txt b/Documentation/devicetree/bindings/mfd/mc13xxx.txt deleted file mode 100644 index 8261ea73278a6b..00000000000000 --- a/Documentation/devicetree/bindings/mfd/mc13xxx.txt +++ /dev/null @@ -1,156 +0,0 @@ -* Freescale MC13783/MC13892 Power Management Integrated Circuit (PMIC) - -Required properties: -- compatible : Should be "fsl,mc13783" or "fsl,mc13892" - -Optional properties: -- fsl,mc13xxx-uses-adc : Indicate the ADC is being used -- fsl,mc13xxx-uses-codec : Indicate the Audio Codec is being used -- fsl,mc13xxx-uses-rtc : Indicate the RTC is being used -- fsl,mc13xxx-uses-touch : Indicate the touchscreen controller is being used - -Sub-nodes: -- codec: Contain the Audio Codec node. - - adc-port: Contain PMIC SSI port number used for ADC. - - dac-port: Contain PMIC SSI port number used for DAC. -- leds : Contain the led nodes and initial register values in property - "led-control". Number of register depends of used IC, for MC13783 is 6, - for MC13892 is 4, for MC34708 is 1. See datasheet for bits definitions of - these registers. - - #address-cells: Must be 1. - - #size-cells: Must be 0. - Each led node should contain "reg", which used as LED ID (described below). - Optional properties "label" and "linux,default-trigger" is described in - Documentation/devicetree/bindings/leds/common.txt. -- regulators : Contain the regulator nodes. The regulators are bound using - their names as listed below with their registers and bits for enabling. 
- -MC13783 LED IDs: - 0 : Main display - 1 : AUX display - 2 : Keypad - 3 : Red 1 - 4 : Green 1 - 5 : Blue 1 - 6 : Red 2 - 7 : Green 2 - 8 : Blue 2 - 9 : Red 3 - 10 : Green 3 - 11 : Blue 3 - -MC13892 LED IDs: - 0 : Main display - 1 : AUX display - 2 : Keypad - 3 : Red - 4 : Green - 5 : Blue - -MC34708 LED IDs: - 0 : Charger Red - 1 : Charger Green - -MC13783 regulators: - sw1a : regulator SW1A (register 24, bit 0) - sw1b : regulator SW1B (register 25, bit 0) - sw2a : regulator SW2A (register 26, bit 0) - sw2b : regulator SW2B (register 27, bit 0) - sw3 : regulator SW3 (register 29, bit 20) - vaudio : regulator VAUDIO (register 32, bit 0) - viohi : regulator VIOHI (register 32, bit 3) - violo : regulator VIOLO (register 32, bit 6) - vdig : regulator VDIG (register 32, bit 9) - vgen : regulator VGEN (register 32, bit 12) - vrfdig : regulator VRFDIG (register 32, bit 15) - vrfref : regulator VRFREF (register 32, bit 18) - vrfcp : regulator VRFCP (register 32, bit 21) - vsim : regulator VSIM (register 33, bit 0) - vesim : regulator VESIM (register 33, bit 3) - vcam : regulator VCAM (register 33, bit 6) - vrfbg : regulator VRFBG (register 33, bit 9) - vvib : regulator VVIB (register 33, bit 11) - vrf1 : regulator VRF1 (register 33, bit 12) - vrf2 : regulator VRF2 (register 33, bit 15) - vmmc1 : regulator VMMC1 (register 33, bit 18) - vmmc2 : regulator VMMC2 (register 33, bit 21) - gpo1 : regulator GPO1 (register 34, bit 6) - gpo2 : regulator GPO2 (register 34, bit 8) - gpo3 : regulator GPO3 (register 34, bit 10) - gpo4 : regulator GPO4 (register 34, bit 12) - pwgt1spi : regulator PWGT1SPI (register 34, bit 15) - pwgt2spi : regulator PWGT2SPI (register 34, bit 16) - -MC13892 regulators: - vcoincell : regulator VCOINCELL (register 13, bit 23) - sw1 : regulator SW1 (register 24, bit 0) - sw2 : regulator SW2 (register 25, bit 0) - sw3 : regulator SW3 (register 26, bit 0) - sw4 : regulator SW4 (register 27, bit 0) - swbst : regulator SWBST (register 29, bit 20) - vgen1 : regulator VGEN1 (register 32, bit 0) - viohi : regulator VIOHI (register 32, bit 3) - vdig : regulator VDIG (register 32, bit 9) - vgen2 : regulator VGEN2 (register 32, bit 12) - vpll : regulator VPLL (register 32, bit 15) - vusb2 : regulator VUSB2 (register 32, bit 18) - vgen3 : regulator VGEN3 (register 33, bit 0) - vcam : regulator VCAM (register 33, bit 6) - vvideo : regulator VVIDEO (register 33, bit 12) - vaudio : regulator VAUDIO (register 33, bit 15) - vsd : regulator VSD (register 33, bit 18) - gpo1 : regulator GPO1 (register 34, bit 6) - gpo2 : regulator GPO2 (register 34, bit 8) - gpo3 : regulator GPO3 (register 34, bit 10) - gpo4 : regulator GPO4 (register 34, bit 12) - pwgt1spi : regulator PWGT1SPI (register 34, bit 15) - pwgt2spi : regulator PWGT2SPI (register 34, bit 16) - vusb : regulator VUSB (register 50, bit 3) - - The bindings details of individual regulator device can be found in: - Documentation/devicetree/bindings/regulator/regulator.txt - -Examples: - -ecspi@70010000 { /* ECSPI1 */ - cs-gpios = <&gpio4 24 0>, /* GPIO4_24 */ - <&gpio4 25 0>; /* GPIO4_25 */ - - pmic: mc13892@0 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,mc13892"; - spi-max-frequency = <6000000>; - reg = <0>; - interrupt-parent = <&gpio0>; - interrupts = <8>; - - leds { - #address-cells = <1>; - #size-cells = <0>; - led-control = <0x000 0x000 0x0e0 0x000>; - - sysled@3 { - reg = <3>; - label = "system:red:live"; - linux,default-trigger = "heartbeat"; - }; - }; - - regulators { - sw1_reg: mc13892__sw1 { - 
regulator-min-microvolt = <600000>; - regulator-max-microvolt = <1375000>; - regulator-boot-on; - regulator-always-on; - }; - - sw2_reg: mc13892__sw2 { - regulator-min-microvolt = <900000>; - regulator-max-microvolt = <1850000>; - regulator-boot-on; - regulator-always-on; - }; - }; - }; -}; diff --git a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml index 877078ac172f1d..5454d9403cad79 100644 --- a/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml +++ b/Documentation/devicetree/bindings/mfd/qnap,ts433-mcu.yaml @@ -16,8 +16,12 @@ description: properties: compatible: enum: + - qnap,ts233-mcu - qnap,ts433-mcu + nvmem-layout: + $ref: /schemas/nvmem/layouts/nvmem-layout.yaml + patternProperties: "^fan-[0-9]+$": $ref: /schemas/hwmon/fan-common.yaml# diff --git a/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml b/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml new file mode 100644 index 00000000000000..c6593ac6ef6adb --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/spacemit,p1.yaml @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/spacemit,p1.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SpacemiT P1 Power Management Integrated Circuit + +maintainers: + - Troy Mitchell + +description: + P1 is an I2C-controlled PMIC produced by SpacemiT. It implements six + constant-on-time buck converters and twelve low-dropout regulators. + It also contains a load switch, watchdog timer, real-time clock, eight + 12-bit ADC channels, and six GPIOs. Additional details are available + in the "Power Stone/P1" section at the following link. + https://developer.spacemit.com/documentation + +properties: + compatible: + const: spacemit,p1 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + vin-supply: + description: Input supply phandle. 
+ + regulators: + type: object + + patternProperties: + "^(buck[1-6]|aldo[1-4]|dldo[1-7])$": + type: object + $ref: /schemas/regulator/regulator.yaml# + unevaluatedProperties: false + + unevaluatedProperties: false + +required: + - compatible + - reg + - interrupts + +unevaluatedProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@41 { + compatible = "spacemit,p1"; + reg = <0x41>; + interrupts = <64>; + + regulators { + buck1 { + regulator-name = "buck1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3450000>; + regulator-ramp-delay = <5000>; + regulator-always-on; + }; + + aldo1 { + regulator-name = "aldo1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3400000>; + regulator-boot-on; + }; + + dldo1 { + regulator-name = "dldo1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3400000>; + regulator-boot-on; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mfd/syscon.yaml b/Documentation/devicetree/bindings/mfd/syscon.yaml index 27672adeb1fedb..657c38175fba21 100644 --- a/Documentation/devicetree/bindings/mfd/syscon.yaml +++ b/Documentation/devicetree/bindings/mfd/syscon.yaml @@ -79,6 +79,7 @@ select: - marvell,armada-3700-cpu-misc - marvell,armada-3700-nb-pm - marvell,armada-3700-avs + - marvell,armada-3700-usb2-host-device-misc - marvell,armada-3700-usb2-host-misc - marvell,dove-global-config - mediatek,mt2701-pctl-a-syscfg @@ -90,6 +91,7 @@ select: - mediatek,mt8173-pctl-a-syscfg - mediatek,mt8365-syscfg - microchip,lan966x-cpu-syscon + - microchip,mpfs-control-scb - microchip,mpfs-sysreg-scb - microchip,sam9x60-sfr - microchip,sama7d65-ddr3phy @@ -185,6 +187,7 @@ properties: - marvell,armada-3700-cpu-misc - marvell,armada-3700-nb-pm - marvell,armada-3700-avs + - marvell,armada-3700-usb2-host-device-misc - marvell,armada-3700-usb2-host-misc - marvell,dove-global-config - mediatek,mt2701-pctl-a-syscfg @@ -197,6 +200,7 @@ properties: - mediatek,mt8365-infracfg-nao - mediatek,mt8365-syscfg - microchip,lan966x-cpu-syscon + - microchip,mpfs-control-scb - microchip,mpfs-sysreg-scb - microchip,sam9x60-sfr - microchip,sama7d65-ddr3phy diff --git a/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml b/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml new file mode 100644 index 00000000000000..ba14663c9266a5 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/ti,bq25703a.yaml @@ -0,0 +1,117 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mfd/ti,bq25703a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: BQ25703A Charger Manager/Buck/Boost Converter + +maintainers: + - Chris Morgan + +allOf: + - $ref: /schemas/power/supply/power-supply.yaml# + +properties: + compatible: + const: ti,bq25703a + + reg: + const: 0x6b + + input-current-limit-microamp: + description: + Maximum total input current allowed, used for both charging and + powering the device. + minimum: 50000 + maximum: 6400000 + default: 3250000 + + interrupts: + maxItems: 1 + + monitored-battery: + description: + A minimum of constant-charge-current-max-microamp, + constant-charge-voltage-max-microvolt, and + voltage-min-design-microvolt are required. + + regulators: + type: object + additionalProperties: false + description: + Boost converter regulator output of bq257xx.
+ + properties: + vbus: + type: object + $ref: /schemas/regulator/regulator.yaml + additionalProperties: false + + properties: + regulator-name: true + regulator-min-microamp: + minimum: 0 + maximum: 6350000 + regulator-max-microamp: + minimum: 0 + maximum: 6350000 + regulator-min-microvolt: + minimum: 4480000 + maximum: 20800000 + regulator-max-microvolt: + minimum: 4480000 + maximum: 20800000 + enable-gpios: + description: + The BQ25703 may require both a register write and a GPIO + toggle to enable the boost regulator. + + required: + - regulator-name + - regulator-min-microamp + - regulator-max-microamp + - regulator-min-microvolt + - regulator-max-microvolt + +unevaluatedProperties: false + +required: + - compatible + - reg + - input-current-limit-microamp + - monitored-battery + - power-supplies + +examples: + - | + #include + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + bq25703: charger@6b { + compatible = "ti,bq25703a"; + reg = <0x6b>; + input-current-limit-microamp = <5000000>; + interrupt-parent = <&gpio0>; + interrupts = ; + monitored-battery = <&battery>; + power-supplies = <&fusb302>; + + regulators { + usb_otg_vbus: vbus { + enable-gpios = <&gpio4 RK_PA6 GPIO_ACTIVE_HIGH>; + regulator-max-microamp = <960000>; + regulator-max-microvolt = <5088000>; + regulator-min-microamp = <512000>; + regulator-min-microvolt = <4992000>; + regulator-name = "usb_otg_vbus"; + }; + }; + }; + }; + +... diff --git a/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml b/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml index a48cb00afe4381..ca17fbdea691d4 100644 --- a/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml +++ b/Documentation/devicetree/bindings/mfd/ti,tps6594.yaml @@ -41,6 +41,7 @@ properties: system-power-controller: true gpio-controller: true + gpio-line-names: true '#gpio-cells': const: 2 diff --git a/Documentation/devicetree/bindings/mfd/ti,twl.yaml b/Documentation/devicetree/bindings/mfd/ti,twl.yaml index f162ab60c09b56..776b04e182cb2a 100644 --- a/Documentation/devicetree/bindings/mfd/ti,twl.yaml +++ b/Documentation/devicetree/bindings/mfd/ti,twl.yaml @@ -11,9 +11,9 @@ maintainers: description: | The TWLs are Integrated Power Management Chips. - Some version might contain much more analog function like + Some versions contain many more analog functions, such as a USB transceiver or Audio amplifier. - These chips are connected to an i2c bus. + These chips are connected to an I2C bus.
allOf: - if: @@ -49,33 +49,14 @@ allOf: ti,retain-on-reset: false properties: - madc: - type: object - $ref: /schemas/iio/adc/ti,twl4030-madc.yaml - unevaluatedProperties: false - charger: - type: object $ref: /schemas/power/supply/twl4030-charger.yaml unevaluatedProperties: false - pwrbutton: - type: object - additionalProperties: false - properties: - compatible: - const: ti,twl4030-pwrbutton - interrupts: - items: - - items: - const: 8 - - watchdog: - type: object - additionalProperties: false - properties: - compatible: - const: ti,twl4030-wdt + gpadc: false + + usb-comparator: false + - if: properties: compatible: @@ -106,15 +87,30 @@ allOf: properties: charger: - type: object - properties: - compatible: - const: ti,twl6030-charger + $ref: /schemas/power/supply/ti,twl6030-charger.yaml + unevaluatedProperties: false + gpadc: - type: object properties: compatible: const: ti,twl6030-gpadc + + pwrbutton: false + + madc: false + + watchdog: false + + audio: false + + keypad: false + + twl4030-usb: false + + gpio: false + + power: false + - if: properties: compatible: @@ -142,23 +138,36 @@ allOf: properties: charger: - type: object - properties: - compatible: - items: - - const: ti,twl6032-charger - - const: ti,twl6030-charger + $ref: /schemas/power/supply/ti,twl6030-charger.yaml + unevaluatedProperties: false + gpadc: - type: object properties: compatible: const: ti,twl6032-gpadc + pwrbutton: false + + madc: false + + watchdog: false + + audio: false + + keypad: false + + twl4030-usb: false + + gpio: false + + power: false + properties: compatible: - description: - TWL4030 for integrated power-management/audio CODEC device used in OMAP3 - based boards + description: > + TWL4030 for integrated power-management/audio CODEC device used in + OMAP3 based boards. + TWL6030/32 for integrated power-management used in OMAP4 based boards enum: - ti,twl4030 @@ -181,28 +190,221 @@ properties: "#clock-cells": const: 1 + clocks: + maxItems: 1 + + clock-names: + const: fck + charger: type: object - additionalProperties: true + properties: compatible: true + required: - compatible rtc: type: object additionalProperties: false + properties: compatible: const: ti,twl4030-rtc interrupts: maxItems: 1 + madc: + type: object + $ref: /schemas/iio/adc/ti,twl4030-madc.yaml + unevaluatedProperties: false + + pwrbutton: + type: object + additionalProperties: false + + properties: + compatible: + const: ti,twl4030-pwrbutton + interrupts: + items: + - items: + const: 8 + + watchdog: + type: object + additionalProperties: false + + properties: + compatible: + const: ti,twl4030-wdt + + audio: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-audio + + required: + - compatible + + keypad: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-keypad + + required: + - compatible + + twl4030-usb: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-usb + + required: + - compatible + + gpio: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl4030-gpio + + required: + - compatible + + power: + type: object + additionalProperties: false + description: > + The power management module inside the TWL4030 provides several + facilities to control the power resources, including power scripts. + + For now, the binding only supports the complete shutdown of the + system after poweroff. 
+ + Board-specific compatible strings may be used for platform-specific + power configurations. + + A board-specific compatible string (e.g., ti,twl4030-power-omap3-evm) + may be paired with a generic fallback (generally for power saving mode). + + properties: + compatible: + oneOf: + # Case 1: A single compatible string is provided. + - enum: + - ti,twl4030-power + - ti,twl4030-power-reset + - ti,twl4030-power-idle + - ti,twl4030-power-idle-osc-off + - ti,twl4030-power-omap3-sdp + - ti,twl4030-power-omap3-ldp + - ti,twl4030-power-omap3-evm + + # Case 2: The specific, valid fallback for 'idle-osc-off'. + - items: + - const: ti,twl4030-power-idle-osc-off + - const: ti,twl4030-power-idle + + # Case 3: The specific, valid fallback for 'omap3-evm'. + - items: + - const: ti,twl4030-power-omap3-evm + - const: ti,twl4030-power-idle + + ti,system-power-controller: + type: boolean + deprecated: true + description: > + DEPRECATED. The standard 'system-power-controller' + property on the parent node should be used instead. + + ti,use_poweroff: + type: boolean + deprecated: true + description: DEPRECATED, to be removed. + + required: + - compatible + + gpadc: + type: object + $ref: /schemas/iio/adc/ti,twl6030-gpadc.yaml + unevaluatedProperties: false + + properties: + compatible: true + + usb-comparator: + type: object + additionalProperties: true + + properties: + compatible: + const: ti,twl6030-usb + + required: + - compatible + + pwm: + type: object + $ref: /schemas/pwm/pwm.yaml# + unevaluatedProperties: false + description: + PWM controllers (PWM1 and PWM2 on TWL4030, PWM0 and PWM1 on TWL6030/32). + + properties: + compatible: + enum: + - ti,twl4030-pwm + - ti,twl6030-pwm + + '#pwm-cells': + const: 2 + + required: + - compatible + - '#pwm-cells' + + pwmled: + type: object + $ref: /schemas/pwm/pwm.yaml# + unevaluatedProperties: false + description: > + PWM controllers connected to LED terminals (PWMA and PWMB on TWL4030; + LED PWM on TWL6030/32, mainly used as a charging indicator LED). + + properties: + compatible: + enum: + - ti,twl4030-pwmled + - ti,twl6030-pwmled + + '#pwm-cells': + const: 2 + + required: + - compatible + - '#pwm-cells' + patternProperties: "^regulator-": type: object unevaluatedProperties: false $ref: /schemas/regulator/regulator.yaml + properties: compatible: true regulator-initial-mode: @@ -211,12 +413,13 @@ patternProperties: # with low power consumption with low load current capability - 0x0e # Active mode, the regulator can deliver its nominal output # voltage with full-load current capability + ti,retain-on-reset: - description: - Does not turn off the supplies during warm - reset. Could be needed for VMMC, as TWL6030 - reset sequence for this signal does not comply - with the SD specification. + description: > + Does not turn off the supplies during warm reset. + + Could be needed for VMMC, as the TWL6030 reset sequence for + this signal does not comply with the SD specification. type: boolean unevaluatedProperties: false @@ -271,6 +474,16 @@ examples: compatible = "ti,twl6030-vmmc"; ti,retain-on-reset; }; + + pwm { + compatible = "ti,twl6030-pwm"; + #pwm-cells = <2>; + }; + + pwmled { + compatible = "ti,twl6030-pwmled"; + #pwm-cells = <2>; + }; }; }; @@ -325,6 +538,20 @@ examples: watchdog { compatible = "ti,twl4030-wdt"; }; + + power { + compatible = "ti,twl4030-power"; + }; + + pwm { + compatible = "ti,twl4030-pwm"; + #pwm-cells = <2>; + }; + + pwmled { + compatible = "ti,twl4030-pwmled"; + #pwm-cells = <2>; + }; }; }; ...
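A minimal sketch of the Case 3 fallback pair described above, placed inside a twl node as in the examples (illustrative only, not taken from a real board file): power { compatible = "ti,twl4030-power-omap3-evm", "ti,twl4030-power-idle"; };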
diff --git a/Documentation/devicetree/bindings/mfd/twl4030-power.txt b/Documentation/devicetree/bindings/mfd/twl4030-power.txt deleted file mode 100644 index 3d19963312ce0a..00000000000000 --- a/Documentation/devicetree/bindings/mfd/twl4030-power.txt +++ /dev/null @@ -1,48 +0,0 @@ -Texas Instruments TWL family (twl4030) reset and power management module - -The power management module inside the TWL family provides several facilities -to control the power resources, including power scripts. For now, the -binding only supports the complete shutdown of the system after poweroff. - -Required properties: -- compatible : must be one of the following - "ti,twl4030-power" - "ti,twl4030-power-reset" - "ti,twl4030-power-idle" - "ti,twl4030-power-idle-osc-off" - -The use of ti,twl4030-power-reset is recommended at least on -3530 that needs a special configuration for warm reset to work. - -When using ti,twl4030-power-idle, the TI recommended configuration -for idle modes is loaded to the tlw4030 PMIC. - -When using ti,twl4030-power-idle-osc-off, the TI recommended -configuration is used with the external oscillator being shut -down during off-idle. Note that this does not work on all boards -depending on how the external oscillator is wired. - -Optional properties: - -- ti,system-power-controller: This indicates that TWL4030 is the - power supply master of the system. With this flag, the chip will - initiate an ACTIVE-to-OFF or SLEEP-to-OFF transition when the - system poweroffs. - -- ti,use_poweroff: Deprecated name for ti,system-power-controller - -Example: -&i2c1 { - clock-frequency = <2600000>; - - twl: twl@48 { - reg = <0x48>; - interrupts = <7>; /* SYS_NIRQ cascaded to intc */ - interrupt-parent = <&intc>; - - twl_power: power { - compatible = "ti,twl4030-power"; - ti,use_poweroff; - }; - }; -}; diff --git a/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt b/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt deleted file mode 100644 index f2e2e28b317ce1..00000000000000 --- a/Documentation/devicetree/bindings/misc/aspeed-p2a-ctrl.txt +++ /dev/null @@ -1,46 +0,0 @@ -====================================================================== -Device tree bindings for Aspeed AST2400/AST2500 PCI-to-AHB Bridge Control Driver -====================================================================== - -The bridge is available on platforms with the VGA enabled on the Aspeed device. -In this case, the host has access to a 64KiB window into all of the BMC's -memory. The BMC can disable this bridge. If the bridge is enabled, the host -has read access to all the regions of memory, however the host only has read -and write access depending on a register controlled by the BMC. 
- -Required properties: -=================== - - - compatible: must be one of: - - "aspeed,ast2400-p2a-ctrl" - - "aspeed,ast2500-p2a-ctrl" - -Optional properties: -=================== - -- reg: A hint for the memory regions associated with the P2A controller -- memory-region: A phandle to a reserved_memory region to be used for the PCI - to AHB mapping - -The p2a-control node should be the child of a syscon node with the required -property: - -- compatible : Should be one of the following: - "aspeed,ast2400-scu", "syscon", "simple-mfd" - "aspeed,ast2500-scu", "syscon", "simple-mfd" - -Example -=================== - -g4 Example ----------- - -syscon: scu@1e6e2000 { - compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd"; - reg = <0x1e6e2000 0x1a8>; - - p2a: p2a-control { - compatible = "aspeed,ast2400-p2a-ctrl"; - memory-region = <&reserved_memory>; - }; -}; diff --git a/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml index 62087cf920df8f..f45e592901e24b 100644 --- a/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml +++ b/Documentation/devicetree/bindings/mmc/fsl,esdhc.yaml @@ -90,6 +90,7 @@ required: allOf: - $ref: sdhci-common.yaml# + - $ref: mmc-controller-common.yaml# unevaluatedProperties: false diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml index 9a72354397591d..7414d5522dfe8a 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-controller-common.yaml @@ -93,6 +93,14 @@ properties: minimum: 400000 maximum: 384000000 + max-sd-hs-hz: + description: | + Maximum frequency (in Hz) to be used for SD cards operating in + High-Speed (HS) mode. + minimum: 400000 + maximum: 50000000 + default: 50000000 + disable-wp: $ref: /schemas/types.yaml#/definitions/flag description: diff --git a/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml b/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml index e8bd49d46794ee..27c4060f2f9174 100644 --- a/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml +++ b/Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml @@ -31,6 +31,7 @@ properties: - samsung,exynos5433-dw-mshc-smu - samsung,exynos7885-dw-mshc-smu - samsung,exynos850-dw-mshc-smu + - samsung,exynos8890-dw-mshc-smu - samsung,exynos8895-dw-mshc-smu - const: samsung,exynos7-dw-mshc-smu diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml index 22d1f50c3fd1a0..594bd174ff211e 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml @@ -48,6 +48,7 @@ properties: - qcom,qcs615-sdhci - qcom,qcs8300-sdhci - qcom,qdu1000-sdhci + - qcom,sa8775p-sdhci - qcom,sar2130p-sdhci - qcom,sc7180-sdhci - qcom,sc7280-sdhci diff --git a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml index e7c06032048a3a..186ce8ff4626a1 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml @@ -44,12 +44,29 @@ allOf: items: - const: default - const: state_cmd_gpio - pinctrl-0: - description: - Should contain default pinctrl. + minItems: 1 + pinctrl-1: description: Should switch CMD pin to GPIO mode as a high output. 
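+ # For example, a board could select the two states as follows + # (a sketch; the phandle names are hypothetical): + # pinctrl-names = "default", "state_cmd_gpio"; + # pinctrl-0 = <&mmc_pins_default>; + # pinctrl-1 = <&mmc_cmd_gpio_high>;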
+ - if: + properties: + compatible: + contains: + const: mrvl,pxav3-mmc + then: + properties: + pinctrl-names: + description: + Optional for increasing stability of the controller at fast bus clocks. + items: + - const: default + - const: state_uhs + minItems: 1 + + pinctrl-1: + description: + Should switch the drive strength of the data pins to high. properties: compatible: @@ -82,6 +99,14 @@ properties: - const: io - const: core + pinctrl-names: true + + pinctrl-0: + description: + Should contain default pinctrl. + + pinctrl-1: true + mrvl,clk-delay-cycles: description: Specify a number of cycles to delay for tuning. $ref: /schemas/types.yaml#/definitions/uint32 diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 8597ea625edba5..d2e578d6b83b88 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -33,6 +33,7 @@ properties: - items: - enum: - fsl,imx91-ddr-pmu + - fsl,imx94-ddr-pmu - fsl,imx95-ddr-pmu - const: fsl,imx93-ddr-pmu diff --git a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml index d9501df4288692..c35d3164280538 100644 --- a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml +++ b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml @@ -47,21 +47,19 @@ properties: const: 0 clocks: + minItems: 1 maxItems: 3 - description: Reference clocks for CP110; MG clock, MG Core clock, AXI clock clock-names: - items: - - const: mg_clk - - const: mg_core_clk - - const: axi_clk + minItems: 1 + maxItems: 3 marvell,system-controller: description: Phandle to the Marvell system controller (CP110 only) $ref: /schemas/types.yaml#/definitions/phandle patternProperties: - '^phy@[0-2]$': + '^phy@[0-5]$': description: A COMPHY lane child node type: object additionalProperties: false @@ -69,10 +67,14 @@ patternProperties: properties: reg: description: COMPHY lane number + maximum: 5 '#phy-cells': const: 1 + connector: + type: object + required: - reg - '#phy-cells' @@ -91,13 +93,24 @@ allOf: then: properties: - clocks: false - clock-names: false + clocks: + maxItems: 1 + clock-names: + const: xtal required: - reg-names else: + properties: + clocks: + minItems: 3 + clock-names: + items: + - const: mg_clk + - const: mg_core_clk + - const: axi_clk + required: - marvell,system-controller diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml index a1ae8c7988c891..b6f140bf5b3b2f 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml @@ -176,6 +176,8 @@ allOf: compatible: contains: enum: + - qcom,sa8775p-qmp-gen4x2-pcie-phy + - qcom,sa8775p-qmp-gen4x4-pcie-phy - qcom,sc8280xp-qmp-gen3x1-pcie-phy - qcom,sc8280xp-qmp-gen3x2-pcie-phy - qcom,sc8280xp-qmp-gen3x4-pcie-phy @@ -197,8 +199,6 @@ allOf: contains: enum: - qcom,qcs8300-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x4-pcie-phy then: properties: clocks: diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml new file mode 100644 index 00000000000000..ae6c13a746b9c6 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2712c0-pinctrl.yaml @@ -0,0 
+1,137 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/brcm,bcm2712c0-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom STB family pin controller + +maintainers: + - Ivan T. Ivanov + - A. della Porta + +description: > + Broadcom's STB family of memory-mapped pin controllers. + + This includes the pin controllers inside the BCM2712 SoC, which are + instances of the STB family. The BCM2712 has two silicon variants, + C0 and D0, which differ slightly in terms of register layout. + + The -aon- (Always On) variant is the same IP block but differs in the + number of associated pins and in the pinmux functions available for + each of those pins. + +allOf: + - $ref: pinctrl.yaml# + +properties: + compatible: + enum: + - brcm,bcm2712c0-pinctrl + - brcm,bcm2712c0-aon-pinctrl + - brcm,bcm2712d0-pinctrl + - brcm,bcm2712d0-aon-pinctrl + + reg: + maxItems: 1 + +patternProperties: + '-state$': + oneOf: + - $ref: '#/$defs/brcmstb-pinctrl-state' + - patternProperties: + '-pins$': + $ref: '#/$defs/brcmstb-pinctrl-state' + additionalProperties: false + +$defs: + brcmstb-pinctrl-state: + allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + + description: > + Pin controller client devices use pin configuration subnodes (children + and grandchildren) for desired pin configuration. + + Client device subnodes use the standard properties below. + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode (either this or "groups" must be specified). + items: + pattern: '^((aon_)?s?gpio[0-6]?[0-9])|(emmc_(clk|cmd|dat[0-7]|ds))$' + + function: + description: + Specify the alternative function to be configured for the specified + pins.
+ enum: [ gpio, alt1, alt2, alt3, alt4, alt5, alt6, alt7, alt8, + aon_cpu_standbyb, aon_fp_4sec_resetb, aon_gpclk, aon_pwm, + arm_jtag, aud_fs_clk0, avs_pmu_bsc, bsc_m0, bsc_m1, bsc_m2, + bsc_m3, clk_observe, ctl_hdmi_5v, enet0, enet0_mii, enet0_rgmii, + ext_sc_clk, fl0, fl1, gpclk0, gpclk1, gpclk2, hdmi_tx0_auto_i2c, + hdmi_tx0_bsc, hdmi_tx1_auto_i2c, hdmi_tx1_bsc, i2s_in, i2s_out, + ir_in, mtsif, mtsif_alt, mtsif_alt1, pdm, pkt, pm_led_out, sc0, + sd0, sd2, sd_card_a, sd_card_b, sd_card_c, sd_card_d, sd_card_e, + sd_card_f, sd_card_g, spdif_out, spi_m, spi_s, sr_edm_sense, te0, + te1, tsio, uart0, uart1, uart2, usb_pwr, usb_vbus, uui, vc_i2c0, + vc_i2c3, vc_i2c4, vc_i2c5, vc_i2csl, vc_pcm, vc_pwm0, vc_pwm1, + vc_spi0, vc_spi3, vc_spi4, vc_spi5, vc_uart0, vc_uart2, vc_uart3, + vc_uart4 ] + + bias-disable: true + bias-pull-down: true + bias-pull-up: true + + required: + - pins + + if: + properties: + pins: + not: + contains: + pattern: "^emmc_(clk|cmd|dat[0-7]|ds)$" + then: + required: + - function + else: + properties: + function: false + + additionalProperties: false + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + pinctrl@7d504100 { + compatible = "brcm,bcm2712c0-pinctrl"; + reg = <0x7d504100 0x30>; + + bt-shutdown-default-state { + function = "gpio"; + pins = "gpio29"; + }; + + uarta-default-state { + rts-tx-pins { + function = "uart0"; + pins = "gpio24", "gpio26"; + bias-disable; + }; + + cts-rx-pins { + function = "uart0"; + pins = "gpio25", "gpio27"; + bias-pull-up; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt deleted file mode 100644 index 5682b2010e5009..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt +++ /dev/null @@ -1,99 +0,0 @@ -Broadcom BCM2835 GPIO (and pinmux) controller - -The BCM2835 GPIO module is a combined GPIO controller, (GPIO) interrupt -controller, and pinmux/control device. - -Required properties: -- compatible: "brcm,bcm2835-gpio" -- compatible: should be one of: - "brcm,bcm2835-gpio" - BCM2835 compatible pinctrl - "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl - "brcm,bcm2711-gpio" - BCM2711 compatible pinctrl - "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl -- reg: Should contain the physical address of the GPIO module's registers. -- gpio-controller: Marks the device node as a GPIO controller. -- #gpio-cells : Should be two. The first cell is the pin number and the - second cell is used to specify optional parameters: - - bit 0 specifies polarity (0 for normal, 1 for inverted) -- interrupts : The interrupt outputs from the controller. One interrupt per - individual bank followed by the "all banks" interrupt. For BCM7211, an - additional set of per-bank interrupt line and an "all banks" wake-up - interrupt may be specified. -- interrupt-controller: Marks the device node as an interrupt controller. -- #interrupt-cells : Should be 2. - The first cell is the GPIO number. - The second cell is used to specify flags: - bits[3:0] trigger type and level flags: - 1 = low-to-high edge triggered. - 2 = high-to-low edge triggered. - 4 = active high level-sensitive. - 8 = active low level-sensitive. - Valid combinations are 1, 2, 3, 4, 8. - -Please refer to ../gpio/gpio.txt for a general description of GPIO bindings. 
- -Please refer to pinctrl-bindings.txt in this directory for details of the -common pinctrl bindings used by client devices, including the meaning of the -phrase "pin configuration node". - -Each pin configuration node lists the pin(s) to which it applies, and one or -more of the mux function to select on those pin(s), and pull-up/down -configuration. Each subnode only affects those parameters that are explicitly -listed. In other words, a subnode that lists only a mux function implies no -information about any pull configuration. Similarly, a subnode that lists only -a pul parameter implies no information about the mux function. - -The BCM2835 pin configuration and multiplexing supports the generic bindings. -For details on each properties, you can refer to ./pinctrl-bindings.txt. - -Required sub-node properties: - - pins - - function - -Optional sub-node properties: - - bias-disable - - bias-pull-up - - bias-pull-down - - output-high - - output-low - -Legacy pin configuration and multiplexing binding: -*** (Its use is deprecated, use generic multiplexing and configuration -bindings instead) - -Required subnode-properties: -- brcm,pins: An array of cells. Each cell contains the ID of a pin. Valid IDs - are the integer GPIO IDs; 0==GPIO0, 1==GPIO1, ... 53==GPIO53. - -Optional subnode-properties: -- brcm,function: Integer, containing the function to mux to the pin(s): - 0: GPIO in - 1: GPIO out - 2: alt5 - 3: alt4 - 4: alt0 - 5: alt1 - 6: alt2 - 7: alt3 -- brcm,pull: Integer, representing the pull-down/up to apply to the pin(s): - 0: none - 1: down - 2: up - -Each of brcm,function and brcm,pull may contain either a single value which -will be applied to all pins in brcm,pins, or 1 value for each entry in -brcm,pins. - -Example: - - gpio: gpio { - compatible = "brcm,bcm2835-gpio"; - reg = <0x2200000 0xb4>; - interrupts = <2 17>, <2 19>, <2 18>, <2 20>; - - gpio-controller; - #gpio-cells = <2>; - - interrupt-controller; - #interrupt-cells = <2>; - }; diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml new file mode 100644 index 00000000000000..6514f347f6bc6a --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.yaml @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/brcm,bcm2835-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom BCM2835 GPIO (and pinmux) controller + +maintainers: + - Florian Fainelli + +description: > + The BCM2835 GPIO module is a combined GPIO controller, (GPIO) interrupt + controller, and pinmux/control device. + +properties: + compatible: + enum: + - brcm,bcm2835-gpio + - brcm,bcm2711-gpio + - brcm,bcm7211-gpio + + reg: + maxItems: 1 + + '#gpio-cells': + const: 2 + + gpio-controller: true + gpio-ranges: true + gpio-line-names: true + + interrupts: + description: > + Interrupt outputs: one per bank, then the combined "all banks" line. + BCM7211 may specify up to four per-bank wake-up lines and one combined + wake-up interrupt.
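+ + As a sketch, a hypothetical BCM7211 node could therefore use up to + interrupts = <2 17>, <2 19>, <2 18>, <2 20>, <2 21>, + <3 17>, <3 19>, <3 18>, <3 20>, <3 21>; + i.e. four per-bank lines, the "all banks" line, four per-bank wake-up + lines, and the combined wake-up line (the interrupt specifiers here + are made up for illustration).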
+ minItems: 4 + maxItems: 10 + + '#interrupt-cells': + const: 2 + + interrupt-controller: true + +additionalProperties: + oneOf: + - type: object + additionalProperties: false + + patternProperties: + '^pins?-': + type: object + allOf: + - $ref: /schemas/pinctrl/pincfg-node.yaml# + - $ref: /schemas/pinctrl/pinmux-node.yaml# + additionalProperties: false + + properties: + pins: true + function: true + bias-disable: true + bias-pull-up: true + bias-pull-down: true + output-high: true + output-low: true + + required: + - pins + - function + + - type: object + additionalProperties: false + deprecated: true + + properties: + brcm,pins: + description: + GPIO pin numbers for legacy configuration. + $ref: /schemas/types.yaml#/definitions/uint32-array + + brcm,function: + description: + Legacy mux function for the pins (0=input, 1=output, 2–7=alt functions). + $ref: /schemas/types.yaml#/definitions/uint32-array + maximum: 7 + + brcm,pull: + description: > + Legacy pull setting for the pins (0=none, 1=pull-down, 2=pull-up). + $ref: /schemas/types.yaml#/definitions/uint32-array + maximum: 2 + + required: + - brcm,pins + +allOf: + - if: + properties: + compatible: + contains: + enum: + - brcm,bcm2835-gpio + - brcm,bcm2711-gpio + then: + properties: + interrupts: + maxItems: 5 + +examples: + - | + gpio@2200000 { + compatible = "brcm,bcm2835-gpio"; + reg = <0x2200000 0xb4>; + interrupts = <2 17>, <2 19>, <2 18>, <2 20>, <2 21>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt deleted file mode 100644 index a73cbeb0f309de..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.txt +++ /dev/null @@ -1,123 +0,0 @@ -Broadcom iProc GPIO/PINCONF Controller - -Required properties: - -- compatible: - "brcm,iproc-gpio" for the generic iProc based GPIO controller IP that - supports full-featured pinctrl and GPIO functions used in various iProc - based SoCs - - May contain an SoC-specific compatibility string to accommodate any - SoC-specific features - - "brcm,cygnus-ccm-gpio", "brcm,cygnus-asiu-gpio", or - "brcm,cygnus-crmu-gpio" for Cygnus SoCs - - "brcm,iproc-nsp-gpio" for the iProc NSP SoC that has drive strength support - disabled - - "brcm,iproc-stingray-gpio" for the iProc Stingray SoC that has the general - pinctrl support completely disabled in this IP block. In Stingray, a - different IP block is used to handle pinctrl related functions - -- reg: - Define the base and range of the I/O address space that contains SoC -GPIO/PINCONF controller registers - -- ngpios: - Total number of in-use slots in GPIO controller - -- #gpio-cells: - Must be two. The first cell is the GPIO pin number (within the -controller's pin space) and the second cell is used for the following: - bit[0]: polarity (0 for active high and 1 for active low) - -- gpio-controller: - Specifies that the node is a GPIO controller - -Optional properties: - -- interrupts: - Interrupt ID - -- interrupt-controller: - Specifies that the node is an interrupt controller - -- gpio-ranges: - Specifies the mapping between gpio controller and pin-controllers pins. - This requires 4 fields in cells defined as - - 1. Phandle of pin-controller. - 2. GPIO base pin offset. - 3 Pin-control base pin offset. - 4. number of gpio pins which are linearly mapped from pin base. 
- -Supported generic PINCONF properties in child nodes: - -- pins: - The list of pins (within the controller's own pin space) that properties -in the node apply to. Pin names are "gpio-" - -- bias-disable: - Disable pin bias - -- bias-pull-up: - Enable internal pull up resistor - -- bias-pull-down: - Enable internal pull down resistor - -- drive-strength: - Valid drive strength values include 2, 4, 6, 8, 10, 12, 14, 16 (mA) - -Example: - gpio_ccm: gpio@1800a000 { - compatible = "brcm,cygnus-ccm-gpio"; - reg = <0x1800a000 0x50>, - <0x0301d164 0x20>; - ngpios = <24>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - - touch_pins: touch_pins { - pwr: pwr { - pins = "gpio-0"; - drive-strength = <16>; - }; - - event: event { - pins = "gpio-1"; - bias-pull-up; - }; - }; - }; - - gpio_asiu: gpio@180a5000 { - compatible = "brcm,cygnus-asiu-gpio"; - reg = <0x180a5000 0x668>; - ngpios = <146>; - #gpio-cells = <2>; - gpio-controller; - interrupts = ; - interrupt-controller; - gpio-ranges = <&pinctrl 0 42 1>, - <&pinctrl 1 44 3>; - }; - - /* - * Touchscreen that uses the CCM GPIO 0 and 1 - */ - tsc { - ... - ... - gpio-pwr = <&gpio_ccm 0 0>; - gpio-event = <&gpio_ccm 1 0>; - }; - - /* Bluetooth that uses the ASIU GPIO 5, with polarity inverted */ - bluetooth { - ... - ... - bcm,rfkill-bank-sel = <&gpio_asiu 5 1> - } diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml new file mode 100644 index 00000000000000..a0ed308b7fc848 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/brcm,iproc-gpio.yaml @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/brcm,iproc-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom iProc GPIO/PINCONF Controller + +maintainers: + - Ray Jui + - Scott Branden + +properties: + compatible: + oneOf: + - enum: + - brcm,cygnus-asiu-gpio + - brcm,cygnus-ccm-gpio + - brcm,cygnus-crmu-gpio + - brcm,iproc-gpio + - brcm,iproc-stingray-gpio + - items: + - enum: + - brcm,iproc-hr2-gpio + - brcm,iproc-nsp-gpio + - const: brcm,iproc-gpio + + reg: + minItems: 1 + items: + - description: GPIO Bank registers + - description: IO Ctrl registers + + "#gpio-cells": + const: 2 + + gpio-controller: true + + gpio-ranges: true + + ngpios: true + + "#interrupt-cells": + const: 2 + + interrupts: + maxItems: 1 + + interrupt-controller: true + +required: + - compatible + - reg + - "#gpio-cells" + - gpio-controller + - ngpios + +patternProperties: + '-pins$': + type: object + additionalProperties: + description: Pin configuration child nodes. 
+ allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + additionalProperties: false + + properties: + pins: + items: + pattern: '^gpio-' + + bias-disable: true + bias-pull-up: true + bias-pull-down: true + + drive-strength: + enum: [ 2, 4, 6, 8, 10, 12, 14, 16 ] + + required: + - pins + +additionalProperties: false + +examples: + - | + #include + + gpio@1800a000 { + compatible = "brcm,cygnus-ccm-gpio"; + reg = <0x1800a000 0x50>, + <0x0301d164 0x20>; + ngpios = <24>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupts = ; + interrupt-controller; + + touch-pins { + pwr { + pins = "gpio-0"; + drive-strength = <16>; + }; + + event { + pins = "gpio-1"; + bias-pull-up; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt deleted file mode 100644 index 1e70a8aff2600e..00000000000000 --- a/Documentation/devicetree/bindings/pinctrl/fsl,mxs-pinctrl.txt +++ /dev/null @@ -1,127 +0,0 @@ -* Freescale MXS Pin Controller - -The pins controlled by mxs pin controller are organized in banks, each bank -has 32 pins. Each pin has 4 multiplexing functions, and generally, the 4th -function is GPIO. The configuration on the pins includes drive strength, -voltage and pull-up. - -Required properties: -- compatible: "fsl,imx23-pinctrl" or "fsl,imx28-pinctrl" -- reg: Should contain the register physical address and length for the - pin controller. - -Please refer to pinctrl-bindings.txt in this directory for details of the -common pinctrl bindings used by client devices. - -The node of mxs pin controller acts as a container for an arbitrary number of -subnodes. Each of these subnodes represents some desired configuration for -a group of pins, and only affects those parameters that are explicitly listed. -In other words, a subnode that describes a drive strength parameter implies no -information about pull-up. For this reason, even seemingly boolean values are -actually tristates in this binding: unspecified, off, or on. Unspecified is -represented as an absent property, and off/on are represented as integer -values 0 and 1. - -Those subnodes under mxs pin controller node will fall into two categories. -One is to set up a group of pins for a function, both mux selection and pin -configurations, and it's called group node in the binding document. The other -one is to adjust the pin configuration for some particular pins that need a -different configuration than what is defined in group node. The binding -document calls this type of node config node. - -On mxs, there is no hardware pin group. The pin group in this binding only -means a group of pins put together for particular peripheral to work in -particular function, like SSP0 functioning as mmc0-8bit. That said, the -group node should include all the pins needed for one function rather than -having these pins defined in several group nodes. It also means each of -"pinctrl-*" phandle in client device node should only have one group node -pointed in there, while the phandle can have multiple config node referenced -there to adjust configurations for some pins in the group. - -Required subnode-properties: -- fsl,pinmux-ids: An integer array. Each integer in the array specify a pin - with given mux function, with bank, pin and mux packed as below. 
- - [15..12] : bank number - [11..4] : pin number - [3..0] : mux selection - - This integer with mux selection packed is used as an entity by both group - and config nodes to identify a pin. The mux selection in the integer takes - effects only on group node, and will get ignored by driver with config node, - since config node is only meant to set up pin configurations. - - Valid values for these integers are listed below. - -- reg: Should be the index of the group nodes for same function. This property - is required only for group nodes, and should not be present in any config - nodes. - -Optional subnode-properties: -- fsl,drive-strength: Integer. - 0: MXS_DRIVE_4mA - 1: MXS_DRIVE_8mA - 2: MXS_DRIVE_12mA - 3: MXS_DRIVE_16mA -- fsl,voltage: Integer. - 0: MXS_VOLTAGE_LOW - 1.8 V - 1: MXS_VOLTAGE_HIGH - 3.3 V -- fsl,pull-up: Integer. - 0: MXS_PULL_DISABLE - Disable the internal pull-up - 1: MXS_PULL_ENABLE - Enable the internal pull-up - -Note that when enabling the pull-up, the internal pad keeper gets disabled. -Also, some pins doesn't have a pull up, in that case, setting the fsl,pull-up -will only disable the internal pad keeper. - -Examples: - -pinctrl@80018000 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,imx28-pinctrl"; - reg = <0x80018000 2000>; - - mmc0_8bit_pins_a: mmc0-8bit@0 { - reg = <0>; - fsl,pinmux-ids = < - MX28_PAD_SSP0_DATA0__SSP0_D0 - MX28_PAD_SSP0_DATA1__SSP0_D1 - MX28_PAD_SSP0_DATA2__SSP0_D2 - MX28_PAD_SSP0_DATA3__SSP0_D3 - MX28_PAD_SSP0_DATA4__SSP0_D4 - MX28_PAD_SSP0_DATA5__SSP0_D5 - MX28_PAD_SSP0_DATA6__SSP0_D6 - MX28_PAD_SSP0_DATA7__SSP0_D7 - MX28_PAD_SSP0_CMD__SSP0_CMD - MX28_PAD_SSP0_DETECT__SSP0_CARD_DETECT - MX28_PAD_SSP0_SCK__SSP0_SCK - >; - fsl,drive-strength = ; - fsl,voltage = ; - fsl,pull-up = ; - }; - - mmc_cd_cfg: mmc-cd-cfg { - fsl,pinmux-ids = ; - fsl,pull-up = ; - }; - - mmc_sck_cfg: mmc-sck-cfg { - fsl,pinmux-ids = ; - fsl,drive-strength = ; - fsl,pull-up = ; - }; -}; - -In this example, group node mmc0-8bit defines a group of pins for mxs SSP0 -to function as a 8-bit mmc device, with 8mA, 3.3V and pull-up configurations -applied on all these pins. And config nodes mmc-cd-cfg and mmc-sck-cfg are -adjusting the configuration for pins card-detection and clock from what group -node mmc0-8bit defines. Only the configuration properties to be adjusted need -to be listed in the config nodes. - -Valid values for i.MX28/i.MX23 pinmux-id are defined in -arch/arm/boot/dts/imx28-pinfunc.h and arch/arm/boot/dts/imx23-pinfunc.h. -The definitions for the padconfig properties can be found in -arch/arm/boot/dts/mxs-pinfunc.h. diff --git a/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml index 464879274cae4c..3db2438fadc78b 100644 --- a/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml @@ -48,6 +48,8 @@ properties: description: GPIO valid number range. 
+ gpio-line-names: true + interrupt-controller: true interrupts: diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml new file mode 100644 index 00000000000000..ac764d0ac4b638 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra186-pinmux.yaml @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/nvidia,tegra186-pinmux.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NVIDIA Tegra186 Pinmux Controller + +maintainers: + - Thierry Reding + - Jon Hunter + +properties: + compatible: + enum: + - nvidia,tegra186-pinmux + - nvidia,tegra186-pinmux-aon + + reg: + items: + - description: pinmux registers + +patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + + # pin groups + additionalProperties: + $ref: nvidia,tegra-pinmux-common.yaml + unevaluatedProperties: false + properties: + nvidia,function: + enum: [ aud, can0, can1, ccla, dca, dcb, dcc, directdc, directdc1, + displaya, displayb, dmic1, dmic2, dmic3, dmic4, dmic5, dp, + dspk0, dspk1, dtv, eqos, extperiph1, extperiph2, extperiph3, + extperiph4, gp, gpio, hdmi, i2c1, i2c2, i2c3, i2c5, i2c7, + i2c8, i2c9, i2s1, i2s2, i2s3, i2s4, i2s5, i2s6, iqc0, iqc1, + nv, pe, pe0, pe1, pe2, qspi, rsvd0, rsvd1, rsvd2, rsvd3, + sata, sce, sdmmc1, sdmmc2, sdmmc3, sdmmc4, soc, spdif, spi1, + spi2, spi3, spi4, touch, uarta, uartb, uartc, uartd, uarte, + uartf, uartg, ufs0, usb, vgp1, vgp2, vgp3, vgp4, vgp5, vgp6, + wdt ] + + nvidia,pull: true + nvidia,tristate: true + nvidia,schmitt: true + nvidia,enable-input: true + nvidia,open-drain: true + nvidia,lock: true + nvidia,drive-type: true + nvidia,io-hv: true + + required: + - nvidia,pins + +additionalProperties: false + +allOf: + - if: + properties: + compatible: + const: nvidia,tegra186-pinmux + then: + patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + additionalProperties: + properties: + nvidia,pins: + description: An array of strings. Each string contains the name + of a pin or group. Valid values for these names are listed + below. 
+ items: + enum: [ pex_l0_rst_n_pa0, pex_l0_clkreq_n_pa1, + pex_wake_n_pa2, pex_l1_rst_n_pa3, + pex_l1_clkreq_n_pa4, pex_l2_rst_n_pa5, + pex_l2_clkreq_n_pa6, uart4_tx_pb0, uart4_rx_pb1, + uart4_rts_pb2, uart4_cts_pb3, gpio_wan1_pb4, + gpio_wan2_pb5, gpio_wan3_pb6, gpio_wan4_pc0, + dap2_sclk_pc1, dap2_dout_pc2, dap2_din_pc3, + dap2_fs_pc4, gen1_i2c_scl_pc5, gen1_i2c_sda_pc6, + sdmmc1_clk_pd0, sdmmc1_cmd_pd1, sdmmc1_dat0_pd2, + sdmmc1_dat1_pd3, sdmmc1_dat2_pd4, sdmmc1_dat3_pd5, + eqos_txc_pe0, eqos_td0_pe1, eqos_td1_pe2, + eqos_td2_pe3, eqos_td3_pe4, eqos_tx_ctl_pe5, + eqos_rd0_pe6, eqos_rd1_pe7, eqos_rd2_pf0, + eqos_rd3_pf1, eqos_rx_ctl_pf2, eqos_rxc_pf3, + eqos_mdio_pf4, eqos_mdc_pf5, sdmmc3_clk_pg0, + sdmmc3_cmd_pg1, sdmmc3_dat0_pg2, sdmmc3_dat1_pg3, + sdmmc3_dat2_pg4, sdmmc3_dat3_pg5, gpio_wan5_ph0, + gpio_wan6_ph1, gpio_wan7_ph2, gpio_wan8_ph3, + bcpu_pwr_req_ph4, mcpu_pwr_req_ph5, gpu_pwr_req_ph6, + gpio_pq0_pi0, gpio_pq1_pi1, gpio_pq2_pi2, + gpio_pq3_pi3, gpio_pq4_pi4, gpio_pq5_pi5, + gpio_pq6_pi6, gpio_pq7_pi7, dap1_sclk_pj0, + dap1_dout_pj1, dap1_din_pj2, dap1_fs_pj3, + aud_mclk_pj4, gpio_aud0_pj5, gpio_aud1_pj6, + gpio_aud2_pj7, gpio_aud3_pk0, gen7_i2c_scl_pl0, + gen7_i2c_sda_pl1, gen9_i2c_scl_pl2, gen9_i2c_sda_pl3, + usb_vbus_en0_pl4, usb_vbus_en1_pl5, gp_pwm6_pl6, + gp_pwm7_pl7, dmic1_dat_pm0, dmic1_clk_pm1, + dmic2_dat_pm2, dmic2_clk_pm3, dmic4_dat_pm4, + dmic4_clk_pm5, gpio_cam1_pn0, gpio_cam2_pn1, + gpio_cam3_pn2, gpio_cam4_pn3, gpio_cam6_pn5, + gpio_cam7_pn6, extperiph1_clk_po0, + extperiph2_clk_po1, cam_i2c_scl_po2, cam_i2c_sda_po3, + dp_aux_ch0_hpd_pp0, dp_aux_ch1_hpd_pp1, hdmi_cec_pp2, + gpio_edp0_pp3, gpio_edp1_pp4, gpio_edp2_pp5, + gpio_edp3_pp6, directdc1_clk_pq0, directdc1_in_pq1, + directdc1_out0_pq2, directdc1_out1_pq3, + directdc1_out2_pq4, directdc1_out3_pq5, + qspi_sck_pr0, qspi_io0_pr1, qspi_io1_pr2, + qspi_io2_pr3, qspi_io3_pr4, qspi_cs_n_pr5, + uart1_tx_pt0, uart1_rx_pt1, uart1_rts_pt2, + uart1_cts_pt3, uart2_tx_px0, uart2_rx_px1, + uart2_rts_px2, uart2_cts_px3, uart5_tx_px4, + uart5_rx_px5, uart5_rts_px6, uart5_cts_px7, + gpio_mdm1_py0, gpio_mdm2_py1, gpio_mdm3_py2, + gpio_mdm4_py3, gpio_mdm5_py4, gpio_mdm6_py5, + gpio_mdm7_py6, ufs0_ref_clk_pbb0, ufs0_rst_pbb1, + dap4_sclk_pcc0, dap4_dout_pcc1, dap4_din_pcc2, + dap4_fs_pcc3, directdc_comp, sdmmc1_comp, eqos_comp, + sdmmc3_comp, qspi_comp, + # drive groups + drive_gpio_aud3_pk0, drive_gpio_aud2_pj7, + drive_gpio_aud1_pj6, drive_gpio_aud0_pj5, + drive_aud_mclk_pj4, drive_dap1_fs_pj3, + drive_dap1_din_pj2, drive_dap1_dout_pj1, + drive_dap1_sclk_pj0, drive_dmic1_clk_pm1, + drive_dmic1_dat_pm0, drive_dmic2_dat_pm2, + drive_dmic2_clk_pm3, drive_dmic4_dat_pm4, + drive_dmic4_clk_pm5, drive_dap4_fs_pcc3, + drive_dap4_din_pcc2, drive_dap4_dout_pcc1, + drive_dap4_sclk_pcc0, drive_extperiph2_clk_po1, + drive_extperiph1_clk_po0, drive_cam_i2c_sda_po3, + drive_cam_i2c_scl_po2, drive_gpio_cam1_pn0, + drive_gpio_cam2_pn1, drive_gpio_cam3_pn2, + drive_gpio_cam4_pn3, drive_gpio_cam5_pn4, + drive_gpio_cam6_pn5, drive_gpio_cam7_pn6, + drive_dap2_din_pc3, drive_dap2_dout_pc2, + drive_dap2_fs_pc4, drive_dap2_sclk_pc1, + drive_uart4_cts_pb3, drive_uart4_rts_pb2, + drive_uart4_rx_pb1, drive_uart4_tx_pb0, + drive_gpio_wan4_pc0, drive_gpio_wan3_pb6, + drive_gpio_wan2_pb5, drive_gpio_wan1_pb4, + drive_gen1_i2c_scl_pc5, drive_gen1_i2c_sda_pc6, + drive_uart1_cts_pt3, drive_uart1_rts_pt2, + drive_uart1_rx_pt1, drive_uart1_tx_pt0, + drive_directdc1_out3_pq5, drive_directdc1_out2_pq4, + drive_directdc1_out1_pq3, drive_directdc1_out0_pq2, + 
drive_directdc1_in_pq1, drive_directdc1_clk_pq0, + drive_gpio_pq0_pi0, drive_gpio_pq1_pi1, + drive_gpio_pq2_pi2, drive_gpio_pq3_pi3, + drive_gpio_pq4_pi4, drive_gpio_pq5_pi5, + drive_gpio_pq6_pi6, drive_gpio_pq7_pi7, + drive_gpio_edp2_pp5, drive_gpio_edp3_pp6, + drive_gpio_edp0_pp3, drive_gpio_edp1_pp4, + drive_dp_aux_ch0_hpd_pp0, drive_dp_aux_ch1_hpd_pp1, + drive_hdmi_cec_pp2, drive_pex_l2_clkreq_n_pa6, + drive_pex_wake_n_pa2, drive_pex_l1_clkreq_n_pa4, + drive_pex_l1_rst_n_pa3, drive_pex_l0_clkreq_n_pa1, + drive_pex_l0_rst_n_pa0, drive_pex_l2_rst_n_pa5, + drive_sdmmc1_clk_pd0, drive_sdmmc1_cmd_pd1, + drive_sdmmc1_dat3_pd5, drive_sdmmc1_dat2_pd4, + drive_sdmmc1_dat1_pd3, drive_sdmmc1_dat0_pd2, + drive_eqos_td3_pe4, drive_eqos_td2_pe3, + drive_eqos_td1_pe2, drive_eqos_td0_pe1, + drive_eqos_rd3_pf1, drive_eqos_rd2_pf0, + drive_eqos_rd1_pe7, drive_eqos_mdio_pf4, + drive_eqos_rd0_pe6, drive_eqos_mdc_pf5, + drive_eqos_txc_pe0, drive_eqos_rxc_pf3, + drive_eqos_tx_ctl_pe5, drive_eqos_rx_ctl_pf2, + drive_sdmmc3_dat3_pg5, drive_sdmmc3_dat2_pg4, + drive_sdmmc3_dat1_pg3, drive_sdmmc3_dat0_pg2, + drive_sdmmc3_cmd_pg1, drive_sdmmc3_clk_pg0, + drive_qspi_io3_pr4, drive_qspi_io2_pr3, + drive_qspi_io1_pr2, drive_qspi_io0_pr1, + drive_qspi_sck_pr0, drive_qspi_cs_n_pr5, + drive_gpio_wan8_ph3, drive_gpio_wan7_ph2, + drive_gpio_wan6_ph1, drive_gpio_wan5_ph0, + drive_uart2_tx_px0, drive_uart2_rx_px1, + drive_uart2_rts_px2, drive_uart2_cts_px3, + drive_uart5_rx_px5, drive_uart5_tx_px4, + drive_uart5_rts_px6, drive_uart5_cts_px7, + drive_gpio_mdm1_py0, drive_gpio_mdm2_py1, + drive_gpio_mdm3_py2, drive_gpio_mdm4_py3, + drive_gpio_mdm5_py4, drive_gpio_mdm6_py5, + drive_gpio_mdm7_py6, drive_bcpu_pwr_req_ph4, + drive_mcpu_pwr_req_ph5, drive_gpu_pwr_req_ph6, + drive_gen7_i2c_scl_pl0, drive_gen7_i2c_sda_pl1, + drive_gen9_i2c_sda_pl3, drive_gen9_i2c_scl_pl2, + drive_usb_vbus_en0_pl4, drive_usb_vbus_en1_pl5, + drive_gp_pwm7_pl7, drive_gp_pwm6_pl6, + drive_ufs0_rst_pbb1, drive_ufs0_ref_clk_pbb0, + drive_directdc_comp, drive_sdmmc1_comp, + drive_eqos_comp, drive_sdmmc3_comp, drive_sdmmc4_clk, + drive_sdmmc4_cmd, drive_sdmmc4_dqs, + drive_sdmmc4_dat7, drive_sdmmc4_dat6, + drive_sdmmc4_dat5, drive_sdmmc4_dat4, + drive_sdmmc4_dat3, drive_sdmmc4_dat2, + drive_sdmmc4_dat1, drive_sdmmc4_dat0, + drive_qspi_comp ] + + - if: + properties: + compatible: + const: nvidia,tegra186-pinmux-aon + then: + patternProperties: + "^pinmux(-[a-z0-9-]+)?$": + type: object + additionalProperties: + properties: + nvidia,pins: + items: + enum: [ pwr_i2c_scl_ps0, pwr_i2c_sda_ps1, batt_oc_ps2, + safe_state_ps3, vcomp_alert_ps4, gpio_dis0_pu0, + gpio_dis1_pu1, gpio_dis2_pu2, gpio_dis3_pu3, + gpio_dis4_pu4, gpio_dis5_pu5, gpio_sen0_pv0, + gpio_sen1_pv1, gpio_sen2_pv2, gpio_sen3_pv3, + gpio_sen4_pv4, gpio_sen5_pv5, gpio_sen6_pv6, + gpio_sen7_pv7, gen8_i2c_scl_pw0, gen8_i2c_sda_pw1, + uart3_tx_pw2, uart3_rx_pw3, uart3_rts_pw4, + uart3_cts_pw5, uart7_tx_pw6, uart7_rx_pw7, + can1_dout_pz0, can1_din_pz1, can0_dout_pz2, + can0_din_pz3, can_gpio0_paa0, can_gpio1_paa1, + can_gpio2_paa2, can_gpio3_paa3, can_gpio4_paa4, + can_gpio5_paa5, can_gpio6_paa6, can_gpio7_paa7, + gpio_sen8_pee0, gpio_sen9_pee1, touch_clk_pee2, + power_on_pff0, gpio_sw1_pff1, gpio_sw2_pff2, + gpio_sw3_pff3, gpio_sw4_pff4, shutdown, pmu_int, + soc_pwr_req, clk_32k_in, + # drive groups + drive_touch_clk_pee2, drive_uart3_cts_pw5, + drive_uart3_rts_pw4, drive_uart3_rx_pw3, + drive_uart3_tx_pw2, drive_gen8_i2c_sda_pw1, + drive_gen8_i2c_scl_pw0, drive_uart7_rx_pw7, + drive_uart7_tx_pw6, 
drive_gpio_sen0_pv0, + drive_gpio_sen1_pv1, drive_gpio_sen2_pv2, + drive_gpio_sen3_pv3, drive_gpio_sen4_pv4, + drive_gpio_sen5_pv5, drive_gpio_sen6_pv6, + drive_gpio_sen7_pv7, drive_gpio_sen8_pee0, + drive_gpio_sen9_pee1, drive_can_gpio7_paa7, + drive_can1_dout_pz0, drive_can1_din_pz1, + drive_can0_dout_pz2, drive_can0_din_pz3, + drive_can_gpio0_paa0, drive_can_gpio1_paa1, + drive_can_gpio2_paa2, drive_can_gpio3_paa3, + drive_can_gpio4_paa4, drive_can_gpio5_paa5, + drive_can_gpio6_paa6, drive_gpio_sw1_pff1, + drive_gpio_sw2_pff2, drive_gpio_sw3_pff3, + drive_gpio_sw4_pff4, drive_shutdown, drive_pmu_int, + drive_safe_state_ps3, drive_vcomp_alert_ps4, + drive_soc_pwr_req, drive_batt_oc_ps2, + drive_clk_32k_in, drive_power_on_pff0, + drive_pwr_i2c_scl_ps0, drive_pwr_i2c_sda_ps1, + drive_gpio_dis0_pu0, drive_gpio_dis1_pu1, + drive_gpio_dis2_pu2, drive_gpio_dis3_pu3, + drive_gpio_dis4_pu4, drive_gpio_dis5_pu5 ] + +required: + - compatible + - reg + +examples: + - | + #include <dt-bindings/pinctrl/pinctrl-tegra.h> + + pinmux@2430000 { + compatible = "nvidia,tegra186-pinmux"; + reg = <0x2430000 0x15000>; + + pinctrl-names = "jetson_io"; + pinctrl-0 = <&jetson_io_pinmux>; + + jetson_io_pinmux: pinmux { + hdr40-pin7 { + nvidia,pins = "aud_mclk_pj4"; + nvidia,function = "aud"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + }; + }; +... diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml new file mode 100644 index 00000000000000..d2b0cfeffb501e --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,glymur-tlmm.yaml @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,glymur-tlmm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Technologies, Inc. Glymur TLMM block + +maintainers: + - Bjorn Andersson + +description: + Top Level Mode Multiplexer pin controller in Qualcomm Glymur SoC. + +allOf: + - $ref: /schemas/pinctrl/qcom,tlmm-common.yaml# + +properties: + compatible: + const: qcom,glymur-tlmm + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + gpio-reserved-ranges: + minItems: 1 + maxItems: 125 + + gpio-line-names: + maxItems: 250 + +patternProperties: + "-state$": + oneOf: + - $ref: "#/$defs/qcom-glymur-tlmm-state" + - patternProperties: + "-pins$": + $ref: "#/$defs/qcom-glymur-tlmm-state" + additionalProperties: false + +$defs: + qcom-glymur-tlmm-state: + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. + $ref: qcom,tlmm-common.yaml#/$defs/qcom-tlmm-state + unevaluatedProperties: false + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + oneOf: + - pattern: "^gpio([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9])$" + - enum: [ ufs_reset, sdc2_clk, sdc2_cmd, sdc2_data ] + minItems: 1 + maxItems: 36 + + function: + description: + Specify the alternative function to be configured for the specified + pins.
+ enum: [ gpio, resout_gpio_n, aoss_cti, asc_cci, atest_char, atest_usb, + audio_ext_mclk0, audio_ext_mclk1, audio_ref_clk, cam_asc_mclk4, + cam_mclk, cci_async_in, cci_i2c_scl, cci_i2c_sda, cci_timer, + cmu_rng, cri_trng, dbg_out_clk, ddr_bist_complete, + ddr_bist_fail, ddr_bist_start, ddr_bist_stop, ddr_pxi, + edp0_hot, edp0_lcd, edp1_lcd, egpio, eusb0_ac_en, eusb1_ac_en, + eusb2_ac_en, eusb3_ac_en, eusb5_ac_en, eusb6_ac_en, gcc_gp1, + gcc_gp2, gcc_gp3, host2wlan_sol, i2c0_s_scl, i2c0_s_sda, + i2s0_data, i2s0_sck, i2s0_ws, i2s1_data, i2s1_sck, i2s1_ws, + ibi_i3c, jitter_bist, mdp_vsync_out, mdp_vsync_e, mdp_vsync_p, + mdp_vsync_s, pcie3a_clk, pcie3a_rst_n, pcie3b_clk, + pcie4_clk_req_n, pcie5_clk_req_n, pcie6_clk_req_n, phase_flag, + pll_bist_sync, pll_clk_aux, pmc_oca_n, pmc_uva_n, prng_rosc, + qdss_cti, qdss_gpio, qspi, qup0_se0, qup0_se1, qup0_se2, + qup0_se3_l0, qup0_se3, qup0_se4, qup0_se5, qup0_se6, qup0_se7, + qup1_se0, qup1_se1, qup1_se2, qup1_se3, qup1_se4, qup1_se5, + qup1_se6, qup1_se7, qup2_se0, qup2_se1, qup2_se2, qup2_se3, + qup2_se4, qup2_se5, qup2_se6, qup2_se7, qup3_se0, qup3_se1, + sd_write_protect, sdc4_clk, sdc4_cmd, sdc4_data, smb_acok_n, + sys_throttle, tb_trig_sdc2, tb_trig_sdc4, tmess_prng, + tsense_pwm, tsense_therm, usb0_dp, usb0_phy_ps, usb0_sbrx, + usb0_sbtx, usb0_tmu, usb1_dbg, usb1_dp, usb1_phy_ps, usb1_sbrx, + usb1_sbtx, usb1_tmu, usb2_dp, usb2_phy_ps, usb2_sbrx, usb2_sbtx, + usb2_tmu, vsense_trigger_mirnat, wcn_sw, wcn_sw_ctrl ] + + required: + - pins + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + tlmm: pinctrl@f100000 { + compatible = "qcom,glymur-tlmm"; + reg = <0x0f100000 0xf00000>; + interrupts = ; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + gpio-ranges = <&tlmm 0 0 249>; + wakeup-parent = <&pdc>; + gpio-reserved-ranges = <4 4>, <10 2>, <33 3>, <44 4>; + qup_uart21_default: qup-uart21-default-state { + tx-pins { + pins = "gpio86"; + function = "qup2_se5"; + drive-strength = <2>; + bias-disable; + }; + + rx-pins { + pins = "gpio87"; + function = "qup2_se5"; + drive-strength = <2>; + bias-disable; + }; + }; + }; +... 
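Aside: the qup-uart21-default-state node in the example above only defines a pin configuration; it takes effect once a client device selects it through the generic pinctrl consumer properties. A minimal consumer sketch, assuming a hypothetical GENI serial node (the unit address and compatible are illustrative; only the qup_uart21_default label comes from the example above):

    /* Hypothetical consumer of the TLMM state defined above. */
    serial@990000 {
        compatible = "qcom,geni-uart";
        reg = <0x990000 0x4000>;
        pinctrl-names = "default";
        pinctrl-0 = <&qup_uart21_default>;
    };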
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml index 08801cc4e476ff..bc7b8dda883765 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml @@ -20,6 +20,16 @@ properties: reg: maxItems: 2 + clocks: + items: + - description: LPASS Core voting clock + - description: LPASS Audio voting clock + + clock-names: + items: + - const: core + - const: audio + patternProperties: "-state$": oneOf: @@ -70,10 +80,16 @@ unevaluatedProperties: false examples: - | + #include <dt-bindings/sound/qcom,q6dsp-lpass-ports.h> lpass_tlmm: pinctrl@33c0000 { compatible = "qcom,sc7280-lpass-lpi-pinctrl"; reg = <0x33c0000 0x20000>, <0x3550000 0x10000>; + + clocks = <&q6prmcc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>, + <&q6prmcc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>; + clock-names = "core", "audio"; + gpio-controller; #gpio-cells = <2>; gpio-ranges = <&lpass_tlmm 0 0 15>; diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml new file mode 100644 index 00000000000000..409e5a4d4da9c6 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/qcom,sdm660-lpass-lpi-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SDM660 SoC LPASS LPI TLMM + +maintainers: + - Nickolay Goppen + +description: + Top Level Mode Multiplexer pin controller in the Low Power Audio SubSystem + (LPASS) Low Power Island (LPI) of Qualcomm SDM660 SoC. + +properties: + compatible: + const: qcom,sdm660-lpass-lpi-pinctrl + + reg: + items: + - description: LPASS LPI TLMM Control and Status registers + +patternProperties: + "-state$": + oneOf: + - $ref: "#/$defs/qcom-sdm660-lpass-state" + - patternProperties: + "-pins$": + $ref: "#/$defs/qcom-sdm660-lpass-state" + additionalProperties: false + +$defs: + qcom-sdm660-lpass-state: + type: object + description: + Pinctrl node's client devices use subnodes for desired pin configuration. + Client device subnodes use below standard properties. + $ref: qcom,lpass-lpi-common.yaml#/$defs/qcom-tlmm-state + unevaluatedProperties: false + + properties: + pins: + description: + List of gpio pins affected by the properties specified in this + subnode. + items: + pattern: "^gpio([0-9]|[1-2][0-9]|3[0-1])$" + + function: + enum: [ gpio, comp_rx, dmic1_clk, dmic1_data, dmic2_clk, dmic2_data, + mclk0, pdm_tx, pdm_clk, pdm_rx, pdm_sync ] + description: + Specify the alternative function to be configured for the specified + pins.
+ +allOf: + - $ref: qcom,lpass-lpi-common.yaml# + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + lpi_tlmm: pinctrl@15070000 { + compatible = "qcom,sdm660-lpass-lpi-pinctrl"; + reg = <0x15070000 0x20000>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&lpi_tlmm 0 0 32>; + + cdc_pdm_default: cdc-pdm-default-state { + clk-pins { + pins = "gpio18"; + function = "pdm_clk"; + drive-strength = <8>; + output-high; + }; + + sync-pins{ + pins = "gpio19"; + function = "pdm_sync"; + drive-strength = <4>; + output-high; + }; + + tx-pins { + pins = "gpio20"; + function = "pdm_tx"; + drive-strength = <8>; + }; + + rx-pins { + pins = "gpio21", "gpio23", "gpio25"; + function = "pdm_rx"; + drive-strength = <4>; + output-high; + }; + }; + + cdc_comp_default: cdc-comp-default-state { + pins = "gpio22", "gpio24"; + function = "comp_rx"; + drive-strength = <8>; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml b/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml index eec9a9b58542f9..af6fbbd4feeaf6 100644 --- a/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml +++ b/Documentation/devicetree/bindings/pinctrl/raspberrypi,rp1-gpio.yaml @@ -72,10 +72,36 @@ $defs: pins: description: List of gpio pins affected by the properties specified in this - subnode. + subnode (either this or "groups" must be specified). items: pattern: '^gpio([0-9]|[1-4][0-9]|5[0-3])$' + groups: + description: + List of groups affected by the properties specified in this + subnode (either this or "pins" must be specified). + items: + anyOf: + - pattern: '^gpio([0-9]|[1-4][0-9]|5[0-3])$' + - enum: [ uart0, uart0_ctrl, uart1, uart1_ctrl, uart2, uart2_ctrl, + uart3, uart3_ctrl, uart4, uart4_ctrl, uart5_0, + uart5_0_ctrl, uart5_1, uart5_1_ctrl, uart5_2, + uart5_2_ctrl, uart5_3, + sd0, sd1, + i2s0, i2s0_dual, i2s0_quad, i2s1, i2s1_dual, i2s1_quad, + i2s2_0, i2s2_0_dual, i2s2_1, i2s2_1_dual, + i2c4_0, i2c4_1, i2c4_2, i2c4_3, i2c6_0, i2c6_1, i2c5_0, + i2c5_1, i2c5_2, i2c5_3, i2c0_0, i2c0_1, i2c1_0, i2c1_1, + i2c2_0, i2c2_1, i2c3_0, i2c3_1, i2c3_2, + dpi_16bit, dpi_16bit_cpadhi, dpi_16bit_pad666, + dpi_18bit, dpi_18bit_cpadhi, dpi_24bit, + spi0, spi0_quad, spi1, spi2, spi3, spi4, spi5, spi6_0, + spi6_1, spi7_0, spi7_1, spi8_0, spi8_1, + aaud_0, aaud_1, aaud_2, aaud_3, aaud_4, + vbus0_0, vbus0_1, vbus1, vbus2, vbus3, + mic_0, mic_1, mic_2, mic_3, + ir ] + function: enum: [ alt0, alt1, alt2, alt3, alt4, gpio, alt6, alt7, alt8, none, aaud, dcd0, dpi, dsi0_te_ext, dsi1_te_ext, dsr0, dtr0, gpclk0, @@ -103,6 +129,13 @@ $defs: drive-strength: enum: [ 2, 4, 8, 12 ] + required: + - function + + oneOf: + - required: [ groups ] + - required: [ pins ] + additionalProperties: false allOf: diff --git a/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml new file mode 100644 index 00000000000000..36d66597148424 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/renesas,r9a09g077-pinctrl.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pinctrl/renesas,r9a09g077-pinctrl.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/T2H and RZ/N2H Pin and GPIO controller + +maintainers: + - Lad Prabhakar + +description: + The Renesas RZ/T2H and RZ/N2H SoCs feature a combined Pin and GPIO controller. 
+ Pin multiplexing and GPIO configuration are performed on a per-pin basis. + Each port supports up to 8 pins, each configurable for either GPIO (port mode) + or alternate function mode. Each pin supports function mode values ranging from + 0x0 to 0x2A, allowing selection from up to 43 different functions. + +properties: + compatible: + enum: + - renesas,r9a09g077-pinctrl # RZ/T2H + - renesas,r9a09g087-pinctrl # RZ/N2H + + reg: + minItems: 1 + items: + - description: Non-safety I/O Port base + - description: Safety I/O Port safety region base + - description: Safety I/O Port Non-safety region base + + reg-names: + minItems: 1 + items: + - const: nsr + - const: srs + - const: srn + + gpio-controller: true + + '#gpio-cells': + const: 2 + description: + The first cell contains the global GPIO port index, constructed using the + RZT2H_GPIO() helper macro from + (e.g. "RZT2H_GPIO(3, 0)" for P03_0). The second cell represents the consumer + flag. Use the macros defined in include/dt-bindings/gpio/gpio.h. + + gpio-ranges: + maxItems: 1 + + clocks: + maxItems: 1 + + power-domains: + maxItems: 1 + +definitions: + renesas-rzt2h-n2h-pins-node: + type: object + allOf: + - $ref: pincfg-node.yaml# + - $ref: pinmux-node.yaml# + properties: + pinmux: + description: + Values are constructed from I/O port number, pin number, and + alternate function configuration number using the RZT2H_PORT_PINMUX() + helper macro from . + pins: true + phandle: true + input: true + input-enable: true + output-enable: true + oneOf: + - required: [pinmux] + - required: [pins] + additionalProperties: false + +patternProperties: + # Grouping nodes: allow multiple "-pins" subnodes within a "-group" + '.*-group$': + type: object + description: + Pin controller client devices can organize pin configuration entries into + grouping nodes ending in "-group". These group nodes may contain multiple + child nodes each ending in "-pins" to configure distinct sets of pins. 
+ additionalProperties: false + patternProperties: + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + # Standalone "-pins" nodes under client devices or groups + '-pins$': + $ref: '#/definitions/renesas-rzt2h-n2h-pins-node' + + '-hog$': + type: object + description: GPIO hog node + properties: + gpio-hog: true + gpios: true + input: true + output-high: true + output-low: true + line-name: true + required: + - gpio-hog + - gpios + additionalProperties: false + +allOf: + - $ref: pinctrl.yaml# + +required: + - compatible + - reg + - reg-names + - gpio-controller + - '#gpio-cells' + - gpio-ranges + - clocks + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pinctrl@802c0000 { + compatible = "renesas,r9a09g077-pinctrl"; + reg = <0x802c0000 0x2000>, + <0x812c0000 0x2000>, + <0x802b0000 0x2000>; + reg-names = "nsr", "srs", "srn"; + clocks = <&cpg CPG_CORE R9A09G077_CLK_PCLKM>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&pinctrl 0 0 288>; + power-domains = <&cpg>; + + serial0-pins { + pinmux = , /* Tx */ + ; /* Rx */ + }; + + sd1-pwr-en-hog { + gpio-hog; + gpios = ; + output-high; + line-name = "sd1_pwr_en"; + }; + + i2c0-pins { + pins = "RIIC0_SDA", "RIIC0_SCL"; + input-enable; + }; + + sd0-sd-group { + ctrl-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + + data-pins { + pinmux = , /* SD0_CLK */ + ; /* SD0_CMD */ + }; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml index 0da6d69f599171..dd11c73a55da3f 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml @@ -30,8 +30,6 @@ properties: compatible: oneOf: - enum: - - samsung,s3c2410-wakeup-eint - - samsung,s3c2412-wakeup-eint - samsung,s3c64xx-wakeup-eint - samsung,s5pv210-wakeup-eint - samsung,exynos4210-wakeup-eint @@ -59,27 +57,12 @@ properties: description: Interrupt used by multiplexed external wake-up interrupts. 
minItems: 1 - maxItems: 6 + maxItems: 4 required: - compatible allOf: - - if: - properties: - compatible: - contains: - enum: - - samsung,s3c2410-wakeup-eint - - samsung,s3c2412-wakeup-eint - then: - properties: - interrupts: - minItems: 6 - maxItems: 6 - required: - - interrupts - - if: properties: compatible: diff --git a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml index de846085614166..f1094d65e84603 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml @@ -35,11 +35,8 @@ properties: compatible: enum: + - axis,artpec8-pinctrl - google,gs101-pinctrl - - samsung,s3c2412-pinctrl - - samsung,s3c2416-pinctrl - - samsung,s3c2440-pinctrl - - samsung,s3c2450-pinctrl - samsung,s3c64xx-pinctrl - samsung,s5pv210-pinctrl - samsung,exynos2200-pinctrl diff --git a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml index 15d74138baa343..12b71688dd3407 100644 --- a/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml +++ b/Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml @@ -24,6 +24,9 @@ properties: - amlogic,a5-pwrc - amlogic,c3-pwrc - amlogic,t7-pwrc + - amlogic,s6-pwrc + - amlogic,s7-pwrc + - amlogic,s7d-pwrc "#power-domain-cells": const: 1 diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml index 9c7cc632abee25..500d98921581a3 100644 --- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml +++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml @@ -44,6 +44,15 @@ properties: '#size-cells': const: 0 + access-controllers: + description: + A number of phandles to external blocks to set and clear the required + bits to enable or disable bus protection, necessary to avoid any bus + faults while enabling or disabling a power domain. + For example, this may hold phandles to INFRACFG and SMI. + minItems: 1 + maxItems: 3 + patternProperties: "^power-domain@[0-9a-f]+$": $ref: "#/$defs/power-domain-node" @@ -123,14 +132,17 @@ $defs: mediatek,infracfg: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG register range. + deprecated: true mediatek,infracfg-nao: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG-NAO register range. + deprecated: true mediatek,smi: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the SMI register range. 
+ deprecated: true required: - reg @@ -138,6 +150,31 @@ $defs: required: - compatible +allOf: + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt8183-power-controller + then: + properties: + access-controllers: + minItems: 2 + maxItems: 2 + + - if: + properties: + compatible: + contains: + enum: + - mediatek,mt8365-power-controller + then: + properties: + access-controllers: + minItems: 3 + maxItems: 3 + additionalProperties: false examples: diff --git a/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml b/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml deleted file mode 100644 index 5220d9cb16d880..00000000000000 --- a/Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -%YAML 1.2 ---- -$id: http://devicetree.org/schemas/power/supply/active-semi,act8945a-charger.yaml# -$schema: http://devicetree.org/meta-schemas/core.yaml# - -title: Active-semi ACT8945A Charger Function - -maintainers: - - Sebastian Reichel - -allOf: - - $ref: power-supply.yaml# - -properties: - compatible: - const: active-semi,act8945a-charger - - interrupts: - maxItems: 1 - - active-semi,chglev-gpios: - maxItems: 1 - description: charge current level GPIO - - active-semi,lbo-gpios: - maxItems: 1 - description: low battery voltage detect GPIO - - active-semi,input-voltage-threshold-microvolt: - description: | - Specifies the charger's input over-voltage threshold value. - Despite the name, specified values are in millivolt (mV). - Defaults to 6.6 V - enum: [ 6600, 7000, 7500, 8000 ] - - active-semi,precondition-timeout: - $ref: /schemas/types.yaml#/definitions/uint32 - description: | - Specifies the charger's PRECONDITION safety timer setting value in minutes. - If 0, it means to disable this timer. - Defaults to 40 minutes. - enum: [ 0, 40, 60, 80 ] - - active-semi,total-timeout: - $ref: /schemas/types.yaml#/definitions/uint32 - description: | - Specifies the charger's total safety timer setting value in hours; - If 0, it means to disable this timer; - Defaults to 3 hours. - enum: [ 0, 3, 4, 5 ] - -required: - - compatible - - interrupts - - active-semi,chglev-gpios - - active-semi,lbo-gpios - -additionalProperties: false - -examples: - - | - #include - #include - pmic { - charger { - compatible = "active-semi,act8945a-charger"; - interrupt-parent = <&pioA>; - interrupts = <45 IRQ_TYPE_LEVEL_LOW>; - active-semi,chglev-gpios = <&pioA 12 GPIO_ACTIVE_HIGH>; - active-semi,lbo-gpios = <&pioA 72 GPIO_ACTIVE_LOW>; - active-semi,input-voltage-threshold-microvolt = <6600>; - active-semi,precondition-timeout = <40>; - active-semi,total-timeout = <3>; - }; - }; diff --git a/Documentation/devicetree/bindings/power/supply/bq24190.yaml b/Documentation/devicetree/bindings/power/supply/bq24190.yaml index ac9a76fc5876be..938554a9fb02c2 100644 --- a/Documentation/devicetree/bindings/power/supply/bq24190.yaml +++ b/Documentation/devicetree/bindings/power/supply/bq24190.yaml @@ -30,6 +30,12 @@ properties: interrupts: maxItems: 1 + ce-gpios: + description: + Active low Charge Enable pin. Battery charging is enabled when + REG01[5:4] = 01 and CE pin is Low. CE pin must be pulled high or low. 
+ maxItems: 1 + usb-otg-vbus: $ref: /schemas/regulator/regulator.yaml# description: | diff --git a/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml b/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml index 309ea33b5b259d..bc05400186cf1c 100644 --- a/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml +++ b/Documentation/devicetree/bindings/power/supply/bq27xxx.yaml @@ -16,9 +16,6 @@ description: | Support various Texas Instruments fuel gauge devices that share similar register maps and power supply properties -allOf: - - $ref: power-supply.yaml# - properties: compatible: enum: @@ -58,6 +55,10 @@ properties: maxItems: 1 description: integer, I2C address of the fuel gauge. + interrupts: + maxItems: 1 + description: the SOC_INT or GPOUT pin + monitored-battery: description: | The fuel gauge uses the following battery properties: @@ -68,6 +69,36 @@ properties: power-supplies: true +allOf: + - $ref: power-supply.yaml# + - if: + properties: + compatible: + contains: + enum: + - ti,bq27200 + - ti,bq27210 + - ti,bq27500 # deprecated, use revision specific property below + - ti,bq27510 # deprecated, use revision specific property below + - ti,bq27520 # deprecated, use revision specific property below + - ti,bq27500-1 + - ti,bq27510g1 + - ti,bq27510g2 + - ti,bq27521 + - ti,bq27541 + - ti,bq27542 + - ti,bq27546 + - ti,bq27742 + - ti,bq27545 + - ti,bq27411 + - ti,bq27z561 + - ti,bq28z610 + - ti,bq34z100 + - ti,bq78z100 + then: + properties: + interrupts: false + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml b/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml index 142157bff0cd85..04519b0c581d0e 100644 --- a/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml +++ b/Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml @@ -17,8 +17,9 @@ properties: items: - enum: - apple,t8103-fpwm - - apple,t6000-fpwm - apple,t8112-fpwm + - apple,t6000-fpwm + - apple,t6020-fpwm - const: apple,s5l-fpwm reg: diff --git a/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml b/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml index 7f9f72d95e7a32..c7a10180208e03 100644 --- a/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/fsl,vf610-ftm-pwm.yaml @@ -26,9 +26,14 @@ maintainers: properties: compatible: - enum: - - fsl,vf610-ftm-pwm - - fsl,imx8qm-ftm-pwm + oneOf: + - enum: + - fsl,vf610-ftm-pwm + - fsl,imx8qm-ftm-pwm + - nxp,s32g2-ftm-pwm + - items: + - const: nxp,s32g3-ftm-pwm + - const: nxp,s32g2-ftm-pwm reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml index f7bc84b05a871b..8f5a468cfb91fb 100644 --- a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml @@ -14,7 +14,7 @@ description: | Google's ChromeOS EC PWM is a simple PWM attached to the Embedded Controller (EC) and controlled via a host-command interface. An EC PWM node should be only found as a sub-node of the EC node (see - Documentation/devicetree/bindings/mfd/google,cros-ec.yaml). + Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml). 
allOf: - $ref: pwm.yaml# diff --git a/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml b/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml index 981cfec53f3727..19a9d2e15a964f 100644 --- a/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml @@ -11,7 +11,7 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. The controller supports one PWM channel and supports only four distinct frequencies (250Hz, 500Hz, 1kHz, 2kHz). diff --git a/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml b/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml index ffda0123878eda..920e0413d4312b 100644 --- a/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/nxp,lpc1850-sct-pwm.yaml @@ -48,7 +48,7 @@ examples: pwm@40000000 { compatible = "nxp,lpc1850-sct-pwm"; reg = <0x40000000 0x1000>; - clocks =<&ccu1 CLK_CPU_SCT>; + clocks = <&ccu1 CLK_CPU_SCT>; clock-names = "pwm"; #pwm-cells = <3>; }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml b/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml index 17a2b927af3370..97acbdec39f102 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm-samsung.yaml @@ -31,6 +31,7 @@ properties: - enum: - samsung,exynos5433-pwm - samsung,exynos7-pwm + - samsung,exynos8890-pwm - samsung,exynosautov9-pwm - samsung,exynosautov920-pwm - tesla,fsd-pwm diff --git a/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt b/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt deleted file mode 100644 index d97ca1964e9470..00000000000000 --- a/Documentation/devicetree/bindings/pwm/ti,twl-pwm.txt +++ /dev/null @@ -1,17 +0,0 @@ -Texas Instruments TWL series PWM drivers - -Supported PWMs: -On TWL4030 series: PWM1 and PWM2 -On TWL6030 series: PWM0 and PWM1 - -Required properties: -- compatible: "ti,twl4030-pwm" or "ti,twl6030-pwm" -- #pwm-cells: should be 2. See pwm.yaml in this directory for a description of - the cells format. - -Example: - -twl_pwm: pwm { - compatible = "ti,twl6030-pwm"; - #pwm-cells = <2>; -}; diff --git a/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt b/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt deleted file mode 100644 index 31ca1b032ef034..00000000000000 --- a/Documentation/devicetree/bindings/pwm/ti,twl-pwmled.txt +++ /dev/null @@ -1,17 +0,0 @@ -Texas Instruments TWL series PWM drivers connected to LED terminals - -Supported PWMs: -On TWL4030 series: PWMA and PWMB (connected to LEDA and LEDB terminals) -On TWL6030 series: LED PWM (mainly used as charging indicator LED) - -Required properties: -- compatible: "ti,twl4030-pwmled" or "ti,twl6030-pwmled" -- #pwm-cells: should be 2. See pwm.yaml in this directory for a description of - the cells format. 
- -Example: - -twl_pwmled: pwmled { - compatible = "ti,twl6030-pwmled"; - #pwm-cells = <2>; -}; diff --git a/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml b/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml index bdf3f7d34ef51b..a8d579844dc7bc 100644 --- a/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml +++ b/Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml @@ -91,28 +91,41 @@ properties: maxItems: 1 active-semi,chglev-gpios: - description: CGHLEV GPIO + description: charge current level GPIO maxItems: 1 active-semi,lbo-gpios: - description: LBO GPIO + description: low battery voltage detect GPIO maxItems: 1 active-semi,input-voltage-threshold-microvolt: - description: Input voltage threshold - maxItems: 1 + description: + Specifies the charger's input over-voltage threshold value. Despite + the name, specified values are in millivolt (mV). + enum: [ 6600, 7000, 7500, 8000 ] + default: 6600 active-semi,precondition-timeout: - description: Precondition timeout + description: + Specifies the charger's PRECONDITION safety timer setting value in + minutes. If 0, it means to disable this timer. + enum: [ 0, 40, 60, 80 ] + default: 40 $ref: /schemas/types.yaml#/definitions/uint32 active-semi,total-timeout: - description: Total timeout + description: + Specifies the charger's total safety timer setting value in hours; If + 0, it means to disable this timer; + enum: [ 0, 3, 4, 5 ] + default: 3 $ref: /schemas/types.yaml#/definitions/uint32 required: - compatible - interrupts + - active-semi,chglev-gpios + - active-semi,lbo-gpios additionalProperties: false diff --git a/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml b/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml new file mode 100644 index 00000000000000..bed36af5493df2 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/maxim,max77838.yaml @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/maxim,max77838.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim Integrated MAX77838 PMIC + +maintainers: + - Ivaylo Ivanov + +properties: + $nodename: + pattern: "pmic@[0-9a-f]{1,2}" + compatible: + enum: + - maxim,max77838 + + reg: + maxItems: 1 + + regulators: + type: object + $ref: regulator.yaml# + description: | + list of regulators provided by this controller, must be named + after their hardware counterparts ldo[1-4] and buck + + properties: + buck: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + patternProperties: + "^ldo([1-4])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + additionalProperties: false + +required: + - compatible + - reg + - regulators + +additionalProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@60 { + compatible = "maxim,max77838"; + reg = <0x60>; + + regulators { + ldo2 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + }; + }; + }; +... 
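Aside: regulators described by the max77838 binding above are consumed through standard *-supply properties referencing labels on the regulator subnodes. A minimal sketch, assuming a hypothetical display panel consumer and adding an illustrative max77838_ldo2 label to the example's ldo2 node:

    /* Hypothetical consumer wiring; the label and the panel node are
     * illustrative, only the regulator values come from the example
     * above. */
    pmic@60 {
        compatible = "maxim,max77838";
        reg = <0x60>;

        regulators {
            max77838_ldo2: ldo2 {
                regulator-min-microvolt = <1800000>;
                regulator-max-microvolt = <1800000>;
            };
        };
    };

    panel {
        vddio-supply = <&max77838_ldo2>;
    };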
diff --git a/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml b/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml new file mode 100644 index 00000000000000..8c8fc2cd4cedb0 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/nxp,pf0900.yaml @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/nxp,pf0900.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP PF0900 Power Management Integrated Circuit regulators + +maintainers: + - Joy Zou + +description: + The PF0900 is a power management integrated circuit (PMIC) optimized + for high performance i.MX9x based applications. It features five high + efficiency buck converters, three linear and one vaon regulators. It + provides low quiescent current in Standby and low power off Modes. + +properties: + compatible: + enum: + - nxp,pf0900 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + regulators: + type: object + additionalProperties: false + + properties: + vaon: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + patternProperties: + "^ldo[1-3]$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + "^sw[1-5]$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + + nxp,i2c-crc-enable: + type: boolean + description: + The CRC enabled during register read/write. Controlled by customer + unviewable fuse bits OTP_I2C_CRC_EN. Check chip part number. + +required: + - compatible + - reg + - interrupts + - regulators + +additionalProperties: false + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@8 { + compatible = "nxp,pf0900"; + reg = <0x08>; + interrupt-parent = <&pcal6524>; + interrupts = <89 IRQ_TYPE_LEVEL_LOW>; + nxp,i2c-crc-enable; + + regulators { + vaon { + regulator-name = "VAON"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + sw1 { + regulator-name = "SW1"; + regulator-min-microvolt = <500000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + regulator-state-mem { + regulator-on-in-suspend; + regulator-suspend-max-microvolt = <650000>; + regulator-suspend-min-microvolt = <650000>; + }; + }; + + sw2 { + regulator-name = "SW2"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw3 { + regulator-name = "SW3"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw4 { + regulator-name = "SW4"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + sw5 { + regulator-name = "SW5"; + regulator-min-microvolt = <300000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <1950>; + }; + + ldo1 { + regulator-name = "LDO1"; + regulator-min-microvolt = <750000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + ldo2 { + regulator-name = "LDO2"; + regulator-min-microvolt = <650000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + ldo3 { + regulator-name = "LDO3"; + regulator-min-microvolt = 
<650000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml b/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml new file mode 100644 index 00000000000000..5b9d5d4e48d09a --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/nxp,pf5300.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP PF5300/PF5301/PF5302 PMIC regulators + +maintainers: + - Woodrow Douglass + +description: | + The PF5300, PF5301, and PF5302 integrate high-performance buck converters, + 12 A, 8 A, and 15 A, respectively, to power high-end automotive and industrial + processors. With adaptive voltage positioning and a high-bandwidth loop, they + offer transient regulation to minimize capacitor requirements. + +allOf: + - $ref: regulator.yaml# + +properties: + compatible: + oneOf: + - const: nxp,pf5300 + - items: + - enum: + - nxp,pf5301 + - nxp,pf5302 + - const: nxp,pf5300 + reg: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + regulator@28 { + compatible = "nxp,pf5302", "nxp,pf5300"; + reg = <0x28>; + + regulator-always-on; + regulator-boot-on; + regulator-max-microvolt = <1200000>; + regulator-min-microvolt = <500000>; + }; + }; diff --git a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml index f02f97d4fdd215..40f9223d4c2721 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,sdm845-refgen-regulator.yaml @@ -23,11 +23,14 @@ properties: - enum: - qcom,sc7180-refgen-regulator - qcom,sc8180x-refgen-regulator + - qcom,sdm670-refgen-regulator - qcom,sm8150-refgen-regulator - const: qcom,sdm845-refgen-regulator - items: - enum: + - qcom,qcs8300-refgen-regulator + - qcom,sa8775p-refgen-regulator - qcom,sc7280-refgen-regulator - qcom,sc8280xp-refgen-regulator - qcom,sm6350-refgen-regulator diff --git a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml new file mode 100644 index 00000000000000..37b9ed371b67d3 --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/raspberrypi,7inch-touchscreen-panel-regulator-v2.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: RaspberryPi 5" and 7" display V2 MCU-based regulator/backlight controller + +maintainers: + - Marek Vasut + +description: | + The RaspberryPi 5" and 7" display 2 has an MCU-based regulator, PWM + backlight and GPIO controller on the PCB, which is used to turn the + display unit on/off and control the backlight. 
+ +allOf: + - $ref: regulator.yaml# + +properties: + compatible: + const: raspberrypi,touchscreen-panel-regulator-v2 + + reg: + maxItems: 1 + + gpio-controller: true + "#gpio-cells": + const: 2 + description: + The first cell is the pin number, and the second cell is used to + specify the gpio polarity (GPIO_ACTIVE_HIGH or GPIO_ACTIVE_LOW). + + "#pwm-cells": + const: 3 + description: See ../../pwm/pwm.yaml for description of the cell formats. + +additionalProperties: false + +required: + - compatible + - reg + - gpio-controller + - "#gpio-cells" + - "#pwm-cells" + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + regulator@45 { + compatible = "raspberrypi,touchscreen-panel-regulator-v2"; + reg = <0x45>; + gpio-controller; + #gpio-cells = <2>; + #pwm-cells = <3>; + }; + }; + +... diff --git a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml index 18944d39d08fcb..41678400e63fa6 100644 --- a/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml @@ -12,17 +12,14 @@ maintainers: description: | The RaspberryPi 7" display has an ATTINY88-based regulator/backlight controller on the PCB, which is used to turn the display unit on/off - and control the backlight. The V2 supports 5" and 7" panels and also - offers PWM backlight control. + and control the backlight. allOf: - $ref: regulator.yaml# properties: compatible: - enum: - - raspberrypi,7inch-touchscreen-panel-regulator - - raspberrypi,touchscreen-panel-regulator-v2 + const: raspberrypi,7inch-touchscreen-panel-regulator reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml b/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml new file mode 100644 index 00000000000000..d2e007fee6ba1f --- /dev/null +++ b/Documentation/devicetree/bindings/regulator/richtek,rt5133.yaml @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/regulator/richtek,rt5133.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Richtek RT5133 PMIC Regulator + +maintainers: + - ShihChia Chang + +description: + The RT5133 is an integrated Power Management IC for portable devices, + featuring 8 LDOs and 3 GPOs. It allows programmable output voltages, + soft-start times, and protections via I2C. GPO operation depends on LDO1 + voltage. + +properties: + compatible: + enum: + - richtek,rt5133 + + reg: + maxItems: 1 + + enable-gpios: + maxItems: 1 + + wakeup-source: true + + interrupts: + maxItems: 1 + + gpio-controller: true + + "#gpio-cells": + const: 2 + + richtek,oc-shutdown-all: + type: boolean + description: + Controls the behavior when any LDO (Low Dropout Regulator) enters an + Over Current state. + If set to true, all LDO channels will be shut down. + If set to false, only the affected LDO channel will shut down itself. + + richtek,pgb-shutdown-all: + type: boolean + description: + Controls the behavior when any LDO enters a Power Good Bad state. + If set to true, all LDO channels will be shut down. + If set to false, only the affected LDO channel will shut down itself. 
+ + regulators: + type: object + additionalProperties: false + + properties: + base: + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for the base regulator, which is the top-level supply for + LDO1 to LDO6. It functions merely as an on/off switch rather than + regulating voltages. If none of LDO1 to LDO6 are in use, switching + off the base will reduce the quiescent current. + + required: + - regulator-name + + patternProperties: + "^ldo([1-6])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for single LDO regulator + + required: + - regulator-name + + "^ldo([7-8])$": + type: object + $ref: regulator.yaml# + unevaluatedProperties: false + description: + Properties for single LDO regulator + + properties: + vin-supply: true + + required: + - regulator-name + - vin-supply + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + pmic@18 { + compatible = "richtek,rt5133"; + reg = <0x18>; + wakeup-source; + interrupts-extended = <&gpio 0 IRQ_TYPE_EDGE_FALLING>; + enable-gpios = <&gpio 2 GPIO_ACTIVE_HIGH>; + gpio-controller; + #gpio-cells = <2>; + richtek,oc-shutdown-all; + richtek,pgb-shutdown-all; + regulators { + base { + regulator-name = "base"; + }; + pvin78: ldo1 { + regulator-name = "ldo1"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3199998>; + regulator-active-discharge = <1>; + }; + ldo2 { + regulator-name = "ldo2"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3200000>; + regulator-active-discharge = <1>; + }; + ldo3 { + regulator-name = "ldo3"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo4 { + regulator-name = "ldo4"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo5 { + regulator-name = "ldo5"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo6 { + regulator-name = "ldo6"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <3000000>; + regulator-active-discharge = <1>; + }; + ldo7 { + regulator-name = "ldo7"; + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1200000>; + regulator-active-discharge = <1>; + vin-supply = <&pvin78>; + }; + ldo8 { + regulator-name = "ldo8"; + regulator-min-microvolt = <855000>; + regulator-max-microvolt = <1200000>; + regulator-active-discharge = <1>; + vin-supply = <&pvin78>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index adc6b3f36fde49..179c98b33b4d9f 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -58,7 +58,7 @@ properties: maxItems: 1 cros-ec-rpmsg: - $ref: /schemas/mfd/google,cros-ec.yaml + $ref: /schemas/embedded-controller/google,cros-ec.yaml description: This subnode represents the rpmsg device. The properties of this node are defined by the individual bindings for @@ -126,7 +126,7 @@ patternProperties: maxItems: 1 cros-ec-rpmsg: - $ref: /schemas/mfd/google,cros-ec.yaml + $ref: /schemas/embedded-controller/google,cros-ec.yaml description: This subnode represents the rpmsg device. 
The properties of this node are defined by the individual bindings for diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index ede6a58ccf5347..de41a6f074d3af 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -662,6 +662,12 @@ properties: Registers in the AX45MP datasheet. https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf + # MIPS + - const: xmipsexectl + description: + The MIPS extension for execution control as documented in + https://mips.com/wp-content/uploads/2025/06/P8700_Programmers_Reference_Manual_Rev1.84_5-31-2025.pdf + # SiFive - const: xsfvqmaccdod description: diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index e46bee8d25bf02..b243afa69a1aeb 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -48,7 +48,6 @@ allOf: oneOf: - required: [ clock-frequency ] - required: [ clocks ] - - if: properties: compatible: @@ -60,12 +59,39 @@ allOf: items: - const: uartclk - const: reg - else: + - if: + properties: + compatible: + contains: + const: spacemit,k1-uart + then: properties: clock-names: items: - const: core - const: bus + - if: + properties: + compatible: + contains: + enum: + - spacemit,k1-uart + - nxp,lpc1850-uart + then: + required: + - clocks + - clock-names + properties: + clocks: + minItems: 2 + clock-names: + minItems: 2 + else: + properties: + clocks: + maxItems: 1 + clock-names: + maxItems: 1 properties: compatible: @@ -162,6 +188,9 @@ properties: minItems: 1 maxItems: 2 oneOf: + - enum: + - main + - uart - items: - const: core - const: bus @@ -264,29 +293,6 @@ required: - reg - interrupts -if: - properties: - compatible: - contains: - enum: - - spacemit,k1-uart - - nxp,lpc1850-uart -then: - required: - - clocks - - clock-names - properties: - clocks: - minItems: 2 - clock-names: - minItems: 2 -else: - properties: - clocks: - maxItems: 1 - clock-names: - maxItems: 1 - unevaluatedProperties: false examples: diff --git a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml index 89c462653e2d33..8cc848ae11cb73 100644 --- a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml +++ b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml @@ -41,7 +41,7 @@ properties: - const: dma_intr2 clocks: - minItems: 1 + maxItems: 1 clock-names: const: sw_baud diff --git a/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml b/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml index b3554e7f9e76dd..34aea58094e553 100644 --- a/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml +++ b/Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml @@ -18,7 +18,9 @@ description: properties: compatible: items: - - const: fsl,imx93-media-blk-ctrl + - enum: + - fsl,imx91-media-blk-ctrl + - fsl,imx93-media-blk-ctrl - const: syscon reg: @@ -31,21 +33,54 @@ properties: maxItems: 1 clocks: + minItems: 8 maxItems: 10 clock-names: - items: - - const: apb - - const: axi - - const: nic - - const: disp - - const: cam - - const: pxp - - const: lcdif - - const: isi - - const: csi - - const: dsi + minItems: 8 + maxItems: 10 +allOf: + - if: + properties: + compatible: + contains: + const: fsl,imx91-media-blk-ctrl + then: 
+ properties: + clocks: + maxItems: 8 + clock-names: + items: + - const: apb + - const: axi + - const: nic + - const: disp + - const: cam + - const: lcdif + - const: isi + - const: csi + - if: + properties: + compatible: + contains: + const: fsl,imx93-media-blk-ctrl + then: + properties: + clocks: + minItems: 10 + clock-names: + items: + - const: apb + - const: axi + - const: nic + - const: disp + - const: cam + - const: pxp + - const: lcdif + - const: isi + - const: csi + - const: dsi required: - compatible - reg diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml index 48114bb0c9276c..7085bf88afabaa 100644 --- a/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml +++ b/Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml @@ -56,6 +56,20 @@ properties: The array should contain a gpio entry for each PMIC Glink connector, in reg order. It is defined that GPIO active level means "CC2" or Reversed/Flipped orientation. + nvmem-cells: + minItems: 3 + maxItems: 3 + description: + The nvmem cells contain the charge control settings, including the charge control + enable status, the battery state of charge (SoC) threshold for stopping charging, + and the battery SoC delta required to restart charging. + + nvmem-cell-names: + items: + - const: charge_limit_en + - const: charge_limit_end + - const: charge_limit_delta + patternProperties: '^connector@\d$': $ref: /schemas/connector/usb-connector.yaml# diff --git a/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml b/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml index 4477f84b7acc0e..1fdbeecc5eff9d 100644 --- a/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml +++ b/Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml @@ -15,6 +15,9 @@ properties: - asahi-kasei,ak4458 - asahi-kasei,ak4497 + "#sound-dai-cells": + const: 0 + reg: maxItems: 1 @@ -46,6 +49,7 @@ required: - reg allOf: + - $ref: dai-common.yaml# - if: properties: compatible: diff --git a/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml b/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml index 14dea1feefc5ae..e6cf2ebcd77714 100644 --- a/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml +++ b/Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml @@ -151,6 +151,12 @@ properties: minimum: 0 maximum: 5 + cirrus,subsystem-id: + $ref: /schemas/types.yaml#/definitions/string + description: + Subsystem ID. If this property is present, it sets the system name, + used to identify the firmware and tuning to load. + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml index 1434f443373892..dd51e8c5b8c233 100644 --- a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml +++ b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml @@ -15,7 +15,7 @@ description: | Embedded Controller (EC) and is controlled via a host-command interface. An EC codec node should only be found inside the "codecs" subnode of a cros-ec node. - (see Documentation/devicetree/bindings/mfd/google,cros-ec.yaml). + (see Documentation/devicetree/bindings/embedded-controller/google,cros-ec.yaml). 
allOf: - $ref: dai-common.yaml# diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml new file mode 100644 index 00000000000000..031b0fa7b4dc1b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183-audio.yaml @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183-audio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Mediatek AFE PCM controller for mt8183 + +maintainers: + - Julien Massot + +properties: + compatible: + const: mediatek,mt8183-audio + + interrupts: + maxItems: 1 + + resets: + maxItems: 1 + + reset-names: + const: audiosys + + power-domains: + maxItems: 1 + + memory-region: + maxItems: 1 + + clocks: + items: + - description: AFE clock + - description: ADDA DAC clock + - description: ADDA DAC pre-distortion clock + - description: ADDA ADC clock + - description: ADDA6 ADC clock + - description: Audio low-jitter 22.5792m clock + - description: Audio low-jitter 24.576m clock + - description: Audio PLL1 tuner clock + - description: Audio PLL2 tuner clock + - description: I2S1 bit clock + - description: I2S2 bit clock + - description: I2S3 bit clock + - description: I2S4 bit clock + - description: Audio Time-Division Multiplexing interface clock + - description: Powerdown Audio test model clock + - description: Audio infra sys clock + - description: Audio infra 26M clock + - description: Mux for audio clock + - description: Mux for audio internal bus clock + - description: Mux main divider by 4 + - description: Primary audio mux + - description: Primary audio PLL + - description: Secondary audio mux + - description: Secondary audio PLL + - description: Primary audio en-generator clock + - description: Primary PLL divider by 4 for IEC + - description: Secondary audio en-generator clock + - description: Secondary PLL divider by 8 for IEC + - description: Mux selector for I2S port 0 + - description: Mux selector for I2S port 1 + - description: Mux selector for I2S port 2 + - description: Mux selector for I2S port 3 + - description: Mux selector for I2S port 4 + - description: Mux selector for I2S port 5 + - description: APLL1 and APLL2 divider for I2S port 0 + - description: APLL1 and APLL2 divider for I2S port 1 + - description: APLL1 and APLL2 divider for I2S port 2 + - description: APLL1 and APLL2 divider for I2S port 3 + - description: APLL1 and APLL2 divider for I2S port 4 + - description: APLL1 and APLL2 divider for IEC + - description: 26MHz clock for audio subsystem + + clock-names: + items: + - const: aud_afe_clk + - const: aud_dac_clk + - const: aud_dac_predis_clk + - const: aud_adc_clk + - const: aud_adc_adda6_clk + - const: aud_apll22m_clk + - const: aud_apll24m_clk + - const: aud_apll1_tuner_clk + - const: aud_apll2_tuner_clk + - const: aud_i2s1_bclk_sw + - const: aud_i2s2_bclk_sw + - const: aud_i2s3_bclk_sw + - const: aud_i2s4_bclk_sw + - const: aud_tdm_clk + - const: aud_tml_clk + - const: aud_infra_clk + - const: mtkaif_26m_clk + - const: top_mux_audio + - const: top_mux_aud_intbus + - const: top_syspll_d2_d4 + - const: top_mux_aud_1 + - const: top_apll1_ck + - const: top_mux_aud_2 + - const: top_apll2_ck + - const: top_mux_aud_eng1 + - const: top_apll1_d8 + - const: top_mux_aud_eng2 + - const: top_apll2_d8 + - const: top_i2s0_m_sel + - const: top_i2s1_m_sel + - const: top_i2s2_m_sel + - const: top_i2s3_m_sel + - const: top_i2s4_m_sel + 
- const: top_i2s5_m_sel + - const: top_apll12_div0 + - const: top_apll12_div1 + - const: top_apll12_div2 + - const: top_apll12_div3 + - const: top_apll12_div4 + - const: top_apll12_divb + - const: top_clk26m_clk + +required: + - compatible + - interrupts + - resets + - reset-names + - power-domains + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + #include + #include + #include + + audio-controller { + compatible = "mediatek,mt8183-audio"; + interrupts = ; + resets = <&watchdog MT8183_TOPRGU_AUDIO_SW_RST>; + reset-names = "audiosys"; + power-domains = <&spm MT8183_POWER_DOMAIN_AUDIO>; + clocks = <&audiosys CLK_AUDIO_AFE>, + <&audiosys CLK_AUDIO_DAC>, + <&audiosys CLK_AUDIO_DAC_PREDIS>, + <&audiosys CLK_AUDIO_ADC>, + <&audiosys CLK_AUDIO_PDN_ADDA6_ADC>, + <&audiosys CLK_AUDIO_22M>, + <&audiosys CLK_AUDIO_24M>, + <&audiosys CLK_AUDIO_APLL_TUNER>, + <&audiosys CLK_AUDIO_APLL2_TUNER>, + <&audiosys CLK_AUDIO_I2S1>, + <&audiosys CLK_AUDIO_I2S2>, + <&audiosys CLK_AUDIO_I2S3>, + <&audiosys CLK_AUDIO_I2S4>, + <&audiosys CLK_AUDIO_TDM>, + <&audiosys CLK_AUDIO_TML>, + <&infracfg CLK_INFRA_AUDIO>, + <&infracfg CLK_INFRA_AUDIO_26M_BCLK>, + <&topckgen CLK_TOP_MUX_AUDIO>, + <&topckgen CLK_TOP_MUX_AUD_INTBUS>, + <&topckgen CLK_TOP_SYSPLL_D2_D4>, + <&topckgen CLK_TOP_MUX_AUD_1>, + <&topckgen CLK_TOP_APLL1_CK>, + <&topckgen CLK_TOP_MUX_AUD_2>, + <&topckgen CLK_TOP_APLL2_CK>, + <&topckgen CLK_TOP_MUX_AUD_ENG1>, + <&topckgen CLK_TOP_APLL1_D8>, + <&topckgen CLK_TOP_MUX_AUD_ENG2>, + <&topckgen CLK_TOP_APLL2_D8>, + <&topckgen CLK_TOP_MUX_APLL_I2S0>, + <&topckgen CLK_TOP_MUX_APLL_I2S1>, + <&topckgen CLK_TOP_MUX_APLL_I2S2>, + <&topckgen CLK_TOP_MUX_APLL_I2S3>, + <&topckgen CLK_TOP_MUX_APLL_I2S4>, + <&topckgen CLK_TOP_MUX_APLL_I2S5>, + <&topckgen CLK_TOP_APLL12_DIV0>, + <&topckgen CLK_TOP_APLL12_DIV1>, + <&topckgen CLK_TOP_APLL12_DIV2>, + <&topckgen CLK_TOP_APLL12_DIV3>, + <&topckgen CLK_TOP_APLL12_DIV4>, + <&topckgen CLK_TOP_APLL12_DIVB>, + <&clk26m>; + clock-names = "aud_afe_clk", + "aud_dac_clk", + "aud_dac_predis_clk", + "aud_adc_clk", + "aud_adc_adda6_clk", + "aud_apll22m_clk", + "aud_apll24m_clk", + "aud_apll1_tuner_clk", + "aud_apll2_tuner_clk", + "aud_i2s1_bclk_sw", + "aud_i2s2_bclk_sw", + "aud_i2s3_bclk_sw", + "aud_i2s4_bclk_sw", + "aud_tdm_clk", + "aud_tml_clk", + "aud_infra_clk", + "mtkaif_26m_clk", + "top_mux_audio", + "top_mux_aud_intbus", + "top_syspll_d2_d4", + "top_mux_aud_1", + "top_apll1_ck", + "top_mux_aud_2", + "top_apll2_ck", + "top_mux_aud_eng1", + "top_apll1_d8", + "top_mux_aud_eng2", + "top_apll2_d8", + "top_i2s0_m_sel", + "top_i2s1_m_sel", + "top_i2s2_m_sel", + "top_i2s3_m_sel", + "top_i2s4_m_sel", + "top_i2s5_m_sel", + "top_apll12_div0", + "top_apll12_div1", + "top_apll12_div2", + "top_apll12_div3", + "top_apll12_div4", + "top_apll12_divb", + "top_clk26m_clk"; + }; + +... 
diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml new file mode 100644 index 00000000000000..b526e8123182bc --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183_da7219.yaml @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183_da7219.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT8183 sound card with external codecs + +maintainers: + - Julien Massot + +description: + MediaTek MT8183 SoC-based sound cards with DA7219 as headset codec, + and MAX98357A, RT1015 or RT1015P as speaker amplifiers. Optionally includes HDMI codec. + +properties: + compatible: + enum: + - mediatek,mt8183_da7219_max98357 + - mediatek,mt8183_da7219_rt1015 + - mediatek,mt8183_da7219_rt1015p + + mediatek,headset-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the DA7219 headset codec. + + mediatek,platform: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the MT8183 ASoC platform (e.g., AFE node). + + mediatek,hdmi-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Optional phandle to the HDMI codec (e.g., IT6505). + +required: + - compatible + - mediatek,headset-codec + - mediatek,platform + +additionalProperties: false + +examples: + - | + sound { + compatible = "mediatek,mt8183_da7219_max98357"; + mediatek,headset-codec = <&da7219>; + mediatek,hdmi-codec = <&it6505dptx>; + mediatek,platform = <&afe>; + }; diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml new file mode 100644 index 00000000000000..43a6f9d40644c2 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8183_mt6358_ts3a227.yaml @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/mediatek,mt8183_mt6358_ts3a227.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT8183 sound card with MT6358, TS3A227, and MAX98357/RT1015 codecs + +maintainers: + - Julien Massot + +description: + MediaTek MT8183 SoC-based sound cards using the MT6358 codec, + with optional TS3A227 headset codec, EC codec (via Chrome EC), and HDMI audio. + Speaker amplifier can be one of MAX98357A/B, RT1015, or RT1015P. + +properties: + compatible: + enum: + - mediatek,mt8183_mt6358_ts3a227_max98357 + - mediatek,mt8183_mt6358_ts3a227_max98357b + - mediatek,mt8183_mt6358_ts3a227_rt1015 + - mediatek,mt8183_mt6358_ts3a227_rt1015p + + mediatek,platform: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the MT8183 ASoC platform node (e.g., AFE). + + mediatek,headset-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Phandle to the TS3A227 headset codec. + + mediatek,ec-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: | + Optional phandle to a ChromeOS EC codec node. + See bindings in google,cros-ec-codec.yaml. + + mediatek,hdmi-codec: + $ref: /schemas/types.yaml#/definitions/phandle + description: Optional phandle to an HDMI audio codec node. 
+ +required: + - compatible + - mediatek,platform + +additionalProperties: false + +examples: + - | + sound { + compatible = "mediatek,mt8183_mt6358_ts3a227_max98357"; + mediatek,headset-codec = <&ts3a227>; + mediatek,ec-codec = <&ec_codec>; + mediatek,hdmi-codec = <&it6505dptx>; + mediatek,platform = <&afe>; + }; + +... diff --git a/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt b/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt deleted file mode 100644 index 1f1cba4152ceec..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-afe-pcm.txt +++ /dev/null @@ -1,42 +0,0 @@ -Mediatek AFE PCM controller for mt8183 - -Required properties: -- compatible = "mediatek,mt68183-audio"; -- reg: register location and size -- interrupts: should contain AFE interrupt -- resets: Must contain an entry for each entry in reset-names - See ../reset/reset.txt for details. -- reset-names: should have these reset names: - "audiosys"; -- power-domains: should define the power domain -- clocks: Must contain an entry for each entry in clock-names -- clock-names: should have these clock names: - "infra_sys_audio_clk", - "mtkaif_26m_clk", - "top_mux_audio", - "top_mux_aud_intbus", - "top_sys_pll3_d4", - "top_clk26m_clk"; - -Example: - - afe: mt8183-afe-pcm@11220000 { - compatible = "mediatek,mt8183-audio"; - reg = <0 0x11220000 0 0x1000>; - interrupts = ; - resets = <&watchdog MT8183_TOPRGU_AUDIO_SW_RST>; - reset-names = "audiosys"; - power-domains = <&scpsys MT8183_POWER_DOMAIN_AUDIO>; - clocks = <&infrasys CLK_INFRA_AUDIO>, - <&infrasys CLK_INFRA_AUDIO_26M_BCLK>, - <&topckgen CLK_TOP_MUX_AUDIO>, - <&topckgen CLK_TOP_MUX_AUD_INTBUS>, - <&topckgen CLK_TOP_SYSPLL_D2_D4>, - <&clk26m>; - clock-names = "infra_sys_audio_clk", - "mtkaif_26m_clk", - "top_mux_audio", - "top_mux_aud_intbus", - "top_sys_pll_d2_d4", - "top_clk26m_clk"; - }; diff --git a/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt b/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt deleted file mode 100644 index f276dfc74b4654..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-da7219-max98357.txt +++ /dev/null @@ -1,21 +0,0 @@ -MT8183 with MT6358, DA7219, MAX98357, and RT1015 CODECS - -Required properties: -- compatible : "mediatek,mt8183_da7219_max98357" for MAX98357A codec - "mediatek,mt8183_da7219_rt1015" for RT1015 codec - "mediatek,mt8183_da7219_rt1015p" for RT1015P codec -- mediatek,headset-codec: the phandles of da7219 codecs -- mediatek,platform: the phandle of MT8183 ASoC platform - -Optional properties: -- mediatek,hdmi-codec: the phandles of HDMI codec - -Example: - - sound { - compatible = "mediatek,mt8183_da7219_max98357"; - mediatek,headset-codec = <&da7219>; - mediatek,hdmi-codec = <&it6505dptx>; - mediatek,platform = <&afe>; - }; - diff --git a/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt b/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt deleted file mode 100644 index ecd46ed8eb98b9..00000000000000 --- a/Documentation/devicetree/bindings/sound/mt8183-mt6358-ts3a227-max98357.txt +++ /dev/null @@ -1,25 +0,0 @@ -MT8183 with MT6358, TS3A227, MAX98357, and RT1015 CODECS - -Required properties: -- compatible : "mediatek,mt8183_mt6358_ts3a227_max98357" for MAX98357A codec - "mediatek,mt8183_mt6358_ts3a227_max98357b" for MAX98357B codec - "mediatek,mt8183_mt6358_ts3a227_rt1015" for RT1015 codec - "mediatek,mt8183_mt6358_ts3a227_rt1015p" for RT1015P codec -- mediatek,platform: the phandle of MT8183 ASoC 
platform - -Optional properties: -- mediatek,headset-codec: the phandles of ts3a227 codecs -- mediatek,ec-codec: the phandle of EC codecs. - See google,cros-ec-codec.txt for more details. -- mediatek,hdmi-codec: the phandles of HDMI codec - -Example: - - sound { - compatible = "mediatek,mt8183_mt6358_ts3a227_max98357"; - mediatek,headset-codec = <&ts3a227>; - mediatek,ec-codec = <&ec_codec>; - mediatek,hdmi-codec = <&it6505dptx>; - mediatek,platform = <&afe>; - }; - diff --git a/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml b/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml new file mode 100644 index 00000000000000..6e2f103be1d324 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/qcom,pm4125-codec.yaml @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/qcom,pm4125-codec.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PM4125 Audio Codec + +maintainers: + - Alexey Klimov + +description: + The audio codec IC found on Qualcomm PM4125/PM2250 PMIC. + It has RX and TX Soundwire slave devices. + +allOf: + - $ref: dai-common.yaml# + +properties: + compatible: + const: qcom,pm4125-codec + + reg: + description: + Specifies the SPMI base address for the audio codec peripherals. The + address space contains reset register needed to power-on the codec. + maxItems: 1 + + reg-names: + maxItems: 1 + + vdd-io-supply: + description: A reference to the 1.8V I/O supply + + vdd-cp-supply: + description: A reference to the charge pump I/O supply + + vdd-mic-bias-supply: + description: A reference to the 3.3V mic bias supply + + vdd-pa-vpos-supply: + description: A reference to the PA VPOS supply + + qcom,tx-device: + $ref: /schemas/types.yaml#/definitions/phandle-array + description: A reference to Soundwire tx device phandle + + qcom,rx-device: + $ref: /schemas/types.yaml#/definitions/phandle-array + description: A reference to Soundwire rx device phandle + + qcom,micbias1-microvolt: + description: micbias1 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,micbias2-microvolt: + description: micbias2 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,micbias3-microvolt: + description: micbias3 voltage + minimum: 1800000 + maximum: 2850000 + + qcom,mbhc-buttons-vthreshold-microvolt: + description: + Array of 8 Voltage threshold values corresponding to headset + button0 - button7 + minItems: 8 + maxItems: 8 + + '#sound-dai-cells': + const: 1 + +required: + - compatible + - reg + - vdd-io-supply + - vdd-cp-supply + - vdd-mic-bias-supply + - vdd-pa-vpos-supply + - qcom,tx-device + - qcom,rx-device + - qcom,micbias1-microvolt + - qcom,micbias2-microvolt + - qcom,micbias3-microvolt + - '#sound-dai-cells' + +unevaluatedProperties: false + +examples: + - | + #include + + spmi { + #address-cells = <2>; + #size-cells = <0>; + + pmic { + #address-cells = <1>; + #size-cells = <0>; + + audio-codec@f000 { + compatible = "qcom,pm4125-codec"; + reg = <0xf000>; + vdd-io-supply = <&pm4125_l15>; + vdd-cp-supply = <&pm4125_s4>; + vdd-pa-vpos-supply = <&pm4125_s4>; + vdd-mic-bias-supply = <&pm4125_l22>; + qcom,micbias1-microvolt = <1800000>; + qcom,micbias2-microvolt = <1800000>; + qcom,micbias3-microvolt = <1800000>; + qcom,rx-device = <&pm4125_rx>; + qcom,tx-device = <&pm4125_tx>; + #sound-dai-cells = <1>; + }; + }; + }; + + /* ... 
*/ + + soundwire@a610000 { + reg = <0x0a610000 0x2000>; + #address-cells = <2>; + #size-cells = <0>; + pm4125_rx: audio-codec@0,4 { + compatible = "sdw20217010c00"; + reg = <0 4>; + qcom,rx-port-mapping = <1 3>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml b/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml new file mode 100644 index 00000000000000..23624f32ac3058 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/qcom,pm4125-sdw.yaml @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/qcom,pm4125-sdw.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SoundWire Slave devices on PM4125/PM2250 PMIC audio codec + +maintainers: + - Alexey Klimov + +description: + The audio codec IC found on Qualcomm PM4125/PM2250 PMICs. + It has RX and TX Soundwire slave devices. + +properties: + compatible: + const: sdw20217010c00 + + reg: + maxItems: 1 + + qcom,tx-port-mapping: + description: | + Specifies the static port mapping between device and host tx ports, + in the order of the device port indexes, which are adc1_port, + adc23_port, dmic03_mbhc_port, dmic46_port. + Supports a maximum of 2 tx soundwire ports. + + PM4125 TX Port 1 (ADC1,2 & DMIC0 & MBHC) <=> SWR0 Port 1 + PM4125 TX Port 2 (ADC1 & DMIC0,1,2 & MBHC) <=> SWR0 Port 2 + + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 2 + maxItems: 2 + items: + enum: [1, 2, 3, 4] + + qcom,rx-port-mapping: + description: | + Specifies the static port mapping between device and host rx ports, + in the order of the device port indexes, which are hph_port, clsh_port, + comp_port, lo_port, dsd_port. + Supports a maximum of 2 rx soundwire ports. + + PM4125 RX Port 1 (HPH_L/R) <==> SWR1 Port 1 (HPH_L/R) + PM4125 RX Port 2 (COMP_L/R) <==> SWR1 Port 3 (COMP_L/R) + + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 2 + maxItems: 2 + items: + enum: [1, 2, 3, 4, 5] + +required: + - compatible + - reg + +oneOf: + - required: + - qcom,tx-port-mapping + - required: + - qcom,rx-port-mapping + +additionalProperties: false + +examples: + - | + soundwire@a610000 { + reg = <0x0a610000 0x2000>; + #address-cells = <2>; + #size-cells = <0>; + pm4125_rx: codec@0,1 { + compatible = "sdw20217010c00"; + reg = <0 1>; + qcom,rx-port-mapping = <1 3>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml b/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml new file mode 100644 index 00000000000000..a757f737690c18 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/ti,pcm1754.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/ti,pcm1754.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments PCM1754 Stereo DAC + +description: + The PCM1754 is a simple stereo DAC that is controlled via hardware GPIOs. 
+ +maintainers: + - Stefan Kerkmann + +allOf: + - $ref: dai-common.yaml# + +properties: + compatible: + enum: + - ti,pcm1754 + + vcc-supply: true + + '#sound-dai-cells': + const: 0 + + format-gpios: + maxItems: 1 + description: + GPIO used to select the PCM format + + mute-gpios: + maxItems: 1 + description: + GPIO used to mute all outputs + +required: + - compatible + - '#sound-dai-cells' + - vcc-supply + +additionalProperties: false + +examples: + - | + #include + codec { + compatible = "ti,pcm1754"; + #sound-dai-cells = <0>; + + vcc-supply = <&vcc_reg>; + mute-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; + format-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>; + }; diff --git a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml index 011211112be4be..bd00afa47d62b3 --- a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml +++ b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml @@ -11,11 +11,13 @@ maintainers: - Shenghao Ding description: | - The TAS2118/TAS2X20/TAS257x is mono, digital input Class-D audio + The TAS2118/TAS2X20 is a mono, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. - Integrated speaker voltage and current sense provides for - real time monitoring of loudspeaker behavior. + The TAS257x is a mono, digital input Class-D audio amplifier optimized + for efficiently driving high peak power into small loudspeakers. + Integrated speaker voltage and current sense provides for real time + monitoring of loudspeaker behavior. The TAS2563/TAS2781 is a mono, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. An integrated on-chip DSP supports Texas @@ -25,9 +27,7 @@ description: | The TAS5825/TAS5827 is a stereo, digital input Class-D audio amplifier optimized for efficiently driving high peak power into small loudspeakers. An integrated on-chip DSP supports Texas - Instruments Smart Amp speaker protection algorithm. The - integrated speaker voltage and current sense provides for real time - monitoring of loudspeaker behavior. + Instruments Smart Amp speaker protection algorithm. Specifications about the audio amplifier can be found at: https://www.ti.com/lit/gpn/tas2120 @@ -131,6 +131,7 @@ allOf: contains: enum: - ti,tas2563 + - ti,tas5825 then: properties: reg: @@ -181,15 +182,14 @@ allOf: compatible: contains: enum: - - ti,tas5825 - ti,tas5827 then: properties: reg: - maxItems: 4 + maxItems: 6 items: - minimum: 0x4c - maximum: 0x4f + minimum: 0x60 + maximum: 0x65 additionalProperties: false diff --git a/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml b/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml new file mode 100644 index 00000000000000..b4cef838bcd40b --- /dev/null +++ b/Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2025 Amlogic, Inc. All rights reserved +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/spi/amlogic,a4-spifc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SPI flash controller for Amlogic ARM SoCs + +maintainers: + - Liang Yang + - Feng Chen + - Xianwei Zhao + +description: + The Amlogic SPI flash controller is an extended version of the Amlogic NAND + flash controller. It supports SPI NOR flash and SPI NAND flash (where the + host ECC HW engine can be enabled). 
+ +allOf: + - $ref: /schemas/spi/spi-controller.yaml# + +properties: + compatible: + const: amlogic,a4-spifc + + reg: + maxItems: 1 + + clocks: + items: + - description: clock apb gate + - description: clock used for the controller + + clock-names: + items: + - const: gate + - const: core + + interrupts: + maxItems: 1 + + amlogic,rx-adj: + description: + Number of clock cycles by which sampling is delayed. + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + +required: + - compatible + - reg + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + sfc0: spi@fe08d000 { + compatible = "amlogic,a4-spifc"; + reg = <0xfe08d000 0x800>; + clocks = <&clkc_periphs 31>, + <&clkc_periphs 102>; + clock-names = "gate", "core"; + + pinctrl-0 = <&spiflash_default>; + pinctrl-names = "default"; + + #address-cells = <1>; + #size-cells = <0>; + + flash@0 { + compatible = "spi-nand"; + reg = <0>; + #address-cells = <1>; + #size-cells = <1>; + nand-ecc-engine = <&sfc0>; + nand-ecc-strength = <8>; + nand-ecc-step-size = <512>; + }; + }; diff --git a/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml b/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml index d29772994cf5f1..11885d0cc2099f 100644 --- a/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml +++ b/Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml @@ -31,11 +31,16 @@ properties: maxItems: 1 clock-names: - contains: - const: spi_clk + items: + - const: spi_clk + - const: spi_gclk + minItems: 1 clocks: - maxItems: 1 + items: + - description: Peripheral Bus clock + - description: Programmable Generic clock + minItems: 1 dmas: items: diff --git a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml index b0d99bc105352c..30ab42c95c0894 100644 --- a/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml +++ b/Documentation/devicetree/bindings/spi/atmel,quadspi.yaml @@ -17,6 +17,9 @@ properties: enum: - atmel,sama5d2-qspi - microchip,sam9x60-qspi + - microchip,sam9x7-ospi + - microchip,sama7d65-qspi + - microchip,sama7d65-ospi - microchip,sama7g5-qspi - microchip,sama7g5-ospi diff --git a/Documentation/devicetree/bindings/spi/samsung,spi.yaml b/Documentation/devicetree/bindings/spi/samsung,spi.yaml index fe298d47b1a905..1ce8b2770a4aaf 100644 --- a/Documentation/devicetree/bindings/spi/samsung,spi.yaml +++ b/Documentation/devicetree/bindings/spi/samsung,spi.yaml @@ -18,7 +18,6 @@ properties: oneOf: - enum: - google,gs101-spi - - samsung,s3c2443-spi # for S3C2443, S3C2416 and S3C2450 - samsung,s3c6410-spi - samsung,s5pv210-spi # for S5PV210 and S5PC110 - samsung,exynos4210-spi diff --git a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml index cf47a1f3b3847d..25efedced58424 100644 --- a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml +++ b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.yaml @@ -18,6 +18,7 @@ description: The SOCTHERM IP block contains thermal sensors, support for properties: compatible: enum: + - nvidia,tegra114-soctherm - nvidia,tegra124-soctherm - nvidia,tegra132-soctherm - nvidia,tegra210-soctherm @@ -206,6 +207,7 @@ allOf: compatible: contains: enum: + - nvidia,tegra114-soctherm - nvidia,tegra124-soctherm - nvidia,tegra210-soctherm - nvidia,tegra210b01-soctherm diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml 
b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 94311ebd7652d4..78e2f6573b96f2 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -49,11 +49,13 @@ properties: - description: v2 of TSENS items: - enum: + - qcom,glymur-tsens - qcom,milos-tsens - qcom,msm8953-tsens - qcom,msm8996-tsens - qcom,msm8998-tsens - qcom,qcm2290-tsens + - qcom,qcs615-tsens - qcom,sa8255p-tsens - qcom,sa8775p-tsens - qcom,sar2130p-tsens diff --git a/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml b/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml new file mode 100644 index 00000000000000..573e2b9d37524b --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/renesas,r9a08g045-tsu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/G3S Thermal Sensor Unit + +description: + The thermal sensor unit (TSU) measures the temperature (Tj) inside + the LSI. + +maintainers: + - Claudiu Beznea + +$ref: thermal-sensor.yaml# + +properties: + compatible: + const: renesas,r9a08g045-tsu + + reg: + maxItems: 1 + + clocks: + items: + - description: TSU module clock + + power-domains: + maxItems: 1 + + resets: + items: + - description: TSU module reset + + io-channels: + items: + - description: ADC channel which reports the TSU temperature + + io-channel-names: + items: + - const: tsu + + "#thermal-sensor-cells": + const: 0 + +required: + - compatible + - reg + - clocks + - power-domains + - resets + - io-channels + - io-channel-names + - '#thermal-sensor-cells' + +additionalProperties: false + +examples: + - | + #include + + tsu: thermal@10059000 { + compatible = "renesas,r9a08g045-tsu"; + reg = <0x10059000 0x1000>; + clocks = <&cpg CPG_MOD R9A08G045_TSU_PCLK>; + resets = <&cpg R9A08G045_TSU_PRESETN>; + power-domains = <&cpg>; + #thermal-sensor-cells = <0>; + io-channels = <&adc 8>; + io-channel-names = "tsu"; + }; + + thermal-zones { + cpu-thermal { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsu>; + + trips { + sensor_crit: sensor-crit { + temperature = <125000>; + hysteresis = <1000>; + type = "critical"; + }; + target: trip-point { + temperature = <100000>; + hysteresis = <1000>; + type = "passive"; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml new file mode 100644 index 00000000000000..8d3f3c24f0f270 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/renesas,r9a09g047-tsu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/G3E Temperature Sensor Unit (TSU) + +maintainers: + - John Madieu + +description: + The Temperature Sensor Unit (TSU) is an integrated thermal sensor that + monitors the chip temperature on the Renesas RZ/G3E SoC. The TSU provides + real-time temperature measurements for thermal management. 
+ +properties: + compatible: + const: renesas,r9a09g047-tsu + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + + power-domains: + maxItems: 1 + + interrupts: + items: + - description: Conversion complete interrupt signal (pulse) + - description: Comparison result interrupt signal (level) + + interrupt-names: + items: + - const: adi + - const: adcmpi + + "#thermal-sensor-cells": + const: 0 + + renesas,tsu-trim: + $ref: /schemas/types.yaml#/definitions/phandle-array + items: + - items: + - description: phandle to system controller + - description: offset of trim registers + description: + Phandle and offset to the system controller containing the TSU + calibration trim values. The offset points to the first trim register + (OTPTSU1TRMVAL0), with the second trim register (OTPTSU1TRMVAL1) located + at offset + 4. + +required: + - compatible + - reg + - clocks + - resets + - power-domains + - interrupts + - interrupt-names + - "#thermal-sensor-cells" + - renesas,tsu-trim + +additionalProperties: false + +examples: + - | + #include + #include + + thermal-sensor@14002000 { + compatible = "renesas,r9a09g047-tsu"; + reg = <0x14002000 0x1000>; + clocks = <&cpg CPG_MOD 0x10a>; + resets = <&cpg 0xf8>; + power-domains = <&cpg>; + interrupts = , + ; + interrupt-names = "adi", "adcmpi"; + #thermal-sensor-cells = <0>; + renesas,tsu-trim = <&sys 0x330>; + }; diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml b/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml index 573f447cc26ed7..9fa5c4c49d76e3 100644 --- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml @@ -119,6 +119,21 @@ required: - resets allOf: + - if: + properties: + compatible: + contains: + enum: + - rockchip,px30-tsadc + - rockchip,rk3366-tsadc + - rockchip,rk3399-tsadc + - rockchip,rk3568-tsadc + then: + required: + - rockchip,grf + else: + properties: + rockchip,grf: false - if: not: properties: diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt deleted file mode 100644 index 3cb2f4c98d6436..00000000000000 --- a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt +++ /dev/null @@ -1,38 +0,0 @@ -Faraday Technology timer - -This timer is a generic IP block from Faraday Technology, embedded in the -Cortina Systems Gemini SoCs and other designs. 
- -Required properties: - -- compatible : Must be one of - "faraday,fttmr010" - "cortina,gemini-timer", "faraday,fttmr010" - "moxa,moxart-timer", "faraday,fttmr010" - "aspeed,ast2400-timer" - "aspeed,ast2500-timer" - "aspeed,ast2600-timer" - -- reg : Should contain registers location and length -- interrupts : Should contain the three timer interrupts usually with - flags for falling edge - -Optionally required properties: - -- clocks : a clock to provide the tick rate for "faraday,fttmr010" -- clock-names : should be "EXTCLK" and "PCLK" for the external tick timer - and peripheral clock respectively, for "faraday,fttmr010" -- syscon : a phandle to the global Gemini system controller if the compatible - type is "cortina,gemini-timer" - -Example: - -timer@43000000 { - compatible = "faraday,fttmr010"; - reg = <0x43000000 0x1000>; - interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ - <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ - <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ - clocks = <&extclk>, <&pclk>; - clock-names = "EXTCLK", "PCLK"; -}; diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml new file mode 100644 index 00000000000000..39506323556c57 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/faraday,fttmr010.yaml @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/faraday,fttmr010.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Faraday FTTMR010 timer + +maintainers: + - Joel Stanley + - Linus Walleij + +description: + This timer is a generic IP block from Faraday Technology, embedded in the + Cortina Systems Gemini SoCs and other designs. + +properties: + compatible: + oneOf: + - items: + - const: moxa,moxart-timer + - const: faraday,fttmr010 + - enum: + - aspeed,ast2400-timer + - aspeed,ast2500-timer + - aspeed,ast2600-timer + - cortina,gemini-timer + - faraday,fttmr010 + + reg: + maxItems: 1 + + interrupts: + minItems: 1 + maxItems: 8 + description: One interrupt per timer + + clocks: + minItems: 1 + items: + - description: Peripheral clock + - description: External tick clock + + clock-names: + minItems: 1 + items: + - const: PCLK + - const: EXTCLK + + resets: + maxItems: 1 + + syscon: + description: System controller phandle for Gemini systems + $ref: /schemas/types.yaml#/definitions/phandle + +required: + - compatible + - reg + - interrupts + +allOf: + - if: + properties: + compatible: + contains: + const: cortina,gemini-timer + then: + required: + - syscon + else: + properties: + syscon: false + +additionalProperties: false + +examples: + - | + #include + + timer@43000000 { + compatible = "faraday,fttmr010"; + reg = <0x43000000 0x1000>; + interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ + <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ + <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ + clocks = <&pclk>, <&extclk>; + clock-names = "PCLK", "EXTCLK"; + }; diff --git a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml index 0e4a8ddc3de327..e3b61b62521e8b 100644 --- a/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,ftm-timer.yaml @@ -14,7 +14,9 @@ properties: const: fsl,ftm-timer reg: - maxItems: 1 + items: + - description: clock event device + - description: clock source device interrupts: maxItems: 1 @@ -50,7 +52,8 @@ examples: ftm@400b8000 { compatible = 
"fsl,ftm-timer"; - reg = <0x400b8000 0x1000>; + reg = <0x400b8000 0x1000>, + <0x400b9000 0x1000>; interrupts = <0 44 IRQ_TYPE_LEVEL_HIGH>; clock-names = "ftm-evt", "ftm-src", "ftm-evt-counter-en", "ftm-src-counter-en"; clocks = <&clks VF610_CLK_FTM2>, <&clks VF610_CLK_FTM3>, diff --git a/Documentation/devicetree/bindings/timer/fsl,timrot.yaml b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml new file mode 100644 index 00000000000000..d181f274ef9f89 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/fsl,timrot.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/fsl,timrot.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Freescale MXS Timer + +maintainers: + - Frank Li + +properties: + compatible: + items: + - enum: + - fsl,imx23-timrot + - fsl,imx28-timrot + - const: fsl,timrot + + reg: + maxItems: 1 + + interrupts: + items: + - description: irq for timer0 + - description: irq for timer1 + - description: irq for timer2 + - description: irq for timer3 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + +additionalProperties: false + +examples: + - | + timer: timer@80068000 { + compatible = "fsl,imx28-timrot", "fsl,timrot"; + reg = <0x80068000 0x2000>; + interrupts = <48>, <49>, <50>, <51>; + clocks = <&clks 26>; + }; diff --git a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml index bee2c35bd0e293..42e130654d58e0 100644 --- a/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml +++ b/Documentation/devicetree/bindings/timer/fsl,vf610-pit.yaml @@ -15,8 +15,13 @@ description: properties: compatible: - enum: - - fsl,vf610-pit + oneOf: + - enum: + - fsl,vf610-pit + - nxp,s32g2-pit + - items: + - const: nxp,s32g3-pit + - const: nxp,s32g2-pit reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml index f68fc7050c5687..e3e38066c2cb72 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,timer.yaml +++ b/Documentation/devicetree/bindings/timer/mediatek,timer.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - mediatek,mt2701-timer + - mediatek,mt6572-timer - mediatek,mt6580-timer - mediatek,mt6582-timer - mediatek,mt6589-timer @@ -44,6 +45,7 @@ properties: - mediatek,mt8188-timer - mediatek,mt8192-timer - mediatek,mt8195-timer + - mediatek,mt8196-timer - mediatek,mt8365-systimer - const: mediatek,mt6765-timer diff --git a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml index 3931054b42fb97..3ad10c5b66ba54 100644 --- a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml @@ -221,7 +221,10 @@ properties: maxItems: 1 "#pwm-cells": - const: 2 + oneOf: + - const: 2 + deprecated: true + - const: 3 required: - compatible @@ -299,5 +302,5 @@ examples: clocks = <&cpg CPG_MOD R9A07G044_MTU_X_MCK_MTU3>; power-domains = <&cpg>; resets = <&cpg R9A07G044_MTU_X_PRESET_MTU3>; - #pwm-cells = <2>; + #pwm-cells = <3>; }; diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml index 10578f54458115..a4b229e0e78aa7 100644 --- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml +++ 
b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - axis,artpec8-mct + - axis,artpec9-mct - google,gs101-mct - samsung,exynos2200-mct-peris - samsung,exynos3250-mct @@ -131,6 +132,7 @@ allOf: contains: enum: - axis,artpec8-mct + - axis,artpec9-mct - google,gs101-mct - samsung,exynos2200-mct-peris - samsung,exynos5260-mct diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index f3dd18681aa6f8..12cb7f64b98819 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -293,10 +293,20 @@ properties: - mps,mp2856 # Monolithic Power Systems Inc. multi-phase controller mp2857 - mps,mp2857 + # Monolithic Power Systems Inc. multi-phase controller mp2869 + - mps,mp2869 # Monolithic Power Systems Inc. multi-phase controller mp2888 - mps,mp2888 # Monolithic Power Systems Inc. multi-phase controller mp2891 - mps,mp2891 + # Monolithic Power Systems Inc. multi-phase controller mp29502 + - mps,mp29502 + # Monolithic Power Systems Inc. multi-phase controller mp29608 + - mps,mp29608 + # Monolithic Power Systems Inc. multi-phase controller mp29612 + - mps,mp29612 + # Monolithic Power Systems Inc. multi-phase controller mp29816 + - mps,mp29816 # Monolithic Power Systems Inc. multi-phase controller mp2993 - mps,mp2993 # Monolithic Power Systems Inc. hot-swap protection device @@ -305,6 +315,8 @@ properties: - mps,mp5920 # Monolithic Power Systems Inc. multi-phase hot-swap controller mp5990 - mps,mp5990 + # Monolithic Power Systems Inc. multi-phase hot-swap controller mp5998 + - mps,mp5998 # Monolithic Power Systems Inc. digital step-down converter mp9941 - mps,mp9941 # Temperature sensor with integrated fan control @@ -362,6 +374,9 @@ properties: # Sensirion low power multi-pixel gas sensor with I2C interface - sensirion,sgpc3 # Sensirion temperature & humidity sensor with I2C interface + - sensirion,sht20 + - sensirion,sht21 + - sensirion,sht25 - sensirion,sht4x # Sensortek 3 axis accelerometer - sensortek,stk8312 diff --git a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml index 179272f74de5fb..0821ba0e84a3ca 100644 --- a/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml @@ -11,14 +11,19 @@ maintainers: description: | This module is part of the sl28cpld multi-function device. For more - details see ../mfd/kontron,sl28cpld.yaml. + details see ../embedded-controller/kontron,sl28cpld.yaml. allOf: - $ref: watchdog.yaml# properties: compatible: - const: kontron,sl28cpld-wdt + oneOf: + - items: + - enum: + - kontron,sa67mcu-wdt + - const: kontron,sl28cpld-wdt + - const: kontron,sl28cpld-wdt reg: maxItems: 1 diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index 4fd1cbd8296e19..069b54d8591bde 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -94,6 +94,71 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1:: For more information about the ACPI GPIO bindings see Documentation/firmware-guide/acpi/gpio-properties.rst. +Software Nodes +-------------- + +Software nodes allow board-specific code to construct an in-memory, +device-tree-like structure using struct software_node and struct +property_entry. 
This structure can then be associated with a platform device, allowing drivers to use the standard device properties API to query configuration, just as they would on an ACPI or device tree system. + +Software-node-backed GPIOs are described using the ``PROPERTY_ENTRY_GPIO()`` macro, which ties a software node representing the GPIO controller to the consumer device. It allows consumers to use regular gpiolib APIs, such as gpiod_get() or gpiod_get_optional(). + +The software node representing a GPIO controller need not be attached to the GPIO controller device. The only requirement is that the node must be registered and its name must match the GPIO controller's label. + +For example, here is how to describe a single GPIO-connected LED. This is an alternative to using platform_data on legacy systems. + +.. code-block:: c + + #include + #include + #include + + /* + * 1. Define a node for the GPIO controller. Its .name must match the + * controller's label. + */ + static const struct software_node gpio_controller_node = { + .name = "gpio-foo", + }; + + /* 2. Define the properties for the LED device. */ + static const struct property_entry led_device_props[] = { + PROPERTY_ENTRY_STRING("label", "myboard:green:status"), + PROPERTY_ENTRY_STRING("linux,default-trigger", "heartbeat"), + PROPERTY_ENTRY_GPIO("gpios", &gpio_controller_node, 42, GPIO_ACTIVE_HIGH), + { } + }; + + /* 3. Define the software node for the LED device. */ + static const struct software_node led_device_swnode = { + .name = "status-led", + .properties = led_device_props, + }; + + /* + * 4. Register the software nodes and the platform device. + */ + const struct software_node *swnodes[] = { + &gpio_controller_node, + &led_device_swnode, + NULL + }; + software_node_register_node_group(swnodes); + + // Then register a platform_device for "leds-gpio" and associate + // it with &led_device_swnode via .fwnode. + +For a complete guide on converting board files to use software nodes, see +Documentation/driver-api/gpio/legacy-boards.rst. + Platform Data ------------- Finally, GPIOs can be bound to devices and functions using platform data. Board diff --git a/Documentation/driver-api/gpio/index.rst b/Documentation/driver-api/gpio/index.rst index 43f6a3afe10b55..87929840e85a29 100644 --- a/Documentation/driver-api/gpio/index.rst +++ b/Documentation/driver-api/gpio/index.rst @@ -12,6 +12,7 @@ Contents: driver consumer board + legacy-boards drivers-on-gpio bt8xxgpio diff --git a/Documentation/driver-api/gpio/legacy-boards.rst b/Documentation/driver-api/gpio/legacy-boards.rst new file mode 100644 index 00000000000000..46e3a26dba772e --- /dev/null +++ b/Documentation/driver-api/gpio/legacy-boards.rst @@ -0,0 +1,298 @@ +Supporting Legacy Boards +======================== + +Many drivers in the kernel, such as ``leds-gpio`` and ``gpio-keys``, are +migrating away from using board-specific ``platform_data`` to a unified device +properties interface. This interface allows drivers to be simpler and more +generic, as they can query properties in a standardized way. + +On modern systems, these properties are provided via device tree. However, some +older platforms have not been converted to device tree and instead rely on +board files to describe their hardware configuration. To bridge this gap and +allow these legacy boards to work with modern, generic drivers, the kernel +provides a mechanism called **software nodes**. 
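+
+As a quick, hypothetical sketch of the consumer side, the probe routine below
+reads a property through the standard device properties API and resolves a
+GPIO through gpiolib; it neither knows nor cares whether those came from
+device tree, ACPI, or a board file's software nodes. The driver name, the
+"label" property, and the bare "gpios" lookup are illustrative only; the
+board-side setup that makes such lookups resolvable is what the rest of this
+document walks through.
+
+.. code-block:: c
+
+    #include <linux/gpio/consumer.h>
+    #include <linux/platform_device.h>
+    #include <linux/property.h>
+
+    /* Hypothetical consumer driver probe. */
+    static int mydrv_probe(struct platform_device *pdev)
+    {
+            struct device *dev = &pdev->dev;
+            struct gpio_desc *gpiod;
+            const char *label;
+            int error;
+
+            /* Same call regardless of the underlying firmware node type. */
+            error = device_property_read_string(dev, "label", &label);
+            if (error)
+                    return error;
+
+            /* A NULL con_id resolves the bare "gpios" property. */
+            gpiod = devm_gpiod_get(dev, NULL, GPIOD_OUT_LOW);
+            if (IS_ERR(gpiod))
+                    return PTR_ERR(gpiod);
+
+            dev_info(dev, "%s: GPIO acquired\n", label);
+            return 0;
+    }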
+ +This document provides a guide on how to convert a legacy board file from using ``platform_data`` and ``gpiod_lookup_table`` to the modern software node approach for describing GPIO-connected devices. + +The Core Idea: Software Nodes +----------------------------- + +Software nodes allow board-specific code to construct an in-memory, +device-tree-like structure using struct software_node and struct +property_entry. This structure can then be associated with a platform device, +allowing drivers to use the standard device properties API (e.g., +device_property_read_u32(), device_property_read_string()) to query +configuration, just as they would on an ACPI or device tree system. + +The gpiolib code has support for handling software nodes, so that if a GPIO is +described properly, as detailed in the section below, regular gpiolib APIs, +such as gpiod_get(), gpiod_get_optional(), and others, will work. + +Requirements for GPIO Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using software nodes to describe GPIO connections, the following +requirements must be met for the GPIO core to correctly resolve the reference: + +1. **The GPIO controller's software node "name" must match the controller's + "label".** The gpiolib core uses this name to find the corresponding + struct gpio_chip at runtime. + This software node has to be registered, but need not be attached to the + device representing the GPIO controller that is providing the GPIO in + question. It may be left as a "free floating" node. + +2. **The GPIO property must be a reference.** The ``PROPERTY_ENTRY_GPIO()`` + macro handles this as it is an alias for ``PROPERTY_ENTRY_REF()``. + +3. **The reference must have exactly two arguments:** + + - The first argument is the GPIO offset within the controller. + - The second argument is the flags for the GPIO line (e.g., + GPIO_ACTIVE_HIGH, GPIO_ACTIVE_LOW). + +The ``PROPERTY_ENTRY_GPIO()`` macro is the preferred way of defining GPIO +properties in software nodes. + +Conversion Example +------------------ + +Let's walk through an example of converting a board file that defines a +GPIO-connected LED and a button. + +Before: Using Platform Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A typical legacy board file might look like this: + +.. 
code-block:: c + + #include + #include + #include + #include + + #define MYBOARD_GPIO_CONTROLLER "gpio-foo" + + /* LED setup */ + static const struct gpio_led myboard_leds[] = { + { + .name = "myboard:green:status", + .default_trigger = "heartbeat", + }, + }; + + static const struct gpio_led_platform_data myboard_leds_pdata = { + .num_leds = ARRAY_SIZE(myboard_leds), + .leds = myboard_leds, + }; + + static struct gpiod_lookup_table myboard_leds_gpios = { + .dev_id = "leds-gpio", + .table = { + GPIO_LOOKUP_IDX(MYBOARD_GPIO_CONTROLLER, 42, NULL, 0, GPIO_ACTIVE_HIGH), + { }, + }, + }; + + /* Button setup */ + static struct gpio_keys_button myboard_buttons[] = { + { + .code = KEY_WPS_BUTTON, + .desc = "WPS Button", + .active_low = 1, + }, + }; + + static const struct gpio_keys_platform_data myboard_buttons_pdata = { + .buttons = myboard_buttons, + .nbuttons = ARRAY_SIZE(myboard_buttons), + }; + + static struct gpiod_lookup_table myboard_buttons_gpios = { + .dev_id = "gpio-keys", + .table = { + GPIO_LOOKUP_IDX(MYBOARD_GPIO_CONTROLLER, 15, NULL, 0, GPIO_ACTIVE_LOW), + { }, + }, + }; + + /* Device registration */ + static int __init myboard_init(void) + { + gpiod_add_lookup_table(&myboard_leds_gpios); + gpiod_add_lookup_table(&myboard_buttons_gpios); + + platform_device_register_data(NULL, "leds-gpio", -1, + &myboard_leds_pdata, sizeof(myboard_leds_pdata)); + platform_device_register_data(NULL, "gpio-keys", -1, + &myboard_buttons_pdata, sizeof(myboard_buttons_pdata)); + + return 0; + } + +After: Using Software Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here is how the same configuration can be expressed using software nodes. + +Step 1: Define the GPIO Controller Node +*************************************** + +First, define a software node that represents the GPIO controller that the +LEDs and buttons are connected to. The ``name`` of this node must match the +name of the driver for the GPIO controller (e.g., "gpio-foo"). + +.. code-block:: c + + #include + #include + + #define MYBOARD_GPIO_CONTROLLER "gpio-foo" + + static const struct software_node myboard_gpio_controller_node = { + .name = MYBOARD_GPIO_CONTROLLER, + }; + +Step 2: Define Consumer Device Nodes and Properties +*************************************************** + +Next, define the software nodes for the consumer devices (the LEDs and buttons). +This involves creating a parent node for each device type and child nodes for +each individual LED or button. + +.. 
code-block:: c + + /* LED setup */ + static const struct software_node myboard_leds_node = { + .name = "myboard-leds", + }; + + static const struct property_entry myboard_status_led_props[] = { + PROPERTY_ENTRY_STRING("label", "myboard:green:status"), + PROPERTY_ENTRY_STRING("linux,default-trigger", "heartbeat"), + PROPERTY_ENTRY_GPIO("gpios", &myboard_gpio_controller_node, 42, GPIO_ACTIVE_HIGH), + { } + }; + + static const struct software_node myboard_status_led_swnode = { + .name = "status-led", + .parent = &myboard_leds_node, + .properties = myboard_status_led_props, + }; + + /* Button setup */ + static const struct software_node myboard_keys_node = { + .name = "myboard-keys", + }; + + static const struct property_entry myboard_wps_button_props[] = { + PROPERTY_ENTRY_STRING("label", "WPS Button"), + PROPERTY_ENTRY_U32("linux,code", KEY_WPS_BUTTON), + PROPERTY_ENTRY_GPIO("gpios", &myboard_gpio_controller_node, 15, GPIO_ACTIVE_LOW), + { } + }; + + static const struct software_node myboard_wps_button_swnode = { + .name = "wps-button", + .parent = &myboard_keys_node, + .properties = myboard_wps_button_props, + }; + + + +Step 3: Group and Register the Nodes +************************************ + +For maintainability, it is often beneficial to group all software nodes into a +single array and register them with one call. + +.. code-block:: c + + static const struct software_node * const myboard_swnodes[] = { + &myboard_gpio_controller_node, + &myboard_leds_node, + &myboard_status_led_swnode, + &myboard_keys_node, + &myboard_wps_button_swnode, + NULL + }; + + static int __init myboard_init(void) + { + int error; + + error = software_node_register_node_group(myboard_swnodes); + if (error) { + pr_err("Failed to register software nodes: %d\n", error); + return error; + } + + // ... platform device registration follows + } + +.. note:: + When splitting registration of nodes by devices that they represent, it is + essential that the software node representing the GPIO controller itself + is registered first, before any of the nodes that reference it. + +Step 4: Register Platform Devices with Software Nodes +***************************************************** + +Finally, register the platform devices and associate them with their respective +software nodes using the ``fwnode`` field in struct platform_device_info. + +.. 
code-block:: c + + static struct platform_device *leds_pdev; + static struct platform_device *keys_pdev; + + static int __init myboard_init(void) + { + struct platform_device_info pdev_info; + int error; + + error = software_node_register_node_group(myboard_swnodes); + if (error) + return error; + + memset(&pdev_info, 0, sizeof(pdev_info)); + pdev_info.name = "leds-gpio"; + pdev_info.id = PLATFORM_DEVID_NONE; + pdev_info.fwnode = software_node_fwnode(&myboard_leds_node); + leds_pdev = platform_device_register_full(&pdev_info); + if (IS_ERR(leds_pdev)) { + error = PTR_ERR(leds_pdev); + goto err_unregister_nodes; + } + + memset(&pdev_info, 0, sizeof(pdev_info)); + pdev_info.name = "gpio-keys"; + pdev_info.id = PLATFORM_DEVID_NONE; + pdev_info.fwnode = software_node_fwnode(&myboard_keys_node); + keys_pdev = platform_device_register_full(&pdev_info); + if (IS_ERR(keys_pdev)) { + error = PTR_ERR(keys_pdev); + platform_device_unregister(leds_pdev); + goto err_unregister_nodes; + } + + return 0; + + err_unregister_nodes: + software_node_unregister_node_group(myboard_swnodes); + return error; + } + + static void __exit myboard_exit(void) + { + platform_device_unregister(keys_pdev); + platform_device_unregister(leds_pdev); + software_node_unregister_node_group(myboard_swnodes); + } + +With these changes, the generic ``leds-gpio`` and ``gpio-keys`` drivers will +be able to probe successfully and get their configuration from the properties +defined in the software nodes, removing the need for board-specific platform +data. diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 27ea1236307e84..8208924e513e1f 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -863,7 +863,7 @@ has to be handled by the ```` interface. Instead view thi a certain pin config setting. Look in e.g. ```` and you find this in the documentation: - PIN_CONFIG_OUTPUT: + PIN_CONFIG_LEVEL: this will configure the pin in output, use argument 1 to indicate high level, argument 0 to indicate low level. @@ -897,7 +897,7 @@ And your machine configuration may look like this: }; static unsigned long uart_sleep_mode[] = { - PIN_CONF_PACKED(PIN_CONFIG_OUTPUT, 0), + PIN_CONF_PACKED(PIN_CONFIG_LEVEL, 0), }; static struct pinctrl_map pinmap[] __initdata = { diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst deleted file mode 100644 index b29562a6bf555c..00000000000000 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ /dev/null @@ -1,186 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -bcachefs coding style -===================== - -Good development is like gardening, and codebases are our gardens. Tend to them -every day; look for little things that are out of place or in need of tidying. -A little weeding here and there goes a long way; don't wait until things have -spiraled out of control. - -Things don't always have to be perfect - nitpicking often does more harm than -good. But appreciate beauty when you see it - and let people know. - -The code that you are afraid to touch is the code most in need of refactoring. - -A little organizing here and there goes a long way. - -Put real thought into how you organize things. - -Good code is readable code, where the structure is simple and leaves nowhere -for bugs to hide. - -Assertions are one of our most important tools for writing reliable code. 
If in -the course of writing a patchset you encounter a condition that shouldn't -happen (and will have unpredictable or undefined behaviour if it does), or -you're not sure if it can happen and not sure how to handle it yet - make it a -BUG_ON(). Don't leave undefined or unspecified behavior lurking in the codebase. - -By the time you finish the patchset, you should understand better which -assertions need to be handled and turned into checks with error paths, and -which should be logically impossible. Leave the BUG_ON()s in for the ones which -are logically impossible. (Or, make them debug mode assertions if they're -expensive - but don't turn everything into a debug mode assertion, so that -we're not stuck debugging undefined behaviour should it turn out that you were -wrong). - -Assertions are documentation that can't go out of date. Good assertions are -wonderful. - -Good assertions drastically and dramatically reduce the amount of testing -required to shake out bugs. - -Good assertions are based on state, not logic. To write good assertions, you -have to think about what the invariants on your state are. - -Good invariants and assertions will hold everywhere in your codebase. This -means that you can run them in only a few places in the checked in version, but -should you need to debug something that caused the assertion to fail, you can -quickly shotgun them everywhere to find the codepath that broke the invariant. - -A good assertion checks something that the compiler could check for us, and -elide - if we were working in a language with embedded correctness proofs that -the compiler could check. This is something that exists today, but it'll likely -still be a few decades before it comes to systems programming languages. But we -can still incorporate that kind of thinking into our code and document the -invariants with runtime checks - much like the way people working in -dynamically typed languages may add type annotations, gradually making their -code statically typed. - -Looking for ways to make your assertions simpler - and higher level - will -often nudge you towards making the entire system simpler and more robust. - -Good code is code where you can poke around and see what it's doing - -introspection. We can't debug anything if we can't see what's going on. - -Whenever we're debugging, and the solution isn't immediately obvious, if the -issue is that we don't know where the issue is because we can't see what's -going on - fix that first. - -We have the tools to make anything visible at runtime, efficiently - RCU and -percpu data structures among them. Don't let things stay hidden. - -The most important tool for introspection is the humble pretty printer - in -bcachefs, this means `*_to_text()` functions, which output to printbufs. - -Pretty printers are wonderful, because they compose and you can use them -everywhere. Having functions to print whatever object you're working with will -make your error messages much easier to write (therefore they will actually -exist) and much more informative. And they can be used from sysfs/debugfs, as -well as tracepoints. - -Runtime info and debugging tools should come with clear descriptions and -labels, and good structure - we don't want files with a list of bare integers, -like in procfs. Part of the job of the debugging tools is to educate users and -new developers as to how the system works. - -Error messages should, whenever possible, tell you everything you need to debug -the issue. It's worth putting effort into them. 
- -Tracepoints shouldn't be the first thing you reach for. They're an important -tool, but always look for more immediate ways to make things visible. When we -have to rely on tracing, we have to know which tracepoints we're looking for, -and then we have to run the troublesome workload, and then we have to sift -through logs. This is a lot of steps to go through when a user is hitting -something, and if it's intermittent it may not even be possible. - -The humble counter is an incredibly useful tool. They're cheap and simple to -use, and many complicated internal operations with lots of things that can -behave weirdly (anything involving memory reclaim, for example) become -shockingly easy to debug once you have counters on every distinct codepath. - -Persistent counters are even better. - -When debugging, try to get the most out of every bug you come across; don't -rush to fix the initial issue. Look for things that will make related bugs -easier the next time around - introspection, new assertions, better error -messages, new debug tools, and do those first. Look for ways to make the system -better behaved; often one bug will uncover several other bugs through -downstream effects. - -Fix all that first, and then the original bug last - even if that means keeping -a user waiting. They'll thank you in the long run, and when they understand -what you're doing you'll be amazed at how patient they're happy to be. Users -like to help - otherwise they wouldn't be reporting the bug in the first place. - -Talk to your users. Don't isolate yourself. - -Users notice all sorts of interesting things, and by just talking to them and -interacting with them you can benefit from their experience. - -Spend time doing support and helpdesk stuff. Don't just write code - code isn't -finished until it's being used trouble free. - -This will also motivate you to make your debugging tools as good as possible, -and perhaps even your documentation, too. Like anything else in life, the more -time you spend at it the better you'll get, and you the developer are the -person most able to improve the tools to make debugging quick and easy. - -Be wary of how you take on and commit to big projects. Don't let development -become product-manager focused. Often time an idea is a good one but needs to -wait for its proper time - but you won't know if it's the proper time for an -idea until you start writing code. - -Expect to throw a lot of things away, or leave them half finished for later. -Nobody writes all perfect code that all gets shipped, and you'll be much more -productive in the long run if you notice this early and shift to something -else. The experience gained and lessons learned will be valuable for all the -other work you do. - -But don't be afraid to tackle projects that require significant rework of -existing code. Sometimes these can be the best projects, because they can lead -us to make existing code more general, more flexible, more multipurpose and -perhaps more robust. Just don't hesitate to abandon the idea if it looks like -it's going to make a mess of things. - -Complicated features can often be done as a series of refactorings, with the -final change that actually implements the feature as a quite small patch at the -end. It's wonderful when this happens, especially when those refactorings are -things that improve the codebase in their own right. When that happens there's -much less risk of wasted effort if the feature you were going for doesn't work -out. - -Always strive to work incrementally. 
Always strive to turn the big projects -into little bite sized projects that can prove their own merits. - -Instead of always tackling those big projects, look for little things that -will be useful, and make the big projects easier. - -The question of what's likely to be useful is where junior developers most -often go astray - doing something because it seems like it'll be useful often -leads to overengineering. Knowing what's useful comes from many years of -experience, or talking with people who have that experience - or from simply -reading lots of code and looking for common patterns and issues. Don't be -afraid to throw things away and do something simpler. - -Talk about your ideas with your fellow developers; often times the best things -come from relaxed conversations where people aren't afraid to say "what if?". - -Don't neglect your tools. - -The most important tools (besides the compiler and our text editor) are the -tools we use for testing. The shortest possible edit/test/debug cycle is -essential for working productively. We learn, gain experience, and discover the -errors in our thinking by running our code and seeing what happens. If your -time is being wasted because your tools are bad or too slow - don't accept it, -fix it. - -Put effort into your documentation, commit messages, and code comments - but -don't go overboard. A good commit message is wonderful - but if the information -was important enough to go in a commit message, ask yourself if it would be -even better as a code comment. - -A good code comment is wonderful, but even better is the comment that didn't -need to exist because the code was so straightforward as to be obvious; -organized into small clean and tidy modules, with clear and descriptive names -for functions and variables, where every line of code has a clear purpose. diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst deleted file mode 100644 index 18c79d5483911d..00000000000000 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ /dev/null @@ -1,105 +0,0 @@ -Submitting patches to bcachefs -============================== - -Here are suggestions for submitting patches to bcachefs subsystem. - -Submission checklist --------------------- - -Patches must be tested before being submitted, either with the xfstests suite -[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being -touched. Note that ktest wraps xfstests and will be an easier method to running -it for most users; it includes single-command wrappers for all the mainstream -in-kernel local filesystems. - -Patches will undergo more testing after being merged (including -lockdep/kasan/preempt/etc. variants), these are not generally required to be -run by the submitter - but do put some thought into what you're changing and -which tests might be relevant, e.g. are you dealing with tricky memory layout -work? kasan, are you doing locking work? then lockdep; and ktest includes -single-command variants for the debug build types you'll most likely need. - -The exception to this rule is incomplete WIP/RFC patches: if you're working on -something nontrivial, it's encouraged to send out a WIP patch to let people -know what you're doing and make sure you're on the right track. Just make sure -it includes a brief note as to what's done and what's incomplete, to avoid -confusion. 
- -Rigorous checkpatch.pl adherence is not required (many of its warnings are -considered out of date), but try not to deviate too much without reason. - -Focus on writing code that reads well and is organized well; code should be -aesthetically pleasing. - -CI --- - -Instead of running your tests locally, when running the full test suite it's -preferable to let a server farm do it in parallel, and then have the results -in a nice test dashboard (which can tell you which failures are new, and -presents results in a git log view, avoiding the need for most bisecting). - -That exists [2]_, and community members may request an account. If you work for -a big tech company, you'll need to help out with server costs to get access - -but the CI is not restricted to running bcachefs tests: it runs any ktest test -(which generally makes it easy to wrap other tests that can run in qemu). - -Other things to think about ---------------------------- - -- How will we debug this code? Is there sufficient introspection to diagnose - when something starts acting wonky on a user machine? - - We don't necessarily need every single field of every data structure visible - with introspection, but having the important fields of all the core data - types wired up makes debugging drastically easier - a bit of thoughtful - foresight greatly reduces the need to have people build custom kernels with - debug patches. - - More broadly, think about all the debug tooling that might be needed. - -- Does it make the codebase more or less of a mess? Can we also try to do some - organizing, too? - -- Do new tests need to be written? New assertions? How do we know and verify - that the code is correct, and what happens if something goes wrong? - - We don't yet have automated code coverage analysis or easy fault injection - - but for now, pretend we did and ask what they might tell us. - - Assertions are hugely important, given that we don't yet have a systems - language that can do ergonomic embedded correctness proofs. Hitting an assert - in testing is much better than wandering off into undefined behaviour la-la - land - use them. Use them judiciously, and not as a replacement for proper - error handling, but use them. - -- Does it need to be performance tested? Should we add new performance counters? - - bcachefs has a set of persistent runtime counters which can be viewed with - the 'bcachefs fs top' command; this should give users a basic idea of what - their filesystem is currently doing. If you're doing a new feature or looking - at old code, think if anything should be added. - -- If it's a new on disk format feature - have upgrades and downgrades been - tested? (Automated tests exists but aren't in the CI, due to the hassle of - disk image management; coordinate to have them run.) - -Mailing list, IRC ------------------ - -Patches should hit the list [3]_, but much discussion and code review happens -on IRC as well [4]_; many people appreciate the more conversational approach -and quicker feedback. - -Additionally, we have a lively user community doing excellent QA work, which -exists primarily on IRC. Please make use of that resource; user feedback is -important for any nontrivial feature, and documenting it in commit messages -would be a good idea. - -.. rubric:: References - -.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git -.. [1] https://evilpiepirate.org/git/ktest.git/ -.. [2] https://evilpiepirate.org/~testdashboard/ci/ -.. [3] linux-bcachefs@vger.kernel.org -.. 
[4] irc.oftc.net#bcache, #bcachefs-dev diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst deleted file mode 100644 index 871a38f557e8e4..00000000000000 --- a/Documentation/filesystems/bcachefs/casefolding.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Casefolding -=========== - -bcachefs has support for case-insensitive file and directory -lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`) -casefolding attributes. - -The main usecase for casefolding is compatibility with software written -against other filesystems that rely on casefolded lookups -(eg. NTFS and Wine/Proton). -Taking advantage of file-system level casefolding can lead to great -loading time gains in many applications and games. - -Casefolding support requires a kernel with the `CONFIG_UNICODE` enabled. -Once a directory has been flagged for casefolding, a feature bit -is enabled on the superblock which marks the filesystem as using -casefolding. -When the feature bit for casefolding is enabled, it is no longer possible -to mount that filesystem on kernels without `CONFIG_UNICODE` enabled. - -On the lookup/query side: casefolding is implemented by allocating a new -string of `BCH_NAME_MAX` length using the `utf8_casefold` function to -casefold the query string. - -On the dirent side: casefolding is implemented by ensuring the `bkey`'s -hash is made from the casefolded string and storing the cached casefolded -name with the regular name in the dirent. - -The structure looks like this: - -* Regular: [dirent data][regular name][nul][nul]... -* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]... - -(Do note, the number of NULs here is merely for illustration; their count can -vary per-key, and they may not even be present if the key is aligned to -`sizeof(u64)`.) - -This is efficient as it means that for all file lookups that require casefolding, -it has identical performance to a regular lookup: -a hash comparison and a `memcmp` of the name. - -Rationale ---------- - -Several designs were considered for this system: -One was to introduce a dirent_v2, however that would be painful especially as -the hash system only has support for a single key type. This would also need -`BCH_NAME_MAX` to change between versions, and a new feature bit. - -Another option was to store without the two lengths, and just take the length of -the regular name and casefolded name contiguously / 2 as the length. This would -assume that the regular length == casefolded length, but that could potentially -not be true, if the uppercase unicode glyph had a different UTF-8 encoding than -the lowercase unicode glyph. -It would be possible to disregard the casefold cache for those cases, but it was -decided to simply encode the two string lengths in the key to avoid random -performance issues if this edgecase was ever hit. - -The option settled on was to use a free-bit in d_type to mark a dirent as having -a casefold cache, and then treat the first 4 bytes the name block as lengths. -You can see this in the `d_cf_name_block` member of union in `bch_dirent`. - -The feature bit was used to allow casefolding support to be enabled for the majority -of users, but some allow users who have no need for the feature to still use bcachefs as -`CONFIG_UNICODE` can increase the kernel side a significant amount due to the tables used, -which may be decider between using bcachefs for eg. embedded platforms. 
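A rough sketch of the name block described above, assuming the two-length
layout from the text; this is not the actual ``bch_dirent`` definition, just
the shape of the ``d_cf_name_block`` idea:

.. code-block:: c

    /* Sketch of the casefolded name block; the real layout in
     * fs/bcachefs differs in detail. */
    struct cf_name_block {
            __le16  d_name_len;     /* length of the regular name */
            __le16  d_cf_name_len;  /* length of the casefolded name */
            __u8    d_names[];      /* regular name, then casefolded name */
    };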
- -Other filesystems like ext4 and f2fs have a super-block level option for casefolding -encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose -any encodings than a single UTF-8 version. When future encodings are desirable, -they will be added trivially using the opts mechanism. - -dentry/dcache considerations ----------------------------- - -Currently, in casefolded directories, bcachefs (like other filesystems) will not cache -negative dentry's. - -This is because currently doing so presents a problem in the following scenario: - - - Lookup file "blAH" in a casefolded directory - - Creation of file "BLAH" in a casefolded directory - - Lookup file "blAH" in a casefolded directory - -This would fail if negative dentry's were cached. - -This is slightly suboptimal, but could be fixed in future with some vfs work. - - -References ----------- - -(from Peter Anvin, on the list) - -It is worth noting that Microsoft has basically declared their -"recommended" case folding (upcase) table to be permanently frozen (for -new filesystem instances in the case where they use an on-disk -translation table created at format time.) As far as I know they have -never supported anything other than 1:1 conversion of BMP code points, -nor normalization. - -The exFAT specification enumerates the full recommended upcase table, -although in a somewhat annoying format (basically a hex dump of -compressed data): - -https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst deleted file mode 100644 index 2cccaa0ba7cd4d..00000000000000 --- a/Documentation/filesystems/bcachefs/errorcodes.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -bcachefs private error codes ----------------------------- - -In bcachefs, as a hard rule we do not throw or directly use standard error -codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed -in fs/bcachefs/errcode.h. - -This gives us much better error messages and makes debugging much easier. Any -direct uses of standard error codes you see in the source code are simply old -code that has yet to be converted - feel free to clean it up! - -Private error codes may subtype another error code, this allows for grouping of -related errors that should be handled similarly (e.g. transaction restart -errors), as well as specifying which standard error code should be returned at -the bcachefs module boundary. - -At the module boundary, we use bch2_err_class() to convert to a standard error -code; this also emits a trace event so that the original error code be -recovered even if it wasn't logged. - -Do not reuse error codes! Generally speaking, a private error code should only -be thrown in one place. That means that when we see it in a log message we can -see, unambiguously, exactly which file and line number it was returned from. - -Try to give error codes names that are as reasonably descriptive of the error -as possible. Frequently, the error will be logged at a place far removed from -where the error was generated; good names for error codes mean much more -descriptive and useful error messages. 
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst deleted file mode 100644 index 59a332509dcd97..00000000000000 --- a/Documentation/filesystems/bcachefs/future/idle_work.rst +++ /dev/null @@ -1,78 +0,0 @@ -Idle/background work classes design doc: - -Right now, our behaviour at idle isn't ideal, it was designed for servers that -would be under sustained load, to keep pending work at a "medium" level, to -let work build up so we can process it in more efficient batches, while also -giving headroom for bursts in load. - -But for desktops or mobile - scenarios where work is less sustained and power -usage is more important - we want to operate differently, with a "rush to -idle" so the system can go to sleep. We don't want to be dribbling out -background work while the system should be idle. - -The complicating factor is that there are a number of background tasks, which -form a heirarchy (or a digraph, depending on how you divide it up) - one -background task may generate work for another. - -Thus proper idle detection needs to model this heirarchy. - -- Foreground writes -- Page cache writeback -- Copygc, rebalance -- Journal reclaim - -When we implement idle detection and rush to idle, we need to be careful not -to disturb too much the existing behaviour that works reasonably well when the -system is under sustained load (or perhaps improve it in the case of -rebalance, which currently does not actively attempt to let work batch up). - -SUSTAINED LOAD REGIME ---------------------- - -When the system is under continuous load, we want these jobs to run -continuously - this is perhaps best modelled with a P/D controller, where -they'll be trying to keep a target value (i.e. fragmented disk space, -available journal space) roughly in the middle of some range. - -The goal under sustained load is to balance our ability to handle load spikes -without running out of x resource (free disk space, free space in the -journal), while also letting some work accumululate to be batched (or become -unnecessary). - -For example, we don't want to run copygc too aggressively, because then it -will be evacuating buckets that would have become empty (been overwritten or -deleted) anyways, and we don't want to wait until we're almost out of free -space because then the system will behave unpredicably - suddenly we're doing -a lot more work to service each write and the system becomes much slower. - -IDLE REGIME ------------ - -When the system becomes idle, we should start flushing our pending work -quicker so the system can go to sleep. - -Note that the definition of "idle" depends on where in the heirarchy a task -is - a task should start flushing work more quickly when the task above it has -stopped generating new work. - -e.g. rebalance should start flushing more quickly when page cache writeback is -idle, and journal reclaim should only start flushing more quickly when both -copygc and rebalance are idle. - -It's important to let work accumulate when more work is still incoming and we -still have room, because flushing is always more efficient if we let it batch -up. New writes may overwrite data before rebalance moves it, and tasks may be -generating more updates for the btree nodes that journal reclaim needs to flush. - -On idle, how much work we do at each interval should be proportional to the -length of time we have been idle for. 
If we're idle only for a short duration, -we shouldn't flush everything right away; the system might wake up and start -generating new work soon, and flushing immediately might end up doing a lot of -work that would have been unnecessary if we'd allowed things to batch more. - -To summarize, we will need: - - - A list of classes for background tasks that generate work, which will - include one "foreground" class. - - Tracking for each class - "Am I doing work, or have I gone to sleep?" - - And each class should check the class above it when deciding how much work to issue. diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst deleted file mode 100644 index e5c4c2120b93e8..00000000000000 --- a/Documentation/filesystems/bcachefs/index.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -bcachefs Documentation -====================== - -Subsystem-specific development process notes --------------------------------------------- - -Development notes specific to bcachefs. These are intended to supplement -:doc:`general kernel development handbook `. - -.. toctree:: - :maxdepth: 1 - :numbered: - - CodingStyle - SubmittingPatches - -Filesystem implementation -------------------------- - -Documentation for filesystem features and their implementation details. -At this moment, only a few of these are described here. - -.. toctree:: - :maxdepth: 1 - :numbered: - - casefolding - errorcodes - -Future design -------------- -.. toctree:: - :maxdepth: 1 - - future/idle_work diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 11a599387266a4..622187a96bdc6c 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -72,7 +72,6 @@ Documentation for filesystem implementations. afs autofs autofs-mount-control - bcachefs/index befs bfs btrfs diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f0750..78c3d07c0c08e9 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil ->drop_inode() returns int now; it's called on final iput() with inode->i_lock held and it returns true if filesystems wants the inode to be -dropped. As before, generic_drop_inode() is still the default and it's been -updated appropriately. generic_delete_inode() is also alive and it consists +dropped. As before, inode_generic_drop() is still the default and it's been +updated appropriately. inode_just_drop() is also alive and it consists simply of return 1. Note that all actual eviction work is done by caller after ->drop_inode() returns. @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. 
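Returning to the ``->drop_inode()`` change above, a minimal sketch of a
filesystem that opts out of inode caching via the renamed helper; ``myfs`` is
hypothetical:

.. code-block:: c

    /* "myfs" is hypothetical; inode_just_drop() always returns 1,
     * so the inode is evicted on final iput(). */
    static const struct super_operations myfs_super_ops = {
            .drop_inode     = inode_just_drop,  /* was generic_delete_inode */
            /* ... */
    };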
+
+---
+
+**mandatory**
+
+Several functions are renamed:
+
+- kern_path_locked -> start_removing_path
+- kern_path_create -> start_creating_path
+- user_path_create -> start_creating_user_path
+- user_path_locked_at -> start_removing_user_path_at
+- done_path_create -> end_creating_path
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 2971551b723534..b7e3147ba3d457 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2362,6 +2362,7 @@ The following mount options are supported:
 	hidepid=	Set /proc/<pid>/ access mode.
 	gid=		Set the group authorized to learn processes information.
 	subset=		Show only the specified subset of procfs.
+	pidns=		Specify the pid namespace used by this procfs.
 	=========	========================================================
 
 hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group.
 subset=pid hides all top level files and directories in the procfs that
 are not related to tasks.
 
+pidns= specifies a pid namespace (either as a string path to something like
+`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
+will be used by the procfs instance when translating pids. By default, procfs
+will use the calling process's active pid namespace. Note that the pid
+namespace of an existing procfs instance cannot be modified (attempting to do
+so will give an `-EBUSY` error).
+
 Chapter 5: Filesystem behavior
 ==============================
 
diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst
index c7949dd44f2f3a..006d23af66e19f 100644
--- a/Documentation/filesystems/resctrl.rst
+++ b/Documentation/filesystems/resctrl.rst
@@ -26,6 +26,7 @@ MBM (Memory Bandwidth Monitoring)               "cqm_mbm_total", "cqm_mbm_local"
 MBA (Memory Bandwidth Allocation)               "mba"
 SMBA (Slow Memory Bandwidth Allocation)         ""
 BMEC (Bandwidth Monitoring Event Configuration) ""
+ABMC (Assignable Bandwidth Monitoring Counters) ""
 =============================================== ================================
 
 Historically, new features were made visible by default in /proc/cpuinfo. This
@@ -256,6 +257,144 @@ with the following files:
 
 	# cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config
 	0=0x30;1=0x30;3=0x15;4=0x15
 
+"mbm_assign_mode":
+	The supported counter assignment modes. The mode enclosed in brackets is
+	the one currently enabled. The MBM events associated with counters may
+	reset when "mbm_assign_mode" is changed.
+	::
+
+	  # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+	  [mbm_event]
+	  default
+
+	"mbm_event":
+
+	mbm_event mode allows users to assign a hardware counter to an RMID, event
+	pair and monitor the bandwidth usage as long as it is assigned. The hardware
+	continues to track the assigned counter until it is explicitly unassigned by
+	the user. Each event within a resctrl group can be assigned independently.
+
+	In this mode, a monitoring event can only accumulate data while it is backed
+	by a hardware counter. Use "mbm_L3_assignments" found in each CTRL_MON and MON
+	group to specify which of the events should have a counter assigned. The number
+	of counters available is described in the "num_mbm_cntrs" file. Changing the
+	mode may cause all counters on the resource to reset.
+
+	Moving to mbm_event counter assignment mode requires users to assign the counters
+	to the events. Otherwise, the MBM event counters will return 'Unassigned' when read.
+ + The mode is beneficial for AMD platforms that support more CTRL_MON + and MON groups than available hardware counters. By default, this + feature is enabled on AMD platforms with the ABMC (Assignable Bandwidth + Monitoring Counters) capability, ensuring counters remain assigned even + when the corresponding RMID is not actively used by any processor. + + "default": + + In default mode, resctrl assumes there is a hardware counter for each + event within every CTRL_MON and MON group. On AMD platforms, it is + recommended to use the mbm_event mode, if supported, to prevent reset of MBM + events between reads resulting from hardware re-allocating counters. This can + result in misleading values or display "Unavailable" if no counter is assigned + to the event. + + * To enable "mbm_event" counter assignment mode: + :: + + # echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + + * To enable "default" monitoring mode: + :: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + +"num_mbm_cntrs": + The maximum number of counters (total of available and assigned counters) in + each domain when the system supports mbm_event mode. + + For example, on a system with maximum of 32 memory bandwidth monitoring + counters in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + +"available_mbm_cntrs": + The number of counters available for assignment in each domain when mbm_event + mode is enabled on the system. + + For example, on a system with 30 available [hardware] assignable counters + in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. + + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + + Modify the event configuration by writing to the "event_filter" file within + the "event_configs" directory. 
The read/write "event_filter" file contains the + configuration of the event that reflects which memory transactions are counted by it. + + For example:: + + # echo "local_reads, local_non_temporal_writes" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,local_non_temporal_writes + +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. + + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy @@ -380,10 +519,77 @@ When monitoring is enabled all MON groups will also contain: for the L3 cache they occupy). These are named "mon_sub_L3_YY" where "YY" is the node number. + When the 'mbm_event' counter assignment mode is enabled, reading + an MBM event of a MON group returns 'Unassigned' if no hardware + counter is assigned to it. For CTRL_MON groups, 'Unassigned' is + returned if the MBM event does not have an assigned counter in the + CTRL_MON group nor in any of its associated MON groups. + "mon_hw_id": Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. +When monitoring is enabled all MON groups may also contain: + +"mbm_L3_assignments": + Exists when "mbm_event" counter assignment mode is supported and lists the + counter assignment states of the group. + + The assignment list is displayed in the following format: + + :=;= + + Event: A valid MBM event in the + /sys/fs/resctrl/info/L3_MON/event_configs directory. + + Domain ID: A valid domain ID. When writing, '*' applies the changes + to all the domains. + + Assignment states: + + _ : No counter assigned. + + e : Counter assigned exclusively. + + Example: + + To display the counter assignment states for the default group. + :: + + # cd /sys/fs/resctrl + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + + Assignments can be modified by writing to the interface. 
+
+	Examples:
+
+	To unassign the counter associated with the mbm_total_bytes event on domain 0:
+	::
+
+	  # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments
+	  # cat /sys/fs/resctrl/mbm_L3_assignments
+	  mbm_total_bytes:0=_;1=e
+	  mbm_local_bytes:0=e;1=e
+
+	To unassign the counter associated with the mbm_total_bytes event on all the domains:
+	::
+
+	  # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments
+	  # cat /sys/fs/resctrl/mbm_L3_assignments
+	  mbm_total_bytes:0=_;1=_
+	  mbm_local_bytes:0=e;1=e
+
+	To assign a counter associated with the mbm_total_bytes event on all domains in
+	exclusive mode:
+	::
+
+	  # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments
+	  # cat /sys/fs/resctrl/mbm_L3_assignments
+	  mbm_total_bytes:0=e;1=e
+	  mbm_local_bytes:0=e;1=e
+
 When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:
 
 "mba_MBps_event":
@@ -1429,6 +1635,125 @@ View the llc occupancy snapshot::
 
 	# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
 	11234000
+
+Examples on working with mbm_assign_mode
+========================================
+
+a. Check if MBM counter assignment mode is supported.
+::
+
+  # mount -t resctrl resctrl /sys/fs/resctrl/
+
+  # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+  [mbm_event]
+  default
+
+The "mbm_event" mode is detected and enabled.
+
+b. Check how many assignable counters are supported.
+::
+
+  # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs
+  0=32;1=32
+
+c. Check how many assignable counters are available for assignment in each domain.
+::
+
+  # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs
+  0=30;1=30
+
+d. To list the default group's assign states.
+::
+
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=e;1=e
+  mbm_local_bytes:0=e;1=e
+
+e. To unassign the counter associated with the mbm_total_bytes event on domain 0.
+::
+
+  # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=_;1=e
+  mbm_local_bytes:0=e;1=e
+
+f. To unassign the counter associated with the mbm_total_bytes event on all domains.
+::
+
+  # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=_;1=_
+  mbm_local_bytes:0=e;1=e
+
+g. To assign a counter associated with the mbm_total_bytes event on all domains in
+exclusive mode.
+::
+
+  # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments
+  # cat /sys/fs/resctrl/mbm_L3_assignments
+  mbm_total_bytes:0=e;1=e
+  mbm_local_bytes:0=e;1=e
+
+h. Read the events mbm_total_bytes and mbm_local_bytes of the default group. Reading
+the events is unchanged by the assignment.
+::
+
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_bytes
+  779247936
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_total_bytes
+  562324232
+  # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+  212122123
+  # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+  121212144
+
+i. Check the event configurations.
+::
+
+  # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+  local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes,
+  local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all
+
+  # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+  local_reads,local_non_temporal_writes,local_reads_slow_memory
+
+j. Change the event configuration for mbm_local_bytes.
+:: + + # echo "local_reads, local_non_temporal_writes, local_reads_slow_memory, remote_reads" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory,remote_reads + +k. Now read the local events again. The first read may come back with "Unavailable" +status. The subsequent read of mbm_local_bytes will display the current value. +:: + + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + 2252323 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + 1566565 + +l. Users have the option to go back to 'default' mbm_assign_mode if required. This can be +done using the following command. Note that switching the mbm_assign_mode may reset all +the MBM counters (and thus all MBM events) of all the resctrl groups. +:: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + mbm_event + [default] + +m. Unmount the resctrl filesystem. +:: + + # umount /sys/fs/resctrl/ + Intel RDT Errata ================ diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a9163347478..4f13b01e42eb5e 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -209,31 +209,8 @@ method fills in is the "s_op" field. This is a pointer to a "struct super_operations" which describes the next level of the filesystem implementation. -Usually, a filesystem uses one of the generic mount() implementations -and provides a fill_super() callback instead. The generic variants are: - -``mount_bdev`` - mount a filesystem residing on a block device - -``mount_nodev`` - mount a filesystem that is not backed by a device - -``mount_single`` - mount a filesystem which shares the instance between all mounts - -A fill_super() callback implementation has the following arguments: - -``struct super_block *sb`` - the superblock structure. The callback must initialize this - properly. - -``void *data`` - arbitrary mount options, usually comes as an ASCII string (see - "Mount Options" section) - -``int silent`` - whether or not to be silent on error - +For more information on mounting (and the new mount API), see +Documentation/filesystems/mount_api.rst. The Superblock Object ===================== @@ -327,11 +304,11 @@ or bottom half). inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do + semantics) or "inode_just_drop" (for filesystems that do not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the old + The "inode_just_drop()" behavior is equivalent to the old practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. diff --git a/Documentation/gpu/nova/core/todo.rst b/Documentation/gpu/nova/core/todo.rst index 894a1e9c3741a4..8fdb5bced3460a 100644 --- a/Documentation/gpu/nova/core/todo.rst +++ b/Documentation/gpu/nova/core/todo.rst @@ -147,7 +147,6 @@ Numerical operations [NUMM] Nova uses integer operations that are not part of the standard library (or not implemented in an optimized way for the kernel). 
These include: -- Aligning up and down to a power of two, - The "Find Last Set Bit" (`fls` function of the C part of the kernel) operation. diff --git a/Documentation/hwmon/adm1275.rst b/Documentation/hwmon/adm1275.rst index 57bd7a8505589f..cf923f20fa523b 100644 --- a/Documentation/hwmon/adm1275.rst +++ b/Documentation/hwmon/adm1275.rst @@ -67,6 +67,14 @@ Supported chips: Datasheet: https://www.analog.com/media/en/technical-documentation/data-sheets/ADM1293_1294.pdf + * Silergy SQ24905C + + Prefix: 'mc09c' + + Addresses scanned: - + + Datasheet: https://www.silergy.com/download/downloadFile?id=5669&type=product&ftype=note + Author: Guenter Roeck @@ -74,14 +82,14 @@ Description ----------- This driver supports hardware monitoring for Analog Devices ADM1075, ADM1272, -ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, and ADM1294 Hot-Swap -Controller and Digital Power Monitors. +ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, ADM1294, and SQ24905C +Hot-Swap Controller and Digital Power Monitors. -ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, and -ADM1294 are hot-swap controllers that allow a circuit board to be removed from -or inserted into a live backplane. They also feature current and voltage -readback via an integrated 12 bit analog-to-digital converter (ADC), accessed -using a PMBus interface. +ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, ADM1293, +ADM1294 and SQ24905C are hot-swap controllers that allow a circuit board to be +removed from or inserted into a live backplane. They also feature current and +voltage readback via an integrated 12 bit analog-to-digital converter (ADC), +accessed using a PMBus interface. The driver is a client driver to the core PMBus driver. Please see Documentation/hwmon/pmbus.rst for details on PMBus client drivers. @@ -160,5 +168,5 @@ temp1_highest Highest observed temperature. temp1_reset_history Write any value to reset history. Temperature attributes are supported on ADM1272, - ADM1273, ADM1278, and ADM1281. + ADM1273, ADM1278, ADM1281 and SQ24905C. 
 =======================	=======================================================
diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst
index de2f2985f06f89..a5a58c00c32234 100644
--- a/Documentation/hwmon/asus_ec_sensors.rst
+++ b/Documentation/hwmon/asus_ec_sensors.rst
@@ -8,7 +8,9 @@ Supported boards:
 * PRIME X470-PRO
 * PRIME X570-PRO
 * PRIME X670E-PRO WIFI
+ * PRIME Z270-A
 * Pro WS X570-ACE
+ * Pro WS WRX90E-SAGE SE
 * ProArt X570-CREATOR WIFI
 * ProArt X670E-CREATOR WIFI
 * ProArt X870E-CREATOR WIFI
@@ -25,16 +27,26 @@ Supported boards:
 * ROG MAXIMUS Z690 FORMULA
 * ROG STRIX B550-E GAMING
 * ROG STRIX B550-I GAMING
+ * ROG STRIX B650E-I GAMING WIFI
+ * ROG STRIX B850-I GAMING WIFI
 * ROG STRIX X570-E GAMING
 * ROG STRIX X570-E GAMING WIFI II
 * ROG STRIX X570-F GAMING
 * ROG STRIX X570-I GAMING
+ * ROG STRIX X670E-E GAMING WIFI
+ * ROG STRIX X670E-I GAMING WIFI
+ * ROG STRIX X870-I GAMING WIFI
+ * ROG STRIX X870E-E GAMING WIFI
 * ROG STRIX Z390-F GAMING
 * ROG STRIX Z490-F GAMING
 * ROG STRIX Z690-A GAMING WIFI D4
+ * ROG STRIX Z690-E GAMING WIFI
+ * ROG STRIX Z790-E GAMING WIFI II
+ * ROG STRIX Z790-I GAMING WIFI
 * ROG ZENITH II EXTREME
 * ROG ZENITH II EXTREME ALPHA
 * TUF GAMING X670E PLUS
+ * TUF GAMING X670E PLUS WIFI
 
 Authors:
     - Eugene Shalygin
diff --git a/Documentation/hwmon/cros_ec_hwmon.rst b/Documentation/hwmon/cros_ec_hwmon.rst
index 47ecae983bdbef..6db812708325f7 100644
--- a/Documentation/hwmon/cros_ec_hwmon.rst
+++ b/Documentation/hwmon/cros_ec_hwmon.rst
@@ -23,4 +23,9 @@ ChromeOS embedded controller used in Chromebooks and other devices.
 
 The channel labels exposed via hwmon are retrieved from the EC itself.
 
-Fan and temperature readings are supported.
+Fan and temperature readings are supported. PWM fan control is also supported
+if the EC supports setting fan PWM values and the fan mode. Note that the EC
+switches fan control back to automatic mode when suspended; on resume, this
+driver restores the fan state to what it was before suspend.
+If a fan is controllable, this driver also registers it as a cooling device
+in the thermal framework.
diff --git a/Documentation/hwmon/crps.rst b/Documentation/hwmon/crps.rst
index 87380b4965580d..d42ea59d2dae16 100644
--- a/Documentation/hwmon/crps.rst
+++ b/Documentation/hwmon/crps.rst
@@ -43,7 +43,7 @@ curr1_label "iin"
 curr1_input Measured input current
 curr1_max Maximum input current
 curr1_max_alarm Input maximum current high alarm
-curr1_crit Critial high input current
+curr1_crit Critical high input current
 curr1_crit_alarm Input critical current high alarm
 curr1_rated_max Maximum rated input current
 
@@ -51,7 +51,7 @@ curr2_label "iout1"
 curr2_input Measured output current
 curr2_max Maximum output current
 curr2_max_alarm Output maximum current high alarm
-curr2_crit Critial high output current
+curr2_crit Critical high output current
 curr2_crit_alarm Output critical current high alarm
 curr2_rated_max Maximum rated output current
 
diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst
index 5a4edb6565cf95..3e4e2d916ac523 100644
--- a/Documentation/hwmon/dell-smm-hwmon.rst
+++ b/Documentation/hwmon/dell-smm-hwmon.rst
@@ -38,7 +38,7 @@ fan[1-4]_min RO Minimal Fan speed in RPM
 fan[1-4]_max RO Maximal Fan speed in RPM
 fan[1-4]_target RO Expected Fan speed in RPM
 pwm[1-4] RW Control the fan PWM duty-cycle.
-pwm1_enable WO Enable or disable automatic BIOS fan
+pwm[1-4]_enable RW/WO Enable or disable automatic BIOS fan
 control (not supported on all laptops,
 see below for details).
 temp[1-10]_input RO Temperature reading in milli-degrees
@@ -49,26 +49,40 @@ temp[1-10]_label RO Temperature sensor label.
 
 Due to the nature of the SMM interface, each pwmX attribute controls
 fan number X.
 
-Disabling automatic BIOS fan control
-------------------------------------
-
-On some laptops the BIOS automatically sets fan speed every few
-seconds. Therefore the fan speed set by mean of this driver is quickly
-overwritten.
-
-There is experimental support for disabling automatic BIOS fan
-control, at least on laptops where the corresponding SMM command is
-known, by writing the value ``1`` in the attribute ``pwm1_enable``
-(writing ``2`` enables automatic BIOS control again). Even if you have
-more than one fan, all of them are set to either enabled or disabled
-automatic fan control at the same time and, notwithstanding the name,
-``pwm1_enable`` sets automatic control for all fans.
-
-If ``pwm1_enable`` is not available, then it means that SMM codes for
-enabling and disabling automatic BIOS fan control are not whitelisted
-for your hardware. It is possible that codes that work for other
-laptops actually work for yours as well, or that you have to discover
-new codes.
+Enabling/Disabling automatic BIOS fan control
+---------------------------------------------
+
+There exist two methods for enabling/disabling automatic BIOS fan control:
+
+1. Separate SMM commands to enable/disable automatic BIOS fan control for all fans.
+
+2. A special fan state that enables automatic BIOS fan control for an individual fan.
+
+The driver cannot reliably detect which method should be used on a given
+device, so instead the following heuristic is used:
+
+- use fan state 3 for enabling BIOS fan control if the maximum fan state
+  settable by the user is smaller than 3 (default setting).
+
+- use separate SMM commands if the device is whitelisted to support them.
+
+When using the first method, each fan will have a standard ``pwmX_enable``
+sysfs attribute. Writing ``1`` into this attribute will disable automatic
+BIOS fan control for the associated fan and set it to maximum speed. Enabling
+BIOS fan control again can be achieved by writing ``2`` into this attribute.
+Reading this sysfs attribute returns the current setting as reported by
+the underlying hardware.
+
+When using the second method however, only the ``pwm1_enable`` sysfs attribute
+will be available to enable/disable automatic BIOS fan control globally for all
+fans available on a given device. Additionally, this sysfs attribute is write-only
+as there exists no SMM command for reading the current fan control setting.
+
+If no ``pwmX_enable`` attributes are available, then it means that the driver
+cannot use the first method and the SMM codes for enabling and disabling automatic
+BIOS fan control are not whitelisted for your device. It is possible that codes
+that work for other laptops actually work for yours as well, or that you have to
+discover new codes.
 
 Check the list ``i8k_whitelist_fan_control`` in
 file ``drivers/hwmon/dell-smm-hwmon.c`` in the kernel tree: as a first
diff --git a/Documentation/hwmon/gpd-fan.rst b/Documentation/hwmon/gpd-fan.rst
new file mode 100644
index 00000000000000..0b56b70e6264dd
--- /dev/null
+++ b/Documentation/hwmon/gpd-fan.rst
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Kernel driver gpd-fan
+=====================
+
+Author:
+    - Cryolitia PukNgae
+
+Description
+-----------
+
+Handheld devices from Shenzhen GPD Technology Co., Ltd. provide fan readings
+and fan control through their embedded controllers.
+
+Supported devices
+-----------------
+
+Currently the driver supports the following handhelds:
+
+ - GPD Win Mini (7840U)
+ - GPD Win Mini (8840U)
+ - GPD Win Mini (HX370)
+ - GPD Pocket 4
+ - GPD Duo
+ - GPD Win Max 2 (6800U)
+ - GPD Win Max 2 2023 (7840U)
+ - GPD Win Max 2 2024 (8840U)
+ - GPD Win Max 2 2025 (HX370)
+ - GPD Win 4 (6800U)
+ - GPD Win 4 (7840U)
+
+Module parameters
+-----------------
+
+gpd_fan_board
+    Force which module quirk should be used.
+    Use it like "gpd_fan_board=wm2".
+
+    - wm2
+        - GPD Win 4 (7840U)
+        - GPD Win Max 2 (6800U)
+        - GPD Win Max 2 2023 (7840U)
+        - GPD Win Max 2 2024 (8840U)
+        - GPD Win Max 2 2025 (HX370)
+    - win4
+        - GPD Win 4 (6800U)
+    - win_mini
+        - GPD Win Mini (7840U)
+        - GPD Win Mini (8840U)
+        - GPD Win Mini (HX370)
+        - GPD Pocket 4
+        - GPD Duo
+
+Sysfs entries
+-------------
+
+The following attributes are supported:
+
+fan1_input
+    Read Only. Reads current fan RPM.
+
+pwm1_enable
+    Read/Write. Enable manual fan control. Write "0" to disable control and run
+    at full speed. Write "1" to set to manual, write "2" to let the EC decide
+    the fan speed. Read this attribute to see the current status.
+
+    NB: In consideration of the safety of the device, when setting to manual
+    mode the pwm speed will be set to the maximum value (255) by default. You
+    can set a different value by writing pwm1 later.
+
+pwm1
+    Read/Write. Read this attribute to see the current duty cycle in the range
+    [0-255]. When pwm1_enable is set to "1" (manual) write any value in the
+    range [0-255] to set the fan speed.
+
+    NB: Many boards (except those listed under wm2 above) don't support reading
+    the current pwm value in auto mode. Such reads will just return EOPNOTSUPP.
+    In manual mode it will always return the real value.
diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst
index e47fc757e63ed2..1d7f1397a82744 100644
--- a/Documentation/hwmon/hwmon-kernel-api.rst
+++ b/Documentation/hwmon/hwmon-kernel-api.rst
@@ -42,6 +42,9 @@ register/unregister functions::
 
   char *devm_hwmon_sanitize_name(struct device *dev, const char *name);
 
+  void hwmon_lock(struct device *dev);
+  void hwmon_unlock(struct device *dev);
+
 hwmon_device_register_with_info registers a hardware monitoring device.
 It creates the standard sysfs attributes in the hardware monitoring core,
 letting the driver focus on reading from and writing to the chip instead
@@ -79,6 +82,13 @@ devm_hwmon_sanitize_name is the resource managed version of
 hwmon_sanitize_name; the memory will be freed automatically on device
 removal.
 
+When using ``[devm_]hwmon_device_register_with_info()`` to register the
+hardware monitoring device, accesses using the associated access functions
+are serialized by the hardware monitoring core. If a driver needs locking
+for other functions such as interrupt handlers or for attributes which are
+fully implemented in the driver, hwmon_lock() and hwmon_unlock() can be used
+to ensure that calls to those functions are serialized.
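A sketch of the serialization just described: a threaded interrupt handler
taking the hwmon core's lock around state shared with the access functions.
``struct mychip``, its fields, and ``mychip_read_status()`` are hypothetical;
only ``hwmon_lock()``/``hwmon_unlock()`` are taken from the API above.

.. code-block:: c

    /* Hypothetical driver data; hwmon_dev is the device returned by
     * devm_hwmon_device_register_with_info(). */
    static irqreturn_t mychip_irq_thread(int irq, void *data)
    {
            struct mychip *chip = data;

            hwmon_lock(chip->hwmon_dev);
            chip->cached_status = mychip_read_status(chip);
            hwmon_unlock(chip->hwmon_dev);

            return IRQ_HANDLED;
    }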
+
 Using devm_hwmon_device_register_with_info()
 --------------------------------------------
@@ -159,6 +169,7 @@ It contains following fields:
 	hwmon_curr	Current sensor
 	hwmon_power	Power sensor
 	hwmon_energy	Energy sensor
+	hwmon_energy64	Energy sensor, reported as 64-bit signed value
 	hwmon_humidity	Humidity sensor
 	hwmon_fan	Fan speed sensor
 	hwmon_pwm	PWM control
@@ -288,6 +299,8 @@ Parameters:
 	The sensor channel number.
   val:
 	Pointer to attribute value.
+	For hwmon_energy64, 'val' is passed as ``long *`` but needs
+	a typecast to ``s64 *``.
 
 Return value:
 	0 on success, a negative error number otherwise.
diff --git a/Documentation/hwmon/ina238.rst b/Documentation/hwmon/ina238.rst
index 9a24da4786a43f..43950d1ec551f7 100644
--- a/Documentation/hwmon/ina238.rst
+++ b/Documentation/hwmon/ina238.rst
@@ -5,6 +5,24 @@ Kernel driver ina238
 
 Supported chips:
 
+  * Texas Instruments INA228
+
+    Prefix: 'ina228'
+
+    Addresses: I2C 0x40 - 0x4f
+
+    Datasheet:
+	https://www.ti.com/lit/gpn/ina228
+
+  * Texas Instruments INA237
+
+    Prefix: 'ina237'
+
+    Addresses: I2C 0x40 - 0x4f
+
+    Datasheet:
+	https://www.ti.com/lit/gpn/ina237
+
   * Texas Instruments INA238
 
     Prefix: 'ina238'
@@ -14,6 +32,16 @@ Supported chips:
 
     Datasheet:
	https://www.ti.com/lit/gpn/ina238
 
+  * Texas Instruments INA700
+
+    Datasheet:
+	https://www.ti.com/product/ina700
+
+  * Texas Instruments INA780
+
+    Datasheet:
+	https://www.ti.com/product/ina780a
+
   * Silergy SQ52206
 
     Prefix: 'SQ52206'
@@ -29,10 +57,20 @@ The INA238 is a current shunt, power and temperature monitor with an I2C
 interface. It includes a number of programmable functions including alerts,
 conversion rate, sample averaging and selectable shunt voltage accuracy.
 
-The shunt value in micro-ohms can be set via platform data or device tree at
-compile-time or via the shunt_resistor attribute in sysfs at run-time. Please
-refer to the Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml for bindings
-if the device tree is used.
+The shunt value in micro-ohms can be set via device properties, either from
+platform code or from device tree data. Please refer to
+Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml for bindings if
+device tree is used.
+
+INA237 is a functionally equivalent variant of INA238 with slightly
+different accuracy. INA228 is another variant of INA238 with higher ADC
+resolution. This chip also reports the energy.
+
+INA700 and INA780 are variants of the chip series with built-in shunt resistor.
+They also report the energy.
+
+SQ52206 is a mostly compatible chip from Silergy. It reports the energy
+as well as the peak power consumption.
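A sketch of supplying the shunt value from platform code via device properties,
per the ti,ina2xx bindings referenced above. The bus address and the 1000
micro-ohm value are made up for the example; the "shunt-resistor" property name
comes from the bindings.

.. code-block:: c

    /* Hypothetical board code: attach a software node carrying the
     * "shunt-resistor" property (micro-ohms) to the I2C device. */
    static const struct property_entry ina238_props[] = {
            PROPERTY_ENTRY_U32("shunt-resistor", 1000),
            { }
    };

    static const struct software_node ina238_swnode = {
            .properties = ina238_props,
    };

    static const struct i2c_board_info ina238_info = {
            I2C_BOARD_INFO("ina238", 0x40),
            .swnode = &ina238_swnode,
    };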
 Sysfs entries
 -------------
@@ -53,19 +91,19 @@ in1_max_alarm		Maximum shunt voltage alarm
 
 power1_input		Power measurement (uW)
 power1_max		Maximum power threshold (uW)
 power1_max_alarm	Maximum power alarm
+power1_input_highest	Peak Power (uW)
+			(SQ52206 only)
 
 curr1_input		Current measurement (mA)
+curr1_min		Minimum current threshold (mA)
+curr1_min_alarm		Minimum current alarm
+curr1_max		Maximum current threshold (mA)
+curr1_max_alarm		Maximum current alarm
+
+energy1_input		Energy measurement (uJ)
+			(SQ52206, INA228, INA700 and INA780 only)
 
 temp1_input		Die temperature measurement (mC)
 temp1_max		Maximum die temperature threshold (mC)
 temp1_max_alarm		Maximum die temperature alarm
 =======================	=======================================================
-
-Additional sysfs entries for sq52206
-------------------------------------
-
-=======================	=======================================================
-energy1_input		Energy measurement (uJ)
-
-power1_input_highest	Peak Power (uW)
-=======================	=======================================================
diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index d292a86ac5da90..51a5bdf75b0865 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -82,6 +82,7 @@ Hardware Monitoring Kernel Drivers
    gigabyte_waterforce
    gsc-hwmon
    gl518sm
+   gpd-fan
    gxp-fan-ctrl
    hih6130
    hp-wmi-sensors
@@ -173,8 +174,10 @@ Hardware Monitoring Kernel Drivers
    menf21bmc
    mlxreg-fan
    mp2856
+   mp2869
    mp2888
    mp2891
+   mp29502
    mp2975
    mp2993
    mp5023
@@ -211,6 +214,7 @@ Hardware Monitoring Kernel Drivers
    q54sj108a2
    qnap-mcu-hwmon
    raspberrypi-hwmon
+   sa67
    sbrmi
    sbtsi_temp
    sch5627
diff --git a/Documentation/hwmon/isl68137.rst b/Documentation/hwmon/isl68137.rst
index 0e71b22047f897..5bc029c98383d3 100644
--- a/Documentation/hwmon/isl68137.rst
+++ b/Documentation/hwmon/isl68137.rst
@@ -374,6 +374,26 @@ Supported chips:
 
     Publicly available (after August 2020 launch) at the Renesas website
 
+  * Renesas RAA228244
+
+    Prefix: 'raa228244'
+
+    Addresses scanned: -
+
+    Datasheet:
+
+	Provided by Renesas upon request and NDA
+
+  * Renesas RAA228246
+
+    Prefix: 'raa228246'
+
+    Addresses scanned: -
+
+    Datasheet:
+
+	Provided by Renesas upon request and NDA
+
   * Renesas RAA229001
 
     Prefix: 'raa229001'
diff --git a/Documentation/hwmon/lm75.rst b/Documentation/hwmon/lm75.rst
index c6a54bbca3c51c..908b3a9df06e82 100644
--- a/Documentation/hwmon/lm75.rst
+++ b/Documentation/hwmon/lm75.rst
@@ -121,9 +121,9 @@ Supported chips:
 
 	       https://www.ti.com/product/TMP1075
 
-  * NXP LM75B, P3T1755, PCT2075
+  * NXP LM75B, P3T1750, P3T1755, PCT2075
 
-    Prefix: 'lm75b', 'p3t1755', 'pct2075'
+    Prefix: 'lm75b', 'p3t1750', 'p3t1755', 'pct2075'
 
     Addresses scanned: none
 
@@ -131,6 +131,8 @@ Supported chips:
 
 	       https://www.nxp.com/docs/en/data-sheet/LM75B.pdf
 
+	       https://www.nxp.com/docs/en/data-sheet/P3T1750DP.pdf
+
 	       https://www.nxp.com/docs/en/data-sheet/P3T1755.pdf
 
 	       https://www.nxp.com/docs/en/data-sheet/PCT2075.pdf
diff --git a/Documentation/hwmon/mp2869.rst b/Documentation/hwmon/mp2869.rst
new file mode 100644
index 00000000000000..2d9d65fc86b6a1
--- /dev/null
+++ b/Documentation/hwmon/mp2869.rst
@@ -0,0 +1,167 @@
+.. SPDX-License-Identifier: GPL-2.0
SPDX-License-Identifier: GPL-2.0 + +Kernel driver mp2869 +==================== + +Supported chips: + + * MPS mp2869 + + Prefix: 'mp2869' + + * MPS mp29608 + + Prefix: 'mp29608' + + * MPS mp29612 + + Prefix: 'mp29612' + + * MPS mp29816 + + Prefix: 'mp29816' + +Author: + + Wensheng Wang + +Description +----------- + +This driver implements support for Monolithic Power Systems, Inc. (MPS) +MP2869 Dual Loop Digital Multi-phase Controller. + +Device compliant with: + +- PMBus rev 1.3 interface. + +The driver exports the following attributes via the 'sysfs' files +for input voltage: + +**in1_input** + +**in1_label** + +**in1_crit** + +**in1_crit_alarm** + +**in1_lcrit** + +**in1_lcrit_alarm** + +**in1_min** + +**in1_min_alarm** + +The driver provides the following attributes for output voltage: + +**in2_input** + +**in2_label** + +**in2_crit** + +**in2_crit_alarm** + +**in2_lcrit** + +**in2_lcrit_alarm** + +**in3_input** + +**in3_label** + +**in3_crit** + +**in3_crit_alarm** + +**in3_lcrit** + +**in3_lcrit_alarm** + +The driver provides the following attributes for input current: + +**curr1_input** + +**curr1_label** + +**curr2_input** + +**curr2_label** + +The driver provides the following attributes for output current: + +**curr3_input** + +**curr3_label** + +**curr3_crit** + +**curr3_crit_alarm** + +**curr3_max** + +**curr3_max_alarm** + +**curr4_input** + +**curr4_label** + +**curr4_crit** + +**curr4_crit_alarm** + +**curr4_max** + +**curr4_max_alarm** + +The driver provides the following attributes for input power: + +**power1_input** + +**power1_label** + +**power2_input** + +**power2_label** + +The driver provides the following attributes for output power: + +**power3_input** + +**power3_label** + +**power3_max** + +**power3_max_alarm** + +**power4_input** + +**power4_label** + +**power4_max** + +**power4_max_alarm** + +The driver provides the following attributes for temperature: + +**temp1_input** + +**temp1_crit** + +**temp1_crit_alarm** + +**temp1_max** + +**temp1_max_alarm** + +**temp2_input** + +**temp2_crit** + +**temp2_crit_alarm** + +**temp2_max** + +**temp2_max_alarm** diff --git a/Documentation/hwmon/mp29502.rst b/Documentation/hwmon/mp29502.rst new file mode 100644 index 00000000000000..893e741a6b71cd --- /dev/null +++ b/Documentation/hwmon/mp29502.rst @@ -0,0 +1,93 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Kernel driver mp29502 +===================== + +Supported chips: + + * MPS mp29502 + + Prefix: 'mp29502' + +Author: + + Wensheng Wang + +Description +----------- + +This driver implements support for Monolithic Power Systems, Inc. (MPS) +MP29502 Digital Multi-phase Controller. + +Device compliant with: + +- PMBus rev 1.3 interface.
+ +The driver exports the following attributes via the 'sysfs' files +for input voltage: + +**in1_input** + +**in1_label** + +**in1_crit** + +**in1_crit_alarm** + +The driver provides the following attributes for output voltage: + +**in2_input** + +**in2_label** + +**in2_crit** + +**in2_crit_alarm** + +**in2_lcrit** + +**in2_lcrit_alarm** + +The driver provides the following attributes for input current: + +**curr1_input** + +**curr1_label** + +The driver provides the following attributes for output current: + +**curr2_input** + +**curr2_label** + +**curr2_crit** + +**curr2_crit_alarm** + +**curr2_max** + +**curr2_max_alarm** + +The driver provides the following attributes for input power: + +**power1_input** + +**power1_label** + +The driver provides the following attributes for output power: + +**power2_input** + +**power2_label** + +The driver provides the following attributes for temperature: + +**temp1_input** + +**temp1_crit** + +**temp1_crit_alarm** + +**temp1_max** + +**temp1_max_alarm** diff --git a/Documentation/hwmon/mp5990.rst b/Documentation/hwmon/mp5990.rst index 6f2f0c099d449d..7fd536757ff2b2 100644 --- a/Documentation/hwmon/mp5990.rst +++ b/Documentation/hwmon/mp5990.rst @@ -9,9 +9,13 @@ Supported chips: Prefix: 'mp5990' - * Datasheet + Datasheet: Publicly available at the MPS website: https://www.monolithicpower.com/en/mp5990.html - Publicly available at the MPS website : https://www.monolithicpower.com/en/mp5990.html + * MPS MP5998 + + Prefix: 'mp5998' + + Datasheet: Not publicly available Author: @@ -21,7 +25,7 @@ Description ----------- This driver implements support for Monolithic Power Systems, Inc. (MPS) -MP5990 Hot-Swap Controller. +MP5990 and MP5998 Hot-Swap Controllers. Device compliant with: @@ -53,7 +57,7 @@ The driver provides the following attributes for output voltage: **in2_alarm** -The driver provides the following attributes for output current: +The driver provides the following attributes for current: **curr1_input** @@ -63,6 +67,14 @@ The driver provides the following attributes for output current: **curr1_max** +**curr2_input** + +**curr2_label** + +**curr2_max** + +**curr2_max_alarm** + The driver provides the following attributes for input power: **power1_input** @@ -71,6 +83,16 @@ The driver provides the following attributes for input power: **power1_alarm** +The driver provides the following attributes for output power: + +**power2_input** + +**power2_label** + +**power2_max** + +**power2_max_alarm** + The driver provides the following attributes for temperature: **temp1_input** diff --git a/Documentation/hwmon/sa67.rst b/Documentation/hwmon/sa67.rst new file mode 100644 index 00000000000000..029c7c169b7fd2 --- /dev/null +++ b/Documentation/hwmon/sa67.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +Kernel driver sa67mcu +===================== + +Supported chips: + + * Kontron sa67mcu + + Prefix: 'sa67mcu' + + Datasheet: not available + +Authors: Michael Walle + +Description +----------- + +The sa67mcu is a board management controller which also exposes a hardware +monitoring controller. + +The controller has two voltage sensors and one temperature sensor. The values +are +held in two 8-bit registers forming one 16-bit value. Reading the lower byte +will also capture the high byte to make the access atomic. The unit of the +voltage sensors is 1 mV and the unit of the temperature sensor is 0.1 degC. + +Sysfs entries +------------- + +The following attributes are supported.
+ +======================= ======================================================== +in0_label "VDDIN" +in0_input Measured VDDIN voltage. + +in1_label "VDD_RTC" +in1_input Measured VDD_RTC voltage. + +temp1_input MCU temperature. Roughly the board temperature. +======================= ======================================================== + diff --git a/Documentation/hwmon/sht21.rst b/Documentation/hwmon/sht21.rst index 1bccc8e8aac8d3..d20e8a460ba6c7 100644 --- a/Documentation/hwmon/sht21.rst +++ b/Documentation/hwmon/sht21.rst @@ -3,6 +3,16 @@ Kernel driver sht21 Supported chips: + * Sensirion SHT20 + + Prefix: 'sht20' + + Addresses scanned: none + + Datasheet: Publicly available at the Sensirion website + + https://www.sensirion.com/file/datasheet_sht20 + * Sensirion SHT21 Prefix: 'sht21' @@ -13,8 +23,6 @@ Supported chips: https://www.sensirion.com/file/datasheet_sht21 - - * Sensirion SHT25 Prefix: 'sht25' @@ -25,8 +33,6 @@ Supported chips: https://www.sensirion.com/file/datasheet_sht25 - - Author: Urs Fleisch @@ -47,13 +53,11 @@ in the board setup code. sysfs-Interface --------------- -temp1_input - - temperature input - -humidity1_input - - humidity input -eic - - Electronic Identification Code +=================== ============================================================ +temp1_input Temperature input +humidity1_input Humidity input +eic Electronic Identification Code +=================== ============================================================ Notes ----- diff --git a/Documentation/i2c/busses/i2c-i801.rst b/Documentation/i2c/busses/i2c-i801.rst index 47e8ac5b7099f7..36c563ad3f068f 100644 --- a/Documentation/i2c/busses/i2c-i801.rst +++ b/Documentation/i2c/busses/i2c-i801.rst @@ -50,6 +50,7 @@ Supported adapters: * Intel Birch Stream (SOC) * Intel Arrow Lake (SOC) * Intel Panther Lake (SOC) + * Intel Wildcat Lake (SOC) Datasheets: Publicly available at the Intel website diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index a91abb8f6840f7..abce88f15d7cb3 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -232,6 +232,38 @@ applicable everywhere (see syntax). enables the third modular state for all config symbols. At most one symbol may have the "modules" option set. +- transitional attribute: "transitional" + This declares the symbol as transitional, meaning it should be processed + during configuration but omitted from newly written .config files. + Transitional symbols are useful for backward compatibility during config + option migrations - they allow olddefconfig to process existing .config + files while ensuring the old option doesn't appear in new configurations. + + A transitional symbol: + - Has no prompt (is not visible to users in menus) + - Is processed normally during configuration (values are read and used) + - Can be referenced in default expressions of other symbols + - Is not written to new .config files + - Cannot have any other properties (it is a pass-through option) + + Example migration from OLD_NAME to NEW_NAME:: + + config NEW_NAME + bool "New option name" + default OLD_NAME + help + This replaces the old CONFIG_OLD_NAME option. + + config OLD_NAME + bool + transitional + help + Transitional config for OLD_NAME to NEW_NAME migration. + + With this setup, existing .config files with "CONFIG_OLD_NAME=y" will + result in "CONFIG_NEW_NAME=y" being set, while CONFIG_OLD_NAME will be + omitted from newly written .config files. 
+ Menu dependencies ----------------- diff --git a/Documentation/netlink/specs/conntrack.yaml b/Documentation/netlink/specs/conntrack.yaml index c6832633ab7bf9..591e22a2ee4382 100644 --- a/Documentation/netlink/specs/conntrack.yaml +++ b/Documentation/netlink/specs/conntrack.yaml @@ -575,8 +575,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst @@ -591,7 +591,6 @@ operations: request: value: 0x101 attributes: - - nfgen-family - mark - filter - status @@ -608,8 +607,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 02f1ddcfbf1cfd..d1b4829b580ad0 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- @@ -256,7 +256,7 @@ attribute-sets: type: u32 - name: if-idx - type: u32 + type: s32 - name: reset-reason type: u32 diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index bc1b585355f7ad..7650c4b5be5f18 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -742,7 +742,7 @@ The broadcast manager sends responses to user space in the same form: struct timeval ival1, ival2; /* count and subsequent interval */ canid_t can_id; /* unique can_id for task */ __u32 nframes; /* number of can_frames following */ - struct can_frame frames[0]; + struct can_frame frames[]; }; The aligned payload 'frames' uses the same basic CAN frame structure defined diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst index 17f2bab6116447..2e31038d646205 100644 --- a/Documentation/networking/mptcp.rst +++ b/Documentation/networking/mptcp.rst @@ -60,10 +60,10 @@ address announcements. Typically, it is the client side that initiates subflows, and the server side that announces additional addresses via the ``ADD_ADDR`` and ``REMOVE_ADDR`` options. -Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see -mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the -same rules are applied for all the connections (see: ``ip mptcp``) ; and the -userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd +Path managers are controlled by the ``net.mptcp.path_manager`` sysctl knob -- +see mptcp-sysctl.rst. There are two types: the in-kernel one (``kernel``) where +the same rules are applied for all the connections (see: ``ip mptcp``) ; and the +userspace one (``userspace``), controlled by a userspace daemon (i.e. `mptcpd `_) where different rules can be applied for each connection. The path managers can be controlled via a Netlink API; see netlink_spec/mptcp_pm.rst. 
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst index 9d2416f63f6e36..c01675b25a901e 100644 --- a/Documentation/power/regulator/consumer.rst +++ b/Documentation/power/regulator/consumer.rst @@ -23,10 +23,18 @@ To release the regulator the consumer driver should call :: regulator_put(regulator); Consumers can be supplied by more than one regulator e.g. codec consumer with -analog and digital supplies :: +analog and digital supplies by means of bulk operations :: + + struct regulator_bulk_data supplies[2]; + + supplies[0].supply = "Vcc"; /* digital core */ + supplies[1].supply = "Avdd"; /* analog */ + + ret = regulator_bulk_get(dev, ARRAY_SIZE(supplies), supplies); + + // convenience helper to call regulator_put() on multiple regulators + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); - digital = regulator_get(dev, "Vcc"); /* digital core */ - analog = regulator_get(dev, "Avdd"); /* analog */ The regulator access functions regulator_get() and regulator_put() will usually be called in your device drivers probe() and remove() respectively. @@ -51,11 +59,21 @@ A consumer can determine if a regulator is enabled by calling:: This will return > zero when the regulator is enabled. +A set of regulators can be enabled with a single bulk operation :: + + int regulator_bulk_enable(int num_consumers, + struct regulator_bulk_data *consumers); + A consumer can disable its supply when no longer needed by calling:: int regulator_disable(regulator); +Or a number of them :: + + int regulator_bulk_disable(int num_consumers, + struct regulator_bulk_data *consumers); + NOTE: This may not disable the supply if it's shared with other consumers. The regulator will only be disabled when the enabled reference count is zero. @@ -64,11 +82,15 @@ Finally, a regulator can be forcefully disabled in the case of an emergency:: int regulator_force_disable(regulator); +This operation is also supported for multiple regulators :: + + int regulator_bulk_force_disable(int num_consumers, + struct regulator_bulk_data *consumers); + NOTE: this will immediately and forcefully shutdown the regulator output. All consumers will be powered off. - 3. Regulator Voltage Control & Status (dynamic drivers) ======================================================= diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index bf45df1558bbc3..0a4eaa7d66ddd0 100644 --- a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -2293,43 +2293,85 @@ delayed_register notice the need. skip_validation Skip unit descriptor validation (default: no). - The option is used to ignores the validation errors with the hexdump + The option is used to ignore the validation errors with the hexdump of the unit descriptor instead of a driver probe error, so that we can check its details. quirk_flags - Contains the bit flags for various device specific workarounds. - Applied to the corresponding card index. 
- - * bit 0: Skip reading sample rate for devices - * bit 1: Create Media Controller API entries - * bit 2: Allow alignment on audio sub-slot at transfer - * bit 3: Add length specifier to transfers - * bit 4: Start playback stream at first in implement feedback mode - * bit 5: Skip clock selector setup - * bit 6: Ignore errors from clock source search - * bit 7: Indicates ITF-USB DSD based DACs - * bit 8: Add a delay of 20ms at each control message handling - * bit 9: Add a delay of 1-2ms at each control message handling - * bit 10: Add a delay of 5-6ms at each control message handling - * bit 11: Add a delay of 50ms at each interface setup - * bit 12: Perform sample rate validations at probe - * bit 13: Disable runtime PM autosuspend - * bit 14: Ignore errors for mixer access - * bit 15: Support generic DSD raw U32_BE format - * bit 16: Set up the interface at first like UAC1 - * bit 17: Apply the generic implicit feedback sync mode - * bit 18: Don't apply implicit feedback sync mode - * bit 19: Don't closed interface during setting sample rate - * bit 20: Force an interface reset whenever stopping & restarting - a stream - * bit 21: Do not set PCM rate (frequency) when only one rate is - available for the given endpoint. - * bit 22: Set the fixed resolution 16 for Mic Capture Volume - * bit 23: Set the fixed resolution 384 for Mic Capture Volume - * bit 24: Set minimum volume control value as mute for devices - where the lowest playback value represents muted state instead - of minimum audible volume - * bit 25: Be similar to bit 24 but for capture streams + The option provides refined and flexible control over applying quirk + flags. It allows specifying the quirk flags for each device, and can + be modified dynamically via sysfs. + The old usage accepts an array of integers, each of which applies quirk + flags to the device in the order of probing. + E.g., ``quirk_flags=0x01,0x02`` applies get_sample_rate to the first + device, and share_media_device to the second device. + The new usage accepts a string in the format of + ``VID1:PID1:FLAGS1;VID2:PID2:FLAGS2;...``, where ``VIDx`` and ``PIDx`` + specify the device, and ``FLAGSx`` specify the flags to be applied. + ``VIDx`` and ``PIDx`` are 4-digit hexadecimal numbers, and can be + specified as ``*`` to match any value. ``FLAGSx`` can be a set of + flags given by name, separated by ``|``, or a hexadecimal number + representing the bit flags. The available flag names are listed below. + An exclamation mark can be prefixed to a flag name to negate the flag. + For example, ``1234:abcd:mixer_playback_min_mute|!ignore_ctl_error;*:*:0x01;`` + applies the ``mixer_playback_min_mute`` flag and clears the + ``ignore_ctl_error`` flag for the device 1234:abcd, and applies the + ``get_sample_rate`` flag for all devices.
+ + * bit 0: ``get_sample_rate`` + Skip reading sample rate for devices + * bit 1: ``share_media_device`` + Create Media Controller API entries + * bit 2: ``align_transfer`` + Allow alignment on audio sub-slot at transfer + * bit 3: ``tx_length`` + Add length specifier to transfers + * bit 4: ``playback_first`` + Start playback stream at first in implicit feedback mode + * bit 5: ``skip_clock_selector`` + Skip clock selector setup + * bit 6: ``ignore_clock_source`` + Ignore errors from clock source search + * bit 7: ``itf_usb_dsd_dac`` + Indicates ITF-USB DSD-based DACs + * bit 8: ``ctl_msg_delay`` + Add a delay of 20ms at each control message handling + * bit 9: ``ctl_msg_delay_1m`` + Add a delay of 1-2ms at each control message handling + * bit 10: ``ctl_msg_delay_5m`` + Add a delay of 5-6ms at each control message handling + * bit 11: ``iface_delay`` + Add a delay of 50ms at each interface setup + * bit 12: ``validate_rates`` + Perform sample rate validations at probe + * bit 13: ``disable_autosuspend`` + Disable runtime PM autosuspend + * bit 14: ``ignore_ctl_error`` + Ignore errors for mixer access + * bit 15: ``dsd_raw`` + Support generic DSD raw U32_BE format + * bit 16: ``set_iface_first`` + Set up the interface at first like UAC1 + * bit 17: ``generic_implicit_fb`` + Apply the generic implicit feedback sync mode + * bit 18: ``skip_implicit_fb`` + Don't apply implicit feedback sync mode + * bit 19: ``iface_skip_close`` + Don't close interface during setting sample rate + * bit 20: ``force_iface_reset`` + Force an interface reset whenever stopping & restarting a stream + * bit 21: ``fixed_rate`` + Do not set PCM rate (frequency) when only one rate is available + for the given endpoint + * bit 22: ``mic_res_16`` + Set the fixed resolution 16 for Mic Capture Volume + * bit 23: ``mic_res_384`` + Set the fixed resolution 384 for Mic Capture Volume + * bit 24: ``mixer_playback_min_mute`` + Set minimum volume control value as mute for devices where the + lowest playback value represents muted state instead of minimum + audible volume + * bit 25: ``mixer_capture_min_mute`` + Similar to bit 24 but for capture streams This module supports multiple devices, autoprobe and hotplugging. diff --git a/Documentation/staging/crc32.rst b/Documentation/staging/crc32.rst index 7542220967cb4c..64f3dd430a6ca7 100644 --- a/Documentation/staging/crc32.rst +++ b/Documentation/staging/crc32.rst @@ -34,7 +34,7 @@ do it in the right order, matching the endianness. Just like with ordinary division, you proceed one digit (bit) at a time. Each step of the division you take one more digit (bit) of the dividend and append it to the current remainder. Then you figure out the -appropriate multiple of the divisor to subtract to being the remainder +appropriate multiple of the divisor to subtract to bring the remainder back into range. In binary, this is easy - it has to be either 0 or 1, and to make the XOR cancel, it's just a copy of bit 32 of the remainder. @@ -116,7 +116,7 @@ for any fractional bytes at the end. To reduce the number of conditional branches, software commonly uses the byte-at-a-time table method, popularized by Dilip V. Sarwate, "Computation of Cyclic Redundancy Checks via Table Look-Up", Comm. ACM -v.31 no.8 (August 1998) p. 1008-1013. +v.31 no.8 (August 1988) p. 1008-1013. Here, rather than just shifting one bit of the remainder to decide in the correct multiple to subtract, we can shift a byte at a time.
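To illustrate the byte-at-a-time table method this crc32.rst hunk describes, here is a minimal, self-contained C sketch; it assumes the reflected CRC-32 polynomial 0xEDB88320 (the crc32_le bit-ordering convention) and is not the kernel's actual implementation::

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t crc32_table[256];

    /* Precompute: table[i] = remainder after feeding byte i through
     * eight single-bit division steps. */
    static void crc32_init(void)
    {
            for (uint32_t i = 0; i < 256; i++) {
                    uint32_t r = i;
                    for (int j = 0; j < 8; j++)
                            r = (r >> 1) ^ ((r & 1) ? 0xEDB88320u : 0);
                    crc32_table[i] = r;
            }
    }

    /* Sarwate's method: the low byte of the remainder, XORed with the
     * next message byte, indexes the table, consuming 8 bits per step. */
    static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
    {
            while (len--)
                    crc = (crc >> 8) ^ crc32_table[(crc ^ *p++) & 0xff];
            return crc;
    }

After crc32_init(), the conventional CRC-32 of a buffer is ~crc32_le(~0u, buf, len), i.e. the all-ones preset followed by a final inversion.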
diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 2ca92042767be4..8238f4c6e4f51a 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在两个频率之间切换所需的时间,以 | -| | 纳秒为单位(如不适用,设定为 | -| | CPUFREQ_ETERNAL) | +| | 纳秒为单位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 该CPU当前的工作频率(如适用) | diff --git a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst index add3de2d4523ad..5435c3928d4b3e 100644 --- a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心層註冊一個cpufreq_driver結構體。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在兩個頻率之間切換所需的時間,以 | -| | 納秒爲單位(如不適用,設定爲 | -| | CPUFREQ_ETERNAL) | +| | 納秒爲單位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 該CPU當前的工作頻率(如適用) | diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 406a9f4d08694e..7c527a01d1cf5a 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -374,6 +374,8 @@ Code Seq# Include File Comments 0xB2 08 arch/powerpc/include/uapi/asm/papr-physical-attestation.h powerpc/pseries Physical Attestation API +0xB2 09 arch/powerpc/include/uapi/asm/papr-hvpipe.h powerpc/pseries HVPIPE API + 0xB3 00 linux/mmc/ioctl.h 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h diff --git a/Kbuild b/Kbuild index f327ca86990cca..13324b4bbe236a 100644 --- a/Kbuild +++ b/Kbuild @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file) $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) +# Generate rq-offsets.h + +rq-offsets-file := include/generated/rq-offsets.h + +targets += kernel/sched/rq-offsets.s + +kernel/sched/rq-offsets.s: $(offsets-file) + +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE + $(call filechk,offsets,__RQ_OFFSETS_H__) + # Check for missing system calls quiet_cmd_syscalls = CALL $< cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) PHONY += missing-syscalls -missing-syscalls: scripts/checksyscalls.sh $(offsets-file) +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file) $(call cmd,syscalls) # Check the manual modification of atomic headers diff --git a/MAINTAINERS b/MAINTAINERS index d85cc78eef4a00..ff07cdc4bbd07a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1318,6 +1318,16 @@ S: Maintained F: Documentation/devicetree/bindings/rtc/amlogic,a4-rtc.yaml F: drivers/rtc/rtc-amlogic-a4.c +AMLOGIC SPIFC DRIVER +M: Liang Yang +M: Feng Chen +M: Xianwei Zhao +L: linux-amlogic@lists.infradead.org +L: linux-spi@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/spi/amlogic,a4-spifc.yaml +F: drivers/spi/spi-amlogic-spifc-a4.c + AMLOGIC SPISG DRIVER M: Sunny Luo M: Xianwei Zhao @@ -1772,7 +1782,7 @@ F: drivers/staging/iio/*/ad* X: drivers/iio/*/adjd* ANALOGBITS PLL LIBRARIES -M: Paul Walmsley +M: Paul Walmsley M: 
Samuel Holland S: Supported F: drivers/clk/analogbits/* @@ -1845,7 +1855,6 @@ S: Odd fixes F: drivers/input/mouse/bcm5974.c APPLE PCIE CONTROLLER DRIVER -M: Alyssa Rosenzweig M: Marc Zyngier L: linux-pci@vger.kernel.org S: Maintained @@ -1990,6 +1999,7 @@ S: Maintained F: arch/arm/include/asm/arch_timer.h F: arch/arm64/include/asm/arch_timer.h F: drivers/clocksource/arm_arch_timer.c +F: drivers/clocksource/arm_arch_timer_mmio.c ARM GENERIC INTERRUPT CONTROLLER DRIVERS M: Marc Zyngier @@ -2364,7 +2374,6 @@ F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT M: Sven Peter M: Janne Grunau -R: Alyssa Rosenzweig R: Neal Gompa L: asahi@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -2871,7 +2880,9 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm64/boot/dts/marvell/mmp/ F: drivers/clk/mmp/clk-pxa1908*.c +F: drivers/pmdomain/marvell/ F: include/dt-bindings/clock/marvell,pxa1908.h +F: include/dt-bindings/power/marvell,pxa1908-power.h ARM/Mediatek RTC DRIVER M: Eddie Huang @@ -3990,8 +4001,9 @@ F: drivers/input/touchscreen/atmel_mxt_ts.c ATOMIC INFRASTRUCTURE M: Will Deacon M: Peter Zijlstra -R: Boqun Feng +M: Boqun Feng R: Mark Rutland +R: Gary Guo L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/atomic_*.txt @@ -3999,6 +4011,9 @@ F: arch/*/include/asm/atomic*.h F: include/*/atomic*.h F: include/linux/refcount.h F: scripts/atomic/ +F: rust/kernel/sync/atomic.rs +F: rust/kernel/sync/atomic/ +F: rust/kernel/sync/refcount.rs ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER M: Bradley Grove @@ -4219,10 +4234,7 @@ M: Kent Overstreet L: linux-bcachefs@vger.kernel.org S: Externally maintained C: irc://irc.oftc.net/bcache -P: Documentation/filesystems/bcachefs/SubmittingPatches.rst T: git https://evilpiepirate.org/git/bcachefs.git -F: fs/bcachefs/ -F: Documentation/filesystems/bcachefs/ BDISP ST MEDIA DRIVER M: Fabien Dessenne @@ -4683,7 +4695,6 @@ F: security/bpf/ BPF [SELFTESTS] (Test Runners & Infrastructure) M: Andrii Nakryiko M: Eduard Zingerman -R: Mykola Lysenko L: bpf@vger.kernel.org S: Maintained F: tools/testing/selftests/bpf/ @@ -5259,7 +5270,6 @@ F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason -M: Josef Bacik M: David Sterba L: linux-btrfs@vger.kernel.org S: Maintained @@ -6225,7 +6235,7 @@ M: Josef Bacik M: Jens Axboe L: cgroups@vger.kernel.org L: linux-block@vger.kernel.org -T: git git://git.kernel.dk/linux-block +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git F: Documentation/admin-guide/cgroup-v1/blkio-controller.rst F: block/bfq-cgroup.c F: block/blk-cgroup.c @@ -6282,9 +6292,8 @@ F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c CORETEMP HARDWARE MONITORING DRIVER -M: Fenghua Yu L: linux-hwmon@vger.kernel.org -S: Maintained +S: Orphan F: Documentation/hwmon/coretemp.rst F: drivers/hwmon/coretemp.c @@ -6491,6 +6500,7 @@ S: Supported T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git F: include/linux/cred.h F: kernel/cred.c +F: rust/kernel/cred.rs F: Documentation/security/credentials.rst INTEL CRPS COMMON REDUNDANT PSU DRIVER @@ -7028,6 +7038,21 @@ F: drivers/devfreq/event/ F: include/dt-bindings/pmu/exynos_ppmu.h F: include/linux/devfreq-event.h +DEVICE I/O & IRQ [RUST] +M: Danilo Krummrich +M: Alice Ryhl +M: Daniel Almeida +L: rust-for-linux@vger.kernel.org +S: Supported +W: https://rust-for-linux.com +B: https://github.com/Rust-for-Linux/linux/issues +C: 
https://rust-for-linux.zulipchat.com +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git +F: rust/kernel/io.rs +F: rust/kernel/io/ +F: rust/kernel/irq.rs +F: rust/kernel/irq/ + DEVICE RESOURCE MANAGEMENT HELPERS M: Hans de Goede R: Matti Vaittinen @@ -7246,15 +7271,15 @@ F: include/linux/swiotlb.h F: kernel/dma/ DMA MAPPING HELPERS DEVICE DRIVER API [RUST] -M: Abdiel Janulgue M: Danilo Krummrich +R: Abdiel Janulgue R: Daniel Almeida R: Robin Murphy R: Andreas Hindborg L: rust-for-linux@vger.kernel.org S: Supported W: https://rust-for-linux.com -T: git https://github.com/Rust-for-Linux/linux.git alloc-next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: rust/helpers/dma.c F: rust/kernel/dma.rs F: samples/rust/rust_dma.rs @@ -7438,7 +7463,7 @@ S: Supported F: Documentation/devicetree/bindings/dpll/dpll-device.yaml F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml F: Documentation/driver-api/dpll.rst -F: drivers/dpll/* +F: drivers/dpll/ F: include/linux/dpll.h F: include/uapi/linux/dpll.h @@ -7479,6 +7504,8 @@ F: include/linux/kobj* F: include/linux/property.h F: include/linux/sysfs.h F: lib/kobj* +F: rust/kernel/debugfs.rs +F: rust/kernel/debugfs/ F: rust/kernel/device.rs F: rust/kernel/device/ F: rust/kernel/device_id.rs @@ -7486,6 +7513,8 @@ F: rust/kernel/devres.rs F: rust/kernel/driver.rs F: rust/kernel/faux.rs F: rust/kernel/platform.rs +F: samples/rust/rust_debugfs.rs +F: samples/rust/rust_debugfs_scoped.rs F: samples/rust/rust_driver_platform.rs F: samples/rust/rust_driver_faux.rs @@ -8086,7 +8115,6 @@ F: Documentation/devicetree/bindings/gpu/ F: Documentation/gpu/ F: drivers/gpu/drm/ F: drivers/gpu/vga/ -F: rust/kernel/drm/ F: include/drm/drm F: include/linux/vga* F: include/uapi/drm/ @@ -8098,11 +8126,21 @@ X: drivers/gpu/drm/i915/ X: drivers/gpu/drm/kmb/ X: drivers/gpu/drm/mediatek/ X: drivers/gpu/drm/msm/ -X: drivers/gpu/drm/nouveau/ +X: drivers/gpu/drm/nova/ X: drivers/gpu/drm/radeon/ X: drivers/gpu/drm/tegra/ X: drivers/gpu/drm/xe/ +DRM DRIVERS AND COMMON INFRASTRUCTURE [RUST] +M: Danilo Krummrich +M: Alice Ryhl +S: Supported +W: https://drm.pages.freedesktop.org/maintainer-tools/drm-rust.html +T: git https://gitlab.freedesktop.org/drm/rust/kernel.git +F: drivers/gpu/drm/nova/ +F: drivers/gpu/nova-core/ +F: rust/kernel/drm/ + DRM DRIVERS FOR ALLWINNER A10 M: Maxime Ripard M: Chen-Yu Tsai @@ -8744,9 +8782,6 @@ F: drivers/edac/thunderx_edac* EDAC-CORE M: Borislav Petkov M: Tony Luck -R: James Morse -R: Mauro Carvalho Chehab -R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next @@ -8754,6 +8789,13 @@ F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h +EDAC-A72 +M: Vijay Balakrishna +M: Tyler Hicks +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/edac/a72_edac.c + EDAC-DMC520 M: Lei Wang L: linux-edac@vger.kernel.org @@ -9770,11 +9812,14 @@ F: drivers/video/fbdev/imxfb.c FREESCALE IMX DDR PMU DRIVER M: Frank Li +M: Xu Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/admin-guide/perf/imx-ddr.rst F: Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml F: drivers/perf/fsl_imx8_ddr_perf.c +F: drivers/perf/fsl_imx9_ddr_perf.c +F: tools/perf/pmu-events/arch/arm64/freescale/ FREESCALE IMX I2C DRIVER M: Oleksij Rempel @@ -10154,7 +10199,7 @@ F: drivers/media/i2c/gc2145.c GATEWORKS SYSTEM CONTROLLER (GSC) DRIVER M: Tim Harvey S: Maintained 
-F: Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml +F: Documentation/devicetree/bindings/embedded-controller/gw,gsc.yaml F: Documentation/hwmon/gsc-hwmon.rst F: drivers/hwmon/gsc-hwmon.c F: drivers/mfd/gateworks-gsc.c @@ -10436,6 +10481,13 @@ F: drivers/phy/samsung/phy-gs101-ufs.c F: include/dt-bindings/clock/google,gs101.h K: [gG]oogle.?[tT]ensor +GPD FAN DRIVER +M: Cryolitia PukNgae +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/gpd-fan.rst +F: drivers/hwmon/gpd-fan.c + GPD POCKET FAN DRIVER M: Hans de Goede L: platform-driver-x86@vger.kernel.org @@ -10732,7 +10784,6 @@ W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ F: drivers/platform/x86/hdaps.c HARDWARE MONITORING -M: Jean Delvare M: Guenter Roeck L: linux-hwmon@vger.kernel.org S: Maintained @@ -10818,8 +10869,10 @@ M: John Paul Adrian Glaubitz M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfs.rst F: fs/hfs/ +F: include/linux/hfs_common.h HFSPLUS FILESYSTEM M: Viacheslav Dubeyko @@ -10827,8 +10880,10 @@ M: John Paul Adrian Glaubitz M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ +F: include/linux/hfs_common.h HGA FRAMEBUFFER DRIVER M: Ferenc Bakonyi @@ -11086,7 +11141,6 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt F: drivers/net/ethernet/hisilicon/ HISILICON PMU DRIVER -M: Yicong Yang M: Jonathan Cameron S: Supported W: http://www.hisilicon.com @@ -11326,7 +11380,7 @@ F: drivers/net/ethernet/huawei/hinic3/ HUAWEI MATEBOOK E GO EMBEDDED CONTROLLER DRIVER M: Pengyu Luo S: Maintained -F: Documentation/devicetree/bindings/platform/huawei,gaokun-ec.yaml +F: Documentation/devicetree/bindings/embedded-controller/huawei,gaokun3-ec.yaml F: drivers/platform/arm64/huawei-gaokun-ec.c F: drivers/power/supply/huawei-gaokun-battery.c F: drivers/usb/typec/ucsi/ucsi_huawei_gaokun.c @@ -11634,6 +11688,12 @@ S: Maintained F: Documentation/devicetree/bindings/i3c/aspeed,ast2600-i3c.yaml F: drivers/i3c/master/ast2600-i3c-master.c +I3C DRIVER FOR ANALOG DEVICES I3C CONTROLLER IP +M: Jorge Marques +S: Maintained +F: Documentation/devicetree/bindings/i3c/adi,i3c-master.yaml +F: drivers/i3c/master/adi-i3c-master.c + I3C DRIVER FOR CADENCE I3C MASTER IP M: Przemysław Gaj S: Maintained @@ -12885,8 +12945,8 @@ IO_URING M: Jens Axboe L: io-uring@vger.kernel.org S: Maintained -T: git git://git.kernel.dk/linux-block -T: git git://git.kernel.dk/liburing +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git F: include/linux/io_uring/ F: include/linux/io_uring.h F: include/linux/io_uring_types.h @@ -14377,6 +14437,15 @@ S: Maintained F: Documentation/devicetree/bindings/pwm/loongson,ls7a-pwm.yaml F: drivers/pwm/pwm-loongson.c +LOONGSON SECURITY ENGINE DRIVERS +M: Qunqin Zhao +L: linux-crypto@vger.kernel.org +S: Maintained +F: drivers/char/tpm/tpm_loongson.c +F: drivers/crypto/loongson/ +F: drivers/mfd/loongson-se.c +F: include/linux/mfd/loongson-se.h + LOONGSON-2 SOC SERIES CLOCK DRIVER M: Yinbo Zhu L: linux-clk@vger.kernel.org @@ -14432,6 +14501,12 @@ S: Maintained F: Documentation/devicetree/bindings/thermal/loongson,ls2k-thermal.yaml F: drivers/thermal/loongson2_thermal.c +LOONGSON-2K Board Management Controller (BMC) DRIVER +M: Binbin Zhou +M: Chong Qiao +S: 
Maintained +F: drivers/mfd/ls2k-bmc-core.c + LOONGSON EDAC DRIVER M: Zhao Qunqin L: linux-edac@vger.kernel.org @@ -15037,6 +15112,19 @@ L: linux-iio@vger.kernel.org S: Maintained F: drivers/iio/temperature/max30208.c +MAXIM MAX7360 KEYPAD LED MFD DRIVER +M: Mathieu Dubois-Briand +S: Maintained +F: Documentation/devicetree/bindings/gpio/maxim,max7360-gpio.yaml +F: Documentation/devicetree/bindings/mfd/maxim,max7360.yaml +F: drivers/gpio/gpio-max7360.c +F: drivers/input/keyboard/max7360-keypad.c +F: drivers/input/misc/max7360-rotary.c +F: drivers/mfd/max7360.c +F: drivers/pinctrl/pinctrl-max7360.c +F: drivers/pwm/pwm-max7360.c +F: include/linux/mfd/max7360.h + MAXIM MAX77650 PMIC MFD DRIVER M: Bartosz Golaszewski L: linux-kernel@vger.kernel.org @@ -15076,6 +15164,13 @@ F: Documentation/devicetree/bindings/*/*max77802.yaml F: drivers/regulator/max77802-regulator.c F: include/dt-bindings/*/*max77802.h +MAXIM MAX77838 PMIC REGULATOR DEVICE DRIVER +M: Ivaylo Ivanov +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/regulator/maxim,max77838.yaml +F: drivers/regulator/max77838-regulator.c + MAXIM MAX77976 BATTERY CHARGER M: Luca Ceresoli S: Supported @@ -15755,13 +15850,6 @@ S: Supported W: http://www.melexis.com F: drivers/iio/temperature/mlx90635.c -MELFAS MIP4 TOUCHSCREEN DRIVER -M: Sangwon Jee -S: Supported -W: http://www.melfas.com -F: Documentation/devicetree/bindings/input/touchscreen/melfas_mip4.txt -F: drivers/input/touchscreen/melfas_mip4.c - MELLANOX BLUEFIELD I2C DRIVER M: Khalil Blaiech M: Asmaa Mnebhi @@ -16142,6 +16230,7 @@ M: Andrew Morton M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git F: include/linux/numa_memblks.h F: mm/numa.c F: mm/numa_emulation.c @@ -16209,6 +16298,7 @@ R: Rik van Riel R: Liam R. 
Howlett R: Vlastimil Babka R: Harry Yoo +R: Jann Horn L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h @@ -16253,6 +16343,7 @@ R: Nico Pache R: Ryan Roberts R: Dev Jain R: Barry Song +R: Lance Yang L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org @@ -16703,7 +16794,6 @@ F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c MICROCHIP PCI1XXXX I2C DRIVER -M: Tharun Kumar P M: Kumaravel Thiagarajan M: Microchip Linux Driver Support L: linux-i2c@vger.kernel.org @@ -16712,7 +16802,6 @@ F: drivers/i2c/busses/i2c-mchp-pci1xxxx.c MICROCHIP PCIe UART DRIVER M: Kumaravel Thiagarajan -M: Tharun Kumar P L: linux-serial@vger.kernel.org S: Maintained F: drivers/tty/serial/8250/8250_pci1xxxx.c @@ -17191,6 +17280,13 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml F: drivers/video/backlight/mp3309c.c +MPS MP2869 DRIVER +M: Wensheng Wang +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/mp2869.rst +F: drivers/hwmon/pmbus/mp2869.c + MPS MP2891 DRIVER M: Noah Wang L: linux-hwmon@vger.kernel.org @@ -17198,6 +17294,13 @@ S: Maintained F: Documentation/hwmon/mp2891.rst F: drivers/hwmon/pmbus/mp2891.c +MPS MP29502 DRIVER +M: Wensheng Wang +L: linux-hwmon@vger.kernel.org +S: Maintained +F: Documentation/hwmon/mp29502.rst +F: drivers/hwmon/pmbus/mp29502.c + MPS MP2993 DRIVER M: Noah Wang L: linux-hwmon@vger.kernel.org @@ -17494,6 +17597,7 @@ NETFILTER M: Pablo Neira Ayuso M: Jozsef Kadlecsik M: Florian Westphal +R: Phil Sutter L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org S: Maintained @@ -18125,6 +18229,18 @@ F: drivers/nubus/ F: include/linux/nubus.h F: include/uapi/linux/nubus.h +NUVOTON NCT6694 MFD DRIVER +M: Ming Yu +S: Supported +F: drivers/gpio/gpio-nct6694.c +F: drivers/hwmon/nct6694-hwmon.c +F: drivers/i2c/busses/i2c-nct6694.c +F: drivers/mfd/nct6694.c +F: drivers/net/can/usb/nct6694_canfd.c +F: drivers/rtc/rtc-nct6694.c +F: drivers/watchdog/nct6694_wdt.c +F: include/linux/mfd/nct6694.h + NUVOTON NCT7201 IIO DRIVER M: Eason Yang L: linux-iio@vger.kernel.org @@ -18307,6 +18423,12 @@ F: Documentation/devicetree/bindings/clock/*imx* F: drivers/clk/imx/ F: include/dt-bindings/clock/*imx* +NXP PF5300/PF5301/PF5302 PMIC REGULATOR DEVICE DRIVER +M: Woodrow Douglass +S: Maintained +F: Documentation/devicetree/bindings/regulator/nxp,pf5300.yaml +F: drivers/regulator/pf530x-regulator.c + NXP PF8100/PF8121A/PF8200 PMIC REGULATOR DEVICE DRIVER M: Jagan Teki S: Maintained @@ -19308,7 +19430,7 @@ S: Maintained F: drivers/pci/controller/dwc/*layerscape* PCI DRIVER FOR FU740 -M: Paul Walmsley +M: Paul Walmsley M: Greentime Hu M: Samuel Holland L: linux-pci@vger.kernel.org @@ -19575,6 +19697,7 @@ C: irc://irc.oftc.net/linux-pci T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git F: rust/helpers/pci.c F: rust/kernel/pci.rs +F: rust/kernel/pci/ F: samples/rust/rust_driver_pci.rs PCIE BANDWIDTH CONTROLLER @@ -19877,6 +20000,7 @@ M: Christian Brauner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git +F: rust/kernel/pid_namespace.rs F: samples/pidfd/ F: tools/testing/selftests/clone3/ F: tools/testing/selftests/pid_namespace/ @@ -20482,6 +20606,8 @@ F: include/dt-bindings/sound/qcom,wcd93* F: sound/soc/codecs/lpass-*.* F: sound/soc/codecs/msm8916-wcd-analog.c F: sound/soc/codecs/msm8916-wcd-digital.c +F: sound/soc/codecs/pm4125-sdw.c +F: sound/soc/codecs/pm4125.* F: sound/soc/codecs/wcd-clsh-v2.* F: 
sound/soc/codecs/wcd-mbhc-v2.* F: sound/soc/codecs/wcd93*.* @@ -20788,8 +20914,8 @@ S: Supported F: drivers/dma/qcom/hidma* QUALCOMM I2C QCOM GENI DRIVER -M: Mukesh Kumar Savaliya -M: Viken Dadhaniya +M: Mukesh Kumar Savaliya +M: Viken Dadhaniya L: linux-i2c@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained @@ -21187,6 +21313,7 @@ M: Tony Luck M: Reinette Chatre R: Dave Martin R: James Morse +R: Babu Moger L: linux-kernel@vger.kernel.org S: Supported F: Documentation/filesystems/resctrl.rst @@ -21584,6 +21711,20 @@ S: Maintained F: Documentation/devicetree/bindings/iio/potentiometer/renesas,x9250.yaml F: drivers/iio/potentiometer/x9250.c +RENESAS RZ/G3E THERMAL SENSOR UNIT DRIVER +M: John Madieu +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/thermal/renesas,r9a09g047-tsu.yaml +F: drivers/thermal/renesas/rzg3e_thermal.c + +RENESAS RZ/G3S THERMAL SENSOR UNIT DRIVER +M: Claudiu Beznea +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/thermal/renesas,r9a08g045-tsu.yaml +F: drivers/thermal/renesas/rzg3s_thermal.c + RESET CONTROLLER FRAMEWORK M: Philipp Zabel S: Maintained @@ -21668,7 +21809,7 @@ F: Documentation/devicetree/bindings/timer/andestech,plmt0.yaml F: arch/riscv/boot/dts/andes/ RISC-V ARCHITECTURE -M: Paul Walmsley +M: Paul Walmsley M: Palmer Dabbelt M: Albert Ou R: Alexandre Ghiti @@ -21775,6 +21916,7 @@ F: drivers/mailbox/mailbox-th1520.c F: drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c F: drivers/pinctrl/pinctrl-th1520.c F: drivers/pmdomain/thead/ +F: drivers/power/reset/th1520-aon-reboot.c F: drivers/power/sequencing/pwrseq-thead-gpu.c F: drivers/reset/reset-th1520.c F: include/dt-bindings/clock/thead,th1520-clk-ap.h @@ -22062,6 +22204,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Steven Rostedt +M: Gabriele Monaco L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ @@ -22430,7 +22573,7 @@ F: Documentation/devicetree/bindings/regulator/samsung,s2m*.yaml F: Documentation/devicetree/bindings/regulator/samsung,s5m*.yaml F: drivers/clk/clk-s2mps11.c F: drivers/mfd/sec*.[ch] -F: drivers/regulator/s2m*.c +F: drivers/regulator/s2*.c F: drivers/regulator/s5m*.c F: drivers/rtc/rtc-s5m.c F: include/linux/mfd/samsung/ @@ -22833,6 +22976,7 @@ F: include/linux/security.h F: include/uapi/linux/lsm.h F: security/ F: tools/testing/selftests/lsm/ +F: rust/kernel/security.rs X: security/selinux/ K: \bsecurity_[a-z_0-9]\+\b @@ -23119,7 +23263,7 @@ S: Maintained F: drivers/watchdog/simatic-ipc-wdt.c SIFIVE DRIVERS -M: Paul Walmsley +M: Paul Walmsley M: Samuel Holland L: linux-riscv@lists.infradead.org S: Supported @@ -23219,13 +23363,14 @@ F: drivers/usb/misc/sisusbvga/ SL28 CPLD MFD DRIVER M: Michael Walle S: Maintained +F: Documentation/devicetree/bindings/embedded-controller/kontron,sl28cpld.yaml F: Documentation/devicetree/bindings/gpio/kontron,sl28cpld-gpio.yaml F: Documentation/devicetree/bindings/hwmon/kontron,sl28cpld-hwmon.yaml F: Documentation/devicetree/bindings/interrupt-controller/kontron,sl28cpld-intc.yaml -F: Documentation/devicetree/bindings/mfd/kontron,sl28cpld.yaml F: Documentation/devicetree/bindings/pwm/kontron,sl28cpld-pwm.yaml F: Documentation/devicetree/bindings/watchdog/kontron,sl28cpld-wdt.yaml F: drivers/gpio/gpio-sl28cpld.c +F: drivers/hwmon/sa67mcu-hwmon.c F: drivers/hwmon/sl28cpld-hwmon.c F: drivers/irqchip/irq-sl28cpld.c F: drivers/pwm/pwm-sl28cpld.c @@ -23723,6 +23868,12 @@ W: https://linuxtv.org Q: 
http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/dvb-frontends/sp2* +SPACEMIT K1 I2C DRIVER +M: Troy Mitchell +S: Maintained +F: Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml +F: drivers/i2c/busses/i2c-k1.c + SPANISH DOCUMENTATION M: Carlos Bilbao R: Avadhut Naik @@ -24269,7 +24420,7 @@ F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml F: drivers/input/keyboard/sun4i-lradc-keys.c SUNDANCE NETWORK DRIVER -M: Denis Kirjanov +M: Denis Kirjanov L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/dlink/sundance.c @@ -24495,9 +24646,8 @@ F: Documentation/devicetree/bindings/media/snps,dw-hdmi-rx.yaml F: drivers/media/platform/synopsys/hdmirx/* SYNOPSYS DESIGNWARE I2C DRIVER -M: Jarkko Nikula +M: Mika Westerberg R: Andy Shevchenko -R: Mika Westerberg R: Jan Dabros L: linux-i2c@vger.kernel.org S: Supported @@ -25652,16 +25802,10 @@ W: https://github.com/srcres258/linux-doc T: git https://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/ -TRIGGER SOURCE - ADI UTIL SIGMA DELTA SPI -M: David Lechner -S: Maintained -F: Documentation/devicetree/bindings/trigger-source/adi,util-sigma-delta-spi.yaml - TRIGGER SOURCE M: David Lechner S: Maintained -F: Documentation/devicetree/bindings/trigger-source/gpio-trigger.yaml -F: Documentation/devicetree/bindings/trigger-source/pwm-trigger.yaml +F: Documentation/devicetree/bindings/trigger-source/* TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE M: Dan Williams @@ -26796,7 +26940,7 @@ F: drivers/nvdimm/nd_virtio.c F: drivers/nvdimm/virtio_pmem.c VIRTIO RTC DRIVER -M: Peter Hilber +M: Peter Hilber L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_rtc_* @@ -26811,6 +26955,13 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO SPI DRIVER +M: Haixu Cui +L: virtualization@lists.linux.dev +S: Maintained +F: drivers/spi/spi-virtio.c +F: include/uapi/linux/virtio_spi.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann @@ -27458,10 +27609,8 @@ F: tools/testing/selftests/bpf/*xdp* K: (?:\b|_)xdp(?:\b|_) XDP SOCKETS (AF_XDP) -M: Björn Töpel M: Magnus Karlsson M: Maciej Fijalkowski -R: Jonathan Lemon R: Stanislav Fomichev L: netdev@vger.kernel.org L: bpf@vger.kernel.org @@ -27673,6 +27822,13 @@ S: Maintained F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml F: drivers/edac/versal_edac.c +XILINX VERSALNET EDAC DRIVER +M: Shubhrajyoti Datta +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml +F: drivers/edac/versalnet_edac.c +F: include/linux/cdx/edac_cdx_pcol.h + XILINX WATCHDOG DRIVER M: Srinivas Neeli R: Shubhrajyoti Datta diff --git a/Makefile b/Makefile index cf37b94078211c..d426446aeef57b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* @@ -1020,7 +1020,7 @@ KBUILD_AFLAGS += -fno-lto export CC_FLAGS_LTO endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers @@ -1444,11 +1444,11 @@ endif tools/: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ + $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= O=$(abspath 
$(objtree)) subdir=tools -C $(srctree)/tools/ $* + $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* # --------------------------------------------------------------------------- # Kernel selftest diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e08564..5440616f0774b2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,44 @@ config HOTPLUG_SMT config SMT_NUM_THREADS_DYNAMIC bool +config ARCH_SUPPORTS_SCHED_SMT + bool + +config ARCH_SUPPORTS_SCHED_CLUSTER + bool + +config ARCH_SUPPORTS_SCHED_MC + bool + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on ARCH_SUPPORTS_SCHED_SMT + default y + help + Improves the CPU scheduler's decision making when dealing with + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on ARCH_SUPPORTS_SCHED_CLUSTER + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + +config SCHED_MC + bool "Multi-Core Cache (MC) scheduler support" + depends on ARCH_SUPPORTS_SCHED_MC + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL config HOTPLUG_CORE_SYNC bool @@ -867,22 +905,26 @@ config PROPELLER_CLANG If unsure, say N. -config ARCH_SUPPORTS_CFI_CLANG +config ARCH_SUPPORTS_CFI bool help - An architecture should select this option if it can support Clang's - Control-Flow Integrity (CFI) checking. + An architecture should select this option if it can support Kernel + Control-Flow Integrity (CFI) checking (-fsanitize=kcfi). config ARCH_USES_CFI_TRAPS bool + help + An architecture should select this option if it requires the + .kcfi_traps section for KCFI trap handling. -config CFI_CLANG - bool "Use Clang's Control Flow Integrity (CFI)" - depends on ARCH_SUPPORTS_CFI_CLANG +config CFI + bool "Use Kernel Control Flow Integrity (kCFI)" + default CFI_CLANG + depends on ARCH_SUPPORTS_CFI depends on $(cc-option,-fsanitize=kcfi) help - This option enables Clang's forward-edge Control Flow Integrity - (CFI) checking, where the compiler injects a runtime check to each + This option enables forward-edge Control Flow Integrity (CFI) + checking, where the compiler injects a runtime check to each indirect function call to ensure the target is a valid function with the correct static type. This restricts possible call targets and makes it more difficult for an attacker to exploit bugs that allow @@ -891,10 +933,16 @@ config CFI_CLANG https://clang.llvm.org/docs/ControlFlowIntegrity.html +config CFI_CLANG + bool + transitional + help + Transitional config for CFI_CLANG to CFI migration. + config CFI_ICALL_NORMALIZE_INTEGERS bool "Normalize CFI tags for integers" - depends on CFI_CLANG - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on CFI + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS help This option normalizes the CFI tags for integer types so that all integer types of the same size and signedness receive the same CFI @@ -907,7 +955,7 @@ config CFI_ICALL_NORMALIZE_INTEGERS This option is necessary for using CFI with Rust. If unsure, say N. 
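As background for the kCFI options above, here is a hand-written sketch of the kind of indirect call site the inserted check guards; the ``ops`` and ``dispatch`` names are illustrative, and the check itself is emitted by the compiler rather than written in source::

    struct ops {
            int (*handler)(int);
    };

    int dispatch(struct ops *ops, int arg)
    {
            /*
             * With -fsanitize=kcfi, the compiler verifies at run time
             * that ops->handler points at a function whose static type
             * is int (*)(int) before making this indirect call; a
             * mismatch traps instead of transferring control.
             */
            return ops->handler(arg);
    }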
-config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG +config HAVE_CFI_ICALL_NORMALIZE_INTEGERS def_bool y depends on $(cc-option,-fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers) # With GCOV/KASAN we need this fix: https://github.com/llvm/llvm-project/pull/104826 @@ -915,7 +963,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC def_bool y - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS depends on RUSTC_VERSION >= 107900 # With GCOV/KASAN we need this fix: https://github.com/rust-lang/rust/pull/129373 depends on (RUSTC_LLVM_VERSION >= 190103 && RUSTC_VERSION >= 108200) || \ @@ -923,7 +971,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC config CFI_PERMISSIVE bool "Use CFI in permissive mode" - depends on CFI_CLANG + depends on CFI help When selected, Control Flow Integrity (CFI) violations result in a warning instead of a kernel panic. This option should only be used @@ -1609,7 +1657,7 @@ config HAVE_SPARSE_SYSCALL_NR related optimizations for a given architecture. config ARCH_HAS_VDSO_ARCH_DATA - depends on GENERIC_VDSO_DATA_STORE + depends on HAVE_GENERIC_VDSO bool config ARCH_HAS_VDSO_TIME_DATA @@ -1730,6 +1778,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS relocations preserved. This is used by some architectures to construct bespoke relocation tables for KASLR. +# Select if architecture uses the common generic TIF bits +config HAVE_GENERIC_TIF_BITS + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index 3e33621922c31b..76e4343c090f7d 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -328,7 +328,7 @@ static inline unsigned long ffz_b(unsigned long x) return sum; } -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) /* Whee. EV67 can calculate it directly. */ @@ -348,7 +348,7 @@ static inline unsigned long ffz(unsigned long word) /* * __ffs = Find First set bit in word. Undefined if no set bit exists. */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) /* Whee. EV67 can calculate it directly. */ @@ -373,7 +373,7 @@ static inline unsigned long __ffs(unsigned long word) * differs in spirit from the above __ffs. */ -static inline int ffs(int word) +static inline __attribute_const__ int ffs(int word) { int result = __ffs(word) + 1; return word ? result : 0; @@ -383,14 +383,14 @@ static inline int ffs(int word) * fls: find last bit set. 
*/ #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67) -static inline int fls64(unsigned long word) +static inline __attribute_const__ int fls64(unsigned long word) { return 64 - __kernel_ctlz(word); } #else extern const unsigned char __flsm1_tab[256]; -static inline int fls64(unsigned long x) +static inline __attribute_const__ int fls64(unsigned long x) { unsigned long t, a, r; @@ -403,12 +403,12 @@ static inline int fls64(unsigned long x) } #endif -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls64(x) - 1; } -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { return fls64(x); } diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index e9dad60b147f33..1ebb058904992b 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 582d96548385dd..06522451f018f3 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -231,7 +231,7 @@ flush_thread(void) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; extern void ret_from_fork(void); diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c index f77deb7991757e..2978da85fcb65b 100644 --- a/arch/arc/kernel/asm-offsets.c +++ b/arch/arc/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 186ceab661eb02..8166d090871304 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -166,7 +166,7 @@ asmlinkage void ret_from_fork(void); */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *c_regs; /* child's pt_regs */ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b1f3df39ed4068..358057001859fc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,7 +38,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6 select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP @@ -941,28 +941,14 @@ config IRQSTACKS config ARM_CPU_TOPOLOGY bool "Support cpu topology definition" depends on SMP && CPU_V7 + select ARCH_SUPPORTS_SCHED_MC + select ARCH_SUPPORTS_SCHED_SMT default y help Support ARM cpu topology definition. The MPIDR register defines affinity between processors which is then used to describe the cpu topology of an ARM System. -config SCHED_MC - bool "Multi-core scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
- -config SCHED_SMT - bool "SMT scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config HAVE_ARM_SCU bool help diff --git a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts index 83d283cf66334f..d425d9ee83db0e 100644 --- a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts +++ b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts @@ -218,7 +218,7 @@ &usbphy { usb0_id_det-gpios = <&pio 7 4 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH4 */ usb0_vbus_det-gpios = <&pio 7 5 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH5 */ - usb0_vbus-supply = <&reg_usb0_vbus>; + usb0_vbus-supply = <&reg_usb0_vbus>; usb1_vbus-supply = <&reg_usb1_vbus>; usb2_vbus-supply = <&reg_usb2_vbus>; status = "okay"; }; diff --git a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi index 272584881bb214..a0f787581dd902 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi +++ b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi @@ -82,7 +82,7 @@ }; &ehci0 { - status = "okay"; + status = "okay"; }; &mmc1 { diff --git a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi index fa162f7fa9f011..f0ed802a9d08e6 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi +++ b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi @@ -705,7 +705,7 @@ }; /omit-if-no-ref/ - uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins{ + uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins { pins = "PI16", "PI17"; function = "uart2"; }; diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts index 5143cb4e7b787a..cb6292319f39d7 100644 --- a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts +++ b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts @@ -29,7 +29,7 @@ clk_can0: clock-can0 { compatible = "fixed-clock"; #clock-cells = <0>; - clock-frequency = <40000000>; + clock-frequency = <40000000>; }; gpio-keys { diff --git a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts index ce0d6514eeb571..e4794ccb8e413f 100644 --- a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts +++ b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts @@ -66,8 +66,10 @@ mdio0 { #address-cells = <1>; #size-cells = <0>; - phy0: ethernet-phy@0 { - reg = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@4 { + reg = <4>; rxd0-skew-ps = <0>; rxd1-skew-ps = <0>; rxd2-skew-ps = <0>; diff --git a/arch/arm/boot/dts/marvell/armada-370-db.dts b/arch/arm/boot/dts/marvell/armada-370-db.dts index a7dc4c04d10bdf..a9a05d826f2233 100644 --- a/arch/arm/boot/dts/marvell/armada-370-db.dts +++ b/arch/arm/boot/dts/marvell/armada-370-db.dts @@ -119,7 +119,7 @@ "Out Jack", "HPL", "Out Jack", "HPR", "AIN1L", "In Jack", - "AIN1L", "In Jack"; + "AIN1R", "In Jack"; status = "okay"; simple-audio-card,dai-link@0 { diff --git a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts index d4e0b8150a84ce..cf26e2ceaaa074 100644 --- a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts +++ b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts @@ -38,7 +38,7 @@ simple-audio-card,mclk-fs = <256>; simple-audio-card,cpu { - sound-dai = <&audio0 0>; + sound-dai = <&audio0>; };
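The kirkwood-openrd-client fix just above removes a stray DAI index cell: the cells that follow a sound-dai phandle must match the provider's #sound-dai-cells. A reduced device-tree sketch of that constraint (labels and unit addresses hypothetical):

    audio0: audio-controller@a0000 {
    	#sound-dai-cells = <0>;		/* consumers pass a bare phandle */
    };

    sound {
    	simple-audio-card,cpu {
    		sound-dai = <&audio0>;	/* <&audio0 0> would over-specify */
    	};
    };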
simple-audio-card,codec { diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index 6915c766923a2f..84070e9698e8cc 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_AES_ARM_BS=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=y CONFIG_CRYPTO_DEV_S5P=y CONFIG_DMA_CMA=y diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig index a3be0b2ede09c7..a2995eb390c603 100644 --- a/arch/arm/configs/milbeaut_m10v_defconfig +++ b/arch/arm/configs/milbeaut_m10v_defconfig @@ -101,7 +101,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m -CONFIG_CRYPTO_CHACHA20_NEON=m # CONFIG_CRYPTO_HW is not set CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=64 diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index f2822eeefb9577..cc0e0e4a879cb1 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1291,7 +1291,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m CONFIG_CRYPTO_AES_ARM_CE=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_SUN4I_SS=m CONFIG_CRYPTO_DEV_FSL_CAAM=m CONFIG_CRYPTO_DEV_EXYNOS_RNG=m diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 939913ed9a73bd..1d5f752417398c 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -708,7 +708,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=y CONFIG_CRYPTO_GHASH_ARM_CE=m CONFIG_CRYPTO_AES_ARM=m CONFIG_CRYPTO_AES_ARM_BS=m -CONFIG_CRYPTO_CHACHA20_NEON=m CONFIG_CRYPTO_DEV_OMAP=m CONFIG_CRYPTO_DEV_OMAP_SHAM=m CONFIG_CRYPTO_DEV_OMAP_AES=m diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 1e5f3cdf691c4f..c436eec22d86ca 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -2,19 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (arm)" -config CRYPTO_CURVE25519_NEON - tristate - depends on KERNEL_MODE_NEON - select CRYPTO_KPP - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: arm with - - NEON (Advanced SIMD) extensions - config CRYPTO_GHASH_ARM_CE tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 4f23999ae17dfe..6346a73effc06a 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o -obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o @@ -18,4 +17,3 @@ blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o -curve25519-neon-y := curve25519-core.o curve25519-glue.o diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c deleted file mode 100644 index e7b87e09dd99f4..00000000000000 --- 
a/arch/arm/crypto/curve25519-glue.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This - * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been - * manually reworked for use in kernel space. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]) -{ - if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - curve25519_neon(out, scalar, point); - kernel_neon_end(); - } else { - curve25519_generic(out, scalar, point); - } -} -EXPORT_SYMBOL(curve25519_arch); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - return curve25519_arch(pub, secret, curve25519_base_point); -} -EXPORT_SYMBOL(curve25519_base_arch); - -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - u8 const *bp; - - if (req->src) { - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - bp = public_key; - } else { - bp = curve25519_base_point; - } - - curve25519_arch(buf, secret, bp); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-neon", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_compute_value, - .compute_shared_secret = curve25519_compute_value, - .max_size = curve25519_max_size, -}; - -static int __init arm_curve25519_init(void) -{ - if (elf_hwcap & HWCAP_NEON) { - static_branch_enable(&have_neon); - return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 
- crypto_register_kpp(&curve25519_alg) : 0; - } - return 0; -} - -static void __exit arm_curve25519_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON) - crypto_unregister_kpp(&curve25519_alg); -} - -module_init(arm_curve25519_init); -module_exit(arm_curve25519_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-neon"); -MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 4e7226ad02ec4d..ff1c729af05f03 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,8 +7,6 @@ #include #include -extern bool cntvct_ok; - static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 123f4a8ef44660..2101938d27fcbc 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -7,6 +7,8 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS + #include #include #include diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index a12efd0f43e81a..cd4b34c96e35e9 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -904,7 +904,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) watchpoint_single_step_handler(addr); } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI static void hw_breakpoint_cfi_handler(struct pt_regs *regs) { /* diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index da488d92e7a00d..55ca3fcd37e860 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -484,7 +484,7 @@ module_arch_cleanup(struct module *mod) #endif } -void __weak module_arch_freeing_init(struct module *mod) +void module_arch_freeing_init(struct module *mod) { #ifdef CONFIG_ARM_UNWIND struct unwind_table *init = mod->arch.init_table; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index e16ed102960cb0..d7aa95225c70bd 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -234,7 +234,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct thread_info *thread = task_thread_info(p); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 325448ffbba0c2..e38a30477f3d70 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -54,11 +54,9 @@ struct elfinfo { char *dynstr; /* ptr to .dynstr section */ }; -/* Cached result of boot-time check for whether the arch timer exists, - * and if so, whether the virtual counter is useable. +/* Boot-time check for whether the arch timer exists, and if so, + * whether the virtual counter is usable. */ -bool cntvct_ok __ro_after_init; - static bool __init cntvct_functional(void) { struct device_node *np; @@ -159,7 +157,7 @@ static void __init patch_vdso(void *ehdr) * want programs to incur the slight additional overhead of * dispatching through the VDSO only to fall back to syscalls. 
*/ - if (!cntvct_ok) { + if (!cntvct_functional()) { vdso_nullpatch_one(&einfo, "__vdso_gettimeofday"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime64"); @@ -197,8 +195,6 @@ static int __init vdso_init(void) vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; - cntvct_ok = cntvct_functional(); - patch_vdso(vdso_start); return 0; diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index dc47b2312127fd..6ea1bd55acf8de 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -242,7 +242,7 @@ choice config VF_USE_PIT_TIMER bool "Use PIT timer" - select VF_PIT_TIMER + select NXP_PIT_TIMER help Use SoC Periodic Interrupt Timer (PIT) as clocksource diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5c1023a6d78c1b..7b27ee9482b3eb 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -926,9 +926,7 @@ config VDSO default y if ARM_ARCH_TIMER select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index a195cd1d3e6dc4..1e220101337148 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -89,7 +89,7 @@ obj-$(CONFIG_CPU_V6) += proc-v6.o obj-$(CONFIG_CPU_V6K) += proc-v6.o obj-$(CONFIG_CPU_V7) += proc-v7.o proc-v7-bugs.o obj-$(CONFIG_CPU_V7M) += proc-v7m.o -obj-$(CONFIG_CFI_CLANG) += proc.o +obj-$(CONFIG_CFI) += proc.o obj-$(CONFIG_OUTER_CACHE) += l2c-common.o obj-$(CONFIG_CACHE_B15_RAC) += cache-b15-rac.o diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S index 4a3668b52a2db0..e1641799569bc0 100644 --- a/arch/arm/mm/cache-fa.S +++ b/arch/arm/mm/cache-fa.S @@ -112,7 +112,7 @@ SYM_FUNC_END(fa_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(fa_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b fa_coherent_user_range #endif SYM_FUNC_END(fa_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S index 0e94e5193dbd41..001d7042bd4696 100644 --- a/arch/arm/mm/cache-v4.S +++ b/arch/arm/mm/cache-v4.S @@ -104,7 +104,7 @@ SYM_FUNC_END(v4_coherent_user_range) * - size - region size */ SYM_TYPED_FUNC_START(v4_flush_kern_dcache_area) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4_dma_flush_range #endif SYM_FUNC_END(v4_flush_kern_dcache_area) diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S index ce55a2eef5da40..874fe5310f9a01 100644 --- a/arch/arm/mm/cache-v4wb.S +++ b/arch/arm/mm/cache-v4wb.S @@ -136,7 +136,7 @@ SYM_FUNC_END(v4wb_flush_user_cache_range) */ SYM_TYPED_FUNC_START(v4wb_flush_kern_dcache_area) add r1, r0, r1 -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_flush_kern_dcache_area) @@ -152,7 +152,7 @@ SYM_FUNC_END(v4wb_flush_kern_dcache_area) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wb_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S index a97dc267b3b0d7..2ee62e4b2b0753 100644 --- 
a/arch/arm/mm/cache-v4wt.S +++ b/arch/arm/mm/cache-v4wt.S @@ -108,7 +108,7 @@ SYM_FUNC_END(v4wt_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wt_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wt_coherent_user_range #endif SYM_FUNC_END(v4wt_coherent_kern_range) diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S index 9f415476e2183d..5ceea8965ea19d 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -117,7 +117,7 @@ SYM_FUNC_END(v6_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v6_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v6_coherent_user_range #endif SYM_FUNC_END(v6_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 201ca05436fad5..726681fb7d4de9 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -261,7 +261,7 @@ SYM_FUNC_END(v7_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7_coherent_user_range #endif SYM_FUNC_END(v7_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S index 14d719eba729de..7f9cfad2ea2105 100644 --- a/arch/arm/mm/cache-v7m.S +++ b/arch/arm/mm/cache-v7m.S @@ -286,7 +286,7 @@ SYM_FUNC_END(v7m_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7m_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7m_coherent_user_range #endif SYM_FUNC_END(v7m_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S index d0ce3414a13e26..4612a4961e817a 100644 --- a/arch/arm/mm/proc-arm1020.S +++ b/arch/arm/mm/proc-arm1020.S @@ -203,7 +203,7 @@ SYM_FUNC_END(arm1020_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020_coherent_user_range #endif SYM_FUNC_END(arm1020_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S index 64f031bf6eff5a..b4a8a3a8eda3d4 100644 --- a/arch/arm/mm/proc-arm1020e.S +++ b/arch/arm/mm/proc-arm1020e.S @@ -200,7 +200,7 @@ SYM_FUNC_END(arm1020e_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020e_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020e_coherent_user_range #endif SYM_FUNC_END(arm1020e_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S index 42ed5ed0725285..709870e99e1913 100644 --- a/arch/arm/mm/proc-arm1022.S +++ b/arch/arm/mm/proc-arm1022.S @@ -199,7 +199,7 @@ SYM_FUNC_END(arm1022_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1022_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1022_coherent_user_range #endif SYM_FUNC_END(arm1022_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S index b3ae62cd553aac..02f7370a8c5cbf 100644 --- a/arch/arm/mm/proc-arm1026.S +++ b/arch/arm/mm/proc-arm1026.S @@ 
-194,7 +194,7 @@ SYM_FUNC_END(arm1026_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1026_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1026_coherent_user_range #endif SYM_FUNC_END(arm1026_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index a30df54ad5fae2..4727f4b5b6e8da 100644 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -180,7 +180,7 @@ SYM_FUNC_END(arm920_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm920_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm920_coherent_user_range #endif SYM_FUNC_END(arm920_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S index aac4e048100d01..5a4a3f4f2683b8 100644 --- a/arch/arm/mm/proc-arm922.S +++ b/arch/arm/mm/proc-arm922.S @@ -182,7 +182,7 @@ SYM_FUNC_END(arm922_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm922_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm922_coherent_user_range #endif SYM_FUNC_END(arm922_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 035941faeb2ed4..1c4830afe1d395 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -229,7 +229,7 @@ SYM_FUNC_END(arm925_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm925_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm925_coherent_user_range #endif SYM_FUNC_END(arm925_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index 6f43d6af2d9a7a..a09cc3e02efda4 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -192,7 +192,7 @@ SYM_FUNC_END(arm926_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm926_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm926_coherent_user_range #endif SYM_FUNC_END(arm926_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S index 0d30bb25c42bf1..545c076c36d241 100644 --- a/arch/arm/mm/proc-arm940.S +++ b/arch/arm/mm/proc-arm940.S @@ -153,7 +153,7 @@ SYM_FUNC_END(arm940_coherent_kern_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm940_coherent_user_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm940_flush_kern_dcache_area #endif SYM_FUNC_END(arm940_coherent_user_range) diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S index 27750ace2cedaa..f3d4e18c3fba5a 100644 --- a/arch/arm/mm/proc-arm946.S +++ b/arch/arm/mm/proc-arm946.S @@ -173,7 +173,7 @@ SYM_FUNC_END(arm946_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm946_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm946_coherent_user_range #endif SYM_FUNC_END(arm946_coherent_kern_range) diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index f67b2ffac85411..7f08d06c962539 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -208,7 +208,7 @@ SYM_FUNC_END(feroceon_flush_user_cache_range) */ .align 5 
SYM_TYPED_FUNC_START(feroceon_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b feroceon_coherent_user_range #endif SYM_FUNC_END(feroceon_coherent_kern_range) diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index 8e9f38da863a52..4669c63e3121d0 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -163,7 +163,7 @@ SYM_FUNC_END(mohawk_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(mohawk_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b mohawk_coherent_user_range #endif SYM_FUNC_END(mohawk_coherent_kern_range) diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index 14927b38045244..fd25634a2ed5dc 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -223,7 +223,7 @@ SYM_FUNC_END(xsc3_flush_user_cache_range) * it also trashes the mini I-cache used by JTAG debuggers. */ SYM_TYPED_FUNC_START(xsc3_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b xsc3_coherent_user_range #endif SYM_FUNC_END(xsc3_coherent_kern_range) diff --git a/arch/arm/mm/tlb-v4.S b/arch/arm/mm/tlb-v4.S index 09ff69008d94d2..079774a02be631 100644 --- a/arch/arm/mm/tlb-v4.S +++ b/arch/arm/mm/tlb-v4.S @@ -52,7 +52,7 @@ SYM_FUNC_END(v4_flush_user_tlb_range) * - start - virtual address (may not be aligned) * - end - virtual address (may not be aligned) */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI SYM_TYPED_FUNC_START(v4_flush_kern_tlb_range) b .v4_flush_kern_tlb_range SYM_FUNC_END(v4_flush_kern_tlb_range) diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20df..3d96fb41d6245d 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64d..b3e13f67d59853 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -100,7 +100,7 @@ config ARM64 select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN select ARCH_SUPPORTS_LTO_CLANG_THIN - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING @@ -108,6 +108,9 @@ config ARM64 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_CLUSTER + select ARCH_SUPPORTS_SCHED_MC select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT @@ -151,6 +154,7 @@ config ARM64 select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP + select GENERIC_IRQ_ENTRY select GENERIC_IRQ_IPI select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD select GENERIC_IRQ_PROBE @@ -162,8 +166,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select 
HAS_IOPORT select HAVE_MOVE_PMD @@ -212,7 +214,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \ if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \ - if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \ + if (DYNAMIC_FTRACE_WITH_ARGS && !CFI && \ (CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE)) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS @@ -1138,6 +1140,7 @@ config ARM64_ERRATUM_3194386 * ARM Neoverse-V1 erratum 3324341 * ARM Neoverse V2 erratum 3324336 * ARM Neoverse-V3 erratum 3312417 + * ARM Neoverse-V3AE erratum 3312417 On affected cores "MSR SSBS, #0" instructions may not affect subsequent speculative instructions, which may permit unexepected @@ -1493,7 +1496,7 @@ choice config CPU_BIG_ENDIAN bool "Build big-endian kernel" # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c - depends on AS_IS_GNU || AS_VERSION >= 150000 + depends on (AS_IS_GNU || AS_VERSION >= 150000) && BROKEN help Say Y if you plan on running a kernel with a big-endian userspace. @@ -1505,29 +1508,6 @@ config CPU_LITTLE_ENDIAN endchoice -config SCHED_MC - bool "Multi-core scheduler support" - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_CLUSTER - bool "Cluster scheduler support" - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - bool "SMT scheduler support" - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 @@ -1698,20 +1678,6 @@ config MITIGATE_SPECTRE_BRANCH_HISTORY When taking an exception from user-space, a sequence of branches or a firmware call overwrites the branch history. -config RODATA_FULL_DEFAULT_ENABLED - bool "Apply r/o permissions of VM areas also to their linear aliases" - default y - help - Apply read-only attributes of VM areas to the linear alias of - the backing pages as well. This prevents code or read-only data - from being modified (inadvertently or intentionally) via another - mapping of the same memory page. This additional enhancement can - be turned off at runtime by passing rodata=[off|on] (and turned on - with rodata=full if this option is set to 'n') - - This requires the linear region to be mapped down to pages, - which may adversely affect performance in some cases. 
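The RODATA_FULL_DEFAULT_ENABLED removal above takes the "full" rodata behaviour out of Kconfig; whether read-only attributes also cover the linear alias of the backing pages is now purely a command-line decision (see the arch_parse_debug_rodata() change later in this series). A hedged sketch of the invariant being enforced, built on the real lm_alias() and set_memory_ro() primitives (the wrapper is hypothetical and assumes a page-aligned kernel-image address):

    #include <linux/mm.h>		/* lm_alias() */
    #include <linux/set_memory.h>	/* set_memory_ro() */

    /* Hypothetical helper: protect a region via *both* of its mappings. */
    static int protect_both_mappings(void *kaddr, int nr_pages)
    {
    	int err;

    	/* the primary mapping, e.g. in the kernel image */
    	err = set_memory_ro((unsigned long)kaddr, nr_pages);
    	if (err)
    		return err;

    	/* the same physical pages, seen through the linear map */
    	return set_memory_ro((unsigned long)lm_alias(kaddr), nr_pages);
    }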
- config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" depends on !KCSAN @@ -1782,7 +1748,6 @@ config COMPAT_VDSO bool "Enable vDSO for 32-bit applications" depends on !CPU_BIG_ENDIAN depends on (CC_IS_CLANG && LD_IS_LLD) || "$(CROSS_COMPILE_COMPAT)" != "" - select GENERIC_COMPAT_VDSO default y help Place in the process address space of 32-bit applications an @@ -2218,14 +2183,13 @@ config ARM64_HAFT endmenu # "ARMv8.9 architectural features" -menu "v9.4 architectural features" +menu "ARMv9.4 architectural features" config ARM64_GCS bool "Enable support for Guarded Control Stack (GCS)" default y select ARCH_HAS_USER_SHADOW_STACK select ARCH_USES_HIGH_VMA_FLAGS - depends on !UPROBES help Guarded Control Stack (GCS) provides support for a separate stack with restricted access which contains only return @@ -2237,7 +2201,7 @@ config ARM64_GCS The feature is detected at runtime, and will remain disabled if the system does not implement the feature. -endmenu # "v9.4 architectural features" +endmenu # "ARMv9.4 architectural features" config ARM64_SVE bool "ARM Scalable Vector Extension support" diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index bb24dba7338ea0..d6d21e8498dcf9 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -298,7 +298,7 @@ cpu-thermal { polling-delay-passive = <250>; polling-delay = <2000>; - thermal-sensors = <&tmu 0>; + thermal-sensors = <&tmu 1>; trips { cpu_alert0: trip0 { temperature = <85000>; @@ -331,7 +331,7 @@ soc-thermal { polling-delay-passive = <250>; polling-delay = <2000>; - thermal-sensors = <&tmu 1>; + thermal-sensors = <&tmu 0>; trips { soc_alert0: trip0 { temperature = <85000>; diff --git a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi index 0d4a5fd9503f29..f2d278d171eb19 100644 --- a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi @@ -345,11 +345,13 @@ /* CPS Lane 1 - U32 */ sata-port@0 { phys = <&cp1_comphy1 0>; + status = "okay"; }; /* CPS Lane 3 - U31 */ sata-port@1 { phys = <&cp1_comphy3 1>; + status = "okay"; }; }; diff --git a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi index ad0ab34b66028c..bd42bfbe408bbe 100644 --- a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi +++ b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi @@ -152,11 +152,12 @@ /* SRDS #0 - SATA on M.2 connector */ &cp0_sata0 { - phys = <&cp0_comphy0 1>; status = "okay"; - /* only port 1 is available */ - /delete-node/ sata-port@0; + sata-port@1 { + phys = <&cp0_comphy0 1>; + status = "okay"; + }; }; /* microSD */ diff --git a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts index 47234d0858dd21..338853d3b179bb 100644 --- a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts +++ b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts @@ -563,11 +563,13 @@ /* SRDS #1 - SATA on M.2 (J44) */ &cp1_sata0 { - phys = <&cp1_comphy1 0>; status = "okay"; /* only port 0 is available */ - /delete-node/ sata-port@1; + sata-port@0 { + phys = <&cp1_comphy1 0>; + status = "okay"; + }; }; &cp1_syscon0 { diff --git a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts index 0f53745a6fa0d8..6f237d3542b910 100644 --- a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts +++ b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts @@ 
-413,7 +413,13 @@ /* SRDS #0,#1,#2,#3 - PCIe */ &cp0_pcie0 { num-lanes = <4>; - phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>; + /* + * The mvebu-comphy driver does not currently know how to pass correct + * lane-count to ATF while configuring the serdes lanes. + * Rely on bootloader configuration only. + * + * phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>; + */ status = "okay"; }; @@ -475,7 +481,13 @@ /* SRDS #0,#1 - PCIe */ &cp1_pcie0 { num-lanes = <2>; - phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>; + /* + * The mvebu-comphy driver does not currently know how to pass correct + * lane-count to ATF while configuring the serdes lanes. + * Rely on bootloader configuration only. + * + * phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>; + */ status = "okay"; }; @@ -512,10 +524,9 @@ status = "okay"; /* only port 1 is available */ - /delete-node/ sata-port@0; - sata-port@1 { phys = <&cp1_comphy3 1>; + status = "okay"; }; }; @@ -631,9 +642,8 @@ status = "okay"; /* only port 1 is available */ - /delete-node/ sata-port@0; - sata-port@1 { + status = "okay"; phys = <&cp2_comphy3 1>; }; }; diff --git a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi index afc041c1c448c3..bb2bb47fd77c12 100644 --- a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi +++ b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi @@ -137,6 +137,14 @@ pinctrl-0 = <&ap_mmc0_pins>; pinctrl-names = "default"; vqmmc-supply = <&v_1_8>; + /* + * Not stable in HS modes - phy needs "more calibration", so disable + * UHS (by preventing voltage switch), SDR104, SDR50 and DDR50 modes. + */ + no-1-8-v; + no-sd; + no-sdio; + non-removable; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi index 3d8b6f0c554188..69833a0a94d0f1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi @@ -731,6 +731,7 @@ spi-max-frequency = <104000000>; spi-rx-bus-width = <4>; spi-tx-bus-width = <1>; + vcc-supply = <&vcc_1v8_s3>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi index 4fedc50cce8c86..11940c77f2bd01 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi @@ -42,9 +42,8 @@ simple-audio-card,bitclock-master = <&masterdai>; simple-audio-card,format = "i2s"; simple-audio-card,frame-master = <&masterdai>; - simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_LOW>; + simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_HIGH>; simple-audio-card,mclk-fs = <256>; - simple-audio-card,pin-switches = "Headphones"; simple-audio-card,routing = "Headphones", "LOUT1", "Headphones", "ROUT1", diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bf13d676aae2cc..e223cbf350e49c 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -871,6 +871,8 @@ static inline bool system_supports_pmuv3(void) return cpus_have_final_cap(ARM64_HAS_PMUV3); } +bool cpu_supports_bbml2_noabort(void); + static inline bool system_supports_bbml2_noabort(void) { return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 661735616787e2..9b00b75acbf296 100644 --- a/arch/arm64/include/asm/cputype.h +++ 
b/arch/arm64/include/asm/cputype.h @@ -81,7 +81,6 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 -#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -93,9 +92,11 @@ #define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define ARM_CPU_PART_CORTEX_A720 0xD81 #define ARM_CPU_PART_CORTEX_X4 0xD82 +#define ARM_CPU_PART_NEOVERSE_V3AE 0xD83 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_CORTEX_X925 0xD85 #define ARM_CPU_PART_CORTEX_A725 0xD87 +#define ARM_CPU_PART_CORTEX_A720AE 0xD89 #define ARM_CPU_PART_NEOVERSE_N3 0xD8E #define APM_CPU_PART_XGENE 0x000 @@ -129,6 +130,7 @@ #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 +#define NVIDIA_CPU_PART_OLYMPUS 0x010 #define FUJITSU_CPU_PART_A64FX 0x001 @@ -170,7 +172,6 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) -#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) @@ -182,9 +183,11 @@ #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) +#define MIDR_NEOVERSE_V3AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3AE) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) +#define MIDR_CORTEX_A720AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720AE) #define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) @@ -220,6 +223,7 @@ #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) +#define MIDR_NVIDIA_OLYMPUS MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_OLYMPUS) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index fbb5c99eb2f9d6..5fca480090434c 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -128,7 +128,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) { unsigned long flags = regs->pstate & DAIF_MASK; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 
46033027510cca..b37da3ee852963 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -91,6 +91,14 @@ msr cntvoff_el2, xzr // Clear virtual offset .endm +/* Branch to skip_label if SPE version is less than given version */ +.macro __spe_vers_imp skip_label, version, tmp + mrs \tmp, id_aa64dfr0_el1 + ubfx \tmp, \tmp, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 + cmp \tmp, \version + b.lt \skip_label +.endm + .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 @@ -103,8 +111,7 @@ csel x2, xzr, x0, eq // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cbz x0, .Lskip_spe_\@ // Skip if SPE not present + __spe_vers_imp .Lskip_spe_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x0 // Skip if SPE not present mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, and x0, x0, #(1 << PMBIDR_EL1_P_SHIFT) @@ -263,10 +270,8 @@ mov x0, xzr mov x2, xzr - mrs x1, id_aa64dfr0_el1 - ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cmp x1, #3 - b.lt .Lskip_spe_fgt_\@ + /* If SPEv1p2 is implemented, */ + __spe_vers_imp .Lskip_spe_fgt_\@, #ID_AA64DFR0_EL1_PMSVer_V1P2, x1 /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #HDFGRTR_EL2_nPMSNEVFR_EL1_MASK orr x2, x2, #HDFGWTR_EL2_nPMSNEVFR_EL1_MASK @@ -387,6 +392,17 @@ orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0 orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1 .Lskip_pmuv3p9_\@: + /* If SPE is implemented, */ + __spe_vers_imp .Lskip_spefds_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x1 + /* we can read PMSIDR and */ + mrs_s x1, SYS_PMSIDR_EL1 + and x1, x1, #PMSIDR_EL1_FDS + /* if FEAT_SPE_FDS is implemented, */ + cbz x1, .Lskip_spefds_\@ + /* disable traps of PMSDSFR to EL2. */ + orr x0, x0, #HDFGRTR2_EL2_nPMSDSFR_EL1 + +.Lskip_spefds_\@: msr_s SYS_HDFGRTR2_EL2, x0 msr_s SYS_HDFGWTR2_EL2, x0 msr_s SYS_HFGRTR2_EL2, xzr diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h new file mode 100644 index 00000000000000..cab8cd78f69385 --- /dev/null +++ b/arch/arm64/include/asm/entry-common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_ARM64_ENTRY_COMMON_H +#define _ASM_ARM64_ENTRY_COMMON_H + +#include + +#include +#include +#include +#include +#include + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_MTE_ASYNC_FAULT | _TIF_FOREIGN_FPSTATE) + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, (void __user *)NULL, current); + } + + if (ti_work & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static inline bool arch_irqentry_exit_need_resched(void) +{ + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. 
+ */ + if (!system_capabilities_finalized()) + return false; + + return true; +} + +#define arch_irqentry_exit_need_resched arch_irqentry_exit_need_resched + +#endif /* _ASM_ARM64_ENTRY_COMMON_H */ diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index e3874c4fc399e5..a2da3cb21c244a 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -89,7 +89,6 @@ void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); void do_el1_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); -void do_signal(struct pt_regs *regs); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 5bc432234d3aba..8fa0707069e8b9 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -21,7 +21,7 @@ static inline void gcsstr(u64 *addr, u64 val) register u64 *_addr __asm__ ("x0") = addr; register long _val __asm__ ("x1") = val; - /* GCSSTTR x1, x0 */ + /* GCSSTTR x1, [x0] */ asm volatile( ".inst 0xd91f1c01\n" : @@ -81,6 +81,82 @@ static inline int gcs_check_locked(struct task_struct *task, return 0; } +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, [x0] */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + +static inline void push_user_gcs(unsigned long val, int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + gcspr -= sizeof(u64); + put_user_gcs(val, (unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr, SYS_GCSPR_EL0); +} + +/* + * Unlike put/push_user_gcs() above, get/pop_user_gcs() doesn't + * validate the GCS permission is set on the page being read. This + * differs from how the hardware works when it consumes data stored at + * GCSPR. Callers should ensure this is acceptable.
+ */ +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + unsigned long ret; + u64 load = 0; + + /* Ensure previous GCS operations are visible before we read the page */ + gcsb_dsync(); + ret = copy_from_user(&load, addr, sizeof(load)); + if (ret != 0) + *err = ret; + return load; +} + +static inline u64 pop_user_gcs(int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + u64 read_val; + + read_val = get_user_gcs((__force unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr + sizeof(u64), SYS_GCSPR_EL0); + + return read_val; +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -91,6 +167,10 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) { } +static inline void push_user_gcs(unsigned long val, int *err) { } + static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, const struct kernel_clone_args *args) { @@ -101,6 +181,15 @@ static inline int gcs_check_locked(struct task_struct *task, { return 0; } +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + *err = -EFAULT; + return 0; +} +static inline u64 pop_user_gcs(int *err) +{ + return 0; +} #endif diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 13f94c8ddfc03b..6d567265467ccc 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -178,6 +178,7 @@ #define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) #define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR) #define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY) +#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 9b96840fb979bf..83e03abbb2ca92 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -274,6 +274,10 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); #define ioremap_np(addr, size) \ ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + +#define ioremap_encrypted(addr, size) \ + ioremap_prot((addr), (size), PAGE_KERNEL) + /* * io{read,write}{16,32,64}be() macros */ @@ -311,7 +315,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) { if (unlikely(is_realm_world())) - return __arm64_is_protected_mmio(phys_addr, size); + return arm64_rsi_is_protected(phys_addr, size); return false; } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2b07f0a27a7d85..0ee4f6fa3a172d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1369,6 +1369,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void) } void kvm_init_host_debug_data(void); +void kvm_debug_init_vhe(void); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 1246216616b518..2888b5d0375736 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -355,11 +355,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } -static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) -{ - return pteref; -} - static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* @@ -389,11 +384,6 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } -static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) -{ - return rcu_dereference_raw(pteref); -} - static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) @@ -561,26 +551,6 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2 */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); -/** - * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Intermediate physical address at which to place the mapping. - * @size: Size of the mapping. - * - * The page-table is assumed to be unreachable by any hardware walkers prior - * to freeing and therefore no TLB invalidation is performed. - */ -void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size); - -/** - * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * - * It is assumed that the rest of the page-table is freed before this operation. - */ -void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); - /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. 
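The kvm_pgtable.h hunk above retires the short-lived split teardown API: with destroy_range() and destroy_pgd() gone, kvm_pgtable_stage2_destroy() is once again the single entry point that frees every level of an unreachable stage-2 table, PGD included. Caller-side, the effect is simply (function name hypothetical):

    /* Illustrative caller: tear down a stage-2 table no walker can reach. */
    static void my_stage2_teardown(struct kvm_pgtable *pgt)
    {
    	/* no TLB invalidation needed; the table is already unreachable */
    	kvm_pgtable_stage2_destroy(pgt);
    }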
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 35f9d94780048c..ea58282f59bb4f 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -179,9 +179,7 @@ struct pkvm_mapping { int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops); -void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size); -void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 49f1a810df1681..ff6fd0bbd7d272 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -78,6 +78,9 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); +extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); +extern void init_idmap_kpti_bbml2_flag(void); +extern void linear_map_maybe_split_to_ptes(void); /* * This check is triggered during the early boot before the cpufeature diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index abd2dee416b3b3..aa89c2e67ebc84 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -371,6 +371,11 @@ static inline pmd_t pmd_mkcont(pmd_t pmd) return __pmd(pmd_val(pmd) | PMD_SECT_CONT); } +static inline pmd_t pmd_mknoncont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index 0159b625cc7f0e..932ea4b6204289 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -2,7 +2,6 @@ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H -#include #include #define PREEMPT_NEED_RESCHED BIT(32) @@ -87,7 +86,6 @@ void preempt_schedule_notrace(void); #ifdef CONFIG_PREEMPT_DYNAMIC -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index fded5358641f8f..baff24004459ea 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -7,6 +7,8 @@ #include +DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + #ifdef CONFIG_PTDUMP #include diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 47ff8654c5ec1e..65b053a24d8247 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -169,10 +169,6 @@ struct pt_regs { u64 sdei_ttbr1; struct frame_record_meta stackframe; - - /* Only valid for some EL1 exceptions. */ - u64 lockdep_hardirqs; - u64 exit_rcu; }; /* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. 
*/ @@ -214,11 +210,12 @@ static inline void forget_syscall(struct pt_regs *regs) (regs)->pmr == GIC_PRIO_IRQON : \ true) -#define interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs)) +static __always_inline bool regs_irqs_disabled(const struct pt_regs *regs) +{ + return (regs->pstate & PSR_I_BIT) || !irqs_priority_unmasked(regs); +} -#define fast_interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_F_BIT)) +#define interrupts_enabled(regs) (!regs_irqs_disabled(regs)) static inline unsigned long user_stack_pointer(struct pt_regs *regs) { diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index b42aeac05340ed..88b50d660e85a0 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -16,7 +16,7 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); +bool arm64_rsi_is_protected(phys_addr_t base, size_t size); static inline bool is_realm_world(void) { diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index ba269a7a320134..3d96dde4d214cb 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -21,7 +21,7 @@ static inline bool arch_parse_debug_rodata(char *arg) if (!arg) return false; - if (!strcmp(arg, "full")) { + if (!strcmp(arg, "on")) { rodata_enabled = rodata_full = true; return true; } @@ -31,7 +31,7 @@ static inline bool arch_parse_debug_rodata(char *arg) return true; } - if (!strcmp(arg, "on")) { + if (!strcmp(arg, "noalias")) { rodata_enabled = true; rodata_full = false; return true; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6604fd6f33f452..6455db1b54fd2d 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -281,8 +281,6 @@ #define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) #define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) -#define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2) - #define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0) #define SYS_APIAKEYHI_EL1 sys_reg(3, 0, 2, 1, 1) #define SYS_APIBKEYLO_EL1 sys_reg(3, 0, 2, 1, 2) @@ -344,15 +342,6 @@ #define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) #define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) -/*** Statistical Profiling Extension ***/ -#define PMSEVFR_EL1_RES0_IMP \ - (GENMASK_ULL(47, 32) | GENMASK_ULL(23, 16) | GENMASK_ULL(11, 8) |\ - BIT_ULL(6) | BIT_ULL(4) | BIT_ULL(2) | BIT_ULL(0)) -#define PMSEVFR_EL1_RES0_V1P1 \ - (PMSEVFR_EL1_RES0_IMP & ~(BIT_ULL(18) | BIT_ULL(17) | BIT_ULL(11))) -#define PMSEVFR_EL1_RES0_V1P2 \ - (PMSEVFR_EL1_RES0_V1P1 & ~BIT_ULL(6)) - /* Buffer error reporting */ #define PMBSR_EL1_FAULT_FSC_SHIFT PMBSR_EL1_MSS_SHIFT #define PMBSR_EL1_FAULT_FSC_MASK PMBSR_EL1_MSS_MASK diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 5b91803201ef88..1aa4ecb73429fe 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,44 +502,4 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ -#ifdef CONFIG_ARM64_GCS - -static inline int gcssttr(unsigned long __user *addr, unsigned long val) -{ - register unsigned long __user *_addr __asm__ ("x0") = addr; - register unsigned long _val __asm__ ("x1") = val; - int err = 0; - - /* GCSSTTR x1, x0 */ - asm volatile( - "1: .inst 0xd91f1c01\n" - "2: \n" - _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) - : "+r" (err) - : "rZ" (_val), "r" (_addr) - : "memory"); - - return err; -} - 
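The uaccess.h removal starting above completes the relocation of the GCS store helpers into asm/gcs.h, where this series also adds the read-side get_user_gcs()/pop_user_gcs(). A short write-side usage sketch, calling the helpers exactly as they are defined earlier in this diff (only the caller is hypothetical):

    #include <asm/gcs.h>

    /* Hypothetical caller: push one token onto the user's shadow stack. */
    static int push_token(unsigned long token)
    {
    	int err = 0;

    	/* decrements GCSPR_EL0 by 8, then stores via GCSSTTR */
    	push_user_gcs(token, &err);
    	return err;
    }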
-static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, - int *err) -{ - int ret; - - if (!access_ok((char __user *)addr, sizeof(u64))) { - *err = -EFAULT; - return; - } - - uaccess_ttbr0_enable(); - ret = gcssttr(addr, val); - if (ret != 0) - *err = ret; - uaccess_ttbr0_disable(); -} - - -#endif /* CONFIG_ARM64_GCS */ - #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 3ac35f4a667cfc..6d75e03d38274a 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -7,11 +7,10 @@ #ifndef __ASSEMBLY__ /* - * Warning: This code is meant to be used with - * ENABLE_COMPAT_VDSO only. + * Warning: This code is meant to be used from the compat vDSO only. */ -#ifndef ENABLE_COMPAT_VDSO -#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#ifdef __aarch64__ +#error This header is meant to be used from the compat vDSO only #endif #ifdef dmb diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index d60ea7a72a9cb3..7d1a116549b1b9 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2018 ARM Limited */ -#ifndef __ASM_VDSO_GETTIMEOFDAY_H -#define __ASM_VDSO_GETTIMEOFDAY_H +#ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H +#define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #ifndef __ASSEMBLY__ @@ -163,4 +163,4 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) #endif /* !__ASSEMBLY__ */ -#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ +#endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index da1ab87595925f..c59e84105b43cd 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -5,6 +5,8 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H +#ifdef __aarch64__ + #ifndef __ASSEMBLY__ #include @@ -96,4 +98,10 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #endif /* !__ASSEMBLY__ */ +#else /* !__aarch64__ */ + +#include "compat_gettimeofday.h" + +#endif /* __aarch64__ */ + #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 12f534e8f3edf8..4ec1acd3c1b348 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -9,18 +9,13 @@ #define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { - /* - * SW table walks can't handle removal of intermediate entries.
- */ - return pud_sect_supported() && - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return pud_sect_supported(); } #define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { - /* See arch_vmap_pud_supported() */ - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return true; } #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 2788e95d0ff022..2977b5fe068dbe 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -14,7 +14,7 @@ enum ipi_vector { static inline int xen_irqs_disabled(struct pt_regs *regs) { - return !interrupts_enabled(regs); + return regs_irqs_disabled(regs); } #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 72c78468b806b5..575564ecdb0b78 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -145,5 +145,6 @@ */ #define HWCAP3_MTE_FAR (1UL << 0) #define HWCAP3_MTE_STORE_ONLY (1UL << 1) +#define HWCAP3_LSFE (1UL << 2) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 4d529ff7ba513a..7aca29e1d30b24 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -357,6 +357,16 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) * as long as we take care not to create a writable * mapping for executable code. */ + fallthrough; + + case EFI_ACPI_MEMORY_NVS: + /* + * ACPI NVS marks an area reserved for use by the + * firmware, even after exiting the boot service. + * This may be used by the firmware for sharing dynamic + * tables/data (e.g., ACPI CCEL) with the OS. Map it + * as read-only. + */ prot = PAGE_KERNEL_RO; break; @@ -407,7 +417,7 @@ int apei_claim_sea(struct pt_regs *regs) return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); if (regs) - return_to_irqs_enabled = interrupts_enabled(regs); + return_to_irqs_enabled = !regs_irqs_disabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 30d4bbe68661f4..b6367ff3a49ca1 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,6 +6,7 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. 
*/ +#define COMPILE_OFFSETS #include #include diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 59d723c9ab8f5a..8cb3b575a03165 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -531,6 +531,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), @@ -545,6 +546,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), {} }; #endif diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ef269a5a37e12c..63cd05e6973d38 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -279,6 +279,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -2028,6 +2029,7 @@ static void __init kpti_install_ng_mappings(void) if (arm64_use_ng_mappings) return; + init_idmap_kpti_bbml2_flag(); stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); } @@ -2218,7 +2220,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } -static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +bool cpu_supports_bbml2_noabort(void) { /* * We want to allow usage of BBML2 in as wide a range of kernel contexts @@ -2235,6 +2237,10 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco static const struct midr_range supports_bbml2_noabort_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), {} }; @@ -2250,6 +2256,11 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco return true; } +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); +} + #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { @@ -3277,6 +3288,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), @@ -3948,6 +3960,7 @@ void __init setup_system_features(void) { setup_system_capabilities(); + linear_map_maybe_split_to_ptes(); kpti_install_ng_mappings(); sve_setup(); diff --git 
a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ba834909a28bd0..c44e6d94f5deb1 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -162,6 +162,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", [KERNEL_HWCAP_MTE_FAR] = "mtefar", [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174f7..29307642f4c904 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -167,7 +167,7 @@ static void send_user_sigtrap(int si_code) if (WARN_ON(!user_mode(regs))) return; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) local_irq_enable(); arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), @@ -212,7 +212,7 @@ static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) if (esr_brk_comment(esr) == BUG_BRK_IMM) return bug_brk_handler(regs, esr); - if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) return cfi_brk_handler(regs, esr); if (esr_brk_comment(esr) == FAULT_BRK_IMM) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 2b0c5925502e70..f546a914f04174 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -37,29 +38,20 @@ * This is intended to match the logic in irqentry_enter(), handling the kernel * mode transitions only. */ -static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +static __always_inline irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) { - regs->exit_rcu = false; - - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - trace_hardirqs_off_finish(); - - regs->exit_rcu = true; - return; - } - - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); + return irqentry_enter(regs); } -static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - __enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); + + return state; } /* @@ -70,30 +62,17 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs) * This is intended to match the logic in irqentry_exit(), handling the kernel * mode transitions only, and with preemption handled elsewhere. 
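For orientation, this is the shape every EL1 handler in this file converges on after the conversion: the generic entry code hands back a state token that must be threaded into the matching exit call, replacing the lockdep_hardirqs/exit_rcu fields that used to live in pt_regs. A minimal sketch, assuming the kernel's generic irqentry API; the handler name is hypothetical:

static void noinstr example_el1_handler(struct pt_regs *regs)
{
	irqentry_state_t state;

	state = irqentry_enter(regs);	/* RCU/lockdep/context-tracking entry */

	/* exception-specific handling goes here */

	irqentry_exit(regs, state);	/* must receive the token from entry */
}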
*/ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); - - if (interrupts_enabled(regs)) { - if (regs->exit_rcu) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - trace_hardirqs_on(); - } else { - if (regs->exit_rcu) - ct_irq_exit(); - } +static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + irqentry_exit(regs, state); } -static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +static void noinstr exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) { mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs); + __exit_to_kernel_mode(regs, state); } /* @@ -101,18 +80,15 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __enter_from_user_mode(void) +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CT_STATE_USER); - user_exit_irqoff(); - trace_hardirqs_off_finish(); + enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(); + __enter_from_user_mode(regs); } /* @@ -120,113 +96,19 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. 
*/ -static __always_inline void __exit_to_user_mode(void) -{ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - user_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - local_irq_enable(); - - if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - - local_irq_disable(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { - unsigned long flags; - local_irq_disable(); - - flags = read_thread_flags(); - if (unlikely(flags & _TIF_WORK_MASK)) - do_notify_resume(regs, flags); - - local_daif_mask(); - - lockdep_sys_exit(); -} - -static __always_inline void exit_to_user_mode(struct pt_regs *regs) -{ exit_to_user_mode_prepare(regs); + local_daif_mask(); mte_check_tfsr_exit(); - __exit_to_user_mode(); + exit_to_user_mode(); } asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - exit_to_user_mode(regs); -} - -/* - * Handle IRQ/context state management when entering an NMI from user/kernel - * mode. Before this function is called it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static void noinstr arm64_enter_nmi(struct pt_regs *regs) -{ - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - ct_nmi_enter(); - - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); -} - -/* - * Handle IRQ/context state management when exiting an NMI from user/kernel - * mode. After this function returns it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs) -{ - bool restore = regs->lockdep_hardirqs; - - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - } - - ct_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); + arm64_exit_to_user_mode(regs); } /* @@ -234,14 +116,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) * kernel mode. Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); ct_nmi_enter(); trace_hardirqs_off_finish(); + + return state; } /* @@ -249,62 +135,19 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) * kernel mode. 
After this function returns it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); } -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -#define need_irq_preemption() \ - (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) -#else -#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) -#endif - -static void __sched arm64_preempt_schedule_irq(void) -{ - if (!need_irq_preemption()) - return; - - /* - * Note: thread_info::preempt_count includes both thread_info::count - * and thread_info::need_resched, and is not equivalent to - * preempt_count(). - */ - if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return; - - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); -} - static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -324,7 +167,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, unsigned long esr) { - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); console_verbose(); @@ -475,73 +318,87 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, 
state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) { debug_exception_enter(regs); /* @@ -554,37 +411,42 @@ static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) do_el1_softstep(esr, regs); debug_exception_exit(regs); } - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); + irqentry_state_t state; - arm64_enter_el1_dbg(regs); + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_el1_brk64(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -639,30 +501,32 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_enter_nmi(regs); + irqentry_state_t state; + + state = irqentry_nmi_enter(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_kernel_mode(regs); + irqentry_state_t state; + + state = enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); - arm64_preempt_schedule_irq(); - - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { write_sysreg(DAIF_PROCCTX_NOIRQ, daif); - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) 
__el1_pnmi(regs, handler); else __el1_irq(regs, handler); @@ -681,21 +545,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) @@ -710,50 +575,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sme_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_sys(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) @@ -763,58 +628,58 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_undef(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); 
local_daif_restore(DAIF_PROCCTX); do_el0_bti(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_mops(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_gcs(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) @@ -822,12 +687,12 @@ static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) @@ -835,7 +700,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); /* * After handling a breakpoint, we suspend the breakpoint * and use single-step to move to the next instruction. @@ -846,7 +711,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) local_daif_restore(DAIF_PROCCTX); do_el0_softstep(esr, regs); } - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) @@ -854,39 +719,39 @@ static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_brk64(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_fpac(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) @@ -960,7 +825,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) static void noinstr el0_interrupt(struct pt_regs 
*regs, void (*handler)(struct pt_regs *)) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); write_sysreg(DAIF_PROCCTX_NOIRQ, daif); @@ -971,7 +836,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr __el0_irq_handler_common(struct pt_regs *regs) @@ -997,14 +862,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + irqentry_state_t state; - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) @@ -1015,27 +881,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_cp15(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc_compat(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_bkpt32(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) @@ -1114,7 +980,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); panic_bad_stack(regs, esr, far); } @@ -1122,6 +988,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { + irqentry_state_t state; unsigned long ret; /* @@ -1146,9 +1013,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs); + irqentry_nmi_exit(regs, state); return ret; } diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c37f02d7194e0b..e3f8f51748bc94 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1265,6 +1265,8 @@ void __init sme_setup(void) if (!system_supports_sme()) return; + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + /* * SME doesn't require any particular vector length be * supported but it does require at least one. We should have * disabled the feature entirely while bringing up CPUs but * let's double check here. The bitmap is SVE_VQ_MAX sized for * sharing with SVE. */
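The reordering above works because of find_last_bit()'s contract: when no bit is set it returns the bitmap size, so once min_bit is computed first, the single WARN_ON(min_bit >= SVE_VQ_MAX) subsumes the old bitmap_empty() warning. A stand-alone sketch of that contract (not kernel source; BITS_PER_LONG defined locally for self-containedness):

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long find_last_bit_sketch(const unsigned long *map,
					  unsigned long size)
{
	unsigned long i = size;

	while (i--) {
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;	/* index of the highest set bit */
	}

	return size;			/* empty bitmap: one past the end */
}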
- WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + WARN_ON(min_bit >= SVE_VQ_MAX); - min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52ce2..410060ebd86dfd 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 0f4bd77718590c..e8ddbde31a833d 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -18,9 +18,9 @@ extern const u8 __eh_frame_start[], __eh_frame_end[]; -extern void idmap_cpu_replace_ttbr1(void *pgdir); +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); -static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, void *start, void *end, pgprot_t prot, bool may_use_cont, int root_level) { @@ -40,7 +40,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) { bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); - u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_t prot; @@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) twopass |= enable_scs; prot = twopass ? data_prot : text_prot; + /* + * [_text, _stext) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. + */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, !twopass, root_level); map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, @@ -90,7 +96,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) true, root_level); dsb(ishst); - idmap_cpu_replace_ttbr1(init_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); if (twopass) { if (IS_ENABLED(CONFIG_RELOCATABLE)) @@ -129,10 +135,10 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) /* Copy the root page table to its final location */ memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); dsb(ishst); - idmap_cpu_replace_ttbr1(swapper_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); } -static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) { u64 sctlr = read_sysreg(sctlr_el1); u64 tcr = read_sysreg(tcr_el1) | TCR_DS; @@ -172,30 +178,30 @@ static void __init remap_idmap_for_lpa2(void) */ create_init_idmap(init_pg_dir, mask); dsb(ishst); - set_ttbr0_for_lpa2((u64)init_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); /* * Recreate the initial ID map with the same granularity as before. * Don't bother with the FDT, we no longer need it after this.
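A side note on the recurring (char *) casts introduced here: subtracting two char pointers yields the byte count directly as a ptrdiff_t, so region sizes no longer need to round-trip through u64. A minimal sketch, with hypothetical symbol names standing in for the linker-provided markers:

#include <string.h>

extern char demo_region_start[], demo_region_end[];	/* hypothetical */

static inline void wipe_demo_region(void)
{
	/* the pointer difference is already a byte count */
	memset(demo_region_start, 0, demo_region_end - demo_region_start);
}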
*/ memset(init_idmap_pg_dir, 0, - (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); create_init_idmap(init_idmap_pg_dir, mask); dsb(ishst); /* switch back to the updated initial ID map */ - set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ - memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void __init map_fdt(u64 fdt) +static void *__init map_fdt(phys_addr_t fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); - u64 efdt = fdt + MAX_FDT_SIZE; - u64 ptep = (u64)ptes; + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ /* * Map up to MAX_FDT_SIZE bytes, but avoid overlap with @@ -205,6 +211,8 @@ static void __init map_fdt(u64 fdt) fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)init_idmap_pg_dir, false, 0); dsb(ishst); + + return (void *)fdt; } /* @@ -230,7 +238,7 @@ static bool __init ng_mappings_allowed(void) return true; } -asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) { static char const chosen_str[] __initconst = "/chosen"; u64 va_base, pa_base = (u64)&_text; @@ -238,15 +246,14 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - - map_fdt((u64)fdt); + void *fdt_mapped = map_fdt(fdt); /* Clear BSS and the initial page tables */ - memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ - chosen = fdt_path_offset(fdt, chosen_str); - init_feature_override(boot_status, fdt, chosen); + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { va_bits = VA_BITS_MIN; @@ -266,7 +273,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) * fill in the high bits from the seed. */ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - u64 kaslr_seed = kaslr_early_init(fdt, chosen); + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); if (kaslr_seed && kaslr_requires_kpti()) arm64_use_ng_mappings = ng_mappings_allowed(); diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 7982788e7b9aaa..de52cd85c69193 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -26,8 +26,9 @@ * @va_offset: Offset between a physical page and its current mapping * in the VA space */ -void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) { u64 cmask = (level == 3) ? 
CONT_PTE_SIZE - 1 : U64_MAX; ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; @@ -87,19 +88,22 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { - u64 ptep = (u64)pg_dir + PAGE_SIZE; + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_val(text_prot) &= ~clrmask; pgprot_val(data_prot) &= ~clrmask; - map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, - text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); - map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, - data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); return ptep; } diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 46cafee7829f48..08ef9f80456bcc 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -29,9 +29,10 @@ u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); int scs_patch(const u8 eh_frame[], int size); -void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); -asmlinkage void early_map_kernel(u64 boot_status, void *fdt); +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6438bf62e753f3..4137cc5ef031f6 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -108,9 +108,10 @@ arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 4c6d2d712fbd3c..89fbeb32107e31 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,6 +13,7 @@ #include #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + 
procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -126,16 +142,34 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + u64 b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); +} + +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + u64 ret_addr; + int err = 0; + int xn = (opcode >> 5) & 0x1f; + u64 r_target = get_x_reg(regs, xn); + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index efb2803ec943d6..9e772a292d565d 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -11,7 +11,8 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 1f91fd2a818798..2799bdb2fb820b 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "decode-insn.h" @@ -159,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. 
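Taken together, the simulators above and the uretprobe hook keep one invariant when GCS is enabled for the task: every simulated call pushes the return address that the matching simulated return later pops and verifies. Condensed into a hypothetical pair of helpers (the real code open-codes this in update_lr(), simulate_ret() and arch_uretprobe_hijack_return_addr()):

static int gcs_sim_call(struct pt_regs *regs, unsigned long ret_addr)
{
	int err = 0;

	push_user_gcs(ret_addr, &err);		/* mirror the hardware push */
	if (!err)
		procedure_link_pointer_set(regs, ret_addr);
	return err;				/* caller raises SIGSEGV on error */
}

static void gcs_sim_ret(struct pt_regs *regs, u64 target)
{
	int err = 0;
	u64 shadow = pop_user_gcs(&err);	/* mirror the hardware pop */

	if (err || shadow != target) {		/* mismatch: abort rather than fault later */
		force_sig(SIGSEGV);
		return;
	}
	instruction_pointer_set(regs, target);
}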
+ */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 96482a1412c6aa..fba7ca102a8c42 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -409,7 +409,7 @@ asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index edf1783ffc8174..f9a32dfde00671 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -884,6 +884,7 @@ static u8 spectre_bhb_loop_affected(void) static const struct midr_range spectre_bhb_k38_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), {}, }; static const struct midr_range spectre_bhb_k32_list[] = { diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index ce4778141ec7b8..c64a06f58c0bc0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -84,7 +84,25 @@ static void __init arm64_rsi_setup_memory(void) } } -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world) + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) { enum ripas ripas; phys_addr_t end, top; @@ -101,18 +119,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) break; if (WARN_ON(top <= base)) break; - if (ripas != RSI_RIPAS_DEV) + if (ripas == RSI_RIPAS_EMPTY) break; base = top; } return base >= end; } -EXPORT_SYMBOL(__arm64_is_protected_mmio); +EXPORT_SYMBOL(arm64_rsi_is_protected); static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) { - if (__arm64_is_protected_mmio(phys, size)) + if (arm64_rsi_is_protected(phys, size)) *prot = pgprot_encrypted(*prot); else *prot = pgprot_decrypted(*prot); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 6f24a0251e1837..95169f7b653198 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -243,7 +243,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from.
*/ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 77c7926a4df660..23c05dc7a8f2ac 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -214,7 +214,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_stext); + kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index db3f972f8cd973..1110eeb21f572d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1576,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index c442fcec6b9e8c..aba7ca6bca2d1a 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -43,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, add_random_kstack_offset(); - if (scno < sc_nr) { + if (likely(scno < sc_nr)) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f528b6041f6a80..5041817af267f8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1015,7 +1015,7 @@ int bug_brk_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; @@ -1039,7 +1039,7 @@ int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f2dfdc7dc8185b..ffa3536581f670 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -21,8 +21,6 @@ endif cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-disable-warning = $(call try-run,\ - $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -59,10 +57,10 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef 
-Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ -std=gnu11 @@ -74,16 +72,6 @@ VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) -# The 32-bit compiler does not provide 128-bit integers, which are used in -# some headers that are indirectly included from the vDSO code. -# This hack makes the compiler happy and should trigger a warning/error if -# variables of such type are referenced. -VDSO_CFLAGS += -D__uint128_t='void*' -# Silence some warnings coming from headers that operate on long's -# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) -VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) -VDSO_CFLAGS += -Wno-int-to-pointer-cast - # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5bf101c869c9ab..bd6b6a620a09ca 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2113,8 +2113,10 @@ static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - if (is_kernel_in_hyp_mode()) + if (is_kernel_in_hyp_mode()) { kvm_timer_init_vhe(); + kvm_debug_init_vhe(); + } if (vgic_present) kvm_vgic_init_cpu_hardware(); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 381382c19fe474..e027d9c32b0d30 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -96,6 +96,13 @@ void kvm_init_host_debug_data(void) } } +void kvm_debug_init_vhe(void) +{ + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ + if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1))) + write_sysreg_el1(0, SYS_PMSCR); +} + /* * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host * has taken over MDSCR_EL1. 
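The hunks that follow move the host's MDCR_EL2 save/restore out of the per-exit trap configuration and into vcpu load/put on VHE, so the register is snapshotted once per scheduling cycle rather than on every guest entry. The pattern in miniature (names here are illustrative, not the kernel's):

static u64 saved_host_mdcr_el2;

static void vhe_vcpu_load_sketch(void)
{
	saved_host_mdcr_el2 = read_sysreg(mdcr_el2);	/* snapshot host state */
}

static void vhe_vcpu_put_sketch(void)
{
	write_sysreg(saved_host_mdcr_el2, mdcr_el2);	/* restore host state */
}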
@@ -138,6 +145,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) /* Must be called before kvm_vcpu_load_vhe() */ KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + if (has_vhe()) + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + /* * Determine which of the possible debug states we're in: * @@ -184,6 +194,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) { + if (has_vhe()) + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) return; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2ca..8bdb1eed090abf 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); - } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 84ec4e100fbb98..b6682202edf3c6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; if (is_nested_ctxt(vcpu)) { @@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); - write_sysreg(0, hstr_el2); if (system_supports_pmuv3()) { write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index ccd575d5f6dec9..d3b9ec8a7c2834 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); + + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + __activate_traps_common(vcpu); __activate_cptr_traps(vcpu); @@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + __deactivate_traps_common(vcpu); write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 71d2fc97f0046a..82da9b03692d45 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -253,7 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - __vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1); + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); diff --git 
a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c36f282a175dfc..c351b4abd5dbfb 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1551,38 +1551,21 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, return 0; } -void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size) +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { + size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); -} - -void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) -{ - size_t pgd_sz; - + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - - /* - * Since the pgtable is unlinked at this point, and not shared with - * other walkers, safely deference pgd with kvm_dereference_pteref_raw() - */ - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) -{ - kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); - kvm_pgtable_stage2_destroy_pgd(pgt); -} - void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 86f3d80daf37af..7363942925038e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -904,38 +904,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) return 0; } -/* - * Assume that @pgt is valid and unlinked from the KVM MMU to free the - * page-table without taking the kvm_mmu_lock and without performing any - * TLB invalidations. - * - * Also, the range of addresses can be large enough to cause need_resched - * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke - * cond_resched() periodically to prevent hogging the CPU for a long time - * and schedule something else, if required. 
- */ -static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, - phys_addr_t end) -{ - u64 next; - - do { - next = stage2_range_addr_end(addr, end); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, - next - addr); - if (next != end) - cond_resched(); - } while (addr = next, addr != end); -} - -static void kvm_stage2_destroy(struct kvm_pgtable *pgt) -{ - unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - - stage2_destroy_range(pgt, 0, BIT(ia_bits)); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); -} - /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure @@ -1012,7 +980,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - kvm_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1106,10 +1074,14 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); } + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } @@ -1541,11 +1513,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_is_perm && !write_fault && !exec_fault) { - kvm_err("Unexpected L2 read permission error\n"); - return -EFAULT; - } - if (!is_protected_kvm_enabled()) memcache = &vcpu->arch.mmu_page_cache; else diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 77db81bae86f9b..50d559248a1f0f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - ipa_start = vt->wr.pa & (ipa_size - 1); + ipa_start = vt->wr.pa & ~(ipa_size - 1); ipa_end = ipa_start + ipa_size; if (ipa_end <= start || ipa_start >= end) @@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm, va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, vt->wr.level)); - va_start = vt->gva & (va_size - 1); + va_start = vt->gva & ~(va_size - 1); va_end = va_start + va_size; switch (scope->type) { @@ -1276,7 +1276,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) !(tcr & TCR_ASID16)) asid &= GENMASK(7, 0); - return asid != vt->wr.asid; + return asid == vt->wr.asid; } return true; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 61827cf6fea4aa..fcd70bfe44fb8c 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -316,16 +316,9 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e return 0; } -void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, - u64 addr, u64 size) +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { - __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); -} - -void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) -{ - /* Expected to be called after all pKVM mappings have been released. 
*/ - WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); + __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 2684f273d9e17a..4c1209261b65d4 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm) int nr_lpis = 0; xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) continue; xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1e680ad6e86359..4c3c0d82e47601 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); + xa_init(&dist->lpi_xa); } /* CREATION */ @@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); switch (dist->vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V2: irq->targets = 0; @@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 0); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 7368c13f16b729..ce3e3ed3f29f04 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; - unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. */ @@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); @@ -99,19 +98,19 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; - kref_init(&irq->refcount); + refcount_set(&irq->refcount, 1); irq->intid = intid; irq->target_vcpu = vcpu; irq->group = 1; - xa_lock_irqsave(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); /* * There could be a race with another vgic_add_lpi(), so we need to * check that we don't add a second list entry with the same LPI. */ oldirq = xa_load(&dist->lpi_xa, intid); - if (vgic_try_get_irq_kref(oldirq)) { + if (vgic_try_get_irq_ref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. 
*/ kfree(irq); irq = oldirq; @@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, } out_unlock: - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_unlock(&dist->lpi_xa); if (ret) return ERR_PTR(ret); @@ -547,7 +546,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, rcu_read_lock(); irq = xa_load(&its->translation_cache, cache_key); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -571,7 +570,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, * its_lock, as the ITE (and the reference it holds) cannot be freed. */ lockdep_assert_held(&its->its_lock); - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 4d9343d2b0b155..548aec9d5a7280 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) if (!irq->hw || irq->host_irq != host_irq) continue; - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) return NULL; return irq; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index f5148b38120ad7..6dd5a10081e27e 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_cpu->ap_list_lock must be taken with IRQs disabled - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock + * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, @@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) rcu_read_lock(); irq = xa_load(&dist->lpi_xa, intid); - if (!vgic_try_get_irq_kref(irq)) + if (!vgic_try_get_irq_ref(irq)) irq = NULL; rcu_read_unlock(); @@ -114,37 +114,66 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid) return vgic_get_irq(vcpu->kvm, intid); } -/* - * We can't do anything in here, because we lack the kvm pointer to - * lock and remove the item from the lpi_list. So we keep this function - * empty and use the return value of kref_put() to trigger the freeing. 
- */ -static void vgic_irq_release(struct kref *ref) +static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) +{ + lockdep_assert_held(&dist->lpi_xa.xa_lock); + __xa_erase(&dist->lpi_xa, irq->intid); + kfree_rcu(irq, rcu); +} + +static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return false; + + return refcount_dec_and_test(&irq->refcount); +} + +static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) { + if (!__vgic_put_irq(kvm, irq)) + return false; + + irq->pending_release = true; + return true; } void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long flags; - if (irq->intid < VGIC_MIN_LPI) - return; + if (irq->intid >= VGIC_MIN_LPI) + might_lock(&dist->lpi_xa.xa_lock); - if (!kref_put(&irq->refcount, vgic_irq_release)) + if (!__vgic_put_irq(kvm, irq)) return; - xa_lock_irqsave(&dist->lpi_xa, flags); - __xa_erase(&dist->lpi_xa, irq->intid); - xa_unlock_irqrestore(&dist->lpi_xa, flags); + xa_lock(&dist->lpi_xa); + vgic_release_lpi_locked(dist, irq); + xa_unlock(&dist->lpi_xa); +} - kfree_rcu(irq, rcu); +static void vgic_release_deleted_lpis(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid; + struct vgic_irq *irq; + + xa_lock(&dist->lpi_xa); + + xa_for_each(&dist->lpi_xa, intid, irq) { + if (irq->pending_release) + vgic_release_lpi_locked(dist, irq); + } + + xa_unlock(&dist->lpi_xa); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted = false; unsigned long flags; raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); @@ -155,11 +184,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); } } raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + + if (deleted) + vgic_release_deleted_lpis(vcpu->kvm); } void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) @@ -399,7 +431,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, * now in the ap_list. This is safe as the caller must already hold a * reference on the irq. */ - vgic_get_irq_kref(irq); + vgic_get_irq_ref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -630,6 +662,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + bool deleted_lpis = false; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); @@ -657,12 +690,12 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) /* * This vgic_put_irq call matches the - * vgic_get_irq_kref in vgic_queue_irq_unlock, + * vgic_get_irq_ref in vgic_queue_irq_unlock, * where we added the LPI to the ap_list. As * we remove the irq from the list, we drop * also drop the refcount. */ - vgic_put_irq(vcpu->kvm, irq); + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); continue; } @@ -725,6 +758,9 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) } raw_spin_unlock(&vgic_cpu->ap_list_lock); + + if (unlikely(deleted_lpis)) + vgic_release_deleted_lpis(vcpu->kvm); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) @@ -818,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) * the AP list has been sorted already. 
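The vgic rework above replaces struct kref with a bare refcount_t and splits the final put in two: vgic_put_irq_norelease() only marks the LPI with pending_release while IRQ-unsafe locking is not allowed, and vgic_release_deleted_lpis() later erases and frees the marked entries under the (now IRQ-unsafe) xa_lock. Lookups stay lock-free; a condensed sketch of the reader side, using the names introduced here (LPIs only, the intid < VGIC_MIN_LPI shortcut omitted):

static struct vgic_irq *lpi_get(struct vgic_dist *dist, u32 intid)
{
	struct vgic_irq *irq;

	rcu_read_lock();
	irq = xa_load(&dist->lpi_xa, intid);
	/* Take a reference only if the final put has not yet happened. */
	if (irq && !refcount_inc_not_zero(&irq->refcount))
		irq = NULL;
	rcu_read_unlock();

	return irq;
}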
*/ if (multi_sgi && irq->priority > prio) { - _raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); break; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index de1c1d3261c396..ac5f9c5d2b9800 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) { if (!irq) return false; @@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return true; - return kref_get_unless_zero(&irq->refcount); + return refcount_inc_not_zero(&irq->refcount); } -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline void vgic_get_irq_ref(struct vgic_irq *irq) { - WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ea84a61ed50848..524d34a0e92198 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -243,7 +243,7 @@ void __init arm64_memblock_init(void) */ if (memory_limit != PHYS_ADDR_MAX) { memblock_mem_limit_remove_map(memory_limit); - memblock_add(__pa_symbol(_text), (u64)(_end - _text)); + memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text)); } if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { @@ -252,8 +252,8 @@ void __init arm64_memblock_init(void) * initrd to become inaccessible via the linear mapping. * Otherwise, this is a no-op */ - u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; + phys_addr_t base = phys_initrd_start & PAGE_MASK; + resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up @@ -279,7 +279,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_stext), _end - _stext); + memblock_reserve(__pa_symbol(_text), _end - _text); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1838015207404d..3a444a5fe46965 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include @@ -47,6 +49,8 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -474,14 +478,18 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, int flags); #endif -static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, +#define INVALID_PHYS_ADDR (-1ULL) + +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. 
*/ - struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); phys_addr_t pa; - BUG_ON(!ptdesc); + if (!ptdesc) + return INVALID_PHYS_ADDR; + pa = page_to_phys(ptdesc_page(ptdesc)); switch (pgtable_type) { @@ -502,16 +510,392 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, return pa; } +static phys_addr_t +try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) +{ + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); +} + static phys_addr_t __maybe_unused pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(&init_mm, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; } static phys_addr_t pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(NULL, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; +} + +static void split_contpte(pte_t *ptep) +{ + int i; + + ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); + for (i = 0; i < CONT_PTES; i++, ptep++) + __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); +} + +static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) +{ + pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; + unsigned long pfn = pmd_pfn(pmd); + pgprot_t prot = pmd_pgprot(pmd); + phys_addr_t pte_phys; + pte_t *ptep; + int i; + + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + ptep = (pte_t *)phys_to_virt(pte_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PMD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) + __set_pte(ptep, pfn_pte(pfn, prot)); + + /* + * Ensure the pte entries are visible to the table walker by the time + * the pmd entry that points to the ptes is visible. + */ + dsb(ishst); + __pmd_populate(pmdp, pte_phys, tableprot); + + return 0; +} + +static void split_contpmd(pmd_t *pmdp) +{ + int i; + + pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); + for (i = 0; i < CONT_PMDS; i++, pmdp++) + set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); +} + +static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) +{ + pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; + unsigned int step = PMD_SIZE >> PAGE_SHIFT; + unsigned long pfn = pud_pfn(pud); + pgprot_t prot = pud_pgprot(pud); + phys_addr_t pmd_phys; + pmd_t *pmdp; + int i; + + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + pmdp = (pmd_t *)phys_to_virt(pmd_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PUD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) + set_pmd(pmdp, pfn_pmd(pfn, prot)); + + /* + * Ensure the pmd entries are visible to the table walker by the time + * the pud entry that points to the pmds is visible. 
+ */ + dsb(ishst); + __pud_populate(pudp, pmd_phys, tableprot); + + return 0; +} + +static int split_kernel_leaf_mapping_locked(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + int ret = 0; + + /* + * PGD: If addr is PGD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) + goto out; + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + goto out; + + /* + * P4D: If addr is P4D aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, P4D_SIZE) == addr) + goto out; + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + goto out; + + /* + * PUD: If addr is PUD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. Otherwise, + * if we have a pud leaf, split to contpmd. + */ + if (ALIGN_DOWN(addr, PUD_SIZE) == addr) + goto out; + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + goto out; + if (pud_leaf(pud)) { + ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); + if (ret) + goto out; + } + + /* + * CONTPMD: If addr is CONTPMD aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpmd leaf, split to pmd. + */ + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) + goto out; + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + goto out; + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + /* + * PMD: If addr is PMD aligned then addr already describes a + * leaf boundary. Otherwise, split to contpte. + */ + if (ALIGN_DOWN(addr, PMD_SIZE) == addr) + goto out; + ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); + if (ret) + goto out; + } + + /* + * CONTPTE: If addr is CONTPTE aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpte leaf, split to pte. + */ + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) + goto out; + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + goto out; + if (pte_cont(pte)) + split_contpte(ptep); + +out: + return ret; +} + +static DEFINE_MUTEX(pgtable_split_lock); + +int split_kernel_leaf_mapping(unsigned long start, unsigned long end) +{ + int ret; + + /* + * !BBML2_NOABORT systems should not be trying to change permissions on + * anything that is not pte-mapped in the first place. Just return early + * and let the permission change code raise a warning if not already + * pte-mapped. + */ + if (!system_supports_bbml2_noabort()) + return 0; + + /* + * Ensure start and end are at least page-aligned since this is the + * finest granularity we can split to. + */ + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) + return -EINVAL; + + mutex_lock(&pgtable_split_lock); + arch_enter_lazy_mmu_mode(); + + /* + * The split_kernel_leaf_mapping_locked() may sleep; this is not a + * problem for ARM64 since ARM64's lazy MMU implementation allows + * sleeping. + * + * Optimize for the common case of splitting out a single page from a + * larger mapping.
Here we can just split on the "least aligned" of + start and end and this will guarantee that there must also be a split + on the more aligned address since both addresses must be in the + same contpte block and it must have been split to ptes. + */ + if (end - start == PAGE_SIZE) { + start = __ffs(start) < __ffs(end) ? start : end; + ret = split_kernel_leaf_mapping_locked(start); + } else { + ret = split_kernel_leaf_mapping_locked(start); + if (!ret) + ret = split_kernel_leaf_mapping_locked(end); + } + + arch_leave_lazy_mmu_mode(); + mutex_unlock(&pgtable_split_lock); + return ret; +} + +static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pud_t pud = pudp_get(pudp); + int ret = 0; + + if (pud_leaf(pud)) + ret = split_pud(pudp, pud, GFP_ATOMIC, false); + + return ret; +} + +static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pmd_t pmd = pmdp_get(pmdp); + int ret = 0; + + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); + + /* + * We have split the pmd directly to ptes so there is no need to + * visit each pte to check if they are contpte. + */ + walk->action = ACTION_CONTINUE; + } + + return ret; +} + +static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = __ptep_get(ptep); + + if (pte_cont(pte)) + split_contpte(ptep); + + return 0; +} + +static const struct mm_walk_ops split_to_ptes_ops __initconst = { + .pud_entry = split_to_ptes_pud_entry, + .pmd_entry = split_to_ptes_pmd_entry, + .pte_entry = split_to_ptes_pte_entry, +}; + +static bool linear_map_requires_bbml2 __initdata; + +u32 idmap_kpti_bbml2_flag; + +void __init init_idmap_kpti_bbml2_flag(void) +{ + WRITE_ONCE(idmap_kpti_bbml2_flag, 1); + /* Must be visible to other CPUs before stop_machine() is called. */ + smp_mb(); +} + +static int __init linear_map_split_to_ptes(void *__unused) +{ + /* + * Repainting the linear map must be done by CPU0 (the boot CPU) because + * that's the only CPU that we know supports BBML2. The other CPUs will + * be held in a waiting area with the idmap active. + */ + if (!smp_processor_id()) { + unsigned long lstart = _PAGE_OFFSET(vabits_actual); + unsigned long lend = PAGE_END; + unsigned long kstart = (unsigned long)lm_alias(_stext); + unsigned long kend = (unsigned long)lm_alias(__init_begin); + int ret; + + /* + * Wait for all secondary CPUs to be put into the waiting area. + */ + smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); + + /* + * Walk all of the linear map [lstart, lend), except the kernel + * linear map alias [kstart, kend), and split all mappings to + * PTE. The kernel alias remains static throughout runtime so + * can continue to be safely mapped with large mappings. + */ + ret = walk_kernel_page_table_range_lockless(lstart, kstart, + &split_to_ptes_ops, NULL, NULL); + if (!ret) + ret = walk_kernel_page_table_range_lockless(kend, lend, + &split_to_ptes_ops, NULL, NULL); + if (ret) + panic("Failed to split linear map\n"); + flush_tlb_kernel_range(lstart, lend); + + /* + * Relies on dsb in flush_tlb_kernel_range() to avoid reordering + * before any page table split operations.
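Regarding the single-page fast path in split_kernel_leaf_mapping() above: __ffs() (index of the lowest set bit) is used as an alignment measure, since the boundary with the smaller __ffs() is the less aligned of the two. A hypothetical worked example with 4K pages:

/* Illustration only: splitting one 4K page out of a larger mapping. */
unsigned long start = 0x40201000UL;	/* __ffs() == 12: 4K-aligned only  */
unsigned long end   = 0x40202000UL;	/* __ffs() == 13: also 8K-aligned  */

/*
 * start is the less aligned boundary; once it has been split down to
 * ptes, end necessarily falls on a leaf boundary in the same (former)
 * contpte block, so one split covers both.
 */
unsigned long split_at = __ffs(start) < __ffs(end) ? start : end;	/* -> start */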
+ */ + WRITE_ONCE(idmap_kpti_bbml2_flag, 0); + } else { + typedef void (wait_split_fn)(void); + extern wait_split_fn wait_linear_map_split_to_ptes; + wait_split_fn *wait_fn; + + wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); + + /* + * At least one secondary CPU doesn't support BBML2 so cannot + * tolerate the size of the live mappings changing. So have the + * secondary CPUs wait for the boot CPU to make the changes + * with the idmap active and init_mm inactive. + */ + cpu_install_idmap(); + wait_fn(); + cpu_uninstall_idmap(); + } + + return 0; +} + +void __init linear_map_maybe_split_to_ptes(void) +{ + if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { + init_idmap_kpti_bbml2_flag(); + stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); + } } /* @@ -574,8 +958,8 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), - (unsigned long)__init_begin - (unsigned long)_stext, + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, PAGE_KERNEL_RO); } @@ -633,10 +1017,20 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { #endif /* CONFIG_KFENCE */ +static inline bool force_pte_mapping(void) +{ + bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || + is_realm_world())) || + debug_pagealloc_enabled(); +} + static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); - phys_addr_t kernel_start = __pa_symbol(_stext); + phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; phys_addr_t early_kfence_pool; @@ -658,7 +1052,9 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); - if (can_set_direct_map()) + linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); + + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -683,7 +1079,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_stext, __init_begin) interval + * Map the linear alias of the [_text, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -710,6 +1106,10 @@ void mark_rodata_ro(void) WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + /* mark the range between _text and _stext as read only. 
*/ + update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, + (unsigned long)_stext - (unsigned long)_text, + PAGE_KERNEL_RO); } static void __init declare_vma(struct vm_struct *vma, @@ -780,38 +1180,41 @@ static void __init declare_kernel_vmas(void) { static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; - declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); + declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD); declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[4], _data, _end, 0); } -void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, - kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; + kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { - u64 start = __pa_symbol(__idmap_text_start); - u64 end = __pa_symbol(__idmap_text_end); - u64 ptep = __pa_symbol(idmap_ptes); + phys_addr_t start = __pa_symbol(__idmap_text_start); + phys_addr_t end = __pa_symbol(__idmap_text_end); + phys_addr_t ptep = __pa_symbol(idmap_ptes); __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { - extern u32 __idmap_kpti_flag; - u64 pa = __pa_symbol(&__idmap_kpti_flag); + if (linear_map_requires_bbml2 || + (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { + phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping - * of its synchronization flag in the ID map. + * of its synchronization flag in the ID map. This is also used + * when splitting the linear map to ptes if a secondary CPU + * doesn't support bbml2. 
*/ - ptep = __pa_symbol(kpti_ptes); + ptep = __pa_symbol(kpti_bbml2_ptes); __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); @@ -1261,7 +1664,8 @@ int pmd_clear_huge(pmd_t *pmdp) return 1; } -int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, + bool acquire_mmap_lock) { pte_t *table; pmd_t pmd; @@ -1273,13 +1677,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) return 1; } + /* See comment in pud_free_pmd_page for static key logic */ table = pte_offset_kernel(pmdp, addr); pmd_clear(pmdp); __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pte_free_kernel(NULL, table); return 1; } +int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +{ + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); +} + int pud_free_pmd_page(pud_t *pudp, unsigned long addr) { pmd_t *table; @@ -1295,16 +1711,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) } table = pmd_offset(pudp, addr); + + /* + * Our objective is to prevent ptdump from reading a PMD table which has + * been freed. In this race, if pud_free_pmd_page observes the key on + * (which got flipped by ptdump) then the mmap lock sequence here will, + * as a result of the mmap write lock/unlock sequence in ptdump, give + * us the correct synchronization. If not, this means that ptdump has + * not yet started walking the pagetables - the sequence of barriers + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will + * observe an empty PUD. + */ + pud_clear(pudp); + __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pmdp = table; next = addr; end = addr + PUD_SIZE; do { if (pmd_present(pmdp_get(pmdp))) + /* + * PMD has been isolated, so ptdump won't see it. No + * need to acquire init_mm.mmap_lock.
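The two halves of that handshake are easiest to read side by side. A condensed restatement (the write lock is taken inside the generic ptdump_walk_pgd(); all names as in this patch):

/* Walker side, see arm64_ptdump_walk_pgd() below: */
static_branch_inc(&arm64_ptdump_lock_key);
ptdump_walk_pgd(st, mm, NULL);		/* holds init_mm's mmap write lock */
static_branch_dec(&arm64_ptdump_lock_key);

/* Teardown side, see pud_free_pmd_page() above: */
pud_clear(pudp);
__flush_tlb_kernel_pgtable(addr);	/* barriers publish the empty PUD */
if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
	mmap_read_lock(&init_mm);	/* waits out any in-flight walk... */
	mmap_read_unlock(&init_mm);	/* ...then drops the lock straight away */
}
pmd_free(NULL, table);			/* no walker can still observe it */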
+ */ + __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); } while (pmdp++, next += PMD_SIZE, next != end); - pud_clear(pudp); - __flush_tlb_kernel_pgtable(addr); pmd_free(NULL, table); return 1; } @@ -1324,8 +1760,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; - u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); - u64 end_linear_pa = __pa(PAGE_END - 1); + phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + phys_addr_t end_linear_pa = __pa(PAGE_END - 1); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { /* @@ -1360,7 +1796,7 @@ int arch_add_memory(int nid, u64 start, u64 size, VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 04d4a8f676db42..5135f2d66958d9 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,7 +21,66 @@ struct page_change_data { pgprot_t clear_mask; }; -bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + +bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) { @@ -37,32 +97,39 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); + struct page_change_data data; + int ret; - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); + data.set_mask = set_mask; + data.clear_mask = clear_mask; - __set_pte(ptep, pte); - return 0; + ret = split_kernel_leaf_mapping(start, start + size); + if (WARN_ON_ONCE(ret)) + return ret; + + arch_enter_lazy_mmu_mode(); + + /* + * The caller must ensure that the range we are operating on 
does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. + */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + arch_leave_lazy_mmu_mode(); + + return ret; } -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. - */ static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data data; int ret; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + ret = update_range_prot(start, size, set_mask, clear_mask); /* * If the memory is being made valid without changing any other bits @@ -174,32 +241,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_VALID); + pgprot_t set_mask = __pgprot(0); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 8c75965afc9e59..86818511962b61 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -245,10 +245,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. 
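Stepping back to the pageattr.c conversion above: every permission change is now "split, then walk". A minimal usage sketch; update_range_prot() and the mask style are from this patch, while make_page_ro() is a hypothetical wrapper mirroring what set_memory_ro() feeds into __change_memory_common():

static int make_page_ro(unsigned long addr)
{
	pgprot_t set_mask   = __pgprot(PTE_RDONLY);
	pgprot_t clear_mask = __pgprot(PTE_WRITE);

	/* split_kernel_leaf_mapping() first, then the lockless walk */
	return update_range_prot(addr, PAGE_SIZE, set_mask, clear_mask);
}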
*/ - .pushsection ".data", "aw", %progbits -SYM_DATA(__idmap_kpti_flag, .long 1) - .popsection - SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 temp_pte .req x0 @@ -273,7 +269,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 - adr_l flag_ptr, __idmap_kpti_flag + adr_l flag_ptr, idmap_kpti_bbml2_flag cbnz cpu, __idmap_kpti_secondary @@ -416,7 +412,25 @@ alternative_else_nop_endif __idmap_kpti_secondary: /* Uninstall swapper before surgery begins */ __idmap_cpu_set_reserved_ttbr1 x16, x17 + b secondary_cpu_wait + + .unreq swapper_ttb + .unreq flag_ptr +SYM_FUNC_END(idmap_kpti_install_ng_mappings) + .popsection +#endif + + .pushsection ".idmap.text", "a" +SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes) + /* Must be same registers as in idmap_kpti_install_ng_mappings */ + swapper_ttb .req x3 + flag_ptr .req x4 + + mrs swapper_ttb, ttbr1_el1 + adr_l flag_ptr, idmap_kpti_bbml2_flag + __idmap_cpu_set_reserved_ttbr1 x16, x17 +secondary_cpu_wait: /* Increment the flag to let the boot CPU know we're ready */ 1: ldxr w16, [flag_ptr] add w16, w16, #1 @@ -436,9 +450,8 @@ __idmap_kpti_secondary: .unreq swapper_ttb .unreq flag_ptr -SYM_FUNC_END(idmap_kpti_install_ng_mappings) +SYM_FUNC_END(wait_linear_map_split_to_ptes) .popsection -#endif /* * __cpu_setup diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 421a5de806c62d..ab9899ca1e5f21 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st) note_page(pt_st, 0, -1, pte_val(pte_zero)); } +static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) +{ + static_branch_inc(&arm64_ptdump_lock_key); + ptdump_walk_pgd(st, mm, NULL); + static_branch_dec(&arm64_ptdump_lock_key); +} + void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; @@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) } }; - ptdump_walk_pgd(&st.ptdump, info->mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, info->mm); } static void __init ptdump_initialize(void) @@ -353,7 +360,7 @@ bool ptdump_check_wx(void) } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, &init_mm); if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", diff --git a/arch/arm64/net/Makefile b/arch/arm64/net/Makefile index 5c540efb7d9b9a..3ae382bfca8797 100644 --- a/arch/arm64/net/Makefile +++ b/arch/arm64/net/Makefile @@ -2,4 +2,4 @@ # # ARM64 networking code # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c47c..ab83089c3d8fe0 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -185,7 +185,7 @@ static inline void emit_bti(u32 insn, struct jit_ctx *ctx) static inline void emit_kcfi(u32 hash, struct jit_ctx *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit_u32_data(hash, ctx); } @@ -1066,19 +1066,53 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } -#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) +/* + * Metadata encoding for exception handling in JITed code.
+ * + * Format of `fixup` field in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+-----------+----------+ + * | 31-27 | 26-22 | 21 | 20-16 | 15-0 | + * | | | | | | + * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG | OFFSET | + * +-----------+--------+-----------+-----------+----------+ + * + * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction. + * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when + * accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to + * DONT_CLEAR if it is a store instruction. + */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { - off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr, regs->pc); + } if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; - regs->pc = (unsigned long)&ex->fixup - offset; + /* Skip the faulting instruction */ + regs->pc += AARCH64_INSN_SIZE; + return true; } @@ -1088,7 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; - off_t fixup_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1097,11 +1133,16 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32 && - BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_MEM32SX && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1119,22 +1160,6 @@ static int add_exception_handler(const struct bpf_insn *insn, if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN)) return -ERANGE; - /* - * Since the extable follows the program, the fixup offset is always - * negative and limited to BPF_JIT_REGION_SIZE. Store a positive value - * to keep things simple, and put the destination register in the upper - * bits. We don't need to worry about buildtime or runtime sort - * modifying the upper bits because the table is already sorted, and - * isn't part of the main exception table. - * - * The fixup_offset is set to the next instruction from the instruction - * that may fault. The execution will jump to this after handling the - * fault. 
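Packing and unpacking the fixup word uses the standard <linux/bitfield.h> helpers. A condensed sketch of both directions for the arena case, mirroring add_exception_handler() and ex_handler_bpf() above:

/* JIT time (add_exception_handler()): encode the metadata. */
u32 fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg) |
	    BPF_ARENA_ACCESS |
	    FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg) |
	    FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off);

/* Fault time (ex_handler_bpf()): recover the pieces. */
int reg    = FIELD_GET(BPF_FIXUP_REG_MASK, fixup);
s16 offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, fixup);	/* s16 restores the sign */
bool arena = !!(fixup & BPF_ARENA_ACCESS);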
- */ - fixup_offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) - return -ERANGE; - /* * The offsets above have been calculated using the RO buffer but we * need to use the R/W buffer for writes. @@ -1147,8 +1172,26 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_CLASS(insn->code) != BPF_LDX) dst_reg = DONT_CLEAR; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | - FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + /* + * insn->src_reg/dst_reg holds the address in the arena region with upper 32-bits + * being zero because of a preceding addr_space_cast(r, 0x0, 0x1) instruction. + * This address is adjusted with the addition of arena_vm_start (see the + * implementation of BPF_PROBE_MEM32 and BPF_PROBE_ATOMIC) before being used for the + * memory access. Pass the reg holding the unmodified 32-bit address to + * ex_handler_bpf. + */ + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } ex->type = EX_TYPE_BPF; @@ -1558,7 +1601,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ret < 0) return ret; emit_call(func_addr, ctx); - emit(A64_MOV(1, r0, A64_R(0)), ctx); + /* + * Call to arch_bpf_timed_may_goto() is emitted by the + * verifier and called with custom calling convention with + * first argument and return value in BPF_REG_AX (x9). + */ + if (func_addr != (u64)arch_bpf_timed_may_goto) + emit(A64_MOV(1, r0, A64_R(0)), ctx); break; } /* tail call */ @@ -1612,7 +1661,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: - if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) { emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); src = tmp2; } @@ -1624,7 +1677,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, off_adj = off; } sign_extend = (BPF_MODE(insn->code) == BPF_MEMSX || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX); + BPF_MODE(insn->code) == BPF_PROBE_MEMSX || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX); switch (BPF_SIZE(code)) { case BPF_W: if (is_lsi_offset(off_adj, 2)) { @@ -1832,9 +1886,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ret) return ret; - ret = add_exception_handler(insn, ctx, dst); - if (ret) - return ret; + if (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; + } break; default: @@ -2767,7 +2823,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, goto out; } - bpf_flush_icache(ro_image, ro_image + size); out: kvfree(image); return ret; @@ -3038,6 +3093,11 @@ bool bpf_jit_bypass_spec_v4(void) return true; } +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} + bool bpf_jit_inlines_helper_call(s32 imm) { switch (imm) { @@ -3064,8 +3124,7 @@ void bpf_jit_free(struct bpf_prog *prog) * before freeing it. 
*/ if (jit_data) { - bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size, - sizeof(jit_data->header->size)); + bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header); kfree(jit_data); } prog->bpf_func -= cfi_get_offset(); diff --git a/arch/arm64/net/bpf_timed_may_goto.S b/arch/arm64/net/bpf_timed_may_goto.S new file mode 100644 index 00000000000000..894cfcd7b2416d --- /dev/null +++ b/arch/arm64/net/bpf_timed_may_goto.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Puranjay Mohan */ + +#include + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* Allocate stack space and emit frame record */ + stp x29, x30, [sp, #-64]! + mov x29, sp + + /* Save BPF registers R0 - R5 (x7, x0-x4)*/ + stp x7, x0, [sp, #16] + stp x1, x2, [sp, #32] + stp x3, x4, [sp, #48] + + /* + * Stack depth was passed in BPF_REG_AX (x9), add it to the BPF_FP + * (x25) to get the pointer to count and timestamp and pass it as the + * first argument in x0. + * + * Before generating the call to arch_bpf_timed_may_goto, the verifier + * generates a load instruction using FP, i.e. REG_AX = *(u64 *)(FP - + * stack_off_cnt), so BPF_REG_FP (x25) is always set up by the arm64 + * jit in this case. + */ + add x0, x9, x25 + bl bpf_check_timed_may_goto + /* BPF_REG_AX(x9) will be stored into count, so move return value to it. */ + mov x9, x0 + + /* Restore BPF registers R0 - R5 (x7, x0-x4) */ + ldp x7, x0, [sp, #16] + ldp x1, x2, [sp, #32] + ldp x3, x4, [sp, #48] + + /* Restore FP and LR */ + ldp x29, x30, [sp], #64 + + ret +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index f2a1732cb1f638..bbbb812603e8f0 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -122,6 +122,10 @@ $1 == "SysregFields" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_fields) + fatal("Duplicate SysregFields definition for " reg) + defined_fields[reg] = 1 + next_bit = 63 next @@ -162,6 +166,10 @@ $1 == "Sysreg" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_regs) + fatal("Duplicate Sysreg definition for " reg) + defined_regs[reg] = 1 + define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") @@ -284,6 +292,8 @@ $1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysreg define_field(reg, field, msb, lsb) define_field_sign(reg, field, "true") + delete seen_enum_vals + next } @@ -297,6 +307,8 @@ $1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysr define_field(reg, field, msb, lsb) define_field_sign(reg, field, "false") + delete seen_enum_vals + next } @@ -309,6 +321,8 @@ $1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields define_field(reg, field, msb, lsb) + delete seen_enum_vals + next } @@ -320,6 +334,8 @@ $1 == "EndEnum" && block_current() == "Enum" { lsb = null print "" + delete seen_enum_vals + block_pop() next } @@ -329,6 +345,10 @@ $1 == "EndEnum" && block_current() == "Enum" { val = $1 name = $2 + if (val in seen_enum_vals) + fatal("Duplicate Enum value " val " for " name) + seen_enum_vals[val] = 1 + define(reg "_" field "_" name, "UL(" val ")") next } diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 696ab1f32a6749..1c6cdf9d54bba3 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -31,7 +31,7 @@ # Mapping # EndSysreg -# Where 
multiple system regsiters are not VHE aliases but share a +# Where multiple system registers are not VHE aliases but share a # common layout, a SysregFields block can be used to describe the # shared layout: @@ -54,7 +54,7 @@ # # In general it is recommended that new enumeration items be named for the # feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration -# item ACCDATA) though it may be more taseful to do something else. +# item ACCDATA) though it may be more tasteful to do something else. Sysreg OSDTRRX_EL1 2 0 0 0 2 Res0 63:32 @@ -474,7 +474,7 @@ EndEnum Enum 7:4 Security 0b0000 NI 0b0001 EL3 - 0b0001 NSACR_RFR + 0b0010 NSACR_RFR EndEnum UnsignedEnum 3:0 ProgMod 0b0000 NI @@ -1693,7 +1693,7 @@ UnsignedEnum 43:40 TraceFilt 0b0000 NI 0b0001 IMP EndEnum -UnsignedEnum 39:36 DoubleLock +SignedEnum 39:36 DoubleLock 0b0000 IMP 0b1111 NI EndEnum @@ -2409,7 +2409,7 @@ UnsignedEnum 11:8 ASID2 0b0000 NI 0b0001 IMP EndEnum -SignedEnum 7:4 EIESB +UnsignedEnum 7:4 EIESB 0b0000 NI 0b0001 ToEL3 0b0010 ToELx @@ -2528,10 +2528,6 @@ Field 17:16 ZEN Res0 15:0 EndSysreg -Sysreg CPACR_EL12 3 5 1 0 2 -Mapping CPACR_EL1 -EndSysreg - Sysreg CPACRALIAS_EL1 3 0 1 4 4 Mapping CPACR_EL1 EndSysreg @@ -2576,10 +2572,6 @@ Sysreg PFAR_EL12 3 5 6 0 5 Mapping PFAR_EL1 EndSysreg -Sysreg RCWSMASK_EL1 3 0 13 0 3 -Field 63:0 RCWSMASK -EndSysreg - Sysreg SCTLR2_EL1 3 0 1 0 3 Res0 63:13 Field 12 CPTM0 @@ -2994,11 +2986,20 @@ Field 0 RND EndSysreg Sysreg PMSFCR_EL1 3 0 9 9 4 -Res0 63:19 +Res0 63:53 +Field 52 SIMDm +Field 51 FPm +Field 50 STm +Field 49 LDm +Field 48 Bm +Res0 47:21 +Field 20 SIMD +Field 19 FP Field 18 ST Field 17 LD Field 16 B -Res0 15:4 +Res0 15:5 +Field 4 FDS Field 3 FnE Field 2 FL Field 1 FT @@ -4756,17 +4757,53 @@ Field 37 TBI0 Field 36 AS Res0 35 Field 34:32 IPS -Field 31:30 TG1 -Field 29:28 SH1 -Field 27:26 ORGN1 -Field 25:24 IRGN1 +Enum 31:30 TG1 + 0b01 16K + 0b10 4K + 0b11 64K +EndEnum +Enum 29:28 SH1 + 0b00 NONE + 0b10 OUTER + 0b11 INNER +EndEnum +Enum 27:26 ORGN1 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Enum 25:24 IRGN1 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum Field 23 EPD1 Field 22 A1 Field 21:16 T1SZ -Field 15:14 TG0 -Field 13:12 SH0 -Field 11:10 ORGN0 -Field 9:8 IRGN0 +Enum 15:14 TG0 + 0b00 4K + 0b01 64K + 0b10 16K +EndEnum +Enum 13:12 SH0 + 0b00 NONE + 0b10 OUTER + 0b11 INNER +EndEnum +Enum 11:10 ORGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Enum 9:8 IRGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum Field 7 EPD0 Res0 6 Field 5:0 T0SZ diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 72e1b2aa29a07d..80d67eee6e860b 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -9,7 +9,7 @@ /* * asm-generic/bitops/ffs.h */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { if (!x) return 0; @@ -26,7 +26,7 @@ static inline int ffs(int x) /* * asm-generic/bitops/__ffs.h */ -static __always_inline unsigned long __ffs(unsigned long x) +static __always_inline __attribute_const__ unsigned long __ffs(unsigned long x) { asm volatile ( "brev %0\n" @@ -39,7 +39,7 @@ static __always_inline unsigned long __ffs(unsigned long x) /* * asm-generic/bitops/fls.h */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { asm volatile( "ff1 %0\n" @@ -52,7 +52,7 @@ static __always_inline int fls(unsigned int x) /* * asm-generic/bitops/__fls.h */ -static __always_inline unsigned long __fls(unsigned long x) 
+static __always_inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls(x) - 1; } diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c index d1e9035794733d..5525c8e7e1d9ea 100644 --- a/arch/csky/kernel/asm-offsets.c +++ b/arch/csky/kernel/asm-offsets.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. +#define COMPILE_OFFSETS #include #include diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 0c6e4b17fe00fd..a7a90340042a5f 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -32,7 +32,7 @@ void flush_thread(void){} int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct switch_stack *childstack; diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 160d8f37fa1a34..b23cb13833af1c 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -200,7 +200,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline long ffz(int x) +static inline long __attribute_const__ ffz(int x) { int r; @@ -217,7 +217,7 @@ static inline long ffz(int x) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int r; @@ -238,7 +238,7 @@ static inline int fls(unsigned int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int r; @@ -260,7 +260,7 @@ static inline int ffs(int x) * bits_per_long assumed to be 32 * numbering starts at 0 I think (instead of 1 like ffs) */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { int num; @@ -278,7 +278,7 @@ static inline unsigned long __ffs(unsigned long word) * Undefined if no set bit exists, so code should check against 0 first. * bits_per_long assumed to be 32 */ -static inline unsigned long __fls(unsigned long word) +static inline __attribute_const__ unsigned long __fls(unsigned long word) { int num; diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 03a7063f945614..50eea9fa6f1375 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. 
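 *
 * [Editor's aside on the csky/hexagon bitops hunks above -- illustrative,
 * not part of the patch] __attribute_const__ expands to
 * __attribute__((__const__)): it promises the result depends only on the
 * arguments, so the compiler may fold repeated calls, e.g.:
 *
 *      int hi = fls(mask);
 *      int again = fls(mask);  // may be CSE'd into 'hi'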
*/ +#define COMPILE_OFFSETS #include #include diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 2a77bfd7569450..15b4992bfa298a 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -52,7 +52,7 @@ void arch_cpu_idle(void) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac9e..e6225539579f00 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -70,6 +70,8 @@ config LOONGARCH select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -108,8 +110,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL @@ -140,6 +140,7 @@ config LOONGARCH select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_ARG_ACCESS_API @@ -298,6 +299,10 @@ config AS_HAS_LVZ_EXTENSION config CC_HAS_ANNOTATE_TABLEJUMP def_bool $(cc-option,-mannotate-tablejump) +config RUSTC_HAS_ANNOTATE_TABLEJUMP + depends on RUST + def_bool $(rustc-option,-Cllvm-args=--loongarch-annotate-tablejump) + menu "Kernel type and options" source "kernel/Kconfig.hz" @@ -448,23 +453,6 @@ config EFI_STUB This kernel feature allows the kernel to be loaded directly by EFI firmware without the use of a bootloader. -config SCHED_SMT - bool "SMT scheduler support" - depends on SMP - default y - help - Improves scheduler's performance when there are multiple - threads in one physical core. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. - config SMP bool "Multi-Processing support" help @@ -563,10 +551,14 @@ config ARCH_STRICT_ALIGN -mstrict-align build parameter to prevent unaligned accesses. CPUs with h/w unaligned access support: - Loongson-2K2000/2K3000/3A5000/3C5000/3D5000. + Loongson-2K2000/2K3000 and all Loongson-3 series processors + based on LoongArch. CPUs without h/w unaligned access support: - Loongson-2K500/2K1000. + Loongson-2K0300/2K0500/2K1000. + + If you want to check whether your hardware supports unaligned + memory access, read bit 20 (UAL) of the CPUCFG1 register. This option is enabled by default to make the kernel be able to run on all LoongArch systems.
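          (Editor's sketch of the UAL probe described above -- illustrative
          only; read_cpucfg(), LOONGARCH_CPUCFG1 and CPUCFG1_UAL (bit 20)
          are the helpers and constants from <asm/loongarch.h>:)

                static bool hw_has_unaligned_access(void)
                {
                        /* CPUCFG1.UAL (bit 20) advertises h/w unaligned access */
                        return read_cpucfg(LOONGARCH_CPUCFG1) & CPUCFG1_UAL;
                }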
But you can disable it manually if you want. diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index a3a9759414f40f..ae419e32f22e2f 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -102,16 +102,21 @@ KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma) ifdef CONFIG_OBJTOOL ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP +KBUILD_CFLAGS += -mannotate-tablejump +else +KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers +endif +ifdef CONFIG_RUSTC_HAS_ANNOTATE_TABLEJUMP +KBUILD_RUSTFLAGS += -Cllvm-args=--loongarch-annotate-tablejump +else +KBUILD_RUSTFLAGS += -Zno-jump-tables # keep compatibility with older compilers +endif +ifdef CONFIG_LTO_CLANG # The annotate-tablejump option cannot be passed to the LLVM backend when LTO is enabled. # To make the LTO linker aware of it, '--loongarch-annotate-tablejump' also needs to # be passed via '-mllvm' to ld.lld. -KBUILD_CFLAGS += -mannotate-tablejump -ifdef CONFIG_LTO_CLANG KBUILD_LDFLAGS += -mllvm --loongarch-annotate-tablejump endif -else -KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers -endif endif KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small diff --git a/arch/loongarch/include/asm/acenv.h b/arch/loongarch/include/asm/acenv.h index 52f298f7293bab..483c955f2ae50d 100644 --- a/arch/loongarch/include/asm/acenv.h +++ b/arch/loongarch/include/asm/acenv.h @@ -10,9 +10,8 @@ #ifndef _ASM_LOONGARCH_ACENV_H #define _ASM_LOONGARCH_ACENV_H -/* - * This header is required by ACPI core, but we have nothing to fill in - * right now. Will be updated later when needed. - */ +#ifdef CONFIG_ARCH_STRICT_ALIGN +#define ACPI_MISALIGNMENT_NOT_SUPPORTED +#endif /* CONFIG_ARCH_STRICT_ALIGN */ #endif /* _ASM_LOONGARCH_ACENV_H */ diff --git a/arch/loongarch/include/asm/kvm_mmu.h b/arch/loongarch/include/asm/kvm_mmu.h index 099bafc6f797c9..e36cc7e8ed200a 100644 --- a/arch/loongarch/include/asm/kvm_mmu.h +++ b/arch/loongarch/include/asm/kvm_mmu.h @@ -16,6 +16,13 @@ */ #define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1) +/* + * _PAGE_MODIFIED is a SW pte bit: on the host kernel it records that the + * page has ever been written, while on the secondary MMU it records the + * page writeable attribute, for fast path handling.
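+ *
+ * [Editor's sketch, not part of the patch -- this mirrors the
+ * kvm_map_page_fast() hunk in arch/loongarch/kvm/mmu.c further down]
+ * With these bits a write fault on a clean pte can be handled in the
+ * fast path iff the pte is writeable:
+ *
+ *      if (write && !kvm_pte_dirty(new)) {
+ *              if (!kvm_pte_writeable(new))
+ *                      return -EFAULT;         // fall back to slow path
+ *              new = kvm_pte_mkdirty(new);     // sets __WRITEABLE
+ *      }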
+ */ +#define KVM_PAGE_WRITEABLE _PAGE_MODIFIED + #define _KVM_FLUSH_PGTABLE 0x1 #define _KVM_HAS_PGMASK 0x2 #define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) @@ -52,10 +59,10 @@ static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val) WRITE_ONCE(*ptep, val); } -static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; } -static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; } static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; } static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; } +static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & __WRITEABLE; } +static inline int kvm_pte_writeable(kvm_pte_t pte) { return pte & KVM_PAGE_WRITEABLE; } static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte) { @@ -69,12 +76,12 @@ static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte) static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte) { - return pte | _PAGE_DIRTY; + return pte | __WRITEABLE; } static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte) { - return pte & ~_PAGE_DIRTY; + return pte & ~__WRITEABLE; } static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte) @@ -87,6 +94,11 @@ static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte) return pte & ~_PAGE_HUGE; } +static inline kvm_pte_t kvm_pte_mkwriteable(kvm_pte_t pte) +{ + return pte | KVM_PAGE_WRITEABLE; +} + static inline int kvm_need_flush(kvm_ptw_ctx *ctx) { return ctx->flag & _KVM_FLUSH_PGTABLE; diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 9dfa2ef0081670..4d7117fcdc78c0 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -65,50 +65,42 @@ register unsigned long current_stack_pointer __asm__("$sp"); * access * - pending work-to-be-done flags are in LSW * - other flags in MSW + * + * Tell the generic TIF infrastructure which special bits loongarch supports */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ -#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ -#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ -#define TIF_NOHZ 6 /* in adaptive nohz mode */ -#define TIF_UPROBE 7 /* breakpointed or singlestepping */ -#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */ -#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */ -#define TIF_MEMDIE 10 /* is terminating due to OOM killer */ -#define TIF_FIXADE 11 /* Fix address errors in software */ -#define TIF_LOGADE 12 /* Log address errors to syslog */ -#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */ -#define TIF_32BIT_ADDR 14 /* 32-bit address space */ -#define TIF_LOAD_WATCH 15 /* If set, load watch registers */ -#define TIF_SINGLESTEP 16 /* Single Step */ -#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */ -#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ -#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ -#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ -#define TIF_PATCH_PENDING 21 /* pending live patching update */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include + +/* Architecture specific bits */ +#define TIF_NOHZ 16 /* in adaptive nohz mode */ +#define TIF_USEDFPU 17 /* FPU was used by this task this 
quantum (SMP) */ +#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */ +#define TIF_FIXADE 19 /* Fix address errors in software */ +#define TIF_LOGADE 20 /* Log address errors to syslog */ +#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */ +#define TIF_32BIT_ADDR 22 /* 32-bit address space */ +#define TIF_LOAD_WATCH 23 /* If set, load watch registers */ +#define TIF_SINGLESTEP 24 /* Single Step */ +#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */ +#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */ +#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */ +#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */ -#define _TIF_NEED_RESCHED (1< #include #include diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c index c0a5dc9aeae287..23bd5ae2212c26 100644 --- a/arch/loongarch/kernel/env.c +++ b/arch/loongarch/kernel/env.c @@ -86,7 +86,7 @@ late_initcall(fdt_cpu_clk_init); static ssize_t boardinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, + return sysfs_emit(buf, "BIOS Information\n" "Vendor\t\t\t: %s\n" "Version\t\t\t: %s\n" @@ -109,6 +109,8 @@ static int __init boardinfo_init(void) struct kobject *loongson_kobj; loongson_kobj = kobject_create_and_add("loongson", firmware_kobj); + if (!loongson_kobj) + return -ENOMEM; return sysfs_create_file(loongson_kobj, &boardinfo_attr.attr); } diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 3582f591bab286..efd9edf65603cc 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -167,7 +167,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long childksp; unsigned long tls = args->tls; unsigned long usp = args->stack; - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; struct pt_regs *childregs, *regs = current_pt_regs(); childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE; diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index 9a038d1070d73b..387dc4d3c4868f 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -51,12 +51,13 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (task == current) { regs->regs[3] = (unsigned long)__builtin_frame_address(0); regs->csr_era = (unsigned long)__builtin_return_address(0); + regs->regs[22] = 0; } else { regs->regs[3] = thread_saved_fp(task); regs->csr_era = thread_saved_ra(task); + regs->regs[22] = task->thread.reg22; } regs->regs[1] = 0; - regs->regs[22] = 0; for (unwind_start(&state, task, regs); !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index f3092f2de8b501..6fb92cc1a4c92a 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -112,8 +112,6 @@ static int arch_timer_starting(unsigned int cpu) static int arch_timer_dying(unsigned int cpu) { - constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); - /* Clear Timer Interrupt */ write_csr_tintclear(CSR_TINTCLR_TI); diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 7b888d9085a014..dee1a15d7f4c77 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -54,6 +54,9 @@ static int __init init_vdso(void) vdso_info.code_mapping.pages = kcalloc(vdso_info.size / PAGE_SIZE, sizeof(struct page *), GFP_KERNEL); 
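[Editor's note ahead of the LoongArch KVM eiointc/pch_pic hunks below -- a minimal sketch, not part of the patch. Those hunks move copy_from_user()/copy_to_user() out from under s->lock: user copies can fault and sleep, which is forbidden while a spinlock is held (here even with IRQs off). The resulting shape is:]

        unsigned long flags;
        int val;

        if (copy_from_user(&val, data, sizeof(val)))    /* may fault/sleep */
                return -EFAULT;                         /* so copy first */

        spin_lock_irqsave(&s->lock, flags);
        s->num_cpu = val;       /* only plain memory ops under the lock */
        spin_unlock_irqrestore(&s->lock, flags);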
+ if (!vdso_info.code_mapping.pages) + return -ENOMEM; + pfn = __phys_to_pfn(__pa_symbol(vdso_info.vdso)); for (i = 0; i < vdso_info.size / PAGE_SIZE; i++) vdso_info.code_mapping.pages[i] = pfn_to_page(pfn + i); diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 2ce41f93b2a444..6c9c7de7226b63 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -778,10 +778,8 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) return 0; default: return KVM_HCALL_INVALID_CODE; - }; - - return KVM_HCALL_INVALID_CODE; -}; + } +} /* * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root. diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 026b139dcff2de..c3233369538184 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -426,21 +426,26 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, struct loongarch_eiointc *s = dev->kvm->arch.eiointc; data = (void __user *)attr->addr; - spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: if (copy_from_user(&val, data, 4)) - ret = -EFAULT; - else { - if (val >= EIOINTC_ROUTE_MAX_VCPUS) - ret = -EINVAL; - else - s->num_cpu = val; - } + return -EFAULT; + break; + default: + break; + } + + spin_lock_irqsave(&s->lock, flags); + switch (type) { + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + if (val >= EIOINTC_ROUTE_MAX_VCPUS) + ret = -EINVAL; + else + s->num_cpu = val; break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: - if (copy_from_user(&s->features, data, 4)) - ret = -EFAULT; + s->features = val; if (!(s->features & BIT(EIOINTC_HAS_VIRT_EXTENSION))) s->status |= BIT(EIOINTC_ENABLE); break; @@ -462,19 +467,17 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, static int kvm_eiointc_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, cpu, offset, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; cpu = addr >> 16; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case EIOINTC_NODETYPE_START ... 
EIOINTC_NODETYPE_END: offset = (addr - EIOINTC_NODETYPE_START) / 4; @@ -513,13 +516,10 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -527,19 +527,17 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, static int kvm_eiointc_sw_status_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU: if (is_write) @@ -561,13 +559,10 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, return -EINVAL; } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -576,11 +571,27 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, static int kvm_eiointc_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int ret, data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, false); + ret = kvm_eiointc_regs_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, false); + ret = kvm_eiointc_sw_status_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; default: return -EINVAL; } @@ -589,13 +600,21 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev, static int kvm_eiointc_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL: return kvm_eiointc_ctrl_access(dev, attr); case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, true); + if (copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_regs_access(dev, attr, true, &data); case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, true); + if (copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_sw_status_access(dev, attr, true, &data); default: return -EINVAL; } diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index 119290bcea79ab..baf3b4faf7ead2 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -348,6 +348,7 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { + char buf[8]; int addr, offset, len = 8, ret = 0; void __user *data; void *p = NULL; @@ -397,17 +398,23 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, return -EINVAL; } - spin_lock(&s->lock); - /* write or read value according to is_write */ if (is_write) { - if (copy_from_user(p, data, 
len)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, len)) - ret = -EFAULT; + if (copy_from_user(buf, data, len)) + return -EFAULT; } + + spin_lock(&s->lock); + if (is_write) + memcpy(p, buf, len); + else + memcpy(buf, p, len); spin_unlock(&s->lock); + if (!is_write) { + if (copy_to_user(data, buf, len)) + return -EFAULT; + } + return ret; } diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index ed956c5cf2cc04..7c8143e79c1279 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -569,7 +569,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ /* Track access to pages marked old */ new = kvm_pte_mkyoung(*ptep); if (write && !kvm_pte_dirty(new)) { - if (!kvm_pte_write(new)) { + if (!kvm_pte_writeable(new)) { ret = -EFAULT; goto out; } @@ -856,9 +856,9 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) prot_bits |= _CACHE_SUC; if (writeable) { - prot_bits |= _PAGE_WRITE; + prot_bits = kvm_pte_mkwriteable(prot_bits); if (write) - prot_bits |= __WRITEABLE; + prot_bits = kvm_pte_mkdirty(prot_bits); } /* Disable dirty logging on HugePages */ @@ -904,7 +904,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) kvm_release_faultin_page(kvm, page, false, writeable); spin_unlock(&kvm->mmu_lock); - if (prot_bits & _PAGE_DIRTY) + if (kvm_pte_dirty(prot_bits)) mark_page_dirty_in_slot(kvm, memslot, gfn); out: diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5171bb183967b9..fba8089c9fb3f1 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -125,6 +125,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -206,14 +207,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -233,10 +232,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -245,6 +242,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -309,7 +307,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -454,7 +451,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -560,7 +556,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -605,6 +600,7 @@ CONFIG_CRYPTO_USER_API_RNG=m 
CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -636,7 +632,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 16f343ae48c675..6af37716384ca7 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -121,6 +121,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -202,14 +203,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -229,10 +228,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -241,6 +238,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -299,7 +297,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -411,7 +408,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -517,7 +513,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -562,6 +557,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -593,7 +589,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c08788728ea962..471f4ec3730d9f 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -128,6 +128,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -209,14 +210,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m 
CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -236,10 +235,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -248,6 +245,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -310,7 +308,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -431,7 +428,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -537,7 +533,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -582,6 +577,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -613,7 +609,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 962497e7c53fd6..28492ef5145706 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -118,6 +118,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -403,7 +400,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # 
CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -509,7 +505,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -554,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -585,7 +581,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ec28650189e406..2fbefb16b72ef9 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -120,6 +120,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -201,14 +202,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -228,10 +227,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -240,6 +237,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -413,7 +410,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -519,7 +515,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -564,6 +559,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -595,7 +591,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0afb3ad180dee3..deec5df3f35a4a 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m 
CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -227,10 +226,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -430,7 +427,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -536,7 +532,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -581,6 +576,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -612,7 +608,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b311e953995d6d..301a05c1257728 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -139,6 +139,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -220,14 +221,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -247,10 +246,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -259,6 +256,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m 
CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -327,7 +325,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -517,7 +514,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -623,7 +619,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -668,6 +663,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -699,7 +695,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index f4e6224f137f99..0d401db0e8f82b 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -117,6 +117,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -198,14 +199,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -225,10 +224,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -237,6 +234,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -295,7 +293,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -403,7 +400,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -509,7 +505,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -554,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -585,7 +581,6 @@ CONFIG_FIND_BIT_BENCHMARK=m 
CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 498e167222f18c..90fb5b6bcf833a 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -118,6 +118,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -404,7 +401,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -510,7 +506,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -555,6 +550,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -586,7 +582,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 8c6b1eef853423..b89b0f7fe2dac7 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -227,10 +226,8 
@@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -300,7 +298,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -420,7 +417,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -526,7 +522,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -571,6 +566,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -602,7 +598,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index c34648f299efb9..8cc372c4df7206 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -114,6 +114,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -195,14 +196,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -222,10 +221,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -234,6 +231,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -292,7 +290,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -401,7 +398,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -507,7 +503,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m 
-CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -552,6 +547,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -582,7 +578,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 73810d14660f21..f4569f64c6e4e7 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -115,6 +115,7 @@ CONFIG_NFT_FIB_NETDEV=m CONFIG_NFT_REJECT_NETDEV=m CONFIG_NF_FLOW_TABLE_INET=m CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m @@ -196,14 +197,12 @@ CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IP_NF_TARGET_SYNPROXY=m CONFIG_IP_NF_NAT=m CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -223,10 +222,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RT=m CONFIG_IP6_NF_MATCH_SRH=m CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m CONFIG_IP6_NF_TARGET_REJECT=m CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m CONFIG_IP6_NF_RAW=m CONFIG_IP6_NF_NAT=m CONFIG_IP6_NF_TARGET_MASQUERADE=m @@ -235,6 +232,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -293,7 +291,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y -CONFIG_CDROM_PKTCDVD=m CONFIG_ATA_OVER_ETH=m CONFIG_DUMMY_IRQ=m CONFIG_RAID_ATTRS=m @@ -401,7 +398,6 @@ CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_BTRFS_FS=m -CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -507,7 +503,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m @@ -552,6 +547,7 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_PRIME_NUMBERS=m +CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -583,7 +579,6 @@ CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_LINEAR_RANGES_TEST=m -CONFIG_CRC_BENCHMARK=y CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 14c64a6f121762..e9639e48c6c3c1 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -350,12 +350,12 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, #include #else -static inline int find_first_zero_bit(const unsigned long *vaddr, - unsigned size) +static inline unsigned long find_first_zero_bit(const unsigned long *vaddr, + unsigned long size) { const unsigned long *p = 
vaddr; - int res = 32; - unsigned int words; + unsigned long res = 32; + unsigned long words; unsigned long num; if (!size) @@ -376,8 +376,9 @@ static inline int find_first_zero_bit(const unsigned long *vaddr, } #define find_first_zero_bit find_first_zero_bit -static inline int find_next_zero_bit(const unsigned long *vaddr, int size, - int offset) +static inline unsigned long find_next_zero_bit(const unsigned long *vaddr, + unsigned long size, + unsigned long offset) { const unsigned long *p = vaddr + (offset >> 5); int bit = offset & 31UL, res; @@ -406,11 +407,12 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size, } #define find_next_zero_bit find_next_zero_bit -static inline int find_first_bit(const unsigned long *vaddr, unsigned size) +static inline unsigned long find_first_bit(const unsigned long *vaddr, + unsigned long size) { const unsigned long *p = vaddr; - int res = 32; - unsigned int words; + unsigned long res = 32; + unsigned long words; unsigned long num; if (!size) @@ -431,8 +433,9 @@ static inline int find_first_bit(const unsigned long *vaddr, unsigned size) } #define find_first_bit find_first_bit -static inline int find_next_bit(const unsigned long *vaddr, int size, - int offset) +static inline unsigned long find_next_bit(const unsigned long *vaddr, + unsigned long size, + unsigned long offset) { const unsigned long *p = vaddr + (offset >> 5); int bit = offset & 31UL, res; @@ -465,7 +468,7 @@ static inline int find_next_bit(const unsigned long *vaddr, int size, * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { int res; @@ -488,7 +491,7 @@ static inline unsigned long ffz(unsigned long word) */ #if (defined(__mcfisaaplus__) || defined(__mcfisac__)) && \ !defined(CONFIG_M68000) -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { __asm__ __volatile__ ("bitrev %0; ff1 %0" : "=d" (x) @@ -496,7 +499,7 @@ static inline unsigned long __ffs(unsigned long x) return x; } -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { if (!x) return 0; @@ -518,7 +521,7 @@ static inline int ffs(int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int cnt; @@ -528,7 +531,7 @@ static inline int ffs(int x) return 32 - cnt; } -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { return ffs(x) - 1; } @@ -536,7 +539,7 @@ static inline unsigned long __ffs(unsigned long x) /* * fls: find last bit set. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int cnt; @@ -546,7 +549,7 @@ static inline int fls(unsigned int x) return 32 - cnt; } -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { return fls(x) - 1; } diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index 906d7323053744..67a1990f9d748f 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -9,6 +9,7 @@ * #defines from the assembly-language output. 
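 *
 * [Editor's note on the COMPILE_OFFSETS define added below (and to the
 * csky/hexagon asm-offsets.c files earlier in this diff) -- assumed
 * rationale] It lets shared headers skip anything that needs the
 * generated asm-offsets.h, which does not exist yet while this very
 * file is being compiled to produce it:
 *
 *      #ifndef COMPILE_OFFSETS
 *      #include <generated/asm-offsets.h>
 *      #endif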
*/ +#define COMPILE_OFFSETS #define ASM_OFFSETS_C #include diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index fda7eac23f872d..f5a07a70e9385a 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -141,7 +141,7 @@ asmlinkage int m68k_clone3(struct pt_regs *regs) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct fork_frame { diff --git a/arch/microblaze/Kconfig.platform b/arch/microblaze/Kconfig.platform index 7795f90dad8680..9cf9007ed69ace 100644 --- a/arch/microblaze/Kconfig.platform +++ b/arch/microblaze/Kconfig.platform @@ -8,10 +8,10 @@ menu "Platform options" config OPT_LIB_FUNCTION - bool "Optimalized lib function" + bool "Optimized lib function" default y help - Allows turn on optimalized library function (memcpy and memmove). + Turns on optimized library functions (memcpy and memmove). They are optimized by using word alignment. This will work fine if both source and destination are aligned on the same boundary. However, if they are aligned on different boundaries @@ -19,13 +19,13 @@ config OPT_LIB_FUNCTION on MicroBlaze systems without a barrel shifter. config OPT_LIB_ASM - bool "Optimalized lib function ASM" + bool "Optimized lib function ASM" depends on OPT_LIB_FUNCTION && (XILINX_MICROBLAZE0_USE_BARREL = 1) depends on CPU_BIG_ENDIAN default n help - Allows turn on optimalized library function (memcpy and memmove). - Function are written in asm code. + Turns on optimized library functions (memcpy and memmove). + They are written in assembly. # Definitions for MICROBLAZE0 comment "Definitions for MICROBLAZE0" diff --git a/arch/microblaze/include/asm/asm-compat.h b/arch/microblaze/include/asm/asm-compat.h index c05259ce2d2c2d..9f046147623197 100644 --- a/arch/microblaze/include/asm/asm-compat.h +++ b/arch/microblaze/include/asm/asm-compat.h @@ -4,7 +4,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ # define ASM_CONST(x) x #else diff --git a/arch/microblaze/include/asm/current.h b/arch/microblaze/include/asm/current.h index a4bb45be30e69f..099e69f32bf97a 100644 --- a/arch/microblaze/include/asm/current.h +++ b/arch/microblaze/include/asm/current.h @@ -14,13 +14,13 @@ * but check asm/microblaze/kernel/entry.S to be sure. 
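 * (Illustrative aside, added for clarity: the declaration a few lines
 * below,
 *
 *	register struct task_struct *current asm("r31");
 *
 * is a GCC global register variable, so the compiler keeps r31
 * reserved for the current task pointer throughout the kernel and
 * get_current() needs no memory access at all.)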
*/ #define CURRENT_TASK r31 -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* * Dedicate r31 to keeping the current task pointer */ register struct task_struct *current asm("r31"); # define get_current() current -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_CURRENT_H */ diff --git a/arch/microblaze/include/asm/entry.h b/arch/microblaze/include/asm/entry.h index 6c42bed4116628..9efadf12397ca8 100644 --- a/arch/microblaze/include/asm/entry.h +++ b/arch/microblaze/include/asm/entry.h @@ -21,7 +21,7 @@ #define PER_CPU(var) var -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ DECLARE_PER_CPU(unsigned int, KSP); /* Saved kernel stack pointer */ DECLARE_PER_CPU(unsigned int, KM); /* Kernel/user mode */ DECLARE_PER_CPU(unsigned int, ENTRY_SP); /* Saved SP on kernel entry */ @@ -29,6 +29,6 @@ DECLARE_PER_CPU(unsigned int, R11_SAVE); /* Temp variable for entry */ DECLARE_PER_CPU(unsigned int, CURRENT_SAVE); /* Saved current pointer */ extern asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_ENTRY_H */ diff --git a/arch/microblaze/include/asm/exceptions.h b/arch/microblaze/include/asm/exceptions.h index 967f175173e141..c4591e4f7175bb 100644 --- a/arch/microblaze/include/asm/exceptions.h +++ b/arch/microblaze/include/asm/exceptions.h @@ -11,7 +11,7 @@ #define _ASM_MICROBLAZE_EXCEPTIONS_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Macros to enable and disable HW exceptions in the MSR */ /* Define MSR enable bit for HW exceptions */ @@ -64,6 +64,6 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig); void die(const char *str, struct pt_regs *fp, long err); void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr); -#endif /*__ASSEMBLY__ */ +#endif /*__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_MICROBLAZE_EXCEPTIONS_H */ diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h index e6e9288bff7613..f9797849e4d436 100644 --- a/arch/microblaze/include/asm/fixmap.h +++ b/arch/microblaze/include/asm/fixmap.h @@ -15,7 +15,7 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #ifdef CONFIG_HIGHMEM @@ -62,5 +62,5 @@ extern void __set_fixmap(enum fixed_addresses idx, #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index 4ca38b92a3a209..27c1bafb669c36 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -7,7 +7,7 @@ #define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void _mcount(void); extern void ftrace_call_graph(void); void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr); diff --git a/arch/microblaze/include/asm/kgdb.h b/arch/microblaze/include/asm/kgdb.h index 8dc5ebb07fd5a6..321c3c8bfcf27f 100644 --- a/arch/microblaze/include/asm/kgdb.h +++ b/arch/microblaze/include/asm/kgdb.h @@ -3,7 +3,7 @@ #ifndef __MICROBLAZE_KGDB_H__ #define __MICROBLAZE_KGDB_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define CACHE_FLUSH_IS_SAFE 1 #define BUFMAX 2048 @@ -27,6 +27,6 @@ static inline void arch_kgdb_breakpoint(void) struct pt_regs; asmlinkage void microblaze_kgdb_break(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* 
__ASSEMBLER__ */ #endif /* __MICROBLAZE_KGDB_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/include/asm/mmu.h b/arch/microblaze/include/asm/mmu.h index b928a87c00766a..7262dc4da3385e 100644 --- a/arch/microblaze/include/asm/mmu.h +++ b/arch/microblaze/include/asm/mmu.h @@ -9,7 +9,7 @@ #define _ASM_MICROBLAZE_MMU_H # ifdef __KERNEL__ -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* Default "unsigned long" context */ typedef unsigned long mm_context_t; @@ -56,7 +56,7 @@ extern void _tlbia(void); /* invalidate all TLB entries */ * mapping has to increase tlb_skip size. */ extern u32 tlb_skip; -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* * The MicroBlaze processor has a TLB architecture identical to PPC-40x. The diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 90fc9c81debda7..90ac9f34b4b492 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -25,7 +25,7 @@ #define PTE_SHIFT (PAGE_SHIFT - 2) /* 1024 ptes per page */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * PAGE_OFFSET -- the first address of the first page of memory. With MMU @@ -100,7 +100,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* Convert between virtual and physical address for MMU. */ /* Handle MicroBlaze processor with virtual memory. */ @@ -113,7 +113,7 @@ extern int page_is_ram(unsigned long pfn); #define tovirt(rd, rs) \ addik rd, rs, (CONFIG_KERNEL_START - CONFIG_KERNEL_BASE_ADDR) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # define __pa(x) __virt_to_phys((unsigned long)(x)) # define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) @@ -130,7 +130,7 @@ static inline const void *pfn_to_virt(unsigned long pfn) #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define TOPHYS(addr) __virt_to_phys(addr) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index bae1abfa6f6b86..a60e8d89510267 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -10,14 +10,14 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int mem_init_done; #endif #include #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -39,7 +39,7 @@ extern pte_t *va_to_pte(unsigned long address); #define VMALLOC_START (CONFIG_KERNEL_START + CONFIG_LOWMEM_SIZE) #define VMALLOC_END ioremap_bot -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Macro to mark a page protection value as "uncacheable". @@ -208,7 +208,7 @@ extern pte_t *va_to_pte(unsigned long address); * Also, write permissions imply read permissions. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
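 * (Illustrative sketch, added for clarity and simplified from the
 * generic anonymous-fault path in mm/memory.c: a private read fault
 * can map the shared zero page instead of allocating memory,
 *
 *	entry = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(addr)),
 *				      vma->vm_page_prot));
 *
 * and only the first write triggers copy-on-write into a real page.)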
@@ -216,7 +216,7 @@ extern pte_t *va_to_pte(unsigned long address); extern unsigned long empty_zero_page[1024]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) @@ -237,7 +237,7 @@ extern unsigned long empty_zero_page[1024]; #define pfn_pte(pfn, prot) \ __pte(((pte_basic_t)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following only work if pte_present() is true. * Undefined behaviour if not.. @@ -436,13 +436,13 @@ extern int mem_init_done; asmlinkage void __init mmu_init(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned long ioremap_bot, ioremap_base; void setup_memory(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PGTABLE_H */ diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 4e193c7550dfa2..d59bdfffca7cc0 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -14,7 +14,7 @@ #include #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* from kernel/cpu/mb.c */ extern const struct seq_operations cpuinfo_op; @@ -29,7 +29,7 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp); extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ /* * This is used to define STACK_TOP, and with MMU it must be below @@ -45,7 +45,7 @@ extern void ret_from_kernel_thread(void); # define THREAD_KSP 0 -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ /* If you change this, you must change the associated assembly-languages * constants defined below, THREAD_*. 
@@ -88,5 +88,5 @@ unsigned long __get_wchan(struct task_struct *p); extern struct dentry *of_debugfs_root; #endif -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PROCESSOR_H */ diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index bfcb89df5e26fc..17982292a64fdf 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -7,7 +7,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define kernel_mode(regs) ((regs)->pt_mode) #define user_mode(regs) (!kernel_mode(regs)) @@ -20,5 +20,5 @@ static inline long regs_return_value(struct pt_regs *regs) return regs->r3; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_PTRACE_H */ diff --git a/arch/microblaze/include/asm/sections.h b/arch/microblaze/include/asm/sections.h index a9311ad84a67fc..f5008f5e7a5c12 100644 --- a/arch/microblaze/include/asm/sections.h +++ b/arch/microblaze/include/asm/sections.h @@ -10,11 +10,11 @@ #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ extern char _ssbss[], _esbss[]; extern unsigned long __ivt_start[], __ivt_end[]; extern u32 _fdt_start[], _fdt_end[]; -# endif /* !__ASSEMBLY__ */ +# endif /* !__ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_SECTIONS_H */ diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h index bf2600f7595932..837ed0bbae4b5b 100644 --- a/arch/microblaze/include/asm/setup.h +++ b/arch/microblaze/include/asm/setup.h @@ -9,7 +9,7 @@ #include -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ extern char cmd_line[COMMAND_LINE_SIZE]; extern char *klimit; @@ -25,5 +25,5 @@ void machine_shutdown(void); void machine_halt(void); void machine_power_off(void); -# endif /* __ASSEMBLY__ */ +# endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_SETUP_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index a0ddd2a36fb94b..0153f7c2717c98 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -13,7 +13,7 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) #define THREAD_SIZE_ORDER 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # include # include @@ -86,7 +86,7 @@ static inline struct thread_info *current_thread_info(void) } /* thread information allocation */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * thread information flags diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index cfe3f888b432b0..fedda9908aa94e 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* #define __ARCH_WANT_OLD_READDIR */ /* #define __ARCH_WANT_OLD_STAT */ @@ -33,6 +33,6 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_MICROBLAZE_UNISTD_H */ diff --git a/arch/microblaze/include/asm/xilinx_mb_manager.h b/arch/microblaze/include/asm/xilinx_mb_manager.h index 7b6995722b0c0a..121a3224882b2c 100644 --- a/arch/microblaze/include/asm/xilinx_mb_manager.h +++ b/arch/microblaze/include/asm/xilinx_mb_manager.h @@ -5,7 +5,7 @@ #ifndef _XILINX_MB_MANAGER_H #define _XILINX_MB_MANAGER_H -# ifndef __ASSEMBLY__ +# ifndef __ASSEMBLER__ #include @@ -21,7 +21,7 @@ void xmb_manager_register(uintptr_t phys_baseaddr, u32 cr_val, void *priv, void (*reset_callback)(void *data)); asmlinkage void xmb_inject_err(void); -# endif /* __ASSEMBLY__ 
*/ +# endif /* __ASSEMBLER__ */ /* Error injection offset */ #define XMB_INJECT_ERR_OFFSET 0x200 diff --git a/arch/microblaze/include/uapi/asm/ptrace.h b/arch/microblaze/include/uapi/asm/ptrace.h index 46dd94cb78021f..8039957a1a9cd6 100644 --- a/arch/microblaze/include/uapi/asm/ptrace.h +++ b/arch/microblaze/include/uapi/asm/ptrace.h @@ -10,7 +10,7 @@ #ifndef _UAPI_ASM_MICROBLAZE_PTRACE_H #define _UAPI_ASM_MICROBLAZE_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef unsigned long microblaze_reg_t; @@ -68,6 +68,6 @@ struct pt_regs { #endif /* __KERNEL */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_MICROBLAZE_PTRACE_H */ diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index 104c3ac5f30c88..b4b67d58e7f6ae 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -7,6 +7,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 56342e11442d2a..6cbf642d7b801d 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -54,7 +54,7 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec8e..608e01ed6cff86 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H @@ -2223,7 +2222,7 @@ config MIPS_MT_SMP select SMP select SMP_UP select SYS_SUPPORTS_SMP - select SYS_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_SMT select MIPS_PERF_SHARED_TC_COUNTERS help This is a kernel model which is known as SMVP. This is supported @@ -2235,18 +2234,6 @@ config MIPS_MT_SMP config MIPS_MT bool -config SCHED_SMT - bool "SMT (multithreading) scheduler support" - depends on SYS_SUPPORTS_SCHED_SMT - default n - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with MIPS MT enabled cores at a cost of slightly - increased overhead in some places. If unsure say N here. 
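# Illustrative sketch, not part of this patch: with the switch to the
# generic ARCH_SUPPORTS_SCHED_SMT capability flag, a platform entry now
# only needs something like
#
#	config MY_MT_PLATFORM			# hypothetical name
#		select ARCH_SUPPORTS_SCHED_SMT
#
# and the user-visible SCHED_SMT prompt is defined once in common
# scheduler Kconfig code instead of per architecture, which is why the
# local definition is deleted here.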
- -config SYS_SUPPORTS_SCHED_SMT - bool - config SYS_SUPPORTS_MULTITHREADING bool @@ -2318,7 +2305,7 @@ config MIPS_CPS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU - select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 + select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6 select SYS_SUPPORTS_SMP select WEAK_ORDERING select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile index 2a59265788413e..ab84ede0cbe0e8 100644 --- a/arch/mips/cavium-octeon/Makefile +++ b/arch/mips/cavium-octeon/Makefile @@ -11,9 +11,9 @@ obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o obj-y += dma-octeon.o +obj-y += octeon-crypto.o obj-y += octeon-memcpy.o obj-y += executive/ -obj-y += crypto/ obj-$(CONFIG_MTD) += flash_setup.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile deleted file mode 100644 index 83f2f5dd93cccd..00000000000000 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# OCTEON-specific crypto modules. -# - -obj-y += octeon-crypto.o - -obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c deleted file mode 100644 index a8ce831e2cebd9..00000000000000 --- a/arch/mips/cavium-octeon/crypto/octeon-md5.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Cryptographic API. - * - * MD5 Message Digest Algorithm (RFC1321). - * - * Adapted for OCTEON by Aaro Koskinen . - * - * Based on crypto/md5.c, which is: - * - * Derived from cryptoapi implementation, originally based on the - * public domain implementation written by Colin Plumb in 1993. - * - * Copyright (c) Cryptoapi developers. - * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct octeon_md5_state { - __le32 hash[MD5_HASH_WORDS]; - u64 byte_count; -}; - -/* - * We pass everything as 64-bit. OCTEON can handle misaligned data. 
- */ - -static void octeon_md5_store_hash(struct octeon_md5_state *ctx) -{ - u64 *hash = (u64 *)ctx->hash; - - write_octeon_64bit_hash_dword(hash[0], 0); - write_octeon_64bit_hash_dword(hash[1], 1); -} - -static void octeon_md5_read_hash(struct octeon_md5_state *ctx) -{ - u64 *hash = (u64 *)ctx->hash; - - hash[0] = read_octeon_64bit_hash_dword(0); - hash[1] = read_octeon_64bit_hash_dword(1); -} - -static void octeon_md5_transform(const void *_block) -{ - const u64 *block = _block; - - write_octeon_64bit_block_dword(block[0], 0); - write_octeon_64bit_block_dword(block[1], 1); - write_octeon_64bit_block_dword(block[2], 2); - write_octeon_64bit_block_dword(block[3], 3); - write_octeon_64bit_block_dword(block[4], 4); - write_octeon_64bit_block_dword(block[5], 5); - write_octeon_64bit_block_dword(block[6], 6); - octeon_md5_start(block[7]); -} - -static int octeon_md5_init(struct shash_desc *desc) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = cpu_to_le32(MD5_H0); - mctx->hash[1] = cpu_to_le32(MD5_H1); - mctx->hash[2] = cpu_to_le32(MD5_H2); - mctx->hash[3] = cpu_to_le32(MD5_H3); - mctx->byte_count = 0; - - return 0; -} - -static int octeon_md5_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - struct octeon_cop2_state state; - unsigned long flags; - - mctx->byte_count += len; - flags = octeon_crypto_enable(&state); - octeon_md5_store_hash(mctx); - - do { - octeon_md5_transform(data); - data += MD5_HMAC_BLOCK_SIZE; - len -= MD5_HMAC_BLOCK_SIZE; - } while (len >= MD5_HMAC_BLOCK_SIZE); - - octeon_md5_read_hash(mctx); - octeon_crypto_disable(&state, flags); - mctx->byte_count -= len; - return len; -} - -static int octeon_md5_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct octeon_md5_state *mctx = shash_desc_ctx(desc); - int padding = 56 - (offset + 1); - struct octeon_cop2_state state; - u32 block[MD5_BLOCK_WORDS]; - unsigned long flags; - char *p; - - p = memcpy(block, src, offset); - p += offset; - *p++ = 0x80; - - flags = octeon_crypto_enable(&state); - octeon_md5_store_hash(mctx); - - if (padding < 0) { - memset(p, 0x00, padding + sizeof(u64)); - octeon_md5_transform(block); - p = (char *)block; - padding = 56; - } - - memset(p, 0, padding); - mctx->byte_count += offset; - block[14] = mctx->byte_count << 3; - block[15] = mctx->byte_count >> 29; - cpu_to_le32_array(block + 14, 2); - octeon_md5_transform(block); - - octeon_md5_read_hash(mctx); - octeon_crypto_disable(&state, flags); - - memzero_explicit(block, sizeof(block)); - memcpy(out, mctx->hash, sizeof(mctx->hash)); - - return 0; -} - -static int octeon_md5_export(struct shash_desc *desc, void *out) -{ - struct octeon_md5_state *ctx = shash_desc_ctx(desc); - union { - u8 *u8; - u32 *u32; - u64 *u64; - } p = { .u8 = out }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - put_unaligned(le32_to_cpu(ctx->hash[i]), p.u32++); - put_unaligned(ctx->byte_count, p.u64); - return 0; -} - -static int octeon_md5_import(struct shash_desc *desc, const void *in) -{ - struct octeon_md5_state *ctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u32 *u32; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - ctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++)); - ctx->byte_count = get_unaligned(p.u64); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = octeon_md5_init, - .update = octeon_md5_update, - .finup = 
octeon_md5_finup, - .export = octeon_md5_export, - .import = octeon_md5_import, - .statesize = MD5_STATE_SIZE, - .descsize = sizeof(struct octeon_md5_state), - .base = { - .cra_name = "md5", - .cra_driver_name= "octeon-md5", - .cra_priority = OCTEON_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init md5_mod_init(void) -{ - if (!octeon_has_crypto()) - return -ENOTSUPP; - return crypto_register_shash(&alg); -} - -static void __exit md5_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(md5_mod_init); -module_exit(md5_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Message Digest Algorithm (OCTEON)"); -MODULE_AUTHOR("Aaro Koskinen "); diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/octeon-crypto.c similarity index 100% rename from arch/mips/cavium-octeon/crypto/octeon-crypto.c rename to arch/mips/cavium-octeon/octeon-crypto.c diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig index 3f50e1d78894a1..68c363366bceb8 100644 --- a/arch/mips/configs/cavium_octeon_defconfig +++ b/arch/mips/configs/cavium_octeon_defconfig @@ -155,7 +155,6 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5_OCTEON=y CONFIG_CRYPTO_DES=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_FS=y diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig index 7b91f4ec65bffb..6a5bd5074867e0 100644 --- a/arch/mips/crypto/Kconfig +++ b/arch/mips/crypto/Kconfig @@ -2,14 +2,4 @@ menu "Accelerated Cryptographic Algorithms for CPU (mips)" -config CRYPTO_MD5_OCTEON - tristate "Digests: MD5 (OCTEON)" - depends on CPU_CAVIUM_OCTEON - select CRYPTO_MD5 - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: mips OCTEON using crypto instructions, when available - endmenu diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index 89f73d1a4ea4e7..42f88452c920fb 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -327,7 +327,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long * * Return the bit position (0..63) of the most significant 1 bit in a word * Returns -1 if no 1 bit exists */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long __fls(unsigned long word) { int num; @@ -393,7 +393,7 @@ static __always_inline unsigned long __fls(unsigned long word) * Returns 0..SZLONG-1 * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long __ffs(unsigned long word) { return __fls(word & -word); } @@ -405,7 +405,7 @@ static __always_inline unsigned long __ffs(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int r; @@ -458,7 +458,7 @@ static inline int fls(unsigned int x) * the libc and compiler builtin ffs routines, therefore * differs in spirit from the below ffz (man ffs). 
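 * (Worked example, added for clarity: ffs(0) == 0, ffs(1) == 1 and
 * ffs(0x80000000) == 32, i.e. one-based counting from the least
 * significant bit, while __ffs()/__fls() above are zero-based and
 * undefined for an all-zero argument. The __attribute_const__
 * annotations added in this hunk tell the compiler the result depends
 * only on the argument, so duplicate calls can be folded away.)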
*/ -static inline int ffs(int word) +static inline __attribute_const__ int ffs(int word) { if (!word) return 0; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1e29efcba46e57..5debd9a3854a9e 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -9,6 +9,8 @@ * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000 MIPS Technologies, Inc. */ +#define COMPILE_OFFSETS + #include #include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 02aa6a04a21da4..29191fa1801e2a 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -107,7 +107,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/nios2/include/asm/entry.h b/arch/nios2/include/asm/entry.h index bafb7b2ca59fcb..cb25ed56450ab9 100644 --- a/arch/nios2/include/asm/entry.h +++ b/arch/nios2/include/asm/entry.h @@ -10,7 +10,7 @@ #ifndef _ASM_NIOS2_ENTRY_H #define _ASM_NIOS2_ENTRY_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #include @@ -117,5 +117,5 @@ addi sp, sp, SWITCH_STACK_SIZE .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_ENTRY_H */ diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 2897ec1b74f618..00a51623d38a54 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -26,7 +26,7 @@ #define PAGE_OFFSET \ (CONFIG_NIOS2_MEM_BASE + CONFIG_NIOS2_KERNEL_REGION_BASE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This gives the physical RAM offset. @@ -90,6 +90,6 @@ extern struct page *mem_map; #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_NIOS2_PAGE_H */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index eb44130364a9a1..d9521c3c2df98e 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -36,7 +36,7 @@ /* Kuser helpers is mapped to this user space address */ #define KUSER_BASE 0x1000 #define KUSER_SIZE (PAGE_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # define TASK_SIZE 0x7FFF0000UL # define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) @@ -72,6 +72,6 @@ extern unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_PROCESSOR_H */ diff --git a/arch/nios2/include/asm/ptrace.h b/arch/nios2/include/asm/ptrace.h index 9da34c3022a272..96cbcd40c7ce56 100644 --- a/arch/nios2/include/asm/ptrace.h +++ b/arch/nios2/include/asm/ptrace.h @@ -18,7 +18,7 @@ /* This struct defines the way the registers are stored on the stack during a system call. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { unsigned long r8; /* r8-r15 Caller-saved GP registers */ unsigned long r9; @@ -78,5 +78,5 @@ extern void show_regs(struct pt_regs *); int do_syscall_trace_enter(void); void do_syscall_trace_exit(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_PTRACE_H */ diff --git a/arch/nios2/include/asm/registers.h b/arch/nios2/include/asm/registers.h index 95b67dd16f8188..165dab26221f23 100644 --- a/arch/nios2/include/asm/registers.h +++ b/arch/nios2/include/asm/registers.h @@ -6,7 +6,7 @@ #ifndef _ASM_NIOS2_REGISTERS_H #define _ASM_NIOS2_REGISTERS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif @@ -44,7 +44,7 @@ /* tlbmisc register bits */ #define TLBMISC_PID_SHIFT 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define TLBMISC_PID_MASK ((1UL << cpuinfo.tlb_pid_num_bits) - 1) #endif #define TLBMISC_WAY_MASK 0xf diff --git a/arch/nios2/include/asm/setup.h b/arch/nios2/include/asm/setup.h index 908a1526d1bd78..6d3f26a71cb513 100644 --- a/arch/nios2/include/asm/setup.h +++ b/arch/nios2/include/asm/setup.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __KERNEL__ extern char exception_handler_hook[]; @@ -18,6 +18,6 @@ extern char fast_handler_end[]; extern void pagetable_init(void); #endif/* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_NIOS2_SETUP_H */ diff --git a/arch/nios2/include/asm/syscalls.h b/arch/nios2/include/asm/syscalls.h index b4d4ed3bf9c86c..0e214b0a0ac89f 100644 --- a/arch/nios2/include/asm/syscalls.h +++ b/arch/nios2/include/asm/syscalls.h @@ -7,6 +7,7 @@ int sys_cacheflush(unsigned long addr, unsigned long len, unsigned int op); +asmlinkage long __sys_clone3(struct clone_args __user *uargs, size_t size); #include diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index 5abac9893b32b5..83df79286d62ee 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -24,7 +24,7 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE 8192 /* 2 * PAGE_SIZE */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * low level task data that entry.S needs immediate access to @@ -61,7 +61,7 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags diff --git a/arch/nios2/include/asm/traps.h b/arch/nios2/include/asm/traps.h index afd77bef01c65d..133a3dedbc3e8d 100644 --- a/arch/nios2/include/asm/traps.h +++ b/arch/nios2/include/asm/traps.h @@ -12,7 +12,7 @@ #define TRAP_ID_SYSCALL 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr); void do_page_fault(struct pt_regs *regs, unsigned long cause, unsigned long address); diff --git a/arch/nios2/include/asm/unistd.h b/arch/nios2/include/asm/unistd.h index 1146e56473c512..213f6de3cf7b10 100644 --- a/arch/nios2/include/asm/unistd.h +++ b/arch/nios2/include/asm/unistd.h @@ -7,6 +7,4 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_BROKEN_SYS_CLONE3 - #endif diff --git a/arch/nios2/include/uapi/asm/ptrace.h b/arch/nios2/include/uapi/asm/ptrace.h index 2b91dbe5bcfee5..1298db9f0fc98e 100644 --- a/arch/nios2/include/uapi/asm/ptrace.h +++ b/arch/nios2/include/uapi/asm/ptrace.h @@ -13,7 +13,7 @@ #ifndef _UAPI_ASM_NIOS2_PTRACE_H #define _UAPI_ASM_NIOS2_PTRACE_H -#ifndef 
__ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -80,5 +80,5 @@ struct user_pt_regs { __u32 regs[49]; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_NIOS2_PTRACE_H */ diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c index e3d9b7b6fb48aa..88190b503ce5de 100644 --- a/arch/nios2/kernel/asm-offsets.c +++ b/arch/nios2/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2011 Tobias Klauser */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/nios2/kernel/entry.S b/arch/nios2/kernel/entry.S index 99f0a65e62347e..dd40dfd908e59c 100644 --- a/arch/nios2/kernel/entry.S +++ b/arch/nios2/kernel/entry.S @@ -403,6 +403,12 @@ ENTRY(sys_clone) addi sp, sp, 4 RESTORE_SWITCH_STACK ret +/* long syscall(SYS_clone3, struct clone_args *cl_args, size_t size); */ +ENTRY(__sys_clone3) + SAVE_SWITCH_STACK + call sys_clone3 + RESTORE_SWITCH_STACK + ret ENTRY(sys_rt_sigreturn) SAVE_SWITCH_STACK diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index f84021303f6a82..151404139085cf 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -101,7 +101,7 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 2a40150142c36f..f43f01c4ab934b 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -142,6 +142,20 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, *max_high = PFN_DOWN(memblock_end_of_DRAM()); } +static void __init adjust_lowmem_bounds(void) +{ + phys_addr_t block_start, block_end; + u64 i; + phys_addr_t memblock_limit = 0; + + for_each_mem_range(i, &block_start, &block_end) { + if (block_end > memblock_limit) + memblock_limit = block_end; + } + + memblock_set_current_limit(memblock_limit); +} + void __init setup_arch(char **cmdline_p) { console_verbose(); @@ -157,6 +171,7 @@ void __init setup_arch(char **cmdline_p) /* Keep a copy of command line */ *cmdline_p = boot_command_line; + adjust_lowmem_bounds(); find_limits(&min_low_pfn, &max_low_pfn, &max_pfn); memblock_reserve(__pa_symbol(_stext), _end - _stext); diff --git a/arch/nios2/kernel/syscall_table.c b/arch/nios2/kernel/syscall_table.c index 434694067d8f55..c99818aac9e1b8 100644 --- a/arch/nios2/kernel/syscall_table.c +++ b/arch/nios2/kernel/syscall_table.c @@ -13,6 +13,7 @@ #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #define sys_mmap2 sys_mmap_pgoff +#define sys_clone3 __sys_clone3 void *sys_call_table[__NR_syscalls] = { [0 ... 
__NR_syscalls-1] = sys_ni_syscall, diff --git a/arch/openrisc/include/asm/bitops/__ffs.h b/arch/openrisc/include/asm/bitops/__ffs.h index 1e224b616fdf68..4827b66530b2bd 100644 --- a/arch/openrisc/include/asm/bitops/__ffs.h +++ b/arch/openrisc/include/asm/bitops/__ffs.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FF1 -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/__fls.h b/arch/openrisc/include/asm/bitops/__fls.h index 9658446ad14102..637cc76fe4b7d6 100644 --- a/arch/openrisc/include/asm/bitops/__fls.h +++ b/arch/openrisc/include/asm/bitops/__fls.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline unsigned long __fls(unsigned long x) +static inline __attribute_const__ unsigned long __fls(unsigned long x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/ffs.h b/arch/openrisc/include/asm/bitops/ffs.h index b4c835d6bc8496..536a60ab9cc30f 100644 --- a/arch/openrisc/include/asm/bitops/ffs.h +++ b/arch/openrisc/include/asm/bitops/ffs.h @@ -10,7 +10,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FF1 -static inline int ffs(int x) +static inline __attribute_const__ int ffs(int x) { int ret; diff --git a/arch/openrisc/include/asm/bitops/fls.h b/arch/openrisc/include/asm/bitops/fls.h index 6b77f6556fb9c1..77da7639bb3e4f 100644 --- a/arch/openrisc/include/asm/bitops/fls.h +++ b/arch/openrisc/include/asm/bitops/fls.h @@ -11,7 +11,7 @@ #ifdef CONFIG_OPENRISC_HAVE_INST_FL1 -static inline int fls(unsigned int x) +static inline __attribute_const__ int fls(unsigned int x) { int ret; diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c index 710651d5aaae10..3cc826f2216b10 100644 --- a/arch/openrisc/kernel/asm-offsets.c +++ b/arch/openrisc/kernel/asm-offsets.c @@ -18,6 +18,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index eef99fee2110cb..73ffb9fa3118bb 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -165,7 +165,7 @@ extern asmlinkage void ret_from_fork(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *userregs; diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 2efa4b08b7b841..0940c162f1f7b4 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -44,6 +44,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00 select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW @@ -319,14 +320,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on GENERIC_ARCH_TOPOLOGY && PA8X00 - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
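# Illustrative note, not part of this patch: this mirrors the SMT
# change above; the capability is now advertised via the single line
# added to the PARISC config entry,
#
#	select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00
#
# and the SCHED_MC prompt itself lives in common scheduler Kconfig
# code rather than being duplicated here.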
- config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" default y diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 0ec9cfc5131fc3..bd1280a8a5ece0 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -123,7 +123,7 @@ static __inline__ int test_and_change_bit(int nr, volatile unsigned long * addr) * cycles for each mispredicted branch. */ -static __inline__ unsigned long __ffs(unsigned long x) +static __inline__ __attribute_const__ unsigned long __ffs(unsigned long x) { unsigned long ret; @@ -161,7 +161,7 @@ static __inline__ unsigned long __ffs(unsigned long x) * This is defined the same way as the libc and compiler builtin * ffs routines, therefore differs in spirit from the above ffz (man ffs). */ -static __inline__ int ffs(int x) +static __inline__ __attribute_const__ int ffs(int x) { return x ? (__ffs((unsigned long)x) + 1) : 0; } @@ -171,7 +171,7 @@ static __inline__ int ffs(int x) * fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __inline__ int fls(unsigned int x) +static __inline__ __attribute_const__ int fls(unsigned int x) { int ret; if (!x) diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 757816a7bd4b28..9abfe65492c65e 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * Copyright (C) 2002 Randolph Chung * Copyright (C) 2003 James Bottomley */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ed93bd8c154533..e64ab5d2a40d61 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -201,7 +201,7 @@ arch_initcall(parisc_idle_init); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9fc6..22173a8976704d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -170,6 +170,9 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx + select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP + select SCHED_MC if ARCH_SUPPORTS_SCHED_MC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -207,8 +210,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP @@ -243,12 +244,14 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC + select HAVE_FTRACE_REGS_HAVING_PT_REGS select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC)) - select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC + select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI @@ -963,14 +966,6 @@ config PPC_PROT_SAO_LPAR config PPC_COPRO_BASE bool -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on PPC64 && SMP - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. - config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 9753fb87217c35..a58b1029592ce2 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -58,7 +58,7 @@ ifeq ($(CONFIG_PPC64)$(CONFIG_LD_IS_BFD),yy) # There is a corresponding test in arch/powerpc/lib/Makefile KBUILD_LDFLAGS_MODULE += --save-restore-funcs else -KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +KBUILD_LDFLAGS_MODULE += $(objtree)/arch/powerpc/lib/crtsavres.o endif ifdef CONFIG_CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/boot/page.h b/arch/powerpc/boot/page.h index c3d55fc8f34c4e..e44a3119720db5 100644 --- a/arch/powerpc/boot/page.h +++ b/arch/powerpc/boot/page.h @@ -5,7 +5,7 @@ * Copyright (C) 2001 PPC64 Team, IBM Corp */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define ASM_CONST(x) x #else #define __ASM_CONST(x) x##UL diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 3d8dc822282ac8..a75baefd1cffba 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -226,11 +226,7 @@ ld_is_lld() # Do not include PT_INTERP segment when linking pie. Non-pie linking # just ignores this option. -LD_VERSION=$(${CROSS}ld --version | ld_version) -LD_NO_DL_MIN_VERSION=$(echo 2.26 | ld_version) -if [ "$LD_VERSION" -ge "$LD_NO_DL_MIN_VERSION" ] ; then - nodl="--no-dynamic-linker" -fi +nodl="--no-dynamic-linker" # suppress some warnings in recent ld versions nowarn="-z noexecstack" diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index d06388b0f66e31..bd4685612de6dd 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -320,7 +320,6 @@ CONFIG_XMON=y CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index ce34597e9f3e14..2d92c11eea7e47 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -387,7 +387,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_AES_GCM_P10=m CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index cfe39fc221cf81..662aed46f9c795 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -2,27 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" -config CRYPTO_CURVE25519_PPC64 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN - select CRYPTO_KPP - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: PowerPC64 - - Little-endian - -config CRYPTO_MD5_PPC - tristate "Digests: MD5" - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: powerpc - config CRYPTO_AES_PPC_SPE tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)" 
depends on SPE diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index bc8fd27344b8bb..5960e5300db71e 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -6,16 +6,12 @@ # obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o -obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o -obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o -md5-ppc-y := md5-asm.o md5-glue.o aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o -curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override flavour := linux-ppc64le diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c deleted file mode 100644 index 204440a90cd84c..00000000000000 --- a/arch/powerpc/crypto/md5-glue.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for MD5 implementation for PPC assembler - * - * Based on generic implementation. - * - * Copyright (c) 2015 Markus Stockhausen - */ - -#include -#include -#include -#include -#include - -extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks); - -static int ppc_md5_init(struct shash_desc *desc) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - sctx->hash[0] = MD5_H0; - sctx->hash[1] = MD5_H1; - sctx->hash[2] = MD5_H2; - sctx->hash[3] = MD5_H3; - sctx->byte_count = 0; - - return 0; -} - -static int ppc_md5_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - - sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE); - ppc_md5_transform(sctx->hash, data, len >> 6); - return len - round_down(len, MD5_HMAC_BLOCK_SIZE); -} - -static int ppc_md5_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct md5_state *sctx = shash_desc_ctx(desc); - __le64 block[MD5_BLOCK_WORDS] = {}; - u8 *p = memcpy(block, src, offset); - __le32 *dst = (__le32 *)out; - __le64 *pbits; - - src = p; - p += offset; - *p++ = 0x80; - sctx->byte_count += offset; - pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 
1 : 2)) - 1]; - *pbits = cpu_to_le64(sctx->byte_count << 3); - ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8); - memzero_explicit(block, sizeof(block)); - - dst[0] = cpu_to_le32(sctx->hash[0]); - dst[1] = cpu_to_le32(sctx->hash[1]); - dst[2] = cpu_to_le32(sctx->hash[2]); - dst[3] = cpu_to_le32(sctx->hash[3]); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = ppc_md5_init, - .update = ppc_md5_update, - .finup = ppc_md5_finup, - .descsize = MD5_STATE_SIZE, - .base = { - .cra_name = "md5", - .cra_driver_name= "md5-ppc", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init ppc_md5_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit ppc_md5_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(ppc_md5_mod_init); -module_exit(ppc_md5_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler"); - -MODULE_ALIAS_CRYPTO("md5"); -MODULE_ALIAS_CRYPTO("md5-ppc"); diff --git a/arch/powerpc/include/asm/asm-const.h b/arch/powerpc/include/asm/asm-const.h index bfb3c3534877a0..392bdb1f104f4e 100644 --- a/arch/powerpc/include/asm/asm-const.h +++ b/arch/powerpc/include/asm/asm-const.h @@ -1,7 +1,7 @@ #ifndef _ASM_POWERPC_ASM_CONST_H #define _ASM_POWERPC_ASM_CONST_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ # define ASM_CONST(x) x #else diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index b95b666f03744e..9e9833faa4af87 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -7,7 +7,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 671ecc6711e366..0d0470cd5ac31d 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -276,7 +276,7 @@ static inline void arch___clear_bit_unlock(int nr, volatile unsigned long *addr) * fls: find last (most-significant) bit set. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { int lz; @@ -294,7 +294,7 @@ static __always_inline int fls(unsigned int x) * 32-bit fls calls. 
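 * (Worked example, added for clarity: fls64(0) == 0, fls64(1) == 1
 * and fls64(1ULL << 63) == 64; the 64-bit count-leading-zeros
 * implementation below avoids the two 32-bit fls calls the generic
 * version needs.)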
*/ #ifdef CONFIG_PPC64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { int lz; diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 4e14a5427a6323..873c5146e32610 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_KUAP @@ -170,6 +170,6 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) #endif /* CONFIG_PPC_KUAP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 78c6a5fde1d615..8435bf3cdabfaa 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -29,7 +29,7 @@ #define BPP_RX 0x01 /* Read only */ #define BPP_RW 0x02 /* Read/write */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Contort a phys_addr_t into the right format/bits for a BAT */ #ifdef CONFIG_PHYS_64BIT #define BAT_PHYS_ADDR(x) ((u32)((x & 0x00000000fffe0000ULL) | \ @@ -47,7 +47,7 @@ struct ppc_bat { u32 batu; u32 batl; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Hash table @@ -64,7 +64,7 @@ struct ppc_bat { #define SR_KP 0x20000000 /* User key */ #define SR_KS 0x40000000 /* Supervisor key */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include @@ -225,7 +225,7 @@ static __always_inline void update_user_segments(u32 val) int __init find_free_bat(void); unsigned int bat_block_size(unsigned long base, unsigned long top); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* We happily ignore the smaller BATs on 601, we don't actually use * those definitions on hash32 at the moment anyway diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index dd4eb306317581..f4390704d5ba29 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -7,8 +7,14 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), - pgtable_gfp_flags(mm, GFP_KERNEL)); + pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); + +#ifdef CONFIG_PPC_BOOK3S_603 + memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, + (MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif + return pgd; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 92d21c6faf1e57..87dcca962be786 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -102,7 +102,7 @@ #define PMD_CACHE_INDEX PMD_INDEX_SIZE #define PUD_CACHE_INDEX PUD_INDEX_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE 0 #define PUD_TABLE_SIZE 0 @@ -110,7 +110,7 @@ /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) @@ -132,12 +132,12 @@ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ int 
map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary @@ -199,7 +199,7 @@ void unmap_kernel_page(unsigned long va); #define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M) #define MODULES_VADDR (MODULES_END - MODULES_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -602,6 +602,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot) return pgprot_noncached_wc(prot); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 7132392fa7cdfe..8e5bd9902bed4e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -32,7 +32,7 @@ */ #define H_KERN_VIRT_START ASM_CONST(0xc0003d0000000000) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) #define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE) @@ -168,6 +168,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, extern int hash__has_transparent_hugepage(void); #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0fb5b7da94783e..7deb3a66890bcd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -79,7 +79,7 @@ #endif #define H_PMD_FRAG_NR (PAGE_SIZE >> H_PMD_FRAG_SIZE_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* @@ -281,6 +281,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, extern int hash__has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0755f2567021dc..5a8cbd496731ef 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -112,7 +112,7 @@ #define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline int get_region_id(unsigned long ea) { int region_id; @@ -295,6 +295,6 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid, pgprot_t prot); int hash__remove_section_mapping(unsigned long start, unsigned long end); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 497a7bd31ecc01..03aec3c6c851c1 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -10,7 +10,7 @@ #define AMR_KUEP_BLOCKED UL(0x5455555555555555) #define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro kuap_user_restore gpr1, gpr2 #if defined(CONFIG_PPC_PKEY) @@ -191,7 +191,7 @@ #endif .endm -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include #include @@ -413,6 +413,6 @@ static __always_inline 
void restore_user_access(unsigned long flags) if (static_branch_unlikely(&uaccess_flush_key) && flags == AMR_KUAP_BLOCKED) do_uaccess_flush(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 1c4eebbc69c94b..3463514232071e 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -130,7 +130,7 @@ #define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */ #define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct mmu_hash_ops { void (*hpte_invalidate)(unsigned long slot, @@ -220,7 +220,7 @@ static inline unsigned long get_sllp_encoding(int psize) return sllp; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Segment sizes. @@ -248,7 +248,7 @@ static inline unsigned long get_sllp_encoding(int psize) #define LP_BITS 8 #define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline int slb_vsid_shift(int ssize) { @@ -532,7 +532,7 @@ void slb_set_size(u16 size); static inline void slb_set_size(u16 size) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * VSID allocation (256MB segment) @@ -668,7 +668,7 @@ static inline void slb_set_size(u16 size) { } #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) #define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_SUBPAGE_PROT /* @@ -881,5 +881,5 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index fedbc5d381917c..48631365b48cfc 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Page size definition * @@ -26,12 +26,12 @@ struct mmu_psize_def { }; }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* 64-bit classic hash table MMU */ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * ISA 3.0 partition and process table entry format */ @@ -288,5 +288,5 @@ static inline unsigned long get_user_vsid(mm_context_t *ctx, } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 4d8d7b4ea16ba5..004a03e97e58ee 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H #define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_HUGETLB_PAGE #endif /* CONFIG_HUGETLB_PAGE */ @@ -14,5 +14,5 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, BUG(); return hash__remap_4k_pfn(vma, addr, pfn, prot); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git 
a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c1980036531599..aac8ce30cd3b39 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -143,7 +143,7 @@ #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * page table defines */ @@ -291,7 +291,7 @@ static inline unsigned long pud_leaf_size(pud_t pud) else return PUD_SIZE; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include #include @@ -327,7 +327,7 @@ static inline unsigned long pud_leaf_size(pud_t pud) #define FIXADDR_SIZE SZ_32M #define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, @@ -1381,5 +1381,5 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va return false; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index df23a8267e4d09..da954e7797441f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif @@ -14,7 +14,7 @@ #include #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #endif @@ -132,7 +132,7 @@ #define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE #define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) @@ -362,5 +362,5 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, unsigned long start, unsigned long end, int node, struct dev_pagemap *pgmap); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 5fbe18544cbd1b..6e2f7a74cd7591 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H #define _ASM_POWERPC_BOOK3S_64_SLICE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_HUGETLB_PAGE @@ -37,6 +37,6 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, void slice_init_new_context_exec(struct mm_struct *mm); void slice_setup_new_exec(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 1db485aacbd9b7..bbaa7e81f82134 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -7,7 +7,7 @@ #ifdef CONFIG_BUG -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #ifdef CONFIG_DEBUG_BUGVERBOSE .macro EMIT_BUG_ENTRY addr,file,line,flags @@ -31,7 +31,7 @@ .endm #endif /* verbose */ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and 
sizeof(struct bug_entry), respectively */ #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -101,12 +101,12 @@ #define HAVE_ARCH_WARN_ON #endif -#endif /* __ASSEMBLY __ */ +#endif /* __ASSEMBLER__ */ #else -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define _EMIT_BUG_ENTRY #endif #endif /* CONFIG_BUG */ @@ -115,7 +115,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs; void hash__do_page_fault(struct pt_regs *); @@ -128,7 +128,7 @@ void die_mce(const char *str, struct pt_regs *regs, long err); extern bool die_will_crash(void); extern void panic_flush_kmsg_start(void); extern void panic_flush_kmsg_end(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BUG_H */ diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index 69232231d27080..6796babc4d310c 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -37,7 +37,7 @@ #define ARCH_DMA_MINALIGN L1_CACHE_BYTES #endif -#if !defined(__ASSEMBLY__) +#if !defined(__ASSEMBLER__) #ifdef CONFIG_PPC64 struct ppc_cache_info { @@ -145,6 +145,6 @@ static inline void iccci(void *addr) asm volatile ("iccci 0, %0" : : "r"(addr) : "memory"); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_CACHE_H */ diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index bf8a228229fa92..604fa3b6c33d47 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -2,7 +2,7 @@ #ifndef __ASM_POWERPC_CPU_HAS_FEATURE_H #define __ASM_POWERPC_CPU_HAS_FEATURE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -51,5 +51,5 @@ static __always_inline bool cpu_has_feature(unsigned long feature) } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_POWERPC_CPU_HAS_FEATURE_H */ diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 0cce5dc7fb1c2d..054cd2fcfd551d 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -68,7 +68,7 @@ #define ERR_EC_ESL_MISMATCH -1 #define ERR_DEEP_STATE_ESL_MISMATCH -2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 29a529d2ab8b44..ec16c12296da80 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* This structure can grow, its real size is used by head.S code * via the mkdefs mechanism.
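The hunks in this series repeat one mechanical substitution: __ASSEMBLY__ becomes __ASSEMBLER__ in every dual-use header. The distinction matters because __ASSEMBLER__ is predefined by GCC and Clang whenever they preprocess assembly, while the old __ASSEMBLY__ spelling only worked because kbuild passed -D__ASSEMBLY__ on every assembler invocation. A minimal sketch of the guard pattern being converted, with every identifier invented for illustration:

/* Dual-use header sketch: the #define is visible to both C and .S
 * files, while each arm of the guard is visible to only one of them. */
#ifndef _EXAMPLE_DUAL_USE_H
#define _EXAMPLE_DUAL_USE_H

#define EXAMPLE_FEATURE_BIT	0x4	/* shared with assembly */

#ifndef __ASSEMBLER__
/* C-only: the assembler cannot parse types or inline functions. */
static inline int example_feature_enabled(unsigned long flags)
{
	return (flags & EXAMPLE_FEATURE_BIT) != 0;
}
#else /* __ASSEMBLER__ */
/* Assembly-only: .macro syntax would be a syntax error in C. */
.macro	example_check_feature reg
	andi.	\reg, \reg, EXAMPLE_FEATURE_BIT
.endm
#endif /* __ASSEMBLER__ */

#endif /* _EXAMPLE_DUAL_USE_H */

Because the compiler-provided macro removes the dependency on kbuild's flags, the conversion can presumably proceed header by header without breaking the build: during the transition both macros are defined while assembling.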
@@ -103,7 +103,7 @@ extern void cpu_feature_keys_init(void); static inline void cpu_feature_keys_init(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* CPU kernel features */ @@ -195,7 +195,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_DEXCR_NPHIE LONG_ASM_CONST(0x0010000000000000) #define CPU_FTR_P11_PVR LONG_ASM_CONST(0x0020000000000000) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_NOEXECUTE) @@ -602,6 +602,6 @@ enum { */ #define HBP_NUM_MAX 2 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_CPUTABLE_H */ diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index f26c430f398264..d06f2b20b81057 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_CPUTHREADS_H #define _ASM_POWERPC_CPUTHREADS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -107,7 +107,7 @@ static inline u32 get_tensr(void) void book3e_start_thread(int thread, unsigned long addr); void book3e_stop_thread(int thread); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define INVALID_THREAD_HWID 0x0fff diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index 3e9da22a277962..0b9ef726f92c94 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -40,12 +40,6 @@ static inline void _ppc_msgsnd(u32 msg) : : "i" (CPU_FTR_HVMODE), "r" (msg)); } -/* sync before sending message */ -static inline void ppc_msgsnd_sync(void) -{ - __asm__ __volatile__ ("sync" : : : "memory"); -} - /* sync after taking message interrupt */ static inline void ppc_msgsync(void) { @@ -76,12 +70,6 @@ static inline void _ppc_msgsnd(u32 msg) __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); } -/* sync before sending message */ -static inline void ppc_msgsnd_sync(void) -{ - __asm__ __volatile__ ("sync" : : : "memory"); -} - /* sync after taking message interrupt */ static inline void ppc_msgsync(void) { @@ -91,6 +79,12 @@ static inline void ppc_msgsync(void) extern void doorbell_exception(struct pt_regs *regs); +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag) { u32 msg = PPC_DBELL_TYPE(type) | (flags & PPC_DBELL_MSG_BRDCAST) | diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h index a92059964579b8..65b3fc2dc4043d 100644 --- a/arch/powerpc/include/asm/dcr-native.h +++ b/arch/powerpc/include/asm/dcr-native.h @@ -7,7 +7,7 @@ #ifndef _ASM_POWERPC_DCR_NATIVE_H #define _ASM_POWERPC_DCR_NATIVE_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -139,6 +139,6 @@ static inline void __dcri_clrset(int base_addr, int base_data, int reg, DCRN_ ## base ## _CONFIG_DATA, \ reg, clr, set) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DCR_NATIVE_H */ diff --git a/arch/powerpc/include/asm/dcr.h b/arch/powerpc/include/asm/dcr.h index 180021cd0b30f7..3c0fac2cc2b257 100644 --- a/arch/powerpc/include/asm/dcr.h +++ b/arch/powerpc/include/asm/dcr.h @@ -7,7 +7,7 @@ #ifndef _ASM_POWERPC_DCR_H #define _ASM_POWERPC_DCR_H #ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_DCR #include @@ -28,6 +28,6 @@ extern unsigned int dcr_resource_start(const struct 
device_node *np, extern unsigned int dcr_resource_len(const struct device_node *np, unsigned int index); #endif /* CONFIG_PPC_DCR */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DCR_H */ diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index cdf3c6df5123a7..8fc5aaa4bbbad8 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -52,7 +52,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -571,5 +571,5 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1, in[3] = p4; return epapr_hypercall(in, out, nr); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _EPAPR_HCALLS_H */ diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index b1ef1e92c34a1b..1a83b1ff3578a7 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -149,7 +149,7 @@ exc_##label##_book3e: addi r11,r13,PACA_EXTLB; \ TLB_MISS_RESTORE(r11) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int interrupt_base_book3e; #endif diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index bb6f78fcf981cc..a9437e89f69f72 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -53,7 +53,7 @@ */ #define MAX_MCE_DEPTH 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define STF_ENTRY_BARRIER_SLOT \ STF_ENTRY_BARRIER_FIXUP_SECTION; \ @@ -170,9 +170,9 @@ RFSCV; \ b rfscv_flush_fallback -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ /* Prototype for function defined in exceptions-64s.S */ void do_uaccess_flush(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h index 26ce2e5c0fa8e9..d483a9c24ba96f 100644 --- a/arch/powerpc/include/asm/extable.h +++ b/arch/powerpc/include/asm/extable.h @@ -17,7 +17,7 @@ #define ARCH_HAS_RELATIVE_EXTABLE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct exception_table_entry { int insn; diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 17d168dd8b4912..756a6c694018ce 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -168,7 +168,7 @@ label##5: \ #define ALT_FW_FTR_SECTION_END_IFCLR(msk) \ ALT_FW_FTR_SECTION_END_NESTED_IFCLR(msk, 97) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define ASM_FTR_IF(section_if, section_else, msk, val) \ stringify_in_c(BEGIN_FTR_SECTION) \ @@ -196,7 +196,7 @@ label##5: \ #define ASM_MMU_FTR_IFCLR(section_if, section_else, msk) \ ASM_MMU_FTR_IF(section_if, section_else, (msk), 0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* LWSYNC feature sections */ #define START_LWSYNC_SECTION(label) label##1: @@ -276,7 +276,7 @@ label##3: \ FTR_ENTRY_OFFSET 956b-957b; \ .popsection; -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include extern long stf_barrier_fallback; diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 69ae9cf57d50b6..abd7c56f4d55ca 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -58,7 +58,7 @@ #define FW_FEATURE_WATCHDOG ASM_CONST(0x0000080000000000) #define FW_FEATURE_PLPKS ASM_CONST(0x0000100000000000) -#ifndef __ASSEMBLY__ 
+#ifndef __ASSEMBLER__ enum { #ifdef CONFIG_PPC64 @@ -146,6 +146,6 @@ void pseries_probe_fw_features(void); static inline void pseries_probe_fw_features(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_FIRMWARE_H */ diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index f9068dd8dfce7a..bc5109eab5b744 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -14,7 +14,7 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -111,5 +111,5 @@ static inline void __set_fixmap(enum fixed_addresses idx, #define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/fprobe.h b/arch/powerpc/include/asm/fprobe.h new file mode 100644 index 00000000000000..d64bc28fb3d3c3 --- /dev/null +++ b/arch/powerpc/include/asm/fprobe.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PPC_FPROBE_H +#define _ASM_PPC_FPROBE_H + +#include + +#ifdef CONFIG_64BIT +#undef FPROBE_HEADER_MSB_PATTERN +#define FPROBE_HEADER_MSB_PATTERN (PAGE_OFFSET & ~FPROBE_HEADER_MSB_MASK) +#endif + +#endif /* _ASM_PPC_FPROBE_H */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 82da7c7a1d1256..5984eaa75ce8dd 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -15,7 +15,7 @@ #define FTRACE_MCOUNT_MAX_OFFSET 8 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void _mcount(void); unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, @@ -50,6 +50,21 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs * asm volatile("mfmsr %0" : "=r" ((_regs)->msr)); \ } while (0) +#undef ftrace_regs_get_return_value +static __always_inline unsigned long +ftrace_regs_get_return_value(const struct ftrace_regs *fregs) +{ + return arch_ftrace_regs(fregs)->regs.gpr[3]; +} +#define ftrace_regs_get_return_value ftrace_regs_get_return_value + +#undef ftrace_regs_get_frame_pointer +static __always_inline unsigned long +ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs) +{ + return arch_ftrace_regs(fregs)->regs.gpr[1]; +} + static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) @@ -69,14 +84,14 @@ struct ftrace_ops; void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif #endif /* CONFIG_FUNCTION_TRACER */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_FTRACE_SYSCALLS /* * Some syscall entry functions on powerpc start with "ppc_" (fork and clone, @@ -160,6 +175,6 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi static inline void ftrace_free_init_tramp(void) { } static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_FTRACE */ diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index d73153b0275d65..3966bd5810cb65 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -4,7 +4,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef 
__ASSEMBLER__ /* * We can't do CPP stringification and concatenation directly into the section * name for some reason, so these macros can do it for us. */ @@ -167,6 +167,6 @@ end_##sname: // find label from _within_ sname #define ABS_ADDR(label, sname) (label - start_ ## sname + sname ## _start) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index ea6c8dc400d294..9aef16149d9274 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -534,7 +534,7 @@ #define H_HTM_TARGET_NODAL_CHIP_INDEX(x) ((unsigned long)(x)<<(63-31)) #define H_HTM_TARGET_CORE_INDEX_ON_CHIP(x) ((unsigned long)(x)<<(63-47)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /** @@ -735,6 +735,6 @@ struct hv_gpci_request_buffer { uint8_t bytes[HGPCI_MAX_DATA_BYTES]; } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 569ac1165b0693..1078ba88efaf46 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -59,7 +59,7 @@ #define IRQS_PMI_DISABLED 2 #define IRQS_ALL_DISABLED (IRQS_DISABLED | IRQS_PMI_DISABLED) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void __hard_irq_enable(void) { @@ -516,6 +516,6 @@ static inline unsigned long mtmsr_isync_irqsafe(unsigned long msr) #define ARCH_IRQ_INIT_FLAGS IRQ_NOREQUEST -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 23638d4e73ac03..eb0e4a20b81883 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -64,7 +64,7 @@ #define INTERRUPT_DATA_LOAD_TLB_MISS_603 0x1100 #define INTERRUPT_DATA_STORE_TLB_MISS_603 0x1200 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -675,6 +675,6 @@ unsigned long interrupt_exit_user_restart(struct pt_regs *regs); unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs); #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_INTERRUPT_H */ diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h index 47d46712928ac6..1351fb40fe7491 100644 --- a/arch/powerpc/include/asm/irqflags.h +++ b/arch/powerpc/include/asm/irqflags.h @@ -5,7 +5,7 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Get definitions for arch_local_save_flags(x), etc. */ diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 2f2a86ed2280aa..d4eaba459a0ed5 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -6,7 +6,7 @@ * Copyright 2010 Michael Ellerman, IBM Corp.
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index b5bbb94c51f6db..db121494462248 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -12,7 +12,7 @@ #define EXPORT_SYMBOL_KASAN(fn) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -80,5 +80,5 @@ void kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end); int kasan_init_region(void *start, size_t size); -#endif /* __ASSEMBLY */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/kdump.h b/arch/powerpc/include/asm/kdump.h index fd128d1e52b3bb..802644178f4329 100644 --- a/arch/powerpc/include/asm/kdump.h +++ b/arch/powerpc/include/asm/kdump.h @@ -31,7 +31,7 @@ #endif /* CONFIG_CRASH_DUMP */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if defined(CONFIG_CRASH_DUMP) && !defined(CONFIG_NONSTATIC_KERNEL) extern void reserve_kdump_trampoline(void); @@ -42,6 +42,6 @@ static inline void reserve_kdump_trampoline(void) { ; } static inline void setup_kdump_trampoline(void) { ; } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __PPC64_KDUMP_H */ diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 70f2f0517509e0..4bbf9f699aaaf4 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -49,7 +49,7 @@ #define KEXEC_STATE_IRQS_OFF 1 #define KEXEC_STATE_REAL_MODE 2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include typedef void (*crash_shutdown_t)(void); @@ -210,6 +210,6 @@ static inline void reset_sprs(void) } #endif -#endif /* ! __ASSEMBLY__ */ +#endif /* ! __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_KEXEC_H */ diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h index 715c18b7533463..f39531903325a6 100644 --- a/arch/powerpc/include/asm/kgdb.h +++ b/arch/powerpc/include/asm/kgdb.h @@ -21,7 +21,7 @@ #ifndef __POWERPC_KGDB_H__ #define __POWERPC_KGDB_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define BREAK_INSTR_SIZE 4 #define BUFMAX ((NUMREGBYTES * 2) + 512) @@ -62,6 +62,6 @@ static inline void arch_kgdb_breakpoint(void) /* CR/LR, R1, R2, R13-R31 inclusive. 
*/ #define NUMCRITREGBYTES (23 * sizeof(int)) #endif /* 32/64 */ -#endif /* !(__ASSEMBLY__) */ +#endif /* !(__ASSEMBLER__) */ #endif /* !__POWERPC_KGDB_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 2bb03d941e3e8a..dab63b82a8d4f3 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,7 +6,7 @@ #define KUAP_WRITE 2 #define KUAP_READ_WRITE (KUAP_READ | KUAP_WRITE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include static __always_inline bool kuap_is_disabled(void); @@ -28,14 +28,14 @@ static __always_inline bool kuap_is_disabled(void); #include #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifndef CONFIG_PPC_KUAP .macro kuap_check_amr gpr1, gpr2 .endm #endif -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ extern bool disable_kuep; extern bool disable_kuap; @@ -181,6 +181,6 @@ static __always_inline void prevent_current_write_to_user(void) prevent_user_access(KUAP_WRITE); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_KUAP_H_ */ diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index d68d71987d5cf3..f9af8df0907757 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -9,7 +9,7 @@ #ifndef __POWERPC_KVM_ASM_H__ #define __POWERPC_KVM_ASM_H__ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_64BIT #define PPC_STD(sreg, offset, areg) std sreg, (offset)(areg) #define PPC_LD(treg, offset, areg) ld treg, (offset)(areg) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index a36797938620f0..3435fe144908f4 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -20,7 +20,7 @@ /* Maximum number of subcores per physical core */ #define MAX_SUBCORES 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_KVM_BOOK3S_HANDLER @@ -58,7 +58,7 @@ kvmppc_resume_\intno: #endif /* CONFIG_KVM_BOOK3S_HANDLER */ -#else /*__ASSEMBLY__ */ +#else /*__ASSEMBLER__ */ struct kvmppc_vcore; @@ -150,7 +150,7 @@ struct kvmppc_book3s_shadow_vcpu { #endif }; -#endif /*__ASSEMBLY__ */ +#endif /*__ASSEMBLER__ */ /* Values for kvm_state */ #define KVM_HWTHREAD_IN_KERNEL 0 diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h index 7487ef5821210f..3acf2995d364ce 100644 --- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -8,7 +8,7 @@ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * All exceptions from guest state must go through KVM @@ -64,5 +64,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) #endif .endm -#endif /*__ASSEMBLY__ */ +#endif /*__ASSEMBLER__ */ #endif /* ASM_KVM_BOOKE_HV_ASM_H */ diff --git a/arch/powerpc/include/asm/lv1call.h b/arch/powerpc/include/asm/lv1call.h index b11501b30193ba..ae70120953a85a 100644 --- a/arch/powerpc/include/asm/lv1call.h +++ b/arch/powerpc/include/asm/lv1call.h @@ -10,7 +10,7 @@ #if !defined(_ASM_POWERPC_LV1CALL_H) #define _ASM_POWERPC_LV1CALL_H -#if !defined(__ASSEMBLY__) +#if !defined(__ASSEMBLER__) #include #include @@ -211,7 +211,7 @@ {return _lv1_##name(LV1_##in##_IN_##out##_OUT_ARGS);} #endif -#endif /* !defined(__ASSEMBLY__) */ +#endif /* !defined(__ASSEMBLER__) */ /* lv1 call table */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 4182d68d9cd179..5f9c5d436e1713 100644 --- a/arch/powerpc/include/asm/mmu.h +++ 
b/arch/powerpc/include/asm/mmu.h @@ -137,7 +137,7 @@ MMU_FTR_CI_LARGE_PAGE #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ MMU_FTR_CI_LARGE_PAGE | MMU_FTR_NO_SLBIE_B -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -332,7 +332,7 @@ static inline bool strict_module_rwx_enabled(void) { return IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && strict_kernel_rwx_enabled(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* The kernel use the constants below to index in the page sizes array. * The use of fixed constants for this purpose is better for performances @@ -377,7 +377,7 @@ static inline bool strict_module_rwx_enabled(void) #include #else /* CONFIG_PPC_BOOK3S_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* MMU initialization */ extern void early_init_mmu(void); extern void early_init_mmu_secondary(void); @@ -388,7 +388,7 @@ static inline void mmu_early_init_devtree(void) { } static inline void pkey_early_init_devtree(void) {} extern void *abatron_pteptrs[2]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif #if defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index e1ee5026ac4af4..864e22deaa2cd2 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -27,6 +27,7 @@ struct ppc_plt_entry { struct mod_arch_specific { #ifdef __powerpc64__ unsigned int stubs_section; /* Index of stubs section in module */ + unsigned int stub_count; /* Number of stubs used */ #ifdef CONFIG_PPC_KERNEL_PCREL unsigned int got_section; /* What section is the GOT? */ unsigned int pcpu_section; /* .data..percpu section */ diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h index 01ae6c351e5028..d7ffbd06797d2d 100644 --- a/arch/powerpc/include/asm/mpc52xx.h +++ b/arch/powerpc/include/asm/mpc52xx.h @@ -13,10 +13,10 @@ #ifndef __ASM_POWERPC_MPC52xx_H__ #define __ASM_POWERPC_MPC52xx_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include @@ -30,7 +30,7 @@ /* Structures mapping of some unit register set */ /* ======================================================================== */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Memory Mapping Control */ struct mpc52xx_mmap_ctl { @@ -258,14 +258,14 @@ struct mpc52xx_intr { u32 per_error; /* INTR + 0x38 */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* ========================================================================= */ /* Prototypes for MPC52xx sysdev */ /* ========================================================================= */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct device_node; @@ -297,7 +297,7 @@ extern void __init mpc52xx_setup_pci(void); static inline void mpc52xx_setup_pci(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_PM struct mpc52xx_suspend { diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 46bc5925e5fdc1..08486b15b20751 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -7,7 +7,7 @@ #ifdef CONFIG_PPC_KUAP -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -82,7 +82,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return !((regs->kuap ^ MD_APG_KUAP) & 0xff000000); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_PPC_KUAP */ diff --git 
a/arch/powerpc/include/asm/nohash/32/mmu-44x.h b/arch/powerpc/include/asm/nohash/32/mmu-44x.h index 2d92a39d8f2e80..c3d19219432440 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-44x.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-44x.h @@ -100,7 +100,7 @@ #define PPC47x_TLB2_S_RW (PPC47x_TLB2_SW | PPC47x_TLB2_SR) #define PPC47x_TLB2_IMG (PPC47x_TLB2_I | PPC47x_TLB2_M | PPC47x_TLB2_G) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int tlb_44x_hwater; extern unsigned int tlb_44x_index; @@ -114,7 +114,7 @@ typedef struct { /* patch sites */ extern s32 patch__tlb_44x_hwater_D, patch__tlb_44x_hwater_I; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifndef CONFIG_PPC_EARLY_DEBUG_44x #define PPC44x_EARLY_TLBS 1 diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 2986f9ba40b88b..f19115db8072fd 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -174,7 +174,7 @@ #define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M) #define MODULES_VADDR (MODULES_END - MODULES_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -265,6 +265,6 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) extern s32 patch__itlbmiss_exit_1, patch__dtlbmiss_exit_1; extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b481738c4bb520..2d71e4b7cd09c1 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -4,12 +4,12 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include /* For sub-arch specific PPC_PIN_SIZE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTE_INDEX_SIZE PTE_SHIFT #define PMD_INDEX_SIZE 0 @@ -19,14 +19,14 @@ #define PMD_CACHE_INDEX PMD_INDEX_SIZE #define PUD_CACHE_INDEX PUD_INDEX_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE 0 #define PUD_TABLE_SIZE 0 #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) @@ -149,7 +149,7 @@ #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) @@ -199,6 +199,6 @@ static inline void pmd_clear(pmd_t *pmdp) /* We borrow LSB 2 to store the exclusive marker in swap PTEs. 
*/ #define _PAGE_SWP_EXCLUSIVE 0x000004 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 54ebb91dbdcf37..e2ea8ba9f8caeb 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -83,7 +83,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_t pte_wrprotect(pte_t pte) { return __pte(pte_val(pte) | _PAGE_RO); diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index 10f5cf444d72a8..fb6fa1d4e0749a 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -14,12 +14,12 @@ #define PUD_INDEX_SIZE 9 #define PGD_INDEX_SIZE 9 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) @@ -57,7 +57,7 @@ #define p4d_bad(p4d) (p4d_val(p4d) == 0) #define p4d_present(p4d) (p4d_val(p4d) != 0) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pud_t *p4d_pgtable(p4d_t p4d) { @@ -80,7 +80,7 @@ static inline p4d_t pte_p4d(pte_t pte) } extern struct page *p4d_page(p4d_t p4d); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2202c78730e8eb..2deb955b7bc89e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -77,7 +77,7 @@ #define H_PAGE_4K_PFN 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* pte_clear moved to later in this file */ #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) @@ -209,6 +209,6 @@ void __patch_exception(int exc, unsigned long addr); __patch_exception((exc), (unsigned long)&name); \ } while (0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/kup-booke.h b/arch/powerpc/include/asm/nohash/kup-booke.h index 0c7c3258134c56..d6bbb6d78bbe43 100644 --- a/arch/powerpc/include/asm/nohash/kup-booke.h +++ b/arch/powerpc/include/asm/nohash/kup-booke.h @@ -7,7 +7,7 @@ #ifdef CONFIG_PPC_KUAP -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro kuap_check_amr gpr1, gpr2 .endm @@ -105,7 +105,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return !regs->kuap; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_PPC_KUAP */ diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h index b281d9eeaf1e65..2fad5ff426a0a4 100644 --- a/arch/powerpc/include/asm/nohash/mmu-e500.h +++ b/arch/powerpc/include/asm/nohash/mmu-e500.h @@ -230,7 +230,7 @@ #define MAS2_M_IF_NEEDED 0 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include extern unsigned int tlbcam_index; @@ -318,6 +318,6 @@ extern int book3e_htw_mode; #include DECLARE_PER_CPU(int, next_tlbcam_idx); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff 
--git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index bb5f3e8ea912df..4ef780b291bc31 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), pgtable_gfp_flags(mm, GFP_KERNEL)); -#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603) +#ifdef CONFIG_PPC_8xx memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); #endif diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 7d6b9e5b286ef9..5af168b7f29241 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_NOHASH_PGTABLE_H #define _ASM_POWERPC_NOHASH_PGTABLE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, unsigned long clr, unsigned long set, int huge); #endif @@ -27,7 +27,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int icache_44x_need_flush; @@ -373,5 +373,5 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index cb78392494da0c..b61efc3ee9040c 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -86,7 +86,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline pte_t pte_mkexec(pte_t pte) { return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); @@ -134,7 +134,7 @@ static inline unsigned long pud_leaf_size(pud_t pud) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_PTE_E500_H */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 8c9d4b26bf579e..d3eaa342579708 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -246,7 +246,7 @@ #define OPAL_CONFIG_IDLE_UNDO 0 #define OPAL_CONFIG_IDLE_APPLY 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Other enums */ enum OpalFreezeState { @@ -1183,6 +1183,6 @@ struct opal_mpipl_fadump { struct opal_mpipl_region region[]; } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index af304e6cb486c5..0a398265ba04e5 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -10,7 +10,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -390,6 +390,6 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index af9a2628d1df05..b28fbb1d57eb90 100644 --- a/arch/powerpc/include/asm/page.h +++ 
b/arch/powerpc/include/asm/page.h @@ -6,7 +6,7 @@ * Copyright (C) 2001,2005 IBM Corporation. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -23,7 +23,7 @@ */ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PAGE_SHIFT #elif defined(CONFIG_PPC_BOOK3S_64) @@ -75,7 +75,7 @@ extern unsigned int hpage_shift; #define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_PHYSICAL_START)) #if defined(CONFIG_NONSTATIC_KERNEL) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern phys_addr_t memstart_addr; extern phys_addr_t kernstart_addr; @@ -84,7 +84,7 @@ extern phys_addr_t kernstart_addr; extern long long virt_phys_offset; #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define PHYSICAL_START kernstart_addr #else /* !CONFIG_NONSTATIC_KERNEL */ @@ -216,7 +216,7 @@ extern long long virt_phys_offset; #endif #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline unsigned long virt_to_pfn(const void *kaddr) { return __pa(kaddr) >> PAGE_SHIFT; @@ -261,7 +261,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn) #define is_kernel_addr(x) ((x) >= TASK_SIZE) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC_BOOK3S_64 #include @@ -290,6 +290,6 @@ static inline unsigned long kaslr_offset(void) } #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PAGE_H */ diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index b9ac9e3a771cbb..25482405a8111d 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -19,7 +19,7 @@ #define PTE_SHIFT (PAGE_SHIFT - PTE_T_LOG2) /* full page */ #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The basic type of a PTE - 64 bits for those CPUs with > 32 bit * physical addressing. @@ -53,6 +53,6 @@ extern void copy_page(void *to, void *from); #define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1) #define PTE_T_LOG2 (__builtin_ffs(sizeof(pte_t)) - 1) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PAGE_32_H */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 79a9b7c6a132ca..0f564a06bf6843 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -35,7 +35,7 @@ #define ESID_MASK_1T 0xffffff0000000000UL #define GET_ESID_1T(x) (((x) >> SID_SHIFT_1T) & SID_MASK_1T) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include typedef unsigned long pte_basic_t; @@ -82,7 +82,7 @@ extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define VM_DATA_DEFAULT_FLAGS \ (is_32bit_task() ? \ diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h index c3cd5b131033eb..a3b5a0d05db6fc 100644 --- a/arch/powerpc/include/asm/papr-sysparm.h +++ b/arch/powerpc/include/asm/papr-sysparm.h @@ -21,6 +21,7 @@ typedef struct { #define PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS mk_papr_sysparm(44) #define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50) #define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55) +#define PAPR_SYSPARM_HVPIPE_ENABLE mk_papr_sysparm(64) /** * struct papr_sysparm_buf - RTAS work area layout for system parameter functions. 
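The papr-sysparm hunk above only introduces a token (parameter 64), but the calling convention behind it is compact enough to sketch. This assumes the papr_sysparm_get() and papr_sysparm_buf_alloc()/papr_sysparm_buf_free() helpers already declared in this header; the wrapper function itself is invented for illustration:

#include <linux/errno.h>
#include <asm/papr-sysparm.h>

/* Hypothetical caller: read the new HVPIPE_ENABLE system parameter.
 * mk_papr_sysparm(64) yields a typed papr_sysparm_t token, so a bare
 * integer cannot be passed to the accessors by mistake. */
static int example_hvpipe_param_read(void)
{
	struct papr_sysparm_buf *buf;
	int ret;

	buf = papr_sysparm_buf_alloc();
	if (!buf)
		return -ENOMEM;

	/* Wraps the RTAS ibm,get-system-parameter call. */
	ret = papr_sysparm_get(PAPR_SYSPARM_HVPIPE_ENABLE, buf);

	papr_sysparm_buf_free(buf);
	return ret;
}

The same typed-token pattern covers the set side (papr_sysparm_set()), which is what lets a new parameter land as a one-line addition here.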
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20ea..1dae53130782a0 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -133,8 +133,6 @@ struct pci_controller { /* IRQ domain hierarchy */ struct irq_domain *dev_domain; - struct irq_domain *msi_domain; - struct fwnode_handle *fwnode; /* iommu_ops support */ struct iommu_device iommu; diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 93d77ad5a92fa4..17fd7ff6e535b6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_PGTABLE_H #define _ASM_POWERPC_PGTABLE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include /* For TASK_SIZE */ @@ -12,7 +12,7 @@ struct mm_struct; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_PPC_BOOK3S #include @@ -20,18 +20,6 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ -/* - * Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - /* Make modules code happy. We don't set RO yet */ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X @@ -39,7 +27,7 @@ struct mm_struct; #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define PFN_PTE_SHIFT PTE_RPN_SHIFT @@ -214,6 +202,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) #endif /* CONFIG_PPC64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 8053b24afc3956..55ca49d183196f 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -571,6 +571,7 @@ (0x54000001 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me)) #define PPC_RAW_RLWIMI(d, a, i, mb, me) (0x50000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me)) #define PPC_RAW_RLDICL(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb)) +#define PPC_RAW_RLDICL_DOT(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb) | 0x1) #define PPC_RAW_RLDICR(d, a, i, me) (0x78000004 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_ME64(me)) /* slwi = rlwinm Rx, Ry, n, 0, 31-n */ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index b891910fce8a69..46947c82a7127d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -12,7 +12,7 @@ #include #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define SZL (BITS_PER_LONG/8) @@ -868,7 +868,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif /* !CONFIG_PPC_BOOK3E_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define SOFT_MASK_TABLE(_start, _end) \ stringify_in_c(.section __soft_mask_table,"a";)\ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 6b94de17201c76..f156bdb43e2be1 100644 --- 
a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -29,14 +29,14 @@ #ifdef CONFIG_PPC64 /* Default SMT priority is set to 3. Use bits 11-13 to save priority. */ #define PPR_PRIORITY 3 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define DEFAULT_PPR (PPR_PRIORITY << 50) #else #define DEFAULT_PPR ((u64)PPR_PRIORITY << 50) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PPC64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -460,5 +460,5 @@ int enter_vmx_ops(void); void *exit_vmx_ops(void *dest); #endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 7b9350756875a7..94aa1de2b06e19 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -24,7 +24,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { union { @@ -165,7 +165,7 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (KERNEL_REDZONE_SIZE + STACK_USER_INT_FRAME_SIZE) #define STACK_INT_FRAME_MARKER_LONGS (STACK_INT_FRAME_MARKER/sizeof(long)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #ifdef CONFIG_SMP @@ -414,7 +414,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsig return 0; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifndef __powerpc64__ /* We need PT_SOFTE defined at all time to avoid #ifdefs */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 0228c90bbcc7bf..3fe1866354323b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -60,7 +60,7 @@ #define MSR_RI_LG 1 /* Recoverable Exception */ #define MSR_LE_LG 0 /* Little Endian */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __MASK(X) (1<<(X)) #else #define __MASK(X) (1UL<<(X)) @@ -1358,7 +1358,7 @@ #define PVR_ARCH_31_P11 0x0f000007 /* Macros for setting and retrieving special purpose registers */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if defined(CONFIG_PPC64) || defined(__CHECKER__) typedef struct { @@ -1450,6 +1450,6 @@ extern void scom970_write(unsigned int address, unsigned long value); struct pt_regs; extern void ppc_save_regs(struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_REG_H */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 656bfaf91526e0..56f9d3b1de859f 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -576,7 +576,7 @@ #define TEN_THREAD(x) (1 << (x)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define mftmr(rn) ({unsigned long rval; \ asm volatile(MFTMR(rn, %0) : "=r" (rval)); rval;}) #define mttmr(rn, v) asm volatile(MTTMR(rn, %0) : \ @@ -585,7 +585,7 @@ extern unsigned long global_dbcr0[]; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_POWERPC_REG_BOOKE_H__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/reg_fsl_emb.h b/arch/powerpc/include/asm/reg_fsl_emb.h index 9893d2001b6801..ec459c3d9498a1 100644 --- a/arch/powerpc/include/asm/reg_fsl_emb.h +++ b/arch/powerpc/include/asm/reg_fsl_emb.h @@ -9,7 +9,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Performance Monitor Registers */ static __always_inline unsigned int mfpmr(unsigned int rn) { @@ -32,7 +32,7 @@ static __always_inline void mtpmr(unsigned int rn, unsigned int
val) ".machine pop;" : [val] "=r" (val) : [rn] "i" (rn)); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* Freescale Book E Performance Monitor APU Registers */ #define PMRN_PMC0 0x010 /* Performance Monitor Counter 0 */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 75fa0293c508e6..d046bbd5017d3c 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -68,9 +68,11 @@ enum rtas_function_index { RTAS_FNIDX__IBM_READ_PCI_CONFIG, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2, + RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG, RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW, RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW, RTAS_FNIDX__IBM_SCAN_LOG_DUMP, + RTAS_FNIDX__IBM_SEND_HVPIPE_MSG, RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR, RTAS_FNIDX__IBM_SET_EEH_OPTION, RTAS_FNIDX__IBM_SET_SLOT_RESET, @@ -163,9 +165,11 @@ typedef struct { #define RTAS_FN_IBM_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_READ_PCI_CONFIG) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2) +#define RTAS_FN_IBM_RECEIVE_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG) #define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW) #define RTAS_FN_IBM_RESET_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW) #define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP) +#define RTAS_FN_IBM_SEND_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_SEND_HVPIPE_MSG) #define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR) #define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION) #define RTAS_FN_IBM_SET_SLOT_RESET rtas_fn_handle(RTAS_FNIDX__IBM_SET_SLOT_RESET) @@ -217,6 +221,7 @@ typedef struct { #define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified error. */ #define RTAS_BUSY -2 /* Retry immediately. */ #define RTAS_INVALID_PARAMETER -3 /* Invalid indicator/domain/sensor etc. */ +#define RTAS_FUNC_NOT_SUPPORTED -5 /* Function not supported */ #define RTAS_UNEXPECTED_STATE_CHANGE -7 /* Seems limited to EEH and slot reset. */ #define RTAS_EXTENDED_DELAY_MIN 9900 /* Retry after delaying for ~1ms. */ #define RTAS_EXTENDED_DELAY_MAX 9905 /* Retry after delaying for ~100s. 
*/ @@ -233,6 +238,7 @@ typedef struct { #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ #define RTAS_HOTPLUG_EVENTS 0x10000000 /* set bit 3 */ #define RTAS_IO_EVENTS 0x08000000 /* set bit 4 */ +#define RTAS_HVPIPE_MSG_EVENTS 0x04000000 /* set bit 5 */ #define RTAS_EVENT_SCAN_ALL_EVENTS 0xffffffff /* RTAS event severity */ @@ -282,6 +288,7 @@ typedef struct { #define RTAS_TYPE_DEALLOC 0xE3 #define RTAS_TYPE_DUMP 0xE4 #define RTAS_TYPE_HOTPLUG 0xE5 +#define RTAS_TYPE_HVPIPE 0xE6 /* I don't add PowerMGM events right now, this is a different topic */ #define RTAS_TYPE_PMGM_POWER_SW_ON 0x60 #define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61 @@ -374,6 +381,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') #define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') #define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_HVPIPE_EVENT (('P' << 8) | 'E') #define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') #define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') #define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') @@ -519,6 +527,7 @@ extern struct mutex rtas_ibm_get_indices_lock; extern struct mutex rtas_ibm_set_dynamic_indicator_lock; extern struct mutex rtas_ibm_get_dynamic_sensor_state_lock; extern struct mutex rtas_ibm_physical_attestation_lock; +extern struct mutex rtas_ibm_send_hvpipe_msg_lock; #define GLOBAL_INTERRUPT_QUEUE 9005 diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index eed74c1fb832fc..50a92b24628daf 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -4,7 +4,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void ppc_printk_progress(char *s, unsigned short hex); extern unsigned long long memory_limit; @@ -89,7 +89,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, extern struct seq_buf ppc_hw_desc; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index b77927ccb0ab00..e41b9ea42122be 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -18,7 +18,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC64 #include @@ -266,7 +266,7 @@ extern char __secondary_hold; extern unsigned int booting_thread_hwid; extern void __early_start(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SMP_H) */ diff --git a/arch/powerpc/include/asm/spu_csa.h b/arch/powerpc/include/asm/spu_csa.h index c33df961c04549..1b3271a033928a 100644 --- a/arch/powerpc/include/asm/spu_csa.h +++ b/arch/powerpc/include/asm/spu_csa.h @@ -43,7 +43,7 @@ #define SPU_DECR_STATUS_RUNNING 0x1 #define SPU_DECR_STATUS_WRAPPED 0x2 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /** * spu_reg128 - generic 128-bit register definition. 
*/ @@ -243,5 +243,5 @@ struct spu_state { #endif /* !__SPU__ */ #endif /* __KERNEL__ */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _SPU_CSA_H_ */ diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index b0b4c64870d77c..0d3ccb34adfb2e 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -7,7 +7,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup; extern void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end); @@ -40,7 +40,7 @@ static inline void ppc_after_tlbiel_barrier(void) */ asm volatile(ASM_FTR_IFSET(PPC_CP_ABORT, "", %0) : : "i" (CPU_FTR_ARCH_31) : "memory"); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #if defined(__powerpc64__) # define LWSYNC lwsync diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 2785c7462ebf7b..b0f200aba2b3df 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -41,7 +41,7 @@ #define THREAD_ALIGN (1 << THREAD_ALIGN_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -89,7 +89,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * thread information flag bit numbers @@ -162,7 +162,7 @@ void arch_setup_new_exec(void); #define _TLF_LAZY_MMU (1 << TLF_LAZY_MMU) #define _TLF_RUNLATCH (1 << TLF_RUNLATCH) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void clear_thread_local_flags(unsigned int flags) { @@ -233,7 +233,7 @@ static inline int arch_within_stack_frames(const void * const stack, extern void *emergency_ctx[]; #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index f8885586efafb3..7991ab1d4cb893 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -29,6 +29,10 @@ extern u64 decrementer_max; extern void generic_calibrate_decr(void); +#ifdef CONFIG_PPC_SPLPAR +extern u64 get_boot_tb(void); +#endif + /* Some sane defaults: 125 MHz timebase, 1GHz processor */ extern unsigned long ppc_proc_freq; #define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8) diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index e94f6db5e367b3..d700affba4480a 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void tm_reclaim(struct thread_struct *thread, uint8_t cause); @@ -19,4 +19,4 @@ extern void tm_restore_sprs(struct thread_struct *thread); extern bool tm_suspend_disabled; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe8071a..f19ca44512d1e8 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index 93157a661dcc75..55d7ba6d910bdb 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ 
-11,10 +11,10 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef __vector128 vector128; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_TYPES_H */ diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 027ef94a12fbdd..b873fbb6d712f7 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -9,7 +9,7 @@ #define NR_syscalls __NR_syscalls -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -52,5 +52,5 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_UNISTD_H_ */ diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 1ca23fbfe087ae..07af3257607254 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -5,7 +5,7 @@ #define VDSO_VERSION_STRING LINUX_2.6.15 #define __VDSO_PAGES 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_PPC64 #include @@ -21,7 +21,7 @@ int vdso_getcpu_init(void); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef __VDSO64__ #define V_FUNCTION_BEGIN(name) \ @@ -49,6 +49,6 @@ int vdso_getcpu_init(void); #endif /* __VDSO32__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_H */ diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h index 067a5396aac6e9..4c24976061f495 100644 --- a/arch/powerpc/include/asm/vdso/getrandom.h +++ b/arch/powerpc/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef _ASM_POWERPC_VDSO_GETRANDOM_H #define _ASM_POWERPC_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -62,6 +62,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_GETRANDOM_H */ diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 99c9d6f43fde2e..ab3df12c8d947e 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H #define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -141,6 +141,6 @@ int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz __kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_time_data *vd); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index 80d13207c5688d..c1f3d7aaf3ee97 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_VDSO_PROCESSOR_H #define _ASM_POWERPC_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Macros for adjusting thread priority (hardware multi-threading) */ #ifdef CONFIG_PPC64 @@ -33,6 +33,6 @@ #define cpu_relax() barrier() #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h index c2c9ae1b22e71a..bee18e8660a027 100644 --- 
a/arch/powerpc/include/asm/vdso/vsyscall.h +++ b/arch/powerpc/include/asm/vdso/vsyscall.h @@ -2,13 +2,13 @@ #ifndef _ASM_POWERPC_VDSO_VSYSCALL_H #define _ASM_POWERPC_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* The asm-generic header needs to be included after the definitions above */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_POWERPC_VDSO_VSYSCALL_H */ diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 95d45a50355d26..441264af0e3669 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -9,11 +9,11 @@ * IBM Corp. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ .macro get_datapage ptr symbol bcl 20, 31, .+4 @@ -23,7 +23,7 @@ addi \ptr, \ptr, (\symbol - 999b)@l .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _SYSTEMCFG_H */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 92930b0b5d0e17..efb0f5effcc694 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,7 +111,6 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); -void xive_irq_free_data(unsigned int virq); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h index 1869cf83a870ed..11abcf0192ca1a 100644 --- a/arch/powerpc/include/uapi/asm/opal-prd.h +++ b/arch/powerpc/include/uapi/asm/opal-prd.h @@ -40,7 +40,7 @@ #define OPAL_PRD_SCOM_READ _IOR('o', 0x02, struct opal_prd_scom) #define OPAL_PRD_SCOM_WRITE _IOW('o', 0x03, struct opal_prd_scom) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct opal_prd_info { __u64 version; @@ -54,6 +54,6 @@ struct opal_prd_scom { __s64 rc; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */ diff --git a/arch/powerpc/include/uapi/asm/papr-hvpipe.h b/arch/powerpc/include/uapi/asm/papr-hvpipe.h new file mode 100644 index 00000000000000..f8794139d06a4a --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr-hvpipe.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_PAPR_HVPIPE_H_ +#define _UAPI_PAPR_HVPIPE_H_ + +#include +#include +#include + +/* + * This header is included in the payload exchanged between the + * OS and user space. + * flags: the OS notifies user space whether the hvpipe is + * closed or the buffer has the payload.
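+ *
+ * A minimal sketch of a hypothetical user-space reader (the flag and
+ * header names are defined below; the descriptor, buffer size and
+ * helper functions are illustrative assumptions only):
+ *
+ *	char buf[4096];
+ *	struct papr_hvpipe_hdr *hdr = (struct papr_hvpipe_hdr *)buf;
+ *	ssize_t len = read(fd, buf, sizeof(buf));
+ *
+ *	if (len > 0 && (hdr->flags & HVPIPE_MSG_AVAILABLE))
+ *		consume_payload(buf + sizeof(*hdr), len - sizeof(*hdr));
+ *	else if (len > 0 && (hdr->flags & HVPIPE_LOST_CONNECTION))
+ *		handle_pipe_closed(fd);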
+ */ +struct papr_hvpipe_hdr { + __u8 version; + __u8 reserved[3]; + __u32 flags; + __u8 reserved2[40]; +}; + +/* + * ioctl for /dev/papr-hvpipe + */ +#define PAPR_HVPIPE_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 9, __u32) + +/* + * hvpipe_hdr flags used for read() + */ +#define HVPIPE_MSG_AVAILABLE 0x01 /* Payload is available */ +#define HVPIPE_LOST_CONNECTION 0x02 /* Pipe connection is closed/unavailable */ + +#endif /* _UAPI_PAPR_HVPIPE_H_ */ diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index 7004cfea3f5ff8..01e630149d48e1 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -27,7 +27,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __KERNEL__ struct user_pt_regs @@ -57,7 +57,7 @@ struct pt_regs unsigned long result; /* Result of a system call */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* @@ -200,7 +200,7 @@ struct pt_regs #define PPC_PTRACE_SETHWDEBUG 0x88 #define PPC_PTRACE_DELHWDEBUG 0x87 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct ppc_debug_info { __u32 version; /* Only version 1 exists to date */ @@ -212,7 +212,7 @@ struct ppc_debug_info { __u64 features; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * features will have bits indication whether there is support for: @@ -224,7 +224,7 @@ struct ppc_debug_info { #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0000000000000010 #define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x0000000000000020 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct ppc_hw_breakpoint { __u32 version; /* currently, version must be 1 */ @@ -236,7 +236,7 @@ struct ppc_hw_breakpoint { __u64 condition_value; /* contents of the DVC register */ }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Trigger Type diff --git a/arch/powerpc/include/uapi/asm/types.h b/arch/powerpc/include/uapi/asm/types.h index 327616fb70e449..9dbf55e38ea58b 100644 --- a/arch/powerpc/include/uapi/asm/types.h +++ b/arch/powerpc/include/uapi/asm/types.h @@ -28,14 +28,14 @@ # include #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef struct { __u32 u[4]; } __attribute__((aligned(16))) __vector128; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_POWERPC_TYPES_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b3048f6d3822c0..a4bc80b30410ae 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 56c5ebe21b99a4..393e19ee13222d 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -162,7 +162,7 @@ instruction_counter: * For the MPC8xx, this is a software tablewalk to load the instruction * TLB. The task switch loads the M_TWB register with the pointer to the first * level table. - * If we discover there is no second level table (value is zero) or if there + * If there is no second level table (value is zero) or if there * is an invalid pte, we load that into the TLB, which causes another fault * into the TLB Error interrupt where we can handle such problems. 
* We have to use the MD_xxx registers for the tablewalk because the @@ -183,14 +183,11 @@ instruction_counter: mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ mfspr r10, SPRN_SRR0 /* Get effective address of fault */ INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11) mtspr SPRN_MD_EPN, r10 mfspr r10, SPRN_M_TWB /* Get level 1 table */ - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ + lwz r11, 0(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ @@ -228,12 +225,8 @@ instruction_counter: mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - mfspr r10, SPRN_MD_EPN mfspr r10, SPRN_M_TWB /* Get level 1 table */ - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ + lwz r11, 0(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 mfspr r10, SPRN_MD_TWC @@ -375,7 +368,7 @@ FixupPGD: mfspr r10, SPRN_DAR mtspr SPRN_MD_EPN, r10 mfspr r11, SPRN_M_TWB /* Get level 1 table */ - lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ + lwz r10, 0(r11) /* Get the level 1 entry */ cmpwi cr1, r10, 0 bne cr1, 1f @@ -384,7 +377,7 @@ FixupPGD: lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r10) /* Get the level 1 entry */ cmpwi cr1, r10, 0 beq cr1, 1f - stw r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Set the level 1 entry */ + stw r10, 0(r11) /* Set the level 1 entry */ mfspr r10, SPRN_M_TW mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 @@ -412,9 +405,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ tophys(r11, r10) mfspr r11, SPRN_M_TWB /* Get level 1 table */ rlwinm r11, r11, 0, 20, 31 - oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha + oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@h + ori r11, r11, (swapper_pg_dir - PAGE_OFFSET)@l 3: - lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ + lwz r11, 0(r11) /* Get the level 1 entry */ rlwinm r11, r11, 0, ~_PMD_PAGE_8M mtspr SPRN_MD_TWC, r11 mfspr r11, SPRN_MD_TWC @@ -535,7 +529,8 @@ start_here: li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) - lis r6, swapper_pg_dir@ha + lis r6, swapper_pg_dir@h + ori r6, r6, swapper_pg_dir@l tophys(r6,r6) mtspr SPRN_M_TWB, r6 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 0b5c1993809eb0..75471fb6fb1014 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -7,7 +7,7 @@ #include #include /* for THREAD_SHIFT */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * Macros used for common Book-e exception handling @@ -522,5 +522,5 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) bl kernel_fp_unavailable_exception; \ b interrupt_return -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __HEAD_BOOKE_H__ */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 126bf3b06ab7e2..2a44bc8e2439f2 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -209,8 +209,7 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, char *secstrings, struct module *me) { - /* One extra reloc so it's always 0-addr terminated */ - unsigned long relocs = 1; + unsigned long relocs = 0; unsigned i; /* Every relocated section... */ @@ -705,7 +704,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. 
entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { + for (i = 0; i < me->arch.stub_count; i++) { if (WARN_ON(i >= num_stubs)) return 0; @@ -716,6 +715,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, if (!create_stub(sechdrs, &stubs[i], addr, me, name)) return 0; + me->arch.stub_count++; return (unsigned long)&stubs[i]; } @@ -1118,29 +1118,19 @@ int module_trampoline_target(struct module *mod, unsigned long addr, static int setup_ftrace_ool_stubs(const Elf64_Shdr *sechdrs, unsigned long addr, struct module *me) { #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE - unsigned int i, total_stubs, num_stubs; + unsigned int total_stubs, num_stubs; struct ppc64_stub_entry *stub; total_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stub); num_stubs = roundup(me->arch.ool_stub_count * sizeof(struct ftrace_ool_stub), sizeof(struct ppc64_stub_entry)) / sizeof(struct ppc64_stub_entry); - /* Find the next available entry */ - stub = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; stub_func_addr(stub[i].funcdata); i++) - if (WARN_ON(i >= total_stubs)) - return -1; - - if (WARN_ON(i + num_stubs > total_stubs)) + if (WARN_ON(me->arch.stub_count + num_stubs > total_stubs)) return -1; - stub += i; - me->arch.ool_stubs = (struct ftrace_ool_stub *)stub; - - /* reserve stubs */ - for (i = 0; i < num_stubs; i++) - if (patch_u32((void *)&stub->funcdata, PPC_RAW_NOP())) - return -1; + stub = (void *)sechdrs[me->arch.stubs_section].sh_addr; + me->arch.ool_stubs = (struct ftrace_ool_stub *)(stub + me->arch.stub_count); + me->arch.stub_count += num_stubs; #endif return 0; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 855e0988650326..eb23966ac0a9f0 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1805,7 +1805,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) f = ret_from_kernel_user_thread; } else { struct pt_regs *regs = current_pt_regs(); - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; /* Copy registers */ diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index e61245c4468e53..8d81c1e7a8db13 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -98,6 +98,8 @@ DEFINE_MUTEX(rtas_ibm_get_vpd_lock); DEFINE_MUTEX(rtas_ibm_get_indices_lock); DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock); DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock); +DEFINE_MUTEX(rtas_ibm_receive_hvpipe_msg_lock); +DEFINE_MUTEX(rtas_ibm_send_hvpipe_msg_lock); static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__CHECK_EXCEPTION] = { @@ -373,6 +375,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = { .name = "ibm,read-slot-reset-state2", }, + [RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG] = { + .name = "ibm,receive-hvpipe-msg", + .filter = &(const struct rtas_filter) { + .buf_idx1 = 0, .size_idx1 = 1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.32.1 + */ + .lock = &rtas_ibm_receive_hvpipe_msg_lock, + }, [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { .name = "ibm,remove-pe-dma-window", }, @@ -391,6 +404,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx2 = -1, .size_idx2 = -1, }, }, + [RTAS_FNIDX__IBM_SEND_HVPIPE_MSG] = { + .name = "ibm,send-hvpipe-msg", + .filter = &(const struct rtas_filter) { + .buf_idx1 =
1, .size_idx1 = -1, + .buf_idx2 = -1, .size_idx2 = -1, + }, + /* + * PAPR+ v2.13 R1–7.3.32.2 + */ + .lock = &rtas_ibm_send_hvpipe_msg_lock, + }, [RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = { .name = "ibm,set-dynamic-indicator", .filter = &(const struct rtas_filter) { diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 9bba469239fcd0..6336ec9aedd0af 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -89,6 +89,8 @@ static char *rtas_event_type(int type) return "Platform Resource Reassignment Event"; case RTAS_TYPE_HOTPLUG: return "Hotplug Event"; + case RTAS_TYPE_HVPIPE: + return "Hypervisor Pipe Notification event"; } return rtas_type[0]; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc20743..68edb66c2964ba 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. */ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - 
SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 8224381c1dba34..4bbeb8644d3da4 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -137,7 +137,7 @@ EXPORT_SYMBOL_GPL(rtc_lock); static u64 tb_to_ns_scale __read_mostly; static unsigned tb_to_ns_shift __read_mostly; -static u64 boot_tb __read_mostly; +static u64 boot_tb __ro_after_init; extern struct timezone sys_tz; static long timezone_offset; @@ -639,6 +639,12 @@ notrace unsigned long long sched_clock(void) return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } +#ifdef CONFIG_PPC_SPLPAR +u64 get_boot_tb(void) +{ + return boot_tb; +} +#endif #ifdef CONFIG_PPC_PSERIES diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 6dca92d5a6e822..841d077e28251a 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -488,8 +488,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return ret; /* Set up out-of-line stub */ - if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) - return ftrace_init_ool_stub(mod, rec); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { + ret = ftrace_init_ool_stub(mod, rec); + goto out; + } /* Nop-out the ftrace location */ new = ppc_inst(PPC_RAW_NOP()); @@ -520,6 +522,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return -EINVAL; } +out: + if (!ret) + ret = ftrace_rec_set_nop_ops(rec); + return ret; } diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index 3565c67fc63859..6599fe3c62347f 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -409,23 +409,31 @@ EXPORT_SYMBOL(_mcount) _GLOBAL(return_to_handler) /* need to save return values */ #ifdef CONFIG_PPC64 - std r4, -32(r1) - std r3, -24(r1) + stdu r1, -SWITCH_FRAME_SIZE(r1) + std r4, GPR4(r1) + std r3, GPR3(r1) + /* Save previous stack pointer (r1) */ + addi r3, r1, SWITCH_FRAME_SIZE + std r3, GPR1(r1) /* save TOC */ - std r2, -16(r1) - std r31, -8(r1) + std r2, 24(r1) + std r31, 32(r1) mr r31, r1 - stdu r1, -112(r1) - + /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */ + addi r3, r1, STACK_INT_FRAME_REGS /* * We might be called from a module. * Switch to our TOC to run inside the core kernel. 
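	 * (The frame built above is a pt_regs-style SWITCH_FRAME: the saved
	 * return values sit at GPR3/GPR4, the caller's stack pointer at
	 * GPR1, and r3 points at STACK_INT_FRAME_REGS, so
	 * ftrace_return_to_handler receives a ftrace_regs/pt_regs pointer.)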
*/ LOAD_PACA_TOC() #else - stwu r1, -16(r1) - stw r3, 8(r1) - stw r4, 12(r1) + stwu r1, -SWITCH_FRAME_SIZE(r1) + stw r4, GPR4(r1) + stw r3, GPR3(r1) + addi r3, r1, SWITCH_FRAME_SIZE + stw r3, GPR1(r1) + /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */ + addi r3, r1, STACK_INT_FRAME_REGS #endif bl ftrace_return_to_handler @@ -435,15 +443,15 @@ _GLOBAL(return_to_handler) mtlr r3 #ifdef CONFIG_PPC64 - ld r1, 0(r1) - ld r4, -32(r1) - ld r3, -24(r1) - ld r2, -16(r1) - ld r31, -8(r1) + ld r4, GPR4(r1) + ld r3, GPR3(r1) + ld r2, 24(r1) + ld r31, 32(r1) + ld r1, 0(r1) #else - lwz r3, 8(r1) - lwz r4, 12(r1) - addi r1, r1, 16 + lwz r3, GPR3(r1) + lwz r4, GPR4(r1) + addi r1, r1, SWITCH_FRAME_SIZE #endif /* Jump back to real return address */ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 219d67bcf747e7..ab7c4cc80943ce 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -40,8 +41,6 @@ static_assert(__VDSO_PAGES == VDSO_NR_PAGES); extern char vdso32_start, vdso32_end; extern char vdso64_start, vdso64_end; -long sys_ni_syscall(void); - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, unsigned long text_size) { diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index bcc7e4dff8c305..95ab4cdf582ebd 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -9,6 +9,7 @@ #include #include #include +#include #define MAX_NODES 4 @@ -708,26 +709,26 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b qnodesp->count--; } -void queued_spin_lock_slowpath(struct qspinlock *lock) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock) { + trace_contention_begin(lock, LCB_F_SPIN); /* * This looks funny, but it induces the compiler to inline both * sides of the branch rather than share code as when the condition * is passed as the paravirt argument to the functions. */ if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { - if (try_to_steal_lock(lock, true)) { + if (try_to_steal_lock(lock, true)) spec_barrier(); - return; - } - queued_spin_lock_mcs_queue(lock, true); + else + queued_spin_lock_mcs_queue(lock, true); } else { - if (try_to_steal_lock(lock, false)) { + if (try_to_steal_lock(lock, false)) spec_barrier(); - return; - } - queued_spin_lock_mcs_queue(lock, false); + else + queued_spin_lock_mcs_queue(lock, false); } + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index be9c4106e22f04..c42ecdf94e48cd 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -204,7 +204,7 @@ int mmu_mark_initmem_nx(void) for (i = 0; i < nb - 1 && base < top;) { size = bat_block_size(base, top); - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } if (base < top) { @@ -215,7 +215,7 @@ int mmu_mark_initmem_nx(void) pr_warn("Some RW data is getting mapped X. 
" "Adjust CONFIG_DATA_SHIFT to avoid that.\n"); } - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); base += size; } for (; i < nb; i++) diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index a1a4e697251aa6..28a96a10c90778 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -203,15 +203,7 @@ static unsigned int steal_context_up(unsigned int id) static void set_context(unsigned long id, pgd_t *pgd) { if (IS_ENABLED(CONFIG_PPC_8xx)) { - s16 offset = (s16)(__pa(swapper_pg_dir)); - - /* - * Register M_TWB will contain base address of level 1 table minus the - * lower part of the kernel PGDIR base address, so that all accesses to - * level 1 table are done relative to lower part of kernel PGDIR base - * address. - */ - mtspr(SPRN_M_TWB, __pa(pgd) - offset); + mtspr(SPRN_M_TWB, __pa(pgd)); /* Update context */ mtspr(SPRN_M_CASID, id - 1); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 15276068f657df..0c9ef705803e93 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -104,7 +104,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) p = memstart_addr + s; for (; s < top; s += PAGE_SIZE) { ktext = core_kernel_text(v); - map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); + map_kernel_page(v, p, ktext ? PAGE_KERNEL_X : PAGE_KERNEL); v += PAGE_SIZE; p += PAGE_SIZE; } diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 4c26912c2e3c36..8334cd667bba1e 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -8,7 +8,7 @@ #ifndef _BPF_JIT_H #define _BPF_JIT_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -161,9 +161,11 @@ struct codegen_context { unsigned int seen; unsigned int idx; unsigned int stack_size; - int b2p[MAX_BPF_JIT_REG + 2]; + int b2p[MAX_BPF_JIT_REG + 3]; unsigned int exentry_idx; unsigned int alt_exit_addr; + u64 arena_vm_start; + u64 user_vm_start; }; #define bpf_to_ppc(r) (ctx->b2p[r]) @@ -201,7 +203,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, struct codegen_context *ctx, int insn_idx, - int jmp_off, int dst_reg); + int jmp_off, int dst_reg, u32 code); #endif diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index c0684733e9d6ac..88ad5ba7b87fd0 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -204,6 +204,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Make sure that the stack is quadword aligned. 
*/ cgctx.stack_size = round_up(fp->aux->stack_depth, 16); + cgctx.arena_vm_start = bpf_arena_get_kern_vm_start(fp->aux->arena); + cgctx.user_vm_start = bpf_arena_get_user_vm_start(fp->aux->arena); /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { @@ -326,7 +328,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, struct codegen_context *ctx, int insn_idx, int jmp_off, - int dst_reg) + int dst_reg, u32 code) { off_t offset; unsigned long pc; @@ -355,6 +357,9 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass (ctx->exentry_idx * BPF_FIXUP_LEN * 4); fixup[0] = PPC_RAW_LI(dst_reg, 0); + if (BPF_CLASS(code) == BPF_ST || BPF_CLASS(code) == BPF_STX) + fixup[0] = PPC_RAW_NOP(); + if (IS_ENABLED(CONFIG_PPC32)) fixup[1] = PPC_RAW_LI(dst_reg - 1, 0); /* clear higher 32-bit register too */ @@ -435,11 +440,32 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_arena(void) +{ + return IS_ENABLED(CONFIG_PPC64); +} + bool bpf_jit_supports_far_kfunc_call(void) { return IS_ENABLED(CONFIG_PPC64); } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + return IS_ENABLED(CONFIG_PPC64); + } + return true; +} + void *arch_alloc_bpf_trampoline(unsigned int size) { return bpf_prog_pack_alloc(size, bpf_jit_fill_ill_insns); @@ -579,7 +605,7 @@ static void bpf_trampoline_setup_tail_call_cnt(u32 *image, struct codegen_contex { if (IS_ENABLED(CONFIG_PPC64)) { /* See bpf_jit_stack_tailcallcnt() */ - int tailcallcnt_offset = 6 * 8; + int tailcallcnt_offset = 7 * 8; EMIT(PPC_RAW_LL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); EMIT(PPC_RAW_STL(_R3, _R1, -tailcallcnt_offset)); @@ -594,7 +620,7 @@ static void bpf_trampoline_restore_tail_call_cnt(u32 *image, struct codegen_cont { if (IS_ENABLED(CONFIG_PPC64)) { /* See bpf_jit_stack_tailcallcnt() */ - int tailcallcnt_offset = 6 * 8; + int tailcallcnt_offset = 7 * 8; EMIT(PPC_RAW_LL(_R3, _R1, -tailcallcnt_offset)); EMIT(PPC_RAW_STL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 0aace304dfe191..3087e744fb2504 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -1087,7 +1087,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code } ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, insn_idx, - jmp_off, dst_reg); + jmp_off, dst_reg, code); if (ret) return ret; } diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 025524378443e6..1fe37128c87640 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -25,18 +25,18 @@ * with our redzone usage. 
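 * (The nv gpr save area grows from 5 to 6 entries in this patch to make
 * room for r26, which caches the arena kernel VM start address.)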
* * [ prev sp ] <------------- - * [ nv gpr save area ] 5*8 | + * [ nv gpr save area ] 6*8 | * [ tail_call_cnt ] 8 | - * [ local_tmp_var ] 16 | + * [ local_tmp_var ] 24 | * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ /* for gpr non volatile registers BPG_REG_6 to 10 */ -#define BPF_PPC_STACK_SAVE (5*8) +#define BPF_PPC_STACK_SAVE (6*8) /* for bpf JIT code internal usage */ -#define BPF_PPC_STACK_LOCALS 24 +#define BPF_PPC_STACK_LOCALS 32 /* stack frame excluding BPF stack, ensure this is quadword aligned */ #define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) @@ -44,6 +44,7 @@ /* BPF register usage */ #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) +#define ARENA_VM_START (MAX_BPF_JIT_REG + 2) /* BPF to ppc register mappings */ void bpf_jit_init_reg_mapping(struct codegen_context *ctx) @@ -67,10 +68,12 @@ void bpf_jit_init_reg_mapping(struct codegen_context *ctx) ctx->b2p[BPF_REG_AX] = _R12; ctx->b2p[TMP_REG_1] = _R9; ctx->b2p[TMP_REG_2] = _R10; + /* non volatile register for kern_vm_start address */ + ctx->b2p[ARENA_VM_START] = _R26; } -/* PPC NVR range -- update this if we ever use NVRs below r27 */ -#define BPF_PPC_NVR_MIN _R27 +/* PPC NVR range -- update this if we ever use NVRs below r26 */ +#define BPF_PPC_NVR_MIN _R26 static inline bool bpf_has_stack_frame(struct codegen_context *ctx) { @@ -89,9 +92,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... ] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 5*8 + * [ nv gpr save area ] 6*8 * [ tail_call_cnt ] 8 - * [ local_tmp_var ] 16 + * [ local_tmp_var ] 24 * [ unused red zone ] 224 */ static int bpf_jit_stack_local(struct codegen_context *ctx) @@ -99,12 +102,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx) if (bpf_has_stack_frame(ctx)) return STACK_FRAME_MIN_SIZE + ctx->stack_size; else - return -(BPF_PPC_STACK_SAVE + 24); + return -(BPF_PPC_STACK_SAVE + 32); } static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) { - return bpf_jit_stack_local(ctx) + 16; + return bpf_jit_stack_local(ctx) + 24; } static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) @@ -170,10 +173,17 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) if (bpf_is_seen_register(ctx, bpf_to_ppc(i))) EMIT(PPC_RAW_STD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i)))); + if (ctx->arena_vm_start) + EMIT(PPC_RAW_STD(bpf_to_ppc(ARENA_VM_START), _R1, + bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START)))); + /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP))) EMIT(PPC_RAW_ADDI(bpf_to_ppc(BPF_REG_FP), _R1, STACK_FRAME_MIN_SIZE + ctx->stack_size)); + + if (ctx->arena_vm_start) + PPC_LI64(bpf_to_ppc(ARENA_VM_START), ctx->arena_vm_start); } static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) @@ -185,6 +195,10 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx if (bpf_is_seen_register(ctx, bpf_to_ppc(i))) EMIT(PPC_RAW_LD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i)))); + if (ctx->arena_vm_start) + EMIT(PPC_RAW_LD(bpf_to_ppc(ARENA_VM_START), _R1, + bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START)))); + /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME + 
ctx->stack_size)); @@ -396,11 +410,11 @@ void bpf_stf_barrier(void); asm ( " .global bpf_stf_barrier ;" " bpf_stf_barrier: ;" -" std 21,-64(1) ;" -" std 22,-56(1) ;" +" std 21,-80(1) ;" +" std 22,-72(1) ;" " sync ;" -" ld 21,-64(1) ;" -" ld 22,-56(1) ;" +" ld 21,-80(1) ;" +" ld 22,-72(1) ;" " ori 31,31,0 ;" " .rept 14 ;" " b 1f ;" @@ -409,6 +423,141 @@ asm ( " blr ;" ); +static int bpf_jit_emit_atomic_ops(u32 *image, struct codegen_context *ctx, + const struct bpf_insn *insn, u32 *jmp_off, + u32 *tmp_idx, u32 *addrp) +{ + u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); + u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); + u32 size = BPF_SIZE(insn->code); + u32 src_reg = bpf_to_ppc(insn->src_reg); + u32 dst_reg = bpf_to_ppc(insn->dst_reg); + s32 imm = insn->imm; + + u32 save_reg = tmp2_reg; + u32 ret_reg = src_reg; + u32 fixup_idx; + + /* Get offset into TMP_REG_1 */ + EMIT(PPC_RAW_LI(tmp1_reg, insn->off)); + /* + * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' + * before and after the operation. + * + * This is a requirement in the Linux Kernel Memory Model. + * See __cmpxchg_u64() in asm/cmpxchg.h as an example. + */ + if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); + + *tmp_idx = ctx->idx; + + /* load value from memory into TMP_REG_2 */ + if (size == BPF_DW) + EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0)); + else + EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0)); + /* Save old value in _R0 */ + if (imm & BPF_FETCH) + EMIT(PPC_RAW_MR(_R0, tmp2_reg)); + + switch (imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_AND: + case BPF_AND | BPF_FETCH: + EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_OR: + case BPF_OR | BPF_FETCH: + EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg)); + break; + case BPF_CMPXCHG: + /* + * Return old value in BPF_REG_0 for BPF_CMPXCHG & + * in src_reg for other cases. + */ + ret_reg = bpf_to_ppc(BPF_REG_0); + + /* Compare with old value in BPF_R0 */ + if (size == BPF_DW) + EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg)); + else + EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg)); + /* Don't set if different from old value */ + PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4); + fallthrough; + case BPF_XCHG: + save_reg = src_reg; + break; + default: + return -EOPNOTSUPP; + } + + /* store new value */ + if (size == BPF_DW) + EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg)); + else + EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg)); + /* we're done if this succeeded */ + PPC_BCC_SHORT(COND_NE, *tmp_idx * 4); + fixup_idx = ctx->idx; + + if (imm & BPF_FETCH) { + /* Emit 'sync' to enforce full ordering */ + if (IS_ENABLED(CONFIG_SMP)) + EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_MR(ret_reg, _R0)); + /* + * Skip unnecessary zero-extension for 32-bit cmpxchg. + * For context, see commit 39491867ace5. 
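+	 *
+	 * (The *tmp_idx/*jmp_off pair returned to the caller describes this
+	 * loop: a fault on the ldarx/lwarx at *tmp_idx resumes *jmp_off
+	 * bytes later, just past the store-conditional retry sequence.)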
+ */ + if (size != BPF_DW && imm == BPF_CMPXCHG && + insn_is_zext(insn + 1)) + *addrp = ctx->idx * 4; + } + + *jmp_off = (fixup_idx - *tmp_idx) * 4; + + return 0; +} + +static int bpf_jit_emit_probe_mem_store(struct codegen_context *ctx, u32 src_reg, s16 off, + u32 code, u32 *image) +{ + u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); + u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); + + switch (BPF_SIZE(code)) { + case BPF_B: + EMIT(PPC_RAW_STB(src_reg, tmp1_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_STH(src_reg, tmp1_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_STW(src_reg, tmp1_reg, off)); + break; + case BPF_DW: + if (off % 4) { + EMIT(PPC_RAW_LI(tmp2_reg, off)); + EMIT(PPC_RAW_STDX(src_reg, tmp1_reg, tmp2_reg)); + } else { + EMIT(PPC_RAW_STD(src_reg, tmp1_reg, off)); + } + break; + default: + return -EINVAL; + } + return 0; +} + static int emit_atomic_ld_st(const struct bpf_insn insn, struct codegen_context *ctx, u32 *image) { u32 code = insn.code; @@ -494,7 +643,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u32 size = BPF_SIZE(code); u32 tmp1_reg = bpf_to_ppc(TMP_REG_1); u32 tmp2_reg = bpf_to_ppc(TMP_REG_2); - u32 save_reg, ret_reg; s16 off = insn[i].off; s32 imm = insn[i].imm; bool func_addr_fixed; @@ -502,6 +650,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u64 imm64; u32 true_cond; u32 tmp_idx; + u32 jmp_off; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -768,6 +917,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code */ case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ + + if (insn_is_cast_user(&insn[i])) { + EMIT(PPC_RAW_RLDICL_DOT(tmp1_reg, src_reg, 0, 32)); + PPC_LI64(dst_reg, (ctx->user_vm_start & 0xffffffff00000000UL)); + PPC_BCC_SHORT(COND_EQ, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_OR(tmp1_reg, dst_reg, tmp1_reg)); + EMIT(PPC_RAW_MR(dst_reg, tmp1_reg)); + break; + } + if (imm == 1) { /* special mov32 for zext */ EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 0, 31)); @@ -960,6 +1119,76 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code } break; + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + + EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + + ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image); + if (ret) + return ret; + + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + ctx->idx - 1, 4, -1, code); + if (ret) + return ret; + + break; + + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + + EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + + if (BPF_SIZE(code) == BPF_W || BPF_SIZE(code) == BPF_DW) { + PPC_LI32(tmp2_reg, imm); + src_reg = tmp2_reg; + } else { + EMIT(PPC_RAW_LI(tmp2_reg, imm)); + src_reg = tmp2_reg; + } + + ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image); + if (ret) + return ret; + + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + ctx->idx - 1, 4, -1, code); + if (ret) + return ret; + + break; + + /* + * BPF_STX PROBE_ATOMIC (arena atomic ops) + */ + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: + EMIT(PPC_RAW_ADD(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + ret = 
bpf_jit_emit_atomic_ops(image, ctx, &insn[i], + &jmp_off, &tmp_idx, &addrs[i + 1]); + if (ret) { + if (ret == -EOPNOTSUPP) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + } + return ret; + } + /* LDARX/LWARX should land here on exception. */ + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + tmp_idx, jmp_off, dst_reg, code); + if (ret) + return ret; + + /* Retrieve the dst_reg */ + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START))); + break; + /* * BPF_STX ATOMIC (atomic ops) */ @@ -982,93 +1211,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code return -EOPNOTSUPP; } - save_reg = tmp2_reg; - ret_reg = src_reg; - - /* Get offset into TMP_REG_1 */ - EMIT(PPC_RAW_LI(tmp1_reg, off)); - /* - * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' - * before and after the operation. - * - * This is a requirement in the Linux Kernel Memory Model. - * See __cmpxchg_u64() in asm/cmpxchg.h as an example. - */ - if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) - EMIT(PPC_RAW_SYNC()); - tmp_idx = ctx->idx * 4; - /* load value from memory into TMP_REG_2 */ - if (size == BPF_DW) - EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0)); - else - EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0)); - - /* Save old value in _R0 */ - if (imm & BPF_FETCH) - EMIT(PPC_RAW_MR(_R0, tmp2_reg)); - - switch (imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_AND: - case BPF_AND | BPF_FETCH: - EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_OR: - case BPF_OR | BPF_FETCH: - EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg)); - break; - case BPF_CMPXCHG: - /* - * Return old value in BPF_REG_0 for BPF_CMPXCHG & - * in src_reg for other cases. - */ - ret_reg = bpf_to_ppc(BPF_REG_0); - - /* Compare with old value in BPF_R0 */ - if (size == BPF_DW) - EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg)); - else - EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg)); - /* Don't set if different from old value */ - PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4); - fallthrough; - case BPF_XCHG: - save_reg = src_reg; - break; - default: - pr_err_ratelimited( - "eBPF filter atomic op code %02x (@%d) unsupported\n", - code, i); - return -EOPNOTSUPP; - } - - /* store new value */ - if (size == BPF_DW) - EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg)); - else - EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg)); - /* we're done if this succeeded */ - PPC_BCC_SHORT(COND_NE, tmp_idx); - - if (imm & BPF_FETCH) { - /* Emit 'sync' to enforce full ordering */ - if (IS_ENABLED(CONFIG_SMP)) - EMIT(PPC_RAW_SYNC()); - EMIT(PPC_RAW_MR(ret_reg, _R0)); - /* - * Skip unnecessary zero-extension for 32-bit cmpxchg. - * For context, see commit 39491867ace5. - */ - if (size != BPF_DW && imm == BPF_CMPXCHG && - insn_is_zext(&insn[i + 1])) - addrs[++i] = ctx->idx * 4; + ret = bpf_jit_emit_atomic_ops(image, ctx, &insn[i], + &jmp_off, &tmp_idx, &addrs[i + 1]); + if (ret) { + if (ret == -EOPNOTSUPP) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + } + return ret; } break; @@ -1112,9 +1263,10 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * Check if 'off' is word aligned for BPF_DW, because * we might generate two instructions. 
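 * e.g. an unaligned BPF_DW access is emitted as "li tmp2, off" plus an
 * indexed load, so the jump must skip one extra instruction.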
*/ - if ((BPF_SIZE(code) == BPF_DW || - (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_PROBE_MEMSX)) && - (off & 3)) + if ((BPF_SIZE(code) == BPF_DW && (off & 3)) || + (BPF_SIZE(code) == BPF_B && + BPF_MODE(code) == BPF_PROBE_MEMSX) || + (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_MEMSX)) PPC_JMP((ctx->idx + 3) * 4); else PPC_JMP((ctx->idx + 2) * 4); @@ -1160,12 +1312,49 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code if (BPF_MODE(code) == BPF_PROBE_MEM) { ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, - ctx->idx - 1, 4, dst_reg); + ctx->idx - 1, 4, dst_reg, code); if (ret) return ret; } break; + /* dst = *(u64 *)(ul) (src + ARENA_VM_START + off) */ + case BPF_LDX | BPF_PROBE_MEM32 | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: + case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + + EMIT(PPC_RAW_ADD(tmp1_reg, src_reg, bpf_to_ppc(ARENA_VM_START))); + + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, tmp1_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_LHZ(dst_reg, tmp1_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, tmp1_reg, off)); + break; + case BPF_DW: + if (off % 4) { + EMIT(PPC_RAW_LI(tmp2_reg, off)); + EMIT(PPC_RAW_LDX(dst_reg, tmp1_reg, tmp2_reg)); + } else { + EMIT(PPC_RAW_LD(dst_reg, tmp1_reg, off)); + } + break; + } + + if (size != BPF_DW && insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; + + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + ctx->idx - 1, 4, dst_reg, code); + if (ret) + return ret; + break; + /* * Doubleword load * 16 byte instruction that uses two 'struct bpf_insn' diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 7f53fcb7495a8a..78dd7e25219ee5 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o -obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o +obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o vpa-dtl.o obj-$(CONFIG_VPA_PMU) += vpa-pmu.o diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c new file mode 100644 index 00000000000000..3c1d1c28deb9a2 --- /dev/null +++ b/arch/powerpc/perf/vpa-dtl.c @@ -0,0 +1,596 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Perf interface to expose Dispatch Trace Log counters. + * + * Copyright (C) 2024 Kajol Jain, IBM Corporation + */ + +#ifdef CONFIG_PPC_SPLPAR +#define pr_fmt(fmt) "vpa_dtl: " fmt + +#include +#include +#include +#include + +#define EVENT(_name, _code) enum{_name = _code} + +/* + * Based on Power Architecture Platform Reference (PAPR) documentation, + * Table 14.14. Per Virtual Processor Area, the Dispatch Trace Log (DTL) + * Enable Mask below is used to get the corresponding virtual processor + * dispatch to preempt traces: + * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual + * processor waits + * DTL_PREEMPT(0x2): Trace time slice preempts + * DTL_FAULT(0x4): Trace virtual partition memory page + * faults. + * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT) + * + * Event codes based on Dispatch Trace Log Enable Mask.
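+ *
+ * A hypothetical capture session using the sysfs event names defined
+ * below (the perf invocation is illustrative only):
+ *
+ *	perf record -a -e vpa_dtl/dtl_all/ -- sleep 1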
+ */ +EVENT(DTL_CEDE, 0x1); +EVENT(DTL_PREEMPT, 0x2); +EVENT(DTL_FAULT, 0x4); +EVENT(DTL_ALL, 0x7); + +GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE); +GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT); +GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT); +GENERIC_EVENT_ATTR(dtl_all, DTL_ALL); + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *events_attr[] = { + GENERIC_EVENT_PTR(DTL_CEDE), + GENERIC_EVENT_PTR(DTL_PREEMPT), + GENERIC_EVENT_PTR(DTL_FAULT), + GENERIC_EVENT_PTR(DTL_ALL), + NULL +}; + +static struct attribute_group event_group = { + .name = "events", + .attrs = events_attr, +}; + +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; + +static const struct attribute_group format_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &format_group, + &event_group, + NULL, +}; + +struct vpa_dtl { + struct dtl_entry *buf; + u64 last_idx; +}; + +struct vpa_pmu_ctx { + struct perf_output_handle handle; +}; + +struct vpa_pmu_buf { + int nr_pages; + bool snapshot; + u64 *base; + u64 size; + u64 head; + u64 head_size; + /* boot timebase and frequency need to be saved only once */ + int boottb_freq_saved; + u64 threshold; + bool full; +}; + +/* + * To correlate each DTL entry with other events across CPUs, + * we need to map timebase from "struct dtl_entry" which phyp + * provides with boot timebase. This also needs timebase frequency. + * Formula is: ((timebase from DTL entry - boot timebase) / timebase frequency) + * + * To match the size of "struct dtl_entry" and ease post processing, + * the structure is padded with 24 bytes. + */ +struct boottb_freq { + u64 boot_tb; + u64 tb_freq; + u64 timebase; + u64 padded[3]; +}; + +static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx); +static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu); + +/* variable to capture reference count for the active dtl threads */ +static int dtl_global_refc; +static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock); + +/* + * Capture DTL data in AUX buffer + */ +static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf, + struct vpa_dtl *dtl, int index) +{ + struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base; + + /* + * Check if there is enough space to contain the + * DTL data. If not, save only as much data as the + * available memory allows and set full to true. + */ + if (buf->head + *n_entries >= buf->threshold) { + *n_entries = buf->threshold - buf->head; + buf->full = 1; + } + + /* + * Copy to AUX buffer from per-thread address + */ + memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry)); + + if (buf->full) { + /* + * Set the head of the private aux to zero when the buffer + * is full so that the next data will be copied to the + * beginning of the buffer + */ + buf->head = 0; + return; + } + + buf->head += *n_entries; + + return; +} + +/* + * Function to dump the dispatch trace log buffer data to the + * perf data. + * + * perf_aux_output_begin: This function is called before writing + * to the AUX area. It returns the pointer to the aux area private structure, + * i.e. "struct vpa_pmu_buf" here, which is set in the setup_aux() function. + * The function obtains the output handle (used in perf_aux_output_end). + * When capture completes in vpa_dtl_capture_aux(), perf_aux_output_end() is + * called to commit the recorded data. + * + * perf_aux_output_end: This function commits data by adjusting the + * aux_head of "struct perf_buffer".
aux_tail will be moved on the perf tools side when writing the data from the aux buffer to the perf.data file on disk. + * + * Here in the private aux structure, we maintain head to know where + * to copy data next time in the PMU driver. vpa_pmu_buf->head is moved to + * maintain the aux head for the PMU driver. It is the responsibility of the PMU + * driver to make sure data is copied between perf_aux_output_begin and + * perf_aux_output_end. + * + * After data is copied in vpa_dtl_capture_aux() function, perf_aux_output_end() + * is called to move the aux->head of "struct perf_buffer" to indicate the size of + * data in the aux buffer. This will post a PERF_RECORD_AUX into the perf buffer. + * Data will be written to disk only when the allocated buffer is full. + * + * By this approach, all the DTL data will be present as-is in the + * perf.data. The data will be pre-processed on the perf tools side when doing + * perf report/perf script, and this avoids the time taken to create samples + * in kernel space. + */ +static void vpa_dtl_dump_sample_data(struct perf_event *event) +{ + u64 cur_idx, last_idx, i; + u64 boot_tb; + struct boottb_freq boottb_freq; + + /* actual number of entries read */ + long n_read = 0, read_size = 0; + + /* number of entries added to dtl buffer */ + long n_req; + + struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx); + + struct vpa_pmu_buf *aux_buf; + + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu); + u64 size; + + cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx); + last_idx = dtl->last_idx; + + if (last_idx + N_DISPATCH_LOG <= cur_idx) + last_idx = cur_idx - N_DISPATCH_LOG + 1; + + n_req = cur_idx - last_idx; + + /* no new entry added to the buffer, return */ + if (n_req <= 0) + return; + + dtl->last_idx = last_idx + n_req; + boot_tb = get_boot_tb(); + + i = last_idx % N_DISPATCH_LOG; + + aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event); + if (!aux_buf) { + pr_debug("returning. no aux\n"); + return; + } + + if (!aux_buf->boottb_freq_saved) { + pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb); + /* Save boot_tb to convert the raw timebase to time relative to system boot */ + boottb_freq.boot_tb = boot_tb; + /* Save tb_ticks_per_sec to convert timebase to sec */ + boottb_freq.tb_freq = tb_ticks_per_sec; + boottb_freq.timebase = 0; + memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq)); + aux_buf->head += 1; + aux_buf->boottb_freq_saved = 1; + n_read += 1; + } + + /* read the tail of the buffer if we've wrapped */ + if (i + n_req > N_DISPATCH_LOG) { + read_size = N_DISPATCH_LOG - i; + vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i); + n_req -= read_size; + n_read += read_size; + i = 0; + if (aux_buf->full) { + size = (n_read * sizeof(struct dtl_entry)); + if ((size + aux_buf->head_size) > aux_buf->size) { + size = aux_buf->size - aux_buf->head_size; + perf_aux_output_end(&vpa_ctx->handle, size); + aux_buf->head = 0; + aux_buf->head_size = 0; + } else { + aux_buf->head_size += (n_read * sizeof(struct dtl_entry)); + perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry)); + } + goto out; + } + } + + /* ..
+/*
+ * The VPA dispatch trace log counters do not interrupt on overflow.
+ * Therefore, the kernel needs to poll the counters using an hrtimer
+ * to avoid missing an overflow. The timer interval is based on the
+ * sample_period count provided by the user; the minimum interval is
+ * 1 millisecond.
+ */
+static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+	struct perf_event *event;
+	u64 period;
+
+	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
+	vpa_dtl_dump_sample_data(event);
+	period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return HRTIMER_RESTART;
+}
+
+static void vpa_dtl_start_hrtimer(struct perf_event *event)
+{
+	u64 period;
+	struct hw_perf_event *hwc = &event->hw;
+
+	period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
+	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
+}
+
+static void vpa_dtl_stop_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hrtimer_cancel(&hwc->hrtimer);
+}
+
+static void vpa_dtl_reset_global_refc(struct perf_event *event)
+{
+	spin_lock(&dtl_global_lock);
+	dtl_global_refc--;
+	if (dtl_global_refc <= 0) {
+		dtl_global_refc = 0;
+		up_write(&dtl_access_lock);
+	}
+	spin_unlock(&dtl_global_lock);
+}
+
+static int vpa_dtl_mem_alloc(int cpu)
+{
+	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
+	struct dtl_entry *buf = NULL;
+
+	/* Check for the dispatch trace log buffer cache */
+	if (!dtl_cache)
+		return -ENOMEM;
+
+	/*
+	 * Called under dtl_global_lock (a spinlock), so the
+	 * allocation must not sleep.
+	 */
+	buf = kmem_cache_alloc_node(dtl_cache, GFP_ATOMIC, cpu_to_node(cpu));
+	if (!buf) {
+		pr_warn("buffer allocation failed for cpu %d\n", cpu);
+		return -ENOMEM;
+	}
+	dtl->buf = buf;
+	return 0;
+}
+static int vpa_dtl_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* test the event attr type for PMU enumeration */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (!perfmon_capable())
+		return -EACCES;
+
+	/* Return if this is a counting event */
+	if (!is_sampling_event(event))
+		return -EOPNOTSUPP;
+
+	/* no branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	/* Invalid event code */
+	switch (event->attr.config) {
+	case DTL_LOG_CEDE:
+	case DTL_LOG_PREEMPT:
+	case DTL_LOG_FAULT:
+	case DTL_LOG_ALL:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	spin_lock(&dtl_global_lock);
+
+	/*
+	 * To ensure there are no other conflicting dtl users (for
+	 * example /proc/powerpc/vcpudispatch_stats or the debugfs dtl
+	 * interface), the code below tries to take the
+	 * dtl_access_lock, a rwlock defined in dtl.h. It takes the
+	 * write lock and also checks dtl_global_refc, to make sure
+	 * that the dtl_access_lock is held by the vpa_dtl pmu
+	 * interface.
+	 */
+	if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
+		spin_unlock(&dtl_global_lock);
+		return -EBUSY;
+	}
+
+	/* Allocate dtl buffer memory */
+	if (vpa_dtl_mem_alloc(event->cpu)) {
+		spin_unlock(&dtl_global_lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Increment the number of active vpa_dtl pmu threads.
+	 * dtl_global_refc counts the cpu threads that are currently
+	 * capturing dtl data through the vpa_dtl pmu interface.
+	 */
+	dtl_global_refc++;
+
+	spin_unlock(&dtl_global_lock);
+
+	hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+	/*
+	 * Since hrtimers have a fixed rate, we can do a static freq->period
+	 * mapping and avoid the whole period adjust feedback stuff.
+	 */
+	if (event->attr.freq) {
+		long freq = event->attr.sample_freq;
+
+		event->attr.sample_period = NSEC_PER_SEC / freq;
+		hwc->sample_period = event->attr.sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+		hwc->last_period = hwc->sample_period;
+		event->attr.freq = 0;
+	}
+
+	event->destroy = vpa_dtl_reset_global_refc;
+	return 0;
+}
+
+static int vpa_dtl_event_add(struct perf_event *event, int flags)
+{
+	int ret, hwcpu;
+	unsigned long addr;
+	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+
+	/*
+	 * Register our dtl buffer with the hypervisor. The
+	 * HV expects the buffer size to be passed in the second
+	 * word of the buffer. Refer to section '14.11.3.2. H_REGISTER_VPA'
+	 * of PAPR for more information.
+	 */
+	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
+	dtl->last_idx = 0;
+
+	hwcpu = get_hard_smp_processor_id(event->cpu);
+	addr = __pa(dtl->buf);
+
+	ret = register_dtl(hwcpu, addr);
+	if (ret) {
+		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
+			event->cpu, hwcpu, ret);
+		return ret;
+	}
+
+	/* set our initial buffer indices */
+	lppaca_of(event->cpu).dtl_idx = 0;
+
+	/*
+	 * Ensure that our updates to the lppaca fields have
+	 * occurred before we actually enable the logging
+	 */
+	smp_wmb();
+
+	/* enable event logging */
+	lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;
+
+	vpa_dtl_start_hrtimer(event);
+
+	return 0;
+}
+
+static void vpa_dtl_event_del(struct perf_event *event, int flags)
+{
+	int hwcpu = get_hard_smp_processor_id(event->cpu);
+	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+
+	vpa_dtl_stop_hrtimer(event);
+	unregister_dtl(hwcpu);
+	kmem_cache_free(dtl_cache, dtl->buf);
+	dtl->buf = NULL;
+	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
+}
+
+/*
+ * This function is empty: vpa_dtl_dump_sample_data() is used to
+ * parse and dump the dispatch trace log data to the perf data.
+ */
+static void vpa_dtl_event_read(struct perf_event *event)
+{
+}
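Everything the PMU expects from user space is now in place: a sampling event with one of the four config values, pinned to a CPU, plus an AUX area for setup_aux() to map. A sketch of the corresponding perf_event_open() sequence follows; it assumes 4 KiB pages, a dynamic PMU type read from /sys/bus/event_source/devices/vpa_dtl/type, and elides all error handling:

	#include <linux/perf_event.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int open_vpa_dtl(int pmu_type, int cpu, size_t aux_pages)
	{
		struct perf_event_attr attr = {
			.type = pmu_type,
			.size = sizeof(attr),
			.config = 0x7,			/* dtl_all */
			.sample_period = 1000000,	/* ~1 ms polling */
		};
		int fd = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0);
		struct perf_event_mmap_page *meta;

		/* base mapping: one metadata page plus one data page */
		meta = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		meta->aux_offset = 2 * 4096;
		meta->aux_size = aux_pages * 4096;
		/* second mapping: the AUX area that setup_aux() vmap()s */
		mmap(NULL, meta->aux_size, PROT_READ | PROT_WRITE,
		     MAP_SHARED, fd, meta->aux_offset);
		return fd;
	}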
+/*
+ * Set up pmu-private data structures for an AUX area.
+ * **pages contains the aux buffer allocated for this event for the
+ * corresponding cpu. rb_alloc_aux uses "alloc_pages_node" and
+ * returns the address of each page. Map these pages into
+ * contiguous space using vmap and use that as the base address.
+ *
+ * The aux private data structure, i.e. "struct vpa_pmu_buf",
+ * mainly saves:
+ * - buf->base: aux buffer base address
+ * - buf->head: offset from the base address where data will be
+ *   written next
+ * - buf->size: size of the allocated memory
+ */
+static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
+			       int nr_pages, bool snapshot)
+{
+	int i, cpu = event->cpu;
+	struct vpa_pmu_buf *buf __free(kfree) = NULL;
+	struct page **pglist __free(kfree) = NULL;
+
+	/* We need at least one page for this to work. */
+	if (!nr_pages)
+		return NULL;
+
+	if (cpu == -1)
+		cpu = raw_smp_processor_id();
+
+	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
+	if (!buf)
+		return NULL;
+
+	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+	if (!pglist)
+		return NULL;
+
+	for (i = 0; i < nr_pages; ++i)
+		pglist[i] = virt_to_page(pages[i]);
+
+	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!buf->base)
+		return NULL;
+
+	buf->nr_pages = nr_pages;
+	buf->snapshot = false;
+
+	buf->size = nr_pages << PAGE_SHIFT;
+	buf->head = 0;
+	buf->head_size = 0;
+	buf->boottb_freq_saved = 0;
+	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
+	return no_free_ptr(buf);
+}
+
+/*
+ * Free pmu-private AUX data structures
+ */
+static void vpa_dtl_free_aux(void *aux)
+{
+	struct vpa_pmu_buf *buf = aux;
+
+	vunmap(buf->base);
+	kfree(buf);
+}
+
+static struct pmu vpa_dtl_pmu = {
+	.task_ctx_nr = perf_invalid_context,
+
+	.name = "vpa_dtl",
+	.attr_groups = attr_groups,
+	.event_init = vpa_dtl_event_init,
+	.add = vpa_dtl_event_add,
+	.del = vpa_dtl_event_del,
+	.read = vpa_dtl_event_read,
+	.setup_aux = vpa_dtl_setup_aux,
+	.free_aux = vpa_dtl_free_aux,
+	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
+};
+
+static int vpa_dtl_init(void)
+{
+	int r;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		pr_debug("not a shared virtualized system, not enabling\n");
+		return -ENODEV;
+	}
+
+	/* This driver is intended only for the L1 host.
*/ + if (is_kvm_guest()) { + pr_debug("Only supported for L1 host system\n"); + return -ENODEV; + } + + r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1); + if (r) + return r; + + return 0; +} + +device_initcall(vpa_dtl_init); +#endif //CONFIG_PPC_SPLPAR diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 35a1f4b9f8272b..fc79f846693352 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -231,7 +231,6 @@ config PPC4xx_GPIO bool "PPC4xx GPIO support" depends on 44x select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help Enable gpiolib support for ppc440 based boards diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c index 08ab7658256872..aea0d913b59d0b 100644 --- a/arch/powerpc/platforms/44x/gpio.c +++ b/arch/powerpc/platforms/44x/gpio.c @@ -14,10 +14,10 @@ #include #include #include -#include #include #include #include +#include #define GPIO_MASK(gpio) (0x80000000 >> (gpio)) #define GPIO_MASK2(gpio) (0xc0000000 >> ((gpio) * 2)) @@ -45,7 +45,8 @@ struct ppc4xx_gpio { }; struct ppc4xx_gpio_chip { - struct of_mm_gpio_chip mm_gc; + struct gpio_chip gc; + void __iomem *regs; spinlock_t lock; }; @@ -57,8 +58,8 @@ struct ppc4xx_gpio_chip { static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); + struct ppc4xx_gpio __iomem *regs = chip->regs; return !!(in_be32(®s->ir) & GPIO_MASK(gpio)); } @@ -66,8 +67,8 @@ static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); + struct ppc4xx_gpio __iomem *regs = chip->regs; if (val) setbits32(®s->or, GPIO_MASK(gpio)); @@ -93,9 +94,8 @@ static int ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&chip->lock, flags); @@ -123,9 +123,8 @@ static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); - struct ppc4xx_gpio __iomem *regs = mm_gc->regs; + struct ppc4xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&chip->lock, flags); @@ -155,42 +154,57 @@ ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) return 0; } -static int __init ppc4xx_add_gpiochips(void) +static int ppc4xx_gpio_probe(struct platform_device *ofdev) { - struct device_node *np; - - for_each_compatible_node(np, NULL, "ibm,ppc4xx-gpio") { - int ret; - struct ppc4xx_gpio_chip *ppc4xx_gc; - struct of_mm_gpio_chip *mm_gc; - struct gpio_chip *gc; - - ppc4xx_gc = kzalloc(sizeof(*ppc4xx_gc), GFP_KERNEL); - if (!ppc4xx_gc) { - ret = -ENOMEM; - goto err; - } - - spin_lock_init(&ppc4xx_gc->lock); - - mm_gc = &ppc4xx_gc->mm_gc; - gc = &mm_gc->gc; - - gc->ngpio = 32; - 
gc->direction_input = ppc4xx_gpio_dir_in; - gc->direction_output = ppc4xx_gpio_dir_out; - gc->get = ppc4xx_gpio_get; - gc->set = ppc4xx_gpio_set; - - ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc); - if (ret) - goto err; - continue; -err: - pr_err("%pOF: registration failed with status %d\n", np, ret); - kfree(ppc4xx_gc); - /* try others anyway */ - } - return 0; + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; + struct ppc4xx_gpio_chip *chip; + struct gpio_chip *gc; + + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + spin_lock_init(&chip->lock); + + gc = &chip->gc; + + gc->base = -1; + gc->ngpio = 32; + gc->direction_input = ppc4xx_gpio_dir_in; + gc->direction_output = ppc4xx_gpio_dir_out; + gc->get = ppc4xx_gpio_get; + gc->set = ppc4xx_gpio_set; + + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + return devm_gpiochip_add_data(dev, gc, chip); +} + +static const struct of_device_id ppc4xx_gpio_match[] = { + { + .compatible = "ibm,ppc4xx-gpio", + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ppc4xx_gpio_match); + +static struct platform_driver ppc4xx_gpio_driver = { + .probe = ppc4xx_gpio_probe, + .driver = { + .name = "ppc4xx-gpio", + .of_match_table = ppc4xx_gpio_match, + }, +}; + +static int __init ppc4xx_gpio_init(void) +{ + return platform_driver_register(&ppc4xx_gpio_driver); } -arch_initcall(ppc4xx_add_gpiochips); +arch_initcall(ppc4xx_gpio_init); diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig index 8623aebfac482f..abb2b45b2789eb 100644 --- a/arch/powerpc/platforms/8xx/Kconfig +++ b/arch/powerpc/platforms/8xx/Kconfig @@ -101,7 +101,6 @@ comment "Generic MPC8xx Options" config 8xx_GPIO bool "GPIO API Support" select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help Saying Y here will cause the ports on an MPC8xx processor to be used with the GPIO API. If you say N here, the kernel needs less memory. diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index fea3766eac0f31..364eef32ddcc0f 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -243,7 +243,6 @@ config CPM2 select CPM select HAVE_PCI select GPIOLIB - select OF_GPIO_MM_GPIOCHIP help The CPM2 (Communications Processor Module) is a coprocessor on embedded CPUs made by Freescale. 
Selecting this option means that diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d5a2c77bc90871..ce839783c0df6a 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (icount_read(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93cf..ea4ba1b6ce6a96 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 95d7ba73d43d0d..b5ad7c173ef0c1 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -9,6 +9,7 @@ config PPC_POWERNV select PPC_P7_NAP select FORCE_PCI select PCI_MSI + select IRQ_MSI_LIB select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8ccf2c9b98ad0..b0c1d9d16fb52c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include #include #include -#include #include "powernv.h" #include "pci.h" @@ -1707,23 +1707,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } -/* - * The msi_free() op is called before irq_domain_free_irqs_top() when - * the handler data is still available. Use that to clear the XIVE - * controller. 
- */ -static void pnv_msi_ops_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, - unsigned int irq) -{ - if (xive_enabled()) - xive_irq_free_data(irq); -} - -static struct msi_domain_ops pnv_pci_msi_domain_ops = { - .msi_free = pnv_msi_ops_msi_free, -}; - static void pnv_msi_shutdown(struct irq_data *d) { d = d->parent_data; @@ -1731,31 +1714,33 @@ static void pnv_msi_shutdown(struct irq_data *d) d->chip->irq_shutdown(d); } -static void pnv_msi_mask(struct irq_data *d) +static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) { - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} + struct irq_chip *chip = info->chip; -static void pnv_msi_unmask(struct irq_data *d) -{ - pci_msi_unmask_irq(d); - irq_chip_unmask_parent(d); -} + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; -static struct irq_chip pnv_pci_msi_irq_chip = { - .name = "PNV-PCI-MSI", - .irq_shutdown = pnv_msi_shutdown, - .irq_mask = pnv_msi_mask, - .irq_unmask = pnv_msi_unmask, - .irq_eoi = irq_chip_eoi_parent, -}; + chip->irq_shutdown = pnv_msi_shutdown; + return true; +} -static struct msi_domain_info pnv_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), - .ops = &pnv_pci_msi_domain_ops, - .chip = &pnv_pci_msi_irq_chip, +#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pnv_msi_parent_ops = { + .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "PNV-", + .init_dev_msi_info = pnv_init_dev_msi_info, }; static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) @@ -1854,7 +1839,7 @@ static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return 0; out: - irq_domain_free_irqs_parent(domain, virq, i - 1); + irq_domain_free_irqs_parent(domain, virq, i); msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); return ret; } @@ -1870,41 +1855,30 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, virq, d->hwirq, nr_irqs); msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); - /* XIVE domain is cleared through ->msi_free() */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops pnv_irq_domain_ops = { + .select = msi_lib_irq_domain_select, .alloc = pnv_irq_domain_alloc, .free = pnv_irq_domain_free, }; static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) { - struct pnv_phb *phb = hose->private_data; struct irq_domain *parent = irq_get_default_domain(); - - hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id); - if (!hose->fwnode) - return -ENOMEM; - - hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count, - hose->fwnode, - &pnv_irq_domain_ops, hose); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(hose->dn), + .ops = &pnv_irq_domain_ops, + .host_data = hose, + .size = count, + .parent = parent, + }; + + hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops); if (!hose->dev_domain) { - pr_err("PCI: failed to create 
IRQ domain bridge %pOF (domain %d)\n", - hose->dn, hose->global_number); - irq_domain_free_fwnode(hose->fwnode); - return -ENOMEM; - } - - hose->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(hose->dn), - &pnv_msi_domain_info, - hose->dev_domain); - if (!hose->msi_domain) { pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", hose->dn, hose->global_number); - irq_domain_free_fwnode(hose->fwnode); - irq_domain_remove(hose->dev_domain); return -ENOMEM; } diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h index 77feee8436d48c..413fd85d9bc285 100644 --- a/arch/powerpc/platforms/powernv/subcore.h +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -9,7 +9,7 @@ #define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ #define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_SMP void split_core_secondary_loop(u8 *state); @@ -18,4 +18,4 @@ extern void update_subcore_sibling_mask(void); static inline void update_subcore_sibling_mask(void) { } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index fa3c2fff082a87..3e042218d6cd8c 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -7,6 +7,7 @@ config PPC_PSERIES select OF_DYNAMIC select FORCE_PCI select PCI_MSI + select IRQ_MSI_LIB select GENERIC_ALLOCATOR select PPC_XICS select PPC_XIVE_SPAPR diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 57222678bb3f9f..931ebaa474c81e 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -5,6 +5,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o rtas-work-area.o papr-sysparm.o \ papr-rtas-common.o papr-vpd.o papr-indices.o \ papr-platform-dump.o papr-phy-attest.o \ + papr-hvpipe.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 62bd8e2d5d4c0b..95fe802ccdfdb4 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -28,6 +28,7 @@ #include #include "pseries.h" #include "vas.h" /* vas_migration_handler() */ +#include "papr-hvpipe.h" /* hvpipe_migration_handler() */ #include "../../kernel/cacheinfo.h" static struct kobject *mobility_kobj; @@ -744,6 +745,7 @@ static int pseries_migrate_partition(u64 handle) * by closing VAS windows at the beginning of this function. 
*/ vas_migration_handler(VAS_SUSPEND); + hvpipe_migration_handler(HVPIPE_SUSPEND); ret = wait_for_vasi_session_suspending(handle); if (ret) @@ -770,6 +772,7 @@ static int pseries_migrate_partition(u64 handle) out: vas_migration_handler(VAS_RESUME); + hvpipe_migration_handler(HVPIPE_RESUME); return ret; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index ee1c8c6898a3c7..825f9432e03d7d 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -15,7 +16,6 @@ #include #include #include -#include #include "pseries.h" @@ -430,43 +430,25 @@ static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { + struct msi_domain_info *info = domain->host_data; struct pci_dev *pdev = to_pci_dev(dev); - int type = pdev->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } -/* - * ->msi_free() is called before irq_domain_free_irqs_top() when the - * handler data is still available. Use that to clear the XIVE - * controller data. - */ -static void pseries_msi_ops_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, - unsigned int irq) -{ - if (xive_enabled()) - xive_irq_free_data(irq); -} - /* * RTAS can not disable one MSI at a time. It's all or nothing. Do it * at the end after all IRQs have been freed. */ -static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev) +static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) { - if (WARN_ON_ONCE(!dev_is_pci(dev))) - return; + struct msi_desc *desc = arg->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); - rtas_disable_msi(to_pci_dev(dev)); + rtas_disable_msi(pdev); } -static struct msi_domain_ops pseries_pci_msi_domain_ops = { - .msi_prepare = pseries_msi_ops_prepare, - .msi_free = pseries_msi_ops_msi_free, - .msi_post_free = pseries_msi_post_free, -}; - static void pseries_msi_shutdown(struct irq_data *d) { d = d->parent_data; @@ -474,18 +456,6 @@ static void pseries_msi_shutdown(struct irq_data *d) d->chip->irq_shutdown(d); } -static void pseries_msi_mask(struct irq_data *d) -{ - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} - -static void pseries_msi_unmask(struct irq_data *d) -{ - pci_msi_unmask_irq(d); - irq_chip_unmask_parent(d); -} - static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { struct msi_desc *entry = irq_data_get_msi_desc(data); @@ -500,27 +470,39 @@ static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg) entry->msg = *msg; } -static struct irq_chip pseries_pci_msi_irq_chip = { - .name = "pSeries-PCI-MSI", - .irq_shutdown = pseries_msi_shutdown, - .irq_mask = pseries_msi_mask, - .irq_unmask = pseries_msi_unmask, - .irq_eoi = irq_chip_eoi_parent, - .irq_write_msi_msg = pseries_msi_write_msg, -}; +static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; -/* - * Set MSI_FLAG_MSIX_CONTIGUOUS as there is no way to express to - * firmware to request a discontiguous or non-zero based range 
of - * MSI-X entries. Core code will reject such setup attempts. - */ -static struct msi_domain_info pseries_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | - MSI_FLAG_MSIX_CONTIGUOUS), - .ops = &pseries_pci_msi_domain_ops, - .chip = &pseries_pci_msi_irq_chip, + chip->irq_shutdown = pseries_msi_shutdown; + chip->irq_write_msi_msg = pseries_msi_write_msg; + + info->ops->msi_prepare = pseries_msi_ops_prepare; + info->ops->msi_teardown = pseries_msi_ops_teardown; + + return true; +} + +#define PSERIES_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PSERIES_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MSIX_CONTIGUOUS | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pseries_msi_parent_ops = { + .required_flags = PSERIES_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PSERIES_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "pSeries-", + .init_dev_msi_info = pseries_init_dev_msi_info, }; static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) @@ -593,7 +575,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq out: /* TODO: handle RTAS cleanup in ->msi_finish() ? */ - irq_domain_free_irqs_parent(domain, virq, i - 1); + irq_domain_free_irqs_parent(domain, virq, i); return ret; } @@ -604,11 +586,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq struct pci_controller *phb = irq_data_get_irq_chip_data(d); pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); - - /* XIVE domain data is cleared through ->msi_free() */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops pseries_irq_domain_ops = { + .select = msi_lib_irq_domain_select, .alloc = pseries_irq_domain_alloc, .free = pseries_irq_domain_free, }; @@ -617,30 +599,18 @@ static int __pseries_msi_allocate_domains(struct pci_controller *phb, unsigned int count) { struct irq_domain *parent = irq_get_default_domain(); - - phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", - phb->global_number); - if (!phb->fwnode) - return -ENOMEM; - - phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, - phb->fwnode, - &pseries_irq_domain_ops, phb); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(phb->dn), + .ops = &pseries_irq_domain_ops, + .host_data = phb, + .size = count, + .parent = parent, + }; + + phb->dev_domain = msi_create_parent_irq_domain(&info, &pseries_msi_parent_ops); if (!phb->dev_domain) { - pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", - phb->dn, phb->global_number); - irq_domain_free_fwnode(phb->fwnode); - return -ENOMEM; - } - - phb->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(phb->dn), - &pseries_msi_domain_info, - phb->dev_domain); - if (!phb->msi_domain) { pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", phb->dn, phb->global_number); - irq_domain_free_fwnode(phb->fwnode); - irq_domain_remove(phb->dev_domain); return -ENOMEM; } @@ -662,12 +632,8 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) void pseries_msi_free_domains(struct pci_controller *phb) { - if (phb->msi_domain) - irq_domain_remove(phb->msi_domain); if (phb->dev_domain) irq_domain_remove(phb->dev_domain); - 
if (phb->fwnode)
-		irq_domain_free_fwnode(phb->fwnode);
 }
 
 static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c
new file mode 100644
index 00000000000000..21a2f447c43fdc
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c
@@ -0,0 +1,818 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-hvpipe: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "pseries.h"
+#include "papr-hvpipe.h"
+
+static DEFINE_SPINLOCK(hvpipe_src_list_lock);
+static LIST_HEAD(hvpipe_src_list);
+
+static unsigned char hvpipe_ras_buf[RTAS_ERROR_LOG_MAX];
+static struct workqueue_struct *papr_hvpipe_wq;
+static struct work_struct *papr_hvpipe_work;
+static int hvpipe_check_exception_token;
+static bool hvpipe_feature;
+
+/*
+ * New PowerPC FW provides support for partitions and various
+ * sources (for example a remote hardware management console (HMC))
+ * to exchange information through an inband hypervisor channel
+ * called HVPIPE. Only HMCs are supported right now. A partition
+ * can communicate with multiple HMCs, and each source is
+ * identified by a source ID.
+ *
+ * FW introduces send HVPIPE and recv HVPIPE RTAS calls for
+ * partitions to send and receive payloads respectively.
+ *
+ * These RTAS functions have the following requirements /
+ * limitations:
+ * - One hvpipe per partition for all sources.
+ * - The return status of send HVPIPE is assumed to mean the
+ *   payload was delivered to the source.
+ * - The return status of recv HVPIPE is assumed to be an ACK to
+ *   the source.
+ * - An HVPIPE event message is generated when a payload is ready
+ *   for the partition. The hypervisor will not deliver another
+ *   event until the partition reads the previous payload, which
+ *   means the pipe is blocked for all sources.
+ *
+ * Linux implementation:
+ * Follow interfaces similar to those the OS provides for other
+ * RTAS calls, e.g. /dev/papr-indices, /dev/papr-vpd, etc.
+ * - /dev/papr-hvpipe is available to user space.
+ * - devfd = open("/dev/papr-hvpipe", ..)
+ * - fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID) - one
+ *   FD per source
+ * - write(fd, buf, size) --> Issues the send HVPIPE RTAS call and
+ *   returns size on success, or the error corresponding to the
+ *   RTAS return code on failure.
+ * - poll(fd,..) --> Wakes up the FD when a payload is available to
+ *   read. The HVPIPE event message handler wakes up the FD based
+ *   on the source ID in the event message.
+ * - read(fd, buf, size) --> Issues the recv HVPIPE RTAS call and
+ *   returns size on success, or the error corresponding to the
+ *   RTAS return code on failure.
+ */
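A user-space consumer following the flow above might look like the sketch below. This is illustrative only: PAPR_HVPIPE_IOC_CREATE_HANDLE and the papr_hvpipe_hdr framing come from the uapi header, which is not part of this hunk, and the source ID value is a made-up HMC index:

	#include <fcntl.h>
	#include <poll.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	void hvpipe_read_loop(void)
	{
		unsigned int srcID = 0x02000001;	/* hypothetical HMC source */
		char buf[4096];
		struct pollfd pfd;
		int devfd, fd;

		devfd = open("/dev/papr-hvpipe", O_RDWR);
		fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID);

		pfd.fd = fd;
		pfd.events = POLLIN;
		while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
			/* each payload is preceded by a papr_hvpipe_hdr */
			if (read(fd, buf, sizeof(buf)) <= 0)
				break;
		}
		close(fd);
		close(devfd);
	}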
+/*
+ * ibm,receive-hvpipe-msg RTAS call.
+ * @area: Caller-provided work area buffer for results.
+ * @srcID: Source ID returned by the RTAS call.
+ * @bytesw: Bytes written by the RTAS call to @area.
+ */
+static int rtas_ibm_receive_hvpipe_msg(struct rtas_work_area *area,
+					u32 *srcID, u32 *bytesw)
+{
+	const s32 token = rtas_function_token(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG);
+	u32 rets[2];
+	s32 fwrc;
+	int ret;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	do {
+		fwrc = rtas_call(token, 2, 3, rets,
+				rtas_work_area_phys(area),
+				rtas_work_area_size(area));
+
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case RTAS_SUCCESS:
+		*srcID = rets[0];
+		*bytesw = rets[1];
+		ret = 0;
+		break;
+	case RTAS_HARDWARE_ERROR:
+		ret = -EIO;
+		break;
+	case RTAS_INVALID_PARAMETER:
+		ret = -EINVAL;
+		break;
+	case RTAS_FUNC_NOT_SUPPORTED:
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,receive-hvpipe-msg status %d\n", fwrc);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * ibm,send-hvpipe-msg RTAS call
+ * @area: Caller-provided work area buffer to send.
+ * @srcID: Target source for the send pipe message.
+ */
+static int rtas_ibm_send_hvpipe_msg(struct rtas_work_area *area, u32 srcID)
+{
+	const s32 token = rtas_function_token(RTAS_FN_IBM_SEND_HVPIPE_MSG);
+	s32 fwrc;
+	int ret;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -ENOENT;
+
+	do {
+		fwrc = rtas_call(token, 2, 1, NULL, srcID,
+				rtas_work_area_phys(area));
+
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case RTAS_SUCCESS:
+		ret = 0;
+		break;
+	case RTAS_HARDWARE_ERROR:
+		ret = -EIO;
+		break;
+	case RTAS_INVALID_PARAMETER:
+		ret = -EINVAL;
+		break;
+	case RTAS_HVPIPE_CLOSED:
+		ret = -EPIPE;
+		break;
+	case RTAS_FUNC_NOT_SUPPORTED:
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,send-hvpipe-msg status %d\n", fwrc);
+		break;
+	}
+
+	return ret;
+}
+
+static struct hvpipe_source_info *hvpipe_find_source(u32 srcID)
+{
+	struct hvpipe_source_info *src_info;
+
+	list_for_each_entry(src_info, &hvpipe_src_list, list)
+		if (src_info->srcID == srcID)
+			return src_info;
+
+	return NULL;
+}
+
+/*
+ * Collect the receive buffer via the recv HVPIPE RTAS call.
+ * Called from read().
+ * @buf: User-specified buffer to copy the payload returned by
+ *	recv HVPIPE RTAS.
+ * @size: Size of the buffer the user passed.
+ */
+static int hvpipe_rtas_recv_msg(char __user *buf, int size)
+{
+	struct rtas_work_area *work_area;
+	u32 srcID, bytes_written;
+	int ret;
+
+	work_area = rtas_work_area_alloc(SZ_4K);
+	if (!work_area) {
+		pr_err("Could not allocate RTAS buffer for recv pipe\n");
+		return -ENOMEM;
+	}
+
+	ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID,
+					&bytes_written);
+	if (!ret) {
+		/*
+		 * The recv HVPIPE RTAS call succeeded.
+		 * When releasing the FD, or when no one is waiting on
+		 * the specific source, this function is called with a
+		 * NULL buf; the recv HVPIPE RTAS call is still issued
+		 * so that the pipe is not blocked.
+		 */
+		if (buf) {
+			if (size < bytes_written) {
+				pr_err("Received payload size = %d, but the buffer size = %d\n",
+					bytes_written, size);
+				bytes_written = size;
+			}
+			if (copy_to_user(buf,
+					rtas_work_area_raw_buf(work_area),
+					bytes_written))
+				ret = -EFAULT;
+			else
+				ret = bytes_written;
+		}
+	} else {
+		pr_err("ibm,receive-hvpipe-msg failed with %d\n",
+			ret);
+	}
+
+	rtas_work_area_free(work_area);
+	return ret;
+}
+
+/*
+ * papr_hvpipe_handle_write - Issue send HVPIPE RTAS and return
+ * the size (payload + HVPIPE_HDR_LEN) for RTAS success.
+ * Otherwise returns the status of RTAS to the user space + */ +static ssize_t papr_hvpipe_handle_write(struct file *file, + const char __user *buf, size_t size, loff_t *off) +{ + struct hvpipe_source_info *src_info = file->private_data; + struct rtas_work_area *work_area, *work_buf; + unsigned long ret, len; + __be64 *area_be; + + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + + if (!src_info) + return -EIO; + + /* + * Send HVPIPE RTAS is used to send payload to the specific + * source with the input parameters source ID and the payload + * as buffer list. Each entry in the buffer list contains + * address/length pair of the buffer. + * + * The buffer list format is as follows: + * + * Header (length of address/length pairs and the header length) + * Address of 4K buffer 1 + * Length of 4K buffer 1 used + * ... + * Address of 4K buffer n + * Length of 4K buffer n used + * + * See PAPR 7.3.32.2 ibm,send-hvpipe-msg + * + * Even though can support max 1MB payload, the hypervisor + * supports only 4048 bytes payload at present and also + * just one address/length entry. + * + * writev() interface can be added in future when the + * hypervisor supports multiple buffer list entries. + */ + /* HVPIPE_MAX_WRITE_BUFFER_SIZE = 4048 bytes */ + if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) || + (size <= HVPIPE_HDR_LEN)) + return -EINVAL; + + /* + * The length of (address + length) pair + the length of header + */ + len = (2 * sizeof(u64)) + sizeof(u64); + size -= HVPIPE_HDR_LEN; + buf += HVPIPE_HDR_LEN; + mutex_lock(&rtas_ibm_send_hvpipe_msg_lock); + work_area = rtas_work_area_alloc(SZ_4K); + if (!work_area) { + ret = -ENOMEM; + goto out; + } + area_be = (__be64 *)rtas_work_area_raw_buf(work_area); + /* header */ + area_be[0] = cpu_to_be64(len); + + work_buf = rtas_work_area_alloc(SZ_4K); + if (!work_buf) { + ret = -ENOMEM; + goto out_work; + } + /* First buffer address */ + area_be[1] = cpu_to_be64(rtas_work_area_phys(work_buf)); + /* First buffer address length */ + area_be[2] = cpu_to_be64(size); + + if (!copy_from_user(rtas_work_area_raw_buf(work_buf), buf, size)) { + ret = rtas_ibm_send_hvpipe_msg(work_area, src_info->srcID); + if (!ret) + ret = size + HVPIPE_HDR_LEN; + } else + ret = -EPERM; + + rtas_work_area_free(work_buf); +out_work: + rtas_work_area_free(work_area); +out: + mutex_unlock(&rtas_ibm_send_hvpipe_msg_lock); + return ret; +} + +/* + * papr_hvpipe_handle_read - If the payload for the specific + * source is pending in the hypervisor, issue recv HVPIPE RTAS + * and return the payload to the user space. + * + * When the payload is available for the partition, the + * hypervisor notifies HVPIPE event with the source ID + * and the event handler wakeup FD(s) that are waiting. + */ +static ssize_t papr_hvpipe_handle_read(struct file *file, + char __user *buf, size_t size, loff_t *off) +{ + + struct hvpipe_source_info *src_info = file->private_data; + struct papr_hvpipe_hdr hdr; + long ret; + + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + + if (!src_info) + return -EIO; + + /* + * Max payload is 4048 (HVPIPE_MAX_WRITE_BUFFER_SIZE) + */ + if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) || + (size < HVPIPE_HDR_LEN)) + return -EINVAL; + + /* + * Payload is not available to receive or source pipe + * is not closed. 
+ */ + if (!src_info->hvpipe_status) + return 0; + + hdr.version = 0; + hdr.flags = 0; + + /* + * In case if the hvpipe has payload and also the + * hypervisor closed the pipe to the source, retrieve + * the payload and return to the user space first and + * then notify the userspace about the hvpipe close in + * next read(). + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) + hdr.flags = HVPIPE_MSG_AVAILABLE; + else if (src_info->hvpipe_status & HVPIPE_LOST_CONNECTION) + hdr.flags = HVPIPE_LOST_CONNECTION; + else + /* + * Should not be here without one of the above + * flags set + */ + return -EIO; + + ret = copy_to_user(buf, &hdr, HVPIPE_HDR_LEN); + if (ret) + return ret; + + /* + * Message event has payload, so get the payload with + * recv HVPIPE RTAS. + */ + if (hdr.flags & HVPIPE_MSG_AVAILABLE) { + ret = hvpipe_rtas_recv_msg(buf + HVPIPE_HDR_LEN, + size - HVPIPE_HDR_LEN); + if (ret > 0) { + src_info->hvpipe_status &= ~HVPIPE_MSG_AVAILABLE; + ret += HVPIPE_HDR_LEN; + } + } else if (hdr.flags & HVPIPE_LOST_CONNECTION) { + /* + * Hypervisor is closing the pipe for the specific + * source. So notify user space. + */ + src_info->hvpipe_status &= ~HVPIPE_LOST_CONNECTION; + ret = HVPIPE_HDR_LEN; + } + + return ret; +} + +/* + * The user space waits for the payload to receive. + * The hypervisor sends HVPIPE event message to the partition + * when the payload is available. The event handler wakeup FD + * depends on the source ID in the message event. + */ +static __poll_t papr_hvpipe_handle_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct hvpipe_source_info *src_info = filp->private_data; + + /* + * HVPIPE is disabled during SUSPEND and enabled after migration. + * So return POLLRDHUP during migration + */ + if (!hvpipe_feature) + return POLLRDHUP; + + if (!src_info) + return POLLNVAL; + + /* + * If hvpipe already has pending payload, return so that + * the user space can issue read(). + */ + if (src_info->hvpipe_status) + return POLLIN | POLLRDNORM; + + /* + * Wait for the message event + * hvpipe_event_interrupt() wakes up this wait_queue + */ + poll_wait(filp, &src_info->recv_wqh, wait); + if (src_info->hvpipe_status) + return POLLIN | POLLRDNORM; + + return 0; +} + +static int papr_hvpipe_handle_release(struct inode *inode, + struct file *file) +{ + struct hvpipe_source_info *src_info; + + /* + * Hold the lock, remove source from src_list, reset the + * hvpipe status and release the lock to prevent any race + * with message event IRQ. + */ + spin_lock(&hvpipe_src_list_lock); + src_info = file->private_data; + list_del(&src_info->list); + file->private_data = NULL; + /* + * If the pipe for this specific source has any pending + * payload, issue recv HVPIPE RTAS so that pipe will not + * be blocked. + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) { + src_info->hvpipe_status = 0; + spin_unlock(&hvpipe_src_list_lock); + hvpipe_rtas_recv_msg(NULL, 0); + } else + spin_unlock(&hvpipe_src_list_lock); + + kfree(src_info); + return 0; +} + +static const struct file_operations papr_hvpipe_handle_ops = { + .read = papr_hvpipe_handle_read, + .write = papr_hvpipe_handle_write, + .release = papr_hvpipe_handle_release, + .poll = papr_hvpipe_handle_poll, +}; + +static int papr_hvpipe_dev_create_handle(u32 srcID) +{ + struct hvpipe_source_info *src_info; + struct file *file; + long err; + int fd; + + spin_lock(&hvpipe_src_list_lock); + /* + * Do not allow more than one process communicates with + * each source. 
+ */ + src_info = hvpipe_find_source(srcID); + if (src_info) { + spin_unlock(&hvpipe_src_list_lock); + pr_err("pid(%d) is already using the source(%d)\n", + src_info->tsk->pid, srcID); + return -EALREADY; + } + spin_unlock(&hvpipe_src_list_lock); + + src_info = kzalloc(sizeof(*src_info), GFP_KERNEL_ACCOUNT); + if (!src_info) + return -ENOMEM; + + src_info->srcID = srcID; + src_info->tsk = current; + init_waitqueue_head(&src_info->recv_wqh); + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = fd; + goto free_buf; + } + + file = anon_inode_getfile("[papr-hvpipe]", + &papr_hvpipe_handle_ops, (void *)src_info, + O_RDWR); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + spin_lock(&hvpipe_src_list_lock); + /* + * If two processes are executing ioctl() for the same + * source ID concurrently, prevent the second process to + * acquire FD. + */ + if (hvpipe_find_source(srcID)) { + spin_unlock(&hvpipe_src_list_lock); + err = -EALREADY; + goto free_file; + } + list_add(&src_info->list, &hvpipe_src_list); + spin_unlock(&hvpipe_src_list_lock); + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); +free_buf: + kfree(src_info); + return err; +} + +/* + * Top-level ioctl handler for /dev/papr_hvpipe + * + * Use separate FD for each source (exa :HMC). So ioctl is called + * with source ID which returns FD. + */ +static long papr_hvpipe_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + u32 __user *argp = (void __user *)arg; + u32 srcID; + long ret; + + /* + * Return -ENXIO during migration + */ + if (!hvpipe_feature) + return -ENXIO; + + if (get_user(srcID, argp)) + return -EFAULT; + + /* + * Support only HMC source right now + */ + if (!(srcID & HVPIPE_HMC_ID_MASK)) + return -EINVAL; + + switch (ioctl) { + case PAPR_HVPIPE_IOC_CREATE_HANDLE: + ret = papr_hvpipe_dev_create_handle(srcID); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + +/* + * papr_hvpipe_work_fn - called to issue recv HVPIPE RTAS for + * sources that are not monitored by user space so that pipe + * will not be blocked. + */ +static void papr_hvpipe_work_fn(struct work_struct *work) +{ + hvpipe_rtas_recv_msg(NULL, 0); +} + +/* + * HVPIPE event message IRQ handler. + * The hypervisor sends event IRQ if the partition has payload + * and generates another event only after payload is read with + * recv HVPIPE RTAS. 
+ */ +static irqreturn_t hvpipe_event_interrupt(int irq, void *dev_id) +{ + struct hvpipe_event_buf *hvpipe_event; + struct pseries_errorlog *pseries_log; + struct hvpipe_source_info *src_info; + struct rtas_error_log *elog; + int rc; + + rc = rtas_call(hvpipe_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), + RTAS_HVPIPE_MSG_EVENTS, 1, __pa(&hvpipe_ras_buf), + rtas_get_error_log_max()); + + if (rc != 0) { + pr_err_ratelimited("unexpected hvpipe-event-notification failed %d\n", rc); + return IRQ_HANDLED; + } + + elog = (struct rtas_error_log *)hvpipe_ras_buf; + if (unlikely(rtas_error_type(elog) != RTAS_TYPE_HVPIPE)) { + pr_warn_ratelimited("Unexpected event type %d\n", + rtas_error_type(elog)); + return IRQ_HANDLED; + } + + pseries_log = get_pseries_errorlog(elog, + PSERIES_ELOG_SECT_ID_HVPIPE_EVENT); + hvpipe_event = (struct hvpipe_event_buf *)pseries_log->data; + + /* + * The hypervisor notifies partition when the payload is + * available to read with recv HVPIPE RTAS and it will not + * notify another event for any source until the previous + * payload is read. Means the pipe is blocked in the + * hypervisor until the payload is read. + * + * If the source is ready to accept payload and wakeup the + * corresponding FD. Hold lock and update hvpipe_status + * and this lock is needed in case the user space process + * is in release FD instead of poll() so that release() + * reads the payload to unblock pipe before closing FD. + * + * otherwise (means no other user process waiting for the + * payload, issue recv HVPIPE RTAS (papr_hvpipe_work_fn()) + * to unblock pipe. + */ + spin_lock(&hvpipe_src_list_lock); + src_info = hvpipe_find_source(be32_to_cpu(hvpipe_event->srcID)); + if (src_info) { + u32 flags = 0; + + if (hvpipe_event->event_type & HVPIPE_LOST_CONNECTION) + flags = HVPIPE_LOST_CONNECTION; + else if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE) + flags = HVPIPE_MSG_AVAILABLE; + + src_info->hvpipe_status |= flags; + wake_up(&src_info->recv_wqh); + spin_unlock(&hvpipe_src_list_lock); + } else { + spin_unlock(&hvpipe_src_list_lock); + /* + * user space is not waiting on this source. So + * execute receive pipe RTAS so that pipe will not + * be blocked. 
+ */ + if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE) + queue_work(papr_hvpipe_wq, papr_hvpipe_work); + } + + return IRQ_HANDLED; +} + +/* + * Enable hvpipe by system parameter set with parameter + * token = 64 and with 1 byte buffer data: + * 0 = hvpipe not in use/disable + * 1 = hvpipe in use/enable + */ +static int set_hvpipe_sys_param(u8 val) +{ + struct papr_sysparm_buf *buf; + int ret; + + buf = papr_sysparm_buf_alloc(); + if (!buf) + return -ENOMEM; + + buf->len = cpu_to_be16(1); + buf->val[0] = val; + ret = papr_sysparm_set(PAPR_SYSPARM_HVPIPE_ENABLE, buf); + if (ret) + pr_err("Can not enable hvpipe %d\n", ret); + + papr_sysparm_buf_free(buf); + + return ret; +} + +static int __init enable_hvpipe_IRQ(void) +{ + struct device_node *np; + + hvpipe_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); + if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + /* hvpipe events */ + np = of_find_node_by_path("/event-sources/ibm,hvpipe-msg-events"); + if (np != NULL) { + request_event_sources_irqs(np, hvpipe_event_interrupt, + "HPIPE_EVENT"); + of_node_put(np); + } else { + pr_err("Can not enable hvpipe event IRQ\n"); + return -ENODEV; + } + + return 0; +} + +void hvpipe_migration_handler(int action) +{ + pr_info("hvpipe migration event %d\n", action); + + /* + * HVPIPE is not used (Failed to create /dev/papr-hvpipe). + * So nothing to do for migration. + */ + if (!papr_hvpipe_work) + return; + + switch (action) { + case HVPIPE_SUSPEND: + if (hvpipe_feature) { + /* + * Disable hvpipe_feature to the user space. + * It will be enabled with RESUME event. + */ + hvpipe_feature = false; + /* + * set system parameter hvpipe 'disable' + */ + set_hvpipe_sys_param(0); + } + break; + case HVPIPE_RESUME: + /* + * set system parameter hvpipe 'enable' + */ + if (!set_hvpipe_sys_param(1)) + hvpipe_feature = true; + else + pr_err("hvpipe is not enabled after migration\n"); + + break; + } +} + +static const struct file_operations papr_hvpipe_ops = { + .unlocked_ioctl = papr_hvpipe_dev_ioctl, +}; + +static struct miscdevice papr_hvpipe_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-hvpipe", + .fops = &papr_hvpipe_ops, +}; + +static int __init papr_hvpipe_init(void) +{ + int ret; + + if (!of_find_property(rtas.dev, "ibm,hypervisor-pipe-capable", + NULL)) + return -ENODEV; + + if (!rtas_function_implemented(RTAS_FN_IBM_SEND_HVPIPE_MSG) || + !rtas_function_implemented(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG)) + return -ENODEV; + + papr_hvpipe_work = kzalloc(sizeof(struct work_struct), GFP_ATOMIC); + if (!papr_hvpipe_work) + return -ENOMEM; + + INIT_WORK(papr_hvpipe_work, papr_hvpipe_work_fn); + + papr_hvpipe_wq = alloc_ordered_workqueue("papr hvpipe workqueue", 0); + if (!papr_hvpipe_wq) { + ret = -ENOMEM; + goto out; + } + + ret = enable_hvpipe_IRQ(); + if (!ret) { + ret = set_hvpipe_sys_param(1); + if (!ret) + ret = misc_register(&papr_hvpipe_dev); + } + + if (!ret) { + pr_info("hvpipe feature is enabled\n"); + hvpipe_feature = true; + return 0; + } + + pr_err("hvpipe feature is not enabled %d\n", ret); + destroy_workqueue(papr_hvpipe_wq); +out: + kfree(papr_hvpipe_work); + papr_hvpipe_work = NULL; + return ret; +} +machine_device_initcall(pseries, papr_hvpipe_init); diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h new file mode 100644 index 00000000000000..c343f4230865c0 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef 
_PAPR_HVPIPE_H +#define _PAPR_HVPIPE_H + +#define HVPIPE_HMC_ID_MASK 0x02000000 /*02-HMC,00-reserved and HMC ID */ +#define HVPIPE_MAX_WRITE_BUFFER_SIZE 4048 +/* + * hvpipe specific RTAS return values + */ +#define RTAS_HVPIPE_CLOSED -4 + +#define HVPIPE_HDR_LEN sizeof(struct papr_hvpipe_hdr) + +enum hvpipe_migrate_action { + HVPIPE_SUSPEND, + HVPIPE_RESUME, +}; + +struct hvpipe_source_info { + struct list_head list; /* list of sources */ + u32 srcID; + u32 hvpipe_status; + wait_queue_head_t recv_wqh; /* wake up poll() waitq */ + struct task_struct *tsk; +}; + +/* + * Source ID Format 0xCCRRQQQQ + * CC = indicating value is source type (ex: 0x02 for HMC) + * RR = 0x00 (reserved) + * QQQQ = 0x0000 – 0xFFFF indicating the source index indetifier + */ +struct hvpipe_event_buf { + __be32 srcID; /* Source ID */ + u8 event_type; /* 0x01 for hvpipe message available */ + /* from specified src ID */ + /* 0x02 for loss of pipe connection */ + /* with specified src ID */ +}; + +void hvpipe_migration_handler(int action); +#endif /* _PAPR_HVPIPE_H */ diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c index f469f6a9f6e0ff..07ea605ab0e628 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -28,10 +28,6 @@ #include -#if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO) -#include -#endif - static int __init cpm_init(void) { struct device_node *np; @@ -91,32 +87,33 @@ void __init udbg_init_cpm(void) #if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO) +#include + struct cpm2_ioports { u32 dir, par, sor, odr, dat; u32 res[3]; }; struct cpm2_gpio32_chip { - struct of_mm_gpio_chip mm_gc; + struct gpio_chip gc; + void __iomem *regs; spinlock_t lock; /* shadowed data register to clear/set bits safely */ u32 cpdata; }; -static void cpm2_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc) +static void cpm2_gpio32_save_regs(struct cpm2_gpio32_chip *cpm2_gc) { - struct cpm2_gpio32_chip *cpm2_gc = - container_of(mm_gc, struct cpm2_gpio32_chip, mm_gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; cpm2_gc->cpdata = in_be32(&iop->dat); } static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; u32 pin_mask; pin_mask = 1 << (31 - gpio); @@ -124,11 +121,9 @@ static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio) return !!(in_be32(&iop->dat) & pin_mask); } -static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, - int value) +static void __cpm2_gpio32_set(struct cpm2_gpio32_chip *cpm2_gc, u32 pin_mask, int value) { - struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(&mm_gc->gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; if (value) cpm2_gc->cpdata |= pin_mask; @@ -140,14 +135,13 @@ static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); unsigned long flags; u32 pin_mask = 1 << (31 - gpio); spin_lock_irqsave(&cpm2_gc->lock, flags); - __cpm2_gpio32_set(mm_gc, pin_mask, value); + __cpm2_gpio32_set(cpm2_gc, pin_mask, value); spin_unlock_irqrestore(&cpm2_gc->lock, flags); @@ 
-156,16 +150,15 @@ static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); spin_lock_irqsave(&cpm2_gc->lock, flags); setbits32(&iop->dir, pin_mask); - __cpm2_gpio32_set(mm_gc, pin_mask, val); + __cpm2_gpio32_set(cpm2_gc, pin_mask, val); spin_unlock_irqrestore(&cpm2_gc->lock, flags); @@ -174,9 +167,8 @@ static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int cpm2_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); - struct cpm2_ioports __iomem *iop = mm_gc->regs; + struct cpm2_ioports __iomem *iop = cpm2_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -193,19 +185,17 @@ int cpm2_gpiochip_add32(struct device *dev) { struct device_node *np = dev->of_node; struct cpm2_gpio32_chip *cpm2_gc; - struct of_mm_gpio_chip *mm_gc; struct gpio_chip *gc; - cpm2_gc = kzalloc(sizeof(*cpm2_gc), GFP_KERNEL); + cpm2_gc = devm_kzalloc(dev, sizeof(*cpm2_gc), GFP_KERNEL); if (!cpm2_gc) return -ENOMEM; spin_lock_init(&cpm2_gc->lock); - mm_gc = &cpm2_gc->mm_gc; - gc = &mm_gc->gc; + gc = &cpm2_gc->gc; - mm_gc->save_regs = cpm2_gpio32_save_regs; + gc->base = -1; gc->ngpio = 32; gc->direction_input = cpm2_gpio32_dir_in; gc->direction_output = cpm2_gpio32_dir_out; @@ -214,6 +204,16 @@ int cpm2_gpiochip_add32(struct device *dev) gc->parent = dev; gc->owner = THIS_MODULE; - return of_mm_gpiochip_add_data(np, mm_gc, cpm2_gc); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + cpm2_gc->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(cpm2_gc->regs)) + return PTR_ERR(cpm2_gc->regs); + + cpm2_gpio32_save_regs(cpm2_gc); + + return devm_gpiochip_add_data(dev, gc, cpm2_gc); } #endif /* CONFIG_CPM2 || CONFIG_8xx_GPIO */ diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f1059240502479..625361a15424e5 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -317,7 +317,7 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) if (d) { char buffer[128]; - xive_irq_data_dump(irq_data_get_irq_handler_data(d), + xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer)); xmon_printf("%s", buffer); } @@ -437,7 +437,7 @@ static void xive_do_source_eoi(struct xive_irq_data *xd) /* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); struct xive_cpu *xc = __this_cpu_read(xive_cpu); DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", @@ -595,7 +595,7 @@ static int xive_pick_irq_target(struct irq_data *d, const struct cpumask *affinity) { static unsigned int fuzz; - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); cpumask_var_t mask; int cpu = -1; @@ -628,7 +628,7 @@ static int xive_pick_irq_target(struct irq_data *d, static unsigned int xive_irq_startup(struct irq_data *d) { - struct xive_irq_data *xd 
= irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; @@ -673,7 +673,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) /* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); @@ -698,7 +698,7 @@ static void xive_irq_shutdown(struct irq_data *d) static void xive_irq_unmask(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); @@ -707,7 +707,7 @@ static void xive_irq_unmask(struct irq_data *d) static void xive_irq_mask(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); @@ -718,7 +718,7 @@ static int xive_irq_set_affinity(struct irq_data *d, const struct cpumask *cpumask, bool force) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); u32 target, old_target; int rc = 0; @@ -776,7 +776,7 @@ static int xive_irq_set_affinity(struct irq_data *d, static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); /* * We only support these. This has really no effect other than setting @@ -815,7 +815,7 @@ static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) static int xive_irq_retrigger(struct irq_data *d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); /* This should be only for MSIs */ if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) @@ -837,7 +837,7 @@ static int xive_irq_retrigger(struct irq_data *d) */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int rc; u8 pq; @@ -951,7 +951,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) static int xive_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + struct xive_irq_data *xd = irq_data_get_irq_chip_data(data); u8 pq; switch (which) { @@ -1011,21 +1011,20 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd) } EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); -static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +static struct xive_irq_data *xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) { struct xive_irq_data *xd; int rc; xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); if (!xd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rc = xive_ops->populate_irq_data(hw, xd); if (rc) { kfree(xd); - return rc; + return ERR_PTR(rc); } xd->target = XIVE_INVALID_TARGET; - irq_set_handler_data(virq, xd); /* * Turn OFF by default the interrupt being mapped. 
A side @@ -1036,20 +1035,19 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) */ xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - return 0; + return xd; } -void xive_irq_free_data(unsigned int virq) +static void xive_irq_free_data(unsigned int virq) { - struct xive_irq_data *xd = irq_get_handler_data(virq); + struct xive_irq_data *xd = irq_get_chip_data(virq); if (!xd) return; - irq_set_handler_data(virq, NULL); + irq_set_chip_data(virq, NULL); xive_cleanup_irq_data(xd); kfree(xd); } -EXPORT_SYMBOL_GPL(xive_irq_free_data); #ifdef CONFIG_SMP @@ -1286,7 +1284,7 @@ void __init xive_smp_probe(void) static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { - int rc; + struct xive_irq_data *xd; /* * Mark interrupts as edge sensitive by default so that resend @@ -1294,11 +1292,12 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, */ irq_clear_status_flags(virq, IRQ_LEVEL); - rc = xive_irq_alloc_data(virq, hw); - if (rc) - return rc; + xd = xive_irq_alloc_data(virq, hw); + if (IS_ERR(xd)) + return PTR_ERR(xd); irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + irq_set_chip_data(virq, xd); return 0; } @@ -1366,7 +1365,7 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, seq_printf(m, "%*sXIVE:\n", ind, ""); ind++; - xd = irq_data_get_irq_handler_data(irqd); + xd = irq_data_get_irq_chip_data(irqd); if (!xd) { seq_printf(m, "%*snot assigned\n", ind, ""); return; @@ -1403,6 +1402,7 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_fwspec *fwspec = arg; + struct xive_irq_data *xd; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int i, rc; @@ -1423,12 +1423,11 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_clear_status_flags(virq, IRQ_LEVEL); /* allocates and sets handler data */ - rc = xive_irq_alloc_data(virq + i, hwirq + i); - if (rc) - return rc; + xd = xive_irq_alloc_data(virq + i, hwirq + i); + if (IS_ERR(xd)) + return PTR_ERR(xd); - irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, - &xive_irq_chip, domain->host_data); + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &xive_irq_chip, xd); irq_set_handler(virq + i, handle_fasteoi_irq); } @@ -1764,7 +1763,7 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", hw_irq, target, prio, lirq); - xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer)); + xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer)); seq_puts(m, buffer); seq_puts(m, "\n"); } diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index 0774d711453efa..de9b4236728c4e 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -954,8 +954,7 @@ const struct powerpc_operand powerpc_operands[] = { 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT }, }; -const unsigned int num_powerpc_operands = (sizeof (powerpc_operands) - / sizeof (powerpc_operands[0])); +const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands); /* The functions used to insert and extract complicated operands. 
*/ @@ -6968,9 +6967,8 @@ const struct powerpc_opcode powerpc_opcodes[] = { {"fcfidu.", XRC(63,974,1), XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, FRB}}, }; -const int powerpc_num_opcodes = - sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]); - +const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes); + /* The VLE opcode table. The format of this opcode table is the same as the main opcode table. */ @@ -7207,9 +7205,8 @@ const struct powerpc_opcode vle_opcodes[] = { {"se_bl", BD8(58,0,1), BD8_MASK, PPCVLE, 0, {B8}}, }; -const int vle_num_opcodes = - sizeof (vle_opcodes) / sizeof (vle_opcodes[0]); - +const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes); + /* The macro table. This is only used by the assembler. */ /* The expressions of the form (-x ! 31) & (x | 31) have the value 0 @@ -7276,5 +7273,4 @@ const struct powerpc_macro powerpc_macros[] = { {"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"}, }; -const int powerpc_num_macros = - sizeof (powerpc_macros) / sizeof (powerpc_macros[0]); +const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros); diff --git a/arch/powerpc/xmon/xmon_bpts.h b/arch/powerpc/xmon/xmon_bpts.h index 377068f52edb9a..e14e4fb862e0c0 100644 --- a/arch/powerpc/xmon/xmon_bpts.h +++ b/arch/powerpc/xmon/xmon_bpts.h @@ -3,12 +3,12 @@ #define XMON_BPTS_H #define NBPTS 256 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #define BPT_SIZE (sizeof(ppc_inst_t) * 2) #define BPT_WORDS (BPT_SIZE / sizeof(ppc_inst_t)) extern unsigned int bpt_table[NBPTS * BPT_WORDS]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* XMON_BPTS_H */ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 51dcd8eaa24356..2181dde50d6e45 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,6 +28,7 @@ config RISCV select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX + select ARCH_HAS_ELF_CORE_EFLAGS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -52,7 +53,8 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE + select ARCH_HAS_VDSO_ARCH_DATA if HAVE_GENERIC_VDSO + select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -60,7 +62,7 @@ config RISCV select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW # clang >= 17: https://github.com/llvm/llvm-project/commit/62fa708ceb027713b386c7e0efda994f8bdc27e2 - select ARCH_SUPPORTS_CFI_CLANG if CLANG_VERSION >= 170000 + select ARCH_SUPPORTS_CFI if (!CC_IS_CLANG || CLANG_VERSION >= 170000) select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS if MMU @@ -72,11 +74,12 @@ config RISCV select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_SYM_ANNOTATIONS - select ARCH_USES_CFI_TRAPS if CFI_CLANG + select ARCH_USES_CFI_TRAPS if CFI select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS @@ -107,7 +110,7 @@ config RISCV select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select 
GENERIC_ENTRY - select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO + select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO && 64BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP if MMU select GENERIC_IRQ_IPI if SMP @@ -120,9 +123,7 @@ config RISCV select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select GENERIC_TIME_VSYSCALL if MMU && 64BIT - select GENERIC_VDSO_DATA_STORE if MMU - select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO + select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU select HAVE_ALIGNED_STRUCT_PAGE @@ -154,18 +155,19 @@ config RISCV select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS - select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE select HAVE_EBPF_JIT if MMU + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS - select HAVE_GENERIC_VDSO if MMU && 64BIT + select HAVE_GENERIC_VDSO if MMU select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KERNEL_GZIP if !XIP_KERNEL && !EFI_ZBOOT @@ -221,7 +223,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU - select VDSO_GETRANDOM if HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO && 64BIT select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT @@ -453,14 +455,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config NR_CPUS int "Maximum number of CPUs (2-512)" depends on SMP @@ -714,7 +708,6 @@ config TOOLCHAIN_HAS_ZACAS config RISCV_ISA_ZACAS bool "Zacas extension support for atomic CAS" - depends on TOOLCHAIN_HAS_ZACAS depends on RISCV_ALTERNATIVE default y help diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index e318119d570de0..aca9b0cfcfecf9 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -21,6 +21,29 @@ config ERRATA_ANDES_CMO If you don't know what to do here, say "Y". +config ERRATA_MIPS + bool "MIPS errata" + depends on RISCV_ALTERNATIVE + help + All MIPS errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all MIPS errata. Please say "Y" + here if your platform uses MIPS CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + +config ERRATA_MIPS_P8700_PAUSE_OPCODE + bool "Fix the PAUSE Opcode for MIPS P8700" + depends on ERRATA_MIPS && 64BIT + default n + help + The RISCV MIPS P8700 uses a different opcode for PAUSE. + It is a 'hint' encoding of the SLLI instruction, + with rd=0, rs1=0 and imm=5. 
It will behave as a NOP + instruction if no additional behavior beyond that of + SLLI is implemented. + + If you are not using the P8700 processor, say n. + config ERRATA_SIFIVE bool "SiFive errata" depends on RISCV_ALTERNATIVE diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor index e14f26368963c1..3c1f92e406c3f2 100644 --- a/arch/riscv/Kconfig.vendor +++ b/arch/riscv/Kconfig.vendor @@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES If you don't know what to do here, say Y. endmenu +menu "MIPS" +config RISCV_ISA_VENDOR_EXT_MIPS + bool "MIPS vendor extension support" + select RISCV_ISA_VENDOR_EXT + default y + help + Say N here to disable detection of and support for all MIPS vendor + extensions. Without this option enabled, MIPS vendor extensions will + not be detected at boot and their presence not reported to userspace. + + If you don't know what to do here, say Y. +endmenu + menu "SiFive" config RISCV_ISA_VENDOR_EXT_SIFIVE bool "SiFive vendor extension support" diff --git a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts index bc5c84f227622e..5f2e5cc3e3d555 100644 --- a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts +++ b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts @@ -17,7 +17,7 @@ #cooling-cells = <2>; }; - i2c-gpio-0 { + i2c-0 { compatible = "i2c-gpio"; sda-gpios = <&pio 3 14 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD14/GPIO44 */ scl-gpios = <&pio 3 15 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD15/GPIO45 */ diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index b3e4d3c18fdcf9..6430c6e25c0017 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -190,7 +190,7 @@ reg-names = "clr", "doorbell"; msi-controller; #msi-cells = <0>; - msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + msi-ranges = <&intc 64 IRQ_TYPE_EDGE_RISING 32>; }; rpgate: clock-controller@7030010368 { diff --git a/arch/riscv/boot/dts/sophgo/sg2044.dtsi b/arch/riscv/boot/dts/sophgo/sg2044.dtsi index 6ec955744b0cbf..320c4d1d08e69c 100644 --- a/arch/riscv/boot/dts/sophgo/sg2044.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2044.dtsi @@ -214,7 +214,7 @@ reg-names = "clr", "doorbell"; #msi-cells = <0>; msi-controller; - msi-ranges = <&intc 352 IRQ_TYPE_LEVEL_HIGH 512>; + msi-ranges = <&intc 352 IRQ_TYPE_EDGE_RISING 512>; status = "disabled"; }; diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile index bc6c77ba837d2d..02a7a3335b1d55 100644 --- a/arch/riscv/errata/Makefile +++ b/arch/riscv/errata/Makefile @@ -13,5 +13,6 @@ endif endif obj-$(CONFIG_ERRATA_ANDES) += andes/ +obj-$(CONFIG_ERRATA_MIPS) += mips/ obj-$(CONFIG_ERRATA_SIFIVE) += sifive/ obj-$(CONFIG_ERRATA_THEAD) += thead/ diff --git a/arch/riscv/errata/mips/Makefile b/arch/riscv/errata/mips/Makefile new file mode 100644 index 00000000000000..6278c389b801ee --- /dev/null +++ b/arch/riscv/errata/mips/Makefile @@ -0,0 +1,5 @@ +ifdef CONFIG_RISCV_ALTERNATIVE_EARLY +CFLAGS_errata.o := -mcmodel=medany +endif + +obj-y += errata.o diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c new file mode 100644 index 00000000000000..e984a8152208c3 --- /dev/null +++ b/arch/riscv/errata/mips/errata.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 MIPS. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static inline bool errata_probe_pause(void) +{ + if (!IS_ENABLED(CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE)) + return false; + + if (!riscv_isa_vendor_extension_available(MIPS_VENDOR_ID, XMIPSEXECTL)) + return false; + + return true; +} + +static u32 mips_errata_probe(void) +{ + u32 cpu_req_errata = 0; + + if (errata_probe_pause()) + cpu_req_errata |= BIT(ERRATA_MIPS_P8700_PAUSE_OPCODE); + + return cpu_req_errata; +} + +void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid, + unsigned int stage) +{ + struct alt_entry *alt; + u32 cpu_req_errata = mips_errata_probe(); + u32 tmp; + + BUILD_BUG_ON(ERRATA_MIPS_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE); + + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) + return; + + for (alt = begin; alt < end; alt++) { + if (alt->vendor_id != MIPS_VENDOR_ID) + continue; + + if (alt->patch_id >= ERRATA_MIPS_NUMBER) { + WARN(1, "MIPS errata id:%d not in kernel errata list\n", + alt->patch_id); + continue; + } + + tmp = (1U << alt->patch_id); + if (cpu_req_errata & tmp) { + mutex_lock(&text_mutex); + patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), + alt->alt_len); + mutex_unlock(&text_mutex); + } + } +} diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 231d777d936c2d..9619bd5c8ebaa3 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -4,7 +4,7 @@ #ifdef CONFIG_RISCV_ALTERNATIVE -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ALT_ENTRY oldptr newptr vendor_id patch_id new_len .4byte \oldptr - . @@ -53,7 +53,7 @@ #define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__ #define __ALTERNATIVE_CFG_2(...) ALTERNATIVE_CFG_2 __VA_ARGS__ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include #include @@ -98,7 +98,7 @@ __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, patch_id_1, enable_1) \ ALT_NEW_CONTENT(vendor_id_2, patch_id_2, enable_2, new_c_2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, CONFIG_k) \ __ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, IS_ENABLED(CONFIG_k)) @@ -109,7 +109,7 @@ new_c_2, vendor_id_2, patch_id_2, IS_ENABLED(CONFIG_k_2)) #else /* CONFIG_RISCV_ALTERNATIVE */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ALTERNATIVE_CFG old_c \old_c @@ -118,12 +118,12 @@ #define __ALTERNATIVE_CFG(old_c, ...) ALTERNATIVE_CFG old_c #define __ALTERNATIVE_CFG_2(old_c, ...) ALTERNATIVE_CFG old_c -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define __ALTERNATIVE_CFG(old_c, ...) old_c "\n" #define __ALTERNATIVE_CFG_2(old_c, ...) old_c "\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ALTERNATIVE_CFG(old_c, ...) __ALTERNATIVE_CFG(old_c) #define _ALTERNATIVE_CFG_2(old_c, ...)
__ALTERNATIVE_CFG_2(old_c) diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 3c2b59b2501792..8407d1d535b852 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -8,7 +8,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_RISCV_ALTERNATIVE @@ -48,6 +48,9 @@ struct alt_entry { void andes_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); +void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid, + unsigned int stage); void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); diff --git a/arch/riscv/include/asm/asm-extable.h b/arch/riscv/include/asm/asm-extable.h index 0c8bfd54fc4e05..37d425d7a76296 100644 --- a/arch/riscv/include/asm/asm-extable.h +++ b/arch/riscv/include/asm/asm-extable.h @@ -10,7 +10,7 @@ #ifdef CONFIG_MMU -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ .pushsection __ex_table, "a"; \ @@ -25,7 +25,7 @@ __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0) .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #include #include @@ -77,7 +77,7 @@ EX_DATA_REG(ADDR, addr) \ ")") -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* CONFIG_MMU */ #define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 2a16e88e13deda..8bd2a11382a390 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -6,7 +6,7 @@ #ifndef _ASM_RISCV_ASM_H #define _ASM_RISCV_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x @@ -30,7 +30,7 @@ #define SRLI __REG_SEL(srliw, srli) #if __SIZEOF_POINTER__ == 8 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define RISCV_PTR .dword #define RISCV_SZPTR 8 #define RISCV_LGPTR 3 @@ -40,7 +40,7 @@ #define RISCV_LGPTR "3" #endif #elif __SIZEOF_POINTER__ == 4 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define RISCV_PTR .word #define RISCV_SZPTR 4 #define RISCV_LGPTR 2 @@ -69,7 +69,7 @@ #error "Unexpected __SIZEOF_SHORT__" #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include /* Common assembly source macros */ @@ -194,6 +194,6 @@ #define ASM_NOKPROBE(name) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_ASM_H */ diff --git a/arch/riscv/include/asm/assembler.h b/arch/riscv/include/asm/assembler.h index 44b1457d3e9567..16931712beab64 100644 --- a/arch/riscv/include/asm/assembler.h +++ b/arch/riscv/include/asm/assembler.h @@ -5,7 +5,7 @@ * Author: Jee Heng Sia */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #error "Only include this from assembly code" #endif diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index b8c5726d86acb1..700ba3f922cb51 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -10,7 +10,7 @@ #ifndef _ASM_RISCV_BARRIER_H #define _ASM_RISCV_BARRIER_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -82,6 +82,6 @@ do { \ #include -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_BARRIER_H */ diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index d59310f74c2ba7..77880677b06e03 100644 --- a/arch/riscv/include/asm/bitops.h +++ 
b/arch/riscv/include/asm/bitops.h @@ -45,7 +45,7 @@ #error "Unexpected BITS_PER_LONG" #endif -static __always_inline unsigned long variable__ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) @@ -74,7 +74,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) -static __always_inline unsigned long variable__fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__fls(unsigned long word) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) @@ -103,7 +103,7 @@ static __always_inline unsigned long variable__fls(unsigned long word) (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \ variable__fls(word)) -static __always_inline int variable_ffs(int x) +static __always_inline __attribute_const__ int variable_ffs(int x) { asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) diff --git a/arch/riscv/include/asm/cache.h b/arch/riscv/include/asm/cache.h index 570e9d8acad1e5..eb42b739d1328c 100644 --- a/arch/riscv/include/asm/cache.h +++ b/arch/riscv/include/asm/cache.h @@ -24,7 +24,7 @@ #define ARCH_SLAB_MINALIGN 16 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int dma_cache_alignment; #ifdef CONFIG_RISCV_DMA_NONCOHERENT @@ -35,6 +35,6 @@ static inline int dma_get_cache_alignment(void) } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CACHE_H */ diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h index 4508aaa7a2fdb1..710aa8192edd97 100644 --- a/arch/riscv/include/asm/cfi.h +++ b/arch/riscv/include/asm/cfi.h @@ -11,7 +11,7 @@ struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall #else @@ -19,6 +19,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #endif /* _ASM_RISCV_CFI_H */ diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 0b749e71021624..122e1485d39a0a 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -14,6 +14,7 @@ #include #include #include +#include #define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append, \ swap_append, r, p, n) \ @@ -133,6 +134,7 @@ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \ IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ @@ -180,6 +182,7 @@ r, p, co, o, n) \ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ \ @@ -315,7 +318,7 @@ arch_cmpxchg_release((ptr), (o), (n)); \ }) -#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) +#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) && defined(CONFIG_TOOLCHAIN_HAS_ZACAS) #define system_has_cmpxchg128() riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS) @@ -351,7 +354,7 @@ union __u128_halves { #define arch_cmpxchg128_local(ptr, o, n) \ __arch_cmpxchg128((ptr), (o), (n), "") -#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS */ +#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS && CONFIG_TOOLCHAIN_HAS_ZACAS */ 
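The cmpxchg.h hunks above apply a consistent pattern: a Zacas code path is only emitted when the kernel configuration opts in (CONFIG_RISCV_ISA_ZACAS), the toolchain can assemble the instructions (CONFIG_TOOLCHAIN_HAS_ZACAS), and the running hart actually reports the extension. A minimal sketch of the same gating shape, using the kernel helpers visible in the hunks; can_use_zacas() itself is a hypothetical name, not a kernel symbol:

    /* Sketch: compile-time options plus a run-time probe gate one feature. */
    static inline bool can_use_zacas(void)
    {
    	/* IS_ENABLED() folds to 0/1, so this vanishes when either is off. */
    	if (!IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) ||
    	    !IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS))
    		return false;

    	/* Resolved at boot through the alternatives mechanism. */
    	return riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS);
    }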
#ifdef CONFIG_RISCV_ISA_ZAWRS /* @@ -438,7 +441,7 @@ static __always_inline void __cmpwait(volatile void *ptr, return; no_zawrs: - asm volatile(RISCV_PAUSE : : : "memory"); + ALT_RISCV_PAUSE(); } #define __cmpwait_relaxed(ptr, val) \ diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h index d6e4665b31954c..776fa55fbaa456 100644 --- a/arch/riscv/include/asm/cpu_ops_sbi.h +++ b/arch/riscv/include/asm/cpu_ops_sbi.h @@ -5,7 +5,7 @@ #ifndef __ASM_CPU_OPS_SBI_H #define __ASM_CPU_OPS_SBI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 6fed42e377059c..4a37a98398ad3b 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -513,7 +513,7 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -575,6 +575,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h index 21774d868c65bd..ba5aa72aff631a 100644 --- a/arch/riscv/include/asm/current.h +++ b/arch/riscv/include/asm/current.h @@ -13,7 +13,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; @@ -35,6 +35,6 @@ static __always_inline struct task_struct *get_current(void) register unsigned long current_stack_pointer __asm__("sp"); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CURRENT_H */ diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 6e426ed7919a4a..6694b5ccdcf85c 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -5,31 +5,14 @@ #ifndef ASM_ERRATA_LIST_H #define ASM_ERRATA_LIST_H -#include #include #include #include #include +#include +#include -#ifdef CONFIG_ERRATA_ANDES -#define ERRATA_ANDES_NO_IOCP 0 -#define ERRATA_ANDES_NUMBER 1 -#endif - -#ifdef CONFIG_ERRATA_SIFIVE -#define ERRATA_SIFIVE_CIP_453 0 -#define ERRATA_SIFIVE_CIP_1200 1 -#define ERRATA_SIFIVE_NUMBER 2 -#endif - -#ifdef CONFIG_ERRATA_THEAD -#define ERRATA_THEAD_MAE 0 -#define ERRATA_THEAD_PMU 1 -#define ERRATA_THEAD_GHOSTWRITE 2 -#define ERRATA_THEAD_NUMBER 3 -#endif - -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define ALT_INSN_FAULT(x) \ ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault), \ @@ -42,7 +25,7 @@ ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \ __stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \ SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ CONFIG_ERRATA_SIFIVE_CIP_453) -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define ALT_SFENCE_VMA_ASID(asid) \ asm(ALTERNATIVE("sfence.vma x0, %0", "sfence.vma", SIFIVE_VENDOR_ID, \ @@ -59,6 +42,17 @@ asm(ALTERNATIVE("sfence.vma %0, %1", "sfence.vma", SIFIVE_VENDOR_ID, \ ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200) \ : : "r" (addr), "r" (asid) : "memory") +#define ALT_RISCV_PAUSE() \ +asm(ALTERNATIVE( \ + RISCV_PAUSE, /* Original RISC-V pause insn */ \ + MIPS_PAUSE, /* Replacement for MIPS P8700 */ \ + MIPS_VENDOR_ID, /* Vendor ID to match */ \ + ERRATA_MIPS_P8700_PAUSE_OPCODE, /* patch_id */ \ + CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE) \ + : /* no outputs */ \ + : /* no inputs */ \ + : "memory") + /* * _val is marked as "will be overwritten", so need to set it to 0 * in the default case.
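ALT_RISCV_PAUSE() lets the alternatives framework patch the standard pause into the MIPS P8700 hint at boot. Since the Kconfig help describes the replacement as an SLLI hint with rd=0 and rs1=0, the opcode bytes can be checked against the standard I-type layout; RV_ITYPE below is a local illustration, not a kernel macro:

    /* I-type word: imm[11:0] | rs1 | funct3 | rd | opcode */
    #define RV_ITYPE(imm, rs1, funct3, rd, op) \
    	(((imm) << 20) | ((rs1) << 15) | ((funct3) << 12) | ((rd) << 7) | (op))

    /* slli x0, x0, imm: funct3 = 001, opcode = 0010011 (OP-IMM) */
    _Static_assert(RV_ITYPE(5, 0, 0x1, 0, 0x13) == 0x00501013, "MIPS.PAUSE");
    _Static_assert(RV_ITYPE(3, 0, 0x1, 0, 0x13) == 0x00301013, "MIPS.EHB");
    _Static_assert(RV_ITYPE(1, 0, 0x1, 0, 0x13) == 0x00101013, "MIPS.IHB");

The three constants match the MIPS_PAUSE, MIPS_EHB and MIPS_IHB definitions added in vendor_extensions/mips.h further down.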
@@ -123,6 +117,6 @@ asm volatile(ALTERNATIVE( \ #define THEAD_C9XX_RV_IRQ_PMU 17 #define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5 -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/riscv/include/asm/errata_list_vendors.h b/arch/riscv/include/asm/errata_list_vendors.h new file mode 100644 index 00000000000000..ec7eba3734371a --- /dev/null +++ b/arch/riscv/include/asm/errata_list_vendors.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef ASM_ERRATA_LIST_VENDORS_H +#define ASM_ERRATA_LIST_VENDORS_H + +#ifdef CONFIG_ERRATA_ANDES +#define ERRATA_ANDES_NO_IOCP 0 +#define ERRATA_ANDES_NUMBER 1 +#endif + +#ifdef CONFIG_ERRATA_SIFIVE +#define ERRATA_SIFIVE_CIP_453 0 +#define ERRATA_SIFIVE_CIP_1200 1 +#define ERRATA_SIFIVE_NUMBER 2 +#endif + +#ifdef CONFIG_ERRATA_THEAD +#define ERRATA_THEAD_MAE 0 +#define ERRATA_THEAD_PMU 1 +#define ERRATA_THEAD_GHOSTWRITE 2 +#define ERRATA_THEAD_NUMBER 3 +#endif + +#ifdef CONFIG_ERRATA_MIPS +#define ERRATA_MIPS_P8700_PAUSE_OPCODE 0 +#define ERRATA_MIPS_NUMBER 1 +#endif + +#endif /* ASM_ERRATA_LIST_VENDORS_H */ diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 22ebea3c2b26c1..e5026cd8f022f5 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,7 +13,7 @@ #endif #define ARCH_SUPPORTS_FTRACE_OPS 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void *return_address(unsigned int level); @@ -112,7 +112,7 @@ do { \ #define MCOUNT_JALR_SIZE 4 #define MCOUNT_NOP4_SIZE 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct dyn_ftrace; int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #define ftrace_init_nop ftrace_init_nop @@ -235,7 +235,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/riscv/include/asm/gpr-num.h b/arch/riscv/include/asm/gpr-num.h index efeb5edf8a3af1..b499cf83273415 100644 --- a/arch/riscv/include/asm/gpr-num.h +++ b/arch/riscv/include/asm/gpr-num.h @@ -2,7 +2,7 @@ #ifndef __ASM_GPR_NUM_H #define __ASM_GPR_NUM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .equ .L__gpr_num_x\num, \num @@ -41,7 +41,7 @@ .equ .L__gpr_num_t5, 30 .equ .L__gpr_num_t6, 31 -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define __DEFINE_ASM_GPR_NUMS \ " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \ @@ -80,6 +80,6 @@ " .equ .L__gpr_num_t5, 30\n" \ " .equ .L__gpr_num_t6, 31\n" -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_GPR_NUM_H */ diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 7fe0a379474ae2..948d2b34e94e84 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 13 +#define RISCV_HWPROBE_MAX_KEY 14 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { @@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) case RISCV_HWPROBE_KEY_IMA_EXT_0: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: + case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: return true; } diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h index 8927a6ea1127e2..899254966e8585 100644 --- 
a/arch/riscv/include/asm/image.h +++ b/arch/riscv/include/asm/image.h @@ -29,7 +29,7 @@ #define RISCV_HEADER_VERSION (RISCV_HEADER_VERSION_MAJOR << 16 | \ RISCV_HEADER_VERSION_MINOR) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define riscv_image_flag_field(flags, field)\ (((flags) >> field##_SHIFT) & field##_MASK) /** @@ -63,5 +63,5 @@ struct riscv_image_header { u32 magic2; u32 res3; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_IMAGE_H */ diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index d5adbaec1d010d..c9cfcea52cbbcf 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -25,7 +25,7 @@ #define INSN_S_SIMM5_SHIFT 7 #define INSN_S_OPCODE_SHIFT 0 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_AS_HAS_INSN @@ -77,7 +77,7 @@ #define __INSN_I(...) insn_i __VA_ARGS__ #define __INSN_S(...) insn_s __VA_ARGS__ -#else /* ! __ASSEMBLY__ */ +#else /* ! __ASSEMBLER__ */ #ifdef CONFIG_AS_HAS_INSN @@ -153,7 +153,7 @@ #endif -#endif /* ! __ASSEMBLY__ */ +#endif /* ! __ASSEMBLER__ */ #define INSN_R(opcode, func3, func7, rd, rs1, rs2) \ __INSN_R(RV_##opcode, RV_##func3, RV_##func7, \ @@ -263,7 +263,7 @@ #define RISCV_INSN_NOP4 _AC(0x00000013, U) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define nop() __asm__ __volatile__ ("nop") #define __nops(n) ".rept " #n "\nnop\n.endr\n" #define nops(n) __asm__ __volatile__ (__nops(n)) diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 09fde95a5e8f75..c3005573e8c999 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -64,6 +64,7 @@ #define RVG_RS2_OPOFF 20 #define RVG_RD_OPOFF 7 #define RVG_RS1_MASK GENMASK(4, 0) +#define RVG_RS2_MASK GENMASK(4, 0) #define RVG_RD_MASK GENMASK(4, 0) /* The bit field of immediate value in RVC J instruction */ @@ -286,45 +287,216 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) (code & RVC_INSN_J_RS1_MASK) != 0; } -#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) -#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) -#define RV_X(X, s, mask) (((X) >> (s)) & (mask)) -#define RVC_X(X, s, mask) RV_X(X, s, mask) +#define INSN_MATCH_LB 0x3 +#define INSN_MASK_LB 0x707f +#define INSN_MATCH_LH 0x1003 +#define INSN_MASK_LH 0x707f +#define INSN_MATCH_LW 0x2003 +#define INSN_MASK_LW 0x707f +#define INSN_MATCH_LD 0x3003 +#define INSN_MASK_LD 0x707f +#define INSN_MATCH_LBU 0x4003 +#define INSN_MASK_LBU 0x707f +#define INSN_MATCH_LHU 0x5003 +#define INSN_MASK_LHU 0x707f +#define INSN_MATCH_LWU 0x6003 +#define INSN_MASK_LWU 0x707f +#define INSN_MATCH_SB 0x23 +#define INSN_MASK_SB 0x707f +#define INSN_MATCH_SH 0x1023 +#define INSN_MASK_SH 0x707f +#define INSN_MATCH_SW 0x2023 +#define INSN_MASK_SW 0x707f +#define INSN_MATCH_SD 0x3023 +#define INSN_MASK_SD 0x707f + +#define INSN_MATCH_C_LD 0x6000 +#define INSN_MASK_C_LD 0xe003 +#define INSN_MATCH_C_SD 0xe000 +#define INSN_MASK_C_SD 0xe003 +#define INSN_MATCH_C_LW 0x4000 +#define INSN_MASK_C_LW 0xe003 +#define INSN_MATCH_C_SW 0xc000 +#define INSN_MASK_C_SW 0xe003 +#define INSN_MATCH_C_LDSP 0x6002 +#define INSN_MASK_C_LDSP 0xe003 +#define INSN_MATCH_C_SDSP 0xe002 +#define INSN_MASK_C_SDSP 0xe003 +#define INSN_MATCH_C_LWSP 0x4002 +#define INSN_MASK_C_LWSP 0xe003 +#define INSN_MATCH_C_SWSP 0xc002 +#define INSN_MASK_C_SWSP 0xe003 + +#define INSN_OPCODE_MASK 0x007c +#define INSN_OPCODE_SHIFT 2 +#define INSN_OPCODE_SYSTEM 28 + +#define INSN_MASK_WFI 0xffffffff +#define INSN_MATCH_WFI 0x10500073 + +#define INSN_MASK_WRS 0xffffffff 
+#define INSN_MATCH_WRS 0x00d00073 + +#define INSN_MATCH_CSRRW 0x1073 +#define INSN_MASK_CSRRW 0x707f +#define INSN_MATCH_CSRRS 0x2073 +#define INSN_MASK_CSRRS 0x707f +#define INSN_MATCH_CSRRC 0x3073 +#define INSN_MASK_CSRRC 0x707f +#define INSN_MATCH_CSRRWI 0x5073 +#define INSN_MASK_CSRRWI 0x707f +#define INSN_MATCH_CSRRSI 0x6073 +#define INSN_MASK_CSRRSI 0x707f +#define INSN_MATCH_CSRRCI 0x7073 +#define INSN_MASK_CSRRCI 0x707f + +#define INSN_MATCH_FLW 0x2007 +#define INSN_MASK_FLW 0x707f +#define INSN_MATCH_FLD 0x3007 +#define INSN_MASK_FLD 0x707f +#define INSN_MATCH_FLQ 0x4007 +#define INSN_MASK_FLQ 0x707f +#define INSN_MATCH_FSW 0x2027 +#define INSN_MASK_FSW 0x707f +#define INSN_MATCH_FSD 0x3027 +#define INSN_MASK_FSD 0x707f +#define INSN_MATCH_FSQ 0x4027 +#define INSN_MASK_FSQ 0x707f + +#define INSN_MATCH_C_FLD 0x2000 +#define INSN_MASK_C_FLD 0xe003 +#define INSN_MATCH_C_FLW 0x6000 +#define INSN_MASK_C_FLW 0xe003 +#define INSN_MATCH_C_FSD 0xa000 +#define INSN_MASK_C_FSD 0xe003 +#define INSN_MATCH_C_FSW 0xe000 +#define INSN_MASK_C_FSW 0xe003 +#define INSN_MATCH_C_FLDSP 0x2002 +#define INSN_MASK_C_FLDSP 0xe003 +#define INSN_MATCH_C_FSDSP 0xa002 +#define INSN_MASK_C_FSDSP 0xe003 +#define INSN_MATCH_C_FLWSP 0x6002 +#define INSN_MASK_C_FLWSP 0xe003 +#define INSN_MATCH_C_FSWSP 0xe002 +#define INSN_MASK_C_FSWSP 0xe003 + +#define INSN_MATCH_C_LHU 0x8400 +#define INSN_MASK_C_LHU 0xfc43 +#define INSN_MATCH_C_LH 0x8440 +#define INSN_MASK_C_LH 0xfc43 +#define INSN_MATCH_C_SH 0x8c00 +#define INSN_MASK_C_SH 0xfc43 + +#define INSN_16BIT_MASK 0x3 +#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) +#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4) + +#define SHIFT_RIGHT(x, y) \ + ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) + +#define REG_MASK \ + ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) + +#define REG_OFFSET(insn, pos) \ + (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) + +#define REG_PTR(insn, pos, regs) \ + ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) + +#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) +#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) +#define GET_SP(regs) (*REG_PTR(2, 0, regs)) +#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) +#define IMM_I(insn) ((s32)(insn) >> 20) +#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ + (s32)(((insn) >> 7) & 0x1f)) + +#define SH_RD 7 +#define SH_RS1 15 +#define SH_RS2 20 +#define SH_RS2C 2 +#define MASK_RX 0x1f + +#if defined(CONFIG_64BIT) +#define LOG_REGBYTES 3 +#else +#define LOG_REGBYTES 2 +#endif + +#define MASK_FUNCT3 0x7000 + +#define GET_FUNCT3(insn) (((insn) >> 12) & 7) + +#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1)) +#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1)) +#define RV_X_MASK(X, s, mask) (((X) >> (s)) & (mask)) +#define RV_X(X, s, n) RV_X_MASK(X, s, ((1 << (n)) - 1)) +#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ + (RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 1) << 6)) +#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 2) << 6)) +#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 2) << 6)) +#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 3) << 6)) +#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ + (RV_X(x, 7, 2) << 6)) +#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 
3) | \ + (RV_X(x, 7, 3) << 6)) +#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) +#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) +#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) +#define RVC_X(X, s, mask) RV_X_MASK(X, s, mask) + +#define RV_EXTRACT_FUNCT3(x) \ + ({typeof(x) x_ = (x); \ + (RV_X_MASK(x_, RV_INSN_FUNCT3_OPOFF, \ + RV_INSN_FUNCT3_MASK >> RV_INSN_FUNCT3_OPOFF)); }) #define RV_EXTRACT_RS1_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) + (RV_X_MASK(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) + +#define RV_EXTRACT_RS2_REG(x) \ + ({typeof(x) x_ = (x); \ + (RV_X_MASK(x_, RVG_RS2_OPOFF, RVG_RS2_MASK)); }) #define RV_EXTRACT_RD_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) + (RV_X_MASK(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) #define RV_EXTRACT_UTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); }) + (RV_X_MASK(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); }) #define RV_EXTRACT_JTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \ - (RV_X(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \ - (RV_X(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \ + (RV_X_MASK(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \ (RV_IMM_SIGN(x_) << RV_J_IMM_SIGN_OFF); }) #define RV_EXTRACT_ITYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \ + (RV_X_MASK(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \ (RV_IMM_SIGN(x_) << RV_I_IMM_SIGN_OFF); }) #define RV_EXTRACT_BTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \ - (RV_X(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \ - (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \ + (RV_X_MASK(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ (RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); }) #define RVC_EXTRACT_C2_RS1_REG(x) \ ({typeof(x) x_ = (x); \ - (RV_X(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); }) + (RV_X_MASK(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); }) #define RVC_EXTRACT_JTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ @@ -346,13 +518,13 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code) (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); }) #define RVG_EXTRACT_SYSTEM_CSR(x) \ - ({typeof(x) x_ = (x); RV_X(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); }) + ({typeof(x) x_ = (x); RV_X_MASK(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); }) #define RVFDQ_EXTRACT_FL_FS_WIDTH(x) \ - ({typeof(x) x_ = (x); RV_X(x_, RVFDQ_FL_FS_WIDTH_OFF, \ + ({typeof(x) x_ = (x); RV_X_MASK(x_, RVFDQ_FL_FS_WIDTH_OFF, \ RVFDQ_FL_FS_WIDTH_MASK); }) -#define RVV_EXRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x) +#define RVV_EXTRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x) /* * Get the immediate from a J-type instruction. 
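The insn.h rework above renames the raw extractor to RV_X_MASK(X, s, mask) (shift, then AND with a caller-supplied mask) and redefines RV_X(X, s, n) to take a field width instead, building the mask as (1 << n) - 1. A standalone sanity check of that equivalence; the sample encoding is only an illustration:

    #include <assert.h>

    #define RV_X_MASK(X, s, mask)	(((X) >> (s)) & (mask))
    #define RV_X(X, s, n)		RV_X_MASK(X, s, ((1 << (n)) - 1))

    int main(void)
    {
    	unsigned int insn = 0x00c58533;	/* add a0, a1, a2 */

    	/* rd is the 5-bit field at bit 7; both spellings agree. */
    	assert(RV_X(insn, 7, 5) == RV_X_MASK(insn, 7, 0x1f));
    	assert(RV_X(insn, 7, 5) == 10);	/* rd == x10 == a0 */
    	return 0;
    }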
@@ -375,10 +547,10 @@ static inline void riscv_insn_insert_jtype_imm(u32 *insn, s32 imm) { /* drop the old IMMs, all jal IMM bits sit at 31:12 */ *insn &= ~GENMASK(31, 12); - *insn |= (RV_X(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) | - (RV_X(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) | - (RV_X(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) | - (RV_X(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF); + *insn |= (RV_X_MASK(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) | + (RV_X_MASK(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF); } /* diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index a0e51840b9db43..09bb5f57a9d346 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -28,6 +28,10 @@ #ifdef CONFIG_MMU #define IO_SPACE_LIMIT (PCI_IO_SIZE - 1) #define PCI_IOBASE ((void __iomem *)PCI_IO_START) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), __pgprot(_PAGE_KERNEL_NC)) + #endif /* CONFIG_MMU */ /* diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 87a71cc6d146ce..3ab5f2e3212bec 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -7,7 +7,7 @@ #ifndef __ASM_JUMP_LABEL_H #define __ASM_JUMP_LABEL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -66,5 +66,5 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke return true; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_JUMP_LABEL_H */ diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index e6a0071bdb56c4..60af6691f90321 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -4,7 +4,7 @@ #ifndef __ASM_KASAN_H #define __ASM_KASAN_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following comment was copied from arm64: diff --git a/arch/riscv/include/asm/kgdb.h b/arch/riscv/include/asm/kgdb.h index cc11c4544cffd1..7559d728c5ff9b 100644 --- a/arch/riscv/include/asm/kgdb.h +++ b/arch/riscv/include/asm/kgdb.h @@ -17,12 +17,12 @@ #define BREAK_INSTR_SIZE 4 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void arch_kgdb_breakpoint(void); extern unsigned long kgdb_compiled_break; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define DBG_REG_ZERO "zero" #define DBG_REG_RA "ra" diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 1cc90465d75b18..cf8e6eac77d520 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -7,7 +7,7 @@ #ifndef _ASM_RISCV_MMU_H #define _ASM_RISCV_MMU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef struct { #ifndef CONFIG_MMU @@ -40,6 +40,6 @@ typedef struct { void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, pgprot_t prot); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 572a141ddecdb1..ffe213ad65a4ee 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -41,7 +41,7 @@ #define PAGE_OFFSET ((unsigned long)phys_ram_base) #endif /* CONFIG_MMU */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_RISCV_ISA_ZICBOZ void clear_page(void 
*page); @@ -199,7 +199,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn) return __va(pfn << PAGE_SHIFT); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define virt_addr_valid(vaddr) ({ \ unsigned long _addr = (unsigned long)vaddr; \ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 91697fbf1f9013..29e994a9afb67c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -111,7 +111,7 @@ #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -203,6 +203,7 @@ extern struct pt_alloc_ops pt_ops __meminitdata; #define PAGE_TABLE __pgprot(_PAGE_TABLE) +#define _PAGE_KERNEL_NC ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_NOCACHE) #define _PAGE_IOREMAP ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_IO) #define PAGE_KERNEL_IO __pgprot(_PAGE_IOREMAP) @@ -942,6 +943,23 @@ static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ +#ifdef CONFIG_SMP + pud_t pud = __pud(xchg(&pudp->pud, 0)); +#else + pud_t pud = *pudp; + + pud_clear(pudp); +#endif + + page_table_check_pud_clear(mm, pud); + + return pud; +} + static inline int pud_young(pud_t pud) { return pte_young(pud_pte(pud)); @@ -1118,6 +1136,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_RISCV_PGTABLE_H */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 24d3af4d3807e3..da5426122d280b 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -54,7 +54,7 @@ #define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; struct pt_regs; @@ -215,6 +215,6 @@ long get_tagged_addr_ctrl(struct task_struct *task); #define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index a7dc0e33075796..addc8188152f7f 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -10,7 +10,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pt_regs { unsigned long epc; @@ -180,6 +180,6 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) return !(regs->status & SR_PIE); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/include/asm/scs.h b/arch/riscv/include/asm/scs.h index 0e45db78b24bf2..ab7714aa93bdc4 100644 --- a/arch/riscv/include/asm/scs.h +++ b/arch/riscv/include/asm/scs.h @@ -2,7 +2,7 @@ #ifndef _ASM_SCS_H #define _ASM_SCS_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include #ifdef CONFIG_SHADOW_CALL_STACK @@ -49,6 +49,6 @@ .endm #endif /* CONFIG_SHADOW_CALL_STACK */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_SCS_H */ diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index ea263d3683ef6f..87389e93325a3b 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -6,7 +6,7 @@ #ifndef _ASM_RISCV_SET_MEMORY_H #define 
_ASM_RISCV_SET_MEMORY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Functions to change memory attributes. */ @@ -45,7 +45,7 @@ int set_direct_map_default_noflush(struct page *page); int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_XIP_KERNEL) #ifdef CONFIG_64BIT diff --git a/arch/riscv/include/asm/swab.h b/arch/riscv/include/asm/swab.h new file mode 100644 index 00000000000000..c1da22aa13268f --- /dev/null +++ b/arch/riscv/include/asm/swab.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_RISCV_SWAB_H +#define _ASM_RISCV_SWAB_H + +#include +#include +#include +#include +#include + +#if defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE) + +// Duplicated from include/uapi/linux/swab.h +#define ___constant_swab16(x) ((__u16)( \ + (((__u16)(x) & (__u16)0x00ffU) << 8) | \ + (((__u16)(x) & (__u16)0xff00U) >> 8))) + +#define ___constant_swab32(x) ((__u32)( \ + (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \ + (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \ + (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \ + (((__u32)(x) & (__u32)0xff000000UL) >> 24))) + +#define ___constant_swab64(x) ((__u64)( \ + (((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \ + (((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \ + (((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \ + (((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \ + (((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \ + (((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ + (((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \ + (((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56))) + +#define ARCH_SWAB(size, value) \ +({ \ + unsigned long x = value; \ + \ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBB)) { \ + asm volatile (".option push\n" \ + ".option arch,+zbb\n" \ + "rev8 %0, %1\n" \ + ".option pop\n" \ + : "=r" (x) : "r" (x)); \ + x = x >> (BITS_PER_LONG - size); \ + } else { \ + x = ___constant_swab##size(value); \ + } \ + x; \ +}) + +static __always_inline __u16 __arch_swab16(__u16 value) +{ + return ARCH_SWAB(16, value); +} + +static __always_inline __u32 __arch_swab32(__u32 value) +{ + return ARCH_SWAB(32, value); +} + +#ifdef CONFIG_64BIT +static __always_inline __u64 __arch_swab64(__u64 value) +{ + return ARCH_SWAB(64, value); +} +#else +static __always_inline __u64 __arch_swab64(__u64 value) +{ + __u32 h = value >> 32; + __u32 l = value & ((1ULL << 32) - 1); + + return ((__u64)(__arch_swab32(l)) << 32) | ((__u64)(__arch_swab32(h))); +} +#endif + +#define __arch_swab64 __arch_swab64 +#define __arch_swab32 __arch_swab32 +#define __arch_swab16 __arch_swab16 + +#undef ___constant_swab16 +#undef ___constant_swab32 +#undef ___constant_swab64 + +#undef ARCH_SWAB + +#endif /* defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE) */ +#endif /* _ASM_RISCV_SWAB_H */ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index f5916a70879a87..836d80dd29210d 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -37,7 +37,7 @@ #define IRQ_STACK_SIZE THREAD_SIZE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -98,7 +98,7 @@ struct thread_info { void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct 
*dst, struct task_struct *src); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags @@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SIGPENDING 3 /* signal pending */ -#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ -#define TIF_MEMDIE 5 /* is terminating due to OOM killer */ -#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ -#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ -#define TIF_32BIT 11 /* compat-mode 32bit process */ -#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ - -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) + +/* + * Tell the generic TIF infrastructure which bits riscv supports + */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include + +#define TIF_32BIT 16 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returning to user */ + +#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE) #endif /* _ASM_RISCV_THREAD_INFO_H */ diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index c130d8100232cb..f80357fe24d111 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -16,7 +16,7 @@ #define __VDSO_PAGES 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #define VDSO_SYMBOL(base, name) \ @@ -34,7 +34,7 @@ extern char compat_vdso_start[], compat_vdso_end[]; extern char vdso_start[], vdso_end[]; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h index c6d66895c1f585..ab4aef9550998e 100644 --- a/arch/riscv/include/asm/vdso/getrandom.h +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -25,6 +25,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index 29164f84f93cec..9ec08fa04d35a3 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -79,6 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return csr_read(CSR_TIME); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index 8f383f05a290f1..c42f95dc8811d8 100644 ---
a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -2,9 +2,10 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include +#include #include static inline void cpu_relax(void) @@ -19,10 +20,10 @@ static inline void cpu_relax(void) * Reduce instruction retirement. * This assumes the PC changes. */ - __asm__ __volatile__ (RISCV_PAUSE); + ALT_RISCV_PAUSE(); barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h index 1140b54b4bc827..558eb9dfda5203 100644 --- a/arch/riscv/include/asm/vdso/vsyscall.h +++ b/arch/riscv/include/asm/vdso/vsyscall.h @@ -2,13 +2,13 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* The asm-generic header needs to be included after the definitions above */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/riscv/include/asm/vendor_extensions/mips.h b/arch/riscv/include/asm/vendor_extensions/mips.h new file mode 100644 index 00000000000000..ea8ca747d691df --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/mips.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 MIPS. + */ + +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H + +#include + +#define RISCV_ISA_VENDOR_EXT_XMIPSEXECTL 0 + +#ifndef __ASSEMBLER__ struct riscv_isa_vendor_ext_data_list; +extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips; +#endif + +/* Extension specific instructions */ + +/* + * All of the xmipsexectl extension instructions are + * 'hint' encodings of the SLLI instruction, + * with rd = 0, rs1 = 0 and imm = 1 for IHB, imm = 3 for EHB, + * and imm = 5 for PAUSE. + * MIPS.PAUSE is an alternative opcode which is implemented to have the + * same behavior as PAUSE on some MIPS RISCV cores. + * MIPS.EHB clears all execution hazards before allowing + * any subsequent instructions to execute. + * MIPS.IHB clears all instruction hazards before + * allowing any subsequent instructions to be fetched. + */ + +#define MIPS_PAUSE ".4byte 0x00501013\n\t" +#define MIPS_EHB ".4byte 0x00301013\n\t" +#define MIPS_IHB ".4byte 0x00101013\n\t" + +#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H diff --git a/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h new file mode 100644 index 00000000000000..e63f664b6b1766 --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 MIPS. 
+ */ + +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ +#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ + +#include +#include + +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS +void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, const struct cpumask *cpus); +#else +static inline void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + pair->value = 0; +} +#endif + +#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_ diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index a5150cdf34d87f..3b09874d7a6dfb 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -9,5 +9,6 @@ #define MICROCHIP_VENDOR_ID 0x029 #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 +#define MIPS_VENDOR_ID 0x722 #endif diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index aaf6ad97049931..5d30a4fae37a82 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -106,6 +106,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0 11 #define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE 12 #define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 +#define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0 14 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index ef27d4289da118..251099d860aa46 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -9,7 +9,7 @@ #ifndef __LINUX_KVM_RISCV_H #define __LINUX_KVM_RISCV_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index a38268b19c3d3d..beff8df80ac9c3 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -6,7 +6,7 @@ #ifndef _UAPI_ASM_RISCV_PTRACE_H #define _UAPI_ASM_RISCV_PTRACE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -127,6 +127,6 @@ struct __riscv_v_regset_state { */ #define RISCV_MAX_VLENB (8192) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h index cd4f175dc83763..748dffc9ae194c 100644 --- a/arch/riscv/include/uapi/asm/sigcontext.h +++ b/arch/riscv/include/uapi/asm/sigcontext.h @@ -15,7 +15,7 @@ /* The size of END signal context header. 
*/ #define END_HDR_SIZE 0x0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct __sc_riscv_v_state { struct __riscv_v_ext_state v_state; @@ -35,6 +35,6 @@ struct sigcontext { }; }; -#endif /*!__ASSEMBLY__*/ +#endif /*!__ASSEMBLER__*/ #endif /* _UAPI_ASM_RISCV_SIGCONTEXT_H */ diff --git a/arch/riscv/include/uapi/asm/vendor/mips.h b/arch/riscv/include/uapi/asm/vendor/mips.h new file mode 100644 index 00000000000000..e65ab268b26551 --- /dev/null +++ b/arch/riscv/include/uapi/asm/vendor/mips.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#define RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL BIT(0) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c7b542573407c8..f60fce69b7259f 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -113,7 +113,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 3f6d5a6789e878..71698ee11621ac 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include @@ -160,6 +161,8 @@ void __init acpi_boot_table_init(void) early_init_dt_scan_chosen_stdout(); } else { acpi_parse_spcr(earlycon_acpi_spcr_enable, true); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } } diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 7eb3cb1215c621..7642704c7f1841 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -47,6 +47,11 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info cpu_mfr_info->patch_func = andes_errata_patch_func; break; #endif +#ifdef CONFIG_ERRATA_MIPS + case MIPS_VENDOR_ID: + cpu_mfr_info->patch_func = mips_errata_patch_func; + break; +#endif #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: cpu_mfr_info->patch_func = sifive_errata_patch_func; diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 6e8c0d6feae9e9..7d42d3b8a32a75 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 743d53415572e0..67b59699357da8 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -474,10 +474,10 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(zacas, RISCV_ISA_EXT_ZACAS), __RISCV_ISA_EXT_DATA(zalrsc, RISCV_ISA_EXT_ZALRSC), __RISCV_ISA_EXT_DATA(zawrs, RISCV_ISA_EXT_ZAWRS), - __RISCV_ISA_EXT_DATA(zfa, RISCV_ISA_EXT_ZFA), + __RISCV_ISA_EXT_DATA_VALIDATE(zfa, RISCV_ISA_EXT_ZFA, riscv_ext_f_depends), __RISCV_ISA_EXT_DATA_VALIDATE(zfbfmin, RISCV_ISA_EXT_ZFBFMIN, riscv_ext_f_depends), - __RISCV_ISA_EXT_DATA(zfh, RISCV_ISA_EXT_ZFH), - __RISCV_ISA_EXT_DATA(zfhmin, RISCV_ISA_EXT_ZFHMIN), + __RISCV_ISA_EXT_DATA_VALIDATE(zfh, RISCV_ISA_EXT_ZFH, riscv_ext_f_depends), + __RISCV_ISA_EXT_DATA_VALIDATE(zfhmin, RISCV_ISA_EXT_ZFHMIN, riscv_ext_f_depends), __RISCV_ISA_EXT_DATA(zca, RISCV_ISA_EXT_ZCA), __RISCV_ISA_EXT_DATA_VALIDATE(zcb, RISCV_ISA_EXT_ZCB, riscv_ext_zca_depends), __RISCV_ISA_EXT_DATA_VALIDATE(zcd, RISCV_ISA_EXT_ZCD, 
riscv_ext_zcd_validate), diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index d0ded2438533c4..d3d92a4becc726 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index b9eb41b0a97519..dd9d92a9651746 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -15,6 +15,7 @@ #include #include #include +#include const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, @@ -109,7 +110,6 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, } #endif -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) #define RISCV_IMM_BITS 12 #define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) #define RISCV_CONST_HIGH_PART(x) \ diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile index 7dd15be69c9007..bc098edac89813 100644 --- a/arch/riscv/kernel/pi/Makefile +++ b/arch/riscv/kernel/pi/Makefile @@ -39,4 +39,4 @@ $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE $(call if_changed_rule,cc_o_c) obj-y := cmdline_early.pi.o fdt_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o archrandom_early.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +targets := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c index fbcdc9e4e14322..389d086a071876 100644 --- a/arch/riscv/kernel/pi/cmdline_early.c +++ b/arch/riscv/kernel/pi/cmdline_early.c @@ -41,9 +41,9 @@ static char *get_early_cmdline(uintptr_t dtb_pa) static u64 match_noXlvl(char *cmdline) { if (strstr(cmdline, "no4lvl")) - return SATP_MODE_48; + return SATP_MODE_39; else if (strstr(cmdline, "no5lvl")) - return SATP_MODE_57; + return SATP_MODE_48; return 0; } diff --git a/arch/riscv/kernel/pi/fdt_early.c b/arch/riscv/kernel/pi/fdt_early.c index 9bdee2fafe47e4..a12ff8090f1903 100644 --- a/arch/riscv/kernel/pi/fdt_early.c +++ b/arch/riscv/kernel/pi/fdt_early.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "pi.h" @@ -183,3 +184,42 @@ bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name) return ret; } + +/** + * set_satp_mode_from_fdt - determine SATP mode based on the MMU type in fdt + * + * @dtb_pa: physical address of the device tree blob + * + * Returns the SATP mode corresponding to the MMU type of the first enabled CPU, + * 0 otherwise + */ +u64 set_satp_mode_from_fdt(uintptr_t dtb_pa) +{ + const void *fdt = (const void *)dtb_pa; + const char *mmu_type; + int node, parent; + + parent = fdt_path_offset(fdt, "/cpus"); + if (parent < 0) + return 0; + + fdt_for_each_subnode(node, fdt, parent) { + if (!fdt_node_name_eq(fdt, node, "cpu")) + continue; + + if (!fdt_device_is_available(fdt, node)) + continue; + + mmu_type = fdt_getprop(fdt, node, "mmu-type", NULL); + if (!mmu_type) + break; + + if (!strcmp(mmu_type, "riscv,sv39")) + return SATP_MODE_39; + else if (!strcmp(mmu_type, "riscv,sv48")) + return SATP_MODE_48; + break; + } + + return 0; +} diff --git a/arch/riscv/kernel/pi/pi.h b/arch/riscv/kernel/pi/pi.h index 21141d84fea603..3fee2cfddf7cfb 100644 --- a/arch/riscv/kernel/pi/pi.h +++ b/arch/riscv/kernel/pi/pi.h @@ -14,6 +14,7 @@ u64 get_kaslr_seed(uintptr_t dtb_pa); u64 get_kaslr_seed_zkr(const uintptr_t dtb_pa); bool set_nokaslr_from_cmdline(uintptr_t dtb_pa); u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 set_satp_mode_from_fdt(uintptr_t dtb_pa); bool 
fdt_early_match_extension_isa(const void *fdt, const char *ext_name); diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 6c166029079c42..fa581590c1f8b2 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -41,19 +41,16 @@ bool __kprobes simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs * 1 10 1 8 5 JAL/J */ bool ret; - u32 imm; - u32 index = (opcode >> 7) & 0x1f; + s32 imm; + u32 index = RV_EXTRACT_RD_REG(opcode); ret = rv_insn_reg_set_val(regs, index, addr + 4); if (!ret) return ret; - imm = ((opcode >> 21) & 0x3ff) << 1; - imm |= ((opcode >> 20) & 0x1) << 11; - imm |= ((opcode >> 12) & 0xff) << 12; - imm |= ((opcode >> 31) & 0x1) << 20; + imm = RV_EXTRACT_JTYPE_IMM(opcode); - instruction_pointer_set(regs, addr + sign_extend32((imm), 20)); + instruction_pointer_set(regs, addr + imm); return ret; } @@ -67,9 +64,9 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg */ bool ret; unsigned long base_addr; - u32 imm = (opcode >> 20) & 0xfff; - u32 rd_index = (opcode >> 7) & 0x1f; - u32 rs1_index = (opcode >> 15) & 0x1f; + u32 imm = RV_EXTRACT_ITYPE_IMM(opcode); + u32 rd_index = RV_EXTRACT_RD_REG(opcode); + u32 rs1_index = RV_EXTRACT_RS1_REG(opcode); ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr); if (!ret) @@ -84,20 +81,6 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg return ret; } -#define auipc_rd_idx(opcode) \ - ((opcode >> 7) & 0x1f) - -#define auipc_imm(opcode) \ - ((((opcode) >> 12) & 0xfffff) << 12) - -#if __riscv_xlen == 64 -#define auipc_offset(opcode) sign_extend64(auipc_imm(opcode), 31) -#elif __riscv_xlen == 32 -#define auipc_offset(opcode) auipc_imm(opcode) -#else -#error "Unexpected __riscv_xlen" -#endif - bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -107,8 +90,8 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re * 20 5 7 */ - u32 rd_idx = auipc_rd_idx(opcode); - unsigned long rd_val = addr + auipc_offset(opcode); + u32 rd_idx = RV_EXTRACT_RD_REG(opcode); + unsigned long rd_val = addr + (s32)RV_EXTRACT_UTYPE_IMM(opcode); if (!rv_insn_reg_set_val(regs, rd_idx, rd_val)) return false; @@ -118,24 +101,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re return true; } -#define branch_rs1_idx(opcode) \ - (((opcode) >> 15) & 0x1f) - -#define branch_rs2_idx(opcode) \ - (((opcode) >> 20) & 0x1f) - -#define branch_funct3(opcode) \ - (((opcode) >> 12) & 0x7) - -#define branch_imm(opcode) \ - (((((opcode) >> 8) & 0xf ) << 1) | \ - ((((opcode) >> 25) & 0x3f) << 5) | \ - ((((opcode) >> 7) & 0x1 ) << 11) | \ - ((((opcode) >> 31) & 0x1 ) << 12)) - -#define branch_offset(opcode) \ - sign_extend32((branch_imm(opcode)), 12) - bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs) { /* @@ -156,12 +121,12 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r unsigned long rs1_val; unsigned long rs2_val; - if (!rv_insn_reg_get_val(regs, branch_rs1_idx(opcode), &rs1_val) || - !rv_insn_reg_get_val(regs, branch_rs2_idx(opcode), &rs2_val)) + if (!rv_insn_reg_get_val(regs, RV_EXTRACT_RS1_REG(opcode), &rs1_val) || + !rv_insn_reg_get_val(regs, RV_EXTRACT_RS2_REG(opcode), &rs2_val)) return false; - offset_tmp = branch_offset(opcode); - switch (branch_funct3(opcode)) { + offset_tmp = RV_EXTRACT_BTYPE_IMM(opcode); + switch 
(RV_EXTRACT_FUNCT3(opcode)) { case RVG_FUNCT3_BEQ: offset = (rs1_val == rs2_val) ? offset_tmp : 4; break; @@ -191,24 +156,9 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs) { - /* - * 15 13 12 2 1 0 - * | funct3 | offset[11|4|9:8|10|6|7|3:1|5] | opcode | - * 3 11 2 - */ - - s32 offset; + s32 offset = RVC_EXTRACT_JTYPE_IMM(opcode); - offset = ((opcode >> 3) & 0x7) << 1; - offset |= ((opcode >> 11) & 0x1) << 4; - offset |= ((opcode >> 2) & 0x1) << 5; - offset |= ((opcode >> 7) & 0x1) << 6; - offset |= ((opcode >> 6) & 0x1) << 7; - offset |= ((opcode >> 9) & 0x3) << 8; - offset |= ((opcode >> 8) & 0x1) << 10; - offset |= ((opcode >> 12) & 0x1) << 11; - - instruction_pointer_set(regs, addr + sign_extend32(offset, 11)); + instruction_pointer_set(regs, addr + offset); return true; } @@ -224,7 +174,7 @@ static bool __kprobes simulate_c_jr_jalr(u32 opcode, unsigned long addr, struct unsigned long jump_addr; - u32 rs1 = (opcode >> 7) & 0x1f; + u32 rs1 = RVC_EXTRACT_C2_RS1_REG(opcode); if (rs1 == 0) /* C.JR is only valid when rs1 != x0 */ return false; @@ -268,16 +218,10 @@ static bool __kprobes simulate_c_bnez_beqz(u32 opcode, unsigned long addr, struc if (!rv_insn_reg_get_val(regs, rs1, &rs1_val)) return false; - if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) { - offset = ((opcode >> 3) & 0x3) << 1; - offset |= ((opcode >> 10) & 0x3) << 3; - offset |= ((opcode >> 2) & 0x1) << 5; - offset |= ((opcode >> 5) & 0x3) << 6; - offset |= ((opcode >> 12) & 0x1) << 8; - offset = sign_extend32(offset, 8); - } else { + if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) + offset = RVC_EXTRACT_BTYPE_IMM(opcode); + else offset = 2; - } instruction_pointer_set(regs, addr + offset); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index a0a40889d79a53..31a392993cb452 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -223,7 +223,7 @@ asmlinkage void ret_from_fork_user(struct pt_regs *regs) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 53836a9235e320..5e8cde05526435 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -148,7 +148,7 @@ static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, static void sbi_set_power_off(void) { - pm_power_off = sbi_shutdown; + register_platform_power_off(sbi_shutdown); } #else static void __sbi_set_timer_v01(uint64_t stime_value) @@ -682,7 +682,7 @@ void __init sbi_init(void) if (sbi_spec_version >= sbi_mk_version(0, 3) && sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); - pm_power_off = sbi_srst_power_off; + register_platform_power_off(sbi_srst_power_off); sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f90cce7a3acea8..14235e58c539cd 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -290,6 +290,7 @@ static void __init riscv_spinlock_init(void) if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && 
riscv_isa_extension_available(NULL, ZABHA) && riscv_isa_extension_available(NULL, ZACAS)) { using_ext = "using Zabha"; diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 0b170e18a2beba..000f4451a9d873 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -153,14 +154,12 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZVKT); } - if (has_fpu()) { - EXT_KEY(ZCD); - EXT_KEY(ZCF); - EXT_KEY(ZFA); - EXT_KEY(ZFBFMIN); - EXT_KEY(ZFH); - EXT_KEY(ZFHMIN); - } + EXT_KEY(ZCD); + EXT_KEY(ZCF); + EXT_KEY(ZFA); + EXT_KEY(ZFBFMIN); + EXT_KEY(ZFH); + EXT_KEY(ZFHMIN); if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) EXT_KEY(SUPM); @@ -309,6 +308,9 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: hwprobe_isa_vendor_ext_thead_0(pair, cpus); break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0: + hwprobe_isa_vendor_ext_mips_0(pair, cpus); + break; /* * For forward compatibility, unknown keys don't fail the whole diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index d77afe05578f23..795b2e815ac923 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -10,7 +10,7 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset, + unsigned long fd, unsigned long offset, unsigned long page_shift_offset) { if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug index 78cea5d2c27022..5db4df44279e9d 100644 --- a/arch/riscv/kernel/tests/Kconfig.debug +++ b/arch/riscv/kernel/tests/Kconfig.debug @@ -30,6 +30,18 @@ config RISCV_MODULE_LINKING_KUNIT If unsure, say N. +config RISCV_KPROBES_KUNIT + bool "KUnit test for riscv kprobes" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on KPROBES + default KUNIT_ALL_TESTS + help + Enable testing for riscv kprobes. Useful for riscv and/or kprobes + development. The test verifies that kprobes do not change the behaviour + of some sample functions. + + If unsure, say N. 
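The simulators this test exercises were reworked above in simulate-insn.c to use the shared RV_EXTRACT_* helpers instead of open-coded bit shuffling. As a quick cross-check of the J-type layout those helpers must implement, here is a minimal standalone sketch; jtype_imm() is an illustrative stand-in written from the deleted open-coded logic, not the kernel macro itself:

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for RV_EXTRACT_JTYPE_IMM: gather the J-type
 * immediate fields imm[20|10:1|11|19:12] from a 32-bit instruction
 * word and sign-extend from bit 20. */
static int32_t jtype_imm(uint32_t insn)
{
	uint32_t imm = ((insn >> 21) & 0x3ff) << 1;	/* imm[10:1]  */

	imm |= ((insn >> 20) & 0x1) << 11;		/* imm[11]    */
	imm |= ((insn >> 12) & 0xff) << 12;		/* imm[19:12] */
	imm |= ((insn >> 31) & 0x1) << 20;		/* imm[20]    */

	return (int32_t)(imm << 11) >> 11;		/* sign-extend */
}

int main(void)
{
	/* "jal x0, +8" encodes as 0x0080006f; the decode must yield 8,
	 * the displacement simulate_jal() adds to the probed address. */
	printf("%d\n", jtype_imm(0x0080006fu));
	return 0;
}
```

This is exactly the reassembly that the removed lines in simulate_jal() performed with sign_extend32(imm, 20), which is why the new helper can replace them one for one.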
+ endif # RUNTIME_TESTING_MENU endmenu # "arch/riscv/kernel runtime Testing" diff --git a/arch/riscv/kernel/tests/Makefile b/arch/riscv/kernel/tests/Makefile index 7d6c76cffe2067..407e7e6c28dcbc 100644 --- a/arch/riscv/kernel/tests/Makefile +++ b/arch/riscv/kernel/tests/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_RISCV_MODULE_LINKING_KUNIT) += module_test/ +obj-$(CONFIG_RISCV_KPROBES_KUNIT) += kprobes/ diff --git a/arch/riscv/kernel/tests/kprobes/Makefile b/arch/riscv/kernel/tests/kprobes/Makefile new file mode 100644 index 00000000000000..4cb6c66a98e8ea --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/Makefile @@ -0,0 +1 @@ +obj-y += test-kprobes.o test-kprobes-asm.o diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S new file mode 100644 index 00000000000000..b951d0f1248231 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include +#include +#include "test-kprobes.h" + +SYM_FUNC_START(test_kprobes_add) + li a1, KPROBE_TEST_MAGIC_UPPER + li a2, KPROBE_TEST_MAGIC_LOWER +test_kprobes_add_addr1: + add a1, a1, a2 +test_kprobes_add_addr2: + add a0, a1, x0 + ret +SYM_FUNC_END(test_kprobes_add) + +SYM_FUNC_START(test_kprobes_jal) + li a0, 0 + mv a1, ra + .option push + .option norvc +test_kprobes_jal_addr1: + jal x0, 2f + ret + .option pop +1: li a0, KPROBE_TEST_MAGIC_UPPER + ret + .option push + .option norvc +test_kprobes_jal_addr2: +2: jal 1b + .option pop + li a2, KPROBE_TEST_MAGIC_LOWER + add a0, a0, a2 + jr a1 +SYM_FUNC_END(test_kprobes_jal) + +SYM_FUNC_START(test_kprobes_jalr) + la a0, 1f + mv a1, ra + .option push + .option norvc +test_kprobes_jalr_addr: + jalr a0 + .option pop + li t0, KPROBE_TEST_MAGIC_UPPER + add a0, a0, t0 + jr a1 +1: li a0, KPROBE_TEST_MAGIC_LOWER + ret +SYM_FUNC_END(test_kprobes_jalr) + +SYM_FUNC_START(test_kprobes_auipc) +test_kprobes_auipc_addr: + auipc a0, KPROBE_TEST_MAGIC_LOWER + la a1, test_kprobes_auipc_addr + sub a0, a0, a1 + srli a0, a0, 12 + li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +SYM_FUNC_END(test_kprobes_auipc) + +SYM_FUNC_START(test_kprobes_branch) + .option push + .option norvc + li a0, 0 + li a1, 1 + li a2, 2 +test_kprobes_branch_addr1: + beqz a0, 1f + ret +1: +test_kprobes_branch_addr2: + beqz a1, 3f +test_kprobes_branch_addr3: + bnez a0, 3f +test_kprobes_branch_addr4: + bnez a2, 1f + ret +1: +test_kprobes_branch_addr5: + bge a1, a2, 3f +test_kprobes_branch_addr6: + bge a2, a1, 2f + ret +1: + li t0, KPROBE_TEST_MAGIC_UPPER + add a0, a0, t0 + ret +2: +test_kprobes_branch_addr7: + blt a2, a1, 3f + li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_branch_addr8: + blt a1, a2, 1b +3: + li a0, 0 + ret + .option pop +SYM_FUNC_END(test_kprobes_branch) + +#ifdef CONFIG_RISCV_ISA_C + +SYM_FUNC_START(test_kprobes_c_j) + li a0, 0 +test_kprobes_branch_c_j_addr1: + c.j 2f +1: + li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +2: li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_branch_c_j_addr2: + c.j 1b +SYM_FUNC_END(test_kprobes_c_j) + +SYM_FUNC_START(test_kprobes_c_jr) + la a0, 2f +test_kprobes_c_jr_addr1: + c.jr a0 + ret +1: li a1, KPROBE_TEST_MAGIC_LOWER + add a0, a0, a1 + ret +2: + li a0, KPROBE_TEST_MAGIC_UPPER + la a1, 1b +test_kprobes_c_jr_addr2: + c.jr a1 +SYM_FUNC_END(test_kprobes_c_jr) + +SYM_FUNC_START(test_kprobes_c_jalr) + mv a1, ra + la a0, 1f +test_kprobes_c_jalr_addr: + c.jalr a0 + li a2, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a2 + jr a1 +1: li a0, KPROBE_TEST_MAGIC_LOWER + ret 
+SYM_FUNC_END(test_kprobes_c_jalr) + +SYM_FUNC_START(test_kprobes_c_beqz) + li a0, 0 + li a1, 1 +test_kprobes_c_beqz_addr1: + c.beqz a0, 2f + ret +1: li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +test_kprobes_c_beqz_addr2: +2: c.beqz a1, 3f + li a0, KPROBE_TEST_MAGIC_LOWER + mv a1, x0 +test_kprobes_c_beqz_addr3: + c.beqz a1, 1b +3: li a0, 0 + ret +SYM_FUNC_END(test_kprobes_c_beqz) + +SYM_FUNC_START(test_kprobes_c_bnez) + li a0, 0 + li a1, 1 +test_kprobes_c_bnez_addr1: + c.bnez a1, 2f + ret +1: li a1, KPROBE_TEST_MAGIC_UPPER + add a0, a0, a1 + ret +test_kprobes_c_bnez_addr2: +2: c.bnez a0, 3f + li a0, KPROBE_TEST_MAGIC_LOWER +test_kprobes_c_bnez_addr3: + c.bnez a0, 1b +3: li a0, 0 + ret +SYM_FUNC_END(test_kprobes_c_bnez) + +#endif /* CONFIG_RISCV_ISA_C */ + +SYM_DATA_START(test_kprobes_addresses) + RISCV_PTR test_kprobes_add_addr1 + RISCV_PTR test_kprobes_add_addr2 + RISCV_PTR test_kprobes_jal_addr1 + RISCV_PTR test_kprobes_jal_addr2 + RISCV_PTR test_kprobes_jalr_addr + RISCV_PTR test_kprobes_auipc_addr + RISCV_PTR test_kprobes_branch_addr1 + RISCV_PTR test_kprobes_branch_addr2 + RISCV_PTR test_kprobes_branch_addr3 + RISCV_PTR test_kprobes_branch_addr4 + RISCV_PTR test_kprobes_branch_addr5 + RISCV_PTR test_kprobes_branch_addr6 + RISCV_PTR test_kprobes_branch_addr7 + RISCV_PTR test_kprobes_branch_addr8 +#ifdef CONFIG_RISCV_ISA_C + RISCV_PTR test_kprobes_branch_c_j_addr1 + RISCV_PTR test_kprobes_branch_c_j_addr2 + RISCV_PTR test_kprobes_c_jr_addr1 + RISCV_PTR test_kprobes_c_jr_addr2 + RISCV_PTR test_kprobes_c_jalr_addr + RISCV_PTR test_kprobes_c_beqz_addr1 + RISCV_PTR test_kprobes_c_beqz_addr2 + RISCV_PTR test_kprobes_c_beqz_addr3 + RISCV_PTR test_kprobes_c_bnez_addr1 + RISCV_PTR test_kprobes_c_bnez_addr2 + RISCV_PTR test_kprobes_c_bnez_addr3 +#endif /* CONFIG_RISCV_ISA_C */ + RISCV_PTR 0 +SYM_DATA_END(test_kprobes_addresses) + +SYM_DATA_START(test_kprobes_functions) + RISCV_PTR test_kprobes_add + RISCV_PTR test_kprobes_jal + RISCV_PTR test_kprobes_jalr + RISCV_PTR test_kprobes_auipc + RISCV_PTR test_kprobes_branch +#ifdef CONFIG_RISCV_ISA_C + RISCV_PTR test_kprobes_c_j + RISCV_PTR test_kprobes_c_jr + RISCV_PTR test_kprobes_c_jalr + RISCV_PTR test_kprobes_c_beqz + RISCV_PTR test_kprobes_c_bnez +#endif /* CONFIG_RISCV_ISA_C */ + RISCV_PTR 0 +SYM_DATA_END(test_kprobes_functions) diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.c b/arch/riscv/kernel/tests/kprobes/test-kprobes.c new file mode 100644 index 00000000000000..6f6cdfbf5a9588 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include "test-kprobes.h" + +static int kprobe_dummy_handler(struct kprobe *kp, struct pt_regs *regs) +{ + return 0; +} + +static void test_kprobe_riscv(struct kunit *test) +{ + unsigned int num_kprobe = 0; + long (*func)(void); + struct kprobe *kp; + int i; + + while (test_kprobes_addresses[num_kprobe]) + num_kprobe++; + + kp = kcalloc(num_kprobe, sizeof(*kp), GFP_KERNEL); + KUNIT_EXPECT_TRUE(test, kp); + if (!kp) + return; + + for (i = 0; i < num_kprobe; ++i) { + kp[i].addr = test_kprobes_addresses[i]; + kp[i].pre_handler = kprobe_dummy_handler; + KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp[i])); + } + + for (i = 0;; ++i) { + func = test_kprobes_functions[i]; + if (!func) + break; + KUNIT_EXPECT_EQ_MSG(test, KPROBE_TEST_MAGIC, func(), "function %d broken", i); + } + + for (i = 0; i < num_kprobe; ++i) + unregister_kprobe(&kp[i]); + kfree(kp); +} + +static struct kunit_case kprobes_testcases[] 
= { + KUNIT_CASE(test_kprobe_riscv), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_riscv", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.h b/arch/riscv/kernel/tests/kprobes/test-kprobes.h new file mode 100644 index 00000000000000..3886ab491ecba3 --- /dev/null +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +/* + * The magic value that all the functions in the test_kprobes_functions array return. The test + * installs kprobes into these functions, and verifies that the functions still correctly return this + * value. + */ +#define KPROBE_TEST_MAGIC 0xcafebabe +#define KPROBE_TEST_MAGIC_LOWER 0x0000babe +#define KPROBE_TEST_MAGIC_UPPER 0xcafe0000 + +#ifndef __ASSEMBLY__ + +/* array of addresses to install kprobes */ +extern void *test_kprobes_addresses[]; + +/* array of functions that return KPROBE_TEST_MAGIC */ +extern long (*test_kprobes_functions[])(void); + +#endif /* __ASSEMBLY__ */ + +#endif /* TEST_KPROBES_H */ diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index f760e4fcc052d2..2a27d3ff4ac66d 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -18,149 +18,7 @@ #include #include #include - -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 -#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_FLW 0x2007 -#define INSN_MASK_FLW 0x707f -#define INSN_MATCH_FLD 0x3007 -#define INSN_MASK_FLD 0x707f -#define INSN_MATCH_FLQ 0x4007 -#define INSN_MASK_FLQ 0x707f -#define INSN_MATCH_FSW 0x2027 -#define INSN_MASK_FSW 0x707f -#define INSN_MATCH_FSD 0x3027 -#define INSN_MASK_FSD 0x707f -#define INSN_MATCH_FSQ 0x4027 -#define INSN_MASK_FSQ 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_MATCH_C_FLD 0x2000 -#define INSN_MASK_C_FLD 0xe003 -#define INSN_MATCH_C_FLW 0x6000 -#define INSN_MASK_C_FLW 0xe003 -#define INSN_MATCH_C_FSD 0xa000 -#define INSN_MASK_C_FSD 0xe003 -#define INSN_MATCH_C_FSW 0xe000 -#define INSN_MASK_C_FSW 0xe003 -#define INSN_MATCH_C_FLDSP 0x2002 -#define INSN_MASK_C_FLDSP 0xe003 -#define INSN_MATCH_C_FSDSP 0xa002 -#define INSN_MASK_C_FSDSP 0xe003 -#define INSN_MATCH_C_FLWSP 0x6002 -#define INSN_MASK_C_FLWSP 0xe003 -#define INSN_MATCH_C_FSWSP 0xe002 -#define INSN_MASK_C_FSWSP 0xe003 - -#define INSN_MATCH_C_LHU 0x8400
-#define INSN_MASK_C_LHU 0xfc43 -#define INSN_MATCH_C_LH 0x8440 -#define INSN_MASK_C_LH 0xfc43 -#define INSN_MATCH_C_SH 0x8c00 -#define INSN_MASK_C_SH 0xfc43 - -#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) - -#if defined(CONFIG_64BIT) -#define LOG_REGBYTES 3 -#define XLEN 64 -#else -#define LOG_REGBYTES 2 -#define XLEN 32 -#endif -#define REGBYTES (1 << LOG_REGBYTES) -#define XLEN_MINUS_16 ((XLEN) - 16) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) -#define MASK_FUNCT3 0x7000 - -#define GET_PRECISION(insn) (((insn) >> 25) & 3) -#define GET_RM(insn) (((insn) >> 12) & 7) -#define PRECISION_S 0 -#define PRECISION_D 1 +#include #ifdef CONFIG_FPU diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 184f780c932d44..901e67adf57608 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -93,7 +93,7 @@ bool insn_is_vector(u32 insn_buf) return true; case RVV_OPCODE_VL: case RVV_OPCODE_VS: - width = RVV_EXRACT_VL_VS_WIDTH(insn_buf); + width = RVV_EXTRACT_VL_VS_WIDTH(insn_buf); if (width == RVV_VL_VS_WIDTH_8 || width == RVV_VL_VS_WIDTH_16 || width == RVV_VL_VS_WIDTH_32 || width == RVV_VL_VS_WIDTH_64) return true; diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index 92d8ff81f42c9c..bb4a7592368560 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES &riscv_isa_vendor_ext_list_andes, #endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS + &riscv_isa_vendor_ext_list_mips, +#endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE &riscv_isa_vendor_ext_list_sifive, #endif @@ -49,6 +53,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, 
unsig cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS + case MIPS_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_mips.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap; + break; + #endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE case SIFIVE_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap; diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index a4eca96d1c8a2f..bf116c82b6bdb3 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o diff --git a/arch/riscv/kernel/vendor_extensions/mips.c b/arch/riscv/kernel/vendor_extensions/mips.c new file mode 100644 index 00000000000000..f691129f96c21f --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/mips.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 MIPS. + */ + +#include +#include +#include + +#include +#include +#include + +/* All MIPS vendor extensions supported in Linux */ +static const struct riscv_isa_ext_data riscv_isa_vendor_ext_mips[] = { + __RISCV_ISA_EXT_DATA(xmipsexectl, RISCV_ISA_VENDOR_EXT_XMIPSEXECTL), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_mips), + .ext_data = riscv_isa_vendor_ext_mips, +}; diff --git a/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c new file mode 100644 index 00000000000000..dc213a2ca70d95 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 MIPS. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap, + { VENDOR_EXT_KEY(XMIPSEXECTL); }); +} diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c index 97dec18e69892a..de1f96ea62251f 100644 --- a/arch/riscv/kvm/vcpu_insn.c +++ b/arch/riscv/kvm/vcpu_insn.c @@ -8,133 +8,7 @@ #include #include - -#define INSN_OPCODE_MASK 0x007c -#define INSN_OPCODE_SHIFT 2 -#define INSN_OPCODE_SYSTEM 28 - -#define INSN_MASK_WFI 0xffffffff -#define INSN_MATCH_WFI 0x10500073 - -#define INSN_MASK_WRS 0xffffffff -#define INSN_MATCH_WRS 0x00d00073 - -#define INSN_MATCH_CSRRW 0x1073 -#define INSN_MASK_CSRRW 0x707f -#define INSN_MATCH_CSRRS 0x2073 -#define INSN_MASK_CSRRS 0x707f -#define INSN_MATCH_CSRRC 0x3073 -#define INSN_MASK_CSRRC 0x707f -#define INSN_MATCH_CSRRWI 0x5073 -#define INSN_MASK_CSRRWI 0x707f -#define INSN_MATCH_CSRRSI 0x6073 -#define INSN_MASK_CSRRSI 0x707f -#define INSN_MATCH_CSRRCI 0x7073 -#define INSN_MASK_CSRRCI 0x707f - -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 -#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_16BIT_MASK 0x3 - -#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) - -#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4) - -#ifdef CONFIG_64BIT -#define LOG_REGBYTES 3 -#else -#define LOG_REGBYTES 2 -#endif -#define REGBYTES (1 << LOG_REGBYTES) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 -#define MASK_RX 0x1f - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? 
((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) - -#define GET_FUNCT3(insn) (((insn) >> 12) & 7) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) +#include struct insn_func { unsigned long mask; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 15683ae13fa5d1..6091f3f06fa35d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -816,6 +817,7 @@ static __meminit pgprot_t pgprot_from_va(uintptr_t va) #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); +u64 __pi_set_satp_mode_from_fdt(uintptr_t dtb_pa); static void __init disable_pgtable_l5(void) { @@ -855,18 +857,22 @@ static void __init set_mmap_rnd_bits_max(void) * underlying hardware: establish 1:1 mapping in 4-level page table mode * then read SATP to see if the configuration was taken into account * meaning sv48 is supported. + * The maximum SATP mode is limited by both the command line and the "mmu-type" + * property in the device tree, since some platforms may hang if an unsupported + * SATP mode is attempted. 
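The two limits are combined with min_not_zero(), where a return value of 0 means no limit was requested from that source. A minimal standalone sketch of that combining rule, using a local stand-in for the kernel helper and the architectural satp MODE field encodings (the kernel's SATP_MODE_* constants order the same way, Sv39 < Sv48 < Sv57):

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's min_not_zero(): pick the smaller of two
 * limits, treating 0 as "no limit requested". */
static uint64_t min_not_zero_u64(uint64_t a, uint64_t b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	enum { SV39 = 8, SV48 = 9 };	/* architectural satp MODE values */

	printf("%d\n", (int)min_not_zero_u64(0, SV48));	   /* 9: only the FDT limits   */
	printf("%d\n", (int)min_not_zero_u64(SV39, SV48)); /* 8: the stricter one wins */
	printf("%d\n", (int)min_not_zero_u64(0, 0));	   /* 0: probe without a limit */
	return 0;
}
```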
*/ static __init void set_satp_mode(uintptr_t dtb_pa) { u64 identity_satp, hw_satp; uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; - u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa); + u64 satp_mode_limit = min_not_zero(__pi_set_satp_mode_from_cmdline(dtb_pa), + __pi_set_satp_mode_from_fdt(dtb_pa)); kernel_map.page_offset = PAGE_OFFSET_L5; - if (satp_mode_cmdline == SATP_MODE_57) { + if (satp_mode_limit == SATP_MODE_48) { disable_pgtable_l5(); - } else if (satp_mode_cmdline == SATP_MODE_48) { + } else if (satp_mode_limit == SATP_MODE_39) { disable_pgtable_l5(); disable_pgtable_l4(); return; } diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index e7b032dfd17f0f..632ced07bca442 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -13,21 +13,15 @@ #include #include +/* Check whether an ISA extension is both enabled in Kconfig and detected at runtime */ +#define rv_ext_enabled(ext) \ + (IS_ENABLED(CONFIG_RISCV_ISA_##ext) && riscv_has_extension_likely(RISCV_ISA_EXT_##ext)) + static inline bool rvc_enabled(void) { return IS_ENABLED(CONFIG_RISCV_ISA_C); } -static inline bool rvzba_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); -} - -static inline bool rvzbb_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); -} - enum { RV_REG_ZERO = 0, /* The constant value 0 */ RV_REG_RA = 1, /* Return address */ @@ -84,6 +78,8 @@ struct rv_jit_context { int epilogue_offset; int *offset; /* BPF to RV */ int nexentries; + int ex_insn_off; + int ex_jmp_off; unsigned long flags; int stack_size; u64 arena_vm_start; @@ -757,6 +753,17 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) return rv_css_insn(0x6, imm, rs2, 0x2); } +/* RVZACAS instructions. */ +static inline u32 rvzacas_amocas_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rvzacas_amocas_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + /* RVZBA instructions. 
*/ static inline u32 rvzba_sh2add(u8 rd, u8 rs1, u8 rs2) { @@ -1123,7 +1130,7 @@ static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh2add(rd, rs1, rs2), ctx); return; } @@ -1134,7 +1141,7 @@ static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx static inline void emit_sh3add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh3add(rd, rs1, rs2), ctx); return; } @@ -1184,7 +1191,7 @@ static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sextb(rd, rs), ctx); return; } @@ -1195,7 +1202,7 @@ static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_sexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sexth(rd, rs), ctx); return; } @@ -1211,7 +1218,7 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_zexth(rd, rs), ctx); return; } @@ -1222,7 +1229,7 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_zextw(rd, rs), ctx); return; } @@ -1233,7 +1240,7 @@ static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { int bits = 64 - imm; emit(rvzbb_rev8(rd, rd), ctx); @@ -1289,6 +1296,35 @@ static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) emit_mv(rd, RV_REG_T2, ctx); } +static inline void emit_cmpxchg(u8 rd, u8 rs, u8 r0, bool is64, struct rv_jit_context *ctx) +{ + int jmp_offset; + + if (rv_ext_enabled(ZACAS)) { + ctx->ex_insn_off = ctx->ninsns; + emit(is64 ? rvzacas_amocas_d(r0, rs, rd, 1, 1) : + rvzacas_amocas_w(r0, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; + if (!is64) + emit_zextw(r0, r0, ctx); + return; + } + + if (is64) + emit_mv(RV_REG_T2, r0, ctx); + else + emit_addiw(RV_REG_T2, r0, 0, ctx); + emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : + rv_lr_w(r0, 0, rd, 0, 0), ctx); + jmp_offset = ninsns_rvoff(8); + emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); + emit(is64 ? rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : + rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); + jmp_offset = ninsns_rvoff(-6); + emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); + emit_fence_rw_rw(ctx); +} + #endif /* __riscv_xlen == 64 */ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 9883a55d61b5b9..45cbc7c6fe490c 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,7 +18,7 @@ #define RV_MAX_REG_ARGS 8 #define RV_FENTRY_NINSNS 2 #define RV_FENTRY_NBYTES (RV_FENTRY_NINSNS * 4) -#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI_CLANG) ? 1 : 0) +#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI) ? 
1 : 0) /* imm that allows emit_imm to emit max count insns */ #define RV_MAX_COUNT_IMM 0x7FFF7FF7FF7FF7FF @@ -469,142 +469,96 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx) static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit(hash, ctx); } -static int emit_load_8(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_ldx_insn(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) { - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, off, rs), ctx); - else - emit(rv_lbu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lbu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_16(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, off, rs), ctx); - else - emit(rv_lhu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lhu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_32(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, off, rs), ctx); - else - emit(rv_lwu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lwu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; + switch (size) { + case BPF_B: + emit(sign_ext ? rv_lb(rd, off, rs) : rv_lbu(rd, off, rs), ctx); + break; + case BPF_H: + emit(sign_ext ? rv_lh(rd, off, rs) : rv_lhu(rd, off, rs), ctx); + break; + case BPF_W: + emit(sign_ext ? 
rv_lw(rd, off, rs) : rv_lwu(rd, off, rs), ctx); + break; + case BPF_DW: emit_ld(rd, off, rs, ctx); - return ctx->ninsns - insns_start; + break; } - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - emit_ld(rd, 0, RV_REG_T1, ctx); - return ctx->ninsns - insns_start; } -static void emit_store_8(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { - if (is_12b_int(off)) { + switch (size) { + case BPF_B: emit(rv_sb(rd, off, rs), ctx); - return; + break; + case BPF_H: + emit(rv_sh(rd, off, rs), ctx); + break; + case BPF_W: + emit_sw(rd, off, rs, ctx); + break; + case BPF_DW: + emit_sd(rd, off, rs, ctx); + break; } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sb(RV_REG_T1, 0, rs), ctx); } -static void emit_store_16(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_ldx(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) { if (is_12b_int(off)) { - emit(rv_sh(rd, off, rs), ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_ldx_insn(rd, off, rs, size, sign_ext, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sh(RV_REG_T1, 0, rs), ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_ldx_insn(rd, 0, RV_REG_T1, size, sign_ext, ctx); + ctx->ex_jmp_off = ctx->ninsns; } -static void emit_store_32(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) { + emit_imm(RV_REG_T1, imm, ctx); if (is_12b_int(off)) { - emit_sw(rd, off, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(rd, off, RV_REG_T1, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sw(RV_REG_T1, 0, rs, ctx); + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(RV_REG_T2, 0, RV_REG_T1, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; } -static void emit_store_64(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { if (is_12b_int(off)) { - emit_sd(rd, off, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(rd, off, rs, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } emit_imm(RV_REG_T1, off, ctx); emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sd(RV_REG_T1, 0, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(RV_REG_T1, 0, rs, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; } static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, @@ -617,20 +571,12 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, switch (imm) { /* dst_reg = load_acquire(src_reg + off16) */ case BPF_LOAD_ACQ: - switch (BPF_SIZE(code)) { - case BPF_B: - emit_load_8(false, rd, off, rs, ctx); - break; - case BPF_H: - emit_load_16(false, rd, off, rs, ctx); - break; - case BPF_W: - emit_load_32(false, rd, off, rs, ctx); - break; - case BPF_DW: - emit_load_64(false, rd, off, rs, ctx); - break; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; } + + emit_ldx(rd, off, rs, BPF_SIZE(code), false, ctx); emit_fence_r_rw(ctx); /* If our next insn is a redundant zext, return 1 to tell @@ -641,21 +587,13 @@ 
static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, break; /* store_release(dst_reg + off16, src_reg) */ case BPF_STORE_REL: - emit_fence_rw_w(ctx); - switch (BPF_SIZE(code)) { - case BPF_B: - emit_store_8(rd, off, rs, ctx); - break; - case BPF_H: - emit_store_16(rd, off, rs, ctx); - break; - case BPF_W: - emit_store_32(rd, off, rs, ctx); - break; - case BPF_DW: - emit_store_64(rd, off, rs, ctx); - break; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; } + + emit_fence_rw_w(ctx); + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); break; default: pr_err_once("bpf-jit: invalid atomic load/store opcode %02x\n", imm); @@ -668,17 +606,15 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, struct rv_jit_context *ctx) { - u8 r0, code = insn->code; + u8 code = insn->code; s16 off = insn->off; s32 imm = insn->imm; - int jmp_offset; - bool is64; + bool is64 = BPF_SIZE(code) == BPF_DW; if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { pr_err_once("bpf-jit: 1- and 2-byte RMW atomics are not supported\n"); return -EINVAL; } - is64 = BPF_SIZE(code) == BPF_DW; if (off) { if (is_12b_int(off)) { @@ -690,72 +626,82 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, rd = RV_REG_T1; } + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T1, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T1; + } + switch (imm) { /* lock *(u32/u64 *)(dst_reg + off16) = src_reg */ case BPF_ADD: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_AND: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoand_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoand_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_OR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_XOR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoxor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; /* src_reg = atomic_fetch_(dst_reg + off16, src_reg) */ case BPF_ADD | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) : rv_amoadd_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_AND | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) : rv_amoand_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_OR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) : rv_amoor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_XOR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) : rv_amoxor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ case BPF_XCHG: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? 
rv_amoswap_d(rs, rs, rd, 1, 1) : rv_amoswap_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ case BPF_CMPXCHG: - r0 = bpf_to_rv_reg(BPF_REG_0, ctx); - if (is64) - emit_mv(RV_REG_T2, r0, ctx); - else - emit_addiw(RV_REG_T2, r0, 0, ctx); - emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : - rv_lr_w(r0, 0, rd, 0, 0), ctx); - jmp_offset = ninsns_rvoff(8); - emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); - emit(is64 ? rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : - rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); - jmp_offset = ninsns_rvoff(-6); - emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); - emit_fence_rw_rw(ctx); + emit_cmpxchg(rd, rs, regmap[BPF_REG_0], is64, ctx); break; default: pr_err_once("bpf-jit: invalid atomic RMW opcode %02x\n", imm); @@ -765,6 +711,39 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, return 0; } +/* + * Sign-extend the register if necessary + */ +static int sign_extend(u8 rd, u8 rs, u8 sz, bool sign, struct rv_jit_context *ctx) +{ + if (!sign && (sz == 1 || sz == 2)) { + if (rd != rs) + emit_mv(rd, rs, ctx); + return 0; + } + + switch (sz) { + case 1: + emit_sextb(rd, rs, ctx); + break; + case 2: + emit_sexth(rd, rs, ctx); + break; + case 4: + emit_sextw(rd, rs, ctx); + break; + case 8: + if (rd != rs) + emit_mv(rd, rs, ctx); + break; + default: + pr_err("bpf-jit: invalid size %d for sign_extend\n", sz); + return -EINVAL; + } + + return 0; +} + #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define REG_DONT_CLEAR_MARKER 0 /* RV_REG_ZERO unused in pt_regmap */ @@ -783,9 +762,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, } /* For accesses to BTF pointers, add an entry to the exception table */ -static int add_exception_handler(const struct bpf_insn *insn, - struct rv_jit_context *ctx, - int dst_reg, int insn_len) +static int add_exception_handler(const struct bpf_insn *insn, int dst_reg, + struct rv_jit_context *ctx) { struct exception_table_entry *ex; unsigned long pc; @@ -793,21 +771,23 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32)) + ctx->ex_insn_off <= 0 || ctx->ex_jmp_off <= 0) return 0; - if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) - return -EINVAL; + if (BPF_MODE(insn->code) != BPF_PROBE_MEM && + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + return 0; - if (WARN_ON_ONCE(insn_len > ctx->ninsns)) + if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) return -EINVAL; - if (WARN_ON_ONCE(!rvc_enabled() && insn_len == 1)) + if (WARN_ON_ONCE(ctx->ex_insn_off > ctx->ninsns || ctx->ex_jmp_off > ctx->ninsns)) return -EINVAL; ex = &ctx->prog->aux->extable[ctx->nexentries]; - pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len]; + pc = (unsigned long)&ctx->ro_insns[ctx->ex_insn_off]; /* * This is the relative offset of the instruction that may fault from @@ -831,7 +811,7 @@ static int add_exception_handler(const struct bpf_insn *insn, * that may fault. The execution will jump to this after handling the * fault. 
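Editor's note: the new sign_extend() helper above narrows a trampoline return value to its BTF type. Its size/sign mapping can be stated as a plain-C behavioural model (a user-space sketch of what the emitted sext.b/sext.h/sext.w instructions compute, not JIT code):

#include <stdbool.h>
#include <stdint.h>

static int64_t model_sign_extend(uint64_t v, int sz, bool sign)
{
	if (!sign && (sz == 1 || sz == 2))
		return (int64_t)v;	/* plain move: upper bits left as-is */
	switch (sz) {
	case 1: return (int8_t)v;	/* sext.b */
	case 2: return (int16_t)v;	/* sext.h */
	case 4: return (int32_t)v;	/* sext.w: words always sign-extended */
	case 8: return (int64_t)v;	/* plain move */
	default: return -1;		/* the kernel helper returns -EINVAL */
	}
}

Note the asymmetry: 32-bit results are sign-extended regardless of the sign flag, matching the RISC-V convention of keeping 32-bit values sign-extended in 64-bit registers.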
*/ - fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16)); + fixup_offset = (long)&ex->fixup - (long)&ctx->ro_insns[ctx->ex_jmp_off]; if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) return -ERANGE; @@ -848,6 +828,8 @@ static int add_exception_handler(const struct bpf_insn *insn, FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; + ctx->ex_insn_off = 0; + ctx->ex_jmp_off = 0; ctx->nexentries++; return 0; } @@ -1079,10 +1061,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, stack_size += 16; save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); - if (save_ret) { + if (save_ret) stack_size += 16; /* Save both A5 (BPF R0) and A0 */ - retval_off = stack_size; - } + retval_off = stack_size; stack_size += nr_arg_slots * 8; args_off = stack_size; @@ -1226,8 +1207,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, restore_args(min_t(int, nr_arg_slots, RV_MAX_REG_ARGS), args_off, ctx); if (save_ret) { - emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx); emit_ld(regmap[BPF_REG_0], -(retval_off - 8), RV_REG_FP, ctx); + if (is_struct_ops) { + ret = sign_extend(RV_REG_A0, regmap[BPF_REG_0], m->ret_size, + m->ret_flags & BTF_FMODEL_SIGNED_ARG, ctx); + if (ret) + goto out; + } else { + emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx); + } } emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx); @@ -1320,7 +1308,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, goto out; } - bpf_flush_icache(ro_image, ro_image_end); out: kvfree(image); return ret < 0 ? ret : size; @@ -1857,7 +1844,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { bool sign_ext; - int insn_len; sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; @@ -1867,22 +1853,9 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, rs = RV_REG_T2; } - switch (BPF_SIZE(code)) { - case BPF_B: - insn_len = emit_load_8(sign_ext, rd, off, rs, ctx); - break; - case BPF_H: - insn_len = emit_load_16(sign_ext, rd, off, rs, ctx); - break; - case BPF_W: - insn_len = emit_load_32(sign_ext, rd, off, rs, ctx); - break; - case BPF_DW: - insn_len = emit_load_64(sign_ext, rd, off, rs, ctx); - break; - } + emit_ldx(rd, off, rs, BPF_SIZE(code), sign_ext, ctx); - ret = add_exception_handler(insn, ctx, rd, insn_len); + ret = add_exception_handler(insn, rd, ctx); if (ret) return ret; @@ -1890,238 +1863,73 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return 1; break; } + /* speculation barrier */ case BPF_ST | BPF_NOSPEC: break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sb(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - break; - case BPF_ST | BPF_MEM | BPF_H: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sh(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - break; case BPF_ST | BPF_MEM | BPF_W: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sw(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - break; case BPF_ST | BPF_MEM 
| BPF_DW: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sd(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - break; - + /* ST | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = imm */ case BPF_ST | BPF_PROBE_MEM32 | BPF_B: case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len, insns_start; - - emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T3; - - /* Load imm to a register then store it */ - emit_imm(RV_REG_T1, imm, ctx); - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; } - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + emit_st(rd, off, imm, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; - break; - } /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: - emit_store_8(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_H: - emit_store_16(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_W: - emit_store_32(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_DW: - emit_store_64(rd, off, rs, ctx); + /* STX | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = src */ + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + } + + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); + if (ret) + return ret; break; + + /* Atomics */ case BPF_STX | BPF_ATOMIC | BPF_B: case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + case BPF_STX | 
BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) ret = emit_atomic_ld_st(rd, rs, insn, ctx); else ret = emit_atomic_rmw(rd, rs, insn, ctx); - if (ret) - return ret; - break; - case BPF_STX | BPF_PROBE_MEM32 | BPF_B: - case BPF_STX | BPF_PROBE_MEM32 | BPF_H: - case BPF_STX | BPF_PROBE_MEM32 | BPF_W: - case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len, insns_start; - - emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T2; - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + ret = ret ?: add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; - break; - } default: pr_err("bpf-jit: unknown opcode %02x\n", code); @@ -2249,6 +2057,25 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (in_arena) { + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == BPF_CMPXCHG) + return rv_ext_enabled(ZACAS); + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; + } + } + + return true; +} + bool bpf_jit_supports_percpu_insn(void) { return true; diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index 240592e3f5c2f5..530e497ca2f9ce 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -71,7 +71,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf7..2414ee3ff00237 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -49,6 +49,13 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 +config CC_HAS_BUILTIN_FFS + def_bool !(CC_IS_GCC && GCC_VERSION < 160000) + help + GCC versions before 16.0.0 generate library calls to ffs() + for __builtin_ffs() even when __has_builtin(__builtin_ffs) + is true. 
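Editor's note: the pitfall this option encodes is that __has_builtin() only proves the compiler recognizes a builtin, not that it inlines it. A hypothetical freestanding snippet showing why that matters in kernel context:

/* If the compiler lowers the builtin to a call to the libc ffs(),
 * the kernel link fails: no such symbol exists there. */
static int first_bit(unsigned int x)
{
#if defined(__has_builtin) && __has_builtin(__builtin_ffs)
	return __builtin_ffs(x);		/* GCC < 16 may emit "call ffs" here */
#else
	return x ? 1 + __builtin_ctz(x) : 0;	/* ctz is reliably inlined */
#endif
}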
+ config CC_ASM_FLAG_OUTPUT_BROKEN def_bool CC_IS_GCC && GCC_VERSION < 140200 help @@ -167,8 +174,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL @@ -199,6 +204,7 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_GRAPH_FUNC @@ -547,15 +553,11 @@ config NODES_SHIFT depends on NUMA default "1" -config SCHED_SMT - def_bool n - -config SCHED_MC - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology scheduler support" + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_MC select SCHED_SMT select SCHED_MC help diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index c0152db285f0b9..37d5b097ede5f5 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -10,6 +10,7 @@ #include #include +#include struct vmlinux_info { unsigned long entry; @@ -89,6 +90,13 @@ void __noreturn jump_to_kernel(psw_t *psw); #define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) #define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_panic(...) do { \ + boot_emerg(__VA_ARGS__); \ + print_stacktrace(current_frame_address()); \ + boot_emerg(" -- System halted\n"); \ + disabled_wait(); \ +} while (0) + extern struct machine_info machine; extern int boot_console_loglevel; extern bool boot_ignore_loglevel; diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 03500b9d9fb9ad..8d1bc25a6bf4ed 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -68,9 +68,7 @@ static void decompress_error(char *m) { if (bootdebug) boot_rb_dump(); - boot_emerg("Decompression error: %s\n", m); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Decompression error: %s\n", m); } unsigned long mem_safe_offset(void) diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c index 45e3d057cfaa31..1f2ca5435838e8 100644 --- a/arch/s390/boot/physmem_info.c +++ b/arch/s390/boot/physmem_info.c @@ -228,9 +228,7 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min, boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", total_mem, total_reserved_mem, total_mem > total_reserved_mem ? 
total_mem - total_reserved_mem : 0); - print_stacktrace(current_frame_address()); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Oom\n"); } static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 93684a7757161c..3fbd25b9498f35 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -44,13 +44,6 @@ u64 __bootdata_preserved(clock_comparator_max) = -1UL; u64 __bootdata_preserved(stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -void error(char *x) -{ - boot_emerg("%s\n", x); - boot_emerg(" -- System halted\n"); - disabled_wait(); -} - static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); static void detect_machine_type(void) @@ -220,10 +213,10 @@ static void rescue_initrd(unsigned long min, unsigned long max) static void copy_bootdata(void) { if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) - error(".boot.data section size mismatch"); + boot_panic(".boot.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) - error(".boot.preserved.data section size mismatch"); + boot_panic(".boot.preserved.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } @@ -237,7 +230,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) - error("64-bit relocation outside of kernel!\n"); + boot_panic("64-bit relocation outside of kernel!\n"); *(u64 *)loc += offset; } } diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac35..99467f2dc01840 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -118,8 +118,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -542,6 +547,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set @@ -658,9 +664,6 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_BTRFS_DEBUG=y CONFIG_BTRFS_ASSERT=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=y -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -761,7 +764,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d9b..a8573807e0c075 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -109,8 +109,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -532,6 +537,7 @@ 
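Editor's note: the boot_panic() consolidation a few hunks up relies on the classic do { ... } while (0) idiom so that a multi-statement macro behaves like a single function call, even in an unbraced if branch. A standalone user-space illustration of the same construction (names hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* Without the do/while(0) wrapper, "if (err) my_panic(...);" would
 * guard only the first statement of the macro body. */
#define my_panic(...) do {				\
	fprintf(stderr, __VA_ARGS__);			\
	fprintf(stderr, " -- System halted\n");		\
	exit(1);					\
} while (0)

int main(void)
{
	int err = 1;

	if (err)
		my_panic("Decompression error: %s\n", "bad magic");
	return 0;
}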
CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set @@ -645,9 +651,6 @@ CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_NILFS2_FS=m -CONFIG_BCACHEFS_FS=m -CONFIG_BCACHEFS_QUOTA=y -CONFIG_BCACHEFS_POSIX_ACL=y CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y @@ -745,7 +748,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index a5ca0a9476916c..ec945fb60c021f 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -122,6 +122,8 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } +#ifndef CONFIG_CC_HAS_BUILTIN_FFS + /** * __flogr - find leftmost one * @word - The word to search @@ -130,11 +132,12 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. */ -static inline unsigned char __flogr(unsigned long word) +static __always_inline __attribute_const__ unsigned long __flogr(unsigned long word) { - if (__builtin_constant_p(word)) { - unsigned long bit = 0; + unsigned long bit; + if (__builtin_constant_p(word)) { + bit = 0; if (!word) return 64; if (!(word & 0xffffffff00000000UL)) { @@ -163,27 +166,22 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - union register_pair rp; + union register_pair rp __uninitialized; rp.even = word; - asm volatile( - " flogr %[rp],%[rp]\n" - : [rp] "+d" (rp.pair) : : "cc"); - return rp.even; + asm("flogr %[rp],%[rp]" + : [rp] "+d" (rp.pair) : : "cc"); + bit = rp.even; + /* + * The result of the flogr instruction is a value in the range + * of 0..64. Let the compiler know that the AND operation can + * be optimized away. + */ + __assume(bit <= 64); + return bit & 127; } } -/** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - */ -static inline unsigned long __ffs(unsigned long word) -{ - return __flogr(-word & word) ^ (BITS_PER_LONG - 1); -} - /** * ffs - find first bit set * @word: the word to search @@ -191,58 +189,26 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static __always_inline __flatten __attribute_const__ int ffs(int word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; unsigned int val = (unsigned int)word; - return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask; -} - -/** - * __fls - find last (most-significant) set bit in a long word - * @word: the word to search - * - * Undefined if no set bit exists, so code should check against 0 first. - */ -static inline unsigned long __fls(unsigned long word) -{ - return __flogr(word) ^ (BITS_PER_LONG - 1); + return BITS_PER_LONG - __flogr(-val & val); } -/** - * fls64 - find last set bit in a 64-bit word - * @word: the word to search - * - * This is defined in a similar way as the libc and compiler builtin - * ffsll, but returns the position of the most significant set bit. 
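Editor's note: the flogr-based ffs() above leans on two identities: -v & v isolates the lowest set bit, and flogr numbers bits from the MSB (bit 0) downward, so BITS_PER_LONG - __flogr(-v & v) is exactly the 1-based index ffs() must return; the v == 0 case falls out for free because flogr(0) == 64. A portable model of the same arithmetic (illustrative, not the s390 implementation):

#include <stdint.h>

/* Model of flogr: position of the leftmost set bit with the MSB
 * counted as 0; returns 64 for an all-zero word, like the insn. */
static unsigned int model_flogr(uint64_t word)
{
	unsigned int bit = 0;

	if (!word)
		return 64;
	while (!(word & (1ULL << 63))) {
		word <<= 1;
		bit++;
	}
	return bit;
}

static int model_ffs(unsigned int word)
{
	uint64_t v = word;

	return 64 - model_flogr(v & -v);	/* yields 0 when word == 0 */
}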
- * - * fls64(value) returns 0 if value is 0 or the position of the last - * set bit if value is nonzero. The last (most significant) bit is - * at position 64. - */ -static inline int fls64(unsigned long word) -{ - unsigned long mask = 2 * BITS_PER_LONG - 1; +#else /* CONFIG_CC_HAS_BUILTIN_FFS */ - return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask; -} +#include -/** - * fls - find last (most-significant) bit set - * @word: the word to search - * - * This is defined the same way as ffs. - * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. - */ -static inline int fls(unsigned int word) -{ - return fls64(word); -} +#endif /* CONFIG_CC_HAS_BUILTIN_FFS */ +#include +#include +#include +#include +#include #include #include -#include #include #include #include diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index e5f57cfe1d4582..025c6dcbf89331 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -16,11 +16,11 @@ #define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 #define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 +/* PCI instruction condition codes */ +#define ZPCI_CC_OK 0 +#define ZPCI_CC_ERR 1 +#define ZPCI_CC_BUSY 2 +#define ZPCI_CC_INVAL_HANDLE 3 /* Load/Store address space identifiers */ #define ZPCI_PCIAS_MEMIO_0 0 diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5345398df65342..a16e650723719a 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,12 +19,16 @@ #define CRST_ALLOC_ORDER 2 -unsigned long *crst_table_alloc(struct mm_struct *); +unsigned long *crst_table_alloc_noprof(struct mm_struct *); +#define crst_table_alloc(...) alloc_hooks(crst_table_alloc_noprof(__VA_ARGS__)) void crst_table_free(struct mm_struct *, unsigned long *); -unsigned long *page_table_alloc(struct mm_struct *); -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); +unsigned long *page_table_alloc_noprof(struct mm_struct *); +#define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); + +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm); +#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__)) void page_table_free_pgste(struct ptdesc *ptdesc); static inline void crst_table_init(unsigned long *crst, unsigned long entry) @@ -48,9 +52,9 @@ static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long return addr; } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -59,6 +63,7 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) return (p4d_t *) table; } +#define p4d_alloc_one(...) 
alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { @@ -69,9 +74,9 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) crst_table_free(mm, (unsigned long *) p4d); } -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -80,6 +85,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) return (pud_t *) table; } +#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -90,9 +96,9 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) crst_table_free(mm, (unsigned long *) pud); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -103,6 +109,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) } return (pmd_t *) table; } +#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { @@ -127,9 +134,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } -static inline pgd_t *pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc_noprof(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -137,6 +144,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return (pgd_t *) table; } +#define pgd_alloc(...) 
alloc_hooks(pgd_alloc_noprof(__VA_ARGS__)) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index f6ed2c8192c87c..7878e9bfbf072b 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -56,49 +56,31 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers + * + * Tell the generic TIF infrastructure which special bits s390 supports */ -#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ -#define TIF_UPROBE 4 /* breakpointed or single-stepping */ -#define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ -#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ -#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ -#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 16 /* 32bit process */ -#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ -#define TIF_SINGLE_STEP 19 /* This task is single stepped */ -#define TIF_BLOCK_STEP 20 /* This task is block stepped */ -#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ -#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ -#define TIF_SECCOMP 26 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include + +/* Architecture specific bits */ +#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */ +#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ +#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ +#define TIF_31BIT 20 /* 32bit process */ +#define TIF_SINGLE_STEP 21 /* This task is single stepped */ +#define TIF_BLOCK_STEP 22 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ -#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) -#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) -#define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) -#define _TIF_MEMDIE BIT(TIF_MEMDIE) -#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) -#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP BIT(TIF_SECCOMP) -#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/s390/kernel/asm-offsets.c 
b/arch/s390/kernel/asm-offsets.c index 95ecad9c7d7d27..a8915663e917fa 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c62100dc62c8df..6a26f202441d3a 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1416,18 +1416,12 @@ static inline char *debug_get_user_string(const char __user *user_buf, { char *buffer; - buffer = kmalloc(user_len + 1, GFP_KERNEL); - if (!buffer) - return ERR_PTR(-ENOMEM); - if (copy_from_user(buffer, user_buf, user_len) != 0) { - kfree(buffer); - return ERR_PTR(-EFAULT); - } + buffer = memdup_user_nul(user_buf, user_len); + if (IS_ERR(buffer)) + return buffer; /* got the string, now strip linefeed. */ if (buffer[user_len - 1] == '\n') buffer[user_len - 1] = 0; - else - buffer[user_len] = 0; return buffer; } diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c index 7fa4c0b7eb6c71..f0a8b4841fb964 100644 --- a/arch/s390/kernel/diag/diag324.c +++ b/arch/s390/kernel/diag/diag324.c @@ -116,7 +116,7 @@ static void pibwork_handler(struct work_struct *work) mutex_lock(&pibmutex); timedout = ktime_add_ns(data->expire, PIBWORK_DELAY); if (ktime_before(ktime_get(), timedout)) { - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); goto out; } vfree(data->pib); @@ -174,7 +174,7 @@ long diag324_pibbuf(unsigned long arg) pib_update(data); data->sequence++; data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv)); - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); first = false; } rc = data->rc; diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index e7b66d046e8d38..2507bc3f775741 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -191,7 +191,7 @@ int hd_enable_hiperdispatch(void) return 0; if (hd_online_cores <= hd_entitled_cores) return 0; - mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); hd_update_capacities(); return 1; } diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 4d364de4379921..143e34a4eca57c 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index a32ce8bea745cf..9a439175723cad 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c2bac14dd668ae..a36d7311c6683b 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -129,7 +129,7 @@ static int 
kexec_file_update_purgatory(struct kimage *image, static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; int ret; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 4d09954ebf49e8..04457d88e5892c 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -760,8 +760,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) break; case PERF_TYPE_HARDWARE: - if (is_sampling_event(event)) /* No sampling support */ - return -ENOENT; ev = attr->config; if (!attr->exclude_user && attr->exclude_kernel) { /* @@ -859,6 +857,8 @@ static int cpumf_pmu_event_init(struct perf_event *event) unsigned int type = event->attr.type; int err = -ENOENT; + if (is_sampling_event(event)) /* No sampling support */ + return err; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index f373a1009c456e..9455f213dc2021 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -285,10 +285,10 @@ static int paicrypt_event_init(struct perf_event *event) /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) return -ENOENT; - /* PAI crypto event must be in valid range */ + /* PAI crypto event must be in valid range, try others if not */ if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) - return -EINVAL; + return -ENOENT; /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index d827473e7f87f8..7b32935273ced1 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -265,7 +265,7 @@ static int paiext_event_valid(struct perf_event *event) event->hw.config_base = offsetof(struct paiext_cb, acc); return 0; } - return -EINVAL; + return -ENOENT; } /* Might be called on different CPU than the one the event is intended for. 
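Editor's note: the -EINVAL to -ENOENT conversions in the PAI drivers above matter because perf treats the two return codes differently while probing PMUs: -ENOENT means "this event is not mine, try the next PMU", while any other error aborts the search. A simplified model of that dispatch loop (an assumption about the shape of the core's PMU probing, not a copy of perf_init_event()):

#include <errno.h>

struct pmu_stub {
	int (*event_init)(unsigned long config);
};

static int try_pmus(struct pmu_stub *pmus, int n, unsigned long config)
{
	int i, err;

	for (i = 0; i < n; i++) {
		err = pmus[i].event_init(config);
		if (err != -ENOENT)	/* success or a hard error ends the search */
			return err;
		/* -ENOENT: this PMU does not own the event; keep looking */
	}
	return -ENOENT;
}

Returning -EINVAL for an out-of-range config would therefore prevent other PMUs registered under the same type from ever seeing the event.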
*/ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index f55f09cda6f889..b107dbca4ed7df 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -106,7 +106,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long new_stackp = args->stack; unsigned long tls = args->tls; struct fake_frame diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dde3..1594c80e9bc4db 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2a92a8b9e4c2f7..9384572ffa7b7a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2778,12 +2778,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { + struct mm_struct *mm = kvm->mm; struct page *page = NULL; + int locked = 1; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, + &page, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - mmap_read_lock(kvm->mm); - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL); - mmap_read_unlock(kvm->mm); return page; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bf6fa8b9ca7328..6d51aa5f66bee1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4864,12 +4864,12 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) * @vcpu: the vCPU whose gmap is to be fixed up * @gfn: the guest frame number used for memslots (including fake memslots) * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @flags: FOLL_* flags + * @foll: FOLL_* flags * * Return: 0 on success, < 0 in case of error. * Context: The mm lock must not be held before calling. May sleep. 
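Editor's note: the get_map_page() rework visible above follows the standard pattern for safely touching another task's address space: pin the mm with mmget_not_zero() so it cannot be torn down mid-walk, and honour the locked out-parameter, since get_user_pages_remote() is allowed to drop mmap_lock internally. Reduced to its skeleton (same kernel APIs as in the diff):

#include <linux/mm.h>
#include <linux/sched/mm.h>

static struct page *pin_remote_page(struct mm_struct *mm, unsigned long uaddr)
{
	struct page *page = NULL;
	int locked = 1;

	if (!mmget_not_zero(mm))	/* owner may already be exiting */
		return NULL;
	mmap_read_lock(mm);
	get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, &page, &locked);
	if (locked)			/* GUP may have dropped the lock itself */
		mmap_read_unlock(mm);
	mmput(mm);
	return page;
}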
*/ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) { struct kvm_memory_slot *slot; unsigned int fault_flags; @@ -4883,13 +4883,13 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return vcpu_post_run_addressing_exception(vcpu); - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; if (vcpu->arch.gmap->pfault_enabled) - flags |= FOLL_NOWAIT; + foll |= FOLL_NOWAIT; vmaddr = __gfn_to_hva_memslot(slot, gfn); try_again: - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); /* Access outside memory, inject addressing exception */ if (is_noslot_pfn(pfn)) @@ -4905,7 +4905,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return 0; vcpu->stat.pfault_sync++; /* Could not setup async pfault, try again synchronously */ - flags &= ~FOLL_NOWAIT; + foll &= ~FOLL_NOWAIT; goto try_again; } /* Any other error */ @@ -4925,7 +4925,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return rc; } -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) { unsigned long gaddr_tmp; gfn_t gfn; @@ -4950,18 +4950,18 @@ static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, un } gfn = gpa_to_gfn(gaddr_tmp); } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { - unsigned int flags = 0; + unsigned int foll = 0; unsigned long gaddr; int rc; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { case 0: @@ -5003,7 +5003,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) send_sig(SIGSEGV, current, 0); if (rc != -ENXIO) break; - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; fallthrough; case PGM_PROTECTION: case PGM_SEGMENT_TRANSLATION: @@ -5013,7 +5013,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - return vcpu_dat_fault_handler(vcpu, gaddr, flags); + return vcpu_dat_fault_handler(vcpu, gaddr, foll); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 25ede8354514f2..6ba5a0305e25bc 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -624,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) int cc, ret; u16 dummy; + /* Add the notifier only once. 
No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + /* The notifier will be unregistered when the VM is destroyed */ + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + if (ret) { + kvm->arch.pv.mmu_notifier.ops = NULL; + return ret; + } + } + ret = kvm_s390_pv_alloc_vm(kvm); if (ret) return ret; @@ -659,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; - /* Add the notifier only once. No races because we hold kvm->lock */ - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); - } return 0; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fcb9..36700384fe6bdd 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -14,11 +14,15 @@ #include #include -unsigned long *crst_table_alloc(struct mm_struct *mm) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; table = ptdesc_to_virt(ptdesc); @@ -112,12 +116,12 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) #ifdef CONFIG_PGSTE -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc; u64 *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); @@ -134,12 +138,15 @@ void page_table_free_pgste(struct ptdesc *ptdesc) #endif /* CONFIG_PGSTE */ -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; unsigned long *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 60688be4e87666..50eb57c976bc30 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -335,7 +335,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, int nodat; struct mm_struct *mm = vma->vm_mm; - preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); old = ptep_flush_lazy(mm, addr, ptep, nodat); @@ -360,7 +359,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, } else { set_pte(ptep, pte); } - preempt_enable(); } static inline void pmdp_idte_local(struct mm_struct *mm, diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index 8cab6deb0403df..9275cf63192aa7 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -2,5 +2,5 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/bpf_jit_comp.c 
b/arch/s390/net/bpf_jit_comp.c index bb17efe29d6570..cf461d76e9da32 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -674,20 +674,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp) _EMIT2(0x07f0 | reg); \ } while (0) -/* - * Call r1 either directly or via __s390_indirect_jump_r1 thunk - */ -static void call_r1(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, - __s390_indirect_jump_r1); - else - /* basr %r14,%r1 */ - EMIT2(0x0d00, REG_14, REG_1); -} - /* * Function epilogue */ @@ -1790,20 +1776,21 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* * Copy the tail call counter to where the callee expects it. - * - * Note 1: The callee can increment the tail call counter, but - * we do not load it back, since the x86 JIT does not do this - * either. - * - * Note 2: We assume that the verifier does not let us call the - * main program, which clears the tail call counter on entry. */ - /* mvc tail_call_cnt(4,%r15),frame_off+tail_call_cnt(%r15) */ - _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), - 0xf000 | (jit->frame_off + - offsetof(struct prog_frame, tail_call_cnt))); + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc tail_call_cnt(4,%r15), + * frame_off+tail_call_cnt(%r15) + */ + _EMIT6(0xd203f000 | offsetof(struct prog_frame, + tail_call_cnt), + 0xf000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt))); /* Sign-extend the kfunc arguments. */ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { @@ -1819,12 +1806,38 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } } - /* lgrl %w1,func */ - EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - /* %r1() */ - call_r1(jit); - /* lgr %b0,%r2: load return value into %b0 */ - EMIT4(0xb9040000, BPF_REG_0, REG_2); + if ((void *)func == arch_bpf_timed_may_goto) { + /* + * arch_bpf_timed_may_goto() has a special ABI: the + * parameters are in BPF_REG_AX and BPF_REG_10; the + * return value is in BPF_REG_AX; and all GPRs except + * REG_W0, REG_W1, and BPF_REG_AX are callee-saved. + */ + + /* brasl %r0,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_0, (void *)func); + } else { + /* brasl %r14,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); + } + + /* + * Copy the potentially updated tail call counter back. 
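Editor's note: copying the counter back is necessary because a BPF subprogram may itself perform tail calls and therefore increment the shared counter; the caller's frame has to observe that update or the tail-call limit could be overrun. In plain C the round trip looks like this (a conceptual model of the two mvc instructions, not JIT output):

struct frame {
	unsigned int tail_call_cnt;
};

static void call_subprog(struct frame *caller, void (*subprog)(struct frame *))
{
	struct frame callee = { .tail_call_cnt = caller->tail_call_cnt };

	subprog(&callee);				/* may contain tail calls */
	caller->tail_call_cnt = callee.tail_call_cnt;	/* copy back */
}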
+ */ + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc frame_off+tail_call_cnt(%r15), + * tail_call_cnt(4,%r15) + */ + _EMIT6(0xd203f000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt)), + 0xf000 | offsetof(struct prog_frame, + tail_call_cnt)); + break; } case BPF_JMP | BPF_TAIL_CALL: { @@ -2517,14 +2530,12 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * goto skip; */ - /* %r1 = __bpf_prog_enter */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* la %r3,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_enter(p)); /* ltgr %r7,%r2 */ EMIT4(0xb9020000, REG_7, REG_2); /* brcl 8,skip */ @@ -2535,15 +2546,13 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * retval = bpf_func(args, p->insnsi); */ - /* %r1 = p->bpf_func */ - load_imm64(jit, REG_1, (u64)p->bpf_func); /* la %r2,bpf_args_off(%r15) */ EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); /* %r3 = p->insnsi */ if (!p->jited) load_imm64(jit, REG_3, (u64)p->insnsi); - /* %r1() */ - call_r1(jit); + /* brasl %r14,p->bpf_func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, p->bpf_func); /* stg %r2,retval_off(%r15) */ if (save_ret) { if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) @@ -2560,16 +2569,14 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * __bpf_prog_exit(p, start, &run_ctx); */ - /* %r1 = __bpf_prog_exit */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* lgr %r3,%r7 */ EMIT4(0xb9040000, REG_3, REG_7); /* la %r4,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_exit(p)); return 0; } @@ -2729,9 +2736,6 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* lgr %r8,%r0 */ EMIT4(0xb9040000, REG_8, REG_0); - } else { - /* %r8 = func_addr + S390X_PATCH_SIZE */ - load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); } /* @@ -2757,12 +2761,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_enter(im); */ - /* %r1 = __bpf_tramp_enter */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_enter); } for (i = 0; i < fentry->nr_links; i++) @@ -2815,13 +2817,25 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), 0xf000 | tjit->tccnt_off); - /* lgr %r1,%r8 */ - EMIT4(0xb9040000, REG_1, REG_8); - /* %r1() */ - call_r1(jit); + if (flags & BPF_TRAMP_F_ORIG_STACK) { + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r8 */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r8); + else + /* basr %r14,%r8 */ + EMIT2(0x0d00, REG_14, REG_8); + } else { + /* brasl %r14,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + func_addr + S390X_PATCH_SIZE); + } /* stg %r2,retval_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); + /* mvc tccnt_off(%r15),tail_call_cnt(4,%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | 
offsetof(struct prog_frame, tail_call_cnt)); im->ip_after_call = jit->prg_buf + jit->prg; @@ -2846,12 +2860,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_exit(im); */ - /* %r1 = __bpf_tramp_exit */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_exit); } /* lmg %r2,%rN,reg_args_off(%r15) */ @@ -2860,7 +2872,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, REG_2 + (nr_reg_args - 1), REG_15, tjit->reg_args_off); /* lgr %r1,%r8 */ - if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!(flags & BPF_TRAMP_F_SKIP_FRAME) && + (flags & BPF_TRAMP_F_ORIG_STACK)) EMIT4(0xb9040000, REG_1, REG_8); /* lmg %r7,%r8,r7_r8_off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, @@ -2879,9 +2892,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); if (flags & BPF_TRAMP_F_SKIP_FRAME) EMIT_JUMP_REG(14); - else + else if (flags & BPF_TRAMP_F_ORIG_STACK) EMIT_JUMP_REG(1); - + else + /* brcl 0xf,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + func_addr + S390X_PATCH_SIZE); return 0; } @@ -2951,6 +2967,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } @@ -2989,3 +3010,8 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *, u64, u64, u64), prev_addr = addr; } } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/s390/net/bpf_timed_may_goto.S b/arch/s390/net/bpf_timed_may_goto.S new file mode 100644 index 00000000000000..06f567a460d7bd --- /dev/null +++ b/arch/s390/net/bpf_timed_may_goto.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define R2_OFF 0 +#define R5_OFF (R2_OFF + (5 - 2 + 1) * 8) +#define R14_OFF (R5_OFF + 8) +#define RETADDR_OFF (R14_OFF + 8) +#define R15_OFF (RETADDR_OFF + 8) +#define BACKCHAIN_OFF (R15_OFF + 8) +#define FRAME_SIZE (BACKCHAIN_OFF + 8) +#define FRAME_OFF (STACK_FRAME_OVERHEAD - FRAME_SIZE) +#if (FRAME_OFF + BACKCHAIN_OFF) != __SF_BACKCHAIN +#error Stack frame layout calculation is broken +#endif + + GEN_BR_THUNK %r1 + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* + * This function has a special ABI: the parameters are in %r12 and + * %r13; the return value is in %r12; all GPRs except %r0, %r1, and + * %r12 are callee-saved; and the return address is in %r0. 
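Editor's note: the assembly that follows computes its frame slots by chaining offsets, then fails the build if the result disagrees with the ABI constant __SF_BACKCHAIN. The equivalent defensive pattern in C is a static assertion over the derived layout (values here are illustrative, mirroring the .S file's arithmetic):

#define R2_OFF		0
#define R5_OFF		(R2_OFF + (5 - 2 + 1) * 8)	/* %r2..%r5 saved */
#define R14_OFF		(R5_OFF + 8)
#define RETADDR_OFF	(R14_OFF + 8)
#define R15_OFF		(RETADDR_OFF + 8)
#define BACKCHAIN_OFF	(R15_OFF + 8)
#define FRAME_SIZE	(BACKCHAIN_OFF + 8)

_Static_assert(FRAME_SIZE == 72, "stack frame layout drifted");

Catching a layout drift at compile time is far cheaper than debugging a corrupted backchain at run time.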
+ */ + stmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + stg %r14,FRAME_OFF+R14_OFF(%r15) + stg %r0,FRAME_OFF+RETADDR_OFF(%r15) + stg %r15,FRAME_OFF+R15_OFF(%r15) + lgr %r1,%r15 + lay %r15,-FRAME_SIZE(%r15) + stg %r1,__SF_BACKCHAIN(%r15) + + lay %r2,0(%r12,%r13) + brasl %r14,bpf_check_timed_may_goto + lgr %r12,%r2 + + lg %r15,FRAME_SIZE+FRAME_OFF+R15_OFF(%r15) + lmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + lg %r14,FRAME_OFF+R14_OFF(%r15) + lg %r1,FRAME_OFF+RETADDR_OFF(%r15) + BR_EX %r1 +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 10ceb0d6b5a997..aba3aa96a50e99 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -24,7 +24,7 @@ #include #endif -static inline unsigned long ffz(unsigned long word) +static inline unsigned long __attribute_const__ ffz(unsigned long word) { unsigned long result; @@ -44,7 +44,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static inline __attribute_const__ unsigned long __ffs(unsigned long word) { unsigned long result; diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index a0322e8328456e..429b6a76314684 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 92b6649d492952..62f753a85b89c7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -89,7 +89,7 @@ asmlinkage void ret_from_kernel_thread(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7b595092cbfb65..a630d373e6453c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -110,6 +110,8 @@ config SPARC64 select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config ARCH_PROC_KCORE_TEXT def_bool y @@ -288,24 +290,6 @@ if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SPARC64 && SMP - default y - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with SPARC cpus at a cost of slightly increased overhead - in some places. If unsure say N here. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SPARC64 && SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
- config CMDLINE_BOOL bool "Default bootloader kernel arguments" depends on SPARC64 diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig index f5b2e720fec3c1..f755da97953462 100644 --- a/arch/sparc/crypto/Kconfig +++ b/arch/sparc/crypto/Kconfig @@ -16,16 +16,6 @@ config CRYPTO_DES_SPARC64 Architecture: sparc64 -config CRYPTO_MD5_SPARC64 - tristate "Digests: MD5" - depends on SPARC64 - select CRYPTO_MD5 - select CRYPTO_HASH - help - MD5 message digest algorithm (RFC1321) - - Architecture: sparc64 using crypto instructions, when available - config CRYPTO_AES_SPARC64 tristate "Ciphers: AES, modes: ECB, CBC, CTR" depends on SPARC64 diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 0d05a17988c4cd..7b4796842ddd7c 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -3,14 +3,10 @@ # Arch-specific CryptoAPI modules. # -obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o - obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o -md5-sparc64-y := md5_asm.o md5_glue.o - aes-sparc64-y := aes_asm.o aes_glue.o des-sparc64-y := des_asm.o des_glue.o camellia-sparc64-y := camellia_asm.o camellia_glue.o diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c deleted file mode 100644 index b3615f0cdf6262..00000000000000 --- a/arch/sparc/crypto/md5_glue.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes. - * - * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c - * and crypto/md5.c which are: - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald - * Copyright (c) Jean-Francois Dive - * Copyright (c) Mathias Krause - * Copyright (c) Cryptoapi developers. - * Copyright (c) 2002 James Morris - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct sparc_md5_state { - __le32 hash[MD5_HASH_WORDS]; - u64 byte_count; -}; - -asmlinkage void md5_sparc64_transform(__le32 *digest, const char *data, - unsigned int rounds); - -static int md5_sparc64_init(struct shash_desc *desc) -{ - struct sparc_md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = cpu_to_le32(MD5_H0); - mctx->hash[1] = cpu_to_le32(MD5_H1); - mctx->hash[2] = cpu_to_le32(MD5_H2); - mctx->hash[3] = cpu_to_le32(MD5_H3); - mctx->byte_count = 0; - - return 0; -} - -static int md5_sparc64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - - sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE); - md5_sparc64_transform(sctx->hash, data, len / MD5_HMAC_BLOCK_SIZE); - return len - round_down(len, MD5_HMAC_BLOCK_SIZE); -} - -/* Add padding and return the message digest. */ -static int md5_sparc64_finup(struct shash_desc *desc, const u8 *src, - unsigned int offset, u8 *out) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - __le64 block[MD5_BLOCK_WORDS] = {}; - u8 *p = memcpy(block, src, offset); - __le32 *dst = (__le32 *)out; - __le64 *pbits; - int i; - - src = p; - p += offset; - *p++ = 0x80; - sctx->byte_count += offset; - pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 
1 : 2)) - 1]; - *pbits = cpu_to_le64(sctx->byte_count << 3); - md5_sparc64_transform(sctx->hash, src, (pbits - block + 1) / 8); - memzero_explicit(block, sizeof(block)); - - /* Store state in digest */ - for (i = 0; i < MD5_HASH_WORDS; i++) - dst[i] = sctx->hash[i]; - - return 0; -} - -static int md5_sparc64_export(struct shash_desc *desc, void *out) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u32 *u32; - u64 *u64; - } p = { .u8 = out }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - put_unaligned(le32_to_cpu(sctx->hash[i]), p.u32++); - put_unaligned(sctx->byte_count, p.u64); - return 0; -} - -static int md5_sparc64_import(struct shash_desc *desc, const void *in) -{ - struct sparc_md5_state *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u32 *u32; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < MD5_HASH_WORDS; i++) - sctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++)); - sctx->byte_count = get_unaligned(p.u64); - return 0; -} - -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = md5_sparc64_init, - .update = md5_sparc64_update, - .finup = md5_sparc64_finup, - .export = md5_sparc64_export, - .import = md5_sparc64_import, - .descsize = sizeof(struct sparc_md5_state), - .statesize = sizeof(struct sparc_md5_state), - .base = { - .cra_name = "md5", - .cra_driver_name= "md5-sparc64", - .cra_priority = SPARC_CR_OPCODE_PRIORITY, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static bool __init sparc64_has_md5_opcode(void) -{ - unsigned long cfr; - - if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) - return false; - - __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); - if (!(cfr & CFR_MD5)) - return false; - - return true; -} - -static int __init md5_sparc64_mod_init(void) -{ - if (sparc64_has_md5_opcode()) { - pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n"); - return crypto_register_shash(&alg); - } - pr_info("sparc64 md5 opcode not available.\n"); - return -ENODEV; -} - -static void __exit md5_sparc64_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(md5_sparc64_mod_init); -module_exit(md5_sparc64_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Message Digest Algorithm, sparc64 md5 opcode accelerated"); - -MODULE_ALIAS_CRYPTO("md5"); - -#include "crop_devid.c" diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index 005a8ae858f16c..2c7d33b3ec2ea5 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -23,8 +23,8 @@ void set_bit(unsigned long nr, volatile unsigned long *addr); void clear_bit(unsigned long nr, volatile unsigned long *addr); void change_bit(unsigned long nr, volatile unsigned long *addr); -int fls(unsigned int word); -int __fls(unsigned long word); +int __attribute_const__ fls(unsigned int word); +int __attribute_const__ __fls(unsigned long word); #include @@ -32,8 +32,8 @@ int __fls(unsigned long word); #ifdef __KERNEL__ -int ffs(int x); -unsigned long __ffs(unsigned long); +int __attribute_const__ ffs(int x); +unsigned long __attribute_const__ __ffs(unsigned long); #include #include diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c index 3d9b9855dce917..6e660bde48dd89 100644 --- a/arch/sparc/kernel/asm-offsets.c +++ b/arch/sparc/kernel/asm-offsets.c @@ -10,6 +10,7 @@ * * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. 
*/ +#define COMPILE_OFFSETS #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 9c7c662cb5659e..5a28c0e91bf15f 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -260,7 +260,7 @@ extern void ret_from_kernel_thread(void); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 529adfecd58ca1..25781923788a03 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -567,7 +567,7 @@ void fault_in_user_windows(struct pt_regs *regs) */ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct thread_info *t = task_thread_info(p); diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index ad8d78fb1d9aaf..de7867ae220d0c 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -1250,10 +1250,12 @@ static int virtio_uml_probe(struct platform_device *pdev) device_set_wakeup_capable(&vu_dev->vdev.dev, true); rc = register_virtio_device(&vu_dev->vdev); - if (rc) + if (rc) { put_device(&vu_dev->vdev.dev); + return rc; + } vu_dev->registered = 1; - return rc; + return 0; error_init: os_close_file(vu_dev->sock); diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c index 1fb12235ab9c84..a69873aa697f4f 100644 --- a/arch/um/kernel/asm-offsets.c +++ b/arch/um/kernel/asm-offsets.c @@ -1 +1,3 @@ +#define COMPILE_OFFSETS + #include diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 1be644de9e41ec..9c9c66dc45f054 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -143,7 +143,7 @@ static void fork_handler(void) int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; void (*handler)(void); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 617886d1fb1e91..21f0e50fb1df95 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -535,7 +535,7 @@ ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, cmsg->cmsg_type != SCM_RIGHTS) return n; - memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len); + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0)); return n; } diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 4193e04d7e4a7f..e3ad71a0d13c41 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -20,8 +20,7 @@ void stack_protections(unsigned long address) { - if (mprotect((void *) address, UM_THREAD_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC) < 0) + if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0) panic("protecting stack failed, errno = %d", errno); } diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index f7fb3d88c57bd8..36b985d0e7bf8a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -3,6 +3,8 @@ # Branch profiling isn't noinstr-safe. 
Disable it for arch/x86/*
 subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+obj-y += boot/startup/
+
 obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/
 obj-y += entry/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58d890fe2100eb..75f3de70df51f0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,7 +14,6 @@ config X86_32
 select ARCH_WANT_IPC_PARSE_VERSION
 select CLKSRC_I8253
 select CLONE_BACKWARDS
- select GENERIC_VDSO_32
 select HAVE_DEBUG_STACKOVERFLOW
 select KMAP_LOCAL
 select MODULES_USE_ELF_REL
@@ -26,7 +25,6 @@ config X86_64
 depends on 64BIT
 # Options that are inherently 64-bit kernel only:
 select ARCH_HAS_GIGANTIC_PAGE
- select ARCH_HAS_PTDUMP
 select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
 select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 select ARCH_SUPPORTS_PER_VMA_LOCK
@@ -99,6 +97,7 @@ config X86
 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 select ARCH_HAS_PMEM_API if X86_64
 select ARCH_HAS_PREEMPT_LAZY
+ select ARCH_HAS_PTDUMP
 select ARCH_HAS_PTE_SPECIAL
 select ARCH_HAS_HW_PTE_YOUNG
 select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
@@ -127,8 +126,8 @@ config X86
 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
- select ARCH_SUPPORTS_CFI_CLANG if X86_64
- select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
+ select ARCH_SUPPORTS_CFI if X86_64
+ select ARCH_USES_CFI_TRAPS if X86_64 && CFI
 select ARCH_SUPPORTS_LTO_CLANG
 select ARCH_SUPPORTS_LTO_CLANG_THIN
 select ARCH_SUPPORTS_RT
@@ -182,8 +181,6 @@ config X86
 select GENERIC_SMP_IDLE_THREAD
 select GENERIC_TIME_VSYSCALL
 select GENERIC_GETTIMEOFDAY
- select GENERIC_VDSO_DATA_STORE
- select GENERIC_VDSO_TIME_NS
 select GENERIC_VDSO_OVERFLOW_PROTECT
 select GUP_GET_PXX_LOW_HIGH if X86_PAE
 select HARDIRQS_SW_RESEND
@@ -239,6 +236,7 @@ config X86
 select HAVE_EFFICIENT_UNALIGNED_ACCESS
 select HAVE_EISA if X86_32
 select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
 select HAVE_GUP_FAST
 select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
 select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
@@ -330,6 +328,10 @@ config X86
 imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
 select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
 select ARCH_SUPPORTS_PT_RECLAIM if X86_64
+ select ARCH_SUPPORTS_SCHED_SMT if SMP
+ select SCHED_SMT if SMP
+ select ARCH_SUPPORTS_SCHED_CLUSTER if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP
 config INSTRUCTION_DECODER
 def_bool y
@@ -483,6 +485,19 @@ config X86_X2APIC
 If in doubt, say Y.
+config AMD_SECURE_AVIC
+ bool "AMD Secure AVIC"
+ depends on AMD_MEM_ENCRYPT && X86_X2APIC
+ help
+ Enable this to get AMD Secure AVIC support on guests that have this feature.
+
+ AMD Secure AVIC provides hardware acceleration for performance-sensitive
+ APIC accesses and support for managing guest-owned APIC state for SEV-SNP
+ guests. Secure AVIC does not support xAPIC mode. It has a functional
+ dependency on x2APIC being enabled in the guest.
+
+ If you don't know what to do here, say N.
+
 config X86_POSTED_MSI
 bool "Enable MSI and MSI-x delivery by posted interrupts"
 depends on X86_64 && IRQ_REMAP
@@ -879,6 +894,15 @@ config ACRN_GUEST
 IOT with small footprint and real-time features. More details can be
 found in https://projectacrn.org/.
+config BHYVE_GUEST
+ bool "Bhyve (BSD Hypervisor) Guest support"
+ depends on X86_64
+ help
+ This option allows Linux to recognise when it is running as a
+ guest in the Bhyve hypervisor, and to support more than 255 vCPUs
+ when doing so.
More details about Bhyve can be found at https://bhyve.org
+ and https://wiki.freebsd.org/bhyve/.
+
 config INTEL_TDX_GUEST
 bool "Intel TDX (Trust Domain Extensions) - Guest Support"
 depends on X86_64 && CPU_SUP_INTEL
@@ -1031,29 +1055,6 @@ config NR_CPUS
 This is purely to save memory: each supported CPU adds about 8KB
 to the kernel image.
-config SCHED_CLUSTER
- bool "Cluster scheduler support"
- depends on SMP
- default y
- help
- Cluster scheduler support improves the CPU scheduler's decision
- making when dealing with machines that have clusters of CPUs.
- Cluster usually means a couple of CPUs which are placed closely
- by sharing mid-level caches, last-level cache tags or internal
- busses.
-
-config SCHED_SMT
- def_bool y if SMP
-
-config SCHED_MC
- def_bool y
- prompt "Multi-core scheduler support"
- depends on SMP
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
 config SCHED_MC_PRIO
 bool "CPU core priorities scheduler support"
 depends on SCHED_MC
@@ -1340,7 +1341,7 @@ config MICROCODE_LATE_LOADING
 use this at your own risk. Late loading taints the kernel unless the
 microcode header indicates that it is safe for late loading via the
 minimal revision check. This minimal revision check can be enforced on
- the kernel command line with "microcode.minrev=Y".
+ the kernel command line with "microcode=force_minrev".
 config MICROCODE_LATE_FORCE_MINREV
 bool "Enforce late microcode loading minimal revision check"
@@ -1356,10 +1357,22 @@ config MICROCODE_LATE_FORCE_MINREV
 revision check fails.
 This minimal revision check can also be controlled via the
- "microcode.minrev" parameter on the kernel command line.
+ "microcode=force_minrev" parameter on the kernel command line.
 If unsure say Y.
+config MICROCODE_DBG
+ bool "Enable microcode loader debugging"
+ default n
+ depends on MICROCODE
+ help
+ Enable code which allows for debugging the microcode loader in
+ a guest: patch loading is simulated, but everything else related
+ to patch parsing and handling is done as on bare metal, with the
+ sole purpose of debugging the software side of things.
+
+ You almost certainly want to say n here.
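+
+ When experimenting with this option, the behaviour of the simulated
+ loader can be steered further via the "microcode=" command line
+ switches documented in admin-guide/kernel-parameters.txt, e.g.
+ "microcode=force_minrev".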
+
 config X86_MSR
 tristate "/dev/cpu/*/msr - Model-specific register support"
 help
@@ -1753,11 +1766,7 @@ config X86_UMIP
 config CC_HAS_IBT
 # GCC >= 9 and binutils >= 2.29
 # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654
- # Clang/LLVM >= 14
- # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f
- # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332
- def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \
- (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \
+ def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \
 $(as-instr,endbr64)
 config X86_CET
@@ -1769,8 +1778,6 @@ config X86_KERNEL_IBT
 prompt "Indirect Branch Tracking"
 def_bool y
 depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
- # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
- depends on !LD_IS_LLD || LLD_VERSION >= 140000
 select OBJTOOL
 select X86_CET
 help
@@ -2396,11 +2403,11 @@ config FUNCTION_PADDING_CFI
 default 3 if FUNCTION_ALIGNMENT_8B
 default 0
-# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG
+# Basically: FUNCTION_ALIGNMENT - 5*CFI
 # except Kconfig can't do arithmetic :/
 config FUNCTION_PADDING_BYTES
 int
- default FUNCTION_PADDING_CFI if CFI_CLANG
+ default FUNCTION_PADDING_CFI if CFI
 default FUNCTION_ALIGNMENT
 config CALL_PADDING
@@ -2410,7 +2417,7 @@ config CALL_PADDING
 config FINEIBT
 def_bool y
- depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
+ depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE
 select CALL_PADDING
 config FINEIBT_BHI
@@ -2427,7 +2434,7 @@ config CALL_THUNKS
 config PREFIX_SYMBOLS
 def_bool y
- depends on CALL_PADDING && !CFI_CLANG
+ depends on CALL_PADDING && !CFI
 menuconfig CPU_MITIGATIONS
 bool "Mitigations for CPU vulnerabilities"
@@ -2701,6 +2708,15 @@ config MITIGATION_TSA
 security vulnerability on AMD CPUs which can lead to forwarding of
 invalid info to subsequent instructions and thus can affect their
 timing and thereby cause a leakage.
+
+config MITIGATION_VMSCAPE
+ bool "Mitigate VMSCAPE"
+ depends on KVM
+ default y
+ help
+ Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security
+ vulnerability on Intel and AMD CPUs that may allow a guest to do
+ Spectre v2 style attacks on the userspace hypervisor.
 endif
 config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1913d342969ba2..4db7e4bf69f5cd 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -13,8 +13,8 @@ else
 endif
 ifdef CONFIG_CC_IS_GCC
-RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
-RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
+RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register
+RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register
 endif
 ifdef CONFIG_CC_IS_CLANG
 RETPOLINE_CFLAGS := -mretpoline-external-thunk
@@ -37,10 +37,11 @@ export RETPOLINE_VDSO_CFLAGS
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
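# Note the differing units: gcc takes the log2 of the alignment in bytes,
# while clang takes the alignment itself, so e.g. an 8-byte aligned stack
# is requested with -mpreferred-stack-boundary=3 under gcc but with
# -mstack-alignment=8 under clang, as the assignments below show.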
-ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) +ifdef CONFIG_CC_IS_GCC cc_stack_align4 := -mpreferred-stack-boundary=2 cc_stack_align8 := -mpreferred-stack-boundary=3 -else ifneq ($(call cc-option, -mstack-alignment=16),) +endif +ifdef CONFIG_CC_IS_CLANG cc_stack_align4 := -mstack-alignment=4 cc_stack_align8 := -mstack-alignment=8 endif @@ -83,19 +84,7 @@ KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-av # CC_FLAGS_FPU := -msse -msse2 ifdef CONFIG_CC_IS_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 -# -# The "-msse" in the first argument is there so that the -# -mpreferred-stack-boundary=3 build error: -# -# -mpreferred-stack-boundary=3 is not between 4 and 12 -# -# can be triggered. Otherwise gcc doesn't complain. CC_FLAGS_FPU += -mhard-float -CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4) endif ifeq ($(CONFIG_X86_KERNEL_IBT),y) @@ -159,7 +148,7 @@ else # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += -mno-80387 - KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -mno-fp-ret-in-387 # By default gcc and clang use a stack alignment of 16 bytes for x86. # However the standard kernel entry on x86-64 leaves the stack on an @@ -171,7 +160,7 @@ else KBUILD_CFLAGS += $(cc_stack_align8) # Use -mskip-rax-setup if supported. - KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + KBUILD_CFLAGS += -mskip-rax-setup ifdef CONFIG_X86_NATIVE_CPU KBUILD_CFLAGS += -march=native @@ -286,7 +275,6 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects -core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a38fdcdb9bd39..74657589264dfa 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b5991da001a7..0f41ca0e52c0fb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -332,6 +332,8 @@ static size_t parse_elf(void *output) } const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; const unsigned long kernel_total_size = VO__end - VO__text; static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 89dd02de2a0f05..7530ad8b768b1d 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include "error.h" #include "sev.h" #include @@ -14,6 +15,8 @@ #include #define __BOOT_COMPRESSED +#undef 
__init +#define __init /* Basic instruction decoding support needed */ #include "../../lib/inat.c" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea228f..6e5c32a53d0342 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -32,102 +32,47 @@ struct ghcb *boot_ghcb; #undef __init #define __init -#undef __head -#define __head - #define __BOOT_COMPRESSED -extern struct svsm_ca *boot_svsm_caa; -extern u64 boot_svsm_caa_pa; - -struct svsm_ca *svsm_get_caa(void) -{ - return boot_svsm_caa; -} - -u64 svsm_get_caa_pa(void) -{ - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); - u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb *ghcb; - int ret; - - if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - return ret; -} - static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) -{ - u64 val, msr; - - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. - */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(paddr, paddr, false); - - /* Save the current GHCB MSR value */ - msr = sev_es_rd_ghcb_msr(); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* Restore the GHCB MSR value */ - sev_es_wr_ghcb_msr(msr); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. 
- */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(paddr, paddr, true); -} - void snp_set_page_private(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, &d); } bool early_setup_ghcb(void) @@ -152,8 +97,14 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, &d); } void sev_es_shutdown_ghcb(void) @@ -235,15 +186,23 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + /* * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ #define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ - MSR_AMD64_SNP_SECURE_TSC) + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) u64 snp_get_unsupported_features(u64 status) { @@ -347,7 +306,7 @@ static bool early_snp_init(struct boot_params *bp) * running at VMPL0. The CA will be used to communicate with the * SVSM and request its services. */ - svsm_setup_ca(cc_info); + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); /* * Pass run-time kernel a pointer to CC info via boot_params so EFI @@ -391,6 +350,8 @@ static int sev_check_cpu_support(void) if (!(eax & BIT(1))) return -ENODEV; + sev_snp_needs_sfw = !(ebx & BIT(31)); + return ebx & 0x3f; } @@ -453,30 +414,16 @@ void sev_enable(struct boot_params *bp) */ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { u64 hv_features; - int ret; hv_features = get_hv_features(); if (!(hv_features & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* - * Enforce running at VMPL0 or with an SVSM. - * - * Use RMPADJUST (see the rmpadjust() function for a description of - * what the instruction does) to update the VMPL1 permissions of a - * page. If the guest is running at VMPL0, this will succeed. If the - * guest is running at any other VMPL, this will fail. Linux SNP guests - * only ever run at a single VMPL level so permission mask changes of a - * lesser-privileged VMPL are a don't-care. - */ - ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1); - - /* - * Running at VMPL0 is not required if an SVSM is present and the hypervisor - * supports the required SVSM GHCB events. + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. 
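+ * (snp_vmpl is non-zero only when an SVSM was discovered during early
+ * SNP init, so the check below terminates only SVSM-based guests whose
+ * hypervisor lacks the multi-VMPL GHCB support they depend on.)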
*/ - if (ret && - !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))) + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } @@ -550,7 +497,6 @@ bool early_is_sevsnp_guest(void) /* Obtain the address of the calling area to use */ boot_rdmsr(MSR_SVSM_CAA, &m); - boot_svsm_caa = (void *)m.q; boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 63e037e94e4c03..916bac09b464da 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -106,18 +106,5 @@ void get_cpuflags(void) cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], &cpu.flags[1]); } - - if (max_amd_level >= 0x8000001f) { - u32 ebx; - - /* - * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in - * the virtualization flags entry (word 8) and set by - * scattered.c, so the bit needs to be explicitly set. - */ - cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored); - if (ebx & BIT(31)) - set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags); - } } } diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index b514f7e81332aa..e8fdf020b4223e 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ -Os -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ -fno-stack-protector -D__NO_FORTIFY \ -fno-jump-tables \ -include $(srctree)/include/linux/hidden.h @@ -19,6 +20,7 @@ KCOV_INSTRUMENT := n obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) lib-$(CONFIG_X86_64) += la57toggle.o lib-$(CONFIG_EFI_MIXED) += efi-mixed.o @@ -28,3 +30,23 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o # to be linked into the decompressor or the EFI stub but not vmlinux # $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. +# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. +# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 00000000000000..01d2363dc445f0 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. 
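+ *
+ * Each PROVIDE() below is a linker script alias: a reference to the
+ * plain symbol name from the rest of the kernel resolves to the
+ * __pi_-prefixed symbol exported by the startup objects, e.g. a
+ * runtime call to sev_es_terminate() links against
+ * __pi_sev_es_terminate.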
+ */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c index a3112a69b06a77..d16102abdaec34 100644 --- a/arch/x86/boot/startup/gdt_idt.c +++ b/arch/x86/boot/startup/gdt_idt.c @@ -24,7 +24,7 @@ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; /* This may run while still in the direct mapping */ -void __head startup_64_load_idt(void *vc_handler) +void startup_64_load_idt(void *vc_handler) { struct desc_ptr desc = { .address = (unsigned long)rip_rel_ptr(bringup_idt_table), @@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler) /* * Setup boot CPU state needed before kernel switches to virtual addresses. */ -void __head startup_64_setup_gdt_idt(void) +void __init startup_64_setup_gdt_idt(void) { struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); void *handler = NULL; diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c index 332dbe6688c4d9..83ba98d6157272 100644 --- a/arch/x86/boot/startup/map_kernel.c +++ b/arch/x86/boot/startup/map_kernel.c @@ -30,7 +30,7 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd, unsigned long p2v_offset) { @@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, * the 1:1 mapping of memory. Kernel virtual addresses can be determined by * subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long p2v_offset, +unsigned long __init __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a34cd19796f9ac..4e22ffd735168b 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,35 +12,12 @@ #include #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#undef vc_forward_exception -#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif -/* - * SVSM related information: - * During boot, the page tables are set up as identity mapped and later - * changed to use kernel virtual addresses. Maintain separate virtual and - * physical addresses for the CAA to allow SVSM functions to be used during - * early boot, both with identity mapped virtual addresses and proper kernel - * virtual addresses. - */ -struct svsm_ca *boot_svsm_caa __ro_after_init; -u64 boot_svsm_caa_pa __ro_after_init; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. 
- * - * GHCB protocol version negotiated with the hypervisor. - */ -static u16 ghcb_version __ro_after_init; - /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; @@ -54,17 +31,9 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } +bool sev_snp_needs_sfw; - return true; -} - -void __head __noreturn +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -83,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -u64 get_hv_features(void) +u64 __init get_hv_features(void) { u64 val; @@ -100,72 +69,7 @@ u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static inline int svsm_process_result_codes(struct svsm_call *call) +int svsm_process_result_codes(struct svsm_call *call) { switch (call->rax_out) { case SVSM_SUCCESS: @@ -193,7 +97,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call) * - RAX specifies the SVSM protocol/callid as input and the return code * as output. 
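 *
 * For example, requesting the core protocol's PVALIDATE service amounts
 * to placing SVSM_CORE_CALL(SVSM_CORE_PVALIDATE) in RAX and the physical
 * address of the request buffer in RCX, as svsm_pval_4k_page() does
 * further down in this file.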
*/ -static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) +void svsm_issue_call(struct svsm_call *call, u8 *pending) { register unsigned long rax asm("rax") = call->rax; register unsigned long rcx asm("rcx") = call->rcx; @@ -216,7 +120,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) call->r9_out = r9; } -static int svsm_perform_msr_protocol(struct svsm_call *call) +int svsm_perform_msr_protocol(struct svsm_call *call) { u8 pending = 0; u64 val, resp; @@ -247,63 +151,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call) return svsm_process_result_codes(call); } -static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) -{ - struct es_em_ctxt ctxt; - u8 pending = 0; - - vc_ghcb_invalidate(ghcb); - - /* - * Fill in protocol and format specifiers. This can be called very early - * in the boot, so use rip-relative references as needed. - */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - - svsm_issue_call(call, &pending); - - if (pending) - return -EINVAL; - - switch (verify_exception_info(ghcb, &ctxt)) { - case ES_OK: - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - fallthrough; - default: - return -EINVAL; - } - - return svsm_process_result_codes(call); -} - -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) { u64 val; @@ -342,44 +189,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) return ret; } -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} /* * This may be called early while still running on the initial identity @@ -412,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. 
*/ -static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -448,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) return xsave_size; } -static bool __head +static bool snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -484,21 +294,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(ghcb, ctxt, leaf)) + if (__sev_cpuid_hv_msr(leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int __head -snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -517,7 +327,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -565,7 +375,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, } break; case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -586,8 +396,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -621,7 +431,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(ghcb, ctxt, leaf); + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); } /* @@ -629,7 +439,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) * page yet, so it only supports the MSR based communication with the * hypervisor and only the CPUID exit-code. */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); @@ -648,13 +458,24 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(NULL, NULL, &leaf); + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. 
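+ *
+ * The function-pointer argument keeps snp_cpuid() transport-agnostic:
+ * this early path passes snp_cpuid_hv_msr, which uses the MSR protocol
+ * and therefore works before any GHCB exists, while later callers can
+ * supply a GHCB-based helper of their own.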
+ */ + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; if (ret != -EOPNOTSUPP) goto fail; + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ if (__sev_cpuid_hv_msr(&leaf)) goto fail; @@ -705,7 +526,7 @@ struct cc_setup_data { * Search for a Confidential Computing blob passed in as a setup_data entry * via the Linux Boot Protocol. */ -static __head +static __init struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) { struct cc_setup_data *sd = NULL; @@ -733,7 +554,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) * mapping needs to be updated in sync with all the changes to virtual memory * layout and related mapping facilities throughout the boot process. */ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) { const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; int i; @@ -761,13 +582,24 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) +static int svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; unsigned long flags; u64 pc_pa; - int ret; /* * This can be called very early in the boot, use native functions in @@ -775,10 +607,10 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) */ flags = native_local_irq_save(); - call.caa = svsm_get_caa(); + call.caa = caa; pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); pc->num_entries = 1; pc->cur_index = 0; @@ -792,20 +624,24 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); call.rcx = pc_pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. + */ + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate) +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; if (snp_vmpl) { - svsm_pval_4k_page(paddr, validate); + svsm_pval_4k_page(paddr, validate, caa, caa_pa); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) @@ -816,15 +652,51 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * If validating memory (making it private) and affected by the * cache-coherency vulnerability, perform the cache eviction mitigation. 
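 *
 * sev_snp_needs_sfw is set in sev_check_cpu_support() when CPUID
 * 0x8000001f EBX bit 31 (COHERENCY_SFW_NO) is clear, i.e. when the
 * hardware does not declare the software flush workaround unnecessary.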
*/ - if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + if (validate && sev_snp_needs_sfw) sev_evict_cache((void *)vaddr, 1); } +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); +} + /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) { struct snp_secrets_page *secrets_page; struct snp_cpuid_table *cpuid_table; @@ -847,7 +719,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) return false; /* @@ -875,11 +747,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) if (caa & (PAGE_SIZE - 1)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); - /* - * The CA is identity mapped when this routine is called, both by the - * decompressor code and the early kernel code. - */ - boot_svsm_caa = (struct svsm_ca *)caa; boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 0b7e3b95018389..09725428d3e657 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,143 +41,14 @@ #include #include -/* For early boot hypervisor communication in SEV-ES enabled guests */ -struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -u64 sev_secrets_pa __ro_after_init; - -/* For early boot SVSM communication */ -struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - -DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -DEFINE_PER_CPU(u64, svsm_caa_pa); - -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. 
- */ -noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - native_local_irq_restore(flags); - - return ret; -} - -void __head +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) + unsigned long npages, const struct psc_desc *desc) { unsigned long paddr_end; - u64 val; vaddr = vaddr & PAGE_MASK; @@ -185,42 +56,22 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. 
- */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + __page_state_change(vaddr, paddr, desc); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); } -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -234,12 +85,18 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); + early_set_pages_state(vaddr, paddr, npages, &d); } -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -250,7 +107,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, &d); } /* @@ -266,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * * Scan for the blob in that order. */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -287,15 +144,15 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) found_cc_info: if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); return cc_info; } -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) { + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; - int ret; u64 pa; /* @@ -303,7 +160,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) * running at VMPL0. The CA will be used to communicate with the * SVSM to perform the SVSM services. */ - if (!svsm_setup_ca(cc_info)) + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) return; /* @@ -315,25 +172,25 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. 
+ * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. * * SVSM_CORE_REMAP_CA call: * RAX = 0 (Protocol=0, CallID=0) * RCX = New CA GPA */ - call.caa = svsm_get_caa(); + call.caa = (struct svsm_ca *)secrets->svsm_caa; call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - boot_svsm_caa = (struct svsm_ca *)pa; boot_svsm_caa_pa = pa; } -bool __head snp_init(struct boot_params *bp) +bool __init snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -361,8 +218,3 @@ bool __head snp_init(struct boot_params *bp) return true; } - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 70ea1748c0a786..e7ea65f3f1d6a5 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -91,7 +91,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) return pud; } -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +static void __init 
sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __head sme_pgtable_calc(unsigned long len) +static unsigned long __init sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len) return entries + tables; } -void __head sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __head sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; @@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp) return; me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); /* Check the SEV MSR whether SEV or SME is enabled */ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); @@ -531,7 +532,7 @@ void __head sme_enable(struct boot_params *bp) * enablement abort the guest. */ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { @@ -567,7 +568,6 @@ void __head sme_enable(struct boot_params *bp) #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Local version for startup code, which never operates on user page tables */ -__weak pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af6811434..989ca9f72ba30b 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 342d79f0ab6a8a..3b8ae214a6a64d 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o sev-nmi.o vc-handle.o +obj-y += core.o noinstr.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE_sev-nmi.o := n +UBSAN_SANITIZE_noinstr.o := n # GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining -KASAN_SANITIZE_sev-nmi.o := n -KCSAN_SANITIZE_sev-nmi.o := n +KASAN_SANITIZE_noinstr.o := n +KCSAN_SANITIZE_noinstr.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb2735..9ae3b11754e655 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,48 @@ #include #include +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; +SYM_PIC_ALIAS(sev_hv_features); + +/* 
Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; +SYM_PIC_ALIAS(sev_secrets_pa); + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); +SYM_PIC_ALIAS(boot_svsm_ca_page); + +/* + * SVSM related information: + * During boot, the page tables are set up as identity mapped and later + * changed to use kernel virtual addresses. Maintain separate virtual and + * physical addresses for the CAA to allow SVSM functions to be used during + * early boot, both with identity mapped virtual addresses and proper kernel + * virtual addresses. + */ +u64 boot_svsm_caa_pa __ro_after_init; +SYM_PIC_ALIAS(boot_svsm_caa_pa); + +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +static inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return rip_rel_ptr(&boot_svsm_ca_page); +} + +static inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -79,6 +121,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* @@ -100,6 +143,26 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); */ u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +SYM_PIC_ALIAS(snp_vmpl); + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +u16 ghcb_version __ro_after_init; +SYM_PIC_ALIAS(ghcb_version); + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); static u64 __init get_snp_jump_table_addr(void) { @@ -154,6 +217,73 @@ static u64 __init get_jump_table_addr(void) return ret; } +static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) +{ + struct es_em_ctxt ctxt; + u8 pending = 0; + + vc_ghcb_invalidate(ghcb); + + /* + * Fill in protocol and format specifiers. This can be called very early + * in the boot, so use rip-relative references as needed. 
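+	 *
+	 * A typical caller only fills in the request registers and lets
+	 * svsm_perform_call_protocol() below pick between the GHCB and
+	 * MSR transports. Illustrative sketch only (mirrors the REMAP_CA
+	 * call done elsewhere in this series):
+	 *
+	 *	struct svsm_call call = {};
+	 *
+	 *	call.caa = svsm_get_caa();
+	 *	call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+	 *	call.rcx = pa;
+	 *	if (svsm_perform_call_protocol(&call))
+	 *		sev_es_terminate(SEV_TERM_SET_LINUX,
+	 *				 GHCB_TERM_SVSM_CA_REMAP_FAIL);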
+ */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + + svsm_issue_call(call, &pending); + + if (pending) + return -EINVAL; + + switch (verify_exception_info(ghcb, &ctxt)) { + case ES_OK: + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + fallthrough; + default: + return -EINVAL; + } + + return svsm_process_result_codes(call); +} + +static int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : __pi_svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, int ret, u64 svsm_ret) { @@ -531,8 +661,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) unsigned long vaddr_end; /* Use the MSR protocol when a GHCB is not available. */ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + if (!boot_ghcb) { + struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() }; + + return early_set_pages_state(vaddr, __pa(vaddr), npages, &d); + } vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); @@ -973,6 +1106,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK; + /* SVME must be set. */ vmsa->efer = EFER_SVME; @@ -1107,6 +1243,105 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +u64 savic_ghcb_msr_read(u32 reg) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { .cx = msr }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); + /* MSR read failures are treated as fatal errors */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); + + return regs.ax | regs.dx << 32; +} + +void savic_ghcb_msr_write(u32 reg, u64 value) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { + .cx = msr, + .ax = lower_32_bits(value), + .dx = upper_32_bits(value) + }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); + /* MSR writes should never fail. 
Any failure is fatal error for SNP guest */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); +} + +enum es_result savic_register_gpa(u64 gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + ghcb_set_rbx(ghcb, gpa); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0); + + __sev_put_ghcb(&state); + + return res; +} + +enum es_result savic_unregister_gpa(u64 *gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0); + if (gpa && res == ES_OK) + *gpa = ghcb->save.rbx; + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; @@ -1233,7 +1468,8 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); + caa = cpu ? memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE) + : &boot_svsm_ca_page; per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -1287,32 +1523,9 @@ void __init sev_es_init_vc_handling(void) init_ghcb(cpu); } - /* If running under an SVSM, switch to the per-cpu CA */ - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - int ret; - - local_irq_save(flags); - - /* - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = this_cpu_read(svsm_caa_pa); - ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", - ret, call.rax_out); - + if (snp_vmpl) sev_cfg.use_cas = true; - local_irq_restore(flags); - } - sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ @@ -1590,15 +1803,6 @@ void sev_show_status(void) pr_cont("\n"); } -void __init snp_update_svsm_ca(void) -{ - if (!snp_vmpl) - return; - - /* Update the CAA to a proper kernel address */ - boot_svsm_caa = &boot_svsm_ca_page; -} - #ifdef CONFIG_SYSFS static ssize_t vmpl_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c similarity index 61% rename from arch/x86/coco/sev/sev-nmi.c rename to arch/x86/coco/sev/noinstr.c index d8dfaddfb3671e..b527eafb631235 100644 --- a/arch/x86/coco/sev/sev-nmi.c +++ b/arch/x86/coco/sev/noinstr.c @@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. 
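+ *
+ * The expected calling pattern is therefore (illustrative sketch; see
+ * e.g. svsm_perform_call_protocol() for a real user):
+ *
+ *	struct ghcb_state state;
+ *	unsigned long flags;
+ *	struct ghcb *ghcb;
+ *
+ *	local_irq_save(flags);
+ *	ghcb = __sev_get_ghcb(&state);
+ *	... issue VMGEXITs through ghcb ...
+ *	__sev_put_ghcb(&state);
+ *	local_irq_restore(flags);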
+ */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8c6..7fc136a353347b 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -351,6 +351,8 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, } #define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define error(v) +#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" @@ -402,14 +404,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool return ES_OK; } -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - bool write; - - /* Is it a WRMSR? */ - write = ctxt->insn.opcode.bytes[1] == 0x30; switch (regs->cx) { case MSR_SVSM_CAA: @@ -419,6 +417,15 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(ctxt, write); break; + case MSR_AMD64_SAVIC_CONTROL: + /* + * AMD64_SAVIC_CONTROL should not be intercepted when + * Secure AVIC is enabled. Terminate the Secure AVIC guest + * if the interception is enabled. 
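+		 *
+		 * Returning ES_VMM_ERROR below causes the #VC handler to
+		 * terminate the guest. With Secure AVIC active the MSR is
+		 * expected to be accessed directly instead, e.g.
+		 * (illustrative sketch, assuming the native MSR helpers):
+		 *
+		 *	native_wrmsrq(MSR_AMD64_SAVIC_CONTROL,
+		 *		      MSR_AMD64_SAVIC_EN |
+		 *		      MSR_AMD64_SAVIC_ALLOWEDNMI);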
+ */ + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return ES_VMM_ERROR; + break; default: break; } @@ -439,6 +446,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30); +} + static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) { int trapnr = ctxt->fi.vector; diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 2c0ab0fdc0603e..9b01c9ad81be62 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,15 +409,109 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +struct cpuid_ctx { + struct ghcb *ghcb; + struct es_em_ctxt *ctxt; +}; + +static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf) +{ + struct cpuid_ctx *ctx = p; + + if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct cpuid_ctx ctx = { ghcb, ctxt }; struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); + ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; @@ -502,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +void 
snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index d5d091e03bd34b..98b6952ba9d2e2 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -12,7 +12,6 @@ CONFIG_CPU_FREQ=y # x86 xen specific config options CONFIG_XEN_PVH=y -CONFIG_XEN_SAVE_RESTORE=y # CONFIG_XEN_DEBUG_FS is not set CONFIG_XEN_MCE_LOG=y CONFIG_XEN_ACPI_PROCESSOR=m diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 94016c60561e28..d9c6fc78cf3324 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -2,19 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" -config CRYPTO_CURVE25519_X86 - tristate - depends on 64BIT - select CRYPTO_KPP - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CURVE25519 - default CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 algorithm - - Architecture: x86_64 using: - - ADX (large integer arithmetic) - config CRYPTO_AES_NI_INTEL tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" select CRYPTO_AEAD diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index d402963d6b579a..dfba7e5e88ea69 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -62,8 +62,6 @@ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o -obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o - obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o @@ -81,6 +79,3 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o - -# Disable GCOV in odd or sensitive code -GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291eb9..ced2a1deecd7ce 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal 
sys_pidfd_send_signal diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7610f26dfbd90c..745caa6c15a32d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61da6b8a3d519f..cbac54cb3a9ec5 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -643,4 +643,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } -arch_initcall(bts_init); +early_initcall(bts_init); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c2fb729c270ec4..28f5468a6ea36b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; } @@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); } @@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= 
GLOBAL_CTRL_EN_PERF_METRICS; else - pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, @@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu) rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); if (!perf_cap.perf_metrics) { x86_pmu.intel_cap.perf_metrics = 0; - x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; } } @@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void) } if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) - x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 07ba4935e87320..a26e66d66444a8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,6 +305,8 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + void (*setup)(void); + void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); @@ -317,6 +319,8 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); + char *name; }; @@ -470,6 +474,12 @@ static __always_inline bool apic_id_valid(u32 apic_id) return apic_id <= apic->max_apic_id; } +static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + if (apic->update_vector) + apic->update_vector(cpu, vector, set); +} + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -481,6 +491,7 @@ static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } +static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 094106b6a5384f..be39a543fbe5d0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -135,6 +135,8 @@ #define APIC_TDR_DIV_128 0xA #define APIC_EFEAT 0x400 #define APIC_ECTRL 0x410 +#define APIC_SEOI 0x420 +#define APIC_IER 0x480 #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index eebbc8889e70f5..a835f891164d47 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -246,7 +246,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) variable_test_bit(nr, addr); } -static __always_inline unsigned long variable__ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word) { asm("tzcnt %1,%0" : "=r" (word) @@ -265,7 +265,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) -static __always_inline unsigned long variable_ffz(unsigned long word) +static __always_inline __attribute_const__ 
unsigned long variable_ffz(unsigned long word) { return variable__ffs(~word); } @@ -287,7 +287,7 @@ static __always_inline unsigned long variable_ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); @@ -301,7 +301,7 @@ static __always_inline unsigned long __fls(unsigned long word) #undef ADDR #ifdef __KERNEL__ -static __always_inline int variable_ffs(int x) +static __always_inline __attribute_const__ int variable_ffs(int x) { int r; @@ -355,7 +355,7 @@ static __always_inline int variable_ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { int r; @@ -400,7 +400,7 @@ static __always_inline int fls(unsigned int x) * at position 64. */ #ifdef CONFIG_X86_64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { int bitpos = -1; diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 02b23aa78955fb..f7b67cb7391562 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -82,6 +82,8 @@ #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; +extern const unsigned long kernel_inittext_offset; +extern const unsigned long kernel_inittext_size; extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 1751f1eb95ef67..976b90a3d190ea 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -113,7 +113,7 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -157,7 +157,7 @@ static inline int cfi_get_func_arity(void *func) { return 0; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #if HAS_KERNEL_IBT == 1 #define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 06fc0479a23f01..b2a562217d3ffc 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -495,6 +495,8 @@ #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ /* * BUG word(s) @@ -551,4 +553,5 @@ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ +#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git 
a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index d535a97c728422..ce3eb6d5fdf9f2 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -93,6 +93,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, * 8 (ia32) bits. */ choose_random_kstack_offset(rdtsc()); + + /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) && + this_cpu_read(x86_ibpb_exit_to_user)) { + indirect_branch_prediction_barrier(); + this_cpu_write(x86_ibpb_exit_to_user, false); + } } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c060549c6c9407..89004f4ca208da 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct task_struct *tsk); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, +extern int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long shstk_addr); extern void fpu_flush_thread(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e41cbf2ec41d20..9ad86a7d13f6d7 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -30,6 +30,7 @@ enum x86_hypervisor_type { X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, + X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST @@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; +extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 97f341777db54c..1b3060a3425cfe 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr 
& INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5a68e9db651893..01ccdd168df080 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 -#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector -#else -#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase -#endif - struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void (*free_pgt_page)(void *, void *); /* free buf for page table */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 7152ea809e6a5e..091f88c8254d3a 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last 
prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index e345dbdf933eb4..f32a0eca2ae56a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -51,7 +51,7 @@ #define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ -/* Family 6 */ +/* Family 6, 18, 19 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) #define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) #define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */ + #define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ /* "Hybrid" Processors (P-Core/E-Core) */ @@ -203,9 +205,6 @@ #define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) #define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ -/* Family 19 */ -#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ - /* * Intel CPU core types * diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6c77c03139f7fa..31e3cb550fb3f8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -241,12 +241,14 @@ struct cper_ia_proc_ctx; #ifdef CONFIG_X86_MCE int mcheck_init(void); +void mca_bsp_init(struct cpuinfo_x86 *c); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } +static inline void mca_bsp_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, @@ -290,8 +292,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { MCP_TIMESTAMP = BIT(0), /* log time stamp */ MCP_UC = BIT(1), /* log uncorrected errors */ - MCP_DONTLOG = BIT(2), /* only clear, don't log */ - MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ + MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */ }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); @@ -371,15 +372,9 @@ enum smca_bank_types { extern bool amd_mce_is_memory_error(struct mce *m); -extern int mce_threshold_create_device(unsigned int cpu); -extern int mce_threshold_remove_device(unsigned int cpu); - void mce_amd_feature_init(struct cpuinfo_x86 *c); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else - -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa1410..718a55d82fe459 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define 
MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -631,6 +633,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 @@ -699,8 +706,15 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 @@ -1223,6 +1237,8 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 10f261678749a7..08ed5a2e46a5fd 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -514,6 +514,7 @@ enum spectre_v2_user_mitigation { /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_AUTO, SPEC_STORE_BYPASS_DISABLE, SPEC_STORE_BYPASS_PRCTL, SPEC_STORE_BYPASS_SECCOMP, @@ -530,6 +531,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } +DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user); + static inline void indirect_branch_prediction_barrier(void) { asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 70d1d94aca7e63..49a4d442f3fc21 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define 
ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) @@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 -#define GLOBAL_CTRL_EN_PERF_METRICS 48 +#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. * diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990ac..575f8408a9e7c6 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); extern bool rdt_alloc_capable; extern bool rdt_mon_capable; -extern unsigned int rdt_mon_features; DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -84,21 +83,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 77d8f49b92bdd0..f59ae7186940a9 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -244,7 +244,7 @@ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { - unsigned int p; + unsigned long p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP @@ -254,10 +254,10 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) * * If RDPID is available, use it. 
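	 *
	 * Either way @p ends up holding the CPUNODE segment limit, which
	 * is unpacked below as the inverse of vdso_encode_cpunode()
	 * above, i.e. (sketch):
	 *
	 *	cpu  = p & VDSO_CPUNODE_MASK;
	 *	node = p >> VDSO_CPUNODE_BITS;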
*/ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + alternative_io ("lsl %[seg],%k[p]", + "rdpid %[p]", X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + [p] "=r" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 692af46603a175..914eb32581c73d 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,7 @@ extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void startup_64_load_idt(void *vc_handler); +extern void __pi_startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0020d77a080001..01a6e4dbe42357 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -208,6 +208,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ +#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e88..c58c47c68ab6fe 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -2,7 +2,6 @@ #define DR7_RESET_VALUE 0x400 -extern struct ghcb boot_ghcb_page; extern u64 sev_hv_features; extern u64 sev_secrets_pa; @@ -56,31 +55,15 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op); + unsigned long npages, const struct psc_desc *desc); DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; -static __always_inline struct svsm_ca *svsm_get_caa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa); - else - return boot_svsm_caa; -} - -static __always_inline u64 svsm_get_caa_pa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa_pa); - else - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); +void vc_forward_exception(struct es_em_ctxt *ctxt); static inline u64 sev_es_rd_ghcb_msr(void) { @@ -97,9 +80,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } -void snp_register_ghcb_early(unsigned long paddr); -bool sev_es_negotiate_protocol(void); -bool sev_es_check_cpu_features(void); +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write); + u64 get_hv_features(void); const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb108..f9046c4b9a2b06 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) 
} void setup_ghcb(void); +void snp_register_ghcb_early(unsigned long paddr); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, @@ -511,14 +512,12 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __noreturn snp_abort(void); void snp_dmi_setup(void); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void sev_show_status(void); -void snp_update_svsm_ca(void); int prepare_pte_enc(struct pte_enc_desc *d); void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); @@ -533,6 +532,10 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +enum es_result savic_register_gpa(u64 gpa); +enum es_result savic_unregister_gpa(u64 *gpa); +u64 savic_ghcb_msr_read(u32 reg); +void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -540,8 +543,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -void vc_forward_exception(struct es_em_ctxt *ctxt); - /* I/O parameters for CPUID-related helpers */ struct cpuid_leaf { u32 fn; @@ -552,7 +553,13 @@ struct cpuid_leaf { u32 edx; }; -int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); +int svsm_perform_msr_protocol(struct svsm_call *call); +int __pi_svsm_perform_msr_protocol(struct svsm_call *call); +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf); + +void svsm_issue_call(struct svsm_call *call, u8 *pending); +int svsm_process_result_codes(struct svsm_call *call); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, @@ -560,7 +567,36 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); + +extern u16 ghcb_version; extern struct ghcb *boot_ghcb; +extern bool sev_snp_needs_sfw; + +struct psc_desc { + enum psc_op op; + struct svsm_ca *ca; + u64 caa_pa; +}; + +static inline void sev_evict_cache(void *va, int npages) +{ + volatile u8 val __always_unused; + u8 *bytes = va; + int page_idx; + + /* + * For SEV guests, a read from the first/last cache-lines of a 4K page + * using the guest key is sufficient to cause a flush of all cache-lines + * associated with that 4K page without incurring all the overhead of a + * full CLFLUSH sequence. 
+ */ + for (page_idx = 0; page_idx < npages; page_idx++) { + val = bytes[page_idx * PAGE_SIZE]; + val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; + } +} #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -582,7 +618,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } -static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { @@ -592,7 +627,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } -static inline void snp_update_svsm_ca(void) { } static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } @@ -605,6 +639,11 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } +static inline void sev_evict_cache(void *va, int npages) {} +static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; } +static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } +static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -616,26 +655,12 @@ void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); -void snp_leak_pages(u64 pfn, unsigned int npages); +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); - -static inline void sev_evict_cache(void *va, int npages) +static inline void snp_leak_pages(u64 pfn, unsigned int pages) { - volatile u8 val __always_unused; - u8 *bytes = va; - int page_idx; - - /* - * For SEV guests, a read from the first/last cache-lines of a 4K page - * using the guest key is sufficient to cause a flush of all cache-lines - * associated with that 4K page without incurring all the overhead of a - * full CLFLUSH sequence. 
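A minimal usage sketch for the relocated sev_evict_cache() helper; the caller and buffer here are hypothetical, not part of this patch:

#include <linux/mm.h>
#include <asm/sev.h>

/*
 * Hypothetical caller: flush guest-keyed cache lines for a private
 * buffer before its pages are released. Per the comment above, one
 * read of the first and last cache line per 4K page is sufficient.
 */
static void example_flush_private_buffer(struct page *page, int npages)
{
	sev_evict_cache(page_address(page), npages);
}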
- */ - for (page_idx = 0; page_idx < npages; page_idx++) { - val = bytes[page_idx * PAGE_SIZE]; - val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; - } + __snp_leak_pages(pfn, pages, true); } #else static inline bool snp_probe_rmptable_info(void) { return false; } @@ -649,10 +674,10 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as return -ENODEV; } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {} static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) {} -static inline void sev_evict_cache(void *va, int npages) {} #endif #endif diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ba6f2fe438488d..fc7dcec58fd48b 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -16,25 +16,29 @@ struct thread_shstk { long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); void reset_thread_features(void); -unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, +unsigned long shstk_alloc_thread_stack(struct task_struct *p, u64 clone_flags, unsigned long stack_size); void shstk_free(struct task_struct *p); int setup_signal_shadow_stack(struct ksignal *ksig); int restore_signal_shadow_stack(void); int shstk_update_last_frame(unsigned long val); bool shstk_is_enabled(void); +int shstk_pop(u64 *val); +int shstk_push(u64 val); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } static inline void reset_thread_features(void) {} static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, - unsigned long clone_flags, + u64 clone_flags, unsigned long stack_size) { return 0; } static inline void shstk_free(struct task_struct *p) {} static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } static inline int restore_signal_shadow_stack(void) { return 0; } static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } +static inline int shstk_push(u64 val) { return -ENOTSUPP; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 9282465eea21d3..e71e0e8362ed8e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,56 +80,42 @@ struct thread_info { #endif /* - * thread information flags - * - these are process state flags that various assembly files - * may need to access + * Tell the generic TIF infrastructure which bits x86 supports */ -#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ -#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ -#define TIF_SSBD 6 /* Speculative store bypass disable */ -#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ -#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ -#define TIF_UPROBE 12 /* breakpointed or singlestepping */ -#define 
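shstk_pop() and shstk_push() are only declared in this hunk; a hedged sketch of the intended calling pattern (the round-trip scenario is illustrative):

#include <asm/shstk.h>

/*
 * Hypothetical example: pop the top shadow-stack entry and push it
 * back. The !CONFIG_X86_USER_SHADOW_STACK stubs return -ENOTSUPP,
 * so callers must check the return value.
 */
static int example_shstk_roundtrip(void)
{
	u64 token;
	int ret;

	ret = shstk_pop(&token);
	if (ret)
		return ret;

	return shstk_push(token);
}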
TIF_PATCH_PENDING 13 /* pending live patching update */ -#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ -#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ -#define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_POLLING_NRFLAG +#define HAVE_TIF_SINGLESTEP + +#include + +/* Architecture specific TIF space starts at 16 */ +#define TIF_SSBD 16 /* Speculative store bypass disable */ +#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */ +#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */ +#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */ +#define TIF_NOTSC 21 /* TSC is not accessible in userland */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ +#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ +#define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ - -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) -#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) -#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) -#define _TIF_NOCPUID (1 << TIF_NOCPUID) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) -#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_ADDR32 (1 << TIF_ADDR32) +#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ + +#define _TIF_SSBD BIT(TIF_SSBD) +#define _TIF_SPEC_IB BIT(TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH) +#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD) +#define _TIF_NOCPUID BIT(TIF_NOCPUID) +#define _TIF_NOTSC BIT(TIF_NOTSC) +#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE) +#define _TIF_FORCED_TF BIT(TIF_FORCED_TF) +#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) +#define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6c79ee7c0957a7..21041898157a1f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -231,6 +231,16 @@ 
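The renumbering does not change how the flags are used: generic bits now come from the shared TIF header and the x86-specific space starts at bit 16. A small sketch (function name illustrative):

#include <linux/thread_info.h>

/* Accessors are unchanged; only the bit positions moved. */
static bool example_needs_fpu_load(void)
{
	/* TIF_NEED_FPU_LOAD is bit 19 after this patch */
	return test_thread_flag(TIF_NEED_FPU_LOAD);
}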
static inline bool topology_is_primary_thread(unsigned int cpu) } #define topology_is_primary_thread topology_is_primary_thread +int topology_get_primary_thread(unsigned int cpu); + +static inline bool topology_is_core_online(unsigned int cpu) +{ + int pcpu = topology_get_primary_thread(cpu); + + return pcpu >= 0 ? cpu_online(pcpu) : false; +} +#define topology_is_core_online topology_is_core_online + #else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75c..1ee2e5115955cd 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 85e63d58c07463..59f642a94b9d90 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -12,9 +12,9 @@ #include #include +#include #include #include -#include /* Xen machine address */ typedef struct xmaddr { @@ -162,7 +162,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) * pfn_to_mfn. This will have to be removed when we figured * out which call. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; mfn = __pfn_to_mfn(pfn); @@ -175,7 +175,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) static inline int phys_to_machine_mapping_valid(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 1; return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; @@ -210,7 +210,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) * gfn_to_pfn. This will have to be removed when we figure * out which call. 
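A hedged usage sketch for the new topology_is_core_online() helper; the placement loop is illustrative:

#include <linux/cpu.h>
#include <asm/topology.h>

/*
 * Hypothetical example: find a CPU whose core, i.e. its primary SMT
 * thread, is online.
 */
static int example_pick_online_core(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		if (topology_is_core_online(cpu))
			return cpu;
	}
	return -1;
}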
*/ - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); @@ -242,7 +242,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) /* Pseudo-physical <-> Guest conversion */ static inline unsigned long pfn_to_gfn(unsigned long pfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return pfn; else return pfn_to_mfn(pfn); @@ -250,7 +250,7 @@ static inline unsigned long pfn_to_gfn(unsigned long pfn) static inline unsigned long gfn_to_pfn(unsigned long gfn) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return gfn; else return mfn_to_pfn(gfn); @@ -284,7 +284,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return mfn; pfn = mfn_to_pfn(mfn); diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 9c640a521a6700..650e3256ea7d73 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d2a6d953be911..bc184dd38d993b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8698d66563ed64..0281703da5e262 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -89,7 +89,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, */ flags->bm_control = 0; } - if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) { + if (cpu_feature_enabled(X86_FEATURE_ZEN)) { /* * For all AMD Zen or newer CPUs that support C3, caches * should not be flushed by software while entering C3 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7bde68247b5fc5..79ae9cb5001906 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO -#elif defined(CONFIG_CFI_CLANG) +#elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF @@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; bool cfi_bhi __ro_after_init = false; #endif -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360b0..581db89477f923 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += 
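The new SVM_VMGEXIT_SAVIC exit code carries a sub-function and a GPA. A sketch of how the savic_register_gpa() helper declared in sev.h might issue it; the exit_info layout shown here is an assumption, and the real implementation is outside this hunk:

#include <asm/sev.h>
#include <asm/svm.h>

/*
 * Hypothetical sketch (assumed GHCB call layout): tell the hypervisor
 * the GPA of this vCPU's APIC backing page.
 */
static enum es_result example_savic_register(struct ghcb *ghcb,
					     struct es_em_ctxt *ctxt, u64 gpa)
{
	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_VMGEXIT_SAVIC,
				   SVM_VMGEXIT_SAVIC_REGISTER_GPA, gpa);
}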
x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623d4..680d305589a3ac 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); } /* @@ -1168,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 @@ -1428,63 +1433,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + /* If the ISR map empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) + return true; + /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); - return true; - } + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. + * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. 
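disable_local_APIC() now invokes an optional apic->teardown() callback, and setup_local_APIC() below gains the matching apic->setup(). A minimal sketch of a driver filling in the new hooks; the driver name is illustrative, Secure AVIC below is the first real user:

#include "local.h"

static void demo_setup(void)
{
	/* per-CPU enable work, called from setup_local_APIC() */
}

static void demo_teardown(void)
{
	/* undo it, called from disable_local_APIC() */
}

static struct apic apic_demo __ro_after_init = {
	.name		= "demo",
	.setup		= demo_setup,
	.teardown	= demo_teardown,
	/* remaining callbacks omitted */
};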
*/ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. */ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1503,6 +1506,9 @@ static void setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. @@ -1541,8 +1547,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b642b..bddc544653999a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd) return; 
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. */ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 00000000000000..dbc5678bc3b689 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Author: Neeraj Upadhyay + */ + +#include +#include +#include +#include + +#include +#include + +#include "local.h" + +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. + * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. + */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + return savic_ghcb_msr_read(reg); + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. 
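Restating the offset arithmetic from the surrounding comment as hypothetical predicates (helper names are illustrative):

/* APIC_IRR registers: 0x200, 0x210, ..., 0x270 */
static bool example_is_irr_reg(u32 reg)
{
	return reg >= APIC_IRR && reg <= APIC_IRR + 0x70 &&
	       IS_ALIGNED(reg, 16);
}

/* SAVIC_ALLOWED_IRR registers: 0x204, 0x214, ..., 0x274 */
static bool example_is_allowed_irr_reg(u32 reg)
{
	return reg >= SAVIC_ALLOWED_IRR && reg <= SAVIC_ALLOWED_IRR + 0x70 &&
	       IS_ALIGNED(reg - 4, 16);
}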
APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. + */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. + */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) +{ + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector, bool nmi) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector, nmi); + } +} + +static inline void self_ipi(unsigned int vector, bool nmi) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + if (nmi) + icr_low |= APIC_DM_NMI; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + bool nmi; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector, nmi); + break; + case APIC_DEST_ALLINC: + self_ipi(vector, nmi); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector, nmi); + break; + default: + send_ipi_dest(icr_high, vector, nmi); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TDCR: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... 
SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? */ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. 
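savic_update_vector() above is reached through the apic_update_vector() calls added in apic.c and vector.c, so vector allocation and teardown keep the per-CPU SAVIC_ALLOWED_IRR bitmap in sync. A sketch of that flow; CPU and vector numbers are illustrative:

/*
 * Hypothetical flow: allocating a vector marks it injectable for the
 * target CPU; freeing it revokes that again.
 */
static void example_vector_lifecycle(void)
{
	apic_update_vector(3, 0xec, true);	/* set ALLOWED_IRR bit on CPU 3 */
	/* ... vector in service ... */
	apic_update_vector(3, 0xec, false);	/* clear it again */
}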
+ */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + /* unreachable */ + } + + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, + .teardown = savic_teardown, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + + .nmi_to_offline_cpu = true, + + .read = savic_read, + .write = savic_write, + .eoi = savic_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = savic_icr_write, + + .update_vector = savic_update_vector, +}; + +apic_driver(apic_x2apic_savic); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c4a..2f8a58ef690ec7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 00000000000000..f1a8ca3dd1ed86 --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. 
+ * + * Author: David Woodhouse + */ + +#include +#include +#include +#include + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index af838b8d845cfb..6a526ae1fe9933 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -96,6 +96,9 @@ static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); static void __init tsa_select_mitigation(void); static void __init tsa_apply_mitigation(void); +static void __init vmscape_select_mitigation(void); +static void __init vmscape_update_mitigation(void); +static void __init vmscape_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -105,6 +108,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); +/* + * Set when the CPU has run a potentially malicious guest. An IBPB will + * be needed to before running userspace. That IBPB will flush the branch + * predictor content. + */ +DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; static u64 __ro_after_init x86_arch_cap_msr; @@ -262,6 +273,7 @@ void __init cpu_select_mitigations(void) its_select_mitigation(); bhi_select_mitigation(); tsa_select_mitigation(); + vmscape_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -293,6 +305,7 @@ void __init cpu_select_mitigations(void) bhi_update_mitigation(); /* srso_update_mitigation() depends on retbleed_update_mitigation(). 
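A hedged sketch of how the new x86_ibpb_exit_to_user flag is meant to be consumed; the function names are illustrative and the real arm/consume sites are outside this hunk:

#include <asm/nospec-branch.h>

static void example_after_guest_run(void)
{
	/* a potentially malicious guest just ran on this CPU */
	this_cpu_write(x86_ibpb_exit_to_user, true);
}

static void example_before_user_return(void)
{
	if (unlikely(this_cpu_read(x86_ibpb_exit_to_user))) {
		indirect_branch_prediction_barrier();
		this_cpu_write(x86_ibpb_exit_to_user, false);
	}
}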
*/ srso_update_mitigation(); + vmscape_update_mitigation(); spectre_v1_apply_mitigation(); spectre_v2_apply_mitigation(); @@ -310,6 +323,7 @@ void __init cpu_select_mitigations(void) its_apply_mitigation(); bhi_apply_mitigation(); tsa_apply_mitigation(); + vmscape_apply_mitigation(); } /* @@ -420,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPEC_STORE_BYPASS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + default: WARN(1, "Unknown bug %x\n", bug); return false; @@ -670,8 +687,7 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -1446,8 +1462,10 @@ static void __init retbleed_update_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } } } @@ -1828,9 +1846,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; -enum spectre_v2_user_cmd { +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1840,6 +1859,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? 
SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1848,50 +1870,31 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) +static int __init spectre_v2_user_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) -{ - char arg[20]; - int ret, i; - - if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) - return SPECTRE_V2_USER_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { @@ -1903,7 +1906,7 @@ static void __init spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - switch (spectre_v2_parse_user_cmdline()) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: return; case SPECTRE_V2_USER_CMD_FORCE: @@ -2031,119 +2034,61 @@ static void __init spectre_v2_user_apply_mitigation(void) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; + if (!strcmp(str, "off")) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + } else if (!strcmp(str, "on")) { + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + } else if (!strcmp(str, "retpoline,generic")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (!strcmp(str, "eibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + } else if (!strcmp(str, "eibrs,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + } else if (!strcmp(str, "eibrs,retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + } else if (!strcmp(str, "auto")) { + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } else if (!strcmp(str, "ibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + } else { + pr_err("Ignoring unknown spectre_v2 option (%s).", str); } - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -2292,10 +2237,6 @@ static void __init bhi_update_mitigation(void) { if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) bhi_mitigation = BHI_MITIGATION_OFF; - - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) - bhi_mitigation = BHI_MITIGATION_OFF; } static void __init bhi_apply_mitigation(void) @@ -2331,11 +2272,55 @@ static void __init bhi_apply_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - spectre_v2_cmd = spectre_v2_parse_cmdline(); + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == 
SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: @@ -2538,101 +2523,11 @@ static void update_mds_branch_idle(void) } } -#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" -#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" - -void cpu_bugs_smt_update(void) -{ - mutex_lock(&spec_ctrl_mutex); - - if (sched_smt_active() && unprivileged_ebpf_enabled() && - spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); - - switch (spectre_v2_user_stibp) { - case SPECTRE_V2_USER_NONE: - break; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - update_stibp_strict(); - break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: - update_indir_branch_cond(); - break; - } - - switch (mds_mitigation) { - case MDS_MITIGATION_FULL: - case MDS_MITIGATION_AUTO: - case MDS_MITIGATION_VMWERV: - if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) - pr_warn_once(MDS_MSG_SMT); - update_mds_branch_idle(); - break; - case MDS_MITIGATION_OFF: - break; - } - - switch (taa_mitigation) { - case TAA_MITIGATION_VERW: - case TAA_MITIGATION_AUTO: - case TAA_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(TAA_MSG_SMT); - break; - case TAA_MITIGATION_TSX_DISABLED: - case TAA_MITIGATION_OFF: - break; - } - - switch (mmio_mitigation) { - case MMIO_MITIGATION_VERW: - case MMIO_MITIGATION_AUTO: - case MMIO_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(MMIO_MSG_SMT); - break; - case MMIO_MITIGATION_OFF: - break; - } - - switch (tsa_mitigation) { - case TSA_MITIGATION_USER_KERNEL: - case TSA_MITIGATION_VM: - case TSA_MITIGATION_AUTO: - case TSA_MITIGATION_FULL: - /* - * TSA-SQ can potentially lead to info leakage between - * SMT threads. - */ - if (sched_smt_active()) - static_branch_enable(&cpu_buf_idle_clear); - else - static_branch_disable(&cpu_buf_idle_clear); - break; - case TSA_MITIGATION_NONE: - case TSA_MITIGATION_UCODE_NEEDED: - break; - } - - mutex_unlock(&spec_ctrl_mutex); -} - #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2641,94 +2536,61 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
- SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); static void __init ssb_select_mitigation(void) { - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - goto out; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. 
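- *
- * (Editorial note, not part of the patch: this fallback survives the
- * rewrite - the new ssb_parse_cmdline() above maps "seccomp" to
- * SPEC_STORE_BYPASS_PRCTL whenever CONFIG_SECCOMP is not enabled.)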
- */ - if (IS_ENABLED(CONFIG_SECCOMP)) - ssb_mode = SPEC_STORE_BYPASS_SECCOMP; - else - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - ssb_mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) ssb_mode = SPEC_STORE_BYPASS_PRCTL; else ssb_mode = SPEC_STORE_BYPASS_NONE; - break; - case SPEC_STORE_BYPASS_CMD_PRCTL: - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; } -out: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); } static void __init ssb_apply_mitigation(void) @@ -2944,6 +2806,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) @@ -3263,14 +3126,15 @@ static void __init srso_select_mitigation(void) static void __init srso_update_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; + /* If retbleed is using IBPB, that works for SRSO as well */ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) srso_mitigation = SRSO_MITIGATION_IBPB; - if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off()) - pr_info("%s\n", srso_strings[srso_mitigation]); + pr_info("%s\n", srso_strings[srso_mitigation]); } static void __init srso_apply_mitigation(void) @@ -3330,9 +3194,188 @@ static void __init srso_apply_mitigation(void) } } +#undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? 
VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); +} + #undef pr_fmt #define pr_fmt(fmt) fmt +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" +#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. + */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. + * + * Intel eIBRS (!AUTOIBRS) implies STIBP on. 
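+	 *
+	 * Illustrative cases (editorial note, not part of the patch):
+	 * - Intel eIBRS without AUTOIBRS: STIBP is implied, the sibling
+	 *   thread is already isolated, so no warning is printed.
+	 * - AUTOIBRS, or a non-eIBRS mitigation, with spectre_v2_user=off:
+	 *   no STIBP, so the pr_warn_once() below fires once SMT is active.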
+ */ + if (!sched_smt_active() || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + break; + pr_warn_once(VMSCAPE_MSG_SMT); + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" @@ -3518,9 +3561,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); @@ -3578,6 +3618,11 @@ static ssize_t tsa_show_state(char *buf) return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); } +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3644,6 +3689,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TSA: return tsa_show_state(buf); + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3735,6 +3783,11 @@ ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_TSA); } + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb86557..51a95b07831fa1 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -289,6 +289,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +/* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + /* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. 
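- *
- * Worked example (editorial illustration, not from the patch): with 12
- * threads sharing the cache, EAX[25:14] reads 11, so
- * num_threads_sharing = 1 + 11 = 12, index_msb = get_count_order(12) = 4,
- * and a CPU with apicid 0x23 gets cache id 0x23 >> 4 = 2.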
- */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 34a054181c4dc4..c7d3512914ca97 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1236,55 +1236,71 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define ITS_NATIVE_ONLY BIT(9) /* CPU is affected by Transient Scheduler Attacks */ #define TSA BIT(10) +/* CPU is affected by VMSCAPE */ +#define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | 
VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), - VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), 
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO | TSA), - VULNBL_AMD(0x1a, SRSO), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE), + VULNBL_AMD(0x1a, SRSO | VMSCAPE), {} }; @@ -1543,6 +1559,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } } + /* + * Set the bug only on bare-metal. A nested hypervisor should already be + * deploying IBPB to isolate itself from nested guests. + */ + if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + setup_force_cpu_bug(X86_BUG_VMSCAPE); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1784,6 +1808,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b5a..f3e9219845e858 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5c4eb28c3ac930..d6906442f49bf9 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -241,7 +241,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -252,9 +253,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -264,28 +262,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). 
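- *
- * (Editorial note, not part of the patch: the per-CPU
- * smca_misc_banks_map goes away; get_block_address() below now tests
- * MASK_BLKPTR_LO directly in the MISC value it is handed, so BLKPTR
- * presence no longer needs to be cached per bank.)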
- */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -326,8 +302,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -419,8 +393,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -478,7 +452,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -525,18 +499,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -546,8 +508,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -677,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrq(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. 
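+	 *
+	 * Editorial illustration (not part of the patch): a bank is masked
+	 * by clearing bits in its MCi_CTL image before the CTL MSR is
+	 * written back, e.g. mce_banks[4].ctl &= ~BIT(10) masks GART TBL
+	 * walk errors, and mce_banks[0].ctl = 0 masks everything in bank 0.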
+ */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -684,6 +675,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) @@ -714,6 +708,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + /* * DRAM ECC errors are reported in the Northbridge (bank 4) with * Extended Error Code 8. @@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. */ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -930,9 +930,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -946,20 +946,20 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. 
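- *
- * (Editorial note, not part of the patch: this special case disappears -
- * every block, including the first, now hangs off the bank's miscj list
- * head, so a single list_for_each_entry_safe() walk covers them all.)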
- */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ @@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1181,13 +1181,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1238,6 +1232,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1258,26 +1254,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1296,12 +1281,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1310,7 +1295,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1324,36 +1309,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. 
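 *
 * (Editorial note, not part of the patch: creation is best-effort now
 * that the return type is void - on allocation or sysfs failure the
 * partially created banks are torn down and the function just returns.)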
*/ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4da4eab56c81de..460e90a1a0b172 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -714,6 +714,60 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. @@ -765,51 +819,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. 
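- *
- * (Editorial illustration, not from the patch: a status with UC=1,
- * PCC=0, S=0 and EN=1 is a UCNA and gets logged by the new
- * ser_should_log_poll_error() above; UC=1 with S=1 is skipped and left
- * in the bank for do_machine_check() to handle.)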
- */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -826,10 +839,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_log(&err); clear_it: - /* - * Clear state for this bank. - */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -1810,9 +1820,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1840,69 +1851,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. 
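- *
- * (Editorial note, not part of the patch: the read-back test itself
- * survives - __mcheck_cpu_init_prepare_banks() re-reads MCA_CTL after
- * writing it and clears ->init for banks that come back as zero/RAZ.)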
- */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1910,25 +1886,16 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1937,13 +1904,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mca_cfg.bootlog = 0; } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. @@ -1955,25 +1915,12 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. @@ -1999,7 +1946,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. Enable @@ -2011,34 +1958,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; - - return true; -} - static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2060,19 +1979,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return false; } -/* - * Init basic CPU features needed for early decoding of MCEs. 
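- *
- * (Editorial note, not part of the patch: superseded by mca_bsp_init(),
- * which sets overflow_recov/succor/smca once on the boot CPU via
- * cpu_feature_enabled(); amd_threshold moves to mce_amd_feature_init().)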
- */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_centaur_feature_init(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -2281,6 +2187,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: @@ -2298,11 +2251,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); @@ -2311,12 +2259,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2483,7 +2430,8 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2501,8 +2449,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 9b149b9c410901..4655223ba5606f 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. 
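+	 *
+	 * Editorial illustration (not part of the patch): clearing ->init
+	 * makes __mcheck_cpu_init_prepare_banks() skip the MCi_CTL0 write
+	 * while the bank itself keeps being polled for valid events.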
+ * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b5ba598e54cb48..b0e00ec5cc8c9f 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -265,8 +265,11 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -292,10 +295,15 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -313,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 514f63340880fd..cdce885e2fd50a 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -269,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return true; } -static u32 get_patch_level(void) -{ - u32 rev, dummy __always_unused; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - return rev; -} - static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -295,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -324,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid 
magic value (0x%08x).\n", cont_magic); return false; } @@ -355,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -365,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -385,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -394,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -477,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } if (!__verify_patch_size(sh_psize, buf_size)) { - pr_debug("Per-family patch size mismatch.\n"); + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -496,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + if (patch_fam != family) return 1; @@ -566,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + + ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -639,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; + /* verify patch application was successful */ *cur_rev = get_patch_level(); + + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + if (*cur_rev != mc->hdr.patch_id) return false; @@ -1026,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. 
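 *
 * (Editorial note, not part of the patch: cached entries carry the
 * patch_id and equivalence-table processor id assigned just above, so
 * later per-CPU lookups can match against them.)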
*/ @@ -1169,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b92e09a87c6997..f75c140906d002 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,10 +43,19 @@ #include "internal.h" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr = false; +static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); -module_param(force_minrev, bool, S_IRUSR | S_IWUSR); + +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; /* * Synchronization. @@ -119,20 +128,48 @@ bool __init microcode_loader_disabled(void) * overwritten. */ if (!cpuid_feature() || - native_cpuid_ecx(1) & BIT(31) || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; return dis_ucode_ldr; } +static void early_parse_cmdline(void) +{ + char cmd_buf[64] = {}; + char *s, *p = cmd_buf; + + if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { + while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } + + if (!strcmp("force_minrev", s)) + force_minrev = true; + + if (!strcmp(s, "dis_ucode_ldr")) + dis_ucode_ldr = true; + } + } + + /* old, compat option */ + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; +} + void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; bool intel = true; - if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) - dis_ucode_ldr = true; + early_parse_cmdline(); if (microcode_loader_disabled()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h index cb6e601701ab6e..2d48e6593540f2 100644 --- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -67,9 +67,8 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings 
= 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, @@ -81,51 +80,62 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, { .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, 
.family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, 
.steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 50a9702ae4e2b5..ae8dbc2b908d72 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -44,6 +44,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) \ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b6e..06ca5a30140c2f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) @@ -516,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -535,9 +542,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } @@ -707,6 +718,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -732,6 +744,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -863,15 +876,24 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret = false; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + 
resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); @@ -965,7 +987,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -977,7 +999,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b3643737..9f4c2f0aaf5c80 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,15 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -54,15 +63,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) @@ -102,6 +111,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +125,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -159,6 +170,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. 
+ * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. + */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); @@ -168,5 +215,6 @@ bool rdt_cpu_has(int flag); void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cdd4..c8945610d45550 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -31,11 +31,6 @@ */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. - */ -unsigned int rdt_mon_features; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; @@ -135,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -166,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? 
&state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -206,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_arch_is_mbm_total_enabled()) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_arch_is_mbm_local_enabled()) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -224,15 +217,33 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct arch_mbm_state *am; + u64 chunks; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } + + return chunks * hw_res->mon_scale; +} + int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int cpu = cpumask_any(&d->hdr.cpu_mask); - struct arch_mbm_state *am; - u64 msr_val, chunks; + u64 msr_val; u32 prmid; int ret; @@ -243,17 +254,76 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (ret) return ret; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. 
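+	 *
+	 * For example, per the table above, reading ABMC counter 5 means
+	 * programming QM_EVTSEL with ExtendedEvtID=1, EvtID=1 and RMID=5,
+	 * which is what the wrmsr() below does for cntr_id == 5, before
+	 * fetching the count from QM_CTR.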
+ */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { - am->chunks += mbm_overflow_count(am->prev_msr, msr_val, - hw_res->mbm_width); - chunks = get_corrected_mbm_count(rmid, am->chunks); - am->prev_msr = msr_val; - } else { - chunks = msr_val; + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; - *val = chunks * hw_res->mon_scale; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); return 0; } @@ -346,12 +416,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -366,7 +437,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -375,12 +446,17 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + } + + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; } r->mon_capable = true; @@ -401,3 +477,91 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
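+ * For example, on a hypothetical two-socket system with one L3 monitor
+ * domain per socket, the loop below cross-calls every CPU in both domains
+ * to toggle ABMC_ENABLE_BIT and then clears each domain's saved MBM state,
+ * so stale chunk counts are not carried across the mode switch.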
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+	struct rdt_mon_domain *d;
+
+	lockdep_assert_cpus_held();
+
+	list_for_each_entry(d, &r->mon_domains, hdr.list) {
+		on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+				 &enable, 1);
+		resctrl_arch_reset_rmid_all(r, d);
+	}
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+	if (r->mon.mbm_cntr_assignable &&
+	    hw_res->mbm_cntr_assign_enabled != enable) {
+		_resctrl_abmc_enable(r, enable);
+		hw_res->mbm_cntr_assign_enabled = enable;
+	}
+
+	return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+	return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
+static void resctrl_abmc_config_one_amd(void *info)
+{
+	union l3_qos_abmc_cfg *abmc_cfg = info;
+
+	wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to the domain to assign the counter to an (RMID, event) pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
+			      u32 cntr_id, bool assign)
+{
+	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+	union l3_qos_abmc_cfg abmc_cfg = { 0 };
+	struct arch_mbm_state *am;
+
+	abmc_cfg.split.cfg_en = 1;
+	abmc_cfg.split.cntr_en = assign ? 1 : 0;
+	abmc_cfg.split.cntr_id = cntr_id;
+	abmc_cfg.split.bw_src = rmid;
+	if (assign)
+		abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+	smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+	/*
+	 * The hardware counter is reset (because cfg_en == 1) so there is no
+	 * need to record initial non-zero counts.
+	 */
+	am = get_arch_mbm_state(hw_dom, rmid, evtid);
+	if (am)
+		memset(am, 0, sizeof(*am));
+}
+
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+	resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 6b868afb26c319..4cee6213d66738 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -51,6 +51,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
 	{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
 	{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+	{ X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
 	{ X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
 	{ X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
 	{ X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index e35ccdc84910f5..6073a16628f9e4 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -372,6 +372,19 @@ unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_uni
 	return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
 }
 
+#ifdef CONFIG_SMP
+int topology_get_primary_thread(unsigned int cpu)
+{
+	u32 apic_id = cpuid_to_apicid[cpu];
+
+	/*
+	 * Get the core domain level APIC ID, which identifies the primary
+	 * thread, and return the CPU number assigned to it.
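+	 * For example, if the two SMT siblings of a core have the APIC IDs
+	 * 0x10 and 0x11, masking out the SMT bits yields 0x10 for both, so
+	 * the lookup below resolves to the CPU number of the first sibling.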
+ */ + return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN)); +} +#endif + #ifdef CONFIG_ACPI_HOTPLUG_CPU /** * topology_hotplug_apic - Handle a physical hotplugged APIC after boot diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 827dd0dbb6e9d2..6ac097e1310659 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -85,7 +85,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) * If leaf 0xb/0x26 is available, then the APIC ID and the domain * shifts are set already. */ - if (!has_topoext) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* @@ -163,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } @@ -175,27 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { - bool has_topoext = false; + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); /* - * If the extended topology leaf 0x8000_001e is available - * try to get SMT, CORE, TILE, and DIE shifts from extended + * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to - * get SMT and CORE shift from leaf 0xb first, then try to - * get the CORE shift from leaf 0x8000_0008. + * get SMT and CORE shift from leaf 0xb. If either leaf is + * available, cpu_parse_topology_ext() will return true. + * + * If XTOPOLOGY leaves (0x26/0xb) are not available, try to + * get the CORE shift from leaf 0x8000_0008 first. */ - if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_topoext = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - if (!has_topoext && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; - /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_topoext)) + /* + * Prefer leaf 0x8000001e if available to get the SMT shift and + * the initial APIC ID if XTOPOLOGY leaves are not available. 
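+	 * For example, a CPU with neither leaf 0x8000_0026 nor leaf 0xb
+	 * would fall back to leaf 0x8000_0008 for the CORE shift, with leaf
+	 * 0x8000_001e below then supplying the SMT shift and initial APIC ID.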
+ */ + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index aefd412a23dc24..1f71cc135e9ade 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -631,7 +631,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, +int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long ssp) { /* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc92..fd28b53dbac51f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); @@ -316,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab9d..80ef5d386b03dc 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd03961..21816b48537c3a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. */ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. 
*/ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6079d15dab8ca7..3863d7709386fc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr) if (is_exception_insn(&insn)) return false; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* * The compiler generates the following instruction sequence * for indirect call checks and cfi.c decodes this; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1b7960cf6eb0c1..e3a3987b0c4fb6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -159,7 +159,7 @@ __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct inactive_task_frame *frame; diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 2ddf23387c7ef7..978232b6d48d76 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -191,7 +191,7 @@ void reset_thread_features(void) current->thread.features_locked = 0; } -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + u64 ssp; + int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab1224..eb289abece2370 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git 
a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d98..d432f3824f0c29 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,15 +156,26 @@ static int identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e83936..845aeaf36b8d2f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + "pop %r11\n" + "pop %rcx\n" + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ - "retq\n" + "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
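 * For example, a legitimate entry has regs->ip at uretprobe_syscall_check,
 * the label right after the trampoline's syscall instruction, which is what
 * trampoline_check_ip() recomputes below from the trampoline base.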
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; @@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
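+	 * For example, with call_end at 0x1000000, a low candidate at
+	 * 0xff0000 (0x10000 away) is preferred over a high candidate at
+	 * 0x1100000 (0x100000 away).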
 */
+	if (call_end - low_tramp < high_tramp - call_end)
+		return low_tramp;
+	return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct mm_struct *mm = current->mm;
+	struct uprobe_trampoline *tramp;
+	struct vm_area_struct *vma;
+
+	if (!user_64bit_mode(regs))
+		return NULL;
+
+	vaddr = find_nearest_trampoline(vaddr);
+	if (IS_ERR_VALUE(vaddr))
+		return NULL;
+
+	tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+	if (unlikely(!tramp))
+		return NULL;
+
+	tramp->vaddr = vaddr;
+	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+				       VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+				       &tramp_mapping);
+	if (IS_ERR(vma)) {
+		kfree(tramp);
+		return NULL;
+	}
+	return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+	struct uprobes_state *state = &current->mm->uprobes_state;
+	struct uprobe_trampoline *tramp = NULL;
+
+	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+		return NULL;
+
+	hlist_for_each_entry(tramp, &state->head_tramps, node) {
+		if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+			*new = false;
+			return tramp;
+		}
+	}
+
+	tramp = create_uprobe_trampoline(vaddr);
+	if (!tramp)
+		return NULL;
+
+	*new = true;
+	hlist_add_head(&tramp->node, &state->head_tramps);
+	return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+	/*
+	 * We do not unmap and release the uprobe trampoline page itself,
+	 * because there's no easy way to make sure none of the threads
+	 * is still inside the trampoline.
+	 */
+	hlist_del(&tramp->node);
+	kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+	INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+	struct uprobes_state *state = &mm->uprobes_state;
+	struct uprobe_trampoline *tramp;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+		destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+	return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	bool found, retry = true;
+	unsigned int seq;
+
+	rcu_read_lock();
+	if (mmap_lock_speculate_try_begin(mm, &seq)) {
+		found = __in_uprobe_trampoline(ip);
+		retry = mmap_lock_speculate_retry(mm, seq);
+	}
+	rcu_read_unlock();
+
+	if (retry) {
+		mmap_read_lock(mm);
+		found = __in_uprobe_trampoline(ip);
+		mmap_read_unlock(mm);
+	}
+	return found;
+}
+
+/*
+ * See the uprobe syscall trampoline: the call to the trampoline pushes
+ * the return address on the stack, and the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+	unsigned long ax;
+	unsigned long r11;
+	unsigned long cx;
+	unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct uprobe_syscall_args args;
+	unsigned long ip, sp, sret;
+	int err;
+
+	/* Allow execution only from uprobe trampolines.
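+	 * A process may have multiple trampolines at arbitrary addresses, so
+	 * the check below looks up the vma at regs->ip and verifies it is
+	 * this mm's [uprobes-trampoline] special mapping, speculatively
+	 * against the mmap lock sequence count when possible.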
+	 */
+	if (!in_uprobe_trampoline(regs->ip))
+		return -ENXIO;
+
+	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+	if (err)
+		goto sigill;
+
+	ip = regs->ip;
+
+	/*
+	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+	 * - adjust ip to the probe address, call saved next instruction address
+	 * - adjust sp to the probe's stack frame (check trampoline code)
+	 */
+	regs->ax = args.ax;
+	regs->r11 = args.r11;
+	regs->cx = args.cx;
+	regs->ip = args.retaddr - 5;
+	regs->sp += sizeof(args);
+	regs->orig_ax = -1;
+
+	sp = regs->sp;
+
+	err = shstk_pop((u64 *)&sret);
+	if (err == -EFAULT || (!err && sret != args.retaddr))
+		goto sigill;
+
+	handle_syscall_uprobe(regs, regs->ip);
+
+	/*
+	 * Some of the uprobe consumers have changed sp; there is nothing
+	 * we can do, just return via iret.
+	 */
+	if (regs->sp != sp) {
+		/* skip the trampoline call */
+		if (args.retaddr - 5 == regs->ip)
+			regs->ip += 5;
+		return regs->ax;
+	}
+
+	regs->sp -= sizeof(args);
+
+	/* in case a uprobe_consumer has changed ax/r11/cx */
+	args.ax = regs->ax;
+	args.r11 = regs->r11;
+	args.cx = regs->cx;
+
+	/* keep return address unless we are instructed otherwise */
+	if (args.retaddr - 5 != regs->ip)
+		args.retaddr = regs->ip;
+
+	if (shstk_push(args.retaddr) == -EFAULT)
+		goto sigill;
+
+	regs->ip = ip;
+
+	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+	if (err)
+		goto sigill;
+
+	/* ensure sysret, see do_syscall_64() */
+	regs->r11 = regs->flags;
+	regs->cx = regs->ip;
+	return 0;
+
+sigill:
+	force_sig(SIGILL);
+	return -1;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	"uprobe_trampoline_entry:\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"mov $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	"int3\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+	EXPECT_SWBP,
+	EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+	unsigned long base;
+	int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+		       int nbytes, void *data)
+{
+	struct write_opcode_ctx *ctx = data;
+	uprobe_opcode_t old_opcode[5];
+
+	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+	switch (ctx->expect) {
+	case EXPECT_SWBP:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case EXPECT_CALL:
+		if (is_call_insn(&old_opcode[0]))
+			return 1;
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. + */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it 
with int3 again in this case. + */ + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us; + * if so, just return silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe, set the fail bit so the + * should_optimize() check above fails from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool insn_is_nop(struct insn *insn) +{ + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) + return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + + /* We can't do cross-page atomic writes yet.
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; + if (can_optimize(&insn, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af10f..d7af4a64c211b7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -227,6 +222,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) @@ -535,3 +532,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd600581..103604c4b33b58 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -13,7 +13,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter's bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc62a..1bfebe40854f49 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4046,8 +4046,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (nested_svm_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; cr8 = kvm_get_cr8(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 604490b1cb19c7..706b6fd56d3c5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11010,6 +11010,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, 0); + /* + * Mark this CPU as needing a branch predictor flush before running + * userspace. Must be done before enabling preemption to ensure it gets + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. + */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
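A minimal sketch of the argument frame that the uprobe syscall handler above copies from regs->sp. The struct name is hypothetical and the field order is inferred from the push sequence in uprobe_trampoline_entry (the optimized 5-byte call pushes the return address, then rcx, r11 and rax are pushed on top of it):

struct uprobe_syscall_args {	/* hypothetical mirror of the 'args' used above */
	u64 ax;			/* pushed last, so it sits at regs->sp */
	u64 r11;
	u64 cx;
	u64 retaddr;		/* pushed by the 5-byte call at the probe site */
};

With this layout, regs->ip = args.retaddr - 5 rewinds ip to the start of the patched call, and regs->sp += sizeof(args) unwinds the three register pushes plus the return address in one step.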
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index b0f3b2a62ae27b..a5cafd402cfd3d 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 149a57e334ab5c..225af1399c9d3f 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ int insn_get_prefixes(struct insn *insn) } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ int insn_get_prefixes(struct insn *insn) if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d78d769a02bd39..f513d33b6d3704 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -15,7 +15,6 @@ .section .text..__x86.indirect_thunk - .macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ @@ -73,6 +72,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #undef GEN #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING + .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -126,7 +126,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #define GEN(reg) 
__EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) #include #undef GEN -#endif + +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_MITIGATION_RETHUNK @@ -370,39 +408,6 @@ SYM_FUNC_END(call_depth_return_thunk) #ifdef CONFIG_MITIGATION_ITS -.macro ITS_THUNK reg - -/* - * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) - * that complete the fineibt_paranoid caller sequence. - */ -1: .byte 0xea -SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - jne 1b -SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - ANNOTATE_RETPOLINE_SAFE - jmp *%\reg - int3 - .align 32, 0xcc /* fill to the end of the line */ - .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ -.endm - -/* ITS mitigation requires thunks be aligned to upper half of cacheline */ -.align 64, 0xcc -.skip 29, 0xcc - -#define GEN(reg) ITS_THUNK reg -#include -#undef GEN - - .align 64, 0xcc -SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) -SYM_CODE_END(__x86_indirect_its_thunk_array) - .align 64, 0xcc .skip 32, 0xcc SYM_CODE_START(its_return_thunk) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb9527..2a4e69ecc2de0f 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
+# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTW Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTD Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTQ Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 are shown in AMD APM Vol.3 Appendix A as XOP groups #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index faf3a13fb6ba0b..2f8c32173972d6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -536,12 +536,6 @@ void __init sme_early_init(void) x86_init.resources.dmi_setup = snp_dmi_setup; } - /* - * Switch the SVSM CA mapping (if active) from identity mapped to - * kernel mapped.
- */ - snp_update_svsm_ca(); - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index f8a33b25ae869e..edbf9c99884846 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -SYM_FUNC_START(sme_encrypt_execute) +SYM_FUNC_START(__pi_sme_encrypt_execute) /* * Entry parameters: @@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(sme_encrypt_execute) +SYM_FUNC_END(__pi_sme_encrypt_execute) -SYM_FUNC_START(__enc_copy) +SYM_FUNC_START_LOCAL(__enc_copy) ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca1646203c..fc13306af15fa8 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1151,11 +1152,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1388,16 +1416,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. 
+ */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -2057,19 +2136,27 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2176,29 @@ st: if (is_imm8(insn->off)) ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r, + * 0x0, 0x1) instruction. This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). + */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2208,7 +2316,8 @@ st: if (is_imm8(insn->off)) * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. 
*/ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]); } break; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1d78e5631bb81c..344030c1a81d46 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -24,7 +24,7 @@ #include #include - __HEAD + __INIT /* * Entry point for PVH guests. diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index e0a607a14e7ed7..5ce1d426300004 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -57,7 +57,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a8559..7ea1b75e59b742 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map 
array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5778bc4984153f..e5a2b9a912d198 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { - int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); + if (sym->st_shndx == SHN_UNDEF) return 0; @@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } - if (headtext) { - die("Absolute reference to symbol '%s' not permitted in .head.text\n", - symname); - break; - } - /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 bits when diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 942372e69b4dd3..ee643a6cd69165 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -1029,7 +1029,7 @@ int rmp_make_shared(u64 pfn, enum pg_level level) } EXPORT_SYMBOL_GPL(rmp_make_shared); -void snp_leak_pages(u64 pfn, unsigned int npages) +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) { struct page *page = pfn_to_page(pfn); @@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages) (PageHead(page) && compound_nr(page) <= npages)) list_add_tail(&page->buddy_list, &snp_leaked_pages_list); - dump_rmpentry(pfn); + if (dump_rmp) + dump_rmpentry(pfn); snp_nr_leaked_pages++; pfn++; page++; } spin_unlock(&snp_leaked_pages_list_lock); } -EXPORT_SYMBOL_GPL(snp_leak_pages); +EXPORT_SYMBOL_GPL(__snp_leak_pages); void kdump_sev_callback(void) { diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 98d8a50d2aed4b..aa4040fd921560 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -8,6 +8,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR + select HIBERNATE_CALLBACKS depends on X86_64 || (X86_32 && X86_PAE) depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC @@ -64,12 +65,6 @@ config XEN_PVHVM_GUEST help Support running as a Xen PVHVM guest. 
-config XEN_SAVE_RESTORE - bool - depends on XEN - select HIBERNATE_CALLBACKS - default y - config XEN_DEBUG_FS bool "Enable Xen debug and tuning parameters in debugfs" depends on XEN && DEBUG_FS diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 26bbaf4b7330b4..4806cc28d7ca77 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -382,7 +382,6 @@ static bool __init xen_check_xsave(void) static void __init xen_init_capabilities(void) { - setup_force_cpu_cap(X86_FEATURE_XENPV); setup_clear_cpu_cap(X86_FEATURE_DCA); setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); setup_clear_cpu_cap(X86_FEATURE_MTRR); @@ -1402,6 +1401,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) JMP32_INSN_SIZE); xen_domain_type = XEN_PV_DOMAIN; + setup_force_cpu_cap(X86_FEATURE_XENPV); xen_start_flags = xen_start_info->flags; /* Interrupts are guaranteed to be off initially. */ early_boot_irqs_disabled = true; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c4c479373249be..3be45bf4bc7975 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_unmap_gfn_range(vma, nr, pages); if (!pages) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 56914e21e30305..2dd12b61a230ad 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -686,7 +686,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, int i, ret = 0; pte_t *pte; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; if (kmap_ops) { @@ -769,7 +769,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, { int i, ret = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return 0; for (i = 0; i < count; i++) { diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index e02ec583338947..f7390b6761e1bb 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -37,7 +37,7 @@ static inline unsigned long __cntlz (unsigned long x) * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -static inline int ffz(unsigned long x) +static inline int __attribute_const__ ffz(unsigned long x) { return 31 - __cntlz(~x & -~x); } @@ -46,7 +46,7 @@ static inline int ffz(unsigned long x) * __ffs: Find first bit set in word. Return 0 for bit 0 */ -static inline unsigned long __ffs(unsigned long x) +static inline __attribute_const__ unsigned long __ffs(unsigned long x) { return 31 - __cntlz(x & -x); } @@ -57,7 +57,7 @@ static inline unsigned long __ffs(unsigned long x) * differs in spirit from the above ffz (man ffs). */ -static inline int ffs(unsigned long x) +static inline __attribute_const__ int ffs(unsigned long x) { return 32 - __cntlz(x & -x); } @@ -67,7 +67,7 @@ static inline int ffs(unsigned long x) * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static inline int fls (unsigned int x) +static inline __attribute_const__ int fls (unsigned int x) { return 32 - __cntlz(x); } @@ -78,7 +78,7 @@ static inline int fls (unsigned int x) * * Undefined if no set bit exists, so code should check against 0 first. 
*/ -static inline unsigned long __fls(unsigned long word) +static inline __attribute_const__ unsigned long __fls(unsigned long word) { return 31 - __cntlz(word); } diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index da38de20ae598b..cfbced95e944a4 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -11,6 +11,7 @@ * * Chris Zankel */ +#define COMPILE_OFFSETS #include #include diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 7bd66677f7b6de..94d43f44be1315 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -267,7 +267,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long usp_thread_fn = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc0634..810707cca9703e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a17e..a27185cd8edead 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -557,9 +557,11 @@ static inline int bio_check_eod(struct bio *bio) sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); - if (nr_sectors && maxsector && + if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + if (!maxsector) + return -EIO; pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 056b8948369d55..ce08ad4565e283 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -58,16 +58,14 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { - struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct blk_integrity *bi; struct logical_block_metadata_cap meta_cap = {}; size_t usize = _IOC_SIZE(cmd); - if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || - _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || - _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || - _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) return -ENOIOCTLCMD; + bi = blk_get_integrity(bdev->bd_disk); if (!bi) goto out; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5d6..d15918d7fabb33 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25dd..ddbc69c0922baa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,6 
@@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -131,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { @@ -233,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } bio->bi_opf |= REQ_NOWAIT; } - if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; @@ -301,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } - if (iocb->ki_flags & IOCB_HAS_METADATA) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); @@ -422,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (likely(nr_pages <= BIO_MAX_VECS)) { + if (likely(nr_pages <= BIO_MAX_VECS && + !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); @@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + if (blk_get_integrity(bdev->bd_disk)) + filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) diff --git a/crypto/Kconfig b/crypto/Kconfig index 23bd98981ae8e9..a04595f9d0ca4b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -344,14 +344,6 @@ config CRYPTO_ECRDSA One of the Russian cryptographic standard algorithms (called GOST algorithms). Only signature verification is implemented. -config CRYPTO_CURVE25519 - tristate "Curve25519" - select CRYPTO_KPP - select CRYPTO_LIB_CURVE25519_GENERIC - select CRYPTO_LIB_CURVE25519_INTERNAL - help - Curve25519 elliptic curve (RFC7748) - endmenu menu "Block ciphers" @@ -609,6 +601,7 @@ menu "Length-preserving ciphers and modes" config CRYPTO_ADIANTUM tristate "Adiantum" select CRYPTO_CHACHA20 + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_NHPOLY1305 select CRYPTO_MANAGER @@ -647,7 +640,6 @@ config CRYPTO_ARC4 config CRYPTO_CHACHA20 tristate "ChaCha" select CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_SKCIPHER help The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms @@ -770,6 +762,7 @@ config CRYPTO_XTS config CRYPTO_NHPOLY1305 tristate select CRYPTO_HASH + select CRYPTO_LIB_POLY1305 select CRYPTO_LIB_POLY1305_GENERIC endmenu @@ -938,8 +931,9 @@ config CRYPTO_MD4 config CRYPTO_MD5 tristate "MD5" select CRYPTO_HASH + select CRYPTO_LIB_MD5 help - MD5 message digest algorithm (RFC1321) + MD5 message digest algorithm (RFC1321), including HMAC support. 
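The CRYPTO_MD5 entry above now selects CRYPTO_LIB_MD5 and merely wraps the library implementation. As a usage sketch, these are the library calls the crypto/md5.c glue further below relies on: md5() is the one-shot helper used by crypto_md5_digest(), and the md5_init()/md5_update()/md5_final() sequence backs the incremental callbacks (the <crypto/md5.h> header name is an assumption):

#include <crypto/md5.h>

/* Hypothetical caller hashing a buffer through the MD5 library API. */
static void md5_lib_demo(const u8 *data, size_t len)
{
	struct md5_ctx ctx;
	u8 digest[MD5_DIGEST_SIZE];

	/* One-shot, as used by crypto_md5_digest() below. */
	md5(data, len, digest);

	/* Incremental form, as used by the init/update/final callbacks. */
	md5_init(&ctx);
	md5_update(&ctx, data, len);
	md5_final(&ctx, digest);
}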
config CRYPTO_MICHAEL_MIC tristate "Michael MIC" diff --git a/crypto/Makefile b/crypto/Makefile index 6c5d59369dacc6..e430e6e99b6a24 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -182,7 +182,6 @@ obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o obj-$(CONFIG_CRYPTO_ECC) += ecc.o obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o -obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o ecdh_generic-y += ecdh.o ecdh_generic-y += ecdh_helper.o diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 0da7c1ac778a0e..ca6fdcc6c54aca 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -970,6 +970,12 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, } lock_sock(sk); + if (ctx->write) { + release_sock(sk); + return -EBUSY; + } + ctx->write = true; + if (ctx->init && !ctx->more) { if (ctx->used) { err = -EINVAL; @@ -1019,6 +1025,8 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, continue; } + ctx->merge = 0; + if (!af_alg_writable(sk)) { err = af_alg_wait_for_wmem(sk, msg->msg_flags); if (err) @@ -1058,7 +1066,6 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, ctx->used += plen; copied += plen; size -= plen; - ctx->merge = 0; } else { do { struct page *pg; @@ -1104,6 +1111,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unlock: af_alg_data_wakeup(sk); + ctx->write = false; release_sock(sk); return copied ?: err; diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a832..6d6475e3a9bf2b 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/crypto/chacha.c b/crypto/chacha.c index c3a11f4e2d13dc..ec16d5a33f3cd6 100644 --- a/crypto/chacha.c +++ b/crypto/chacha.c @@ -47,7 +47,7 @@ static int chacha12_setkey(struct crypto_skcipher *tfm, static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, - const u8 iv[CHACHA_IV_SIZE], bool arch) + const u8 iv[CHACHA_IV_SIZE]) { struct skcipher_walk walk; struct chacha_state state; @@ -63,36 +63,23 @@ static int chacha_stream_xor(struct skcipher_request *req, if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); - if (arch) - chacha_crypt(&state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - else - chacha_crypt_generic(&state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); + chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static int crypto_chacha_crypt_generic(struct skcipher_request *req) +static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - return chacha_stream_xor(req, ctx, req->iv, false); + return chacha_stream_xor(req, ctx, req->iv); } -static int crypto_chacha_crypt_arch(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_stream_xor(req, ctx, req->iv, true); -} - -static int 
crypto_xchacha_crypt(struct skcipher_request *req, bool arch) +static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -102,10 +89,7 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(&state, ctx->key, req->iv); - if (arch) - hchacha_block(&state, subctx.key, ctx->nrounds); - else - hchacha_block_generic(&state, subctx.key, ctx->nrounds); + hchacha_block(&state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ @@ -113,71 +97,13 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ - return chacha_stream_xor(req, &subctx, real_iv, arch); -} - -static int crypto_xchacha_crypt_generic(struct skcipher_request *req) -{ - return crypto_xchacha_crypt(req, false); -} - -static int crypto_xchacha_crypt_arch(struct skcipher_request *req) -{ - return crypto_xchacha_crypt(req, true); + return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = crypto_chacha_crypt_generic, - .decrypt = crypto_chacha_crypt_generic, - }, - { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = crypto_xchacha_crypt_generic, - .decrypt = crypto_xchacha_crypt_generic, - }, - { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-generic", - .base.cra_priority = 100, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = crypto_xchacha_crypt_generic, - .decrypt = crypto_xchacha_crypt_generic, - }, - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-" __stringify(ARCH), + .base.cra_driver_name = "chacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -188,12 +114,12 @@ static struct skcipher_alg algs[] = { .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, - .encrypt = crypto_chacha_crypt_arch, - .decrypt = crypto_chacha_crypt_arch, + .encrypt = crypto_chacha_crypt, + .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-" __stringify(ARCH), + .base.cra_driver_name = "xchacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -204,12 +130,12 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = 
CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, - .encrypt = crypto_xchacha_crypt_arch, - .decrypt = crypto_xchacha_crypt_arch, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-" __stringify(ARCH), + .base.cra_driver_name = "xchacha12-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), @@ -220,27 +146,19 @@ static struct skcipher_alg algs[] = { .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, - .encrypt = crypto_xchacha_crypt_arch, - .decrypt = crypto_xchacha_crypt_arch, + .encrypt = crypto_xchacha_crypt, + .decrypt = crypto_xchacha_crypt, } }; -static unsigned int num_algs; - static int __init crypto_chacha_mod_init(void) { - /* register the arch flavours only if they differ from generic */ - num_algs = ARRAY_SIZE(algs); - BUILD_BUG_ON(ARRAY_SIZE(algs) % 2 != 0); - if (!chacha_is_arch_optimized()) - num_algs /= 2; - - return crypto_register_skciphers(algs, num_algs); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit crypto_chacha_mod_fini(void) { - crypto_unregister_skciphers(algs, num_algs); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(crypto_chacha_mod_init); @@ -250,11 +168,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers"); MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-generic"); -MODULE_ALIAS_CRYPTO("chacha20-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("chacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-generic"); -MODULE_ALIAS_CRYPTO("xchacha20-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("xchacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-generic"); -MODULE_ALIAS_CRYPTO("xchacha12-" __stringify(ARCH)); +MODULE_ALIAS_CRYPTO("xchacha12-lib"); diff --git a/crypto/curve25519-generic.c b/crypto/curve25519-generic.c deleted file mode 100644 index f3e56e73c66ca2..00000000000000 --- a/crypto/curve25519-generic.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include -#include -#include - -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - u8 const *bp; - - if (req->src) { - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - bp = public_key; - } else { - bp = curve25519_base_point; - } - - curve25519_generic(buf, secret, bp); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct 
crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-generic", - .base.cra_priority = 100, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_compute_value, - .compute_shared_secret = curve25519_compute_value, - .max_size = curve25519_max_size, -}; - -static int __init curve25519_init(void) -{ - return crypto_register_kpp(&curve25519_alg); -} - -static void __exit curve25519_exit(void) -{ - crypto_unregister_kpp(&curve25519_alg); -} - -module_init(curve25519_init); -module_exit(curve25519_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-generic"); -MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)"); -MODULE_LICENSE("GPL"); diff --git a/crypto/md5.c b/crypto/md5.c index 32c0819f511858..c167d203c710ac 100644 --- a/crypto/md5.c +++ b/crypto/md5.c @@ -1,25 +1,62 @@ -/* - * Cryptographic API. - * - * MD5 Message Digest Algorithm (RFC1321). - * - * Derived from cryptoapi implementation, originally based on the - * public domain implementation written by Colin Plumb in 1993. - * - * Copyright (c) Cryptoapi developers. - * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for MD5 and HMAC-MD5 * + * Copyright 2025 Google LLC */ #include #include #include #include -#include + +/* + * Export and import functions. crypto_shash wants a particular format that + * matches that used by some legacy drivers. It currently is the same as the + * library MD5 context, except the value in bytecount must be block-aligned and + * the remainder must be stored in an extra u8 appended to the struct. 
+ */ + +#define MD5_SHASH_STATE_SIZE (sizeof(struct md5_ctx) + 1) +static_assert(sizeof(struct md5_ctx) == sizeof(struct md5_state)); +static_assert(offsetof(struct md5_ctx, state) == offsetof(struct md5_state, hash)); +static_assert(offsetof(struct md5_ctx, bytecount) == offsetof(struct md5_state, byte_count)); +static_assert(offsetof(struct md5_ctx, buf) == offsetof(struct md5_state, block)); + +static int __crypto_md5_export(const struct md5_ctx *ctx0, void *out) +{ + struct md5_ctx ctx = *ctx0; + unsigned int partial; + u8 *p = out; + + partial = ctx.bytecount % MD5_BLOCK_SIZE; + ctx.bytecount -= partial; + memcpy(p, &ctx, sizeof(ctx)); + p += sizeof(ctx); + *p = partial; + return 0; +} + +static int __crypto_md5_import(struct md5_ctx *ctx, const void *in) +{ + const u8 *p = in; + + memcpy(ctx, p, sizeof(*ctx)); + p += sizeof(*ctx); + ctx->bytecount += *p; + return 0; +} + +static int __crypto_md5_export_core(const struct md5_ctx *ctx, void *out) +{ + memcpy(out, ctx, offsetof(struct md5_ctx, buf)); + return 0; +} + +static int __crypto_md5_import_core(struct md5_ctx *ctx, const void *in) +{ + memcpy(ctx, in, offsetof(struct md5_ctx, buf)); + return 0; +} const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, @@ -27,198 +64,173 @@ const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { }; EXPORT_SYMBOL_GPL(md5_zero_message_hash); -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<>(32-s)) + x) - -static void md5_transform(__u32 *hash, __u32 const *in) -{ - u32 a, b, c, d; - - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 
0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - hash[0] += a; - hash[1] += b; - hash[2] += c; - hash[3] += d; -} - -static inline void md5_transform_helper(struct md5_state *ctx, - u32 block[MD5_BLOCK_WORDS]) -{ - le32_to_cpu_array(block, MD5_BLOCK_WORDS); - md5_transform(ctx->hash, block); -} - -static int md5_init(struct shash_desc *desc) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - - mctx->hash[0] = MD5_H0; - mctx->hash[1] = MD5_H1; - mctx->hash[2] = MD5_H2; - mctx->hash[3] = MD5_H3; - mctx->byte_count = 0; +#define MD5_CTX(desc) ((struct md5_ctx *)shash_desc_ctx(desc)) +static int crypto_md5_init(struct shash_desc *desc) +{ + md5_init(MD5_CTX(desc)); return 0; } -static int md5_update(struct shash_desc *desc, const u8 *data, unsigned int len) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - u32 block[MD5_BLOCK_WORDS]; - - mctx->byte_count += len; - do { - memcpy(block, data, sizeof(block)); - md5_transform_helper(mctx, block); - data += sizeof(block); - len -= sizeof(block); - } while (len >= sizeof(block)); - memzero_explicit(block, sizeof(block)); - mctx->byte_count -= len; - return len; -} - -static int md5_finup(struct shash_desc *desc, const u8 *data, unsigned int len, - u8 *out) -{ - struct md5_state *mctx = shash_desc_ctx(desc); - u32 block[MD5_BLOCK_WORDS]; - unsigned int offset; - int padding; - char *p; - - memcpy(block, data, len); - - offset = len; - p = (char *)block + offset; - padding = 56 - (offset + 1); - - *p++ = 0x80; - if (padding < 0) { - memset(p, 0x00, padding + sizeof (u64)); - md5_transform_helper(mctx, block); - p = (char *)block; - padding = 56; - } - - memset(p, 0, padding); - mctx->byte_count += len; - block[14] = mctx->byte_count << 3; - block[15] = mctx->byte_count >> 29; - le32_to_cpu_array(block, (sizeof(block) - sizeof(u64)) / sizeof(u32)); - md5_transform(mctx->hash, block); - memzero_explicit(block, sizeof(block)); - cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32)); - memcpy(out, mctx->hash, sizeof(mctx->hash)); +static int crypto_md5_update(struct shash_desc *desc, + const u8 *data, unsigned 
int len) +{ + md5_update(MD5_CTX(desc), data, len); + return 0; +} +static int crypto_md5_final(struct shash_desc *desc, u8 *out) +{ + md5_final(MD5_CTX(desc), out); return 0; } -static struct shash_alg alg = { - .digestsize = MD5_DIGEST_SIZE, - .init = md5_init, - .update = md5_update, - .finup = md5_finup, - .descsize = MD5_STATE_SIZE, - .base = { - .cra_name = "md5", - .cra_driver_name = "md5-generic", - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = MD5_HMAC_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; +static int crypto_md5_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + md5(data, len, out); + return 0; +} + +static int crypto_md5_export(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export(MD5_CTX(desc), out); +} + +static int crypto_md5_import(struct shash_desc *desc, const void *in) +{ + return __crypto_md5_import(MD5_CTX(desc), in); +} -static int __init md5_mod_init(void) +static int crypto_md5_export_core(struct shash_desc *desc, void *out) { - return crypto_register_shash(&alg); + return __crypto_md5_export_core(MD5_CTX(desc), out); } -static void __exit md5_mod_fini(void) +static int crypto_md5_import_core(struct shash_desc *desc, const void *in) { - crypto_unregister_shash(&alg); + return __crypto_md5_import_core(MD5_CTX(desc), in); } -module_init(md5_mod_init); -module_exit(md5_mod_fini); +#define HMAC_MD5_KEY(tfm) ((struct hmac_md5_key *)crypto_shash_ctx(tfm)) +#define HMAC_MD5_CTX(desc) ((struct hmac_md5_ctx *)shash_desc_ctx(desc)) + +static int crypto_hmac_md5_setkey(struct crypto_shash *tfm, + const u8 *raw_key, unsigned int keylen) +{ + hmac_md5_preparekey(HMAC_MD5_KEY(tfm), raw_key, keylen); + return 0; +} + +static int crypto_hmac_md5_init(struct shash_desc *desc) +{ + hmac_md5_init(HMAC_MD5_CTX(desc), HMAC_MD5_KEY(desc->tfm)); + return 0; +} + +static int crypto_hmac_md5_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + hmac_md5_update(HMAC_MD5_CTX(desc), data, len); + return 0; +} + +static int crypto_hmac_md5_final(struct shash_desc *desc, u8 *out) +{ + hmac_md5_final(HMAC_MD5_CTX(desc), out); + return 0; +} + +static int crypto_hmac_md5_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + hmac_md5(HMAC_MD5_KEY(desc->tfm), data, len, out); + return 0; +} + +static int crypto_hmac_md5_export(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export(&HMAC_MD5_CTX(desc)->hash_ctx, out); +} + +static int crypto_hmac_md5_import(struct shash_desc *desc, const void *in) +{ + struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc); + + ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate; + return __crypto_md5_import(&ctx->hash_ctx, in); +} + +static int crypto_hmac_md5_export_core(struct shash_desc *desc, void *out) +{ + return __crypto_md5_export_core(&HMAC_MD5_CTX(desc)->hash_ctx, out); +} + +static int crypto_hmac_md5_import_core(struct shash_desc *desc, const void *in) +{ + struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc); + + ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate; + return __crypto_md5_import_core(&ctx->hash_ctx, in); +} + +static struct shash_alg algs[] = { + { + .base.cra_name = "md5", + .base.cra_driver_name = "md5-lib", + .base.cra_priority = 300, + .base.cra_blocksize = MD5_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .digestsize = MD5_DIGEST_SIZE, + .init = crypto_md5_init, + .update = crypto_md5_update, + .final = crypto_md5_final, + .digest = crypto_md5_digest, + .export = crypto_md5_export, + .import = crypto_md5_import, + .export_core = 
crypto_md5_export_core, + .import_core = crypto_md5_import_core, + .descsize = sizeof(struct md5_ctx), + .statesize = MD5_SHASH_STATE_SIZE, + }, + { + .base.cra_name = "hmac(md5)", + .base.cra_driver_name = "hmac-md5-lib", + .base.cra_priority = 300, + .base.cra_blocksize = MD5_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct hmac_md5_key), + .base.cra_module = THIS_MODULE, + .digestsize = MD5_DIGEST_SIZE, + .setkey = crypto_hmac_md5_setkey, + .init = crypto_hmac_md5_init, + .update = crypto_hmac_md5_update, + .final = crypto_hmac_md5_final, + .digest = crypto_hmac_md5_digest, + .export = crypto_hmac_md5_export, + .import = crypto_hmac_md5_import, + .export_core = crypto_hmac_md5_export_core, + .import_core = crypto_hmac_md5_import_core, + .descsize = sizeof(struct hmac_md5_ctx), + .statesize = MD5_SHASH_STATE_SIZE, + }, +}; + +static int __init crypto_md5_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_md5_mod_init); + +static void __exit crypto_md5_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_md5_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MD5 Message Digest Algorithm"); +MODULE_DESCRIPTION("Crypto API support for MD5 and HMAC-MD5"); + MODULE_ALIAS_CRYPTO("md5"); +MODULE_ALIAS_CRYPTO("md5-lib"); +MODULE_ALIAS_CRYPTO("hmac(md5)"); +MODULE_ALIAS_CRYPTO("hmac-md5-lib"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ee33ba21ae2bc0..9dca41e7ee7381 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4152,14 +4152,14 @@ static int alg_test_null(const struct alg_test_desc *desc, static const struct alg_test_desc alg_test_descs[] = { { .alg = "adiantum(xchacha12,aes)", - .generic_driver = "adiantum(xchacha12-generic,aes-generic,nhpoly1305-generic)", + .generic_driver = "adiantum(xchacha12-lib,aes-generic,nhpoly1305-generic)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(adiantum_xchacha12_aes_tv_template) }, }, { .alg = "adiantum(xchacha20,aes)", - .generic_driver = "adiantum(xchacha20-generic,aes-generic,nhpoly1305-generic)", + .generic_driver = "adiantum(xchacha20-lib,aes-generic,nhpoly1305-generic)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(adiantum_xchacha20_aes_tv_template) @@ -4178,6 +4178,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "authenc(hmac(md5),ecb(cipher_null))", + .generic_driver = "authenc(hmac-md5-lib,ecb-cipher_null)", .test = alg_test_aead, .suite = { .aead = __VECS(hmac_md5_ecb_cipher_null_tv_template) @@ -4484,6 +4485,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "chacha20", + .generic_driver = "chacha20-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(chacha20_tv_template) @@ -4639,12 +4641,6 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .cipher = __VECS(sm4_cts_tv_template) } - }, { - .alg = "curve25519", - .test = alg_test_kpp, - .suite = { - .kpp = __VECS(curve25519_tv_template) - } }, { .alg = "deflate", .test = alg_test_comp, @@ -5064,6 +5060,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(md5)", + .generic_driver = "hmac-md5-lib", .test = alg_test_hash, .suite = { .hash = __VECS(hmac_md5_tv_template) @@ -5250,6 +5247,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "md5", + .generic_driver = "md5-lib", .test = alg_test_hash, .suite = { .hash = __VECS(md5_tv_template) @@ -5417,12 +5415,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = 
"rfc7539(chacha20,poly1305)", + .generic_driver = "rfc7539(chacha20-lib,poly1305-generic)", .test = alg_test_aead, .suite = { .aead = __VECS(rfc7539_tv_template) } }, { .alg = "rfc7539esp(chacha20,poly1305)", + .generic_driver = "rfc7539esp(chacha20-lib,poly1305-generic)", .test = alg_test_aead, .suite = { .aead = { @@ -5588,12 +5588,14 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "xchacha12", + .generic_driver = "xchacha12-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(xchacha12_tv_template) }, }, { .alg = "xchacha20", + .generic_driver = "xchacha20-lib", .test = alg_test_skcipher, .suite = { .cipher = __VECS(xchacha20_tv_template) diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 32d099ac9e7378..2682312272824b 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -3798,1231 +3798,6 @@ static const struct kpp_testvec ffdhe8192_dh_tv_template[] __maybe_unused = { }, }; -static const struct kpp_testvec curve25519_tv_template[] = { -{ - .secret = (u8[32]){ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, - 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, - 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, - 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, - .b_public = (u8[32]){ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, - 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, - 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, - 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, - .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, - 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, - 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, - 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, - .b_public = (u8[32]){ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, - 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, - 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, - 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, - .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, - 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, - 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, - 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 1 }, - .b_public = (u8[32]){ 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, - 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, - 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98, - 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 1 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, - 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, - 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3, - 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, - .secret_size = 32, 
- .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, - .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f }, - .expected_ss = (u8[32]){ 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2, - 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57, - 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05, - 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -{ - .secret = (u8[32]){ 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 }, - .expected_ss = (u8[32]){ 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d, - 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12, - 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99, - 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - normal case */ -{ - .secret = (u8[32]){ 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda, - 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66, - 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3, - 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba }, - .b_public = (u8[32]){ 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5, - 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9, - 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e, - 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a }, - .expected_ss = (u8[32]){ 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5, - 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38, - 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e, - 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4, - 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5, - 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49, - 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 }, - .b_public = (u8[32]){ 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5, - 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8, - 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3, - 0x56, 0x34, 0x49, 
0xbe, 0xfc, 0xfc, 0x97, 0x33 }, - .expected_ss = (u8[32]){ 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff, - 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d, - 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe, - 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9, - 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39, - 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5, - 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 }, - .b_public = (u8[32]){ 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f, - 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b, - 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c, - 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 }, - .expected_ss = (u8[32]){ 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53, - 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57, - 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0, - 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc, - 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d, - 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67, - 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c }, - .b_public = (u8[32]){ 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97, - 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f, - 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45, - 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a }, - .expected_ss = (u8[32]){ 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93, - 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2, - 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44, - 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1, - 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95, - 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99, - 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d }, - .b_public = (u8[32]){ 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27, - 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07, - 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae, - 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c }, - .expected_ss = (u8[32]){ 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73, - 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2, - 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f, - 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key on twist */ -{ - .secret = (u8[32]){ 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9, - 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd, - 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b, - 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 }, - .b_public = (u8[32]){ 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5, - 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52, - 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8, - 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 }, - .expected_ss = (u8[32]){ 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86, - 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4, - 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6, - 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret 
= (u8[32]){ 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04, - 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77, - 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90, - 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 }, - .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97, - 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9, - 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7, - 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36, - 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd, - 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c, - 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 }, - .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e, - 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b, - 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e, - 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed, - 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e, - 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd, - 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f, - 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1, - 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10, - 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3, - 0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d, - 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00, - 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00, - 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff, - 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8, - 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4, - 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70, - 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3, - 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a, - 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e, - 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 }, - .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, - 
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57, - 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c, - 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59, - 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case on twist */ -{ - .secret = (u8[32]){ 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f, - 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42, - 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9, - 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 }, - .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c, - 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5, - 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65, - 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6, - 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4, - 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8, - 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe }, - .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7, - 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca, - 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f, - 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa, - 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3, - 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52, - 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - .expected_ss = (u8[32]){ 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3, - 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e, - 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75, - 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26, - 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea, - 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00, - 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .expected_ss = (u8[32]){ 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8, - 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32, - 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87, - 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - 
edge case for public key */ -{ - .secret = (u8[32]){ 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c, - 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6, - 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb, - 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff, - 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff, - 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff, - 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f }, - .expected_ss = (u8[32]){ 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85, - 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f, - 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0, - 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38, - 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b, - 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c, - 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .expected_ss = (u8[32]){ 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b, - 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81, - 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3, - 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d, - 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42, - 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98, - 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f, - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c, - 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9, - 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89, - 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for public key */ -{ - .secret = (u8[32]){ 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29, - 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6, - 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c, - 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f }, - .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75, - 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89, - 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c, - 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc, - 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1, - 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d, - 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae }, - .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09, - 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde, - 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1, - 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81, - 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a, - 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99, - 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d }, - .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17, - 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35, - 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55, - 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11, - 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b, - 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9, - 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 }, - .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53, - 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e, - 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6, - 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78, - 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2, - 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd, - 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .expected_ss = (u8[32]){ 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb, - 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40, - 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2, - 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9, - 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60, - 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13, - 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 }, - .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c, - 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3, - 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65, - 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - 
-}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a, - 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7, - 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11, - 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e }, - .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82, - 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4, - 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c, - 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e, - 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a, - 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d, - 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f }, - .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }, - .expected_ss = (u8[32]){ 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2, - 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60, - 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25, - 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb, - 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97, - 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c, - 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 }, - .b_public = (u8[32]){ 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23, - 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8, - 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69, - 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a, - 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23, - 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b, - 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 }, - .b_public = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b, - 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44, - 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37, - 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80, - 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d, - 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b, - 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 }, - .b_public = (u8[32]){ 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63, - 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae, - 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f, - 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0, - 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd, - 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49, - 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 }, - .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41, - 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0, - 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf, - 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9, - 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa, - 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5, - 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e }, - .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47, - 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3, - 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b, - 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8, - 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98, - 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0, - 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 }, - .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0, - 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1, - 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a, - 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02, - 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4, - 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68, - 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d }, - .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f, - 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2, - 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95, - 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* 
wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7, - 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06, - 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9, - 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 }, - .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5, - 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0, - 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80, - 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - public key >= p */ -{ - .secret = (u8[32]){ 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd, - 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4, - 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04, - 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 }, - .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .expected_ss = (u8[32]){ 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0, - 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac, - 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48, - 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - RFC 7748 */ -{ - .secret = (u8[32]){ 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, - 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, - 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, - 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 }, - .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, - 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, - 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, - 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, - 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, - 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, - 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - RFC 7748 */ -{ - .secret = (u8[32]){ 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, - 0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5, - 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4, - 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d }, - .b_public = (u8[32]){ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, - 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c, - 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e, - 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 }, - .expected_ss = (u8[32]){ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, - 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8, - 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52, - 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde, - 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8, - 0xf8, 0x3b, 0xb0, 0x16, 
0x9b, 0x17, 0x2b, 0xe4, - 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 }, - .expected_ss = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d, - 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64, - 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd, - 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 }, - .expected_ss = (u8[32]){ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8, - 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf, - 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94, - 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d }, - .expected_ss = (u8[32]){ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84, - 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62, - 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e, - 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 }, - .expected_ss = (u8[32]){ 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8, - 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58, - 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02, - 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 }, - .expected_ss = (u8[32]){ 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - 
.expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9, - 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a, - 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44, - 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b }, - .expected_ss = (u8[32]){ 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd, - 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22, - 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56, - 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b }, - .expected_ss = (u8[32]){ 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53, - 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f, - 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18, - 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f }, - .expected_ss = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55, - 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b, - 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79, - 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f }, - .expected_ss = (u8[32]){ 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x1c, 0xa2, 0x19, 0x0b, 
0x71, 0x16, 0x35, 0x39, - 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c, - 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb, - 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e }, - .expected_ss = (u8[32]){ 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04, - 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10, - 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58, - 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c }, - .expected_ss = (u8[32]){ 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3, - 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c, - 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88, - 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 }, - .expected_ss = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a, - 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49, - 0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a, - 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f }, - .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - edge case for shared secret */ -{ - .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4, - 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3, - 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc, - 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 }, - .b_public = (u8[32]){ 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca, - 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c, - 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb, - 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 }, - .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58, - 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7, - 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01, - 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d }, - .expected_ss = (u8[32]){ 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d, - 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27, - 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b, - 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26, - 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2, - 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44, - 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e }, - .expected_ss = (u8[32]){ 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6, - 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d, - 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e, - 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61, - 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67, - 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e, - 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c }, - .expected_ss = (u8[32]){ 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65, - 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce, - 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0, - 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 }, - .b_public = (u8[32]){ 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee, - 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d, - 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14, - 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 }, - .expected_ss = (u8[32]){ 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e, - 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc, - 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5, - 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - checking for overflow */ -{ - .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d, - 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d, - 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c, - 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 
0x12, 0xb2 }, - .b_public = (u8[32]){ 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4, - 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5, - 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c, - 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 }, - .expected_ss = (u8[32]){ 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b, - 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93, - 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f, - 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - private key == -1 (mod order) */ -{ - .secret = (u8[32]){ 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8, - 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 }, - .b_public = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .expected_ss = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e, - 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57, - 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f, - 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -}, -/* wycheproof - private key == 1 (mod order) on twist */ -{ - .secret = (u8[32]){ 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef, - 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f }, - .b_public = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .expected_ss = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f, - 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6, - 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64, - 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 }, - .secret_size = 32, - .b_public_size = 32, - .expected_ss_size = 32, - -} -}; - static const struct kpp_testvec ecdh_p192_tv_template[] = { { .secret = diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c index d50261d05f3a1a..515b20d0b698a4 100644 --- a/drivers/acpi/acpi_dbg.c +++ b/drivers/acpi/acpi_dbg.c @@ -569,11 +569,11 @@ static int acpi_aml_release(struct inode *inode, struct file *file) return 0; } -static int acpi_aml_read_user(char __user *buf, int len) +static ssize_t acpi_aml_read_user(char __user *buf, size_t len) { - int ret; struct circ_buf *crc = &acpi_aml_io.out_crc; - int n; + ssize_t ret; + size_t n; char *p; ret = acpi_aml_lock_read(crc, ACPI_AML_OUT_USER); @@ -582,7 +582,7 @@ static int acpi_aml_read_user(char __user *buf, int len) /* sync head before removing logs */ smp_rmb(); p = &crc->buf[crc->tail]; - n = min(len, circ_count_to_end(crc)); + n = min_t(size_t, len, circ_count_to_end(crc)); if (copy_to_user(buf, p, n)) { ret = -EFAULT; goto out; @@ -599,8 +599,8 @@ static int acpi_aml_read_user(char __user *buf, int len) static ssize_t acpi_aml_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - int ret = 0; - int size = 0; + ssize_t ret = 0; + ssize_t size = 0; if (!count) return 0; @@ -639,11 +639,11 @@ static ssize_t acpi_aml_read(struct file *file, char __user *buf, return size > 0 ? 
size : ret; } -static int acpi_aml_write_user(const char __user *buf, int len) +static ssize_t acpi_aml_write_user(const char __user *buf, size_t len) { - int ret; struct circ_buf *crc = &acpi_aml_io.in_crc; - int n; + ssize_t ret; + size_t n; char *p; ret = acpi_aml_lock_write(crc, ACPI_AML_IN_USER); @@ -652,7 +652,7 @@ static int acpi_aml_write_user(const char __user *buf, int len) /* sync tail before inserting cmds */ smp_mb(); p = &crc->buf[crc->head]; - n = min(len, circ_space_to_end(crc)); + n = min_t(size_t, len, circ_space_to_end(crc)); if (copy_from_user(p, buf, n)) { ret = -EFAULT; goto out; @@ -663,14 +663,14 @@ static int acpi_aml_write_user(const char __user *buf, int len) ret = n; out: acpi_aml_unlock_fifo(ACPI_AML_IN_USER, ret >= 0); - return n; + return ret; } static ssize_t acpi_aml_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int ret = 0; - int size = 0; + ssize_t ret = 0; + ssize_t size = 0; if (!count) return 0; diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h index fe6d38b43c9a5c..91241bd6917a43 100644 --- a/drivers/acpi/acpica/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h @@ -37,7 +37,7 @@ struct acpi_db_argument_info { struct acpi_db_execute_walk { u32 count; u32 max_count; - char name_seg[ACPI_NAMESEG_SIZE + 1] ACPI_NONSTRING; + char name_seg[ACPI_NAMESEG_SIZE + 1]; }; #define PARAM_LIST(pl) pl diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 0c41f0097e8d71..f98640086f4ef3 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -1141,7 +1141,7 @@ struct acpi_port_info { #define ACPI_RESOURCE_NAME_PIN_GROUP_FUNCTION 0x91 #define ACPI_RESOURCE_NAME_PIN_GROUP_CONFIG 0x92 #define ACPI_RESOURCE_NAME_CLOCK_INPUT 0x93 -#define ACPI_RESOURCE_NAME_LARGE_MAX 0x94 +#define ACPI_RESOURCE_NAME_LARGE_MAX 0x93 /***************************************************************************** * diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 76c5ed02e91645..da2c45880cc7e9 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -450,7 +450,8 @@ const union acpi_predefined_info acpi_gbl_predefined_methods[] = { {{"_DSM", METHOD_4ARGS(ACPI_TYPE_BUFFER, ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, - ACPI_TYPE_ANY) | ARG_COUNT_IS_MINIMUM, + ACPI_TYPE_ANY | ACPI_TYPE_PACKAGE) | + ARG_COUNT_IS_MINIMUM, METHOD_RETURNS(ACPI_RTYPE_ALL)}}, /* Must return a value, but it can be of any type */ {{"_DSS", METHOD_1ARGS(ACPI_TYPE_INTEGER), diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index fef6fb29ece4d6..45ec32e81903ab 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -462,7 +462,6 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, struct acpi_walk_state *next_walk_state = NULL; union acpi_operand_object *obj_desc; struct acpi_evaluate_info *info; - u32 i; ACPI_FUNCTION_TRACE_PTR(ds_call_control_method, this_walk_state); @@ -484,10 +483,17 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, } if (this_walk_state->num_operands < obj_desc->method.param_count) { - ACPI_ERROR((AE_INFO, "Missing argument for method [%4.4s]", + ACPI_ERROR((AE_INFO, "Missing argument(s) for method [%4.4s]", acpi_ut_get_node_name(method_node))); - return_ACPI_STATUS(AE_AML_UNINITIALIZED_ARG); + return_ACPI_STATUS(AE_AML_TOO_FEW_ARGUMENTS); + } + + else if (this_walk_state->num_operands > obj_desc->method.param_count) { + ACPI_ERROR((AE_INFO, "Too many arguments 
for method [%4.4s]", + acpi_ut_get_node_name(method_node))); + + return_ACPI_STATUS(AE_AML_TOO_MANY_ARGUMENTS); } /* Init for new method, possibly wait on method mutex */ @@ -546,14 +552,7 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, * Delete the operands on the previous walkstate operand stack * (they were copied to new objects) */ - for (i = 0; i < obj_desc->method.param_count; i++) { - acpi_ut_remove_reference(this_walk_state->operands[i]); - this_walk_state->operands[i] = NULL; - } - - /* Clear the operand stack */ - - this_walk_state->num_operands = 0; + acpi_ds_clear_operands(this_walk_state); ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "**** Begin nested execution of [%4.4s] **** WalkState=%p\n", diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c index fa3e0d00d1ca96..df2a4ab0e0da9d 100644 --- a/drivers/acpi/acpica/evglock.c +++ b/drivers/acpi/acpica/evglock.c @@ -42,6 +42,10 @@ acpi_status acpi_ev_init_global_lock_handler(void) return_ACPI_STATUS(AE_OK); } + if (!acpi_gbl_use_global_lock) { + return_ACPI_STATUS(AE_OK); + } + /* Attempt installation of the global lock handler */ status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL, diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c index 1c8044ffcb97c5..532ea307a67548 100644 --- a/drivers/acpi/acpica/psopinfo.c +++ b/drivers/acpi/acpica/psopinfo.c @@ -34,7 +34,7 @@ static const u8 acpi_gbl_argument_count[] = const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) { -#ifdef ACPI_DEBUG_OUTPUT +#if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT const char *opcode_name = "Unknown AML opcode"; #endif @@ -102,11 +102,11 @@ const struct acpi_opcode_info *acpi_ps_get_opcode_info(u16 opcode) default: break; } -#endif /* Unknown AML opcode */ ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "%s [%4.4X]\n", opcode_name, opcode)); +#endif return (&acpi_gbl_aml_op_info[_UNK]); } diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c index fd64460a2e2603..049f6c2f1e321d 100644 --- a/drivers/acpi/acpica/tbprint.c +++ b/drivers/acpi/acpica/tbprint.c @@ -121,6 +121,14 @@ acpi_tb_print_table_header(acpi_physical_address address, ACPI_CAST_PTR(struct acpi_table_rsdp, header)->revision, local_header.oem_id)); + } else if (acpi_gbl_CDAT && !acpi_ut_valid_nameseg(header->signature)) { + + /* CDAT does not use the common ACPI table header */ + + ACPI_INFO(("%-4.4s 0x%8.8X%8.8X %06X", + ACPI_SIG_CDAT, ACPI_FORMAT_UINT64(address), + ACPI_CAST_PTR(struct acpi_table_cdat, + header)->length)); } else { /* Standard ACPI table with full common header */ diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 2561b045acc7bc..3c87953dbd197a 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -656,6 +656,43 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, return rc; } +/* Allow almost all types of address except MMIO. */ +static bool is_allowed_range(u64 base_addr, u64 size) +{ + int i; + /* + * MMIO region is usually claimed with IORESOURCE_MEM + IORES_DESC_NONE. + * However, IORES_DESC_NONE is treated like a wildcard when we check if + * region intersects with known resource. So do an allow list check for + * IORES_DESCs that definitely or most likely not MMIO. 
+ */ + int non_mmio_desc[] = { + IORES_DESC_CRASH_KERNEL, + IORES_DESC_ACPI_TABLES, + IORES_DESC_ACPI_NV_STORAGE, + IORES_DESC_PERSISTENT_MEMORY, + IORES_DESC_PERSISTENT_MEMORY_LEGACY, + /* Treat IORES_DESC_DEVICE_PRIVATE_MEMORY as MMIO. */ + IORES_DESC_RESERVED, + IORES_DESC_SOFT_RESERVED, + }; + + if (region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) + == REGION_INTERSECTS) + return true; + + for (i = 0; i < ARRAY_SIZE(non_mmio_desc); ++i) { + if (region_intersects(base_addr, size, IORESOURCE_MEM, non_mmio_desc[i]) + == REGION_INTERSECTS) + return true; + } + + if (arch_is_platform_page(base_addr)) + return true; + + return false; +} + /* Inject the specified hardware error */ int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, u64 param4) @@ -702,19 +739,15 @@ int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, * Disallow crazy address masks that give BIOS leeway to pick * injection address almost anywhere. Insist on page or * better granularity and that target address is normal RAM or - * NVDIMM. + * anything else that is not MMIO. */ base_addr = param1 & param2; size = ~param2 + 1; - if (((param2 & PAGE_MASK) != PAGE_MASK) || - ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) - != REGION_INTERSECTS) && - (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) - != REGION_INTERSECTS) && - (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED) - != REGION_INTERSECTS) && - !arch_is_platform_page(base_addr))) + if ((param2 & PAGE_MASK) != PAGE_MASK) + return -EINVAL; + + if (!is_allowed_range(base_addr, size)) return -EINVAL; if (is_zero_pfn(base_addr >> PAGE_SHIFT)) diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c index 246076341e8cc0..ff0e8bf8e97ac8 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -60,9 +60,8 @@ static long erst_dbg_ioctl(struct file *f, unsigned int cmd, unsigned long arg) switch (cmd) { case APEI_ERST_CLEAR_RECORD: - rc = copy_from_user(&record_id, (void __user *)arg, - sizeof(record_id)); - if (rc) + if (copy_from_user(&record_id, (void __user *)arg, + sizeof(record_id))) return -EFAULT; return erst_clear(record_id); case APEI_ERST_GET_RECORD_COUNT: @@ -175,8 +174,7 @@ static ssize_t erst_dbg_write(struct file *filp, const char __user *ubuf, erst_dbg_buf = p; erst_dbg_buf_len = usize; } - rc = copy_from_user(erst_dbg_buf, ubuf, usize); - if (rc) { + if (copy_from_user(erst_dbg_buf, ubuf, usize)) { rc = -EFAULT; goto out; } diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a0d54993edb3b6..97ee19f2cae060 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1207,12 +1207,10 @@ static int ghes_notify_hed(struct notifier_block *this, unsigned long event, int ret = NOTIFY_DONE; spin_lock_irqsave(&ghes_notify_lock_irq, flags); - rcu_read_lock(); list_for_each_entry_rcu(ghes, &ghes_hed, list) { if (!ghes_proc(ghes)) ret = NOTIFY_OK; } - rcu_read_unlock(); spin_unlock_irqrestore(&ghes_notify_lock_irq, flags); return ret; diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 70f8290b659de5..fd995a1d3d248b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -388,11 +388,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, return 0; } -static int __init gtdt_sbsa_gwdt_init(void) +static int __init gtdt_platform_timer_init(void) { void *platform_timer; struct acpi_table_header 
*table; - int ret, timer_count, gwdt_count = 0; + int ret, timer_count, gwdt_count = 0, mmio_timer_count = 0; if (acpi_disabled) return 0; @@ -414,20 +414,41 @@ static int __init gtdt_sbsa_gwdt_init(void) goto out_put_gtdt; for_each_platform_timer(platform_timer) { + ret = 0; + if (is_non_secure_watchdog(platform_timer)) { ret = gtdt_import_sbsa_gwdt(platform_timer, gwdt_count); if (ret) - break; + continue; gwdt_count++; + } else if (is_timer_block(platform_timer)) { + struct arch_timer_mem atm = {}; + struct platform_device *pdev; + + ret = gtdt_parse_timer_block(platform_timer, &atm); + if (ret) + continue; + + pdev = platform_device_register_data(NULL, "gtdt-arm-mmio-timer", + gwdt_count, &atm, + sizeof(atm)); + if (IS_ERR(pdev)) { + pr_err("Can't register timer %d\n", gwdt_count); + continue; + } + + mmio_timer_count++; } } if (gwdt_count) pr_info("found %d SBSA generic Watchdog(s).\n", gwdt_count); + if (mmio_timer_count) + pr_info("found %d Generic MMIO timer(s).\n", mmio_timer_count); out_put_gtdt: acpi_put_table(table); return ret; } -device_initcall(gtdt_sbsa_gwdt_init); +device_initcall(gtdt_platform_timer_init); diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 6905b56bf3e458..67b76492c839c4 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -92,7 +92,7 @@ enum { struct acpi_battery { struct mutex lock; - struct mutex sysfs_lock; + struct mutex update_lock; struct power_supply *bat; struct power_supply_desc bat_desc; struct acpi_device *device; @@ -904,15 +904,12 @@ static int sysfs_add_battery(struct acpi_battery *battery) static void sysfs_remove_battery(struct acpi_battery *battery) { - mutex_lock(&battery->sysfs_lock); - if (!battery->bat) { - mutex_unlock(&battery->sysfs_lock); + if (!battery->bat) return; - } + battery_hook_remove_battery(battery); power_supply_unregister(battery->bat); battery->bat = NULL; - mutex_unlock(&battery->sysfs_lock); } static void find_battery(const struct dmi_header *dm, void *private) @@ -1072,6 +1069,9 @@ static void acpi_battery_notify(acpi_handle handle, u32 event, void *data) if (!battery) return; + + guard(mutex)(&battery->update_lock); + old = battery->bat; /* * On Acer Aspire V5-573G notifications are sometimes triggered too @@ -1094,21 +1094,22 @@ static void acpi_battery_notify(acpi_handle handle, u32 event, void *data) } static int battery_notify(struct notifier_block *nb, - unsigned long mode, void *_unused) + unsigned long mode, void *_unused) { struct acpi_battery *battery = container_of(nb, struct acpi_battery, pm_nb); - int result; - switch (mode) { - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: + if (mode == PM_POST_SUSPEND || mode == PM_POST_HIBERNATION) { + guard(mutex)(&battery->update_lock); + if (!acpi_battery_present(battery)) return 0; if (battery->bat) { acpi_battery_refresh(battery); } else { + int result; + result = acpi_battery_get_info(battery); if (result) return result; @@ -1120,7 +1121,6 @@ static int battery_notify(struct notifier_block *nb, acpi_battery_init_alarm(battery); acpi_battery_get_state(battery); - break; } return 0; @@ -1198,6 +1198,8 @@ static int acpi_battery_update_retry(struct acpi_battery *battery) { int retry, ret; + guard(mutex)(&battery->update_lock); + for (retry = 5; retry; retry--) { ret = acpi_battery_update(battery, false); if (!ret) @@ -1208,6 +1210,13 @@ static int acpi_battery_update_retry(struct acpi_battery *battery) return ret; } +static void sysfs_battery_cleanup(struct acpi_battery *battery) +{ + guard(mutex)(&battery->update_lock); + + 
sysfs_remove_battery(battery); +} + static int acpi_battery_add(struct acpi_device *device) { int result = 0; @@ -1230,7 +1239,7 @@ static int acpi_battery_add(struct acpi_device *device) if (result) return result; - result = devm_mutex_init(&device->dev, &battery->sysfs_lock); + result = devm_mutex_init(&device->dev, &battery->update_lock); if (result) return result; @@ -1262,7 +1271,7 @@ static int acpi_battery_add(struct acpi_device *device) device_init_wakeup(&device->dev, 0); unregister_pm_notifier(&battery->pm_nb); fail: - sysfs_remove_battery(battery); + sysfs_battery_cleanup(battery); return result; } @@ -1281,6 +1290,9 @@ static void acpi_battery_remove(struct acpi_device *device) device_init_wakeup(&device->dev, 0); unregister_pm_notifier(&battery->pm_nb); + + guard(mutex)(&battery->update_lock); + sysfs_remove_battery(battery); } @@ -1297,6 +1309,9 @@ static int acpi_battery_resume(struct device *dev) return -EINVAL; battery->update_time = 0; + + guard(mutex)(&battery->update_lock); + acpi_battery_update(battery, true); return 0; } diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6b649031808f80..ab4651205e8adb 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1876,7 +1876,7 @@ EXPORT_SYMBOL_GPL(cppc_set_perf); * If desired_reg is in the SystemMemory or SystemIo ACPI address space, * then assume there is no latency. */ -unsigned int cppc_get_transition_latency(int cpu_num) +int cppc_get_transition_latency(int cpu_num) { /* * Expected transition latency is based on the PCCT timing values @@ -1889,31 +1889,29 @@ unsigned int cppc_get_transition_latency(int cpu_num) * completion of a command before issuing the next command, * in microseconds. */ - unsigned int latency_ns = 0; struct cpc_desc *cpc_desc; struct cpc_register_resource *desired_reg; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu_num); struct cppc_pcc_data *pcc_ss_data; + int latency_ns = 0; cpc_desc = per_cpu(cpc_desc_ptr, cpu_num); if (!cpc_desc) - return CPUFREQ_ETERNAL; + return -ENODATA; desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; if (CPC_IN_SYSTEM_MEMORY(desired_reg) || CPC_IN_SYSTEM_IO(desired_reg)) return 0; - else if (!CPC_IN_PCC(desired_reg)) - return CPUFREQ_ETERNAL; - if (pcc_ss_id < 0) - return CPUFREQ_ETERNAL; + if (!CPC_IN_PCC(desired_reg) || pcc_ss_id < 0) + return -ENODATA; pcc_ss_data = pcc_data[pcc_ss_id]; if (pcc_ss_data->pcc_mpar) latency_ns = 60 * (1000 * 1000 * 1000 / pcc_ss_data->pcc_mpar); - latency_ns = max(latency_ns, pcc_ss_data->pcc_nominal * 1000); - latency_ns = max(latency_ns, pcc_ss_data->pcc_mrtt * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_nominal * 1000); + latency_ns = max_t(int, latency_ns, pcc_ss_data->pcc_mrtt * 1000); return latency_ns; } diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index bc9f58a02c1db5..5d824435b26be3 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -166,8 +166,7 @@ static int __acpi_processor_start(struct acpi_device *device) if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS)) dev_dbg(&device->dev, "CPPC data invalid or not present\n"); - if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver) - acpi_processor_power_init(pr); + acpi_processor_power_init(pr); acpi_pss_perf_init(pr); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5dacf41d7cc0a0..22b051b94a86c1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -51,7 +51,7 @@ 
module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -struct cpuidle_driver acpi_idle_driver = { +static struct cpuidle_driver acpi_idle_driver = { .name = "acpi_idle", .owner = THIS_MODULE, }; @@ -1400,47 +1400,52 @@ void acpi_processor_unregister_idle_driver(void) cpuidle_unregister_driver(&acpi_idle_driver); } -int acpi_processor_power_init(struct acpi_processor *pr) +void acpi_processor_power_init(struct acpi_processor *pr) { - int retval; struct cpuidle_device *dev; + /* + * The code below only works if the current cpuidle driver is the ACPI + * idle driver. + */ + if (cpuidle_get_driver() != &acpi_idle_driver) + return; + if (disabled_by_idle_boot_param()) - return 0; + return; acpi_processor_cstate_first_run_checks(); if (!acpi_processor_get_power_info(pr)) pr->flags.power_setup_done = 1; - if (pr->flags.power) { - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - per_cpu(acpi_cpuidle_device, pr->id) = dev; + if (!pr->flags.power) + return; - acpi_processor_setup_cpuidle_dev(pr, dev); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return; - /* Register per-cpu cpuidle_device. Cpuidle driver - * must already be registered before registering device - */ - retval = cpuidle_register_device(dev); - if (retval) { + per_cpu(acpi_cpuidle_device, pr->id) = dev; - per_cpu(acpi_cpuidle_device, pr->id) = NULL; - kfree(dev); - return retval; - } + acpi_processor_setup_cpuidle_dev(pr, dev); + + /* + * Register a cpuidle device for this CPU. The cpuidle driver using + * this device is expected to be registered. + */ + if (cpuidle_register_device(dev)) { + per_cpu(acpi_cpuidle_device, pr->id) = NULL; + kfree(dev); } - return 0; } -int acpi_processor_power_exit(struct acpi_processor *pr) +void acpi_processor_power_exit(struct acpi_processor *pr) { struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id); if (disabled_by_idle_boot_param()) - return 0; + return; if (pr->flags.power) { cpuidle_unregister_device(dev); @@ -1448,7 +1453,6 @@ int acpi_processor_power_exit(struct acpi_processor *pr) } pr->flags.power_setup_done = 0; - return 0; } MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE"); diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 436019d96027bd..36d29135e16416 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -83,6 +83,7 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, struct fwnode_handle *parent) { struct acpi_data_node *dn; + acpi_handle scope = NULL; bool result; if (acpi_graph_ignore_port(handle)) @@ -98,59 +99,45 @@ static bool acpi_nondev_subnode_extract(union acpi_object *desc, INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); - result = acpi_extract_properties(handle, desc, &dn->data); - - if (handle) { - acpi_handle scope; - acpi_status status; + /* + * The scope for the completion of relative pathname segments and + * subnode object lookup is the one of the namespace node (device) + * containing the object that has returned the package. That is, it's + * the scope of that object's parent device. + */ + if (handle) + acpi_get_parent(handle, &scope); - /* - * The scope for the subnode object lookup is the one of the - * namespace node (device) containing the object that has - * returned the package. That is, it's the scope of that - * object's parent. 
- */ - status = acpi_get_parent(handle, &scope); - if (ACPI_SUCCESS(status) - && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, - &dn->fwnode)) - result = true; - } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data, - &dn->fwnode)) { + /* + * Extract properties from the _DSD-equivalent package pointed to by + * desc and use scope (if not NULL) for the completion of relative + * pathname segments. + * + * The extracted properties will be held in the new data node dn. + */ + result = acpi_extract_properties(scope, desc, &dn->data); + /* + * Look for subnodes in the _DSD-equivalent package pointed to by desc + * and create child nodes of dn if there are any. + */ + if (acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; - } - - if (result) { - dn->handle = handle; - dn->data.pointer = desc; - list_add_tail(&dn->sibling, list); - return true; - } - - kfree(dn); - acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); - return false; -} -static bool acpi_nondev_subnode_data_ok(acpi_handle handle, - const union acpi_object *link, - struct list_head *list, - struct fwnode_handle *parent) -{ - struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; - acpi_status status; - - status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, - ACPI_TYPE_PACKAGE); - if (ACPI_FAILURE(status)) + if (!result) { + kfree(dn); + acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); return false; + } - if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, - parent)) - return true; + /* + * This will be NULL if the desc package is embedded in an outer + * _DSD-equivalent package and its scope cannot be determined. + */ + dn->handle = handle; + dn->data.pointer = desc; + list_add_tail(&dn->sibling, list); - ACPI_FREE(buf.pointer); - return false; + return true; } static bool acpi_nondev_subnode_ok(acpi_handle scope, @@ -158,9 +145,16 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope, struct list_head *list, struct fwnode_handle *parent) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_handle handle; acpi_status status; + /* + * If the scope is unknown, the _DSD-equivalent package being parsed + * was embedded in an outer _DSD-equivalent package as a result of + * direct evaluation of an object pointed to by a reference. In that + * case, using a pathname as the target object pointer is invalid. + */ if (!scope) return false; @@ -169,7 +163,17 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope, if (ACPI_FAILURE(status)) return false; - return acpi_nondev_subnode_data_ok(handle, link, list, parent); + status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, + ACPI_TYPE_PACKAGE); + if (ACPI_FAILURE(status)) + return false; + + if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, + parent)) + return true; + + ACPI_FREE(buf.pointer); + return false; } static bool acpi_add_nondev_subnodes(acpi_handle scope, @@ -180,9 +184,12 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, bool ret = false; int i; + /* + * Every element in the links package is expected to represent a link + * to a non-device node in a tree containing device-specific data. + */ for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; - acpi_handle handle; bool result; link = &links->package.elements[i]; @@ -190,26 +197,53 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope, if (link->package.count != 2) continue; - /* The first one must be a string. 
*/ + /* The first one (the key) must be a string. */ if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; - /* The second one may be a string, a reference or a package. */ + /* The second one (the target) may be a string or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: + /* + * The string is expected to be a full pathname or a + * pathname segment relative to the given scope. That + * pathname is expected to point to an object returning + * a package that contains _DSD-equivalent information. + */ result = acpi_nondev_subnode_ok(scope, link, list, parent); break; - case ACPI_TYPE_LOCAL_REFERENCE: - handle = link->package.elements[1].reference.handle; - result = acpi_nondev_subnode_data_ok(handle, link, list, - parent); - break; case ACPI_TYPE_PACKAGE: + /* + * This happens when a reference is used in AML to + * point to the target. Since the target is expected + * to be a named object, a reference to it will cause it + * to be evaluated in place and its return package will + * be embedded in the links package at the location of + * the reference. + * + * The target package is expected to contain _DSD- + * equivalent information, but the scope in which it + * is located in the original AML is unknown. Thus + * it cannot contain pathname segments represented as + * strings because there is no way to build full + * pathnames out of them. + */ + acpi_handle_debug(scope, "subnode %s: Unknown scope\n", + link->package.elements[0].string.pointer); desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; + case ACPI_TYPE_LOCAL_REFERENCE: + /* + * It is not expected to see any local references in + * the links package because referencing a named object + * should cause it to be evaluated in place. + */ + acpi_handle_info(scope, "subnode %s: Unexpected reference\n", + link->package.elements[0].string.pointer); + fallthrough; default: result = false; break; @@ -369,6 +403,9 @@ static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { + if (!dn->handle) + continue; + acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); @@ -383,6 +420,9 @@ static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) acpi_status status; bool ret; + if (!dn->handle) + continue; + status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); @@ -1318,6 +1358,28 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, return NULL; } +/* + * acpi_get_next_present_subnode - Return the next present child node handle + * @fwnode: Firmware node to find the next child node for. + * @child: Handle to one of the device's child nodes or a null handle. + * + * Like acpi_get_next_subnode(), but the device nodes returned by + * acpi_get_next_present_subnode() are guaranteed to be present. + * + * Returns: The fwnode handle of the next present sub-node. 
+ */ +static struct fwnode_handle * +acpi_get_next_present_subnode(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) +{ + do { + child = acpi_get_next_subnode(fwnode, child); + } while (is_acpi_device_node(child) && + !acpi_device_is_present(to_acpi_device_node(child))); + + return child; +} + /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get @@ -1662,7 +1724,7 @@ static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ - .get_next_child_node = acpi_get_next_subnode, \ + .get_next_child_node = acpi_get_next_present_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c index cd36a97b0ea2c7..d4d52d5e9016ca 100644 --- a/drivers/acpi/spcr.c +++ b/drivers/acpi/spcr.c @@ -141,12 +141,23 @@ int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console) case ACPI_DBG2_16550_NVIDIA: uart = "uart"; break; + case ACPI_DBG2_RISCV_SBI_CON: + uart = "sbi"; + break; default: err = -ENOENT; goto done; } - switch (table->baud_rate) { + /* + * SPCR 1.09 defines the Precise Baud Rate field, which contains a + * specific non-zero baud rate that overrides the value of the + * Configured Baud Rate field. If this field is zero or not present, + * the Configured Baud Rate is used. + */ + if (table->precise_baudrate) + baud_rate = table->precise_baudrate; + else switch (table->baud_rate) { case 0: /* * SPCR 1.04 defines 0 as a preconfigured state of UART. diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 064eb52ff7e2d4..1786d87b29e227 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -167,6 +167,12 @@ config PM_QOS_KUNIT_TEST depends on KUNIT=y default KUNIT_ALL_TESTS +config PM_RUNTIME_KUNIT_TEST + tristate "KUnit Tests for runtime PM" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on PM + default KUNIT_ALL_TESTS + config HMEM_REPORTING bool default n diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 12ffdd8437567f..04bdbff4dbe53f 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -171,17 +171,18 @@ static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, const struct auxiliary_device *auxdev) { - for (; id->name[0]; id++) { - const char *p = strrchr(dev_name(&auxdev->dev), '.'); - int match_size; + const char *auxdev_name = dev_name(&auxdev->dev); + const char *p = strrchr(auxdev_name, '.'); + int match_size; - if (!p) - continue; - match_size = p - dev_name(&auxdev->dev); + if (!p) + return NULL; + match_size = p - auxdev_name; + for (; id->name[0]; id++) { /* use dev_name(&auxdev->dev) prefix before last '.' 
char to match to */ if (strlen(id->name) == match_size && - !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + !strncmp(auxdev_name, id->name, match_size)) return id; } return NULL; @@ -217,17 +218,14 @@ static int auxiliary_bus_probe(struct device *dev) struct auxiliary_device *auxdev = to_auxiliary_dev(dev); int ret; - ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) { dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); return ret; } - ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); - if (ret) - dev_pm_domain_detach(dev, true); - - return ret; + return auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); } static void auxiliary_bus_remove(struct device *dev) @@ -237,7 +235,6 @@ static void auxiliary_bus_remove(struct device *dev) if (auxdrv->remove) auxdrv->remove(auxdev); - dev_pm_domain_detach(dev, true); } static void auxiliary_bus_shutdown(struct device *dev) diff --git a/drivers/base/base.h b/drivers/base/base.h index 700aecd22fd34a..86fa7fbb354891 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -248,6 +248,7 @@ void device_links_driver_cleanup(struct device *dev); void device_links_no_driver(struct device *dev); bool device_links_busy(struct device *dev); void device_links_unbind_consumers(struct device *dev); +bool device_link_flag_is_sync_state_only(u32 flags); void fw_devlink_drivers_done(void); void fw_devlink_probing_done(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index d22d6b23e75898..3c533dab8fa530 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -287,7 +287,7 @@ static bool device_is_ancestor(struct device *dev, struct device *target) #define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ DL_FLAG_CYCLE | \ DL_FLAG_MANAGED) -static inline bool device_link_flag_is_sync_state_only(u32 flags) +bool device_link_flag_is_sync_state_only(u32 flags) { return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; } @@ -3994,8 +3994,8 @@ const char *device_get_devnode(const struct device *dev, /** * device_for_each_child - device child iterator. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4024,8 +4024,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child); /** * device_for_each_child_reverse - device child iterator in reversed order. * @parent: parent struct device. - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, and call @fn for each, * passing it @data. @@ -4055,8 +4055,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse); * device_for_each_child_reverse_from - device child iterator in reversed order. * @parent: parent struct device. * @from: optional starting point in child list - * @fn: function to be called for each device. * @data: data for the callback. + * @fn: function to be called for each device. * * Iterate over @parent's child devices, starting at @from, and call @fn * for each, passing it @data. This helper is identical to @@ -4089,8 +4089,8 @@ EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from); /** * device_find_child - device iterator for locating a particular device. 
* @parent: parent struct device - * @match: Callback function to check device * @data: Data to pass to match function + * @match: Callback function to check device * * This is similar to the device_for_each_child() function above, but it * returns a reference to a device that is 'found' for later use, as @@ -5278,6 +5278,25 @@ void device_set_node(struct device *dev, struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(device_set_node); +/** + * get_dev_from_fwnode - Obtain a reference count of the struct device the + * struct fwnode_handle is associated with. + * @fwnode: The pointer to the struct fwnode_handle to obtain the struct device + * reference count of. + * + * This function obtains a reference count of the device the device pointer + * embedded in the struct fwnode_handle points to. + * + * Note that the struct device pointer embedded in struct fwnode_handle does + * *not* have a reference count of the struct device itself. + * + * Hence, it is a UAF (and thus a bug) to call this function if the caller can't + * guarantee that the last reference count of the corresponding struct device is + * not dropped concurrently. + * + * This is possible since struct fwnode_handle has its own reference count and + * hence can out-live the struct device it is associated with. + */ struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode) { return get_device((fwnode)->dev); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edda9..fa0a2eef93ac81 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -325,7 +325,7 @@ static void cpu_device_release(struct device *dev) * This is an empty function to prevent the driver core from spitting a * warning at us. Yes, I know this is directly opposite of what the * documentation for the driver core and kobjects say, and the author - * of this code has already been publically ridiculed for doing + * of this code has already been publicly ridiculed for doing * something as foolish as this. However, at this point in time, it is * the only way to handle the issue of statically allocated cpu * devices. 
The different architectures will have their cpu device @@ -603,6 +603,7 @@ CPU_SHOW_VULN_FALLBACK(ghostwrite); CPU_SHOW_VULN_FALLBACK(old_microcode); CPU_SHOW_VULN_FALLBACK(indirect_target_selection); CPU_SHOW_VULN_FALLBACK(tsa); +CPU_SHOW_VULN_FALLBACK(vmscape); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -622,6 +623,7 @@ static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL); static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL); +static DEVICE_ATTR(vmscape, 0444, cpu_show_vmscape, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -642,6 +644,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_old_microcode.attr, &dev_attr_indirect_target_selection.attr, &dev_attr_tsa.attr, + &dev_attr_vmscape.attr, NULL }; diff --git a/drivers/base/devres.c b/drivers/base/devres.c index ff55e1bcfa3005..c948c88d395607 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -1117,6 +1117,27 @@ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL_GPL(devm_kmemdup); +/** + * devm_kmemdup_const - conditionally duplicate and manage a region of memory + * + * @dev: Device this memory belongs to + * @src: memory region to duplicate + * @len: memory region length, + * @gfp: GFP mask to use + * + * Return: source address if it is in .rodata or the return value of kmemdup() + * to which the function falls back otherwise. + */ +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)src)) + return src; + + return devm_kmemdup(dev, src, len, gfp); +} +EXPORT_SYMBOL_GPL(devm_kmemdup_const); + struct pages_devres { unsigned long addr; unsigned int order; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c29..9d4e46ad835225 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; 
int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/drivers/base/faux.c b/drivers/base/faux.c index f5fbda0a9a44bd..21dd02124231a9 100644 --- a/drivers/base/faux.c +++ b/drivers/base/faux.c @@ -155,6 +155,7 @@ struct faux_device *faux_device_create_with_groups(const char *name, dev->parent = &faux_bus_root; dev->bus = &faux_bus_type; dev_set_name(dev, "%s", name); + device_set_pm_not_required(dev); ret = device_add(dev); if (ret) { diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 01f11629d241cd..2989e42d01611a 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o wakeup_stats.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_HAVE_CLK) += clock_ops.o obj-$(CONFIG_PM_QOS_KUNIT_TEST) += qos-test.o +obj-$(CONFIG_PM_RUNTIME_KUNIT_TEST) += runtime-test.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index b9a34c3425ecfa..e83503bdc1fdb8 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -278,7 +278,8 @@ static void dpm_wait_for_suppliers(struct device *dev, bool async) * walking. */ dev_for_each_link_to_supplier(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); @@ -335,7 +336,8 @@ static void dpm_wait_for_consumers(struct device *dev, bool async) * unregistration). */ dev_for_each_link_to_consumer(link, dev) - if (READ_ONCE(link->status) != DL_STATE_DORMANT) + if (READ_ONCE(link->status) != DL_STATE_DORMANT && + !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); diff --git a/drivers/base/power/runtime-test.c b/drivers/base/power/runtime-test.c new file mode 100644 index 00000000000000..eca9885e807d1f --- /dev/null +++ b/drivers/base/power/runtime-test.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google, Inc. 
+ */ + +#include <kunit/device.h> +#include <kunit/test.h> +#include <linux/device.h> +#include <linux/pm_runtime.h> + +#define DEVICE_NAME "pm_runtime_test_device" + +static void pm_runtime_depth_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_get_sync(dev)); /* "already active" */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* Test pm_runtime_put() and friends when already suspended. */ +static void pm_runtime_already_suspended_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); /* no wakeup needed */ + pm_runtime_put(dev); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 1, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + pm_runtime_get_noresume(dev); + pm_runtime_put_autosuspend(dev); + + /* Grab 2 refcounts */ + pm_runtime_get_noresume(dev); + pm_runtime_get_noresume(dev); + /* The first put() sees usage_count 1 */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync_autosuspend(dev)); + /* The second put() sees usage_count 0 but tells us "already suspended". */ + KUNIT_EXPECT_EQ(test, 1, pm_runtime_put_sync_autosuspend(dev)); + + /* Should have remained suspended the whole time. */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static void pm_runtime_idle_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + pm_runtime_put_noidle(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); +} + +static void pm_runtime_disabled_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + /* Never called pm_runtime_enable() */ + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); + + /* "disabled" is treated as "active" */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_suspended(dev)); + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. 
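+ * + * For example, the pm_runtime_get() below returns -EACCES because runtime PM is disabled, yet it still increments the usage count, so it is paired with a pm_runtime_put() right away.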
+ */ + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EACCES, pm_runtime_autosuspend(dev)); + + /* Still disabled */ + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + KUNIT_EXPECT_FALSE(test, pm_runtime_enabled(dev)); +} + +static void pm_runtime_error_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + /* Fake a .runtime_resume() error */ + dev->power.runtime_error = -EIO; + + /* + * Note: these "fail", but they still acquire/release refcounts, so + * keep them balanced. + */ + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_put_sync_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume_and_get(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EINVAL, pm_runtime_autosuspend(dev)); + + /* Error is still pending */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + KUNIT_EXPECT_EQ(test, -EIO, dev->power.runtime_error); + /* Clear error */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_suspended(dev)); + KUNIT_EXPECT_EQ(test, 0, dev->power.runtime_error); + /* Still suspended */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_barrier(dev)); /* resume was pending */ + pm_runtime_put(dev); + pm_runtime_suspend(dev); /* flush the put(), to suspend */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_get_sync(dev)); + pm_runtime_put_autosuspend(dev); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_resume_and_get(dev)); + + /* + * The following should all return -EAGAIN (usage is non-zero) or 1 + * (already resumed). 
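+ * pm_request_resume() and pm_runtime_resume() report 1 because the device is already RPM_ACTIVE, while the suspend and idle calls see the non-zero usage count and fail with -EAGAIN.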
+ */ + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_idle(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_idle(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_request_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_request_autosuspend(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_suspend(dev)); + KUNIT_EXPECT_EQ(test, 1, pm_runtime_resume(dev)); + KUNIT_EXPECT_EQ(test, -EAGAIN, pm_runtime_autosuspend(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_put_sync(dev)); + + /* Suspended again */ + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +/* + * Explore a typical probe() sequence in which a device marks itself powered, + * but doesn't hold any runtime PM reference, so it suspends as soon as it goes + * idle. + */ +static void pm_runtime_probe_active_test(struct kunit *test) +{ + struct device *dev = kunit_device_register(test, DEVICE_NAME); + + KUNIT_ASSERT_PTR_NE(test, NULL, dev); + + KUNIT_EXPECT_TRUE(test, pm_runtime_status_suspended(dev)); + + KUNIT_EXPECT_EQ(test, 0, pm_runtime_set_active(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + pm_runtime_enable(dev); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Nothing to flush. We stay active. */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_barrier(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_active(dev)); + + /* Ask for idle? Now we suspend. */ + KUNIT_EXPECT_EQ(test, 0, pm_runtime_idle(dev)); + KUNIT_EXPECT_TRUE(test, pm_runtime_suspended(dev)); +} + +static struct kunit_case pm_runtime_test_cases[] = { + KUNIT_CASE(pm_runtime_depth_test), + KUNIT_CASE(pm_runtime_already_suspended_test), + KUNIT_CASE(pm_runtime_idle_test), + KUNIT_CASE(pm_runtime_disabled_test), + KUNIT_CASE(pm_runtime_error_test), + KUNIT_CASE(pm_runtime_probe_active_test), + {} +}; + +static struct kunit_suite pm_runtime_test_suite = { + .name = "pm_runtime_test_cases", + .test_cases = pm_runtime_test_cases, +}; + +kunit_test_suite(pm_runtime_test_suite); +MODULE_DESCRIPTION("Runtime power management unit test suite"); +MODULE_LICENSE("GPL"); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7420b9851fe0fd..1b11a3cd4acc27 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -498,6 +498,9 @@ static int rpm_idle(struct device *dev, int rpmflags) if (retval < 0) ; /* Conditions are wrong. */ + else if ((rpmflags & RPM_GET_PUT) && retval == 1) + ; /* put() is allowed in RPM_SUSPENDED */ + /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; @@ -796,6 +799,8 @@ static int rpm_resume(struct device *dev, int rpmflags) if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; + else if (rpmflags & RPM_TRANSPARENT) + goto out; else retval = -EACCES; } diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 16bf590a65bedd..ce9be3989a218d 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1559,40 +1559,24 @@ static int _regmap_select_page(struct regmap *map, unsigned int *reg, return -EINVAL; } - /* - * It is possible to have selector register inside data window. - * In that case, selector register is located on every page and it - * needs no page switching, when accessed alone. - * - * Nevertheless we should synchronize the cache values for it. - */ + /* It is possible to have selector register inside data window. + In that case, selector register is located on every page and + it needs no page switching, when accessed alone. 
*/ if (val_num > 1 || range->window_start + win_offset != range->selector_reg) { - unsigned int page_off = win_page * range->window_len; - unsigned int sel_offset = range->selector_reg - range->window_start; - unsigned int sel_register = range->range_min + page_off + sel_offset; - unsigned int val = win_page << range->selector_shift; - unsigned int mask = range->selector_mask; - /* Use separate work_buf during page switching */ orig_work_buf = map->work_buf; map->work_buf = map->selector_work_buf; - ret = _regmap_update_bits(map, range->selector_reg, mask, val, + ret = _regmap_update_bits(map, range->selector_reg, + range->selector_mask, + win_page << range->selector_shift, &page_chg, false); map->work_buf = orig_work_buf; if (ret != 0) return ret; - - /* - * If selector register has been just updated, update the respective - * virtual copy as well. - */ - if (page_chg && - in_range(range->selector_reg, range->window_start, range->window_len)) - _regmap_update_bits(map, sel_register, mask, val, NULL, false); } *reg = range->window_start + win_offset; diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index deda7f35a05987..be1e9e61a7bf4d 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -844,7 +844,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, * of this function or by ordering the array such that parent comes before * child. */ -int software_node_register_node_group(const struct software_node **node_group) +int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; @@ -877,8 +877,7 @@ EXPORT_SYMBOL_GPL(software_node_register_node_group); * remove the nodes individually, in the correct order (child before * parent). */ -void software_node_unregister_node_group( - const struct software_node **node_group) +void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e09930c2b22627..91f3b8afb63ce6 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1330,6 +1330,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; else lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; if ((lim.discard_granularity >> SECTOR_SHIFT) > lim.max_hw_discard_sectors) { diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs index d07e76ae2c13f4..6366da12c5a5fd 100644 --- a/drivers/block/rnull.rs +++ b/drivers/block/rnull.rs @@ -51,7 +51,7 @@ impl kernel::InPlaceModule for NullBlkModule { .logical_block_size(4096)? .physical_block_size(4096)? 
.rotational(false) - .build(format_args!("rnullb{}", 0), tagset) + .build(fmt!("rnullb{}", 0), tagset) })(); try_pin_init!(Self { diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6ea1..f31652085adcb6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_SAME); zram_set_handle(zram, index, fill); zram_slot_unlock(zram, index); @@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, kunmap_local(src); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, PAGE_SIZE); @@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) unsigned long element; bool same_filled; - /* First, free memory allocated to this slot (if any) */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_slot_unlock(zram, index); - mem = kmap_local_page(page); same_filled = page_same_filled(mem, &element); kunmap_local(mem); @@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zstrm); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 4ab32abf0f4864..7df69ccb660053 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -312,7 +312,9 @@ config BT_HCIBCM4377 config BT_HCIBPA10X tristate "HCI BPA10x USB driver" + depends on BT_HCIUART depends on USB + select BT_HCIUART_H4 help Bluetooth HCI BPA10x USB driver. This driver provides support for the Digianswer BPA 100/105 Bluetooth @@ -437,8 +439,10 @@ config BT_MTKSDIO config BT_MTKUART tristate "MediaTek HCI UART driver" + depends on BT_HCIUART depends on SERIAL_DEV_BUS depends on USB || !BT_HCIBTUSB_MTK + select BT_HCIUART_H4 select BT_MTK help MediaTek Bluetooth HCI UART driver. 
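The BT_HCIBPA10X and BT_MTKUART hunks above, and the BT_NXPUART hunk just below, follow one pattern: drivers built on the shared H4 framing helpers now state that explicitly with "depends on BT_HCIUART" plus "select BT_HCIUART_H4". A minimal sketch of the receive path such a driver hangs off those helpers is shown next; the demo_* names and the serdev callback wiring are hypothetical, while h4_recv_buf() and the H4_RECV_* descriptors come from hci_uart.h and hci_recv_frame() from the Bluetooth core:

#include <linux/serdev.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>

#include "hci_uart.h"

/* Hypothetical per-device state; a real driver keeps this in its priv data. */
struct demo_dev {
	struct hci_dev *hdev;
	struct sk_buff *rx_skb;
};

static const struct h4_recv_pkt demo_recv_pkts[] = {
	{ H4_RECV_ACL,   .recv = hci_recv_frame },
	{ H4_RECV_SCO,   .recv = hci_recv_frame },
	{ H4_RECV_EVENT, .recv = hci_recv_frame },
};

static size_t demo_receive_buf(struct serdev_device *serdev, const u8 *data,
			       size_t count)
{
	struct demo_dev *demo = serdev_device_get_drvdata(serdev);

	/*
	 * Reassemble H4 frames from the raw UART byte stream; complete
	 * packets are handed to demo->hdev via the .recv callbacks above.
	 */
	demo->rx_skb = h4_recv_buf(demo->hdev, demo->rx_skb, data, count,
				   demo_recv_pkts, ARRAY_SIZE(demo_recv_pkts));
	if (IS_ERR(demo->rx_skb)) {
		/* Framing error: drop the partial packet and resynchronize. */
		demo->rx_skb = NULL;
	}

	return count;
}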
@@ -483,7 +487,9 @@ config BT_VIRTIO config BT_NXPUART tristate "NXP protocol support" + depends on BT_HCIUART depends on SERIAL_DEV_BUS + select BT_HCIUART_H4 select CRC32 select CRC8 help diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h index 5ea5dd80e297c7..cbbe79b241ce97 100644 --- a/drivers/bluetooth/hci_uart.h +++ b/drivers/bluetooth/hci_uart.h @@ -121,10 +121,6 @@ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable); void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed, unsigned int oper_speed); -#ifdef CONFIG_BT_HCIUART_H4 -int h4_init(void); -int h4_deinit(void); - struct h4_recv_pkt { u8 type; /* Packet type */ u8 hlen; /* Header length */ @@ -162,6 +158,10 @@ struct h4_recv_pkt { .lsize = 2, \ .maxlen = HCI_MAX_FRAME_SIZE \ +#ifdef CONFIG_BT_HCIUART_H4 +int h4_init(void); +int h4_deinit(void); + struct sk_buff *h4_recv_buf(struct hci_dev *hdev, struct sk_buff *skb, const unsigned char *buffer, int count, const struct h4_recv_pkt *pkts, int pkts_count); diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e66e..3f8b9041babf5b 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -14,7 +14,7 @@ #include "cdx_controller.h" #include "../cdx.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) { diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c index 61f1a290ff0890..59aabd99fa8f99 100644 --- a/drivers/cdx/controller/cdx_rpmsg.c +++ b/drivers/cdx/controller/cdx_rpmsg.c @@ -15,7 +15,7 @@ #include "../cdx.h" #include "cdx_controller.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static struct rpmsg_device_id cdx_rpmsg_id_table[] = { { .name = "mcdi_ipc" }, diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index e760f8d347cc19..2e82ffc18d89c5 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -23,9 +23,10 @@ #include #include #include +#include -#include "bitfield.h" -#include "mcdi.h" +#include +#include "mcdid.h" static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd); static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx); @@ -99,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd); } +/** + * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function allocates and initializes internal MCDI structures and resources + * for the CDX device, including the workqueue, locking primitives, and command + * tracking mechanisms. It sets the initial operating mode and prepares the device + * for MCDI operations. + * + * Return: + * * 0 - on success + * * -ENOMEM - if memory allocation or workqueue creation fails + */ int cdx_mcdi_init(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -128,7 +142,16 @@ int cdx_mcdi_init(struct cdx_mcdi *cdx) fail: return rc; } +EXPORT_SYMBOL_GPL(cdx_mcdi_init); +/** + * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function is responsible for cleaning up the MCDI (Management Controller Driver Interface) + * resources associated with a cdx_mcdi structure. Also destroys the mcdi workqueue. 
+ * + */ void cdx_mcdi_finish(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -143,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx) kfree(cdx->mcdi); cdx->mcdi = NULL; } +EXPORT_SYMBOL_GPL(cdx_mcdi_finish); static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups) { @@ -553,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi, cdx_mcdi_cmd_start_or_queue(mcdi, cmd); } +/** + * cdx_mcdi_process_cmd - Process an incoming MCDI response + * @cdx: Handle to the CDX MCDI structure + * @outbuf: Pointer to the response buffer received from the management controller + * @len: Length of the response buffer in bytes + * + * This function handles a response from the management controller. It locates the + * corresponding command using the sequence number embedded in the header, + * completes the command if it is still pending, and initiates any necessary cleanup. + * + * The function assumes that the response buffer is well-formed and at least one + * dword in size. + */ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len) { struct cdx_mcdi_iface *mcdi; @@ -590,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list); } +EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd); static void cdx_mcdi_cmd_work(struct work_struct *context) { @@ -757,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen, outlen_actual, false); } +EXPORT_SYMBOL_GPL(cdx_mcdi_rpc); /** * cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 885c69e6ebe5b6..8ae2d99be81e5b 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -5,7 +5,6 @@ #include -#include "mcdi.h" #include "mcdi_functions.h" int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx) diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index b9942affdc6b2d..57fd1bae706b96 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -8,7 +8,8 @@ #ifndef CDX_MCDI_FUNCTIONS_H #define CDX_MCDI_FUNCTIONS_H -#include "mcdi.h" +#include +#include "mcdid.h" #include "../cdx.h" /** diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h new file mode 100644 index 00000000000000..7fc29f099265e7 --- /dev/null +++ b/drivers/cdx/controller/mcdid.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2025, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDID_H +#define CDX_MCDID_H + +#include +#include +#include + +#include "mc_cdx_pcol.h" + +#ifdef DEBUG +#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) +#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) +#else +#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) +#define CDX_WARN_ON_PARANOID(x) do {} while (0) +#endif + +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) + +static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) +{ + return cdx->mcdi ? 
&cdx->mcdi->iface : NULL; +} + +int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + cdx_mcdi_async_completer *complete, + unsigned long cookie); +int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, + unsigned int timeout_jiffies); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. + */ +#define MCDI_BYTE(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ + *MCDI_PTR(_buf, _field)) +#define MCDI_WORD(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ + le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) +#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ + MC_CMD_ ## _name1, _value1) +#define MCDI_SET_QWORD(_buf, _field, _value) \ + do { \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ + CDX_DWORD, (u32)(_value)); \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ + CDX_DWORD, (u64)(_value) >> 32); \ + } while (0) +#define MCDI_QWORD(_buf, _field) \ + (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ + (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) + +#endif /* CDX_MCDID_H */ diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig index f4adc6feb3b229..92bed266d07cdd 100644 --- a/drivers/char/ipmi/Kconfig +++ b/drivers/char/ipmi/Kconfig @@ -84,6 +84,13 @@ config IPMI_IPMB bus, and it also supports direct messaging on the bus using IPMB direct messages. This module requires I2C support. +config IPMI_LS2K + bool 'Loongson-2K IPMI interface' + depends on LOONGARCH + select MFD_LS2K_BMC_CORE + help + Provides a driver for Loongson-2K IPMI interfaces. 
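Stepping back to the new mcdid.h above: MCDI_SET_QWORD() and MCDI_QWORD() deliberately split a 64-bit field into two consecutive 32-bit dwords, low half first, because (as the header's comment notes) 64-bit fields in MCDI buffers are only 32-bit aligned, so a single 64-bit access could fault on strict-alignment machines. A standalone model of the split and reassembly, host-endian for brevity where the kernel macros also perform little-endian conversion; set_qword()/get_qword() are illustrative names, not the CDX API:

#include <stdint.h>
#include <stdio.h>

static void set_qword(uint32_t dw[2], uint64_t v)
{
	dw[0] = (uint32_t)v;		/* low dword: first 32-bit store */
	dw[1] = (uint32_t)(v >> 32);	/* high dword: second 32-bit store */
}

static uint64_t get_qword(const uint32_t dw[2])
{
	return (uint64_t)dw[0] | ((uint64_t)dw[1] << 32);
}

int main(void)
{
	uint32_t dw[2];

	set_qword(dw, 0x1122334455667788ULL);
	printf("0x%llx\n", (unsigned long long)get_qword(dw));	/* 0x1122334455667788 */
	return 0;
}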
+ config IPMI_POWERNV depends on PPC_POWERNV tristate 'POWERNV (OPAL firmware) IPMI interface' diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile index e0944547c9d0ee..4ea450a82242fc 100644 --- a/drivers/char/ipmi/Makefile +++ b/drivers/char/ipmi/Makefile @@ -8,6 +8,7 @@ ipmi_si-y := ipmi_si_intf.o ipmi_kcs_sm.o ipmi_smic_sm.o ipmi_bt_sm.o \ ipmi_si_mem_io.o ipmi_si-$(CONFIG_HAS_IOPORT) += ipmi_si_port_io.o ipmi_si-$(CONFIG_PCI) += ipmi_si_pci.o +ipmi_si-$(CONFIG_IPMI_LS2K) += ipmi_si_ls2k.o ipmi_si-$(CONFIG_PARISC) += ipmi_si_parisc.o obj-$(CONFIG_IPMI_HANDLER) += ipmi_msghandler.o diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 6a4f279c7c1f53..3a51e58b248754 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -404,8 +404,7 @@ static void ipmi_ipmb_shutdown(void *send_info) ipmi_ipmb_stop_thread(iidev); } -static void ipmi_ipmb_sender(void *send_info, - struct ipmi_smi_msg *msg) +static int ipmi_ipmb_sender(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_ipmb_dev *iidev = send_info; unsigned long flags; @@ -417,6 +416,7 @@ static void ipmi_ipmb_sender(void *send_info, spin_unlock_irqrestore(&iidev->lock, flags); up(&iidev->wake_thread); + return IPMI_CC_NO_ERROR; } static void ipmi_ipmb_request_events(void *send_info) diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c index ecfcb50302f6ce..efda90dcf5b3d0 100644 --- a/drivers/char/ipmi/ipmi_kcs_sm.c +++ b/drivers/char/ipmi/ipmi_kcs_sm.c @@ -122,10 +122,10 @@ struct si_sm_data { unsigned long error0_timeout; }; -static unsigned int init_kcs_data_with_state(struct si_sm_data *kcs, - struct si_sm_io *io, enum kcs_states state) +static unsigned int init_kcs_data(struct si_sm_data *kcs, + struct si_sm_io *io) { - kcs->state = state; + kcs->state = KCS_IDLE; kcs->io = io; kcs->write_pos = 0; kcs->write_count = 0; @@ -140,12 +140,6 @@ static unsigned int init_kcs_data_with_state(struct si_sm_data *kcs, return 2; } -static unsigned int init_kcs_data(struct si_sm_data *kcs, - struct si_sm_io *io) -{ - return init_kcs_data_with_state(kcs, io, KCS_IDLE); -} - static inline unsigned char read_status(struct si_sm_data *kcs) { return kcs->io->inputb(kcs->io, 1); @@ -276,7 +270,7 @@ static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data, if (size > MAX_KCS_WRITE_SIZE) return IPMI_REQ_LEN_EXCEEDED_ERR; - if (kcs->state != KCS_IDLE) { + if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) { dev_warn(kcs->io->dev, "KCS in invalid state %d\n", kcs->state); return IPMI_NOT_IN_MY_STATE_ERR; } @@ -501,7 +495,7 @@ static enum si_sm_result kcs_event(struct si_sm_data *kcs, long time) } if (kcs->state == KCS_HOSED) { - init_kcs_data_with_state(kcs, kcs->io, KCS_ERROR0); + init_kcs_data(kcs, kcs->io); return SI_SM_HOSED; } diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 8e9050f99e9eff..a0b67a35a5f048 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -38,7 +38,9 @@ #define IPMI_DRIVER_VERSION "39.2" -static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); +static struct ipmi_recv_msg *ipmi_alloc_recv_msg(struct ipmi_user *user); +static void ipmi_set_recv_msg_user(struct ipmi_recv_msg *msg, + struct ipmi_user *user); static int ipmi_init_msghandler(void); static void smi_work(struct work_struct *t); static void handle_new_recv_msgs(struct ipmi_smi *intf); @@ -50,6 +52,8 @@ static void intf_free(struct kref *ref); static bool 
initialized; static bool drvregistered; +static struct timer_list ipmi_timer; + /* Numbers in this enumerator should be mapped to ipmi_panic_event_str */ enum ipmi_panic_event_op { IPMI_SEND_PANIC_EVENT_NONE, @@ -432,6 +436,7 @@ struct ipmi_smi { atomic_t nr_users; struct device_attribute nr_users_devattr; struct device_attribute nr_msgs_devattr; + struct device_attribute maintenance_mode_devattr; /* Used for wake ups at startup. */ @@ -464,7 +469,7 @@ struct ipmi_smi { * interface to match them up with their responses. A routine * is called periodically to time the items in this list. */ - spinlock_t seq_lock; + struct mutex seq_lock; struct seq_table seq_table[IPMI_IPMB_NUM_SEQ]; int curr_seq; @@ -539,7 +544,11 @@ struct ipmi_smi { /* For handling of maintenance mode. */ int maintenance_mode; - bool maintenance_mode_enable; + +#define IPMI_MAINTENANCE_MODE_STATE_OFF 0 +#define IPMI_MAINTENANCE_MODE_STATE_FIRMWARE 1 +#define IPMI_MAINTENANCE_MODE_STATE_RESET 2 + int maintenance_mode_state; int auto_maintenance_timeout; spinlock_t maintenance_mode_lock; /* Used in a timer... */ @@ -955,7 +964,6 @@ static int deliver_response(struct ipmi_smi *intf, struct ipmi_recv_msg *msg) * risk. At this moment, simply skip it in that case. */ ipmi_free_recv_msg(msg); - atomic_dec(&msg->user->nr_msgs); } else { /* * Deliver it in smi_work. The message will hold a @@ -1116,12 +1124,11 @@ static int intf_find_seq(struct ipmi_smi *intf, struct ipmi_recv_msg **recv_msg) { int rv = -ENODEV; - unsigned long flags; if (seq >= IPMI_IPMB_NUM_SEQ) return -EINVAL; - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (intf->seq_table[seq].inuse) { struct ipmi_recv_msg *msg = intf->seq_table[seq].recv_msg; @@ -1134,7 +1141,7 @@ static int intf_find_seq(struct ipmi_smi *intf, rv = 0; } } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); return rv; } @@ -1145,14 +1152,13 @@ static int intf_start_seq_timer(struct ipmi_smi *intf, long msgid) { int rv = -ENODEV; - unsigned long flags; unsigned char seq; unsigned long seqid; GET_SEQ_FROM_MSGID(msgid, seq, seqid); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * We do this verification because the user can be deleted * while a message is outstanding. @@ -1163,7 +1169,7 @@ static int intf_start_seq_timer(struct ipmi_smi *intf, ent->timeout = ent->orig_timeout; rv = 0; } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); return rv; } @@ -1174,7 +1180,6 @@ static int intf_err_seq(struct ipmi_smi *intf, unsigned int err) { int rv = -ENODEV; - unsigned long flags; unsigned char seq; unsigned long seqid; struct ipmi_recv_msg *msg = NULL; @@ -1182,7 +1187,7 @@ static int intf_err_seq(struct ipmi_smi *intf, GET_SEQ_FROM_MSGID(msgid, seq, seqid); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * We do this verification because the user can be deleted * while a message is outstanding. 
@@ -1196,7 +1201,7 @@ static int intf_err_seq(struct ipmi_smi *intf, msg = ent->recv_msg; rv = 0; } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); if (msg) deliver_err_response(intf, msg, err); @@ -1209,7 +1214,6 @@ int ipmi_create_user(unsigned int if_num, void *handler_data, struct ipmi_user **user) { - unsigned long flags; struct ipmi_user *new_user = NULL; int rv = 0; struct ipmi_smi *intf; @@ -1277,9 +1281,9 @@ int ipmi_create_user(unsigned int if_num, new_user->gets_events = false; mutex_lock(&intf->users_mutex); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); list_add(&new_user->link, &intf->users); - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); mutex_unlock(&intf->users_mutex); if (handler->ipmi_watchdog_pretimeout) @@ -1325,7 +1329,6 @@ static void _ipmi_destroy_user(struct ipmi_user *user) { struct ipmi_smi *intf = user->intf; int i; - unsigned long flags; struct cmd_rcvr *rcvr; struct cmd_rcvr *rcvrs = NULL; struct ipmi_recv_msg *msg, *msg2; @@ -1346,7 +1349,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user) list_del(&user->link); atomic_dec(&intf->nr_users); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++) { if (intf->seq_table[i].inuse && (intf->seq_table[i].recv_msg->user == user)) { @@ -1355,7 +1358,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user) ipmi_free_recv_msg(intf->seq_table[i].recv_msg); } } - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); /* * Remove the user from the command receiver's table. First @@ -1534,8 +1537,15 @@ EXPORT_SYMBOL(ipmi_get_maintenance_mode); static void maintenance_mode_update(struct ipmi_smi *intf) { if (intf->handlers->set_maintenance_mode) + /* + * Lower level drivers only care about firmware mode + * as it affects their timing. They don't care about + * reset, which disables all commands for a while. + */ intf->handlers->set_maintenance_mode( - intf->send_info, intf->maintenance_mode_enable); + intf->send_info, + (intf->maintenance_mode_state == + IPMI_MAINTENANCE_MODE_STATE_FIRMWARE)); } int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode) @@ -1552,16 +1562,17 @@ int ipmi_set_maintenance_mode(struct ipmi_user *user, int mode) if (intf->maintenance_mode != mode) { switch (mode) { case IPMI_MAINTENANCE_MODE_AUTO: - intf->maintenance_mode_enable - = (intf->auto_maintenance_timeout > 0); + /* Just leave it alone. 
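+			 * In auto mode the state is driven by traffic instead:
+			 * sending a maintenance command raises it and the
+			 * timeout handler lowers it.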
*/ break; case IPMI_MAINTENANCE_MODE_OFF: - intf->maintenance_mode_enable = false; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_OFF; break; case IPMI_MAINTENANCE_MODE_ON: - intf->maintenance_mode_enable = true; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_FIRMWARE; break; default: @@ -1616,8 +1627,7 @@ int ipmi_set_gets_events(struct ipmi_user *user, bool val) } list_for_each_entry_safe(msg, msg2, &msgs, link) { - msg->user = user; - kref_get(&user->refcount); + ipmi_set_recv_msg_user(msg, user); deliver_local_response(intf, msg); } } @@ -1922,14 +1932,20 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, if (is_maintenance_mode_cmd(msg)) { unsigned long flags; + int newst; + + if (msg->netfn == IPMI_NETFN_FIRMWARE_REQUEST) + newst = IPMI_MAINTENANCE_MODE_STATE_FIRMWARE; + else + newst = IPMI_MAINTENANCE_MODE_STATE_RESET; spin_lock_irqsave(&intf->maintenance_mode_lock, flags); - intf->auto_maintenance_timeout - = maintenance_mode_timeout_ms; + intf->auto_maintenance_timeout = maintenance_mode_timeout_ms; if (!intf->maintenance_mode - && !intf->maintenance_mode_enable) { - intf->maintenance_mode_enable = true; + && intf->maintenance_mode_state < newst) { + intf->maintenance_mode_state = newst; maintenance_mode_update(intf); + mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES); } spin_unlock_irqrestore(&intf->maintenance_mode_lock, flags); @@ -1943,7 +1959,7 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, smi_msg->data[0] = (msg->netfn << 2) | (smi_addr->lun & 0x3); smi_msg->data[1] = msg->cmd; smi_msg->msgid = msgid; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; if (msg->data_len > 0) memcpy(&smi_msg->data[2], msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 2; @@ -2024,12 +2040,9 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { - /* It's a command, so get a sequence for it. */ - unsigned long flags; - - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (is_maintenance_mode_cmd(msg)) intf->ipmb_maintenance_mode_timeout = @@ -2087,7 +2100,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * to be correct. */ out_err: - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); } return rv; @@ -2140,7 +2153,7 @@ static int i_ipmi_req_ipmb_direct(struct ipmi_smi *intf, memcpy(smi_msg->data + 4, msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 4; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; return 0; } @@ -2203,12 +2216,9 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { - /* It's a command, so get a sequence for it. */ - unsigned long flags; - - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); /* * Create a sequence number with a 1 second @@ -2257,7 +2267,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * to be correct. 
*/ out_err: - spin_unlock_irqrestore(&intf->seq_lock, flags); + mutex_unlock(&intf->seq_lock); } return rv; @@ -2288,22 +2298,15 @@ static int i_ipmi_request(struct ipmi_user *user, int run_to_completion = READ_ONCE(intf->run_to_completion); int rv = 0; - if (user) { - if (atomic_add_return(1, &user->nr_msgs) > max_msgs_per_user) { - /* Decrement will happen at the end of the routine. */ - rv = -EBUSY; - goto out; - } - } - - if (supplied_recv) + if (supplied_recv) { recv_msg = supplied_recv; - else { - recv_msg = ipmi_alloc_recv_msg(); - if (recv_msg == NULL) { - rv = -ENOMEM; - goto out; - } + recv_msg->user = user; + if (user) + atomic_inc(&user->nr_msgs); + } else { + recv_msg = ipmi_alloc_recv_msg(user); + if (IS_ERR(recv_msg)) + return PTR_ERR(recv_msg); } recv_msg->user_msg_data = user_msg_data; @@ -2314,22 +2317,22 @@ static int i_ipmi_request(struct ipmi_user *user, if (smi_msg == NULL) { if (!supplied_recv) ipmi_free_recv_msg(recv_msg); - rv = -ENOMEM; - goto out; + return -ENOMEM; } } if (!run_to_completion) mutex_lock(&intf->users_mutex); + if (intf->maintenance_mode_state == IPMI_MAINTENANCE_MODE_STATE_RESET) { + /* No messages while the BMC is in reset. */ + rv = -EBUSY; + goto out_err; + } if (intf->in_shutdown) { rv = -ENODEV; goto out_err; } - recv_msg->user = user; - if (user) - /* The put happens when the message is freed. */ - kref_get(&user->refcount); recv_msg->msgid = msgid; /* * Store the message to send in the receive message so timeout @@ -2358,8 +2361,10 @@ static int i_ipmi_request(struct ipmi_user *user, if (rv) { out_err: - ipmi_free_smi_msg(smi_msg); - ipmi_free_recv_msg(recv_msg); + if (!supplied_smi) + ipmi_free_smi_msg(smi_msg); + if (!supplied_recv) + ipmi_free_recv_msg(recv_msg); } else { dev_dbg(intf->si_dev, "Send: %*ph\n", smi_msg->data_size, smi_msg->data); @@ -2369,9 +2374,6 @@ static int i_ipmi_request(struct ipmi_user *user, if (!run_to_completion) mutex_unlock(&intf->users_mutex); -out: - if (rv && user) - atomic_dec(&user->nr_msgs); return rv; } @@ -2622,6 +2624,12 @@ static int __bmc_get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc, (bmc->dyn_id_set && time_is_after_jiffies(bmc->dyn_id_expiry))) goto out_noprocessing; + /* Don't allow sysfs access when in maintenance mode. 
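+	 * A BMC that is mid-reset or mid-firmware-update may not answer, so
+	 * fail fast with -EBUSY rather than letting the ID refresh time out.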
*/ + if (intf->maintenance_mode_state) { + rv = -EBUSY; + goto out_noprocessing; + } + prev_guid_set = bmc->dyn_guid_set; __get_guid(intf); @@ -3517,6 +3525,19 @@ static ssize_t nr_msgs_show(struct device *dev, } static DEVICE_ATTR_RO(nr_msgs); +static ssize_t maintenance_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipmi_smi *intf = container_of(attr, + struct ipmi_smi, + maintenance_mode_devattr); + + return sysfs_emit(buf, "%u %d\n", intf->maintenance_mode_state, + intf->auto_maintenance_timeout); +} +static DEVICE_ATTR_RO(maintenance_mode); + static void redo_bmc_reg(struct work_struct *work) { struct ipmi_smi *intf = container_of(work, struct ipmi_smi, @@ -3575,7 +3596,7 @@ int ipmi_add_smi(struct module *owner, atomic_set(&intf->nr_users, 0); intf->handlers = handlers; intf->send_info = send_info; - spin_lock_init(&intf->seq_lock); + mutex_init(&intf->seq_lock); for (j = 0; j < IPMI_IPMB_NUM_SEQ; j++) { intf->seq_table[j].inuse = 0; intf->seq_table[j].seqid = 0; @@ -3653,6 +3674,14 @@ int ipmi_add_smi(struct module *owner, goto out_err_bmc_reg; } + intf->maintenance_mode_devattr = dev_attr_maintenance_mode; + sysfs_attr_init(&intf->maintenance_mode_devattr.attr); + rv = device_create_file(intf->si_dev, &intf->maintenance_mode_devattr); + if (rv) { + device_remove_file(intf->si_dev, &intf->nr_users_devattr); + goto out_err_bmc_reg; + } + intf->intf_num = i; mutex_unlock(&ipmi_interfaces_mutex); @@ -3760,6 +3789,7 @@ void ipmi_unregister_smi(struct ipmi_smi *intf) if (intf->handlers->shutdown) intf->handlers->shutdown(intf->send_info); + device_remove_file(intf->si_dev, &intf->maintenance_mode_devattr); device_remove_file(intf->si_dev, &intf->nr_msgs_devattr); device_remove_file(intf->si_dev, &intf->nr_users_devattr); @@ -3862,7 +3892,7 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_ipmb_addr *ipmb_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; if (msg->rsp_size < 10) { /* Message not big enough, just ignore it. */ @@ -3883,9 +3913,8 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -3915,47 +3944,41 @@ static int handle_ipmb_get_msg_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr; - ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE; - ipmb_addr->slave_addr = msg->rsp[6]; - ipmb_addr->lun = msg->rsp[7] & 3; - ipmb_addr->channel = msg->rsp[3] & 0xf; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr; + ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE; + ipmb_addr->slave_addr = msg->rsp[6]; + ipmb_addr->lun = msg->rsp[7] & 3; + ipmb_addr->channel = msg->rsp[3] & 0xf; - /* - * Extract the rest of the message information - * from the IPMB header. 
- */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = msg->rsp[7] >> 2; - recv_msg->msg.netfn = msg->rsp[4] >> 2; - recv_msg->msg.cmd = msg->rsp[8]; - recv_msg->msg.data = recv_msg->msg_data; + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = msg->rsp[7] >> 2; + recv_msg->msg.netfn = msg->rsp[4] >> 2; + recv_msg->msg.cmd = msg->rsp[8]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * We chop off 10, not 9 bytes because the checksum - * at the end also needs to be removed. - */ - recv_msg->msg.data_len = msg->rsp_size - 10; - memcpy(recv_msg->msg_data, &msg->rsp[9], - msg->rsp_size - 10); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * We chop off 10, not 9 bytes because the checksum + * at the end also needs to be removed. + */ + recv_msg->msg.data_len = msg->rsp_size - 10; + memcpy(recv_msg->msg_data, &msg->rsp[9], + msg->rsp_size - 10); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -3968,7 +3991,7 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, int rv = 0; struct ipmi_user *user = NULL; struct ipmi_ipmb_direct_addr *daddr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; unsigned char netfn = msg->rsp[0] >> 2; unsigned char cmd = msg->rsp[3]; @@ -3977,9 +4000,8 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, 0); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -4001,44 +4023,38 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - daddr = (struct ipmi_ipmb_direct_addr *)&recv_msg->addr; - daddr->addr_type = IPMI_IPMB_DIRECT_ADDR_TYPE; - daddr->channel = 0; - daddr->slave_addr = msg->rsp[1]; - daddr->rs_lun = msg->rsp[0] & 3; - daddr->rq_lun = msg->rsp[2] & 3; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + daddr = (struct ipmi_ipmb_direct_addr *)&recv_msg->addr; + daddr->addr_type = IPMI_IPMB_DIRECT_ADDR_TYPE; + daddr->channel = 0; + daddr->slave_addr = msg->rsp[1]; + daddr->rs_lun = msg->rsp[0] & 3; + daddr->rq_lun = msg->rsp[2] & 3; - /* - * Extract the rest of the message information - * from the IPMB header. 
- */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = (msg->rsp[2] >> 2); - recv_msg->msg.netfn = msg->rsp[0] >> 2; - recv_msg->msg.cmd = msg->rsp[3]; - recv_msg->msg.data = recv_msg->msg_data; - - recv_msg->msg.data_len = msg->rsp_size - 4; - memcpy(recv_msg->msg_data, msg->rsp + 4, - msg->rsp_size - 4); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = (msg->rsp[2] >> 2); + recv_msg->msg.netfn = msg->rsp[0] >> 2; + recv_msg->msg.cmd = msg->rsp[3]; + recv_msg->msg.data = recv_msg->msg_data; + + recv_msg->msg.data_len = msg->rsp_size - 4; + memcpy(recv_msg->msg_data, msg->rsp + 4, + msg->rsp_size - 4); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -4050,7 +4066,7 @@ static int handle_ipmb_direct_rcv_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_ipmb_direct_addr *daddr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI direct message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4152,7 +4168,7 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_lan_addr *lan_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; if (msg->rsp_size < 12) { /* Message not big enough, just ignore it. */ @@ -4173,9 +4189,8 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -4206,49 +4221,44 @@ static int handle_lan_get_msg_cmd(struct ipmi_smi *intf, * causes it to not be freed or queued. */ rv = -1; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* Extract the source address from the data. */ - lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr; - lan_addr->addr_type = IPMI_LAN_ADDR_TYPE; - lan_addr->session_handle = msg->rsp[4]; - lan_addr->remote_SWID = msg->rsp[8]; - lan_addr->local_SWID = msg->rsp[5]; - lan_addr->lun = msg->rsp[9] & 3; - lan_addr->channel = msg->rsp[3] & 0xf; - lan_addr->privilege = msg->rsp[3] >> 4; + } else if (!IS_ERR(recv_msg)) { + /* Extract the source address from the data. */ + lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr; + lan_addr->addr_type = IPMI_LAN_ADDR_TYPE; + lan_addr->session_handle = msg->rsp[4]; + lan_addr->remote_SWID = msg->rsp[8]; + lan_addr->local_SWID = msg->rsp[5]; + lan_addr->lun = msg->rsp[9] & 3; + lan_addr->channel = msg->rsp[3] & 0xf; + lan_addr->privilege = msg->rsp[3] >> 4; - /* - * Extract the rest of the message information - * from the IPMB header. 
- */ - recv_msg->user = user; - recv_msg->recv_type = IPMI_CMD_RECV_TYPE; - recv_msg->msgid = msg->rsp[9] >> 2; - recv_msg->msg.netfn = msg->rsp[6] >> 2; - recv_msg->msg.cmd = msg->rsp[10]; - recv_msg->msg.data = recv_msg->msg_data; + /* + * Extract the rest of the message information + * from the IPMB header. + */ + recv_msg->recv_type = IPMI_CMD_RECV_TYPE; + recv_msg->msgid = msg->rsp[9] >> 2; + recv_msg->msg.netfn = msg->rsp[6] >> 2; + recv_msg->msg.cmd = msg->rsp[10]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * We chop off 12, not 11 bytes because the checksum - * at the end also needs to be removed. - */ - recv_msg->msg.data_len = msg->rsp_size - 12; - memcpy(recv_msg->msg_data, &msg->rsp[11], - msg->rsp_size - 12); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * We chop off 12, not 11 bytes because the checksum + * at the end also needs to be removed. + */ + recv_msg->msg.data_len = msg->rsp_size - 12; + memcpy(recv_msg->msg_data, &msg->rsp[11], + msg->rsp_size - 12); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -4270,7 +4280,7 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, unsigned char chan; struct ipmi_user *user = NULL; struct ipmi_system_interface_addr *smi_addr; - struct ipmi_recv_msg *recv_msg; + struct ipmi_recv_msg *recv_msg = NULL; /* * We expect the OEM SW to perform error checking @@ -4299,9 +4309,8 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, rcvr = find_cmd_rcvr(intf, netfn, cmd, chan); if (rcvr) { user = rcvr->user; - kref_get(&user->refcount); - } else - user = NULL; + recv_msg = ipmi_alloc_recv_msg(user); + } rcu_read_unlock(); if (user == NULL) { @@ -4314,48 +4323,42 @@ static int handle_oem_get_msg_cmd(struct ipmi_smi *intf, */ rv = 0; - } else { - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { - /* - * We couldn't allocate memory for the - * message, so requeue it for handling - * later. - */ - rv = 1; - kref_put(&user->refcount, free_ipmi_user); - } else { - /* - * OEM Messages are expected to be delivered via - * the system interface to SMS software. We might - * need to visit this again depending on OEM - * requirements - */ - smi_addr = ((struct ipmi_system_interface_addr *) - &recv_msg->addr); - smi_addr->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; - smi_addr->channel = IPMI_BMC_CHANNEL; - smi_addr->lun = msg->rsp[0] & 3; - - recv_msg->user = user; - recv_msg->user_msg_data = NULL; - recv_msg->recv_type = IPMI_OEM_RECV_TYPE; - recv_msg->msg.netfn = msg->rsp[0] >> 2; - recv_msg->msg.cmd = msg->rsp[1]; - recv_msg->msg.data = recv_msg->msg_data; + } else if (!IS_ERR(recv_msg)) { + /* + * OEM Messages are expected to be delivered via + * the system interface to SMS software. 
We might + * need to visit this again depending on OEM + * requirements + */ + smi_addr = ((struct ipmi_system_interface_addr *) + &recv_msg->addr); + smi_addr->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; + smi_addr->channel = IPMI_BMC_CHANNEL; + smi_addr->lun = msg->rsp[0] & 3; + + recv_msg->user_msg_data = NULL; + recv_msg->recv_type = IPMI_OEM_RECV_TYPE; + recv_msg->msg.netfn = msg->rsp[0] >> 2; + recv_msg->msg.cmd = msg->rsp[1]; + recv_msg->msg.data = recv_msg->msg_data; - /* - * The message starts at byte 4 which follows the - * Channel Byte in the "GET MESSAGE" command - */ - recv_msg->msg.data_len = msg->rsp_size - 4; - memcpy(recv_msg->msg_data, &msg->rsp[4], - msg->rsp_size - 4); - if (deliver_response(intf, recv_msg)) - ipmi_inc_stat(intf, unhandled_commands); - else - ipmi_inc_stat(intf, handled_commands); - } + /* + * The message starts at byte 4 which follows the + * Channel Byte in the "GET MESSAGE" command + */ + recv_msg->msg.data_len = msg->rsp_size - 4; + memcpy(recv_msg->msg_data, &msg->rsp[4], + msg->rsp_size - 4); + if (deliver_response(intf, recv_msg)) + ipmi_inc_stat(intf, unhandled_commands); + else + ipmi_inc_stat(intf, handled_commands); + } else { + /* + * We couldn't allocate memory for the message, so + * requeue it for handling later. + */ + rv = 1; } return rv; @@ -4413,8 +4416,8 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, if (!user->gets_events) continue; - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { + recv_msg = ipmi_alloc_recv_msg(user); + if (IS_ERR(recv_msg)) { mutex_unlock(&intf->users_mutex); list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) { @@ -4435,8 +4438,6 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, deliver_count++; copy_event_into_recv_msg(recv_msg, msg); - recv_msg->user = user; - kref_get(&user->refcount); list_add_tail(&recv_msg->link, &msgs); } mutex_unlock(&intf->users_mutex); @@ -4452,8 +4453,8 @@ static int handle_read_event_rsp(struct ipmi_smi *intf, * No one to receive the message, put it in queue if there's * not already too many things in the queue. */ - recv_msg = ipmi_alloc_recv_msg(); - if (!recv_msg) { + recv_msg = ipmi_alloc_recv_msg(NULL); + if (IS_ERR(recv_msg)) { /* * We couldn't allocate memory for the * message, so requeue it for handling @@ -4488,7 +4489,7 @@ static int handle_bmc_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_system_interface_addr *smi_addr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI SMI message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4529,9 +4530,10 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, if (msg->rsp_size < 2) { /* Message is too small to be correct. */ - dev_warn(intf->si_dev, - "BMC returned too small a message for netfn %x cmd %x, got %d bytes\n", - (msg->data[0] >> 2) | 1, msg->data[1], msg->rsp_size); + dev_warn_ratelimited(intf->si_dev, + "BMC returned too small a message for netfn %x cmd %x, got %d bytes\n", + (msg->data[0] >> 2) | 1, + msg->data[1], msg->rsp_size); return_unspecified: /* Generate an error response for the message. 
*/ @@ -4561,14 +4563,14 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, } else if ((msg->data_size >= 2) && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2)) && (msg->data[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data == NULL)) { + && (msg->recv_msg == NULL)) { if (intf->in_shutdown || intf->run_to_completion) goto out; /* * This is the local response to a command send, start - * the timer for these. The user_data will not be + * the timer for these. The recv_msg will not be * NULL if this is a response send, and we will let * response sends just go through. */ @@ -4628,7 +4630,7 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, requeue = handle_ipmb_direct_rcv_rsp(intf, msg); } else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2)) && (msg->rsp[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data != NULL)) { + && (msg->recv_msg != NULL)) { /* * It's a response to a response we sent. For this we * deliver a send message response to the user. @@ -4645,7 +4647,7 @@ static int handle_one_recv_msg(struct ipmi_smi *intf, cc = msg->rsp[2]; process_response_response: - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; requeue = 0; if (!recv_msg) @@ -4801,6 +4803,7 @@ static void smi_work(struct work_struct *t) int run_to_completion = READ_ONCE(intf->run_to_completion); struct ipmi_smi_msg *newmsg = NULL; struct ipmi_recv_msg *msg, *msg2; + int cc; /* * Start the next message if available. @@ -4809,7 +4812,7 @@ static void smi_work(struct work_struct *t) * because the lower layer is allowed to hold locks while calling * message delivery. */ - +restart: if (!run_to_completion) spin_lock_irqsave(&intf->xmit_msgs_lock, flags); if (intf->curr_msg == NULL && !intf->in_shutdown) { @@ -4830,8 +4833,17 @@ static void smi_work(struct work_struct *t) if (!run_to_completion) spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); - if (newmsg) - intf->handlers->sender(intf->send_info, newmsg); + if (newmsg) { + cc = intf->handlers->sender(intf->send_info, newmsg); + if (cc) { + if (newmsg->recv_msg) + deliver_err_response(intf, + newmsg->recv_msg, cc); + else + ipmi_free_smi_msg(newmsg); + goto restart; + } + } handle_new_recv_msgs(intf); @@ -4868,12 +4880,10 @@ static void smi_work(struct work_struct *t) list_del(&msg->link); - if (refcount_read(&user->destroyed) == 0) { + if (refcount_read(&user->destroyed) == 0) ipmi_free_recv_msg(msg); - } else { - atomic_dec(&user->nr_msgs); + else user->handler->ipmi_recv_hndl(msg, user->handler_data); - } } mutex_unlock(&intf->user_msgs_mutex); @@ -4951,8 +4961,7 @@ smi_from_recv_msg(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg, static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, struct list_head *timeouts, unsigned long timeout_period, - int slot, unsigned long *flags, - bool *need_timer) + int slot, bool *need_timer) { struct ipmi_recv_msg *msg; @@ -5004,7 +5013,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, return; } - spin_unlock_irqrestore(&intf->seq_lock, *flags); + mutex_unlock(&intf->seq_lock); /* * Send the new message. We send with a zero @@ -5025,7 +5034,7 @@ static void check_msg_timeout(struct ipmi_smi *intf, struct seq_table *ent, } else ipmi_free_smi_msg(smi_msg); - spin_lock_irqsave(&intf->seq_lock, *flags); + mutex_lock(&intf->seq_lock); } } @@ -5052,7 +5061,7 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, * list. 
*/ INIT_LIST_HEAD(&timeouts); - spin_lock_irqsave(&intf->seq_lock, flags); + mutex_lock(&intf->seq_lock); if (intf->ipmb_maintenance_mode_timeout) { if (intf->ipmb_maintenance_mode_timeout <= timeout_period) intf->ipmb_maintenance_mode_timeout = 0; @@ -5062,8 +5071,8 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, for (i = 0; i < IPMI_IPMB_NUM_SEQ; i++) check_msg_timeout(intf, &intf->seq_table[i], &timeouts, timeout_period, i, - &flags, &need_timer); - spin_unlock_irqrestore(&intf->seq_lock, flags); + &need_timer); + mutex_unlock(&intf->seq_lock); list_for_each_entry_safe(msg, msg2, &timeouts, link) deliver_err_response(intf, msg, IPMI_TIMEOUT_COMPLETION_CODE); @@ -5083,7 +5092,9 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, -= timeout_period; if (!intf->maintenance_mode && (intf->auto_maintenance_timeout <= 0)) { - intf->maintenance_mode_enable = false; + intf->maintenance_mode_state = + IPMI_MAINTENANCE_MODE_STATE_OFF; + intf->auto_maintenance_timeout = 0; maintenance_mode_update(intf); } } @@ -5099,15 +5110,13 @@ static bool ipmi_timeout_handler(struct ipmi_smi *intf, static void ipmi_request_event(struct ipmi_smi *intf) { /* No event requests when in maintenance mode. */ - if (intf->maintenance_mode_enable) + if (intf->maintenance_mode_state) return; if (!intf->in_shutdown) intf->handlers->request_events(intf->send_info); } -static struct timer_list ipmi_timer; - static atomic_t stop_operation; static void ipmi_timeout_work(struct work_struct *work) @@ -5131,6 +5140,8 @@ static void ipmi_timeout_work(struct work_struct *work) } need_timer = true; } + if (intf->maintenance_mode_state) + need_timer = true; need_timer |= ipmi_timeout_handler(intf, IPMI_TIMEOUT_TIME); } @@ -5174,7 +5185,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC); if (rv) { rv->done = free_smi_msg; - rv->user_data = NULL; + rv->recv_msg = NULL; rv->type = IPMI_SMI_MSG_TYPE_NORMAL; atomic_inc(&smi_msg_inuse_count); } @@ -5190,27 +5201,51 @@ static void free_recv_msg(struct ipmi_recv_msg *msg) kfree(msg); } -static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void) +static struct ipmi_recv_msg *ipmi_alloc_recv_msg(struct ipmi_user *user) { struct ipmi_recv_msg *rv; + if (user) { + if (atomic_add_return(1, &user->nr_msgs) > max_msgs_per_user) { + atomic_dec(&user->nr_msgs); + return ERR_PTR(-EBUSY); + } + } + rv = kmalloc(sizeof(struct ipmi_recv_msg), GFP_ATOMIC); - if (rv) { - rv->user = NULL; - rv->done = free_recv_msg; - atomic_inc(&recv_msg_inuse_count); + if (!rv) { + if (user) + atomic_dec(&user->nr_msgs); + return ERR_PTR(-ENOMEM); } + + rv->user = user; + rv->done = free_recv_msg; + if (user) + kref_get(&user->refcount); + atomic_inc(&recv_msg_inuse_count); return rv; } void ipmi_free_recv_msg(struct ipmi_recv_msg *msg) { - if (msg->user && !oops_in_progress) + if (msg->user && !oops_in_progress) { + atomic_dec(&msg->user->nr_msgs); kref_put(&msg->user->refcount, free_ipmi_user); + } msg->done(msg); } EXPORT_SYMBOL(ipmi_free_recv_msg); +static void ipmi_set_recv_msg_user(struct ipmi_recv_msg *msg, + struct ipmi_user *user) +{ + WARN_ON_ONCE(msg->user); /* User should not be set. 
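+	 * Ownership is taken here: the per-user message count and the user
+	 * reference taken below are the ones dropped later by
+	 * ipmi_free_recv_msg().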
*/ + msg->user = user; + atomic_inc(&user->nr_msgs); + kref_get(&user->refcount); +} + static atomic_t panic_done_count = ATOMIC_INIT(0); static void dummy_smi_done_handler(struct ipmi_smi_msg *msg) diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c index 4a2efafcd1f855..52a1130defe537 100644 --- a/drivers/char/ipmi/ipmi_powernv.c +++ b/drivers/char/ipmi/ipmi_powernv.c @@ -51,7 +51,7 @@ static void send_error_reply(struct ipmi_smi_powernv *smi, ipmi_smi_msg_received(smi->intf, msg); } -static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) +static int ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_smi_powernv *smi = send_info; struct opal_ipmi_msg *opal_msg; @@ -93,18 +93,19 @@ static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) smi->interface_id, opal_msg, size); rc = opal_ipmi_send(smi->interface_id, opal_msg, size); pr_devel("%s: -> %d\n", __func__, rc); - - if (!rc) { - smi->cur_msg = msg; - spin_unlock_irqrestore(&smi->msg_lock, flags); - return; + if (rc) { + comp = IPMI_ERR_UNSPECIFIED; + goto err_unlock; } - comp = IPMI_ERR_UNSPECIFIED; + smi->cur_msg = msg; + spin_unlock_irqrestore(&smi->msg_lock, flags); + return IPMI_CC_NO_ERROR; + err_unlock: spin_unlock_irqrestore(&smi->msg_lock, flags); err: - send_error_reply(smi, msg, comp); + return comp; } static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi) diff --git a/drivers/char/ipmi/ipmi_si.h b/drivers/char/ipmi/ipmi_si.h index 508c3fd4587766..687835b53da588 100644 --- a/drivers/char/ipmi/ipmi_si.h +++ b/drivers/char/ipmi/ipmi_si.h @@ -101,6 +101,13 @@ void ipmi_si_pci_shutdown(void); static inline void ipmi_si_pci_init(void) { } static inline void ipmi_si_pci_shutdown(void) { } #endif +#ifdef CONFIG_IPMI_LS2K +void ipmi_si_ls2k_init(void); +void ipmi_si_ls2k_shutdown(void); +#else +static inline void ipmi_si_ls2k_init(void) { } +static inline void ipmi_si_ls2k_shutdown(void) { } +#endif #ifdef CONFIG_PARISC void ipmi_si_parisc_init(void); void ipmi_si_parisc_shutdown(void); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 8b5524069c15a3..70e55f5ff85e78 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -53,6 +53,7 @@ #define SI_TIMEOUT_JIFFIES (SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY) #define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a short timeout */ +#define SI_TIMEOUT_HOSED (HZ) /* 1 second when in hosed state. */ enum si_intf_state { SI_NORMAL, @@ -61,7 +62,8 @@ enum si_intf_state { SI_CLEARING_FLAGS, SI_GETTING_MESSAGES, SI_CHECKING_ENABLES, - SI_SETTING_ENABLES + SI_SETTING_ENABLES, + SI_HOSED /* FIXME - add watchdog stuff. 
*/ }; @@ -313,7 +315,7 @@ static void return_hosed_msg(struct smi_info *smi_info, int cCode) static enum si_sm_result start_next_msg(struct smi_info *smi_info) { - int rv; + int rv; if (!smi_info->waiting_msg) { smi_info->curr_msg = NULL; @@ -390,6 +392,17 @@ static void start_clear_flags(struct smi_info *smi_info) smi_info->si_state = SI_CLEARING_FLAGS; } +static void start_get_flags(struct smi_info *smi_info) +{ + unsigned char msg[2]; + + msg[0] = (IPMI_NETFN_APP_REQUEST << 2); + msg[1] = IPMI_GET_MSG_FLAGS_CMD; + + start_new_msg(smi_info, msg, 2); + smi_info->si_state = SI_GETTING_FLAGS; +} + static void start_getting_msg_queue(struct smi_info *smi_info) { smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2); @@ -742,6 +755,8 @@ static void handle_transaction_done(struct smi_info *smi_info) } break; } + case SI_HOSED: /* Shouldn't happen. */ + break; } } @@ -756,6 +771,10 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, enum si_sm_result si_sm_result; restart: + if (smi_info->si_state == SI_HOSED) + /* Just in case, hosed state is only left from the timeout. */ + return SI_SM_HOSED; + /* * There used to be a loop here that waited a little while * (around 25us) before giving up. That turned out to be @@ -779,18 +798,20 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, /* * Do the before return_hosed_msg, because that - * releases the lock. + * releases the lock. We just disable operations for + * a while and retry in hosed state. */ - smi_info->si_state = SI_NORMAL; + smi_info->si_state = SI_HOSED; if (smi_info->curr_msg != NULL) { /* * If we were handling a user message, format * a response to send to the upper layer to * tell it about the error. */ - return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED); + return_hosed_msg(smi_info, IPMI_BUS_ERR); } - goto restart; + smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED); + goto out; } /* @@ -798,8 +819,6 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, * this if there is not yet an upper layer to handle anything. */ if (si_sm_result == SI_SM_ATTN || smi_info->got_attn) { - unsigned char msg[2]; - if (smi_info->si_state != SI_NORMAL) { /* * We got an ATTN, but we are doing something else. @@ -817,11 +836,7 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, * interrupts work with the SMI, that's not really * possible. */ - msg[0] = (IPMI_NETFN_APP_REQUEST << 2); - msg[1] = IPMI_GET_MSG_FLAGS_CMD; - - start_new_msg(smi_info, msg, 2); - smi_info->si_state = SI_GETTING_FLAGS; + start_get_flags(smi_info); goto restart; } } @@ -894,27 +909,29 @@ static void flush_messages(void *send_info) * mode. This means we are single-threaded, no need for locks. */ result = smi_event_handler(smi_info, 0); - while (result != SI_SM_IDLE) { + while (result != SI_SM_IDLE && result != SI_SM_HOSED) { udelay(SI_SHORT_TIMEOUT_USEC); result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC); } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct smi_info *smi_info = send_info; unsigned long flags; debug_timestamp(smi_info, "Enqueue"); + if (smi_info->si_state == SI_HOSED) + return IPMI_BUS_ERR; + if (smi_info->run_to_completion) { /* * If we are running to completion, start it. Upper * layer will call flush_messages to clear it out. 
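+		 * Returning IPMI_CC_NO_ERROR tells the core the message was
+		 * accepted; a nonzero completion code now makes smi_work()
+		 * deliver an error response for it instead.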
*/ smi_info->waiting_msg = msg; - return; + return IPMI_CC_NO_ERROR; } spin_lock_irqsave(&smi_info->si_lock, flags); @@ -929,6 +946,7 @@ static void sender(void *send_info, smi_info->waiting_msg = msg; check_start_timer_thread(smi_info); spin_unlock_irqrestore(&smi_info->si_lock, flags); + return IPMI_CC_NO_ERROR; } static void set_run_to_completion(void *send_info, bool i_run_to_completion) @@ -1087,6 +1105,10 @@ static void smi_timeout(struct timer_list *t) spin_lock_irqsave(&(smi_info->si_lock), flags); debug_timestamp(smi_info, "Timer"); + if (smi_info->si_state == SI_HOSED) + /* Try something to see if the BMC is now operational. */ + start_get_flags(smi_info); + jiffies_now = jiffies; time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies) * SI_USEC_PER_JIFFY); @@ -1096,14 +1118,11 @@ static void smi_timeout(struct timer_list *t) /* Running with interrupts, only do long timeouts. */ timeout = jiffies + SI_TIMEOUT_JIFFIES; smi_inc_stat(smi_info, long_timeouts); - goto do_mod_timer; - } - - /* - * If the state machine asks for a short delay, then shorten - * the timer timeout. - */ - if (smi_result == SI_SM_CALL_WITH_DELAY) { + } else if (smi_result == SI_SM_CALL_WITH_DELAY) { + /* + * If the state machine asks for a short delay, then shorten + * the timer timeout. + */ smi_inc_stat(smi_info, short_timeouts); timeout = jiffies + 1; } else { @@ -1111,7 +1130,6 @@ static void smi_timeout(struct timer_list *t) timeout = jiffies + SI_TIMEOUT_JIFFIES; } -do_mod_timer: if (smi_result != SI_SM_IDLE) smi_mod_timer(smi_info, timeout); else @@ -2120,6 +2138,8 @@ static int __init init_ipmi_si(void) ipmi_si_pci_init(); + ipmi_si_ls2k_init(); + ipmi_si_parisc_init(); mutex_lock(&smi_infos_lock); @@ -2331,6 +2351,8 @@ static void cleanup_ipmi_si(void) ipmi_si_pci_shutdown(); + ipmi_si_ls2k_shutdown(); + ipmi_si_parisc_shutdown(); ipmi_si_platform_shutdown(); diff --git a/drivers/char/ipmi/ipmi_si_ls2k.c b/drivers/char/ipmi/ipmi_si_ls2k.c new file mode 100644 index 00000000000000..45442c257efdbe --- /dev/null +++ b/drivers/char/ipmi/ipmi_si_ls2k.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Loongson-2K BMC IPMI interface + * + * Copyright (C) 2024-2025 Loongson Technology Corporation Limited. 
+ * + * Authors: + * Chong Qiao + * Binbin Zhou + */ + +#include +#include +#include +#include + +#include "ipmi_si.h" + +#define LS2K_KCS_FIFO_IBFH 0x0 +#define LS2K_KCS_FIFO_IBFT 0x1 +#define LS2K_KCS_FIFO_OBFH 0x2 +#define LS2K_KCS_FIFO_OBFT 0x3 + +/* KCS registers */ +#define LS2K_KCS_REG_STS 0x4 +#define LS2K_KCS_REG_DATA_OUT 0x5 +#define LS2K_KCS_REG_DATA_IN 0x6 +#define LS2K_KCS_REG_CMD 0x8 + +#define LS2K_KCS_CMD_DATA 0xa +#define LS2K_KCS_VERSION 0xb +#define LS2K_KCS_WR_REQ 0xc +#define LS2K_KCS_WR_ACK 0x10 + +#define LS2K_KCS_STS_OBF BIT(0) +#define LS2K_KCS_STS_IBF BIT(1) +#define LS2K_KCS_STS_SMS_ATN BIT(2) +#define LS2K_KCS_STS_CMD BIT(3) + +#define LS2K_KCS_DATA_MASK (LS2K_KCS_STS_OBF | LS2K_KCS_STS_IBF | LS2K_KCS_STS_CMD) + +static bool ls2k_registered; + +static unsigned char ls2k_mem_inb_v0(const struct si_sm_io *io, unsigned int offset) +{ + void __iomem *addr = io->addr; + int reg_offset; + + if (offset & BIT(0)) { + reg_offset = LS2K_KCS_REG_STS; + } else { + writeb(readb(addr + LS2K_KCS_REG_STS) & ~LS2K_KCS_STS_OBF, addr + LS2K_KCS_REG_STS); + reg_offset = LS2K_KCS_REG_DATA_OUT; + } + + return readb(addr + reg_offset); +} + +static unsigned char ls2k_mem_inb_v1(const struct si_sm_io *io, unsigned int offset) +{ + void __iomem *addr = io->addr; + unsigned char inb = 0, cmd; + bool obf, ibf; + + obf = readb(addr + LS2K_KCS_FIFO_OBFH) ^ readb(addr + LS2K_KCS_FIFO_OBFT); + ibf = readb(addr + LS2K_KCS_FIFO_IBFH) ^ readb(addr + LS2K_KCS_FIFO_IBFT); + cmd = readb(addr + LS2K_KCS_CMD_DATA); + + if (offset & BIT(0)) { + inb = readb(addr + LS2K_KCS_REG_STS) & ~LS2K_KCS_DATA_MASK; + inb |= FIELD_PREP(LS2K_KCS_STS_OBF, obf) + | FIELD_PREP(LS2K_KCS_STS_IBF, ibf) + | FIELD_PREP(LS2K_KCS_STS_CMD, cmd); + } else { + inb = readb(addr + LS2K_KCS_REG_DATA_OUT); + writeb(readb(addr + LS2K_KCS_FIFO_OBFH), addr + LS2K_KCS_FIFO_OBFT); + } + + return inb; +} + +static void ls2k_mem_outb_v0(const struct si_sm_io *io, unsigned int offset, + unsigned char val) +{ + void __iomem *addr = io->addr; + unsigned char sts = readb(addr + LS2K_KCS_REG_STS); + int reg_offset; + + if (sts & LS2K_KCS_STS_IBF) + return; + + if (offset & BIT(0)) { + reg_offset = LS2K_KCS_REG_CMD; + sts |= LS2K_KCS_STS_CMD; + } else { + reg_offset = LS2K_KCS_REG_DATA_IN; + sts &= ~LS2K_KCS_STS_CMD; + } + + writew(val, addr + reg_offset); + writeb(sts | LS2K_KCS_STS_IBF, addr + LS2K_KCS_REG_STS); + writel(readl(addr + LS2K_KCS_WR_REQ) + 1, addr + LS2K_KCS_WR_REQ); +} + +static void ls2k_mem_outb_v1(const struct si_sm_io *io, unsigned int offset, + unsigned char val) +{ + void __iomem *addr = io->addr; + unsigned char ibfh, ibft; + int reg_offset; + + ibfh = readb(addr + LS2K_KCS_FIFO_IBFH); + ibft = readb(addr + LS2K_KCS_FIFO_IBFT); + + if (ibfh ^ ibft) + return; + + reg_offset = (offset & BIT(0)) ? LS2K_KCS_REG_CMD : LS2K_KCS_REG_DATA_IN; + writew(val, addr + reg_offset); + + writeb(offset & BIT(0), addr + LS2K_KCS_CMD_DATA); + writeb(!ibft, addr + LS2K_KCS_FIFO_IBFH); + writel(readl(addr + LS2K_KCS_WR_REQ) + 1, addr + LS2K_KCS_WR_REQ); +} + +static void ls2k_mem_cleanup(struct si_sm_io *io) +{ + if (io->addr) + iounmap(io->addr); +} + +static int ipmi_ls2k_mem_setup(struct si_sm_io *io) +{ + unsigned char version; + + io->addr = ioremap(io->addr_data, io->regspacing); + if (!io->addr) + return -EIO; + + version = readb(io->addr + LS2K_KCS_VERSION); + + io->inputb = version ? ls2k_mem_inb_v1 : ls2k_mem_inb_v0; + io->outputb = version ? 
ls2k_mem_outb_v1 : ls2k_mem_outb_v0; + io->io_cleanup = ls2k_mem_cleanup; + + return 0; +} + +static int ipmi_ls2k_probe(struct platform_device *pdev) +{ + struct si_sm_io io; + + memset(&io, 0, sizeof(io)); + + io.si_info = &ipmi_kcs_si_info; + io.io_setup = ipmi_ls2k_mem_setup; + io.addr_data = pdev->resource[0].start; + io.regspacing = resource_size(&pdev->resource[0]); + io.dev = &pdev->dev; + + dev_dbg(&pdev->dev, "addr 0x%lx, spacing %d.\n", io.addr_data, io.regspacing); + + return ipmi_si_add_smi(&io); +} + +static void ipmi_ls2k_remove(struct platform_device *pdev) +{ + ipmi_si_remove_by_dev(&pdev->dev); +} + +struct platform_driver ipmi_ls2k_platform_driver = { + .driver = { + .name = "ls2k-ipmi-si", + }, + .probe = ipmi_ls2k_probe, + .remove = ipmi_ls2k_remove, +}; + +void ipmi_si_ls2k_init(void) +{ + platform_driver_register(&ipmi_ls2k_platform_driver); + ls2k_registered = true; +} + +void ipmi_si_ls2k_shutdown(void) +{ + if (ls2k_registered) + platform_driver_unregister(&ipmi_ls2k_platform_driver); +} diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 1bc42830444dd0..1b63f7d2fcda5f 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1068,8 +1068,7 @@ static void start_next_msg(struct ssif_info *ssif_info, unsigned long *flags) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct ssif_info *ssif_info = send_info; unsigned long oflags, *flags; @@ -1089,6 +1088,7 @@ static void sender(void *send_info, msg->data[0], msg->data[1], (long long)t.tv_sec, (long)t.tv_nsec / NSEC_PER_USEC); } + return IPMI_CC_NO_ERROR; } static int get_smi_info(void *send_info, struct ipmi_smi_info *data) diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index dddd702b2454a6..ba3924eb13ba89 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -189,6 +189,15 @@ config TCG_IBMVTPM will be accessible from within Linux. To compile this driver as a module, choose M here; the module will be called tpm_ibmvtpm. +config TCG_LOONGSON + tristate "Loongson TPM Interface" + depends on MFD_LOONGSON_SE + help + If you want to make Loongson TPM support available, say Yes and + it will be accessible from within Linux. To compile this + driver as a module, choose M here; the module will be called + tpm_loongson. + config TCG_XEN tristate "XEN TPM Interface" depends on TCG_TPM && XEN diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 9de1b3ea34a9f2..5b5cdc0d32e4e5 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_TCG_ARM_CRB_FFA) += tpm_crb_ffa.o obj-$(CONFIG_TCG_VTPM_PROXY) += tpm_vtpm_proxy.o obj-$(CONFIG_TCG_FTPM_TEE) += tpm_ftpm_tee.o obj-$(CONFIG_TCG_SVSM) += tpm_svsm.o +obj-$(CONFIG_TCG_LOONGSON) += tpm_loongson.o diff --git a/drivers/char/tpm/tpm_loongson.c b/drivers/char/tpm/tpm_loongson.c new file mode 100644 index 00000000000000..9e50250763d16b --- /dev/null +++ b/drivers/char/tpm/tpm_loongson.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Loongson Technology Corporation Limited. 
*/ + +#include +#include +#include +#include + +#include "tpm.h" + +struct tpm_loongson_cmd { + u32 cmd_id; + u32 data_off; + u32 data_len; + u32 pad[5]; +}; + +static int tpm_loongson_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct loongson_se_engine *tpm_engine = dev_get_drvdata(&chip->dev); + struct tpm_loongson_cmd *cmd_ret = tpm_engine->command_ret; + + if (cmd_ret->data_len > count) + return -EIO; + + memcpy(buf, tpm_engine->data_buffer, cmd_ret->data_len); + + return cmd_ret->data_len; +} + +static int tpm_loongson_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz, size_t count) +{ + struct loongson_se_engine *tpm_engine = dev_get_drvdata(&chip->dev); + struct tpm_loongson_cmd *cmd = tpm_engine->command; + + if (count > tpm_engine->buffer_size) + return -E2BIG; + + cmd->data_len = count; + memcpy(tpm_engine->data_buffer, buf, count); + + return loongson_se_send_engine_cmd(tpm_engine); +} + +static const struct tpm_class_ops tpm_loongson_ops = { + .flags = TPM_OPS_AUTO_STARTUP, + .recv = tpm_loongson_recv, + .send = tpm_loongson_send, +}; + +static int tpm_loongson_probe(struct platform_device *pdev) +{ + struct loongson_se_engine *tpm_engine; + struct device *dev = &pdev->dev; + struct tpm_loongson_cmd *cmd; + struct tpm_chip *chip; + + tpm_engine = loongson_se_init_engine(dev->parent, SE_ENGINE_TPM); + if (!tpm_engine) + return -ENODEV; + cmd = tpm_engine->command; + cmd->cmd_id = SE_CMD_TPM; + cmd->data_off = tpm_engine->buffer_off; + + chip = tpmm_chip_alloc(dev, &tpm_loongson_ops); + if (IS_ERR(chip)) + return PTR_ERR(chip); + chip->flags = TPM_CHIP_FLAG_TPM2 | TPM_CHIP_FLAG_IRQ; + dev_set_drvdata(&chip->dev, tpm_engine); + + return tpm_chip_register(chip); +} + +static struct platform_driver tpm_loongson = { + .probe = tpm_loongson_probe, + .driver = { + .name = "tpm_loongson", + }, +}; +module_platform_driver(tpm_loongson); + +MODULE_ALIAS("platform:tpm_loongson"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Loongson TPM driver"); diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c index 5bc473c2adb33a..2f65fe2c6bdf07 100644 --- a/drivers/clk/renesas/clk-mstp.c +++ b/drivers/clk/renesas/clk-mstp.c @@ -303,6 +303,9 @@ void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev) pm_clk_destroy(dev); } +static struct device_node *cpg_mstp_pd_np __initdata = NULL; +static struct generic_pm_domain *cpg_mstp_pd_genpd __initdata = NULL; + void __init cpg_mstp_add_clk_domain(struct device_node *np) { struct generic_pm_domain *pd; @@ -324,5 +327,20 @@ void __init cpg_mstp_add_clk_domain(struct device_node *np) pd->detach_dev = cpg_mstp_detach_dev; pm_genpd_init(pd, &pm_domain_always_on_gov, false); - of_genpd_add_provider_simple(np, pd); + cpg_mstp_pd_np = of_node_get(np); + cpg_mstp_pd_genpd = pd; +} + +static int __init cpg_mstp_pd_init_provider(void) +{ + int error; + + if (!cpg_mstp_pd_np) + return -ENODEV; + + error = of_genpd_add_provider_simple(cpg_mstp_pd_np, cpg_mstp_pd_genpd); + + of_node_put(cpg_mstp_pd_np); + return error; } +postcore_initcall(cpg_mstp_pd_init_provider); diff --git a/drivers/clk/sunxi-ng/ccu_mp.c b/drivers/clk/sunxi-ng/ccu_mp.c index 354c981943b6f8..4221b1888b38da 100644 --- a/drivers/clk/sunxi-ng/ccu_mp.c +++ b/drivers/clk/sunxi-ng/ccu_mp.c @@ -185,7 +185,7 @@ static unsigned long ccu_mp_recalc_rate(struct clk_hw *hw, p &= (1 << cmp->p.width) - 1; if (cmp->common.features & CCU_FEATURE_DUAL_DIV) - rate = (parent_rate / p) / m; + rate = (parent_rate / (p + cmp->p.offset)) / m; else rate = 
(parent_rate >> p) / m; diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 645f517a1ac26d..ffcd23668763fe 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -395,8 +395,7 @@ config ARM_GLOBAL_TIMER config ARM_GT_INITIAL_PRESCALER_VAL int "ARM global timer initial prescaler value" - default 2 if ARCH_ZYNQ - default 1 + default 0 depends on ARM_GLOBAL_TIMER help When the ARM global timer initializes, its current rate is declared @@ -406,6 +405,7 @@ config ARM_GT_INITIAL_PRESCALER_VAL bounds about how much the parent clock is allowed to decrease or increase wrt the initial clock value. This affects CPU_FREQ max delta from the initial frequency. + Use 0 to use auto-detection in the driver. config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" @@ -474,11 +474,14 @@ config FSL_FTM_TIMER help Support for Freescale FlexTimer Module (FTM) timer. -config VF_PIT_TIMER - bool +config NXP_PIT_TIMER + bool "NXP Periodic Interrupt Timer" if COMPILE_TEST select CLKSRC_MMIO help - Support for Periodic Interrupt Timer on Freescale Vybrid Family SoCs. + Support for Periodic Interrupt Timer on Freescale / NXP + SoCs. This periodic timer is found on the Vybrid Family and + the Automotive S32G2/3 platforms. It contains 4 channels + where two can be coupled to form a 64 bits channel. config SYS_SUPPORTS_SH_CMT bool diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 205bf3b0a8f3f2..ec4452ee958f1a 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_CLKSRC_LPC32XX) += timer-lpc32xx.o obj-$(CONFIG_CLKSRC_MPS2) += mps2-timer.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o obj-$(CONFIG_FSL_FTM_TIMER) += timer-fsl-ftm.o -obj-$(CONFIG_VF_PIT_TIMER) += timer-vf-pit.o +obj-$(CONFIG_NXP_PIT_TIMER) += timer-nxp-pit.o obj-$(CONFIG_CLKSRC_QCOM) += timer-qcom.o obj-$(CONFIG_MTK_TIMER) += timer-mediatek.o obj-$(CONFIG_MTK_CPUX_TIMER) += timer-mediatek-cpux.o @@ -64,6 +64,7 @@ obj-$(CONFIG_REALTEK_OTTO_TIMER) += timer-rtl-otto.o obj-$(CONFIG_ARC_TIMERS) += arc_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o +obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer_mmio.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp804.o diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 80ba6a54248c4c..90aeff44a2764c 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -34,42 +34,12 @@ #include -#define CNTTIDR 0x08 -#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) - -#define CNTACR(n) (0x40 + ((n) * 4)) -#define CNTACR_RPCT BIT(0) -#define CNTACR_RVCT BIT(1) -#define CNTACR_RFRQ BIT(2) -#define CNTACR_RVOFF BIT(3) -#define CNTACR_RWVT BIT(4) -#define CNTACR_RWPT BIT(5) - -#define CNTPCT_LO 0x00 -#define CNTVCT_LO 0x08 -#define CNTFRQ 0x10 -#define CNTP_CVAL_LO 0x20 -#define CNTP_CTL 0x2c -#define CNTV_CVAL_LO 0x30 -#define CNTV_CTL 0x3c - /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) -static unsigned arch_timers_present __initdata; - -struct arch_timer { - void __iomem *base; - struct clock_event_device evt; -}; - -static struct arch_timer *arch_timer_mem __ro_after_init; - -#define to_arch_timer(e) container_of(e, struct arch_timer, evt) - static u32 arch_timer_rate __ro_after_init; static int 
arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; @@ -85,7 +55,6 @@ static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; -static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; @@ -121,76 +90,6 @@ static int arch_counter_get_width(void) /* * Architected system timer support. */ - -static __always_inline -void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, - struct clock_event_device *clk) -{ - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTP_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* - * Not guaranteed to be atomic, so the timer - * must be disabled at this point. - */ - writeq_relaxed(val, timer->base + CNTP_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - writel_relaxed((u32)val, timer->base + CNTV_CTL); - break; - case ARCH_TIMER_REG_CVAL: - /* Same restriction as above */ - writeq_relaxed(val, timer->base + CNTV_CVAL_LO); - break; - default: - BUILD_BUG(); - } - } else { - arch_timer_reg_write_cp15(access, reg, val); - } -} - -static __always_inline -u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, - struct clock_event_device *clk) -{ - u32 val; - - if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTP_CTL); - break; - default: - BUILD_BUG(); - } - } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { - struct arch_timer *timer = to_arch_timer(clk); - switch (reg) { - case ARCH_TIMER_REG_CTRL: - val = readl_relaxed(timer->base + CNTV_CTL); - break; - default: - BUILD_BUG(); - } - } else { - val = arch_timer_reg_read_cp15(access, reg); - } - - return val; -} - static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); @@ -424,7 +323,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, unsigned long ctrl; u64 cval; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -436,7 +335,7 @@ void erratum_set_next_event_generic(const int access, unsigned long evt, write_sysreg(cval, cntv_cval_el0); } - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, @@ -667,10 +566,10 @@ static __always_inline irqreturn_t timer_handler(const int access, { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); evt->event_handler(evt); return IRQ_HANDLED; } @@ -692,28 +591,14 @@ static irqreturn_t arch_timer_handler_phys(int irq, void 
*dev_id) return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } -static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); -} - -static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); -} - static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); return 0; } @@ -728,23 +613,13 @@ static int arch_timer_shutdown_phys(struct clock_event_device *clk) return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } -static int arch_timer_shutdown_virt_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); -} - -static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) -{ - return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); -} - static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); + ctrl = arch_timer_reg_read_cp15(access, ARCH_TIMER_REG_CTRL); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; @@ -753,8 +628,8 @@ static __always_inline void set_next_event(const int access, unsigned long evt, else cnt = __arch_counter_get_cntvct(); - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_reg_write_cp15(access, ARCH_TIMER_REG_CTRL, ctrl); } static int arch_timer_set_next_event_virt(unsigned long evt, @@ -771,60 +646,6 @@ static int arch_timer_set_next_event_phys(unsigned long evt, return 0; } -static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) -{ - u32 cnt_lo, cnt_hi, tmp_hi; - - do { - cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); - tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); - } while (cnt_hi != tmp_hi); - - return ((u64) cnt_hi << 32) | cnt_lo; -} - -static __always_inline void set_next_event_mem(const int access, unsigned long evt, - struct clock_event_device *clk) -{ - struct arch_timer *timer = to_arch_timer(clk); - unsigned long ctrl; - u64 cnt; - - ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); - - /* Timer must be disabled before programming CVAL */ - if (ctrl & ARCH_TIMER_CTRL_ENABLE) { - ctrl &= ~ARCH_TIMER_CTRL_ENABLE; - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); - } - - ctrl |= ARCH_TIMER_CTRL_ENABLE; - ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; - - if (access == ARCH_TIMER_MEM_VIRT_ACCESS) - cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); - else - cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); - - arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); - arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); -} - -static int arch_timer_set_next_event_virt_mem(unsigned long evt, - struct clock_event_device 
*clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); - return 0; -} - -static int arch_timer_set_next_event_phys_mem(unsigned long evt, - struct clock_event_device *clk) -{ - set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); - return 0; -} - static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 @@ -850,63 +671,41 @@ static u64 __arch_timer_check_delta(void) return CLOCKSOURCE_MASK(arch_counter_get_width()); } -static void __arch_timer_setup(unsigned type, - struct clock_event_device *clk) +static void __arch_timer_setup(struct clock_event_device *clk) { + typeof(clk->set_next_event) sne; u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; - if (type == ARCH_TIMER_TYPE_CP15) { - typeof(clk->set_next_event) sne; - - arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - - if (arch_timer_c3stop) - clk->features |= CLOCK_EVT_FEAT_C3STOP; - clk->name = "arch_sys_timer"; - clk->rating = 450; - clk->cpumask = cpumask_of(smp_processor_id()); - clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; - switch (arch_timer_uses_ppi) { - case ARCH_TIMER_VIRT_PPI: - clk->set_state_shutdown = arch_timer_shutdown_virt; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; - sne = erratum_handler(set_next_event_virt); - break; - case ARCH_TIMER_PHYS_SECURE_PPI: - case ARCH_TIMER_PHYS_NONSECURE_PPI: - case ARCH_TIMER_HYP_PPI: - clk->set_state_shutdown = arch_timer_shutdown_phys; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; - sne = erratum_handler(set_next_event_phys); - break; - default: - BUG(); - } + arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); - clk->set_next_event = sne; - max_delta = __arch_timer_check_delta(); - } else { - clk->features |= CLOCK_EVT_FEAT_DYNIRQ; - clk->name = "arch_mem_timer"; - clk->rating = 400; - clk->cpumask = cpu_possible_mask; - if (arch_timer_mem_use_virtual) { - clk->set_state_shutdown = arch_timer_shutdown_virt_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; - clk->set_next_event = - arch_timer_set_next_event_virt_mem; - } else { - clk->set_state_shutdown = arch_timer_shutdown_phys_mem; - clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; - clk->set_next_event = - arch_timer_set_next_event_phys_mem; - } - - max_delta = CLOCKSOURCE_MASK(56); + if (arch_timer_c3stop) + clk->features |= CLOCK_EVT_FEAT_C3STOP; + clk->name = "arch_sys_timer"; + clk->rating = 450; + clk->cpumask = cpumask_of(smp_processor_id()); + clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; + switch (arch_timer_uses_ppi) { + case ARCH_TIMER_VIRT_PPI: + clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; + sne = erratum_handler(set_next_event_virt); + break; + case ARCH_TIMER_PHYS_SECURE_PPI: + case ARCH_TIMER_PHYS_NONSECURE_PPI: + case ARCH_TIMER_HYP_PPI: + clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; + sne = erratum_handler(set_next_event_phys); + break; + default: + BUG(); } + clk->set_next_event = sne; + max_delta = __arch_timer_check_delta(); + clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); @@ -1029,7 +828,7 @@ static int arch_timer_starting_cpu(unsigned int cpu) struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; - __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); + __arch_timer_setup(clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); 
enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); @@ -1075,22 +874,12 @@ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np pr_warn("frequency not available\n"); } -static void __init arch_timer_banner(unsigned type) +static void __init arch_timer_banner(void) { - pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", - type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? - " and " : "", - type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", + pr_info("cp15 timer running at %lu.%02luMHz (%s).\n", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, - type & ARCH_TIMER_TYPE_CP15 ? - (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : - "", - type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", - type & ARCH_TIMER_TYPE_MEM ? - arch_timer_mem_use_virtual ? "virt" : "phys" : - ""); + (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys"); } u32 arch_timer_get_rate(void) @@ -1108,11 +897,6 @@ bool arch_timer_evtstrm_available(void) return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } -static noinstr u64 arch_counter_get_cntvct_mem(void) -{ - return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); -} - static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) @@ -1120,42 +904,35 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) return &arch_timer_kvm_info; } -static void __init arch_counter_register(unsigned type) +static void __init arch_counter_register(void) { u64 (*scr)(void); + u64 (*rd)(void); u64 start_count; int width; - /* Register the CP15 based counter if we have one */ - if (type & ARCH_TIMER_TYPE_CP15) { - u64 (*rd)(void); - - if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || - arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntvct_stable; - scr = raw_counter_get_cntvct_stable; - } else { - rd = arch_counter_get_cntvct; - scr = arch_counter_get_cntvct; - } + if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || + arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntvct_stable; + scr = raw_counter_get_cntvct_stable; } else { - if (arch_timer_counter_has_wa()) { - rd = arch_counter_get_cntpct_stable; - scr = raw_counter_get_cntpct_stable; - } else { - rd = arch_counter_get_cntpct; - scr = arch_counter_get_cntpct; - } + rd = arch_counter_get_cntvct; + scr = arch_counter_get_cntvct; } - - arch_timer_read_counter = rd; - clocksource_counter.vdso_clock_mode = vdso_default; } else { - arch_timer_read_counter = arch_counter_get_cntvct_mem; - scr = arch_counter_get_cntvct_mem; + if (arch_timer_counter_has_wa()) { + rd = arch_counter_get_cntpct_stable; + scr = raw_counter_get_cntpct_stable; + } else { + rd = arch_counter_get_cntpct; + scr = arch_counter_get_cntpct; + } } + arch_timer_read_counter = rd; + clocksource_counter.vdso_clock_mode = vdso_default; + width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); @@ -1303,76 +1080,10 @@ static int __init arch_timer_register(void) return err; } -static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) -{ - int ret; - irq_handler_t func; - - arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); - if (!arch_timer_mem) - return -ENOMEM; - - arch_timer_mem->base = 
base; - arch_timer_mem->evt.irq = irq; - __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); - - if (arch_timer_mem_use_virtual) - func = arch_timer_handler_virt_mem; - else - func = arch_timer_handler_phys_mem; - - ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); - if (ret) { - pr_err("Failed to request mem timer irq\n"); - kfree(arch_timer_mem); - arch_timer_mem = NULL; - } - - return ret; -} - -static const struct of_device_id arch_timer_of_match[] __initconst = { - { .compatible = "arm,armv7-timer", }, - { .compatible = "arm,armv8-timer", }, - {}, -}; - -static const struct of_device_id arch_timer_mem_of_match[] __initconst = { - { .compatible = "arm,armv7-timer-mem", }, - {}, -}; - -static bool __init arch_timer_needs_of_probing(void) -{ - struct device_node *dn; - bool needs_probing = false; - unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; - - /* We have two timers, and both device-tree nodes are probed. */ - if ((arch_timers_present & mask) == mask) - return false; - - /* - * Only one type of timer is probed, - * check if we have another type of timer node in device-tree. - */ - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) - dn = of_find_matching_node(NULL, arch_timer_mem_of_match); - else - dn = of_find_matching_node(NULL, arch_timer_of_match); - - if (dn && of_device_is_available(dn)) - needs_probing = true; - - of_node_put(dn); - - return needs_probing; -} - static int __init arch_timer_common_init(void) { - arch_timer_banner(arch_timers_present); - arch_counter_register(arch_timers_present); + arch_timer_banner(); + arch_counter_register(); return arch_timer_arch_init(); } @@ -1421,13 +1132,11 @@ static int __init arch_timer_of_init(struct device_node *np) u32 rate; bool has_names; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { @@ -1472,283 +1181,22 @@ static int __init arch_timer_of_init(struct device_node *np) if (ret) return ret; - if (arch_timer_needs_of_probing()) - return 0; - return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); -static u32 __init -arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - u32 rate; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Unable to map frame @ %pa\n", &frame->cntbase); - return 0; - } - - rate = readl_relaxed(base + CNTFRQ); - - iounmap(base); - - return rate; -} - -static struct arch_timer_mem_frame * __init -arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame, *best_frame = NULL; - void __iomem *cntctlbase; - u32 cnttidr; - int i; - - cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); - if (!cntctlbase) { - pr_err("Can't map CNTCTLBase @ %pa\n", - &timer_mem->cntctlbase); - return NULL; - } - - cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - - /* - * Try to find a virtual capable frame. Otherwise fall back to a - * physical capable frame. 
- */ - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | - CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; - - frame = &timer_mem->frame[i]; - if (!frame->valid) - continue; - - /* Try enabling everything, and see what sticks */ - writel_relaxed(cntacr, cntctlbase + CNTACR(i)); - cntacr = readl_relaxed(cntctlbase + CNTACR(i)); - - if ((cnttidr & CNTTIDR_VIRT(i)) && - !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { - best_frame = frame; - arch_timer_mem_use_virtual = true; - break; - } - - if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) - continue; - - best_frame = frame; - } - - iounmap(cntctlbase); - - return best_frame; -} - -static int __init -arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) -{ - void __iomem *base; - int ret, irq; - - if (arch_timer_mem_use_virtual) - irq = frame->virt_irq; - else - irq = frame->phys_irq; - - if (!irq) { - pr_err("Frame missing %s irq.\n", - arch_timer_mem_use_virtual ? "virt" : "phys"); - return -EINVAL; - } - - if (!request_mem_region(frame->cntbase, frame->size, - "arch_mem_timer")) - return -EBUSY; - - base = ioremap(frame->cntbase, frame->size); - if (!base) { - pr_err("Can't map frame's registers\n"); - return -ENXIO; - } - - ret = arch_timer_mem_register(base, irq); - if (ret) { - iounmap(base); - return ret; - } - - arch_timers_present |= ARCH_TIMER_TYPE_MEM; - - return 0; -} - -static int __init arch_timer_mem_of_init(struct device_node *np) -{ - struct arch_timer_mem *timer_mem; - struct arch_timer_mem_frame *frame; - struct resource res; - int ret = -EINVAL; - u32 rate; - - timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); - if (!timer_mem) - return -ENOMEM; - - if (of_address_to_resource(np, 0, &res)) - goto out; - timer_mem->cntctlbase = res.start; - timer_mem->size = resource_size(&res); - - for_each_available_child_of_node_scoped(np, frame_node) { - u32 n; - struct arch_timer_mem_frame *frame; - - if (of_property_read_u32(frame_node, "frame-number", &n)) { - pr_err(FW_BUG "Missing frame-number.\n"); - goto out; - } - if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { - pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", - ARCH_TIMER_MEM_MAX_FRAMES - 1); - goto out; - } - frame = &timer_mem->frame[n]; - - if (frame->valid) { - pr_err(FW_BUG "Duplicated frame-number.\n"); - goto out; - } - - if (of_address_to_resource(frame_node, 0, &res)) - goto out; - - frame->cntbase = res.start; - frame->size = resource_size(&res); - - frame->virt_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_VIRT_SPI); - frame->phys_irq = irq_of_parse_and_map(frame_node, - ARCH_TIMER_PHYS_SPI); - - frame->valid = true; - } - - frame = arch_timer_mem_find_best_frame(timer_mem); - if (!frame) { - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer_mem->cntctlbase); - ret = -EINVAL; - goto out; - } - - rate = arch_timer_mem_frame_get_cntfrq(frame); - arch_timer_of_configure_rate(rate, np); - - ret = arch_timer_mem_frame_register(frame); - if (!ret && !arch_timer_needs_of_probing()) - ret = arch_timer_common_init(); -out: - kfree(timer_mem); - return ret; -} -TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", - arch_timer_mem_of_init); - #ifdef CONFIG_ACPI_GTDT -static int __init -arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) -{ - struct arch_timer_mem_frame *frame; - u32 rate; - int i; - - for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { - frame = &timer_mem->frame[i]; - - if (!frame->valid) - continue; - - rate = arch_timer_mem_frame_get_cntfrq(frame); - if 
(rate == arch_timer_rate) - continue; - - pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", - &frame->cntbase, - (unsigned long)rate, (unsigned long)arch_timer_rate); - - return -EINVAL; - } - - return 0; -} - -static int __init arch_timer_mem_acpi_init(int platform_timer_count) -{ - struct arch_timer_mem *timers, *timer; - struct arch_timer_mem_frame *frame, *best_frame = NULL; - int timer_count, i, ret = 0; - - timers = kcalloc(platform_timer_count, sizeof(*timers), - GFP_KERNEL); - if (!timers) - return -ENOMEM; - - ret = acpi_arch_timer_mem_init(timers, &timer_count); - if (ret || !timer_count) - goto out; - - /* - * While unlikely, it's theoretically possible that none of the frames - * in a timer expose the combination of feature we want. - */ - for (i = 0; i < timer_count; i++) { - timer = &timers[i]; - - frame = arch_timer_mem_find_best_frame(timer); - if (!best_frame) - best_frame = frame; - - ret = arch_timer_mem_verify_cntfrq(timer); - if (ret) { - pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); - goto out; - } - - if (!best_frame) /* implies !frame */ - /* - * Only complain about missing suitable frames if we - * haven't already found one in a previous iteration. - */ - pr_err("Unable to find a suitable frame in timer @ %pa\n", - &timer->cntctlbase); - } - - if (best_frame) - ret = arch_timer_mem_frame_register(best_frame); -out: - kfree(timers); - return ret; -} - -/* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { - int ret, platform_timer_count; + int ret; - if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { + if (arch_timer_evt) { pr_warn("already initialized, skipping\n"); return -EINVAL; } - arch_timers_present |= ARCH_TIMER_TYPE_CP15; - - ret = acpi_gtdt_init(table, &platform_timer_count); + ret = acpi_gtdt_init(table, NULL); if (ret) return ret; @@ -1790,10 +1238,6 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table) if (ret) return ret; - if (platform_timer_count && - arch_timer_mem_acpi_init(platform_timer_count)) - pr_err("Failed to initialize memory-mapped timer.\n"); - return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); diff --git a/drivers/clocksource/arm_arch_timer_mmio.c b/drivers/clocksource/arm_arch_timer_mmio.c new file mode 100644 index 00000000000000..ebe1987d651ebc --- /dev/null +++ b/drivers/clocksource/arm_arch_timer_mmio.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ARM Generic Memory Mapped Timer support + * + * Split from drivers/clocksource/arm_arch_timer.c + * + * Copyright (C) 2011 ARM Ltd. 
+ * All Rights Reserved + */ + +#define pr_fmt(fmt) "arch_timer_mmio: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +#define CNTTIDR 0x08 +#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) + +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + +#define CNTPCT_LO 0x00 +#define CNTVCT_LO 0x08 +#define CNTFRQ 0x10 +#define CNTP_CVAL_LO 0x20 +#define CNTP_CTL 0x2c +#define CNTV_CVAL_LO 0x30 +#define CNTV_CTL 0x3c + +enum arch_timer_access { + PHYS_ACCESS, + VIRT_ACCESS, +}; + +struct arch_timer { + struct clock_event_device evt; + struct clocksource cs; + struct arch_timer_mem *gt_block; + void __iomem *base; + enum arch_timer_access access; + u32 rate; +}; + +#define evt_to_arch_timer(e) container_of(e, struct arch_timer, evt) +#define cs_to_arch_timer(c) container_of(c, struct arch_timer, cs) + +static void arch_timer_mmio_write(struct arch_timer *timer, + enum arch_timer_reg reg, u64 val) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTP_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* + * Not guaranteed to be atomic, so the timer + * must be disabled at this point. + */ + writeq_relaxed(val, timer->base + CNTP_CVAL_LO); + return; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + writel_relaxed((u32)val, timer->base + CNTV_CTL); + return; + case ARCH_TIMER_REG_CVAL: + /* Same restriction as above */ + writeq_relaxed(val, timer->base + CNTV_CVAL_LO); + return; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); +} + +static u32 arch_timer_mmio_read(struct arch_timer *timer, enum arch_timer_reg reg) +{ + switch (timer->access) { + case PHYS_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTP_CTL); + default: + break; + } + break; + case VIRT_ACCESS: + switch (reg) { + case ARCH_TIMER_REG_CTRL: + return readl_relaxed(timer->base + CNTV_CTL); + default: + break; + } + break; + } + + /* Should never be here */ + WARN_ON_ONCE(1); + return 0; +} + +static noinstr u64 arch_counter_mmio_get_cnt(struct arch_timer *t) +{ + int offset_lo = t->access == VIRT_ACCESS ? 
CNTVCT_LO : CNTPCT_LO; + u32 cnt_lo, cnt_hi, tmp_hi; + + do { + cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); + tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); + } while (cnt_hi != tmp_hi); + + return ((u64) cnt_hi << 32) | cnt_lo; +} + +static u64 arch_mmio_counter_read(struct clocksource *cs) +{ + struct arch_timer *at = cs_to_arch_timer(cs); + + return arch_counter_mmio_get_cnt(at); +} + +static int arch_timer_mmio_shutdown(struct clock_event_device *clk) +{ + struct arch_timer *at = evt_to_arch_timer(clk); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + + return 0; +} + +static int arch_timer_mmio_set_next_event(unsigned long evt, + struct clock_event_device *clk) +{ + struct arch_timer *timer = evt_to_arch_timer(clk); + unsigned long ctrl; + u64 cnt; + + ctrl = arch_timer_mmio_read(timer, ARCH_TIMER_REG_CTRL); + + /* Timer must be disabled before programming CVAL */ + if (ctrl & ARCH_TIMER_CTRL_ENABLE) { + ctrl &= ~ARCH_TIMER_CTRL_ENABLE; + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + } + + ctrl |= ARCH_TIMER_CTRL_ENABLE; + ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; + + cnt = arch_counter_mmio_get_cnt(timer); + + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CVAL, evt + cnt); + arch_timer_mmio_write(timer, ARCH_TIMER_REG_CTRL, ctrl); + return 0; +} + +static irqreturn_t arch_timer_mmio_handler(int irq, void *dev_id) +{ + struct clock_event_device *evt = dev_id; + struct arch_timer *at = evt_to_arch_timer(evt); + unsigned long ctrl; + + ctrl = arch_timer_mmio_read(at, ARCH_TIMER_REG_CTRL); + if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { + ctrl |= ARCH_TIMER_CTRL_IT_MASK; + arch_timer_mmio_write(at, ARCH_TIMER_REG_CTRL, ctrl); + evt->event_handler(evt); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + +static struct arch_timer_mem_frame *find_best_frame(struct platform_device *pdev) +{ + struct arch_timer_mem_frame *frame, *best_frame = NULL; + struct arch_timer *at = platform_get_drvdata(pdev); + void __iomem *cntctlbase; + u32 cnttidr; + + cntctlbase = ioremap(at->gt_block->cntctlbase, at->gt_block->size); + if (!cntctlbase) { + dev_err(&pdev->dev, "Can't map CNTCTLBase @ %pa\n", + &at->gt_block->cntctlbase); + return NULL; + } + + cnttidr = readl_relaxed(cntctlbase + CNTTIDR); + + /* + * Try to find a virtual capable frame. Otherwise fall back to a + * physical capable frame. 
+	 */
+	for (int i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
+		u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
+			     CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
+
+		frame = &at->gt_block->frame[i];
+		if (!frame->valid)
+			continue;
+
+		/* Try enabling everything, and see what sticks */
+		writel_relaxed(cntacr, cntctlbase + CNTACR(i));
+		cntacr = readl_relaxed(cntctlbase + CNTACR(i));
+
+		/* Pick a suitable frame for which we have an IRQ */
+		if ((cnttidr & CNTTIDR_VIRT(i)) &&
+		    !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT)) &&
+		    frame->virt_irq) {
+			best_frame = frame;
+			at->access = VIRT_ACCESS;
+			break;
+		}
+
+		if ((~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) ||
+		    !frame->phys_irq)
+			continue;
+
+		at->access = PHYS_ACCESS;
+		best_frame = frame;
+	}
+
+	iounmap(cntctlbase);
+
+	return best_frame;
+}
+
+static void arch_timer_mmio_setup(struct arch_timer *at, int irq)
+{
+	at->evt = (struct clock_event_device) {
+		.features = (CLOCK_EVT_FEAT_ONESHOT |
+			     CLOCK_EVT_FEAT_DYNIRQ),
+		.name = "arch_mem_timer",
+		.rating = 400,
+		.cpumask = cpu_possible_mask,
+		.irq = irq,
+		.set_next_event = arch_timer_mmio_set_next_event,
+		.set_state_oneshot_stopped = arch_timer_mmio_shutdown,
+		.set_state_shutdown = arch_timer_mmio_shutdown,
+	};
+
+	at->evt.set_state_shutdown(&at->evt);
+
+	clockevents_config_and_register(&at->evt, at->rate, 0xf,
+					(unsigned long)CLOCKSOURCE_MASK(56));
+
+	enable_irq(at->evt.irq);
+
+	at->cs = (struct clocksource) {
+		.name = "arch_mmio_counter",
+		.rating = 300,
+		.read = arch_mmio_counter_read,
+		.mask = CLOCKSOURCE_MASK(56),
+		.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+	};
+
+	clocksource_register_hz(&at->cs, at->rate);
+}
+
+static int arch_timer_mmio_frame_register(struct platform_device *pdev,
+					  struct arch_timer_mem_frame *frame)
+{
+	struct arch_timer *at = platform_get_drvdata(pdev);
+	struct device_node *np = pdev->dev.of_node;
+	int ret, irq;
+	u32 rate;
+
+	if (!devm_request_mem_region(&pdev->dev, frame->cntbase, frame->size,
+				     "arch_mem_timer"))
+		return -EBUSY;
+
+	at->base = devm_ioremap(&pdev->dev, frame->cntbase, frame->size);
+	if (!at->base) {
+		dev_err(&pdev->dev, "Can't map frame's registers\n");
+		return -ENXIO;
+	}
+
+	/*
+	 * Allow "clock-frequency" to override the probed rate. If neither
+	 * leads to something useful, use the CPU timer frequency as the
+	 * fallback. The nice thing about that last point is that we wouldn't
+	 * have made it here if we didn't have a valid frequency.
+	 */
+	rate = readl_relaxed(at->base + CNTFRQ);
+
+	if (!np || of_property_read_u32(np, "clock-frequency", &at->rate))
+		at->rate = rate;
+
+	if (!at->rate)
+		at->rate = arch_timer_get_rate();
+
+	irq = at->access == VIRT_ACCESS ?
frame->virt_irq : frame->phys_irq;
+	ret = devm_request_irq(&pdev->dev, irq, arch_timer_mmio_handler,
+			       IRQF_TIMER | IRQF_NO_AUTOEN, "arch_mem_timer",
+			       &at->evt);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to request mem timer irq\n");
+		return ret;
+	}
+
+	/* After this point, we're not allowed to fail anymore */
+	arch_timer_mmio_setup(at, irq);
+	return 0;
+}
+
+static int of_populate_gt_block(struct platform_device *pdev,
+				struct arch_timer *at)
+{
+	struct resource res;
+
+	if (of_address_to_resource(pdev->dev.of_node, 0, &res))
+		return -EINVAL;
+
+	at->gt_block->cntctlbase = res.start;
+	at->gt_block->size = resource_size(&res);
+
+	for_each_available_child_of_node_scoped(pdev->dev.of_node, frame_node) {
+		struct arch_timer_mem_frame *frame;
+		u32 n;
+
+		if (of_property_read_u32(frame_node, "frame-number", &n)) {
+			dev_err(&pdev->dev, FW_BUG "Missing frame-number\n");
+			return -EINVAL;
+		}
+		if (n >= ARCH_TIMER_MEM_MAX_FRAMES) {
+			dev_err(&pdev->dev,
+				FW_BUG "Wrong frame-number, only 0-%u are permitted\n",
+				ARCH_TIMER_MEM_MAX_FRAMES - 1);
+			return -EINVAL;
+		}
+
+		frame = &at->gt_block->frame[n];
+
+		if (frame->valid) {
+			dev_err(&pdev->dev, FW_BUG "Duplicated frame-number\n");
+			return -EINVAL;
+		}
+
+		if (of_address_to_resource(frame_node, 0, &res))
+			return -EINVAL;
+
+		frame->cntbase = res.start;
+		frame->size = resource_size(&res);
+
+		frame->phys_irq = irq_of_parse_and_map(frame_node, 0);
+		frame->virt_irq = irq_of_parse_and_map(frame_node, 1);
+
+		frame->valid = true;
+	}
+
+	return 0;
+}
+
+static int arch_timer_mmio_probe(struct platform_device *pdev)
+{
+	struct arch_timer_mem_frame *frame;
+	struct arch_timer *at;
+	struct device_node *np;
+	int ret;
+
+	np = pdev->dev.of_node;
+
+	at = devm_kmalloc(&pdev->dev, sizeof(*at), GFP_KERNEL | __GFP_ZERO);
+	if (!at)
+		return -ENOMEM;
+
+	if (np) {
+		at->gt_block = devm_kmalloc(&pdev->dev, sizeof(*at->gt_block),
+					    GFP_KERNEL | __GFP_ZERO);
+		if (!at->gt_block)
+			return -ENOMEM;
+		ret = of_populate_gt_block(pdev, at);
+		if (ret)
+			return ret;
+	} else {
+		at->gt_block = dev_get_platdata(&pdev->dev);
+	}
+
+	platform_set_drvdata(pdev, at);
+
+	frame = find_best_frame(pdev);
+	if (!frame) {
+		dev_err(&pdev->dev,
+			"Unable to find a suitable frame in timer @ %pa\n",
+			&at->gt_block->cntctlbase);
+		return -EINVAL;
+	}
+
+	ret = arch_timer_mmio_frame_register(pdev, frame);
+	if (!ret)
+		dev_info(&pdev->dev,
+			 "mmio timer running at %lu.%02luMHz (%s)\n",
+			 (unsigned long)at->rate / 1000000,
+			 (unsigned long)(at->rate / 10000) % 100,
+			 at->access == VIRT_ACCESS ?
"virt" : "phys"); + + return ret; +} + +static const struct of_device_id arch_timer_mmio_of_table[] = { + { .compatible = "arm,armv7-timer-mem", }, + {} +}; + +static struct platform_driver arch_timer_mmio_drv = { + .driver = { + .name = "arch-timer-mmio", + .of_match_table = arch_timer_mmio_of_table, + }, + .probe = arch_timer_mmio_probe, +}; +builtin_platform_driver(arch_timer_mmio_drv); + +static struct platform_driver arch_timer_mmio_acpi_drv = { + .driver = { + .name = "gtdt-arm-mmio-timer", + }, + .probe = arch_timer_mmio_probe, +}; +builtin_platform_driver(arch_timer_mmio_acpi_drv); diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 2d86bbc2764a04..5e3d6bb7e437ba 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -263,14 +263,13 @@ static void __init gt_delay_timer_init(void) register_current_timer_delay(>_delay_timer); } -static int __init gt_clocksource_init(void) +static int __init gt_clocksource_init(unsigned int psv) { writel(0, gt_base + GT_CONTROL); writel(0, gt_base + GT_COUNTER0); writel(0, gt_base + GT_COUNTER1); /* set prescaler and enable timer on all the cores */ - writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, - CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) | + writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, psv - 1) | GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK @@ -338,11 +337,45 @@ static int gt_clk_rate_change_cb(struct notifier_block *nb, return NOTIFY_DONE; } +struct gt_prescaler_config { + const char *compatible; + unsigned long prescaler; +}; + +static const struct gt_prescaler_config gt_prescaler_configs[] = { + /* + * On am43 the global timer clock is a child of the clock used for CPU + * OPPs, so the initial prescaler has to be compatible with all OPPs + * which are 300, 600, 720, 800 and 1000 with a fixed divider of 2, this + * gives us a GCD of 10. Initial frequency is 1000, so the prescaler is + * 50. 
+	 */
+	{ .compatible = "ti,am43", .prescaler = 50 },
+	{ .compatible = "xlnx,zynq-7000", .prescaler = 2 },
+	{ .compatible = NULL }
+};
+
+static unsigned long gt_get_initial_prescaler_value(struct device_node *np)
+{
+	const struct gt_prescaler_config *config;
+
+	if (CONFIG_ARM_GT_INITIAL_PRESCALER_VAL != 0)
+		return CONFIG_ARM_GT_INITIAL_PRESCALER_VAL;
+
+	for (config = gt_prescaler_configs; config->compatible; config++) {
+		if (of_machine_is_compatible(config->compatible))
+			return config->prescaler;
+	}
+
+	return 1;
+}
+
 static int __init global_timer_of_register(struct device_node *np)
 {
 	struct clk *gt_clk;
 	static unsigned long gt_clk_rate;
 	int err;
+	unsigned long psv;
 
 	/*
 	 * In A9 r2p0 the comparators for each processor with the global timer
@@ -378,8 +411,9 @@ static int __init global_timer_of_register(struct device_node *np)
 		goto out_unmap;
 	}
 
+	psv = gt_get_initial_prescaler_value(np);
 	gt_clk_rate = clk_get_rate(gt_clk);
-	gt_target_rate = gt_clk_rate / CONFIG_ARM_GT_INITIAL_PRESCALER_VAL;
+	gt_target_rate = gt_clk_rate / psv;
 	gt_clk_rate_change_nb.notifier_call = gt_clk_rate_change_cb;
 	err = clk_notifier_register(gt_clk, &gt_clk_rate_change_nb);
@@ -404,7 +438,7 @@ static int __init global_timer_of_register(struct device_node *np)
 	}
 
 	/* Register and immediately configure the timer on the boot CPU */
-	err = gt_clocksource_init();
+	err = gt_clocksource_init(psv);
 	if (err)
 		goto out_irq;
diff --git a/drivers/clocksource/clps711x-timer.c b/drivers/clocksource/clps711x-timer.c
index e95fdc49c2269c..bbceb0289d457a 100644
--- a/drivers/clocksource/clps711x-timer.c
+++ b/drivers/clocksource/clps711x-timer.c
@@ -78,24 +78,33 @@ static int __init clps711x_timer_init(struct device_node *np)
 	unsigned int irq = irq_of_parse_and_map(np, 0);
 	struct clk *clock = of_clk_get(np, 0);
 	void __iomem *base = of_iomap(np, 0);
+	int ret = 0;
 
 	if (!base)
 		return -ENOMEM;
-	if (!irq)
-		return -EINVAL;
-	if (IS_ERR(clock))
-		return PTR_ERR(clock);
+	if (!irq) {
+		ret = -EINVAL;
+		goto unmap_io;
+	}
+	if (IS_ERR(clock)) {
+		ret = PTR_ERR(clock);
+		goto unmap_io;
+	}
 
 	switch (of_alias_get_id(np, "timer")) {
 	case CLPS711X_CLKSRC_CLOCKSOURCE:
 		clps711x_clksrc_init(clock, base);
 		break;
 	case CLPS711X_CLKSRC_CLOCKEVENT:
-		return _clps711x_clkevt_init(clock, base, irq);
+		ret = _clps711x_clkevt_init(clock, base, irq);
+		break;
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
+		break;
 	}
 
-	return 0;
+unmap_io:
+	iounmap(base);
+	return ret;
 }
 TIMER_OF_DECLARE(clps711x, "cirrus,ep7209-timer", clps711x_timer_init);
diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c
index cb6fc2f152d467..e79cfb0b8e05f6 100644
--- a/drivers/clocksource/ingenic-sysost.c
+++ b/drivers/clocksource/ingenic-sysost.c
@@ -127,18 +127,23 @@ static u8 ingenic_ost_get_prescale(unsigned long rate, unsigned long req_rate)
 	return 2; /* /16 divider */
 }
 
-static long ingenic_ost_round_rate(struct clk_hw *hw, unsigned long req_rate,
-		unsigned long *parent_rate)
+static int ingenic_ost_determine_rate(struct clk_hw *hw,
+				      struct clk_rate_request *req)
 {
-	unsigned long rate = *parent_rate;
+	unsigned long rate = req->best_parent_rate;
 	u8 prescale;
 
-	if (req_rate > rate)
-		return rate;
+	if (req->rate > rate) {
+		req->rate = rate;
-	prescale = ingenic_ost_get_prescale(rate, req_rate);
+		return 0;
+	}
+
+	prescale = ingenic_ost_get_prescale(rate, req->rate);
-	return rate >> (prescale * 2);
+	req->rate = rate >> (prescale * 2);
+
+	return 0;
 }
 
 static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long req_rate,
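The ingenic-sysost hunks above follow the clk framework's migration from .round_rate to .determine_rate: instead of returning the rounded rate, the callback writes the achievable rate into the clk_rate_request and returns a status code. A minimal sketch of the same pattern for a hypothetical power-of-two divider clock (the names are illustrative, not part of this patch):

#include <linux/clk-provider.h>

/* Sketch only: pick the closest not-above rate for a /1, /2, /4, /8 divider. */
static int example_div_determine_rate(struct clk_hw *hw,
				      struct clk_rate_request *req)
{
	unsigned long parent = req->best_parent_rate;
	unsigned int shift = 0;

	while (shift < 3 && (parent >> shift) > req->rate)
		shift++;

	req->rate = parent >> shift;	/* report what the hardware can do */

	return 0;			/* status code, not the rate */
}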
@@ -175,14 +180,14 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re static const struct clk_ops ingenic_ost_percpu_timer_ops = { .recalc_rate = ingenic_ost_percpu_timer_recalc_rate, - .round_rate = ingenic_ost_round_rate, - .set_rate = ingenic_ost_percpu_timer_set_rate, + .determine_rate = ingenic_ost_determine_rate, + .set_rate = ingenic_ost_percpu_timer_set_rate, }; static const struct clk_ops ingenic_ost_global_timer_ops = { .recalc_rate = ingenic_ost_global_timer_recalc_rate, - .round_rate = ingenic_ost_round_rate, - .set_rate = ingenic_ost_global_timer_set_rate, + .determine_rate = ingenic_ost_determine_rate, + .set_rate = ingenic_ost_global_timer_set_rate, }; static const char * const ingenic_ost_clk_parents[] = { "ext" }; diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c index c3536fffbe9a0f..5a99801a165717 100644 --- a/drivers/clocksource/scx200_hrt.c +++ b/drivers/clocksource/scx200_hrt.c @@ -52,6 +52,7 @@ static struct clocksource cs_hrt = { .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, /* mult, shift are set based on mhz27 flag */ + .owner = THIS_MODULE, }; static int __init init_hrt_clocksource(void) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index b72b36e0abed86..385eb94bbe7ce5 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -578,37 +578,74 @@ static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) +static int sh_cmt_start_clocksource(struct sh_cmt_channel *ch) { int ret = 0; unsigned long flags; - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + pm_runtime_get_sync(&ch->cmt->pdev->dev); raw_spin_lock_irqsave(&ch->lock, flags); - if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { - if (flag & FLAG_CLOCKEVENT) - pm_runtime_get_sync(&ch->cmt->pdev->dev); + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) ret = sh_cmt_enable(ch); - } if (ret) goto out; - ch->flags |= flag; + + ch->flags |= FLAG_CLOCKSOURCE; /* setup timeout if no clockevent */ - if (ch->cmt->num_channels == 1 && - flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && !(ch->flags & FLAG_CLOCKEVENT)) __sh_cmt_set_next(ch, ch->max_match_value); +out: + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return ret; +} + +static void sh_cmt_stop_clocksource(struct sh_cmt_channel *ch) +{ + unsigned long flags; + unsigned long f; + + raw_spin_lock_irqsave(&ch->lock, flags); + + f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); + + ch->flags &= ~FLAG_CLOCKSOURCE; + + if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) + sh_cmt_disable(ch); + + raw_spin_unlock_irqrestore(&ch->lock, flags); + + pm_runtime_put(&ch->cmt->pdev->dev); +} + +static int sh_cmt_start_clockevent(struct sh_cmt_channel *ch) +{ + int ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&ch->lock, flags); + + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { + pm_runtime_get_sync(&ch->cmt->pdev->dev); + ret = sh_cmt_enable(ch); + } + + if (ret) + goto out; + + ch->flags |= FLAG_CLOCKEVENT; out: raw_spin_unlock_irqrestore(&ch->lock, flags); return ret; } -static void sh_cmt_stop(struct sh_cmt_channel *ch, unsigned long flag) +static void sh_cmt_stop_clockevent(struct sh_cmt_channel *ch) { unsigned long flags; unsigned long f; @@ -616,22 +653,19 @@ static void sh_cmt_stop(struct sh_cmt_channel *ch, 
unsigned long flag) raw_spin_lock_irqsave(&ch->lock, flags); f = ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE); - ch->flags &= ~flag; + + ch->flags &= ~FLAG_CLOCKEVENT; if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { sh_cmt_disable(ch); - if (flag & FLAG_CLOCKEVENT) - pm_runtime_put(&ch->cmt->pdev->dev); + pm_runtime_put(&ch->cmt->pdev->dev); } /* adjust the timeout to maximum if only clocksource left */ - if ((flag == FLAG_CLOCKEVENT) && (ch->flags & FLAG_CLOCKSOURCE)) + if (ch->flags & FLAG_CLOCKSOURCE) __sh_cmt_set_next(ch, ch->max_match_value); raw_spin_unlock_irqrestore(&ch->lock, flags); - - if (flag & FLAG_CLOCKSOURCE) - pm_runtime_put(&ch->cmt->pdev->dev); } static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) @@ -672,7 +706,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs) ch->total_cycles = 0; - ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE); + ret = sh_cmt_start_clocksource(ch); if (!ret) ch->cs_enabled = true; @@ -685,7 +719,7 @@ static void sh_cmt_clocksource_disable(struct clocksource *cs) WARN_ON(!ch->cs_enabled); - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); ch->cs_enabled = false; } @@ -696,7 +730,7 @@ static void sh_cmt_clocksource_suspend(struct clocksource *cs) if (!ch->cs_enabled) return; - sh_cmt_stop(ch, FLAG_CLOCKSOURCE); + sh_cmt_stop_clocksource(ch); dev_pm_genpd_suspend(&ch->cmt->pdev->dev); } @@ -708,7 +742,7 @@ static void sh_cmt_clocksource_resume(struct clocksource *cs) return; dev_pm_genpd_resume(&ch->cmt->pdev->dev); - sh_cmt_start(ch, FLAG_CLOCKSOURCE); + sh_cmt_start_clocksource(ch); } static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, @@ -740,7 +774,7 @@ static struct sh_cmt_channel *ced_to_sh_cmt(struct clock_event_device *ced) static void sh_cmt_clock_event_start(struct sh_cmt_channel *ch, int periodic) { - sh_cmt_start(ch, FLAG_CLOCKEVENT); + sh_cmt_start_clockevent(ch); if (periodic) sh_cmt_set_next(ch, ((ch->cmt->rate + HZ/2) / HZ) - 1); @@ -752,7 +786,7 @@ static int sh_cmt_clock_event_shutdown(struct clock_event_device *ced) { struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); return 0; } @@ -763,7 +797,7 @@ static int sh_cmt_clock_event_set_state(struct clock_event_device *ced, /* deal with old setting first */ if (clockevent_state_oneshot(ced) || clockevent_state_periodic(ced)) - sh_cmt_stop(ch, FLAG_CLOCKEVENT); + sh_cmt_stop_clockevent(ch); dev_info(&ch->cmt->pdev->dev, "ch%u: used for %s clock events\n", ch->index, periodic ? 
"periodic" : "oneshot"); diff --git a/drivers/clocksource/timer-cs5535.c b/drivers/clocksource/timer-cs5535.c index d47acfe848ae45..8af666c398900b 100644 --- a/drivers/clocksource/timer-cs5535.c +++ b/drivers/clocksource/timer-cs5535.c @@ -101,6 +101,7 @@ static struct clock_event_device cs5535_clockevent = { .tick_resume = mfgpt_shutdown, .set_next_event = mfgpt_next_event, .rating = 250, + .owner = THIS_MODULE, }; static irqreturn_t mfgpt_tick(int irq, void *dev_id) diff --git a/drivers/clocksource/timer-econet-en751221.c b/drivers/clocksource/timer-econet-en751221.c index 3b449fdaafee03..4008076b1a2109 100644 --- a/drivers/clocksource/timer-econet-en751221.c +++ b/drivers/clocksource/timer-econet-en751221.c @@ -146,7 +146,7 @@ static int __init cevt_init(struct device_node *np) for_each_possible_cpu(i) { struct clock_event_device *cd = &per_cpu(econet_timer_pcpu, i); - cd->rating = 310, + cd->rating = 310; cd->features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_PERCPU; diff --git a/drivers/clocksource/timer-nxp-pit.c b/drivers/clocksource/timer-nxp-pit.c new file mode 100644 index 00000000000000..2d0a3554b6bf7d --- /dev/null +++ b/drivers/clocksource/timer-nxp-pit.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2012-2013 Freescale Semiconductor, Inc. + * Copyright 2018,2021-2025 NXP + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Each pit takes 0x10 Bytes register space + */ +#define PIT0_OFFSET 0x100 +#define PIT_CH(n) (PIT0_OFFSET + 0x10 * (n)) + +#define PITMCR(__base) (__base) + +#define PITMCR_FRZ BIT(0) +#define PITMCR_MDIS BIT(1) + +#define PITLDVAL(__base) (__base) +#define PITTCTRL(__base) ((__base) + 0x08) + +#define PITCVAL_OFFSET 0x04 +#define PITCVAL(__base) ((__base) + 0x04) + +#define PITTCTRL_TEN BIT(0) +#define PITTCTRL_TIE BIT(1) + +#define PITTFLG(__base) ((__base) + 0x0c) + +#define PITTFLG_TIF BIT(0) + +struct pit_timer { + void __iomem *clksrc_base; + void __iomem *clkevt_base; + struct clock_event_device ced; + struct clocksource cs; + int rate; +}; + +struct pit_timer_data { + int max_pit_instances; +}; + +static DEFINE_PER_CPU(struct pit_timer *, pit_timers); + +/* + * Global structure for multiple PITs initialization + */ +static int pit_instances; +static int max_pit_instances = 1; + +static void __iomem *sched_clock_base; + +static inline struct pit_timer *ced_to_pit(struct clock_event_device *ced) +{ + return container_of(ced, struct pit_timer, ced); +} + +static inline struct pit_timer *cs_to_pit(struct clocksource *cs) +{ + return container_of(cs, struct pit_timer, cs); +} + +static inline void pit_module_enable(void __iomem *base) +{ + writel(0, PITMCR(base)); +} + +static inline void pit_module_disable(void __iomem *base) +{ + writel(PITMCR_MDIS, PITMCR(base)); +} + +static inline void pit_timer_enable(void __iomem *base, bool tie) +{ + u32 val = PITTCTRL_TEN | (tie ? 
PITTCTRL_TIE : 0); + + writel(val, PITTCTRL(base)); +} + +static inline void pit_timer_disable(void __iomem *base) +{ + writel(0, PITTCTRL(base)); +} + +static inline void pit_timer_set_counter(void __iomem *base, unsigned int cnt) +{ + writel(cnt, PITLDVAL(base)); +} + +static inline void pit_timer_irqack(struct pit_timer *pit) +{ + writel(PITTFLG_TIF, PITTFLG(pit->clkevt_base)); +} + +static u64 notrace pit_read_sched_clock(void) +{ + return ~readl(sched_clock_base); +} + +static u64 pit_timer_clocksource_read(struct clocksource *cs) +{ + struct pit_timer *pit = cs_to_pit(cs); + + return (u64)~readl(PITCVAL(pit->clksrc_base)); +} + +static int pit_clocksource_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate) +{ + /* + * Channels 0 and 1 can be chained to build a 64-bit + * timer. Let's use channel 2 as the clocksource and leave + * channels 0 and 1 unused for anyone else who needs them. + */ + pit->clksrc_base = base + PIT_CH(2); + pit->cs.name = name; + pit->cs.rating = 300; + pit->cs.read = pit_timer_clocksource_read; + pit->cs.mask = CLOCKSOURCE_MASK(32); + pit->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; + + /* set the max load value and start the clock source counter */ + pit_timer_disable(pit->clksrc_base); + pit_timer_set_counter(pit->clksrc_base, ~0); + pit_timer_enable(pit->clksrc_base, 0); + + sched_clock_base = pit->clksrc_base + PITCVAL_OFFSET; + sched_clock_register(pit_read_sched_clock, 32, rate); + + return clocksource_register_hz(&pit->cs, rate); +} + +static int pit_set_next_event(unsigned long delta, struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + /* + * Writing a new value to the PITLDVAL register will not restart the + * timer; to abort the current cycle and start a timer period with the + * new value, the timer must be disabled and enabled again. + * PITLDVAL should be set to delta minus one, per the PIT hardware + * requirement. + */ + pit_timer_disable(pit->clkevt_base); + pit_timer_set_counter(pit->clkevt_base, delta - 1); + pit_timer_enable(pit->clkevt_base, true); + + return 0; +} + +static int pit_shutdown(struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + pit_timer_disable(pit->clkevt_base); + + return 0; +} + +static int pit_set_periodic(struct clock_event_device *ced) +{ + struct pit_timer *pit = ced_to_pit(ced); + + pit_set_next_event(pit->rate / HZ, ced); + + return 0; +} + +static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *ced = dev_id; + struct pit_timer *pit = ced_to_pit(ced); + + pit_timer_irqack(pit); + + /* + * The PIT hardware doesn't support oneshot: it generates an interrupt, + * reloads the counter value from PITLDVAL when PITCVAL reaches zero, + * and starts the counter again. So software needs to disable the timer + * to stop the counter loop in ONESHOT mode. + */ + if (likely(clockevent_state_oneshot(ced))) + pit_timer_disable(pit->clkevt_base); + + ced->event_handler(ced); + + return IRQ_HANDLED; +} + +static int pit_clockevent_per_cpu_init(struct pit_timer *pit, const char *name, + void __iomem *base, unsigned long rate, + int irq, unsigned int cpu) +{ + int ret; + + /* + * Channels 0 and 1 can be chained to build a 64-bit + * timer.
Let's use channel 3 as the clockevent and leave + * channels 0 and 1 unused for anyone else who needs them. + */ + pit->clkevt_base = base + PIT_CH(3); + pit->rate = rate; + + pit_timer_disable(pit->clkevt_base); + + pit_timer_irqack(pit); + + ret = request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_NOBALANCING, + name, &pit->ced); + if (ret) + return ret; + + pit->ced.cpumask = cpumask_of(cpu); + pit->ced.irq = irq; + + pit->ced.name = name; + pit->ced.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; + pit->ced.set_state_shutdown = pit_shutdown; + pit->ced.set_state_periodic = pit_set_periodic; + pit->ced.set_next_event = pit_set_next_event; + pit->ced.rating = 300; + + per_cpu(pit_timers, cpu) = pit; + + return 0; +} + +static void pit_clockevent_per_cpu_exit(struct pit_timer *pit, unsigned int cpu) +{ + pit_timer_disable(pit->clkevt_base); + free_irq(pit->ced.irq, &pit->ced); + per_cpu(pit_timers, cpu) = NULL; +} + +static int pit_clockevent_starting_cpu(unsigned int cpu) +{ + struct pit_timer *pit = per_cpu(pit_timers, cpu); + int ret; + + if (!pit) + return 0; + + ret = irq_force_affinity(pit->ced.irq, cpumask_of(cpu)); + if (ret) { + pit_clockevent_per_cpu_exit(pit, cpu); + return ret; + } + + /* + * The value for the LDVAL register trigger is calculated as: + * LDVAL trigger = (period / clock period) - 1 + * The PIT is a 32-bit down-count timer; when the counter value + * reaches 0, it generates an interrupt, thus the minimal + * LDVAL trigger value is 1. The min_delta is therefore the + * minimal LDVAL trigger value + 1, and the max_delta is the full + * 32-bit range. + */ + clockevents_config_and_register(&pit->ced, pit->rate, 2, 0xffffffff); + + return 0; +} + +static int pit_timer_init(struct device_node *np) +{ + struct pit_timer *pit; + struct clk *pit_clk; + void __iomem *timer_base; + const char *name = of_node_full_name(np); + unsigned long clk_rate; + int irq, ret; + + pit = kzalloc(sizeof(*pit), GFP_KERNEL); + if (!pit) + return -ENOMEM; + + ret = -ENXIO; + timer_base = of_iomap(np, 0); + if (!timer_base) { + pr_err("Failed to iomap\n"); + goto out_kfree; + } + + ret = -EINVAL; + irq = irq_of_parse_and_map(np, 0); + if (irq <= 0) { + pr_err("Failed to irq_of_parse_and_map\n"); + goto out_iounmap; + } + + pit_clk = of_clk_get(np, 0); + if (IS_ERR(pit_clk)) { + ret = PTR_ERR(pit_clk); + goto out_irq_dispose_mapping; + } + + ret = clk_prepare_enable(pit_clk); + if (ret) + goto out_clk_put; + + clk_rate = clk_get_rate(pit_clk); + + pit_module_disable(timer_base); + + ret = pit_clocksource_init(pit, name, timer_base, clk_rate); + if (ret) { + pr_err("Failed to initialize clocksource '%pOF'\n", np); + goto out_pit_module_disable; + } + + ret = pit_clockevent_per_cpu_init(pit, name, timer_base, clk_rate, irq, pit_instances); + if (ret) { + pr_err("Failed to initialize clockevent '%pOF'\n", np); + goto out_pit_clocksource_unregister; + } + + /* enable the pit module */ + pit_module_enable(timer_base); + + pit_instances++; + + if (pit_instances == max_pit_instances) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "PIT timer:starting", + pit_clockevent_starting_cpu, NULL); + if (ret < 0) + goto out_pit_clocksource_unregister; + } + + return 0; + +out_pit_clocksource_unregister: + clocksource_unregister(&pit->cs); +out_pit_module_disable: + pit_module_disable(timer_base); + clk_disable_unprepare(pit_clk); +out_clk_put: + clk_put(pit_clk); +out_irq_dispose_mapping: + irq_dispose_mapping(irq); +out_iounmap: + iounmap(timer_base); +out_kfree: + kfree(pit); + + return ret; +}
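For readers tracing the LDVAL arithmetic in the comment above, here is a minimal standalone sketch; the helper and the 133 MHz clock rate are illustrative assumptions, not part of the driver:

	/*
	 * LDVAL trigger = (period / clock period) - 1, as described above.
	 * Example (assumed rate): with a 133 MHz PIT clock and HZ = 100, a
	 * 10 ms periodic tick programs (133000000 / 100) - 1 = 1329999 into
	 * PITLDVAL, matching pit_set_periodic()'s pit->rate / HZ delta with
	 * the "delta - 1" adjustment done in pit_set_next_event().
	 */
	static inline u32 pit_ldval_for_tick(unsigned long rate, unsigned int hz)
	{
		return (rate / hz) - 1;
	}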
+static int pit_timer_probe(struct platform_device *pdev) +{ + const struct pit_timer_data *pit_timer_data; + + pit_timer_data = of_device_get_match_data(&pdev->dev); + if (pit_timer_data) + max_pit_instances = pit_timer_data->max_pit_instances; + + return pit_timer_init(pdev->dev.of_node); +} + +static struct pit_timer_data s32g2_data = { .max_pit_instances = 2 }; + +static const struct of_device_id pit_timer_of_match[] = { + { .compatible = "nxp,s32g2-pit", .data = &s32g2_data }, + { } +}; +MODULE_DEVICE_TABLE(of, pit_timer_of_match); + +static struct platform_driver nxp_pit_driver = { + .driver = { + .name = "nxp-pit", + .of_match_table = pit_timer_of_match, + }, + .probe = pit_timer_probe, +}; +module_platform_driver(nxp_pit_driver); + +TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/clocksource/timer-nxp-stm.c b/drivers/clocksource/timer-nxp-stm.c index d7ccf900172983..bbc40623728fa5 100644 --- a/drivers/clocksource/timer-nxp-stm.c +++ b/drivers/clocksource/timer-nxp-stm.c @@ -201,6 +201,7 @@ static int __init nxp_stm_clocksource_init(struct device *dev, struct stm_timer stm_timer->cs.resume = nxp_stm_clocksource_resume; stm_timer->cs.mask = CLOCKSOURCE_MASK(32); stm_timer->cs.flags = CLOCK_SOURCE_IS_CONTINUOUS; + stm_timer->cs.owner = THIS_MODULE; ret = clocksource_register_hz(&stm_timer->cs, stm_timer->rate); if (ret) @@ -314,6 +315,7 @@ static int __init nxp_stm_clockevent_per_cpu_init(struct device *dev, struct stm stm_timer->ced.cpumask = cpumask_of(cpu); stm_timer->ced.rating = 460; stm_timer->ced.irq = irq; + stm_timer->ced.owner = THIS_MODULE; per_cpu(stm_timers, cpu) = stm_timer; diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c index 8a3068b36e7529..6113d2fdd4de19 100644 --- a/drivers/clocksource/timer-rtl-otto.c +++ b/drivers/clocksource/timer-rtl-otto.c @@ -38,14 +38,13 @@ #define RTTM_BIT_COUNT 28 #define RTTM_MIN_DELTA 8 #define RTTM_MAX_DELTA CLOCKSOURCE_MASK(28) +#define RTTM_MAX_DIVISOR GENMASK(15, 0) /* - * Timers are derived from the LXB clock frequency. Usually this is a fixed - * multiple of the 25 MHz oscillator. The 930X SOC is an exception from that. - * Its LXB clock has only dividers and uses the switch PLL of 2.45 GHz as its - * base. The only meaningful frequencies we can achieve from that are 175.000 - * MHz and 153.125 MHz. The greatest common divisor of all explained possible - * speeds is 3125000. Pin the timers to this 3.125 MHz reference frequency. + * Timers are derived from the Lexra bus (LXB) clock frequency. This is 175 MHz + * on RTL930x and 200 MHz on the other platforms. Pin the timers to 3.125 MHz, + * a common divisor of both rates that provides enough range and resolution + * and keeps results comparable across the different platforms. */ #define RTTM_TICKS_PER_SEC 3125000 @@ -55,11 +54,6 @@ struct rttm_cs { }; /* Simple internal register functions */ -static inline void rttm_set_counter(void __iomem *base, unsigned int counter) -{ - iowrite32(counter, base + RTTM_CNT); -} - static inline unsigned int rttm_get_counter(void __iomem *base) { return ioread32(base + RTTM_CNT); @@ -112,6 +106,22 @@ static irqreturn_t rttm_timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static void rttm_bounce_timer(void __iomem *base, u32 mode) +{ + /* + * When a running timer has less than ~5us left, a stop/start sequence + * might fail. While the details are unknown, the most evident effect is + * that the subsequent interrupt will not be fired.
+ * + * As a workaround, issue an intermediate restart with a very slow + * frequency of ~3 kHz, keeping the target counter (>= 8), so the + * follow-up restart is always issued outside the critical window. + */ + + rttm_disable_timer(base); + rttm_enable_timer(base, mode, RTTM_MAX_DIVISOR); +} + static void rttm_stop_timer(void __iomem *base) { rttm_disable_timer(base); @@ -120,7 +130,6 @@ static void rttm_start_timer(struct timer_of *to, u32 mode) { - rttm_set_counter(to->of_base.base, 0); rttm_enable_timer(to->of_base.base, mode, to->of_clk.rate / RTTM_TICKS_PER_SEC); } @@ -129,7 +138,8 @@ static int rttm_next_event(unsigned long delta, struct clock_event_device *clkev struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, delta); rttm_start_timer(to, RTTM_CTRL_COUNTER); @@ -141,7 +151,8 @@ static int rttm_state_oneshot(struct clock_event_device *clkevt) struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_COUNTER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); rttm_start_timer(to, RTTM_CTRL_COUNTER); @@ -153,7 +164,8 @@ static int rttm_state_periodic(struct clock_event_device *clkevt) struct timer_of *to = to_timer_of(clkevt); RTTM_DEBUG(to->of_base.base); - rttm_stop_timer(to->of_base.base); + rttm_bounce_timer(to->of_base.base, RTTM_CTRL_TIMER); + rttm_disable_timer(to->of_base.base); rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); rttm_start_timer(to, RTTM_CTRL_TIMER); diff --git a/drivers/clocksource/timer-stm32-lp.c b/drivers/clocksource/timer-stm32-lp.c index 6e7944ffd7c032..c2a699f5c1dd72 100644 --- a/drivers/clocksource/timer-stm32-lp.c +++ b/drivers/clocksource/timer-stm32-lp.c @@ -211,6 +211,7 @@ static void stm32_clkevent_lp_init(struct stm32_lp_private *priv, priv->clkevt.rating = STM32_LP_RATING; priv->clkevt.suspend = stm32_clkevent_lp_suspend; priv->clkevt.resume = stm32_clkevent_lp_resume; + priv->clkevt.owner = THIS_MODULE; clockevents_config_and_register(&priv->clkevt, rate, 0x1, STM32_LPTIM_MAX_ARR); diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 6b48a9006444dd..f827d3f98f60e6 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -185,6 +185,7 @@ static int sun5i_setup_clocksource(struct platform_device *pdev, cs->clksrc.read = sun5i_clksrc_read; cs->clksrc.mask = CLOCKSOURCE_MASK(32); cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + cs->clksrc.owner = THIS_MODULE; ret = clocksource_register_hz(&cs->clksrc, rate); if (ret) { @@ -214,6 +215,7 @@ static int sun5i_setup_clockevent(struct platform_device *pdev, ce->clkevt.rating = 340; ce->clkevt.irq = irq; ce->clkevt.cpumask = cpu_possible_mask; + ce->clkevt.owner = THIS_MODULE; /* Enable timer0 interrupt */ val = readl(base + TIMER_IRQ_EN_REG); diff --git a/drivers/clocksource/timer-tegra186.c b/drivers/clocksource/timer-tegra186.c index e5394f98a02e66..355558893e5f3c 100644 --- a/drivers/clocksource/timer-tegra186.c +++ b/drivers/clocksource/timer-tegra186.c @@ -159,7 +159,7 @@ static void tegra186_wdt_enable(struct tegra186_wdt *wdt) tmr_writel(wdt->tmr, TMRCSSR_SRC_USEC, TMRCSSR); /* configure timer (system reset happens on the fifth
expiration) */ - value = TMRCR_PTV(wdt->base.timeout * USEC_PER_SEC / 5) | + value = TMRCR_PTV(wdt->base.timeout * (USEC_PER_SEC / 5)) | TMRCR_PERIODIC | TMRCR_ENABLE; tmr_writel(wdt->tmr, value, TMRCR); @@ -231,7 +231,7 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) { struct tegra186_wdt *wdt = to_tegra186_wdt(wdd); u32 expiration, val; - u64 timeleft; + u32 timeleft; if (!watchdog_active(&wdt->base)) { /* return zero if the watchdog timer is not activated. */ @@ -266,21 +266,26 @@ static unsigned int tegra186_wdt_get_timeleft(struct watchdog_device *wdd) * Calculate the time remaining by adding the time for the * counter value to the time of the counter expirations that * remain. + * Note: Since wdt->base.timeout is bound to 255, the maximum + * value added to timeleft is + * 255 * (1,000,000 / 5) * 4 + * = 255 * 200,000 * 4 + * = 204,000,000 + * TMRSR_PCV is a 29-bit field. + * Its maximum value is 0x1fffffff = 536,870,911. + * 204,000,000 + 536,870,911 = 740,870,911 = 0x2C28CAFF. + * timeleft can therefore not overflow, and 64-bit calculations + * are not necessary. */ - timeleft += (((u64)wdt->base.timeout * USEC_PER_SEC) / 5) * (4 - expiration); + timeleft += (wdt->base.timeout * (USEC_PER_SEC / 5)) * (4 - expiration); /* * Convert the current counter value to seconds, - * rounding up to the nearest second. Cast u64 to - * u32 under the assumption that no overflow happens - * when coverting to seconds. + * rounding to the nearest second. */ - timeleft = DIV_ROUND_CLOSEST_ULL(timeleft, USEC_PER_SEC); + timeleft = DIV_ROUND_CLOSEST(timeleft, USEC_PER_SEC); - if (WARN_ON_ONCE(timeleft > U32_MAX)) - return U32_MAX; - - return lower_32_bits(timeleft); + return timeleft; } static const struct watchdog_ops tegra186_wdt_ops = { @@ -328,16 +333,12 @@ static struct tegra186_wdt *tegra186_wdt_create(struct tegra186_timer *tegra, wdt->base.parent = tegra->dev; err = watchdog_init_timeout(&wdt->base, 5, tegra->dev); - if (err < 0) { - dev_err(tegra->dev, "failed to initialize timeout: %d\n", err); + if (err < 0) return ERR_PTR(err); - } err = devm_watchdog_register_device(tegra->dev, &wdt->base); - if (err < 0) { - dev_err(tegra->dev, "failed to register WDT: %d\n", err); + if (err < 0) return ERR_PTR(err); - } return wdt; } @@ -373,6 +374,7 @@ static int tegra186_timer_tsc_init(struct tegra186_timer *tegra) tegra->tsc.read = tegra186_timer_tsc_read; tegra->tsc.mask = CLOCKSOURCE_MASK(56); tegra->tsc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->tsc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->tsc, 31250000); } @@ -392,6 +394,7 @@ static int tegra186_timer_osc_init(struct tegra186_timer *tegra) tegra->osc.read = tegra186_timer_osc_read; tegra->osc.mask = CLOCKSOURCE_MASK(32); tegra->osc.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->osc.owner = THIS_MODULE; return clocksource_register_hz(&tegra->osc, 38400000); } @@ -411,6 +414,7 @@ static int tegra186_timer_usec_init(struct tegra186_timer *tegra) tegra->usec.read = tegra186_timer_usec_read; tegra->usec.mask = CLOCKSOURCE_MASK(32); tegra->usec.flags = CLOCK_SOURCE_IS_CONTINUOUS; + tegra->usec.owner = THIS_MODULE; return clocksource_register_hz(&tegra->usec, USEC_PER_SEC); } diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index e9e32df6b56664..793e7cdcb1b16b 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -31,6 +31,7 @@ #include <linux/platform_data/dmtimer-omap.h> #include <clocksource/timer-ti-dm.h> +#include <linux/delay.h> /* * timer errata flags */ @@ -836,6 +837,48 @@ static int
omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, return 0; } +static int omap_dm_timer_set_cap(struct omap_dm_timer *cookie, + int autoreload, bool config_period) +{ + struct dmtimer *timer; + struct device *dev; + int rc; + u32 l; + + timer = to_dmtimer(cookie); + if (unlikely(!timer)) + return -EINVAL; + + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + /* + * 1. Select autoreload mode. TIMER_TCLR[1] AR bit. + * 2. TIMER_TCLR[14]: Sets the functionality of the TIMER IO pin. + * 3. TIMER_TCLR[13]: Capture mode select bit. + * 4. TIMER_TCLR[9-8]: Select transition capture mode. + */ + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + + if (autoreload) + l |= OMAP_TIMER_CTRL_AR; + + l |= OMAP_TIMER_CTRL_CAPTMODE | OMAP_TIMER_CTRL_GPOCFG; + + if (config_period) + l |= OMAP_TIMER_CTRL_TCM_LOWTOHIGH; /* Time Period config */ + else + l |= OMAP_TIMER_CTRL_TCM_BOTHEDGES; /* Duty Cycle config */ + + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); + + return 0; +} + static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { @@ -1023,23 +1066,92 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) return __omap_dm_timer_read_counter(timer); } +static inline unsigned int __omap_dm_timer_cap(struct dmtimer *timer, int idx) +{ + return idx == 0 ? dmtimer_read(timer, OMAP_TIMER_CAPTURE_REG) : + dmtimer_read(timer, OMAP_TIMER_CAPTURE2_REG); +} + static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { struct dmtimer *timer; + struct device *dev; timer = to_dmtimer(cookie); - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); + if (unlikely(!timer)) { + pr_err("%s: timer not available.\n", __func__); return -EINVAL; } + dev = &timer->pdev->dev; + + pm_runtime_resume_and_get(dev); dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); + pm_runtime_put_sync(dev); /* Save the context */ timer->context.tcrr = value; return 0; } +/** + * omap_dm_timer_cap_counter() - Calculate the high count or period count depending on the + * configuration. + * @cookie: Pointer to OMAP DM timer + * @is_period: Whether to configure the timer in period or duty cycle mode + * + * Return: the high count or period count if the timer is enabled, else an appropriate error. + */ +static unsigned int omap_dm_timer_cap_counter(struct omap_dm_timer *cookie, bool is_period) +{ + struct dmtimer *timer; + unsigned int cap1 = 0; + unsigned int cap2 = 0; + u32 l, ret; + + timer = to_dmtimer(cookie); + if (unlikely(!timer || !atomic_read(&timer->enabled))) { + pr_err("%s: timer is not available or enabled: %p\n", __func__, (void *)timer); + return -EINVAL; + } + + /* Stop the timer */ + omap_dm_timer_stop(cookie); + + /* Clear the timer counter value to 0 */ + ret = omap_dm_timer_write_counter(cookie, 0); + if (ret) + return ret; + + /* Set the timer capture configuration for period/duty cycle calculation */ + ret = omap_dm_timer_set_cap(cookie, true, is_period); + if (ret) { + pr_err("%s: Failed to set timer capture configuration.\n", __func__); + return ret; + } + /* Start the timer */ + omap_dm_timer_start(cookie); + + /* + * A 1 sec delay is given to provide + * enough time to capture low-frequency signals. + */ + msleep(1000); + + cap1 = __omap_dm_timer_cap(timer, 0); + cap2 = __omap_dm_timer_cap(timer, 1); + + /* + * Clears the TCLR configuration.
+ * The start bit must be set to 1 as the timer is already in start mode. + */ + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + l &= ~(0xffff) | 0x1; + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + return cap2 - cap1; +} + static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { struct dmtimer *timer = dev_get_drvdata(dev); @@ -1246,6 +1358,9 @@ static const struct omap_dm_timer_ops dmtimer_ops = { .write_counter = omap_dm_timer_write_counter, .read_status = omap_dm_timer_read_status, .write_status = omap_dm_timer_write_status, + .set_cap = omap_dm_timer_set_cap, + .get_cap_status = omap_dm_timer_get_pwm_status, + .read_cap = omap_dm_timer_cap_counter, }; static const struct dmtimer_platform_data omap3plus_pdata = { diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c deleted file mode 100644 index 911c92146eca6d..00000000000000 --- a/drivers/clocksource/timer-vf-pit.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2012-2013 Freescale Semiconductor, Inc. - */ - -#include <linux/interrupt.h> -#include <linux/clockchips.h> -#include <linux/clk.h> -#include <linux/of_address.h> -#include <linux/of_irq.h> -#include <linux/sched_clock.h> - -/* - * Each pit takes 0x10 Bytes register space - */ -#define PITMCR 0x00 -#define PIT0_OFFSET 0x100 -#define PITn_OFFSET(n) (PIT0_OFFSET + 0x10 * (n)) -#define PITLDVAL 0x00 -#define PITCVAL 0x04 -#define PITTCTRL 0x08 -#define PITTFLG 0x0c - -#define PITMCR_MDIS (0x1 << 1) - -#define PITTCTRL_TEN (0x1 << 0) -#define PITTCTRL_TIE (0x1 << 1) -#define PITCTRL_CHN (0x1 << 2) - -#define PITTFLG_TIF 0x1 - -static void __iomem *clksrc_base; -static void __iomem *clkevt_base; -static unsigned long cycle_per_jiffy; - -static inline void pit_timer_enable(void) -{ - __raw_writel(PITTCTRL_TEN | PITTCTRL_TIE, clkevt_base + PITTCTRL); -} - -static inline void pit_timer_disable(void) -{ - __raw_writel(0, clkevt_base + PITTCTRL); -} - -static inline void pit_irq_acknowledge(void) -{ - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); -} - -static u64 notrace pit_read_sched_clock(void) -{ - return ~__raw_readl(clksrc_base + PITCVAL); -} - -static int __init pit_clocksource_init(unsigned long rate) -{ - /* set the max load value and start the clock source counter */ - __raw_writel(0, clksrc_base + PITTCTRL); - __raw_writel(~0UL, clksrc_base + PITLDVAL); - __raw_writel(PITTCTRL_TEN, clksrc_base + PITTCTRL); - - sched_clock_register(pit_read_sched_clock, 32, rate); - return clocksource_mmio_init(clksrc_base + PITCVAL, "vf-pit", rate, - 300, 32, clocksource_mmio_readl_down); -} - -static int pit_set_next_event(unsigned long delta, - struct clock_event_device *unused) -{ - /* - * set a new value to PITLDVAL register will not restart the timer, - * to abort the current cycle and start a timer period with the new - * value, the timer must be disabled and enabled again. - * and the PITLAVAL should be set to delta minus one according to pit - * hardware requirement.
- */ - pit_timer_disable(); - __raw_writel(delta - 1, clkevt_base + PITLDVAL); - pit_timer_enable(); - - return 0; -} - -static int pit_shutdown(struct clock_event_device *evt) -{ - pit_timer_disable(); - return 0; -} - -static int pit_set_periodic(struct clock_event_device *evt) -{ - pit_set_next_event(cycle_per_jiffy, evt); - return 0; -} - -static irqreturn_t pit_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = dev_id; - - pit_irq_acknowledge(); - - /* - * pit hardware doesn't support oneshot, it will generate an interrupt - * and reload the counter value from PITLDVAL when PITCVAL reach zero, - * and start the counter again. So software need to disable the timer - * to stop the counter loop in ONESHOT mode. - */ - if (likely(clockevent_state_oneshot(evt))) - pit_timer_disable(); - - evt->event_handler(evt); - - return IRQ_HANDLED; -} - -static struct clock_event_device clockevent_pit = { - .name = "VF pit timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_state_shutdown = pit_shutdown, - .set_state_periodic = pit_set_periodic, - .set_next_event = pit_set_next_event, - .rating = 300, -}; - -static int __init pit_clockevent_init(unsigned long rate, int irq) -{ - __raw_writel(0, clkevt_base + PITTCTRL); - __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG); - - BUG_ON(request_irq(irq, pit_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL, - "VF pit timer", &clockevent_pit)); - - clockevent_pit.cpumask = cpumask_of(0); - clockevent_pit.irq = irq; - /* - * The value for the LDVAL register trigger is calculated as: - * LDVAL trigger = (period / clock period) - 1 - * The pit is a 32-bit down count timer, when the counter value - * reaches 0, it will generate an interrupt, thus the minimal - * LDVAL trigger value is 1. And then the min_delta is - * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. - */ - clockevents_config_and_register(&clockevent_pit, rate, 2, 0xffffffff); - - return 0; -} - -static int __init pit_timer_init(struct device_node *np) -{ - struct clk *pit_clk; - void __iomem *timer_base; - unsigned long clk_rate; - int irq, ret; - - timer_base = of_iomap(np, 0); - if (!timer_base) { - pr_err("Failed to iomap\n"); - return -ENXIO; - } - - /* - * PIT0 and PIT1 can be chained to build a 64-bit timer, - * so choose PIT2 as clocksource, PIT3 as clockevent device, - * and leave PIT0 and PIT1 unused for anyone else who needs them. 
- */ - clksrc_base = timer_base + PITn_OFFSET(2); - clkevt_base = timer_base + PITn_OFFSET(3); - - irq = irq_of_parse_and_map(np, 0); - if (irq <= 0) - return -EINVAL; - - pit_clk = of_clk_get(np, 0); - if (IS_ERR(pit_clk)) - return PTR_ERR(pit_clk); - - ret = clk_prepare_enable(pit_clk); - if (ret) - return ret; - - clk_rate = clk_get_rate(pit_clk); - cycle_per_jiffy = clk_rate / (HZ); - - /* enable the pit module */ - __raw_writel(~PITMCR_MDIS, timer_base + PITMCR); - - ret = pit_clocksource_init(clk_rate); - if (ret) - return ret; - - return pit_clockevent_init(clk_rate, irq); -} -TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 4f7f9201598dc4..083d8369a59121 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -318,7 +318,6 @@ static u32 drv_read(struct acpi_cpufreq_data *data, const struct cpumask *mask) return cmd.val; } -/* Called via smp_call_function_many(), on the target CPUs */ static void do_drv_write(void *_cmd) { struct drv_cmd *cmd = _cmd; @@ -335,14 +334,8 @@ static void drv_write(struct acpi_cpufreq_data *data, .val = val, .func.write = data->cpu_freq_write, }; - int this_cpu; - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, mask)) - do_drv_write(&cmd); - - smp_call_function_many(mask, do_drv_write, &cmd, 1); - put_cpu(); + on_each_cpu_mask(mask, do_drv_write, &cmd, true); } static u32 get_cur_val(const struct cpumask *mask, struct acpi_cpufreq_data *data) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b4c79fde1979b6..298e92d8cc0315 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -872,10 +872,10 @@ static void amd_pstate_update_limits(struct cpufreq_policy *policy) */ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) { - u32 transition_delay_ns; + int transition_delay_ns; transition_delay_ns = cppc_get_transition_latency(cpu); - if (transition_delay_ns == CPUFREQ_ETERNAL) { + if (transition_delay_ns < 0) { if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC)) return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; else @@ -891,10 +891,10 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) */ static u32 amd_pstate_get_transition_latency(unsigned int cpu) { - u32 transition_latency; + int transition_latency; transition_latency = cppc_get_transition_latency(cpu); - if (transition_latency == CPUFREQ_ETERNAL) + if (transition_latency < 0) return AMD_PSTATE_TRANSITION_LATENCY; return transition_latency; diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 12de0ac7bbaff0..e23d9abea13592 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -308,6 +308,16 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) return 0; } +static unsigned int __cppc_cpufreq_get_transition_delay_us(unsigned int cpu) +{ + int transition_latency_ns = cppc_get_transition_latency(cpu); + + if (transition_latency_ns < 0) + return CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS / NSEC_PER_USEC; + + return transition_latency_ns / NSEC_PER_USEC; +} + /* * The PCC subspace describes the rate at which platform can accept commands * on the shared PCC channel (including READs which do not count towards freq @@ -330,12 +340,12 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return 10000; } } - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; + return __cppc_cpufreq_get_transition_delay_us(cpu); } #else 
static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; + return __cppc_cpufreq_get_transition_delay_us(cpu); } #endif diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 506437489b4db2..7d5079fd168825 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -104,7 +104,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev); if (!transition_latency) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cpumask_copy(policy->cpus, priv->cpus); policy->driver_data = priv; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 30e8e9b3c12fc0..852e024facc3cb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2911,6 +2911,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) return -EPROBE_DEFER; if (!driver_data || !driver_data->verify || !driver_data->init || + (driver_data->target_index && driver_data->target) || (!!driver_data->setpolicy == (driver_data->target_index || driver_data->target)) || (!driver_data->get_intermediate != !driver_data->target_intermediate) || (!driver_data->online != !driver_data->offline) || @@ -2940,6 +2941,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_null_driver; } + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("cpufreq: supports frequency invariance\n"); + } + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; @@ -2961,21 +2971,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). 
- */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - pr_debug("driver %s up and running\n", driver_data->name); goto out; err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: + if (!cpufreq_driver->setpolicy) + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index d5111ee56e3803..7f251daf03ce32 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -33,16 +33,16 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy) struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int min_freq = ~0; unsigned int max_freq = 0; - unsigned int freq; + unsigned int freq, i; - cpufreq_for_each_valid_entry(pos, table) { + cpufreq_for_each_valid_entry_idx(pos, table, i) { freq = pos->frequency; if ((!cpufreq_boost_enabled() || !policy->boost_enabled) && (pos->flags & CPUFREQ_BOOST_FREQ)) continue; - pr_debug("table entry %u: %u kHz\n", (int)(pos - table), freq); + pr_debug("table entry %u: %u kHz\n", i, freq); if (freq < min_freq) min_freq = freq; if (freq > max_freq) @@ -126,7 +126,7 @@ int cpufreq_table_index_unsorted(struct cpufreq_policy *policy, }; struct cpufreq_frequency_table *pos; struct cpufreq_frequency_table *table = policy->freq_table; - unsigned int freq, diff, i = 0; + unsigned int freq, diff, i; int index; pr_debug("request for target %u kHz (relation: %u) for cpu %u\n", diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index db1c88e9d3f9cd..e93697d3edfd9b 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -442,7 +442,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) } if (of_property_read_u32(np, "clock-latency", &transition_latency)) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /* * Calculate the ramp time for max voltage change in the diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2519eb52746805..38897bb14a2c6d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -897,11 +897,19 @@ static ssize_t show_base_frequency(struct cpufreq_policy *policy, char *buf) cpufreq_freq_attr_ro(base_frequency); +enum hwp_cpufreq_attr_index { + HWP_BASE_FREQUENCY_INDEX = 0, + HWP_PERFORMANCE_PREFERENCE_INDEX, + HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX, + HWP_CPUFREQ_ATTR_COUNT, +}; + static struct freq_attr *hwp_cpufreq_attrs[] = { - &energy_performance_preference, - &energy_performance_available_preferences, - &base_frequency, - NULL, + [HWP_BASE_FREQUENCY_INDEX] = &base_frequency, + [HWP_PERFORMANCE_PREFERENCE_INDEX] = &energy_performance_preference, + [HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = + &energy_performance_available_preferences, + [HWP_CPUFREQ_ATTR_COUNT] = NULL, }; static bool no_cas __ro_after_init; @@ -1370,6 +1378,9 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu) #define POWER_CTL_EE_ENABLE 1 #define POWER_CTL_EE_DISABLE 2 +/* Enable bit for Dynamic Efficiency Control (DEC) */ +#define POWER_CTL_DEC_ENABLE 27 + static int power_ctl_ee_state; static void set_power_ctl_ee_state(bool input) @@ -2531,7 +2542,7 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) * that sample.time will always be 
reset before setting the utilization * update hook and make the caller skip the sample then. */ - if (cpu->last_sample_time) { + if (likely(cpu->last_sample_time)) { intel_pstate_calc_avg_perf(cpu); return true; } @@ -3758,6 +3769,26 @@ static const struct x86_cpu_id intel_hybrid_scaling_factor[] = { {} }; +static bool hwp_check_epp(void) +{ + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) + return true; + + /* Without EPP support, don't expose EPP-related sysfs attributes. */ + hwp_cpufreq_attrs[HWP_PERFORMANCE_PREFERENCE_INDEX] = NULL; + hwp_cpufreq_attrs[HWP_PERFORMANCE_AVAILABLE_PREFERENCES_INDEX] = NULL; + + return false; +} + +static bool hwp_check_dec(void) +{ + u64 power_ctl; + + rdmsrq(MSR_IA32_POWER_CTL, power_ctl); + return !!(power_ctl & BIT(POWER_CTL_DEC_ENABLE)); +} + static int __init intel_pstate_init(void) { static struct cpudata **_all_cpu_data; @@ -3778,23 +3809,32 @@ static int __init intel_pstate_init(void) id = x86_match_cpu(hwp_support_ids); if (id) { - hwp_forced = intel_pstate_hwp_is_enabled(); + bool epp_present = hwp_check_epp(); - if (hwp_forced) + /* + * If HWP is enabled already, there is no choice but to deal + * with it. + */ + hwp_forced = intel_pstate_hwp_is_enabled(); + if (hwp_forced) { pr_info("HWP enabled by BIOS\n"); - else if (no_load) + no_hwp = 0; + } else if (no_load) { return -ENODEV; + } else if (!epp_present && !hwp_check_dec()) { + /* + * Avoid enabling HWP for processors without EPP support + * unless the Dynamic Efficiency Control (DEC) enable + * bit (MSR_IA32_POWER_CTL, bit 27) is set because that + * means incomplete HWP implementation which is a corner + * case and supporting it is generally problematic. + */ + no_hwp = 1; + } copy_cpu_funcs(&core_funcs); - /* - * Avoid enabling HWP for processors without EPP support, - * because that means incomplete HWP implementation which is a - * corner case and supporting it is generally problematic. - * - * If HWP is enabled already, though, there is no choice but to - * deal with it. 
- */ - if ((!no_hwp && boot_cpu_has(X86_FEATURE_HWP_EPP)) || hwp_forced) { + + if (!no_hwp) { hwp_active = true; hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index fce5aa5ceea033..ae4500ab48913d 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -309,7 +309,7 @@ static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) latency = readl_relaxed(data->reg_bases[REG_FREQ_LATENCY]) * 1000; if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = true; diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 00de1166188ab7..5d50a231f94441 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -403,9 +403,11 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) } info->cpu_clk = clk_get(cpu_dev, "cpu"); - if (IS_ERR(info->cpu_clk)) - return dev_err_probe(cpu_dev, PTR_ERR(info->cpu_clk), - "cpu%d: failed to get cpu clk\n", cpu); + if (IS_ERR(info->cpu_clk)) { + ret = PTR_ERR(info->cpu_clk); + dev_err_probe(cpu_dev, ret, "cpu%d: failed to get cpu clk\n", cpu); + goto out_put_cci_dev; + } info->inter_clk = clk_get(cpu_dev, "intermediate"); if (IS_ERR(info->inter_clk)) { @@ -551,6 +553,10 @@ static int mtk_cpu_dvfs_info_init(struct mtk_cpu_dvfs_info *info, int cpu) out_free_mux_clock: clk_put(info->cpu_clk); +out_put_cci_dev: + if (info->soc_data->ccifreq_supported) + put_device(info->cci_dev); + return ret; } @@ -568,6 +574,8 @@ static void mtk_cpu_dvfs_info_release(struct mtk_cpu_dvfs_info *info) clk_put(info->inter_clk); dev_pm_opp_of_cpumask_remove_table(&info->cpus); dev_pm_opp_unregister_notifier(info->cpu_dev, &info->opp_nb); + if (info->soc_data->ccifreq_supported) + put_device(info->cci_dev); } static int mtk_cpufreq_init(struct cpufreq_policy *policy) diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 7e1fbf9a091f74..53923b8ef7a140 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -28,15 +28,11 @@ fn find_supply_name_exact(dev: &Device, name: &str) -> Option { /// Finds supply name for the CPU from DT. fn find_supply_names(dev: &Device, cpu: cpu::CpuId) -> Option> { // Try "cpu0" for older DTs, fallback to "cpu". - let name = (cpu.as_u32() == 0) + (cpu.as_u32() == 0) .then(|| find_supply_name_exact(dev, "cpu0")) .flatten() - .or_else(|| find_supply_name_exact(dev, "cpu"))?; - - let mut list = KVec::with_capacity(1, GFP_KERNEL).ok()?; - list.push(name, GFP_KERNEL).ok()?; - - Some(list) + .or_else(|| find_supply_name_exact(dev, "cpu")) + .and_then(|name| kernel::kvec![name].ok()) } /// Represents the cpufreq dt device. 
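The CPUFREQ_ETERNAL replacements in the mediatek hunks above (and in the other cpufreq drivers in this series) all reduce to the same fallback shape; a condensed C sketch follows, with a hypothetical helper name, purely for illustration:

	/*
	 * Illustrative only: a platform-reported transition latency of 0
	 * means "unknown", so fall back to the new default instead of the
	 * old CPUFREQ_ETERNAL sentinel, giving governors a sane rate limit.
	 */
	static unsigned int latency_or_default(unsigned int reported_ns)
	{
		return reported_ns ? reported_ns : CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS;
	}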
@@ -123,7 +119,7 @@ impl cpufreq::Driver for CPUFreqDTDriver { let mut transition_latency = opp_table.max_transition_latency_ns() as u32; if transition_latency == 0 { - transition_latency = cpufreq::ETERNAL_LATENCY_NS; + transition_latency = cpufreq::DEFAULT_TRANSITION_LATENCY_NS; } policy diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 38c165d526d144..d2a110079f5fd5 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -294,7 +294,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) latency = perf_ops->transition_latency_get(ph, domain); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index dcbb0ae7dd476c..e530345baddf6a 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -157,7 +157,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) latency = scpi_ops->get_transition_latency(cpu_dev); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 707c71090cc322..2a1550e1aa21fc 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -182,7 +182,7 @@ static int spear_cpufreq_probe(struct platform_device *pdev) if (of_property_read_u32(np, "clock-latency", &spear_cpufreq.transition_latency)) - spear_cpufreq.transition_latency = CPUFREQ_ETERNAL; + spear_cpufreq.transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cnt = of_property_count_u32_elems(np, "cpufreq_tbl"); if (cnt <= 0) { diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 4270686fc3e3eb..136ab102f636aa 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -93,10 +93,14 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); struct cpufreq_frequency_table *tbl = policy->freq_table + index; - unsigned int edvd_offset = data->cpus[policy->cpu].edvd_offset; + unsigned int edvd_offset; u32 edvd_val = tbl->driver_data; + u32 cpu; - writel(edvd_val, data->regs + edvd_offset); + for_each_cpu(cpu, policy->cpus) { + edvd_offset = data->cpus[cpu].edvd_offset; + writel(edvd_val, data->regs + edvd_offset); + } return 0; } @@ -132,13 +136,14 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { static struct cpufreq_frequency_table *init_vhint_table( struct platform_device *pdev, struct tegra_bpmp *bpmp, - struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id) + struct tegra186_cpufreq_cluster *cluster, unsigned int cluster_id, + int *num_rates) { struct cpufreq_frequency_table *table; struct mrq_cpu_vhint_request req; struct tegra_bpmp_message msg; struct cpu_vhint_data *data; - int err, i, j, num_rates = 0; + int err, i, j; dma_addr_t phys; void *virt; @@ -168,6 +173,7 @@ static struct cpufreq_frequency_table *init_vhint_table( goto free; } + *num_rates = 0; for (i = data->vfloor; i <= data->vceil; i++) { u16 ndiv = data->ndiv[i]; @@ -178,10 +184,10 @@ static struct cpufreq_frequency_table *init_vhint_table( if (i > 0 && ndiv == data->ndiv[i - 1]) continue; - num_rates++; + (*num_rates)++; } - table = devm_kcalloc(&pdev->dev, num_rates + 1, sizeof(*table), + table = 
devm_kcalloc(&pdev->dev, *num_rates + 1, sizeof(*table), GFP_KERNEL); if (!table) { table = ERR_PTR(-ENOMEM); @@ -223,7 +229,9 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) { struct tegra186_cpufreq_data *data; struct tegra_bpmp *bpmp; - unsigned int i = 0, err; + unsigned int i = 0, err, edvd_offset; + int num_rates = 0; + u32 edvd_val, cpu; data = devm_kzalloc(&pdev->dev, struct_size(data, clusters, TEGRA186_NUM_CLUSTERS), @@ -246,10 +254,21 @@ static int tegra186_cpufreq_probe(struct platform_device *pdev) for (i = 0; i < TEGRA186_NUM_CLUSTERS; i++) { struct tegra186_cpufreq_cluster *cluster = &data->clusters[i]; - cluster->table = init_vhint_table(pdev, bpmp, cluster, i); + cluster->table = init_vhint_table(pdev, bpmp, cluster, i, &num_rates); if (IS_ERR(cluster->table)) { err = PTR_ERR(cluster->table); goto put_bpmp; + } else if (!num_rates) { + err = -EINVAL; + goto put_bpmp; + } + + for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { + if (data->cpus[cpu].bpmp_cluster_id == i) { + edvd_val = cluster->table[num_rates - 1].driver_data; + edvd_offset = data->cpus[cpu].edvd_offset; + writel(edvd_val, data->regs + edvd_offset); + } } } diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0835da449db8b4..56132e843c9919 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -635,8 +635,14 @@ static void __cpuidle_device_init(struct cpuidle_device *dev) static int __cpuidle_register_device(struct cpuidle_device *dev) { struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + unsigned int cpu = dev->cpu; int i, ret; + if (per_cpu(cpuidle_devices, cpu)) { + pr_info("CPU%d: cpuidle device already registered\n", cpu); + return -EEXIST; + } + if (!try_module_get(drv->owner)) return -EINVAL; @@ -648,7 +654,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev) dev->states_usage[i].disable |= CPUIDLE_STATE_DISABLED_BY_USER; } - per_cpu(cpuidle_devices, dev->cpu) = dev; + per_cpu(cpuidle_devices, cpu) = dev; list_add(&dev->device_list, &cpuidle_detected_devices); ret = cpuidle_coupled_register_device(dev); diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index d6f5da61cb7d86..61de6481760471 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -27,14 +27,14 @@ static ssize_t show_available_governors(struct device *dev, mutex_lock(&cpuidle_lock); list_for_each_entry(tmp, &cpuidle_governors, governor_list) { - if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) + if (i >= (ssize_t)(PAGE_SIZE - (CPUIDLE_NAME_LEN + 2))) goto out; - i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name); + i += sysfs_emit_at(buf, i, "%.*s ", CPUIDLE_NAME_LEN, tmp->name); } out: - i+= sprintf(&buf[i], "\n"); + i += sysfs_emit_at(buf, i, "\n"); mutex_unlock(&cpuidle_lock); return i; } @@ -49,9 +49,9 @@ static ssize_t show_current_driver(struct device *dev, spin_lock(&cpuidle_driver_lock); drv = cpuidle_get_driver(); if (drv) - ret = sprintf(buf, "%s\n", drv->name); + ret = sysfs_emit(buf, "%s\n", drv->name); else - ret = sprintf(buf, "none\n"); + ret = sysfs_emit(buf, "none\n"); spin_unlock(&cpuidle_driver_lock); return ret; @@ -65,9 +65,9 @@ static ssize_t show_current_governor(struct device *dev, mutex_lock(&cpuidle_lock); if (cpuidle_curr_governor) - ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + ret = sysfs_emit(buf, "%s\n", cpuidle_curr_governor->name); else - ret = sprintf(buf, "none\n"); + ret = sysfs_emit(buf, "none\n"); mutex_unlock(&cpuidle_lock); return ret; @@ -230,7 +230,7 
@@ static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store) static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, char *buf) \ { \ - return sprintf(buf, "%u\n", state->_name);\ + return sysfs_emit(buf, "%u\n", state->_name);\ } #define define_show_state_ull_function(_name) \ @@ -238,7 +238,7 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", state_usage->_name);\ + return sysfs_emit(buf, "%llu\n", state_usage->_name);\ } #define define_show_state_str_function(_name) \ @@ -247,8 +247,8 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ char *buf) \ { \ if (state->_name[0] == '\0')\ - return sprintf(buf, "\n");\ - return sprintf(buf, "%s\n", state->_name);\ + return sysfs_emit(buf, "\n");\ + return sysfs_emit(buf, "%s\n", state->_name);\ } #define define_show_state_time_function(_name) \ @@ -256,7 +256,7 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ + return sysfs_emit(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ } define_show_state_time_function(exit_latency) @@ -273,14 +273,14 @@ static ssize_t show_state_time(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) { - return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); + return sysfs_emit(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); } static ssize_t show_state_disable(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER); } @@ -310,7 +310,7 @@ static ssize_t show_state_default_status(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) { - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", state->flags & CPUIDLE_FLAG_OFF ? "disabled" : "enabled"); } @@ -358,7 +358,7 @@ static ssize_t show_state_s2idle_##_name(struct cpuidle_state *state, \ struct cpuidle_state_usage *state_usage, \ char *buf) \ { \ - return sprintf(buf, "%llu\n", state_usage->s2idle_##_name);\ + return sysfs_emit(buf, "%llu\n", state_usage->s2idle_##_name);\ } define_show_state_s2idle_ull_function(usage); @@ -550,7 +550,7 @@ static ssize_t show_driver_name(struct cpuidle_driver *drv, char *buf) ssize_t ret; spin_lock(&cpuidle_driver_lock); - ret = sprintf(buf, "%s\n", drv ? drv->name : "none"); + ret = sysfs_emit(buf, "%s\n", drv ? drv->name : "none"); spin_unlock(&cpuidle_driver_lock); return ret; diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 04b4c43b6bae77..c7a1060ba57a6b 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -840,6 +840,7 @@ config CRYPTO_DEV_CCREE If unsure say Y. 
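Looking back at the cpuidle sysfs hunks above: sysfs_emit() and sysfs_emit_at() differ from the sprintf() calls they replace in that they clamp output to PAGE_SIZE and sanity-check the sysfs buffer. A minimal illustrative show() handler using the same offset-tracking pattern; the attribute and name list are hypothetical, not from this series:

	/* Hypothetical sysfs show() callback: emit a list of names followed
	 * by a newline, never writing past the sysfs page.
	 */
	static ssize_t show_names(struct device *dev, struct device_attribute *attr,
				  char *buf)
	{
		static const char * const names[] = { "menu", "teo", "ladder" };
		ssize_t len = 0;
		int i;

		for (i = 0; i < ARRAY_SIZE(names); i++)
			len += sysfs_emit_at(buf, len, "%s ", names[i]);

		len += sysfs_emit_at(buf, len, "\n");
		return len;
	}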
source "drivers/crypto/hisilicon/Kconfig" +source "drivers/crypto/loongson/Kconfig" source "drivers/crypto/amlogic/Kconfig" diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile index 22eadcc8f4a25a..125b99b24af141 100644 --- a/drivers/crypto/Makefile +++ b/drivers/crypto/Makefile @@ -44,6 +44,7 @@ obj-y += inside-secure/ obj-$(CONFIG_CRYPTO_DEV_ARTPEC6) += axis/ obj-y += xilinx/ obj-y += hisilicon/ +obj-y += loongson/ obj-$(CONFIG_CRYPTO_DEV_AMLOGIC_GXL) += amlogic/ obj-y += intel/ obj-y += starfive/ diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae3b..a9626b30044aed 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631eca..9e21da0e298ad7 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede7690..268c83f298cb0d 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e058ba02779296..65d6d0af140a13 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ 
b/drivers/crypto/ccp/sev-dev.c @@ -82,6 +82,21 @@ MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */ static bool psp_dead; static int psp_timeout; +enum snp_hv_fixed_pages_state { + ALLOCATED, + HV_FIXED, +}; + +struct snp_hv_fixed_pages_entry { + struct list_head list; + struct page *page; + unsigned int order; + bool free; + enum snp_hv_fixed_pages_state page_state; +}; + +static LIST_HEAD(snp_hv_fixed_pages); + /* Trusted Memory Region (TMR): * The TMR is a 1MB area that must be 1MB aligned. Use the page allocator * to allocate the memory, which will return aligned memory for the specified @@ -1073,6 +1088,165 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +/* Hypervisor Fixed pages API interface */ +static void snp_hv_fixed_pages_state_update(struct sev_device *sev, + enum snp_hv_fixed_pages_state page_state) +{ + struct snp_hv_fixed_pages_entry *entry; + + /* List is protected by sev_cmd_mutex */ + lockdep_assert_held(&sev_cmd_mutex); + + if (list_empty(&snp_hv_fixed_pages)) + return; + + list_for_each_entry(entry, &snp_hv_fixed_pages, list) + entry->page_state = page_state; +} + +/* + * Allocate HV_FIXED pages in 2MB aligned sizes to ensure the whole + * 2MB pages are marked as HV_FIXED. + */ +struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry; + struct sev_device *sev; + unsigned int order; + struct page *page; + + if (!psp_master || !psp_master->sev_data) + return NULL; + + sev = psp_master->sev_data; + + order = get_order(PMD_SIZE * num_2mb_pages); + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + /* + * This API uses SNP_INIT_EX to transition allocated pages to HV_Fixed + * page state, fail if SNP is already initialized. + */ + if (sev->snp_initialized) + return NULL; + + /* Re-use freed pages that match the request */ + list_for_each_entry(entry, &snp_hv_fixed_pages, list) { + /* Hypervisor fixed page allocator implements exact fit policy */ + if (entry->order == order && entry->free) { + entry->free = false; + memset(page_address(entry->page), 0, + (1 << entry->order) * PAGE_SIZE); + return entry->page; + } + } + + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!page) + return NULL; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + __free_pages(page, order); + return NULL; + } + + entry->page = page; + entry->order = order; + list_add_tail(&entry->list, &snp_hv_fixed_pages); + + return page; +} + +void snp_free_hv_fixed_pages(struct page *page) +{ + struct psp_device *psp_master = psp_get_master_device(); + struct snp_hv_fixed_pages_entry *entry, *nentry; + + if (!psp_master || !psp_master->sev_data) + return; + + /* + * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list + * also needs to be protected using the same mutex. + */ + guard(mutex)(&sev_cmd_mutex); + + list_for_each_entry_safe(entry, nentry, &snp_hv_fixed_pages, list) { + if (entry->page != page) + continue; + + /* + * HV_FIXED page state cannot be changed until reboot + * and they cannot be used by an SNP guest, so they cannot + * be returned back to the page allocator. + * Mark the pages as free internally to allow possible re-use. 
+		 */
+		if (entry->page_state == HV_FIXED) {
+			entry->free = true;
+		} else {
+			__free_pages(page, entry->order);
+			list_del(&entry->list);
+			kfree(entry);
+		}
+		return;
+	}
+}
+
+static void snp_add_hv_fixed_pages(struct sev_device *sev, struct sev_data_range_list *range_list)
+{
+	struct snp_hv_fixed_pages_entry *entry;
+	struct sev_data_range *range;
+	int num_elements;
+
+	lockdep_assert_held(&sev_cmd_mutex);
+
+	if (list_empty(&snp_hv_fixed_pages))
+		return;
+
+	num_elements = list_count_nodes(&snp_hv_fixed_pages) +
+		       range_list->num_elements;
+
+	/*
+	 * Ensure the list of HV_FIXED pages that will be passed to firmware
+	 * does not exceed the page-sized argument buffer.
+	 */
+	if (num_elements * sizeof(*range) + sizeof(*range_list) > PAGE_SIZE) {
+		dev_warn(sev->dev, "Additional HV_Fixed pages cannot be accommodated, omitting\n");
+		return;
+	}
+
+	range = &range_list->ranges[range_list->num_elements];
+	list_for_each_entry(entry, &snp_hv_fixed_pages, list) {
+		range->base = page_to_pfn(entry->page) << PAGE_SHIFT;
+		range->page_count = 1 << entry->order;
+		range++;
+	}
+	range_list->num_elements = num_elements;
+}
+
+static void snp_leak_hv_fixed_pages(void)
+{
+	struct snp_hv_fixed_pages_entry *entry;
+
+	/* List is protected by sev_cmd_mutex */
+	lockdep_assert_held(&sev_cmd_mutex);
+
+	if (list_empty(&snp_hv_fixed_pages))
+		return;
+
+	list_for_each_entry(entry, &snp_hv_fixed_pages, list)
+		if (entry->page_state == HV_FIXED)
+			__snp_leak_pages(page_to_pfn(entry->page),
+					 1 << entry->order, false);
+}
+
 static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg)
 {
 	struct sev_data_range_list *range_list = arg;
@@ -1163,6 +1337,12 @@ static int __sev_snp_init_locked(int *error)
 			return rc;
 	}
 
+	/*
+	 * Add HV_Fixed pages from other PSP sub-devices, such as SFS, to the
+	 * HV_Fixed page list.
+	 */
+	snp_add_hv_fixed_pages(sev, snp_range_list);
+
 	memset(&data, 0, sizeof(data));
 	data.init_rmp = 1;
 	data.list_paddr_en = 1;
@@ -1202,6 +1382,7 @@ static int __sev_snp_init_locked(int *error)
 		return rc;
 	}
 
+	snp_hv_fixed_pages_state_update(sev, HV_FIXED);
 	sev->snp_initialized = true;
 	dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
 
@@ -1784,6 +1965,7 @@ static int __sev_snp_shutdown_locked(int *error, bool panic)
 		return ret;
 	}
 
+	snp_leak_hv_fixed_pages();
 	sev->snp_initialized = false;
 	dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n");
 
@@ -2430,7 +2612,7 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic)
 {
 	int error;
 
-	__sev_platform_shutdown_locked(NULL);
+	__sev_platform_shutdown_locked(&error);
 
 	if (sev_es_tmr) {
 		/*
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index 3e4e5574e88a30..28021abc85ad26 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -65,4 +65,7 @@ void sev_dev_destroy(struct psp_device *psp);
 void sev_pci_init(void);
 void sev_pci_exit(void);
 
+struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages);
+void snp_free_hv_fixed_pages(struct page *page);
+
 #endif /* __SEV_DEV_H */
diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c
new file mode 100644
index 00000000000000..2f4beaafe7ec67
--- /dev/null
+++ b/drivers/crypto/ccp/sfs.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure Processor Seamless Firmware Servicing support.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. + */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. 
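+	 * For scale: package_size above is ALIGN(firmware->size + PAGE_SIZE,
+	 * 0x10000), so with 4K pages a 150000-byte payload yields
+	 * ALIGN(154096, 0x10000) = 196608 bytes (three 64KB units), well
+	 * under the 2MB SFS_MAX_PAYLOAD_SIZE cap checked earlier.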
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
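+	 *
+	 * Hypothetical userspace usage of the resulting node (sketch only;
+	 * the ioctl numbers and structs come from the SFS uapi header):
+	 *
+	 *	int fd = open("/dev/sfs", O_RDWR);
+	 *	struct sfs_user_get_fw_versions vers = {};
+	 *
+	 *	if (fd >= 0 && !ioctl(fd, SFSIOCFWVERS, &vers))
+	 *		printf("SFS status 0x%x\n", vers.sfs_status);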
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 00000000000000..97704c210efdd2 --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig index 4137a8bf131f0c..4835bdebdbb381 100644 --- a/drivers/crypto/hisilicon/Kconfig +++ b/drivers/crypto/hisilicon/Kconfig @@ -69,7 +69,6 @@ config CRYPTO_DEV_HISI_HPRE select CRYPTO_DEV_HISI_QM select CRYPTO_DH select CRYPTO_RSA - select CRYPTO_CURVE25519 select CRYPTO_ECDH help Support for HiSilicon HPRE(High Performance RSA Engine) diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c index 1550c3818383ab..21ccf879f70c55 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c +++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 HiSilicon Limited. */ #include -#include #include #include #include @@ -106,16 +105,6 @@ struct hpre_ecdh_ctx { dma_addr_t dma_g; }; -struct hpre_curve25519_ctx { - /* low address: p->a->k */ - unsigned char *p; - dma_addr_t dma_p; - - /* gx coordinate */ - unsigned char *g; - dma_addr_t dma_g; -}; - struct hpre_ctx { struct hisi_qp *qp; struct device *dev; @@ -129,7 +118,6 @@ struct hpre_ctx { struct hpre_rsa_ctx rsa; struct hpre_dh_ctx dh; struct hpre_ecdh_ctx ecdh; - struct hpre_curve25519_ctx curve25519; }; /* for ecc algorithms */ unsigned int curve_id; @@ -146,7 +134,6 @@ struct hpre_asym_request { struct akcipher_request *rsa; struct kpp_request *dh; struct kpp_request *ecdh; - struct kpp_request *curve25519; } areq; int err; int req_id; @@ -1214,8 +1201,7 @@ static void hpre_key_to_big_end(u8 *data, int len) } } -static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all, - bool is_ecdh) +static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all) { struct device *dev = ctx->dev; unsigned int sz = ctx->key_sz; @@ -1224,17 +1210,11 @@ static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all, if (is_clear_all) hisi_qm_stop_qp(ctx->qp); - if (is_ecdh && ctx->ecdh.p) { + if (ctx->ecdh.p) { /* ecdh: p->a->k->b */ memzero_explicit(ctx->ecdh.p + shift, sz); dma_free_coherent(dev, sz << 3, ctx->ecdh.p, ctx->ecdh.dma_p); ctx->ecdh.p = NULL; - } else if (!is_ecdh && ctx->curve25519.p) { - /* curve25519: p->a->k */ - memzero_explicit(ctx->curve25519.p + shift, sz); - dma_free_coherent(dev, sz << 2, ctx->curve25519.p, - ctx->curve25519.dma_p); - ctx->curve25519.p = NULL; } hpre_ctx_clear(ctx, is_clear_all); @@ -1432,7 +1412,7 @@ static int hpre_ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, return -EINVAL; } - hpre_ecc_clear_ctx(ctx, false, true); + hpre_ecc_clear_ctx(ctx, false); ret = hpre_ecdh_set_param(ctx, ¶ms); if (ret < 0) { @@ -1683,337 +1663,7 @@ static void hpre_ecdh_exit_tfm(struct crypto_kpp *tfm) { struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - hpre_ecc_clear_ctx(ctx, true, true); -} - -static void hpre_curve25519_fill_curve(struct hpre_ctx *ctx, 
const void *buf, - unsigned int len) -{ - u8 secret[CURVE25519_KEY_SIZE] = { 0 }; - unsigned int sz = ctx->key_sz; - const struct ecc_curve *curve; - unsigned int shift = sz << 1; - void *p; - - /* - * The key from 'buf' is in little-endian, we should preprocess it as - * the description in rfc7748: "k[0] &= 248, k[31] &= 127, k[31] |= 64", - * then convert it to big endian. Only in this way, the result can be - * the same as the software curve-25519 that exists in crypto. - */ - memcpy(secret, buf, len); - curve25519_clamp_secret(secret); - hpre_key_to_big_end(secret, CURVE25519_KEY_SIZE); - - p = ctx->curve25519.p + sz - len; - - curve = ecc_get_curve25519(); - - /* fill curve parameters */ - fill_curve_param(p, curve->p, len, curve->g.ndigits); - fill_curve_param(p + sz, curve->a, len, curve->g.ndigits); - memcpy(p + shift, secret, len); - fill_curve_param(p + shift + sz, curve->g.x, len, curve->g.ndigits); - memzero_explicit(secret, CURVE25519_KEY_SIZE); -} - -static int hpre_curve25519_set_param(struct hpre_ctx *ctx, const void *buf, - unsigned int len) -{ - struct device *dev = ctx->dev; - unsigned int sz = ctx->key_sz; - unsigned int shift = sz << 1; - - /* p->a->k->gx */ - if (!ctx->curve25519.p) { - ctx->curve25519.p = dma_alloc_coherent(dev, sz << 2, - &ctx->curve25519.dma_p, - GFP_KERNEL); - if (!ctx->curve25519.p) - return -ENOMEM; - } - - ctx->curve25519.g = ctx->curve25519.p + shift + sz; - ctx->curve25519.dma_g = ctx->curve25519.dma_p + shift + sz; - - hpre_curve25519_fill_curve(ctx, buf, len); - - return 0; -} - -static int hpre_curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - struct device *dev = ctx->dev; - int ret = -EINVAL; - - if (len != CURVE25519_KEY_SIZE || - !crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) { - dev_err(dev, "key is null or key len is not 32bytes!\n"); - return ret; - } - - /* Free old secret if any */ - hpre_ecc_clear_ctx(ctx, false, false); - - ctx->key_sz = CURVE25519_KEY_SIZE; - ret = hpre_curve25519_set_param(ctx, buf, CURVE25519_KEY_SIZE); - if (ret) { - dev_err(dev, "failed to set curve25519 param, ret = %d!\n", ret); - hpre_ecc_clear_ctx(ctx, false, false); - return ret; - } - - return 0; -} - -static void hpre_curve25519_hw_data_clr_all(struct hpre_ctx *ctx, - struct hpre_asym_request *req, - struct scatterlist *dst, - struct scatterlist *src) -{ - struct device *dev = ctx->dev; - struct hpre_sqe *sqe = &req->req; - dma_addr_t dma; - - dma = le64_to_cpu(sqe->in); - if (unlikely(dma_mapping_error(dev, dma))) - return; - - if (src && req->src) - dma_free_coherent(dev, ctx->key_sz, req->src, dma); - - dma = le64_to_cpu(sqe->out); - if (unlikely(dma_mapping_error(dev, dma))) - return; - - if (req->dst) - dma_free_coherent(dev, ctx->key_sz, req->dst, dma); - if (dst) - dma_unmap_single(dev, dma, ctx->key_sz, DMA_FROM_DEVICE); -} - -static void hpre_curve25519_cb(struct hpre_ctx *ctx, void *resp) -{ - struct hpre_dfx *dfx = ctx->hpre->debug.dfx; - struct hpre_asym_request *req = NULL; - struct kpp_request *areq; - u64 overtime_thrhld; - int ret; - - ret = hpre_alg_res_post_hf(ctx, resp, (void **)&req); - areq = req->areq.curve25519; - areq->dst_len = ctx->key_sz; - - overtime_thrhld = atomic64_read(&dfx[HPRE_OVERTIME_THRHLD].value); - if (overtime_thrhld && hpre_is_bd_timeout(req, overtime_thrhld)) - atomic64_inc(&dfx[HPRE_OVER_THRHLD_CNT].value); - - /* Do unmap before data processing */ - hpre_curve25519_hw_data_clr_all(ctx, req, areq->dst, 
areq->src); - - hpre_key_to_big_end(sg_virt(areq->dst), CURVE25519_KEY_SIZE); - - kpp_request_complete(areq, ret); - - atomic64_inc(&dfx[HPRE_RECV_CNT].value); -} - -static int hpre_curve25519_msg_request_set(struct hpre_ctx *ctx, - struct kpp_request *req) -{ - struct hpre_asym_request *h_req; - struct hpre_sqe *msg; - int req_id; - void *tmp; - - if (unlikely(req->dst_len < ctx->key_sz)) { - req->dst_len = ctx->key_sz; - return -EINVAL; - } - - tmp = kpp_request_ctx(req); - h_req = PTR_ALIGN(tmp, hpre_align_sz()); - h_req->cb = hpre_curve25519_cb; - h_req->areq.curve25519 = req; - msg = &h_req->req; - memset(msg, 0, sizeof(*msg)); - msg->in = cpu_to_le64(DMA_MAPPING_ERROR); - msg->out = cpu_to_le64(DMA_MAPPING_ERROR); - msg->key = cpu_to_le64(ctx->curve25519.dma_p); - - msg->dw0 |= cpu_to_le32(0x1U << HPRE_SQE_DONE_SHIFT); - msg->task_len1 = (ctx->key_sz >> HPRE_BITS_2_BYTES_SHIFT) - 1; - h_req->ctx = ctx; - - req_id = hpre_add_req_to_ctx(h_req); - if (req_id < 0) - return -EBUSY; - - msg->tag = cpu_to_le16((u16)req_id); - return 0; -} - -static void hpre_curve25519_src_modulo_p(u8 *ptr) -{ - int i; - - for (i = 0; i < CURVE25519_KEY_SIZE - 1; i++) - ptr[i] = 0; - - /* The modulus is ptr's last byte minus '0xed'(last byte of p) */ - ptr[i] -= 0xed; -} - -static int hpre_curve25519_src_init(struct hpre_asym_request *hpre_req, - struct scatterlist *data, unsigned int len) -{ - struct hpre_sqe *msg = &hpre_req->req; - struct hpre_ctx *ctx = hpre_req->ctx; - struct device *dev = ctx->dev; - u8 p[CURVE25519_KEY_SIZE] = { 0 }; - const struct ecc_curve *curve; - dma_addr_t dma = 0; - u8 *ptr; - - if (len != CURVE25519_KEY_SIZE) { - dev_err(dev, "sourc_data len is not 32bytes, len = %u!\n", len); - return -EINVAL; - } - - ptr = dma_alloc_coherent(dev, ctx->key_sz, &dma, GFP_KERNEL); - if (unlikely(!ptr)) - return -ENOMEM; - - scatterwalk_map_and_copy(ptr, data, 0, len, 0); - - if (!crypto_memneq(ptr, curve25519_null_point, CURVE25519_KEY_SIZE)) { - dev_err(dev, "gx is null!\n"); - goto err; - } - - /* - * Src_data(gx) is in little-endian order, MSB in the final byte should - * be masked as described in RFC7748, then transform it to big-endian - * form, then hisi_hpre can use the data. - */ - ptr[31] &= 0x7f; - hpre_key_to_big_end(ptr, CURVE25519_KEY_SIZE); - - curve = ecc_get_curve25519(); - - fill_curve_param(p, curve->p, CURVE25519_KEY_SIZE, curve->g.ndigits); - - /* - * When src_data equals (2^255 - 19) ~ (2^255 - 1), it is out of p, - * we get its modulus to p, and then use it. 
- */ - if (memcmp(ptr, p, ctx->key_sz) == 0) { - dev_err(dev, "gx is p!\n"); - goto err; - } else if (memcmp(ptr, p, ctx->key_sz) > 0) { - hpre_curve25519_src_modulo_p(ptr); - } - - hpre_req->src = ptr; - msg->in = cpu_to_le64(dma); - return 0; - -err: - dma_free_coherent(dev, ctx->key_sz, ptr, dma); - return -EINVAL; -} - -static int hpre_curve25519_dst_init(struct hpre_asym_request *hpre_req, - struct scatterlist *data, unsigned int len) -{ - struct hpre_sqe *msg = &hpre_req->req; - struct hpre_ctx *ctx = hpre_req->ctx; - struct device *dev = ctx->dev; - dma_addr_t dma; - - if (!data || !sg_is_last(data) || len != ctx->key_sz) { - dev_err(dev, "data or data length is illegal!\n"); - return -EINVAL; - } - - hpre_req->dst = NULL; - dma = dma_map_single(dev, sg_virt(data), len, DMA_FROM_DEVICE); - if (unlikely(dma_mapping_error(dev, dma))) { - dev_err(dev, "dma map data err!\n"); - return -ENOMEM; - } - - msg->out = cpu_to_le64(dma); - return 0; -} - -static int hpre_curve25519_compute_value(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - struct device *dev = ctx->dev; - void *tmp = kpp_request_ctx(req); - struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz()); - struct hpre_sqe *msg = &hpre_req->req; - int ret; - - ret = hpre_curve25519_msg_request_set(ctx, req); - if (unlikely(ret)) { - dev_err(dev, "failed to set curve25519 request, ret = %d!\n", ret); - return ret; - } - - if (req->src) { - ret = hpre_curve25519_src_init(hpre_req, req->src, req->src_len); - if (unlikely(ret)) { - dev_err(dev, "failed to init src data, ret = %d!\n", - ret); - goto clear_all; - } - } else { - msg->in = cpu_to_le64(ctx->curve25519.dma_g); - } - - ret = hpre_curve25519_dst_init(hpre_req, req->dst, req->dst_len); - if (unlikely(ret)) { - dev_err(dev, "failed to init dst data, ret = %d!\n", ret); - goto clear_all; - } - - msg->dw0 = cpu_to_le32(le32_to_cpu(msg->dw0) | HPRE_ALG_CURVE25519_MUL); - ret = hpre_send(ctx, msg); - if (likely(!ret)) - return -EINPROGRESS; - -clear_all: - hpre_rm_req_from_ctx(hpre_req); - hpre_curve25519_hw_data_clr_all(ctx, hpre_req, req->dst, req->src); - return ret; -} - -static unsigned int hpre_curve25519_max_size(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - return ctx->key_sz; -} - -static int hpre_curve25519_init_tfm(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd()); - - return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE); -} - -static void hpre_curve25519_exit_tfm(struct crypto_kpp *tfm) -{ - struct hpre_ctx *ctx = kpp_tfm_ctx(tfm); - - hpre_ecc_clear_ctx(ctx, true, false); + hpre_ecc_clear_ctx(ctx, true); } static struct akcipher_alg rsa = { @@ -2095,22 +1745,6 @@ static struct kpp_alg ecdh_curves[] = { } }; -static struct kpp_alg curve25519_alg = { - .set_secret = hpre_curve25519_set_secret, - .generate_public_key = hpre_curve25519_compute_value, - .compute_shared_secret = hpre_curve25519_compute_value, - .max_size = hpre_curve25519_max_size, - .init = hpre_curve25519_init_tfm, - .exit = hpre_curve25519_exit_tfm, - .base = { - .cra_ctxsize = sizeof(struct hpre_ctx), - .cra_priority = HPRE_CRYPTO_ALG_PRI, - .cra_name = "curve25519", - .cra_driver_name = "hpre-curve25519", - .cra_module = THIS_MODULE, - }, -}; - static int hpre_register_rsa(struct hisi_qm *qm) { int ret; @@ -2192,28 +1826,6 @@ static void hpre_unregister_ecdh(struct hisi_qm *qm) 
crypto_unregister_kpp(&ecdh_curves[i]); } -static int hpre_register_x25519(struct hisi_qm *qm) -{ - int ret; - - if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP)) - return 0; - - ret = crypto_register_kpp(&curve25519_alg); - if (ret) - dev_err(&qm->pdev->dev, "failed to register x25519 (%d)!\n", ret); - - return ret; -} - -static void hpre_unregister_x25519(struct hisi_qm *qm) -{ - if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP)) - return; - - crypto_unregister_kpp(&curve25519_alg); -} - int hpre_algs_register(struct hisi_qm *qm) { int ret = 0; @@ -2236,17 +1848,11 @@ int hpre_algs_register(struct hisi_qm *qm) if (ret) goto unreg_dh; - ret = hpre_register_x25519(qm); - if (ret) - goto unreg_ecdh; - hpre_available_devs++; mutex_unlock(&hpre_algs_lock); return ret; -unreg_ecdh: - hpre_unregister_ecdh(qm); unreg_dh: hpre_unregister_dh(qm); unreg_rsa: @@ -2262,7 +1868,6 @@ void hpre_algs_unregister(struct hisi_qm *qm) if (--hpre_available_devs) goto unlock; - hpre_unregister_x25519(qm); hpre_unregister_ecdh(qm); hpre_unregister_dh(qm); hpre_unregister_rsa(qm); diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index 76b7ecb5624b16..f22c12e36b56cc 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -700,7 +700,7 @@ static int img_hash_cra_init(struct crypto_tfm *tfm, const char *alg_name) static int img_hash_cra_md5_init(struct crypto_tfm *tfm) { - return img_hash_cra_init(tfm, "md5-generic"); + return img_hash_cra_init(tfm, "md5-lib"); } static int img_hash_cra_sha1_init(struct crypto_tfm *tfm) diff --git a/drivers/crypto/loongson/Kconfig b/drivers/crypto/loongson/Kconfig new file mode 100644 index 00000000000000..15475da8fc11df --- /dev/null +++ b/drivers/crypto/loongson/Kconfig @@ -0,0 +1,5 @@ +config CRYPTO_DEV_LOONGSON_RNG + tristate "Support for Loongson RNG Driver" + depends on MFD_LOONGSON_SE + help + Support for Loongson RNG Driver. diff --git a/drivers/crypto/loongson/Makefile b/drivers/crypto/loongson/Makefile new file mode 100644 index 00000000000000..1ce5ec32b5534c --- /dev/null +++ b/drivers/crypto/loongson/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_CRYPTO_DEV_LOONGSON_RNG) += loongson-rng.o diff --git a/drivers/crypto/loongson/loongson-rng.c b/drivers/crypto/loongson/loongson-rng.c new file mode 100644 index 00000000000000..3a4940260f9e59 --- /dev/null +++ b/drivers/crypto/loongson/loongson-rng.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 HiSilicon Limited. */ +/* Copyright (c) 2025 Loongson Technology Corporation Limited. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SE_SEED_SIZE 32 + +struct loongson_rng_list { + struct mutex lock; + struct list_head list; + int registered; +}; + +struct loongson_rng { + u32 used; + struct loongson_se_engine *engine; + struct list_head list; + struct mutex lock; +}; + +struct loongson_rng_ctx { + struct loongson_rng *rng; +}; + +struct loongson_rng_cmd { + u32 cmd_id; + union { + u32 len; + u32 ret; + } u; + u32 seed_off; + u32 out_off; + u32 pad[4]; +}; + +static struct loongson_rng_list rng_devices = { + .lock = __MUTEX_INITIALIZER(rng_devices.lock), + .list = LIST_HEAD_INIT(rng_devices.list), +}; + +static int loongson_rng_generate(struct crypto_rng *tfm, const u8 *src, + unsigned int slen, u8 *dstn, unsigned int dlen) +{ + struct loongson_rng_ctx *ctx = crypto_rng_ctx(tfm); + struct loongson_rng *rng = ctx->rng; + struct loongson_rng_cmd *cmd = rng->engine->command; + int err, len; + + mutex_lock(&rng->lock); + cmd->seed_off = 0; + do { + len = min(dlen, rng->engine->buffer_size); + cmd = rng->engine->command; + cmd->u.len = len; + err = loongson_se_send_engine_cmd(rng->engine); + if (err) + break; + + cmd = rng->engine->command_ret; + if (cmd->u.ret) { + err = -EIO; + break; + } + + memcpy(dstn, rng->engine->data_buffer, len); + dlen -= len; + dstn += len; + } while (dlen > 0); + mutex_unlock(&rng->lock); + + return err; +} + +static int loongson_rng_init(struct crypto_tfm *tfm) +{ + struct loongson_rng_ctx *ctx = crypto_tfm_ctx(tfm); + struct loongson_rng *rng; + u32 min_used = U32_MAX; + + mutex_lock(&rng_devices.lock); + list_for_each_entry(rng, &rng_devices.list, list) { + if (rng->used < min_used) { + ctx->rng = rng; + min_used = rng->used; + } + } + ctx->rng->used++; + mutex_unlock(&rng_devices.lock); + + return 0; +} + +static void loongson_rng_exit(struct crypto_tfm *tfm) +{ + struct loongson_rng_ctx *ctx = crypto_tfm_ctx(tfm); + + mutex_lock(&rng_devices.lock); + ctx->rng->used--; + mutex_unlock(&rng_devices.lock); +} + +static int loongson_rng_seed(struct crypto_rng *tfm, const u8 *seed, + unsigned int slen) +{ + struct loongson_rng_ctx *ctx = crypto_rng_ctx(tfm); + struct loongson_rng *rng = ctx->rng; + struct loongson_rng_cmd *cmd; + int err; + + if (slen < SE_SEED_SIZE) + return -EINVAL; + + slen = min(slen, rng->engine->buffer_size); + + mutex_lock(&rng->lock); + cmd = rng->engine->command; + cmd->u.len = slen; + cmd->seed_off = rng->engine->buffer_off; + memcpy(rng->engine->data_buffer, seed, slen); + err = loongson_se_send_engine_cmd(rng->engine); + if (err) + goto out; + + cmd = rng->engine->command_ret; + if (cmd->u.ret) + err = -EIO; +out: + mutex_unlock(&rng->lock); + + return err; +} + +static struct rng_alg loongson_rng_alg = { + .generate = loongson_rng_generate, + .seed = loongson_rng_seed, + .seedsize = SE_SEED_SIZE, + .base = { + .cra_name = "stdrng", + .cra_driver_name = "loongson_stdrng", + .cra_priority = 300, + .cra_ctxsize = sizeof(struct loongson_rng_ctx), + .cra_module = THIS_MODULE, + .cra_init = loongson_rng_init, + .cra_exit = loongson_rng_exit, + }, +}; + +static int loongson_rng_probe(struct platform_device *pdev) +{ + struct loongson_rng_cmd *cmd; + struct loongson_rng *rng; + int ret = 0; + + rng = devm_kzalloc(&pdev->dev, sizeof(*rng), GFP_KERNEL); + if (!rng) + return -ENOMEM; + + rng->engine = loongson_se_init_engine(pdev->dev.parent, SE_ENGINE_RNG); + if (!rng->engine) + return -ENODEV; + cmd = rng->engine->command; + 
cmd->cmd_id = SE_CMD_RNG; + cmd->out_off = rng->engine->buffer_off; + mutex_init(&rng->lock); + + mutex_lock(&rng_devices.lock); + + if (!rng_devices.registered) { + ret = crypto_register_rng(&loongson_rng_alg); + if (ret) { + dev_err(&pdev->dev, "failed to register crypto(%d)\n", ret); + goto out; + } + rng_devices.registered = 1; + } + + list_add_tail(&rng->list, &rng_devices.list); +out: + mutex_unlock(&rng_devices.lock); + + return ret; +} + +static struct platform_driver loongson_rng_driver = { + .probe = loongson_rng_probe, + .driver = { + .name = "loongson-rng", + }, +}; +module_platform_driver(loongson_rng_driver); + +MODULE_ALIAS("platform:loongson-rng"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Random Number Generator driver"); diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 54c480e874cb30..d7714d8afb0fa0 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -388,7 +388,7 @@ static const struct super_operations dax_sops = { .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) diff --git a/drivers/dma/dw/rzn1-dmamux.c b/drivers/dma/dw/rzn1-dmamux.c index 4fb8508419dbd8..deadf135681b67 100644 --- a/drivers/dma/dw/rzn1-dmamux.c +++ b/drivers/dma/dw/rzn1-dmamux.c @@ -48,12 +48,16 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, u32 mask; int ret; - if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) - return ERR_PTR(-EINVAL); + if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) { + ret = -EINVAL; + goto put_device; + } map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) - return ERR_PTR(-ENOMEM); + if (!map) { + ret = -ENOMEM; + goto put_device; + } chan = dma_spec->args[0]; map->req_idx = dma_spec->args[4]; @@ -94,12 +98,15 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, if (ret) goto clear_bitmap; + put_device(&pdev->dev); return map; clear_bitmap: clear_bit(map->req_idx, dmamux->used_chans); free_map: kfree(map); +put_device: + put_device(&pdev->dev); return ERR_PTR(ret); } diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 35bdefd3728bb8..8c4725ad1f648d 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -189,27 +189,30 @@ static int idxd_setup_wqs(struct idxd_device *idxd) idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); if (!idxd->wq_enable_map) { rc = -ENOMEM; - goto err_bitmap; + goto err_free_wqs; } for (i = 0; i < idxd->max_wqs; i++) { wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); if (!wq) { rc = -ENOMEM; - goto err; + goto err_unwind; } idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); conf_dev = wq_confdev(wq); wq->id = i; wq->idxd = idxd; - device_initialize(wq_confdev(wq)); + device_initialize(conf_dev); conf_dev->parent = idxd_confdev(idxd); conf_dev->bus = &dsa_bus_type; conf_dev->type = &idxd_wq_device_type; rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); - if (rc < 0) - goto err; + if (rc < 0) { + put_device(conf_dev); + kfree(wq); + goto err_unwind; + } mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); @@ -220,15 +223,20 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { + put_device(conf_dev); + 
kfree(wq); rc = -ENOMEM; - goto err; + goto err_unwind; } if (idxd->hw.wq_cap.op_config) { wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); if (!wq->opcap_bmap) { + kfree(wq->wqcfg); + put_device(conf_dev); + kfree(wq); rc = -ENOMEM; - goto err_opcap_bmap; + goto err_unwind; } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } @@ -239,13 +247,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; -err_opcap_bmap: - kfree(wq->wqcfg); - -err: - put_device(conf_dev); - kfree(wq); - +err_unwind: while (--i >= 0) { wq = idxd->wqs[i]; if (idxd->hw.wq_cap.op_config) @@ -254,11 +256,10 @@ static int idxd_setup_wqs(struct idxd_device *idxd) conf_dev = wq_confdev(wq); put_device(conf_dev); kfree(wq); - } bitmap_free(idxd->wq_enable_map); -err_bitmap: +err_free_wqs: kfree(idxd->wqs); return rc; @@ -1291,10 +1292,12 @@ static void idxd_remove(struct pci_dev *pdev) device_unregister(idxd_confdev(idxd)); idxd_shutdown(pdev); idxd_device_remove_debugfs(idxd); - idxd_cleanup(idxd); + perfmon_pmu_remove(idxd); + idxd_cleanup_interrupts(idxd); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); pci_iounmap(pdev, idxd->reg_base); put_device(idxd_confdev(idxd)); - idxd_free(idxd); pci_disable_device(pdev); } diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index bbc3276992bb01..2cf060174795fe 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -1283,13 +1283,17 @@ static int bam_dma_probe(struct platform_device *pdev) if (!bdev->bamclk) { ret = of_property_read_u32(pdev->dev.of_node, "num-channels", &bdev->num_channels); - if (ret) + if (ret) { dev_err(bdev->dev, "num-channels unspecified in dt\n"); + return ret; + } ret = of_property_read_u32(pdev->dev.of_node, "qcom,num-ees", &bdev->num_ees); - if (ret) + if (ret) { dev_err(bdev->dev, "num-ees unspecified in dt\n"); + return ret; + } } ret = clk_prepare_enable(bdev->bamclk); diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 3ed406f08c442e..552be71db6c47b 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -2064,8 +2064,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata, * priority. So Q0 is the highest priority queue and the last queue has * the lowest priority. 
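+	 * Each element of the map is an s8[2] {queue, priority} pair, with
+	 * the extra num_tc + 1 slot holding the {-1, -1} terminator; e.g.
+	 * three TCs yield {0, 0}, {1, 1}, {2, 2}, {-1, -1}. That pair
+	 * layout is why the allocation below sizes elements as
+	 * sizeof(*queue_priority_map) rather than sizeof(s8).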
 	 */
-	queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, sizeof(s8),
-					  GFP_KERNEL);
+	queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1,
+					  sizeof(*queue_priority_map), GFP_KERNEL);
 	if (!queue_priority_map)
 		return -ENOMEM;
 
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 036f21cac0a918..0a852011653c4c 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -211,8 +211,8 @@ static int
 dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
 				 struct netlink_ext_ack *extack)
 {
+	DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) = { 0 };
 	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
-	DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 };
 	enum dpll_clock_quality_level ql;
 	int ret;
 
@@ -221,7 +221,7 @@ dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
 	ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack);
 	if (ret)
 		return ret;
-	for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX)
+	for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1)
 		if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql))
 			return -EMSGSIZE;
 
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 19ad3c3b675ddb..39352b9b7a7e45 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -576,4 +576,20 @@ config EDAC_LOONGSON
 	  errors (CE) only. Loongson-3A5000/3C5000/3D5000/3A6000/3C6000
 	  are compatible.
 
+config EDAC_CORTEX_A72
+	tristate "ARM Cortex A72"
+	depends on ARM64
+	help
+	  Support for L1/L2 cache error detection on the ARM Cortex-A72
+	  processor. Errors are detected and reported by reading the CPU/L2
+	  memory error syndrome registers.
+
+config EDAC_VERSALNET
+	tristate "AMD VersalNET DDR Controller"
+	depends on CDX_CONTROLLER && ARCH_ZYNQMP
+	help
+	  Support for single-bit error correction, double-bit error detection,
+	  and reporting of other system errors from various IP subsystems
+	  (RPU, NOCs, HNICX, PL) on the AMD Versal NET DDR memory controller.
+ endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a8f2d8f6c894a9..1c14796410a3e2 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,3 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o +obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/a72_edac.c b/drivers/edac/a72_edac.c new file mode 100644 index 00000000000000..9262d75c385599 --- /dev/null +++ b/drivers/edac/a72_edac.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cortex A72 EDAC L1 and L2 cache error detection + * + * Copyright (c) 2020 Pengutronix, Sascha Hauer + * Copyright (c) 2025 Microsoft Corporation, + * + * Based on Code from: + * Copyright (c) 2018, NXP Semiconductor + * Author: York Sun + */ + +#include +#include +#include +#include + +#include "edac_module.h" + +#define DRVNAME "a72-edac" + +#define SYS_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2) +#define SYS_L2MERRSR_EL1 sys_reg(3, 1, 15, 2, 3) + +#define CPUMERRSR_EL1_RAMID GENMASK(30, 24) +#define L2MERRSR_EL1_CPUID_WAY GENMASK(21, 18) + +#define CPUMERRSR_EL1_VALID BIT(31) +#define CPUMERRSR_EL1_FATAL BIT(63) +#define L2MERRSR_EL1_VALID BIT(31) +#define L2MERRSR_EL1_FATAL BIT(63) + +#define L1_I_TAG_RAM 0x00 +#define L1_I_DATA_RAM 0x01 +#define L1_D_TAG_RAM 0x08 +#define L1_D_DATA_RAM 0x09 +#define TLB_RAM 0x18 + +#define MESSAGE_SIZE 64 + +struct mem_err_synd_reg { + u64 cpu_mesr; + u64 l2_mesr; +}; + +static struct cpumask compat_mask; + +static void report_errors(struct edac_device_ctl_info *edac_ctl, int cpu, + struct mem_err_synd_reg *mesr) +{ + u64 cpu_mesr = mesr->cpu_mesr; + u64 l2_mesr = mesr->l2_mesr; + char msg[MESSAGE_SIZE]; + + if (cpu_mesr & CPUMERRSR_EL1_VALID) { + const char *str; + bool fatal = cpu_mesr & CPUMERRSR_EL1_FATAL; + + switch (FIELD_GET(CPUMERRSR_EL1_RAMID, cpu_mesr)) { + case L1_I_TAG_RAM: + str = "L1-I Tag RAM"; + break; + case L1_I_DATA_RAM: + str = "L1-I Data RAM"; + break; + case L1_D_TAG_RAM: + str = "L1-D Tag RAM"; + break; + case L1_D_DATA_RAM: + str = "L1-D Data RAM"; + break; + case TLB_RAM: + str = "TLB RAM"; + break; + default: + str = "Unspecified"; + break; + } + + snprintf(msg, MESSAGE_SIZE, "%s %s error(s) on CPU %d", + str, fatal ? "fatal" : "correctable", cpu); + + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 0, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 0, msg); + } + + if (l2_mesr & L2MERRSR_EL1_VALID) { + bool fatal = l2_mesr & L2MERRSR_EL1_FATAL; + + snprintf(msg, MESSAGE_SIZE, "L2 %s error(s) on CPU %d CPUID/WAY 0x%lx", + fatal ? 
"fatal" : "correctable", cpu, + FIELD_GET(L2MERRSR_EL1_CPUID_WAY, l2_mesr)); + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 1, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 1, msg); + } +} + +static void read_errors(void *data) +{ + struct mem_err_synd_reg *mesr = data; + + mesr->cpu_mesr = read_sysreg_s(SYS_CPUMERRSR_EL1); + if (mesr->cpu_mesr & CPUMERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_CPUMERRSR_EL1); + isb(); + } + mesr->l2_mesr = read_sysreg_s(SYS_L2MERRSR_EL1); + if (mesr->l2_mesr & L2MERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_L2MERRSR_EL1); + isb(); + } +} + +static void a72_edac_check(struct edac_device_ctl_info *edac_ctl) +{ + struct mem_err_synd_reg mesr; + int cpu; + + cpus_read_lock(); + for_each_cpu_and(cpu, cpu_online_mask, &compat_mask) { + smp_call_function_single(cpu, read_errors, &mesr, true); + report_errors(edac_ctl, cpu, &mesr); + } + cpus_read_unlock(); +} + +static int a72_edac_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl; + struct device *dev = &pdev->dev; + int rc; + + edac_ctl = edac_device_alloc_ctl_info(0, "cpu", + num_possible_cpus(), "L", 2, 1, + edac_device_alloc_index()); + if (!edac_ctl) + return -ENOMEM; + + edac_ctl->edac_check = a72_edac_check; + edac_ctl->dev = dev; + edac_ctl->mod_name = dev_name(dev); + edac_ctl->dev_name = dev_name(dev); + edac_ctl->ctl_name = DRVNAME; + dev_set_drvdata(dev, edac_ctl); + + rc = edac_device_add_device(edac_ctl); + if (rc) + goto out_dev; + + return 0; + +out_dev: + edac_device_free_ctl_info(edac_ctl); + + return rc; +} + +static void a72_edac_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl = dev_get_drvdata(&pdev->dev); + + edac_device_del_device(edac_ctl->dev); + edac_device_free_ctl_info(edac_ctl); +} + +static const struct of_device_id cortex_arm64_edac_of_match[] = { + { .compatible = "arm,cortex-a72" }, + {} +}; +MODULE_DEVICE_TABLE(of, cortex_arm64_edac_of_match); + +static struct platform_driver a72_edac_driver = { + .probe = a72_edac_probe, + .remove = a72_edac_remove, + .driver = { + .name = DRVNAME, + }, +}; + +static struct platform_device *a72_pdev; + +static int __init a72_edac_driver_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); + if (np) { + if (of_match_node(cortex_arm64_edac_of_match, np) && + of_property_read_bool(np, "edac-enabled")) { + cpumask_set_cpu(cpu, &compat_mask); + } + } else { + pr_warn("failed to find device node for CPU %d\n", cpu); + } + } + + if (cpumask_empty(&compat_mask)) + return 0; + + a72_pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0); + if (IS_ERR(a72_pdev)) { + pr_err("failed to register A72 EDAC device\n"); + return PTR_ERR(a72_pdev); + } + + return platform_driver_register(&a72_edac_driver); +} + +static void __exit a72_edac_driver_exit(void) +{ + platform_device_unregister(a72_pdev); + platform_driver_unregister(&a72_edac_driver); +} + +module_init(a72_edac_driver_init); +module_exit(a72_edac_driver_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sascha Hauer "); +MODULE_DESCRIPTION("Cortex A72 L1 and L2 cache EDAC driver"); diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 7685a8550d4b1f..103b2c2eba2aba 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -2130,8 +2130,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev) edac->irq_chip.name = pdev->dev.of_node->name; edac->irq_chip.irq_mask = a10_eccmgr_irq_mask; 
edac->irq_chip.irq_unmask = a10_eccmgr_irq_unmask; - edac->domain = irq_domain_create_linear(of_fwnode_handle(pdev->dev.of_node), - 64, &a10_eccmgr_ic_ops, edac); + edac->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 64, &a10_eccmgr_ic_ops, + edac); if (!edac->domain) { dev_err(&pdev->dev, "Error adding IRQ domain\n"); return -ENOMEM; diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 07f1e9dc1ca71b..2f6ab783bf2097 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3923,6 +3923,26 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F1Ah_M40h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x50 ... 0x57: + pvt->ctl_name = "F1Ah_M50h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F1Ah_M90h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xa0 ... 0xaf: + pvt->ctl_name = "F1Ah_MA0h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xc0 ... 0xc7: + pvt->ctl_name = "F1Ah_MC0h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; } break; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 17228d07de4c8c..d70b8a8d0b092a 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -96,7 +96,7 @@ /* Hardware limit on ChipSelect rows per MC and processors per system */ #define NUM_CHIPSELECTS 8 #define DRAM_RANGES 8 -#define NUM_CONTROLLERS 12 +#define NUM_CONTROLLERS 16 #define ON true #define OFF false diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c old mode 100755 new mode 100644 diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0f338adf7d9376..8689631f190536 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 10); DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 11); +DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 12); +DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 13); +DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 14); +DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 15); /* Total possible dynamic DIMM Label attribute file table */ static struct attribute *dynamic_csrow_dimm_attr[] = { @@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { &dev_attr_legacy_ch9_dimm_label.attr.attr, &dev_attr_legacy_ch10_dimm_label.attr.attr, &dev_attr_legacy_ch11_dimm_label.attr.attr, + &dev_attr_legacy_ch12_dimm_label.attr.attr, + &dev_attr_legacy_ch13_dimm_label.attr.attr, + &dev_attr_legacy_ch14_dimm_label.attr.attr, + &dev_attr_legacy_ch15_dimm_label.attr.attr, NULL }; @@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, channel_ce_count_show, NULL, 10); DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, channel_ce_count_show, NULL, 11); +DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 12); +DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 13); +DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 14); +DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 15); /* Total possible dynamic ce_count attribute file table */ static struct attribute 
*dynamic_csrow_ce_count_attr[] = { @@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { &dev_attr_legacy_ch9_ce_count.attr.attr, &dev_attr_legacy_ch10_ce_count.attr.attr, &dev_attr_legacy_ch11_ce_count.attr.attr, + &dev_attr_legacy_ch12_ce_count.attr.attr, + &dev_attr_legacy_ch13_ce_count.attr.attr, + &dev_attr_legacy_ch14_ce_count.attr.attr, + &dev_attr_legacy_ch15_ce_count.attr.attr, NULL }; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index bf4171ac191d3f..2010a47149f448 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -468,17 +468,18 @@ static int i10nm_get_imc_num(struct res_config *cfg) return -ENODEV; } - if (imc_num > I10NM_NUM_DDR_IMC) { - i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); - return -EINVAL; - } - if (cfg->ddr_imc_num != imc_num) { /* - * Store the number of present DDR memory controllers. + * Update the configuration data to reflect the number of + * present DDR memory controllers. */ cfg->ddr_imc_num = imc_num; edac_dbg(2, "Set DDR MC number: %d", imc_num); + + /* Release and reallocate skx_dev list with the updated number. */ + skx_remove(); + if (skx_get_all_bus_mappings(cfg, &i10nm_edac_list) <= 0) + return -ENODEV; } return 0; @@ -1057,6 +1058,15 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } +static bool i10nm_channel_disabled(struct skx_imc *imc, int chan) +{ + u32 mcmtr = I10NM_GET_MCMTR(imc, chan); + + edac_dbg(1, "mc%d ch%d mcmtr reg %x\n", imc->mc, chan, mcmtr); + + return (mcmtr == ~0 || GET_BITFIELD(mcmtr, 18, 18)); +} + static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { @@ -1070,6 +1080,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, if (!imc->mbase) continue; + if (i10nm_channel_disabled(imc, i)) { + edac_dbg(1, "mc%d ch%d is disabled.\n", imc->mc, i); + continue; + } + ndimms = 0; if (res_cfg->type != GNR) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 5c1fa1c0d12e3c..5a080ab65476da 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -99,6 +99,8 @@ /* Alder Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. 
i5-12600K */ /* Bartlett Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 @@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c old mode 100755 new mode 100644 diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c old mode 100755 new mode 100644 diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 29897b21fb8e35..078ddf95cc6e60 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -33,6 +33,15 @@ static unsigned int nvdimm_count; #define MASK26 0x3FFFFFF /* Mask for 2^26 */ #define MASK29 0x1FFFFFFF /* Mask for 2^29 */ +static struct res_config skx_cfg = { + .type = SKX, + .decs_did = 0x2016, + .busno_cfg_offset = 0xcc, + .ddr_imc_num = 2, + .ddr_chan_num = 3, + .ddr_dimm_num = 2, +}; + static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) { struct skx_dev *d; @@ -52,7 +61,7 @@ enum munittype { struct munit { u16 did; - u16 devfn[SKX_NUM_IMC]; + u16 devfn[2]; u8 busidx; u8 per_socket; enum munittype mtype; @@ -89,11 +98,11 @@ static int get_all_munits(const struct munit *m) if (!pdev) break; ndev++; - if (m->per_socket == SKX_NUM_IMC) { - for (i = 0; i < SKX_NUM_IMC; i++) + if (m->per_socket == skx_cfg.ddr_imc_num) { + for (i = 0; i < skx_cfg.ddr_imc_num; i++) if (m->devfn[i] == pdev->devfn) break; - if (i == SKX_NUM_IMC) + if (i == skx_cfg.ddr_imc_num) goto fail; } d = get_skx_dev(pdev->bus, m->busidx); @@ -157,12 +166,6 @@ static int get_all_munits(const struct munit *m) return -ENODEV; } -static struct res_config skx_cfg = { - .type = SKX, - .decs_did = 0x2016, - .busno_cfg_offset = 0xcc, -}; - static const struct x86_cpu_id skx_cpuids[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg), { } @@ -186,11 +189,11 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) /* Only the mcmtr on the first channel is effective */ pci_read_config_dword(imc->chan[0].cdev, 0x87c, &mcmtr); - for (i = 0; i < SKX_NUM_CHANNELS; i++) { + for (i = 0; i < cfg->ddr_chan_num; i++) { ndimms = 0; pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap); pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg); - for (j = 0; j < SKX_NUM_DIMMS; j++) { + for (j = 0; j < cfg->ddr_dimm_num; j++) { dimm = edac_get_dimm(mci, i, j, 0); pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); @@ -620,6 +623,7 @@ static int __init skx_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + skx_set_res_cfg(cfg); rc = skx_get_hi_lo(0x2034, off, &skx_tolm, &skx_tohm); if (rc) @@ -652,10 +656,13 @@ static int __init skx_init(void) goto fail; edac_dbg(2, "src_id = %d\n", src_id); - for (i = 0; i < SKX_NUM_IMC; i++) { + for (i = 0; i < cfg->ddr_imc_num; i++) { d->imc[i].mc = mc++; d->imc[i].lmc = i; d->imc[i].src_id = src_id; + 
d->imc[i].num_channels = cfg->ddr_chan_num; + d->imc[i].num_dimms = cfg->ddr_dimm_num; + rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 39c733dbc5b9fb..724842f512acac 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -14,9 +14,11 @@ * Copyright (c) 2018, Intel Corporation. */ +#include #include #include #include +#include #include #include #include @@ -130,8 +132,8 @@ static void skx_init_mc_mapping(struct skx_dev *d) * the logical indices of the memory controllers enumerated by the * EDAC driver. */ - for (int i = 0; i < NUM_IMC; i++) - d->mc_mapping[i] = i; + for (int i = 0; i < d->num_imc; i++) + d->imc[i].mc_mapping = i; } void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) @@ -139,22 +141,28 @@ void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", pmc, lmc); - d->mc_mapping[pmc] = lmc; + d->imc[lmc].mc_mapping = pmc; } EXPORT_SYMBOL_GPL(skx_set_mc_mapping); -static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc) { - edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", - pmc, d->mc_mapping[pmc]); + for (int lmc = 0; lmc < d->num_imc; lmc++) { + if (d->imc[lmc].mc_mapping == pmc) { + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); - return d->mc_mapping[pmc]; + return lmc; + } + } + + return -1; } static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { + int i, lmc, len = 0; struct skx_dev *d; - int i, len = 0; if (res->addr >= skx_tohm || (res->addr >= skx_tolm && res->addr < BIT_ULL(32))) { @@ -200,7 +208,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } - if (res->imc > NUM_IMC - 1 || res->imc < 0) { + if (res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -218,7 +226,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } - res->imc = skx_get_mc_mapping(d, res->imc); + lmc = skx_get_mc_mapping(d, res->imc); + if (lmc < 0) { + skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc); + return false; + } + + res->imc = lmc; for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) @@ -265,7 +279,7 @@ static int skx_get_pkg_id(struct skx_dev *d, u8 *id) struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->initialized && cpu_to_node(cpu) == node) { - *id = c->topo.pkg_id; + *id = topology_physical_package_id(cpu); return 0; } } @@ -320,10 +334,10 @@ static int get_width(u32 mtr) */ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) { + int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num; struct pci_dev *pdev, *prev; struct skx_dev *d; u32 reg; - int ndev = 0; prev = NULL; for (;;) { @@ -331,7 +345,7 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) if (!pdev) break; ndev++; - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL); if (!d) { pci_dev_put(pdev); return -ENOMEM; @@ -354,8 +368,10 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) d->seg = GET_BITFIELD(reg, 16, 23); } - edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", - d->bus[0], d->bus[1], d->bus[2], d->bus[3]); + 
d->num_imc = imc_num; + + edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n", + d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num); list_add_tail(&d->list, &dev_edac_list); prev = pdev; @@ -541,10 +557,10 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, /* Allocate a new MC control structure */ layers[0].type = EDAC_MC_LAYER_CHANNEL; - layers[0].size = NUM_CHANNELS; + layers[0].size = imc->num_channels; layers[0].is_virt_csrow = false; layers[1].type = EDAC_MC_LAYER_SLOT; - layers[1].size = NUM_DIMMS; + layers[1].size = imc->num_dimms; layers[1].is_virt_csrow = true; mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, sizeof(struct skx_pvt)); @@ -784,7 +800,7 @@ void skx_remove(void) list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { list_del(&d->list); - for (i = 0; i < NUM_IMC; i++) { + for (i = 0; i < d->num_imc; i++) { if (d->imc[i].mci) skx_unregister_mci(&d->imc[i]); @@ -794,7 +810,7 @@ void skx_remove(void) if (d->imc[i].mbase) iounmap(d->imc[i].mbase); - for (j = 0; j < NUM_CHANNELS; j++) { + for (j = 0; j < d->imc[i].num_channels; j++) { if (d->imc[i].chan[j].cdev) pci_dev_put(d->imc[i].chan[j].cdev); } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index ec4966f7ea40b3..73ba89786cdfd5 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -29,23 +29,18 @@ #define GET_BITFIELD(v, lo, hi) \ (((v) & GENMASK_ULL((hi), (lo))) >> (lo)) -#define SKX_NUM_IMC 2 /* Memory controllers per socket */ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_DDR_IMC 12 #define I10NM_NUM_DDR_CHANNELS 2 #define I10NM_NUM_DDR_DIMMS 2 -#define I10NM_NUM_HBM_IMC 16 #define I10NM_NUM_HBM_CHANNELS 2 #define I10NM_NUM_HBM_DIMMS 1 -#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) @@ -134,16 +129,7 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; - /* - * Some server BIOS may hide certain memory controllers, and the - * EDAC driver skips those hidden memory controllers. However, the - * ADXL still decodes memory error address using physical memory - * controller indices. The mapping table is used to convert the - * physical indices (reported by ADXL) to the logical indices - * (used the EDAC driver) of present memory controllers during the - * error handling process. - */ - u8 mc_mapping[NUM_IMC]; + int num_imc; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ @@ -155,6 +141,16 @@ struct skx_dev { u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id; + /* + * Some server BIOS may hide certain memory controllers, and the + * EDAC driver skips those hidden memory controllers. However, the + * ADXL still decodes memory error address using physical memory + * controller indices. The mapping table is used to convert the + * physical indices (reported by ADXL) to the logical indices + * (used by the EDAC driver) of present memory controllers during the + * error handling process.
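+ *
+ * As a hypothetical illustration: with three physical controllers and
+ * the middle one hidden by BIOS, ADXL can still report pmc 2 for an
+ * error. EDAC enumerates only two logical controllers, records
+ * imc[1].mc_mapping = 2 via skx_set_mc_mapping(), and
+ * skx_get_mc_mapping() later translates pmc 2 back to lmc 1.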
+ */ + u8 mc_mapping; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; @@ -171,7 +167,7 @@ struct skx_dev { u8 colbits; } dimms[NUM_DIMMS]; } chan[NUM_CHANNELS]; - } imc[NUM_IMC]; + } imc[]; }; struct skx_pvt { diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 00000000000000..7c5db8bf0595b6 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * struct ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. + */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. + * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. 
+ * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: Length of the message part received so far. + * @regs: The registers received over rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. + */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory.
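+ *
+ * Worked example with hypothetical ADEC values: if ADEC6 programs
+ * rows.row0 == 18, row bit 0 of the decoded error is scattered to bit
+ * 18 of the reconstructed address (err_addr |= (row & BIT(0)) << 18);
+ * every remaining row/column/bank/rank bit is placed the same way,
+ * using the bit positions held in the ADEC registers.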
+ */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= 
MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. + */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + 
dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. 
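+ * For example (hypothetical split), a 30-word register dump may
+ * arrive as 20 words at offset 0 followed by 10 words at offset 20.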
Construct the regs from the two + * messages; the offset indicates where in regs[] the received words + * belong. + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: err_str = "fault_int_irq from APU GIC Distributor"; break; + case 132 ... 139: err_str = "FPX SPLITTER error"; break; + case 140: err_str = "APU Cluster 0 error"; break; + case 141: err_str = "APU Cluster 1 error"; break; + case 142: err_str = "APU Cluster 2 error"; break; + case 143: err_str = "APU Cluster 3 error"; break; + case 145: err_str = "WWDT1 LPX error"; break; + case 147: err_str = "IPI error"; break; + case 152 ... 153: err_str = "AFIFS error"; break; + case 154 ... 155: err_str = "LPX glitch error"; break; + case 185 ... 186: err_str = "FPX AFIFS error"; break; + case 195 ... 199: err_str = "AFIFM error"; break; + case 108: err_str = "PSM Correctable error"; break; + case 59: err_str = "PMC correctable error"; break; + case 60: err_str = "PMC Un-correctable error"; break; + case 43 ... 47: err_str = "PMC Sysmon error"; break; + case 163 ...
184: err_str = "RPU error"; break; + case 148: err_str = "OCM0 correctable error"; break; + case 149: err_str = "OCM1 correctable error"; break; + case 150: err_str = "OCM0 Un-correctable error"; break; + case 151: err_str = "OCM1 Un-correctable error"; break; + case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break; + case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break; + case 232: err_str = "CRAM Un-Correctable error"; break; + default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break; + } + + snprintf(mc_priv->message, + sizeof(mc_priv->message), + "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str); + + /* Convert to bytes */ + length = result[TOTAL_ERR_LENGTH] * 4; + log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, + sec_sev, (void *)&result[ERROR_DATA], length); + + return 0; +} + +static struct rpmsg_device_id amd_rpmsg_id_table[] = { + { .name = "error_ipc" }, + { }, +}; +MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table); + +static int rpmsg_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_channel_info chinfo; + struct mc_priv *pg; + + pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data; + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpdev->dst; + strscpy(chinfo.name, amd_rpmsg_id_table[0].name, + strlen(amd_rpmsg_id_table[0].name)); + + pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo); + if (!pg->ept) + return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n", + chinfo.name); + + dev_set_drvdata(&rpdev->dev, pg); + + return 0; +} + +static void rpmsg_remove(struct rpmsg_device *rpdev) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + + rpmsg_destroy_ept(mc_priv->ept); + dev_set_drvdata(&rpdev->dev, NULL); +} + +static struct rpmsg_driver amd_rpmsg_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_probe, + .remove = rpmsg_remove, + .callback = rpmsg_cb, + .id_table = amd_rpmsg_id_table, +}; + +static void versal_edac_release(struct device *dev) +{ + kfree(dev); +} + +static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev) +{ + u32 num_chans, rank, dwidth, config; + struct edac_mc_layer layers[2]; + struct mem_ctl_info *mci; + struct device *dev; + enum dev_type dt; + char *name; + int rc, i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + config = priv->adec[CONF + i * ADEC_NUM]; + num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config); + rank = 1 << FIELD_GET(MC5_RANK_MASK, config); + dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config); + + switch (dwidth) { + case XDDR5_BUS_WIDTH_16: + dt = DEV_X16; + break; + case XDDR5_BUS_WIDTH_32: + dt = DEV_X32; + break; + case XDDR5_BUS_WIDTH_64: + dt = DEV_X64; + break; + default: + dt = DEV_UNKNOWN; + } + + if (dt == DEV_UNKNOWN) + continue; + + /* Find the first enabled device and register that one. 
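+ * Controllers whose bus width decodes to an unknown device type are
+ * skipped; each remaining controller is registered in turn.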
*/ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = rank; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + + rc = -ENOMEM; + mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, + sizeof(struct mc_priv)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i); + goto err_alloc; + } + + priv->mci[i] = mci; + priv->dwidth = dt; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev->release = versal_edac_release; + name = kmalloc(32, GFP_KERNEL); + sprintf(name, "versal-net-ddrmc5-edac-%d", i); + dev->init_name = name; + rc = device_register(dev); + if (rc) + goto err_alloc; + + mci->pdev = dev; + + platform_set_drvdata(pdev, priv); + + mc_init(mci, dev); + rc = edac_mc_add_mc(mci); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i); + goto err_alloc; + } + } + return 0; + +err_alloc: + while (i--) { + mci = priv->mci[i]; + if (!mci) + continue; + + if (mci->pdev) { + device_unregister(mci->pdev); + edac_mc_del_mc(mci->pdev); + } + + edac_mc_free(mci); + } + + return rc; +} + +static void remove_versalnet(struct mc_priv *priv) +{ + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + device_unregister(priv->mci[i]->pdev); + mci = edac_mc_del_mc(priv->mci[i]->pdev); + if (!mci) + return; + + edac_mc_free(mci); + } +} + +static int mc_probe(struct platform_device *pdev) +{ + struct device_node *r5_core_node; + struct mc_priv *priv; + struct rproc *rp; + int rc; + + r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0); + if (!r5_core_node) { + dev_err(&pdev->dev, "amd,rproc: invalid phandle\n"); + return -EINVAL; + } + + rp = rproc_get_by_phandle(r5_core_node->phandle); + if (!rp) + return -EPROBE_DEFER; + + rc = rproc_boot(rp); + if (rc) { + dev_err(&pdev->dev, "Failed to attach to remote processor\n"); + goto err_rproc_boot; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) { + rc = -ENOMEM; + goto err_alloc; + } + + amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; + + rc = register_rpmsg_driver(&amd_rpmsg_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc); + goto err_alloc; + } + + rc = setup_mcdi(priv); + if (rc) + goto err_unreg; + + priv->mcdi->r5_rproc = rp; + + rc = init_versalnet(priv, pdev); + if (rc) + goto err_init; + + return 0; + +err_init: + cdx_mcdi_finish(priv->mcdi); + +err_unreg: + unregister_rpmsg_driver(&amd_rpmsg_driver); + +err_alloc: + rproc_shutdown(rp); + +err_rproc_boot: + rproc_put(rp); + + return rc; +} + +static void mc_remove(struct platform_device *pdev) +{ + struct mc_priv *priv = platform_get_drvdata(pdev); + + unregister_rpmsg_driver(&amd_rpmsg_driver); + remove_versalnet(priv); + rproc_shutdown(priv->mcdi->r5_rproc); + cdx_mcdi_finish(priv->mcdi); +} + +static const struct of_device_id amd_edac_match[] = { + { .compatible = "xlnx,versal-net-ddrmc5", }, + {} +}; +MODULE_DEVICE_TABLE(of, amd_edac_match); + +static struct platform_driver amd_ddr_edac_mc_driver = { + .driver = { + .name = "versal-net-edac", + .of_match_table = amd_edac_match, + }, + .probe = mc_probe, + .remove = mc_remove, +}; + +module_platform_driver(amd_ddr_edac_mc_driver); + +MODULE_AUTHOR("AMD Inc"); +MODULE_DESCRIPTION("Versal NET EDAC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index aae774e7a5c3eb..e5e0174a0335c2 100644 
--- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -229,8 +229,7 @@ void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset) /* Use an arbitrary short delay to combine multiple reset requests. */ fw_card_get(card); - if (!queue_delayed_work(fw_workqueue, &card->br_work, - delayed ? DIV_ROUND_UP(HZ, 100) : 0)) + if (!queue_delayed_work(fw_workqueue, &card->br_work, delayed ? msecs_to_jiffies(10) : 0)) fw_card_put(card); } EXPORT_SYMBOL(fw_schedule_bus_reset); @@ -241,10 +240,10 @@ static void br_work(struct work_struct *work) /* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. */ if (card->reset_jiffies != 0 && - time_before64(get_jiffies_64(), card->reset_jiffies + 2 * HZ)) { + time_is_after_jiffies64(card->reset_jiffies + secs_to_jiffies(2))) { trace_bus_reset_postpone(card->index, card->generation, card->br_short); - if (!queue_delayed_work(fw_workqueue, &card->br_work, 2 * HZ)) + if (!queue_delayed_work(fw_workqueue, &card->br_work, secs_to_jiffies(2))) fw_card_put(card); return; } @@ -280,225 +279,254 @@ void fw_schedule_bm_work(struct fw_card *card, unsigned long delay) fw_card_put(card); } -static void bm_work(struct work_struct *work) +enum bm_contention_outcome { + // The bus management contention window has not expired. + BM_CONTENTION_OUTCOME_WITHIN_WINDOW = 0, + // The IRM node has link off. + BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF, + // The IRM node complies with IEEE 1394:1995 only. + BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY, + // Another bus reset, BM work has been rescheduled. + BM_CONTENTION_OUTCOME_AT_NEW_GENERATION, + // We have been unable to send the lock request to the IRM node due to some local problem. + BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION, + // The lock request failed; maybe the IRM isn't really IRM capable after all. + BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM, + // Somebody else is BM. + BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM, + // The local node succeeds after contending for bus manager.
+ BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM, +}; + +static enum bm_contention_outcome contend_for_bm(struct fw_card *card) +__must_hold(&card->lock) { - static const char gap_count_table[] = { - 63, 5, 7, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 35, 37, 40 + int generation = card->generation; + int local_id = card->local_node->node_id; + __be32 data[2] = { + cpu_to_be32(BUS_MANAGER_ID_NOT_REGISTERED), + cpu_to_be32(local_id), }; - struct fw_card *card = from_work(card, work, bm_work.work); - struct fw_device *root_device, *irm_device; - struct fw_node *root_node; - int root_id, new_root_id, irm_id, bm_id, local_id; - int gap_count, generation, grace, rcode; - bool do_reset = false; - bool root_device_is_running; - bool root_device_is_cmc; - bool irm_is_1394_1995_only; - bool keep_this_irm; - __be32 transaction_data[2]; - - spin_lock_irq(&card->lock); + bool grace = time_is_before_jiffies64(card->reset_jiffies + msecs_to_jiffies(125)); + bool irm_is_1394_1995_only = false; + bool keep_this_irm = false; + struct fw_node *irm_node; + struct fw_device *irm_device; + int irm_node_id; + int rcode; + + lockdep_assert_held(&card->lock); + + if (!grace) { + if (!is_next_generation(generation, card->bm_generation) || card->bm_abdicate) + return BM_CONTENTION_OUTCOME_WITHIN_WINDOW; + } - if (card->local_node == NULL) { - spin_unlock_irq(&card->lock); - goto out_put_card; + irm_node = card->irm_node; + if (!irm_node->link_on) { + fw_notice(card, "IRM has link off, making local node (%02x) root\n", local_id); + return BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF; } - generation = card->generation; + irm_device = fw_node_get_device(irm_node); + if (irm_device && irm_device->config_rom) { + irm_is_1394_1995_only = (irm_device->config_rom[2] & 0x000000f0) == 0; - root_node = card->root_node; - fw_node_get(root_node); - root_device = root_node->data; - root_device_is_running = root_device && - atomic_read(&root_device->state) == FW_DEVICE_RUNNING; - root_device_is_cmc = root_device && root_device->cmc; + // Canon MV5i works unreliably if it is not root node. + keep_this_irm = irm_device->config_rom[3] >> 8 == CANON_OUI; + } - irm_device = card->irm_node->data; - irm_is_1394_1995_only = irm_device && irm_device->config_rom && - (irm_device->config_rom[2] & 0x000000f0) == 0; + if (irm_is_1394_1995_only && !keep_this_irm) { + fw_notice(card, "IRM is not 1394a compliant, making local node (%02x) root\n", + local_id); + return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; + } - /* Canon MV5i works unreliably if it is not root node. */ - keep_this_irm = irm_device && irm_device->config_rom && - irm_device->config_rom[3] >> 8 == CANON_OUI; + irm_node_id = irm_node->node_id; - root_id = root_node->node_id; - irm_id = card->irm_node->node_id; - local_id = card->local_node->node_id; + spin_unlock_irq(&card->lock); - grace = time_after64(get_jiffies_64(), - card->reset_jiffies + DIV_ROUND_UP(HZ, 8)); + rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_node_id, generation, + SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, data, + sizeof(data)); - if ((is_next_generation(generation, card->bm_generation) && - !card->bm_abdicate) || - (card->bm_generation != generation && grace)) { - /* - * This first step is to figure out who is IRM and - * then try to become bus manager. If the IRM is not - * well defined (e.g. does not have an active link - * layer or does not responds to our lock request, we - * will have to do a little vigilante bus management. 
- * In that case, we do a goto into the gap count logic - * so that when we do the reset, we still optimize the - * gap count. That could well save a reset in the - * next generation. - */ + spin_lock_irq(&card->lock); - if (!card->irm_node->link_on) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM has link off", new_root_id); - goto pick_me; + switch (rcode) { + case RCODE_GENERATION: + return BM_CONTENTION_OUTCOME_AT_NEW_GENERATION; + case RCODE_SEND_ERROR: + return BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION; + case RCODE_COMPLETE: + { + int bm_id = be32_to_cpu(data[0]); + + // Used by cdev layer for "struct fw_cdev_event_bus_reset". + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + card->bm_node_id = 0xffc0 & bm_id; + else + card->bm_node_id = local_id; + + if (bm_id != BUS_MANAGER_ID_NOT_REGISTERED) + return BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM; + else + return BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM; + } + default: + if (!keep_this_irm) { + fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", + fw_rcode_string(rcode), local_id); + return BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY; + } else { + return BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM; } + } +} - if (irm_is_1394_1995_only && !keep_this_irm) { - new_root_id = local_id; - fw_notice(card, "%s, making local node (%02x) root\n", - "IRM is not 1394a compliant", new_root_id); - goto pick_me; - } +DEFINE_FREE(node_unref, struct fw_node *, if (_T) fw_node_put(_T)) +DEFINE_FREE(card_unref, struct fw_card *, if (_T) fw_card_put(_T)) + +static void bm_work(struct work_struct *work) +{ + static const char gap_count_table[] = { + 63, 5, 7, 8, 10, 13, 16, 18, 21, 24, 26, 29, 32, 35, 37, 40 + }; + struct fw_card *card __free(card_unref) = from_work(card, work, bm_work.work); + struct fw_node *root_node __free(node_unref) = NULL; + int root_id, new_root_id, irm_id, local_id; + int expected_gap_count, generation; + bool stand_for_root = false; - transaction_data[0] = cpu_to_be32(0x3f); - transaction_data[1] = cpu_to_be32(local_id); + spin_lock_irq(&card->lock); + if (card->local_node == NULL) { spin_unlock_irq(&card->lock); + return; + } - rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, - irm_id, generation, SCODE_100, - CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, - transaction_data, 8); + generation = card->generation; - if (rcode == RCODE_GENERATION) - /* Another bus reset, BM work has been rescheduled. */ - goto out; + root_node = fw_node_get(card->root_node); - bm_id = be32_to_cpu(transaction_data[0]); + root_id = root_node->node_id; + irm_id = card->irm_node->node_id; + local_id = card->local_node->node_id; - scoped_guard(spinlock_irq, &card->lock) { - if (rcode == RCODE_COMPLETE && generation == card->generation) - card->bm_node_id = - bm_id == 0x3f ? local_id : 0xffc0 | bm_id; - } + if (card->bm_generation != generation) { + enum bm_contention_outcome result = contend_for_bm(card); - if (rcode == RCODE_COMPLETE && bm_id != 0x3f) { - /* Somebody else is BM. Only act as IRM. */ - if (local_id == irm_id) + switch (result) { + case BM_CONTENTION_OUTCOME_WITHIN_WINDOW: + spin_unlock_irq(&card->lock); + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; + case BM_CONTENTION_OUTCOME_IRM_HAS_LINK_OFF: + stand_for_root = true; + break; + case BM_CONTENTION_OUTCOME_IRM_COMPLIES_1394_1995_ONLY: + stand_for_root = true; + break; + case BM_CONTENTION_OUTCOME_AT_NEW_GENERATION: + // BM work has been rescheduled. 
+ spin_unlock_irq(&card->lock); + return; + case BM_CONTENTION_OUTCOME_LOCAL_PROBLEM_AT_TRANSACTION: + // Let's try again later and hope that the local problem has gone away by + // then. + spin_unlock_irq(&card->lock); + fw_schedule_bm_work(card, msecs_to_jiffies(125)); + return; + case BM_CONTENTION_OUTCOME_IRM_IS_NOT_CAPABLE_FOR_IRM: + // Let's do a bus reset and pick the local node as root, and thus, IRM. + stand_for_root = true; + break; + case BM_CONTENTION_OUTCOME_IRM_HOLDS_ANOTHER_NODE_AS_BM: + if (local_id == irm_id) { + // Only acts as IRM. + spin_unlock_irq(&card->lock); allocate_broadcast_channel(card, generation); - - goto out; - } - - if (rcode == RCODE_SEND_ERROR) { - /* - * We have been unable to send the lock request due to - * some local problem. Let's try again later and hope - * that the problem has gone away by then. - */ - fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); - goto out; + spin_lock_irq(&card->lock); + } + fallthrough; + case BM_CONTENTION_OUTCOME_IRM_HOLDS_LOCAL_NODE_AS_BM: + default: + card->bm_generation = generation; + break; } + } - spin_lock_irq(&card->lock); - - if (rcode != RCODE_COMPLETE && !keep_this_irm) { - /* - * The lock request failed, maybe the IRM - * isn't really IRM capable after all. Let's - * do a bus reset and pick the local node as - * root, and thus, IRM. - */ - new_root_id = local_id; - fw_notice(card, "BM lock failed (%s), making local node (%02x) root\n", - fw_rcode_string(rcode), new_root_id); - goto pick_me; + // We're bus manager for this generation, so next step is to make sure we have an active + // cycle master and do gap count optimization. + if (!stand_for_root) { + if (card->gap_count == GAP_COUNT_MISMATCHED) { + // If self IDs have inconsistent gap counts, do a + // bus reset ASAP. The config rom read might never + // complete, so don't wait for it. However, still + // send a PHY configuration packet prior to the + // bus reset. The PHY configuration packet might + // fail, but 1394-2008 8.4.5.2 explicitly permits + // it in this case, so it should be safe to try. + stand_for_root = true; + + // We must always send a bus reset if the gap count + // is inconsistent, so bypass the 5-reset limit. + card->bm_retries = 0; + } else { + // Now investigate root node. + struct fw_device *root_device = fw_node_get_device(root_node); + + if (root_device == NULL) { + // Either link_on is false, or we failed to read the + // config rom. In either case, pick another root. + stand_for_root = true; + } else { + bool root_device_is_running = + atomic_read(&root_device->state) == FW_DEVICE_RUNNING; + + if (!root_device_is_running) { + // If we haven't probed this device yet, bail out now + // and let's try again once that's done. + spin_unlock_irq(&card->lock); + return; + } else if (!root_device->cmc) { + // Current root has an active link layer and we + // successfully read the config rom, but it's not + // cycle master capable. + stand_for_root = true; + } + } } - } else if (card->bm_generation != generation) { - /* - * We weren't BM in the last generation, and the last - * bus reset is less than 125ms ago. Reschedule this job. - */ - spin_unlock_irq(&card->lock); - fw_schedule_bm_work(card, DIV_ROUND_UP(HZ, 8)); - goto out; } - /* - * We're bus manager for this generation, so next step is to - * make sure we have an active cycle master and do gap count - * optimization. - */ - card->bm_generation = generation; - - if (card->gap_count == 0) { - /* - * If self IDs have inconsistent gap counts, do a - * bus reset ASAP. 
The config rom read might never - * complete, so don't wait for it. However, still - * send a PHY configuration packet prior to the - * bus reset. The PHY configuration packet might - * fail, but 1394-2008 8.4.5.2 explicitly permits - * it in this case, so it should be safe to try. - */ - new_root_id = local_id; - /* - * We must always send a bus reset if the gap count - * is inconsistent, so bypass the 5-reset limit. - */ - card->bm_retries = 0; - } else if (root_device == NULL) { - /* - * Either link_on is false, or we failed to read the - * config rom. In either case, pick another root. - */ + if (stand_for_root) { new_root_id = local_id; - } else if (!root_device_is_running) { - /* - * If we haven't probed this device yet, bail out now - * and let's try again once that's done. - */ - spin_unlock_irq(&card->lock); - goto out; - } else if (root_device_is_cmc) { - /* - * We will send out a force root packet for this - * node as part of the gap count optimization. - */ - new_root_id = root_id; } else { - /* - * Current root has an active link layer and we - * successfully read the config rom, but it's not - * cycle master capable. - */ - new_root_id = local_id; + // We will send out a force root packet for this node as part of the gap count + // optimization on behalf of the node. + new_root_id = root_id; } - pick_me: /* * Pick a gap count from 1394a table E-1. The table doesn't cover * the typically much larger 1394b beta repeater delays though. */ if (!card->beta_repeaters_present && root_node->max_hops < ARRAY_SIZE(gap_count_table)) - gap_count = gap_count_table[root_node->max_hops]; + expected_gap_count = gap_count_table[root_node->max_hops]; else - gap_count = 63; + expected_gap_count = 63; - /* - * Finally, figure out if we should do a reset or not. If we have - * done less than 5 resets with the same physical topology and we - * have either a new root or a new gap count setting, let's do it. - */ - - if (card->bm_retries++ < 5 && - (card->gap_count != gap_count || new_root_id != root_id)) - do_reset = true; + // Finally, figure out if we should do a reset or not. If we have done less than 5 resets + // with the same physical topology and we have either a new root or a new gap count + // setting, let's do it. + if (card->bm_retries++ < 5 && (card->gap_count != expected_gap_count || new_root_id != root_id)) { + int card_gap_count = card->gap_count; - spin_unlock_irq(&card->lock); + spin_unlock_irq(&card->lock); - if (do_reset) { fw_notice(card, "phy config: new root=%x, gap_count=%d\n", - new_root_id, gap_count); - fw_send_phy_config(card, new_root_id, generation, gap_count); + new_root_id, expected_gap_count); + fw_send_phy_config(card, new_root_id, generation, expected_gap_count); /* * Where possible, use a short bus reset to minimize * disruption to isochronous transfers. But in the event @@ -511,31 +539,27 @@ static void bm_work(struct work_struct *work) * may treat it as two, causing a gap count inconsistency * again. Using a long bus reset prevents this. */ - reset_bus(card, card->gap_count != 0); + reset_bus(card, card_gap_count != 0); /* Will allocate broadcast channel after the reset. */ - goto out; - } + } else { + struct fw_device *root_device = fw_node_get_device(root_node); - if (root_device_is_cmc) { - /* - * Make sure that the cycle master sends cycle start packets. 
- */ - transaction_data[0] = cpu_to_be32(CSR_STATE_BIT_CMSTR); - rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, - root_id, generation, SCODE_100, - CSR_REGISTER_BASE + CSR_STATE_SET, - transaction_data, 4); - if (rcode == RCODE_GENERATION) - goto out; - } + spin_unlock_irq(&card->lock); - if (local_id == irm_id) - allocate_broadcast_channel(card, generation); + if (root_device && root_device->cmc) { + // Make sure that the cycle master sends cycle start packets. + __be32 data = cpu_to_be32(CSR_STATE_BIT_CMSTR); + int rcode = fw_run_transaction(card, TCODE_WRITE_QUADLET_REQUEST, + root_id, generation, SCODE_100, + CSR_REGISTER_BASE + CSR_STATE_SET, + &data, sizeof(data)); + if (rcode == RCODE_GENERATION) + return; + } - out: - fw_node_put(root_node); - out_put_card: - fw_card_put(card); + if (local_id == irm_id) + allocate_broadcast_channel(card, generation); + } } void fw_card_initialize(struct fw_card *card, @@ -547,20 +571,24 @@ void fw_card_initialize(struct fw_card *card, card->index = atomic_inc_return(&index); card->driver = driver; card->device = device; - card->current_tlabel = 0; - card->tlabel_mask = 0; - card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; - card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; - card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; - card->split_timeout_jiffies = - DIV_ROUND_UP(DEFAULT_SPLIT_TIMEOUT * HZ, 8000); + + card->transactions.current_tlabel = 0; + card->transactions.tlabel_mask = 0; + INIT_LIST_HEAD(&card->transactions.list); + spin_lock_init(&card->transactions.lock); + + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; + card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; + card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + spin_lock_init(&card->split_timeout.lock); + card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; kref_init(&card->kref); init_completion(&card->done); - INIT_LIST_HEAD(&card->transaction_list); - INIT_LIST_HEAD(&card->phy_receiver_list); + spin_lock_init(&card->lock); card->local_node = NULL; @@ -570,9 +598,13 @@ void fw_card_initialize(struct fw_card *card, } EXPORT_SYMBOL(fw_card_initialize); +DEFINE_FREE(workqueue_destroy, struct workqueue_struct *, if (_T) destroy_workqueue(_T)) + int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, unsigned int supported_isoc_contexts) { + struct workqueue_struct *isoc_wq __free(workqueue_destroy) = NULL; + struct workqueue_struct *async_wq __free(workqueue_destroy) = NULL; int ret; // This workqueue should be: @@ -587,10 +619,10 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, // * == WQ_SYSFS Parameters are available via sysfs. // * max_active == n_it + n_ir A hardIRQ could notify events for multiple isochronous // contexts if they are scheduled to the same cycle. - card->isoc_wq = alloc_workqueue("firewire-isoc-card%u", - WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, - supported_isoc_contexts, card->index); - if (!card->isoc_wq) + isoc_wq = alloc_workqueue("firewire-isoc-card%u", + WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, + supported_isoc_contexts, card->index); + if (!isoc_wq) return -ENOMEM; // This workqueue should be: @@ -602,14 +634,14 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, // * == WQ_SYSFS Parameters are available via sysfs. 
// * max_active == 4 A hardIRQ could notify events for a pair of requests and // response AR/AT contexts. - card->async_wq = alloc_workqueue("firewire-async-card%u", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, - 4, card->index); - if (!card->async_wq) { - ret = -ENOMEM; - goto err_isoc; - } + async_wq = alloc_workqueue("firewire-async-card%u", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS, + 4, card->index); + if (!async_wq) + return -ENOMEM; + card->isoc_wq = isoc_wq; + card->async_wq = async_wq; card->max_receive = max_receive; card->link_speed = link_speed; card->guid = guid; @@ -617,18 +649,18 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid, scoped_guard(mutex, &card_mutex) { generate_config_rom(card, tmp_config_rom); ret = card->driver->enable(card, tmp_config_rom, config_rom_length); - if (ret < 0) - goto err_async; + if (ret < 0) { + card->isoc_wq = NULL; + card->async_wq = NULL; + return ret; + } + retain_and_null_ptr(isoc_wq); + retain_and_null_ptr(async_wq); list_add_tail(&card->link, &card_list); } return 0; -err_async: - destroy_workqueue(card->async_wq); -err_isoc: - destroy_workqueue(card->isoc_wq); - return ret; } EXPORT_SYMBOL(fw_card_add); @@ -773,7 +805,7 @@ void fw_core_remove_card(struct fw_card *card) destroy_workqueue(card->isoc_wq); destroy_workqueue(card->async_wq); - WARN_ON(!list_empty(&card->transaction_list)); + WARN_ON(!list_empty(&card->transactions.list)); } EXPORT_SYMBOL(fw_core_remove_card); diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 78b10c6ef7fec7..49dc1612c6911e 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -41,12 +41,15 @@ /* * ABI version history is documented in linux/firewire-cdev.h. */ -#define FW_CDEV_KERNEL_VERSION 5 +#define FW_CDEV_KERNEL_VERSION 6 #define FW_CDEV_VERSION_EVENT_REQUEST2 4 #define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 #define FW_CDEV_VERSION_EVENT_ASYNC_TSTAMP 6 +static DEFINE_SPINLOCK(phy_receiver_list_lock); +static LIST_HEAD(phy_receiver_list); + struct client { u32 version; struct fw_device *device; @@ -937,11 +940,12 @@ static int ioctl_add_descriptor(struct client *client, union ioctl_arg *arg) if (a->length > 256) return -EINVAL; - r = kmalloc(sizeof(*r) + a->length * 4, GFP_KERNEL); + r = kmalloc(struct_size(r, data, a->length), GFP_KERNEL); if (r == NULL) return -ENOMEM; - if (copy_from_user(r->data, u64_to_uptr(a->data), a->length * 4)) { + if (copy_from_user(r->data, u64_to_uptr(a->data), + flex_array_size(r, data, a->length))) { ret = -EFAULT; goto failed; } @@ -1324,8 +1328,8 @@ static void iso_resource_work(struct work_struct *work) todo = r->todo; // Allow 1000ms grace period for other reallocations. if (todo == ISO_RES_ALLOC && - time_before64(get_jiffies_64(), client->device->card->reset_jiffies + HZ)) { - schedule_iso_resource(r, DIV_ROUND_UP(HZ, 3)); + time_is_after_jiffies64(client->device->card->reset_jiffies + secs_to_jiffies(1))) { + schedule_iso_resource(r, msecs_to_jiffies(333)); skip = true; } else { // We could be called twice within the same generation. 
@@ -1669,15 +1673,16 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg) { struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets; - struct fw_card *card = client->device->card; /* Access policy: Allow this ioctl only on local nodes' device files. */ if (!client->device->is_local) return -ENOSYS; - guard(spinlock_irq)(&card->lock); + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) + list_move_tail(&client->phy_receiver_link, &phy_receiver_list); - list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list); client->phy_receiver_closure = a->closure; return 0; @@ -1687,10 +1692,17 @@ void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p) { struct client *client; - guard(spinlock_irqsave)(&card->lock); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + guard(spinlock_irqsave)(&phy_receiver_list_lock); + + list_for_each_entry(client, &phy_receiver_list, phy_receiver_link) { + struct inbound_phy_packet_event *e; + + if (client->device->card != card) + continue; - list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) { - struct inbound_phy_packet_event *e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); + e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); if (e == NULL) break; @@ -1857,7 +1869,9 @@ static int fw_device_op_release(struct inode *inode, struct file *file) struct client_resource *resource; unsigned long index; - scoped_guard(spinlock_irq, &client->device->card->lock) + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) list_del(&client->phy_receiver_link); scoped_guard(mutex, &client->device->client_list_mutex) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index aeacd4cfd6944e..457a0da024a761 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -847,16 +847,15 @@ static void fw_schedule_device_work(struct fw_device *device, */ #define MAX_RETRIES 10 -#define RETRY_DELAY (3 * HZ) -#define INITIAL_DELAY (HZ / 2) -#define SHUTDOWN_DELAY (2 * HZ) +#define RETRY_DELAY secs_to_jiffies(3) +#define INITIAL_DELAY msecs_to_jiffies(500) +#define SHUTDOWN_DELAY secs_to_jiffies(2) static void fw_device_shutdown(struct work_struct *work) { struct fw_device *device = from_work(device, work, work.work); - if (time_before64(get_jiffies_64(), - device->card->reset_jiffies + SHUTDOWN_DELAY) + if (time_is_after_jiffies64(device->card->reset_jiffies + SHUTDOWN_DELAY) && !list_empty(&device->card->link)) { fw_schedule_device_work(device, SHUTDOWN_DELAY); return; @@ -887,7 +886,7 @@ static void fw_device_release(struct device *dev) * bus manager work looks at this node. 
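 * (bm_work() reaches the device via fw_node_get_device(), so it must
 * not see a stale pointer here.)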
*/ scoped_guard(spinlock_irqsave, &card->lock) - device->node->data = NULL; + fw_node_set_device(device->node, NULL); fw_node_put(device->node); kfree(device->config_rom); @@ -1007,7 +1006,7 @@ static void fw_device_init(struct work_struct *work) int ret; /* - * All failure paths here set node->data to NULL, so that we + * All failure paths here call fw_node_set_device(node, NULL), so that we * don't try to do device_for_each_child() on a kfree()'d * device. */ @@ -1051,9 +1050,9 @@ static void fw_device_init(struct work_struct *work) struct fw_node *obsolete_node = reused->node; device->node = obsolete_node; - device->node->data = device; + fw_node_set_device(device->node, device); reused->node = current_node; - reused->node->data = reused; + fw_node_set_device(reused->node, reused); reused->max_speed = device->max_speed; reused->node_id = current_node->node_id; @@ -1292,7 +1291,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * FW_NODE_UPDATED callbacks can update the node_id * and generation for the device. */ - node->data = device; + fw_node_set_device(node, device); /* * Many devices are slow to respond after bus resets, @@ -1307,7 +1306,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) case FW_NODE_INITIATED_RESET: case FW_NODE_LINK_ON: - device = node->data; + device = fw_node_get_device(node); if (device == NULL) goto create; @@ -1324,7 +1323,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) break; case FW_NODE_UPDATED: - device = node->data; + device = fw_node_get_device(node); if (device == NULL) break; @@ -1339,7 +1338,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) case FW_NODE_DESTROYED: case FW_NODE_LINK_OFF: - if (!node->data) + if (!fw_node_get_device(node)) break; /* @@ -1354,7 +1353,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * the device in shutdown state to have that code fail * to create the device. */ - device = node->data; + device = fw_node_get_device(node); if (atomic_xchg(&device->state, FW_DEVICE_GONE) == FW_DEVICE_RUNNING) { device->workfn = fw_device_shutdown; diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 74a6aa7d8cc92c..2f73bcd5696f2b 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -241,7 +241,7 @@ static struct fw_node *build_tree(struct fw_card *card, const u32 *sid, int self // If PHYs report different gap counts, set an invalid count which will force a gap // count reconfiguration and a reset. 
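// GAP_COUNT_MISMATCHED names the sentinel that used to be a bare 0;
	// bm_work() checks for it when forcing a reconfiguring bus reset.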
if (phy_packet_self_id_zero_get_gap_count(self_id_sequence[0]) != gap_count) - gap_count = 0; + gap_count = GAP_COUNT_MISMATCHED; update_hop_count(node); @@ -325,9 +325,11 @@ static void report_found_node(struct fw_card *card, card->bm_retries = 0; } -/* Must be called with card->lock held */ void fw_destroy_nodes(struct fw_card *card) +__must_hold(&card->lock) { + lockdep_assert_held(&card->lock); + card->color++; if (card->local_node != NULL) for_each_fw_node(card, card->local_node, report_lost_node); @@ -435,20 +437,22 @@ static void update_tree(struct fw_card *card, struct fw_node *root) } } -static void update_topology_map(struct fw_card *card, - u32 *self_ids, int self_id_count) +static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_node_id, + const u32 *self_ids, int self_id_count) { - int node_count = (card->root_node->node_id & 0x3f) + 1; - __be32 *map = card->topology_map; + __be32 *map = buffer; + int node_count = (root_node_id & 0x3f) + 1; + + memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1); + *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) *map++ = cpu_to_be32p(self_ids++); - fw_compute_block_crc(card->topology_map); + fw_compute_block_crc(buffer); } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, @@ -458,46 +462,45 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, trace_bus_reset_handle(card->index, generation, node_id, bm_abdicate, self_ids, self_id_count); - guard(spinlock_irqsave)(&card->lock); - - /* - * If the selfID buffer is not the immediate successor of the - * previously processed one, we cannot reliably compare the - * old and new topologies. - */ - if (!is_next_generation(generation, card->generation) && - card->local_node != NULL) { - fw_destroy_nodes(card); - card->bm_retries = 0; + scoped_guard(spinlock, &card->lock) { + // If the selfID buffer is not the immediate successor of the + // previously processed one, we cannot reliably compare the + // old and new topologies. + if (!is_next_generation(generation, card->generation) && card->local_node != NULL) { + fw_destroy_nodes(card); + card->bm_retries = 0; + } + card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; + card->node_id = node_id; + // Update node_id before generation to prevent anybody from using + // a stale node_id together with a current generation. + smp_wmb(); + card->generation = generation; + card->reset_jiffies = get_jiffies_64(); + card->bm_node_id = 0xffff; + card->bm_abdicate = bm_abdicate; + + local_node = build_tree(card, self_ids, self_id_count, generation); + + card->color++; + + if (local_node == NULL) { + fw_err(card, "topology build failed\n"); + // FIXME: We need to issue a bus reset in this case. + } else if (card->local_node == NULL) { + card->local_node = local_node; + for_each_fw_node(card, local_node, report_found_node); + } else { + update_tree(card, local_node); + } } - card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; - card->node_id = node_id; - /* - * Update node_id before generation to prevent anybody from using - * a stale node_id together with a current generation. 
- */ - smp_wmb(); - card->generation = generation; - card->reset_jiffies = get_jiffies_64(); - card->bm_node_id = 0xffff; - card->bm_abdicate = bm_abdicate; fw_schedule_bm_work(card, 0); - local_node = build_tree(card, self_ids, self_id_count, generation); - - update_topology_map(card, self_ids, self_id_count); - - card->color++; - - if (local_node == NULL) { - fw_err(card, "topology build failed\n"); - /* FIXME: We need to issue a bus reset in this case. */ - } else if (card->local_node == NULL) { - card->local_node = local_node; - for_each_fw_node(card, local_node, report_found_node); - } else { - update_tree(card, local_node); + // Just used by transaction layer. + scoped_guard(spinlock, &card->topology_map.lock) { + update_topology_map(card->topology_map.buffer, sizeof(card->topology_map.buffer), + card->root_node->node_id, self_ids, self_id_count); } } EXPORT_SYMBOL(fw_core_handle_bus_reset); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 1d1c2d8f85aec8..dd3656a0c1ff0d 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -49,12 +49,14 @@ static int close_transaction(struct fw_transaction *transaction, struct fw_card { struct fw_transaction *t = NULL, *iter; - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter == transaction) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; @@ -117,11 +119,11 @@ static void split_transaction_timeout_callback(struct timer_list *timer) struct fw_transaction *t = timer_container_of(t, timer, split_timeout_timer); struct fw_card *card = t->card; - scoped_guard(spinlock_irqsave, &card->lock) { + scoped_guard(spinlock_irqsave, &card->transactions.lock) { if (list_empty(&t->link)) return; list_del(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << t->tlabel); } if (!t->with_tstamp) { @@ -135,14 +137,18 @@ static void split_transaction_timeout_callback(struct timer_list *timer) static void start_split_transaction_timeout(struct fw_transaction *t, struct fw_card *card) { - guard(spinlock_irqsave)(&card->lock); + unsigned long delta; if (list_empty(&t->link) || WARN_ON(t->is_split_transaction)) return; t->is_split_transaction = true; - mod_timer(&t->split_timeout_timer, - jiffies + card->split_timeout_jiffies); + + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
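The locking rework above leans on the scope-based guards from <linux/cleanup.h>: guard() holds a lock until the enclosing scope exits, while scoped_guard() confines it to the attached statement or block. A minimal sketch of both forms as used throughout this series; my_lock and my_state are hypothetical:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static int my_state;

static void set_state(int v)
{
	/* The lock is acquired here and dropped when the block ends. */
	scoped_guard(spinlock_irqsave, &my_lock)
		my_state = v;
}

static int get_state(void)
{
	/* The lock is dropped automatically on any return path. */
	guard(spinlock_irqsave)(&my_lock);
	return my_state;
}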
+ scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + delta = card->split_timeout.jiffies; + mod_timer(&t->split_timeout_timer, jiffies + delta); } static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp); @@ -162,8 +168,12 @@ static void transmit_complete_callback(struct fw_packet *packet, break; case ACK_PENDING: { - t->split_timeout_cycle = - compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + t->split_timeout_cycle = + compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + } start_split_transaction_timeout(t, card); break; } @@ -259,18 +269,21 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, } static int allocate_tlabel(struct fw_card *card) +__must_hold(&card->transactions.lock) { int tlabel; - tlabel = card->current_tlabel; - while (card->tlabel_mask & (1ULL << tlabel)) { + lockdep_assert_held(&card->transactions.lock); + + tlabel = card->transactions.current_tlabel; + while (card->transactions.tlabel_mask & (1ULL << tlabel)) { tlabel = (tlabel + 1) & 0x3f; - if (tlabel == card->current_tlabel) + if (tlabel == card->transactions.current_tlabel) return -EBUSY; } - card->current_tlabel = (tlabel + 1) & 0x3f; - card->tlabel_mask |= 1ULL << tlabel; + card->transactions.current_tlabel = (tlabel + 1) & 0x3f; + card->transactions.tlabel_mask |= 1ULL << tlabel; return tlabel; } @@ -331,7 +344,6 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode void *payload, size_t length, union fw_transaction_callback callback, bool with_tstamp, void *callback_data) { - unsigned long flags; int tlabel; /* @@ -339,11 +351,11 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode * the list while holding the card spinlock. */ - spin_lock_irqsave(&card->lock, flags); - - tlabel = allocate_tlabel(card); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) + tlabel = allocate_tlabel(card); if (tlabel < 0) { - spin_unlock_irqrestore(&card->lock, flags); if (!with_tstamp) { callback.without_tstamp(card, RCODE_SEND_ERROR, NULL, 0, callback_data); } else { @@ -368,15 +380,22 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode t->callback = callback; t->with_tstamp = with_tstamp; t->callback_data = callback_data; - - fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, generation, - speed, offset, payload, length); t->packet.callback = transmit_complete_callback; - list_add_tail(&t->link, &card->transaction_list); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->lock) { + // The node_id field of fw_card can be updated when handling SelfIDComplete. + fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, + generation, speed, offset, payload, length); + } - spin_unlock_irqrestore(&card->lock, flags); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context.
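allocate_tlabel() above hands out the 6-bit IEEE 1394 transaction labels round-robin from a 64-bit busy mask and fails only when all 64 labels are in flight. A stand-alone userspace model of the algorithm for illustration (names are hypothetical; label release on transaction completion is omitted):

#include <stdint.h>
#include <stdio.h>

static unsigned int current_tlabel;	/* where the next search starts */
static uint64_t tlabel_mask;		/* one bit per in-flight label */

static int allocate_tlabel(void)
{
	unsigned int tlabel = current_tlabel;

	while (tlabel_mask & (1ULL << tlabel)) {
		tlabel = (tlabel + 1) & 0x3f;
		if (tlabel == current_tlabel)
			return -1;	/* all 64 labels busy */
	}
	current_tlabel = (tlabel + 1) & 0x3f;
	tlabel_mask |= 1ULL << tlabel;

	return tlabel;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("tlabel %d\n", allocate_tlabel());

	return 0;
}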
+ scoped_guard(spinlock_irqsave, &card->transactions.lock) + list_add_tail(&t->link, &card->transactions.list); + // Safe with no lock, since the index field of fw_card is immutable once assigned. trace_async_request_outbound_initiate((uintptr_t)t, card->index, generation, speed, t->packet.header, payload, tcode_is_read_request(tcode) ? 0 : length / 4); @@ -458,7 +477,7 @@ static struct fw_packet phy_config_packet = { void fw_send_phy_config(struct fw_card *card, int node_id, int generation, int gap_count) { - long timeout = DIV_ROUND_UP(HZ, 10); + long timeout = msecs_to_jiffies(100); u32 data = 0; phy_packet_set_packet_identifier(&data, PHY_PACKET_PACKET_IDENTIFIER_PHY_CONFIG); @@ -779,11 +798,14 @@ EXPORT_SYMBOL(fw_fill_response); static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; u32 timestamp; - cycles = card->split_timeout_cycles; + lockdep_assert_held(&card->split_timeout.lock); + + cycles = card->split_timeout.cycles; cycles += request_timestamp & 0x1fff; timestamp = request_timestamp & ~0x1fff; @@ -834,9 +856,12 @@ static struct fw_request *allocate_request(struct fw_card *card, return NULL; kref_init(&request->kref); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + request->response.timestamp = compute_split_timeout_timestamp(card, p->timestamp); + request->response.speed = p->speed; - request->response.timestamp = - compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -1111,12 +1136,14 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) break; } - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter->node_id == source && iter->tlabel == tlabel) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; @@ -1196,7 +1223,11 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request } start = (offset - topology_map_region.start) / 4; - memcpy(payload, &card->topology_map[start], length); + + // NOTE: This can be without irqsave when we can guarantee that fw_send_request() for local + // destination never runs in any type of IRQ context. 
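compute_split_timeout_timestamp() above works on the IEEE 1394 cycle-timer layout: the low 13 bits of a timestamp count isochronous cycles (0..7999, at 8000 cycles per second) and the bits above count seconds. A stand-alone model of the arithmetic, assuming the usual carry of whole seconds out of the cycle field (the tail of the function falls outside the quoted hunk):

#include <stdint.h>
#include <stdio.h>

static uint32_t split_timeout_timestamp(uint32_t request_timestamp, unsigned int timeout_cycles)
{
	/* Add the timeout to the request's cycle count... */
	unsigned int cycles = timeout_cycles + (request_timestamp & 0x1fff);
	uint32_t timestamp = request_timestamp & ~0x1fff;

	/* ...then fold the result back into the seconds and cycle fields. */
	timestamp += (cycles / 8000) << 13;
	timestamp |= cycles % 8000;

	return timestamp;
}

int main(void)
{
	/* 800 cycles (100 ms, the IEEE 1394 minimum) from cycle 7500 of second 2. */
	printf("0x%08x\n", (unsigned int)split_timeout_timestamp((2u << 13) | 7500, 800));

	return 0;
}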
+ scoped_guard(spinlock_irqsave, &card->topology_map.lock) + memcpy(payload, &card->topology_map.buffer[start], length); fw_send_response(card, request, RCODE_COMPLETE); } @@ -1211,16 +1242,17 @@ static const struct fw_address_region registers_region = .end = CSR_REGISTER_BASE | CSR_CONFIG_ROM, }; static void update_split_timeout(struct fw_card *card) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; - cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + cycles = card->split_timeout.hi * 8000 + (card->split_timeout.lo >> 19); /* minimum per IEEE 1394, maximum which doesn't overflow OHCI */ cycles = clamp(cycles, 800u, 3u * 8000u); - card->split_timeout_cycles = cycles; - card->split_timeout_jiffies = DIV_ROUND_UP(cycles * HZ, 8000); + card->split_timeout.cycles = cycles; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(cycles); } static void handle_registers(struct fw_card *card, struct fw_request *request, @@ -1270,12 +1302,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_HI: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_hi); + *data = cpu_to_be32(card->split_timeout.hi); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_hi = be32_to_cpu(*data) & 7; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } @@ -1283,12 +1318,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_LO: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_lo); + *data = cpu_to_be32(card->split_timeout.lo); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_lo = be32_to_cpu(*data) & 0xfff80000; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.lo = be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 9b298af1cac0b8..e67395ce26b5e3 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -27,6 +27,11 @@ struct fw_packet; /* -card */ +// This is the arbitrary value we use to indicate a mismatched gap count. +#define GAP_COUNT_MISMATCHED 0 + +#define isoc_cycles_to_jiffies(cycles) usecs_to_jiffies((u32)div_u64((u64)cycles * USEC_PER_SEC, 8000)) + extern __printf(2, 3) void fw_err(const struct fw_card *card, const char *fmt, ...); extern __printf(2, 3) @@ -167,6 +172,9 @@ static inline void fw_iso_context_init_work(struct fw_iso_context *ctx, work_fun /* -topology */ +// The initial value of BUS_MANAGER_ID register, to express nothing registered. +#define BUS_MANAGER_ID_NOT_REGISTERED 0x3f + enum { FW_NODE_CREATED, FW_NODE_UPDATED, @@ -194,8 +202,8 @@ struct fw_node { /* For serializing node topology into a list. */ struct list_head link; - /* Upper layer specific data. 
*/ - void *data; + // The device when already associated, else NULL. + struct fw_device *device; struct fw_node *ports[] __counted_by(port_count); }; @@ -219,6 +227,16 @@ static inline void fw_node_put(struct fw_node *node) kref_put(&node->kref, release_node); } +static inline struct fw_device *fw_node_get_device(struct fw_node *node) +{ + return node->device; +} + +static inline void fw_node_set_device(struct fw_node *node, struct fw_device *device) +{ + node->device = device; +} + void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, int self_id_count, u32 *self_ids, bool bm_abdicate); void fw_destroy_nodes(struct fw_card *card); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 5d8301b0f3aa8c..030aed5453a17d 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -228,13 +228,10 @@ struct fw_ohci { __le32 *self_id; dma_addr_t self_id_bus; - struct work_struct bus_reset_work; u32 self_id_buffer[512]; }; -static struct workqueue_struct *selfid_workqueue; - static inline struct fw_ohci *fw_ohci(struct fw_card *card) { return container_of(card, struct fw_ohci, card); @@ -393,225 +390,10 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0" ", IR wake unreliable = " __stringify(QUIRK_IR_WAKE) ")"); -#define OHCI_PARAM_DEBUG_AT_AR 1 -#define OHCI_PARAM_DEBUG_SELFIDS 2 -#define OHCI_PARAM_DEBUG_IRQS 4 - -static int param_debug; -module_param_named(debug, param_debug, int, 0644); -MODULE_PARM_DESC(debug, "Verbose logging, deprecated in v6.11 kernel or later. (default = 0" - ", AT/AR events = " __stringify(OHCI_PARAM_DEBUG_AT_AR) - ", self-IDs = " __stringify(OHCI_PARAM_DEBUG_SELFIDS) - ", IRQs = " __stringify(OHCI_PARAM_DEBUG_IRQS) - ", or a combination, or all = -1)"); - static bool param_remote_dma; module_param_named(remote_dma, param_remote_dma, bool, 0444); MODULE_PARM_DESC(remote_dma, "Enable unfiltered remote DMA (default = N)"); -static void log_irqs(struct fw_ohci *ohci, u32 evt) -{ - if (likely(!(param_debug & OHCI_PARAM_DEBUG_IRQS))) - return; - - ohci_notice(ohci, "IRQ %08x%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", evt, - evt & OHCI1394_selfIDComplete ? " selfID" : "", - evt & OHCI1394_RQPkt ? " AR_req" : "", - evt & OHCI1394_RSPkt ? " AR_resp" : "", - evt & OHCI1394_reqTxComplete ? " AT_req" : "", - evt & OHCI1394_respTxComplete ? " AT_resp" : "", - evt & OHCI1394_isochRx ? " IR" : "", - evt & OHCI1394_isochTx ? " IT" : "", - evt & OHCI1394_postedWriteErr ? " postedWriteErr" : "", - evt & OHCI1394_cycleTooLong ? " cycleTooLong" : "", - evt & OHCI1394_cycle64Seconds ? " cycle64Seconds" : "", - evt & OHCI1394_cycleInconsistent ? " cycleInconsistent" : "", - evt & OHCI1394_regAccessFail ? " regAccessFail" : "", - evt & OHCI1394_unrecoverableError ? " unrecoverableError" : "", - evt & OHCI1394_busReset ? " busReset" : "", - evt & ~(OHCI1394_selfIDComplete | OHCI1394_RQPkt | - OHCI1394_RSPkt | OHCI1394_reqTxComplete | - OHCI1394_respTxComplete | OHCI1394_isochRx | - OHCI1394_isochTx | OHCI1394_postedWriteErr | - OHCI1394_cycleTooLong | OHCI1394_cycle64Seconds | - OHCI1394_cycleInconsistent | - OHCI1394_regAccessFail | OHCI1394_busReset) - ? " ?" 
: ""); -} - -static void log_selfids(struct fw_ohci *ohci, int generation, int self_id_count) -{ - static const char *const speed[] = { - [0] = "S100", [1] = "S200", [2] = "S400", [3] = "beta", - }; - static const char *const power[] = { - [0] = "+0W", [1] = "+15W", [2] = "+30W", [3] = "+45W", - [4] = "-3W", [5] = " ?W", [6] = "-3..-6W", [7] = "-3..-10W", - }; - static const char port[] = { - [PHY_PACKET_SELF_ID_PORT_STATUS_NONE] = '.', - [PHY_PACKET_SELF_ID_PORT_STATUS_NCONN] = '-', - [PHY_PACKET_SELF_ID_PORT_STATUS_PARENT] = 'p', - [PHY_PACKET_SELF_ID_PORT_STATUS_CHILD] = 'c', - }; - struct self_id_sequence_enumerator enumerator = { - .cursor = ohci->self_id_buffer, - .quadlet_count = self_id_count, - }; - - if (likely(!(param_debug & OHCI_PARAM_DEBUG_SELFIDS))) - return; - - ohci_notice(ohci, "%d selfIDs, generation %d, local node ID %04x\n", - self_id_count, generation, ohci->node_id); - - while (enumerator.quadlet_count > 0) { - unsigned int quadlet_count; - unsigned int port_index; - const u32 *s; - int i; - - s = self_id_sequence_enumerator_next(&enumerator, &quadlet_count); - if (IS_ERR(s)) - break; - - ohci_notice(ohci, - "selfID 0: %08x, phy %d [%c%c%c] %s gc=%d %s %s%s%s\n", - *s, - phy_packet_self_id_get_phy_id(*s), - port[self_id_sequence_get_port_status(s, quadlet_count, 0)], - port[self_id_sequence_get_port_status(s, quadlet_count, 1)], - port[self_id_sequence_get_port_status(s, quadlet_count, 2)], - speed[*s >> 14 & 3], *s >> 16 & 63, - power[*s >> 8 & 7], *s >> 22 & 1 ? "L" : "", - *s >> 11 & 1 ? "c" : "", *s & 2 ? "i" : ""); - - port_index = 3; - for (i = 1; i < quadlet_count; ++i) { - ohci_notice(ohci, - "selfID n: %08x, phy %d [%c%c%c%c%c%c%c%c]\n", - s[i], - phy_packet_self_id_get_phy_id(s[i]), - port[self_id_sequence_get_port_status(s, quadlet_count, port_index)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 1)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 2)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 3)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 4)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 5)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 6)], - port[self_id_sequence_get_port_status(s, quadlet_count, port_index + 7)] - ); - - port_index += 8; - } - } -} - -static const char *evts[] = { - [0x00] = "evt_no_status", [0x01] = "-reserved-", - [0x02] = "evt_long_packet", [0x03] = "evt_missing_ack", - [0x04] = "evt_underrun", [0x05] = "evt_overrun", - [0x06] = "evt_descriptor_read", [0x07] = "evt_data_read", - [0x08] = "evt_data_write", [0x09] = "evt_bus_reset", - [0x0a] = "evt_timeout", [0x0b] = "evt_tcode_err", - [0x0c] = "-reserved-", [0x0d] = "-reserved-", - [0x0e] = "evt_unknown", [0x0f] = "evt_flushed", - [0x10] = "-reserved-", [0x11] = "ack_complete", - [0x12] = "ack_pending ", [0x13] = "-reserved-", - [0x14] = "ack_busy_X", [0x15] = "ack_busy_A", - [0x16] = "ack_busy_B", [0x17] = "-reserved-", - [0x18] = "-reserved-", [0x19] = "-reserved-", - [0x1a] = "-reserved-", [0x1b] = "ack_tardy", - [0x1c] = "-reserved-", [0x1d] = "ack_data_error", - [0x1e] = "ack_type_error", [0x1f] = "-reserved-", - [0x20] = "pending/cancelled", -}; - -static void log_ar_at_event(struct fw_ohci *ohci, - char dir, int speed, u32 *header, int evt) -{ - static const char *const tcodes[] = { - [TCODE_WRITE_QUADLET_REQUEST] = "QW req", - [TCODE_WRITE_BLOCK_REQUEST] = "BW req", - [TCODE_WRITE_RESPONSE] = "W resp", - [0x3] = 
"-reserved-", - [TCODE_READ_QUADLET_REQUEST] = "QR req", - [TCODE_READ_BLOCK_REQUEST] = "BR req", - [TCODE_READ_QUADLET_RESPONSE] = "QR resp", - [TCODE_READ_BLOCK_RESPONSE] = "BR resp", - [TCODE_CYCLE_START] = "cycle start", - [TCODE_LOCK_REQUEST] = "Lk req", - [TCODE_STREAM_DATA] = "async stream packet", - [TCODE_LOCK_RESPONSE] = "Lk resp", - [0xc] = "-reserved-", - [0xd] = "-reserved-", - [TCODE_LINK_INTERNAL] = "link internal", - [0xf] = "-reserved-", - }; - int tcode = async_header_get_tcode(header); - char specific[12]; - - if (likely(!(param_debug & OHCI_PARAM_DEBUG_AT_AR))) - return; - - if (unlikely(evt >= ARRAY_SIZE(evts))) - evt = 0x1f; - - if (evt == OHCI1394_evt_bus_reset) { - ohci_notice(ohci, "A%c evt_bus_reset, generation %d\n", - dir, (header[2] >> 16) & 0xff); - return; - } - - switch (tcode) { - case TCODE_WRITE_QUADLET_REQUEST: - case TCODE_READ_QUADLET_RESPONSE: - case TCODE_CYCLE_START: - snprintf(specific, sizeof(specific), " = %08x", - be32_to_cpu((__force __be32)header[3])); - break; - case TCODE_WRITE_BLOCK_REQUEST: - case TCODE_READ_BLOCK_REQUEST: - case TCODE_READ_BLOCK_RESPONSE: - case TCODE_LOCK_REQUEST: - case TCODE_LOCK_RESPONSE: - snprintf(specific, sizeof(specific), " %x,%x", - async_header_get_data_length(header), - async_header_get_extended_tcode(header)); - break; - default: - specific[0] = '\0'; - } - - switch (tcode) { - case TCODE_STREAM_DATA: - ohci_notice(ohci, "A%c %s, %s\n", - dir, evts[evt], tcodes[tcode]); - break; - case TCODE_LINK_INTERNAL: - ohci_notice(ohci, "A%c %s, PHY %08x %08x\n", - dir, evts[evt], header[1], header[2]); - break; - case TCODE_WRITE_QUADLET_REQUEST: - case TCODE_WRITE_BLOCK_REQUEST: - case TCODE_READ_QUADLET_REQUEST: - case TCODE_READ_BLOCK_REQUEST: - case TCODE_LOCK_REQUEST: - ohci_notice(ohci, - "A%c spd %x tl %02x, %04x -> %04x, %s, %s, %012llx%s\n", - dir, speed, async_header_get_tlabel(header), - async_header_get_source(header), async_header_get_destination(header), - evts[evt], tcodes[tcode], async_header_get_offset(header), specific); - break; - default: - ohci_notice(ohci, - "A%c spd %x tl %02x, %04x -> %04x, %s, %s%s\n", - dir, speed, async_header_get_tlabel(header), - async_header_get_source(header), async_header_get_destination(header), - evts[evt], tcodes[tcode], specific); - } -} - static inline void reg_write(const struct fw_ohci *ohci, int offset, u32 data) { writel(data, ohci->registers + offset); @@ -957,8 +739,6 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer) p.timestamp = status & 0xffff; p.generation = ohci->request_generation; - log_ar_at_event(ohci, 'R', p.speed, p.header, evt); - /* * Several controllers, notably from NEC and VIA, forget to * write ack_complete status at PHY packet reception. @@ -977,7 +757,7 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer) * * Alas some chips sometimes emit bus reset packets with a * wrong generation. We set the correct generation for these - * at a slightly incorrect time (in bus_reset_work). + * at a slightly incorrect time (in handle_selfid_complete_event). */ if (evt == OHCI1394_evt_bus_reset) { if (!(ohci->quirks & QUIRK_RESET_PACKET)) @@ -1566,8 +1346,6 @@ static int handle_at_packet(struct context *context, evt = le16_to_cpu(last->transfer_status) & 0x1f; packet->timestamp = le16_to_cpu(last->res_count); - log_ar_at_event(ohci, 'T', packet->speed, packet->header, evt); - switch (evt) { case OHCI1394_evt_timeout: /* Async response transmit timed out. 
*/ @@ -1772,6 +1550,25 @@ static void at_context_transmit(struct at_context *ctx, struct fw_packet *packet static void detect_dead_context(struct fw_ohci *ohci, const char *name, unsigned int regs) { + static const char *const evts[] = { + [0x00] = "evt_no_status", [0x01] = "-reserved-", + [0x02] = "evt_long_packet", [0x03] = "evt_missing_ack", + [0x04] = "evt_underrun", [0x05] = "evt_overrun", + [0x06] = "evt_descriptor_read", [0x07] = "evt_data_read", + [0x08] = "evt_data_write", [0x09] = "evt_bus_reset", + [0x0a] = "evt_timeout", [0x0b] = "evt_tcode_err", + [0x0c] = "-reserved-", [0x0d] = "-reserved-", + [0x0e] = "evt_unknown", [0x0f] = "evt_flushed", + [0x10] = "-reserved-", [0x11] = "ack_complete", + [0x12] = "ack_pending ", [0x13] = "-reserved-", + [0x14] = "ack_busy_X", [0x15] = "ack_busy_A", + [0x16] = "ack_busy_B", [0x17] = "-reserved-", + [0x18] = "-reserved-", [0x19] = "-reserved-", + [0x1a] = "-reserved-", [0x1b] = "ack_tardy", + [0x1c] = "-reserved-", [0x1d] = "ack_data_error", + [0x1e] = "ack_type_error", [0x1f] = "-reserved-", + [0x20] = "pending/cancelled", + }; u32 ctl; ctl = reg_read(ohci, CONTROL_SET(regs)); @@ -2030,9 +1827,9 @@ static int find_and_insert_self_id(struct fw_ohci *ohci, int self_id_count) return self_id_count; } -static void bus_reset_work(struct work_struct *work) +static irqreturn_t handle_selfid_complete_event(int irq, void *data) { - struct fw_ohci *ohci = from_work(ohci, work, bus_reset_work); + struct fw_ohci *ohci = data; int self_id_count, generation, new_generation, i, j; u32 reg, quadlet; void *free_rom = NULL; @@ -2043,11 +1840,11 @@ static void bus_reset_work(struct work_struct *work) if (!(reg & OHCI1394_NodeID_idValid)) { ohci_notice(ohci, "node ID not valid, new bus reset in progress\n"); - return; + goto end; } if ((reg & OHCI1394_NodeID_nodeNumber) == 63) { ohci_notice(ohci, "malconfigured bus\n"); - return; + goto end; } ohci->node_id = reg & (OHCI1394_NodeID_busNumber | OHCI1394_NodeID_nodeNumber); @@ -2061,8 +1858,11 @@ static void bus_reset_work(struct work_struct *work) reg = reg_read(ohci, OHCI1394_SelfIDCount); if (ohci1394_self_id_count_is_error(reg)) { ohci_notice(ohci, "self ID receive error\n"); - return; + goto end; } + + trace_self_id_complete(ohci->card.index, reg, ohci->self_id, has_be_header_quirk(ohci)); + /* * The count in the SelfIDCount register is the number of * bytes in the self ID receive buffer. Since we also receive @@ -2073,7 +1873,7 @@ static void bus_reset_work(struct work_struct *work) if (self_id_count > 252) { ohci_notice(ohci, "bad selfIDSize (%08x)\n", reg); - return; + goto end; } quadlet = cond_le32_to_cpu(ohci->self_id[0], has_be_header_quirk(ohci)); @@ -2100,7 +1900,7 @@ static void bus_reset_work(struct work_struct *work) ohci_notice(ohci, "bad self ID %d/%d (%08x != ~%08x)\n", j, self_id_count, id, id2); - return; + goto end; } ohci->self_id_buffer[j] = id; } @@ -2110,13 +1910,13 @@ static void bus_reset_work(struct work_struct *work) if (self_id_count < 0) { ohci_notice(ohci, "could not construct local self ID\n"); - return; + goto end; } } if (self_id_count == 0) { ohci_notice(ohci, "no self IDs\n"); - return; + goto end; } rmb(); @@ -2138,7 +1938,7 @@ static void bus_reset_work(struct work_struct *work) new_generation = ohci1394_self_id_count_get_generation(reg); if (new_generation != generation) { ohci_notice(ohci, "new bus reset, discarding self ids\n"); - return; + goto end; } // FIXME: Document how the locking works. 
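handle_selfid_complete_event() now runs as the threaded half of the interrupt: the hard handler returns IRQ_WAKE_THREAD for the SelfIDComplete event and IRQ_HANDLED otherwise, and since IRQF_ONESHOT is not set, the hard handler keeps servicing other events while the thread runs. A minimal sketch of the pattern with hypothetical names:

#include <linux/bits.h>
#include <linux/interrupt.h>
#include <linux/types.h>

#define MY_HEAVY_EVENT	BIT(0)	/* hypothetical status bit */

static u32 my_read_and_ack_events(void *data);	/* hypothetical register access */
static void my_heavy_work(void *data);		/* hypothetical; may sleep */

static irqreturn_t my_hardirq(int irq, void *data)
{
	u32 event = my_read_and_ack_events(data);

	if (!event)
		return IRQ_NONE;

	/* Defer only the expensive event to the thread; handle the rest here. */
	return (event & MY_HEAVY_EVENT) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t my_thread_fn(int irq, void *data)
{
	my_heavy_work(data);
	return IRQ_HANDLED;
}

static int my_request(int irq, void *data)
{
	/* No IRQF_ONESHOT: the line stays enabled while my_thread_fn() runs. */
	return request_threaded_irq(irq, my_hardirq, my_thread_fn, 0, "my-dev", data);
}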
@@ -2195,12 +1995,12 @@ static void bus_reset_work(struct work_struct *work) if (free_rom) dmam_free_coherent(ohci->card.device, CONFIG_ROM_SIZE, free_rom, free_rom_bus); - log_selfids(ohci, generation, self_id_count); - fw_core_handle_bus_reset(&ohci->card, ohci->node_id, generation, self_id_count, ohci->self_id_buffer, ohci->csr_state_setclear_abdicate); ohci->csr_state_setclear_abdicate = false; +end: + return IRQ_HANDLED; } static irqreturn_t irq_handler(int irq, void *data) @@ -2214,11 +2014,6 @@ static irqreturn_t irq_handler(int irq, void *data) if (!event || !~event) return IRQ_NONE; - if (unlikely(param_debug > 0)) { - dev_notice_ratelimited(ohci->card.device, - "The debug parameter is superseded by tracepoints events, and deprecated."); - } - /* * busReset and postedWriteErr events must not be cleared yet * (OHCI 1.1 clauses 7.2.3.2 and 13.2.8.1) @@ -2226,21 +2021,11 @@ static irqreturn_t irq_handler(int irq, void *data) reg_write(ohci, OHCI1394_IntEventClear, event & ~(OHCI1394_busReset | OHCI1394_postedWriteErr)); trace_irqs(ohci->card.index, event); - log_irqs(ohci, event); - // The flag is masked again at bus_reset_work() scheduled by selfID event. + + // The flag is masked again at handle_selfid_complete_event() scheduled by selfID event. if (event & OHCI1394_busReset) reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); - if (event & OHCI1394_selfIDComplete) { - if (trace_self_id_complete_enabled()) { - u32 reg = reg_read(ohci, OHCI1394_SelfIDCount); - - trace_self_id_complete(ohci->card.index, reg, ohci->self_id, - has_be_header_quirk(ohci)); - } - queue_work(selfid_workqueue, &ohci->bus_reset_work); - } - if (event & OHCI1394_RQPkt) queue_work(ohci->card.async_wq, &ohci->ar_request_ctx.work); @@ -2311,7 +2096,10 @@ static irqreturn_t irq_handler(int irq, void *data) } else flush_writes(ohci); - return IRQ_HANDLED; + if (event & OHCI1394_selfIDComplete) + return IRQ_WAKE_THREAD; + else + return IRQ_HANDLED; } static int software_reset(struct fw_ohci *ohci) @@ -2624,7 +2412,7 @@ static int ohci_set_config_rom(struct fw_card *card, * then set up the real values for the two registers. * * We use ohci->lock to avoid racing with the code that sets - * ohci->next_config_rom to NULL (see bus_reset_work). + * ohci->next_config_rom to NULL (see handle_selfid_complete_event). 
*/ next_config_rom = dmam_alloc_coherent(ohci->card.device, CONFIG_ROM_SIZE, @@ -2705,7 +2493,6 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) dma_unmap_single(ohci->card.device, packet->payload_bus, packet->payload_length, DMA_TO_DEVICE); - log_ar_at_event(ohci, 'T', packet->speed, packet->header, 0x20); driver_data->packet = NULL; packet->ack = RCODE_CANCELLED; @@ -3695,7 +3482,6 @@ static int pci_probe(struct pci_dev *dev, u32 bus_options, max_receive, link_speed, version; u64 guid; int i, flags, irq, err; - size_t size; if (dev->vendor == PCI_VENDOR_ID_PINNACLE_SYSTEMS) { dev_err(&dev->dev, "Pinnacle MovieBoard is not yet supported\n"); @@ -3722,8 +3508,6 @@ static int pci_probe(struct pci_dev *dev, spin_lock_init(&ohci->lock); mutex_init(&ohci->phy_reg_mutex); - INIT_WORK(&ohci->bus_reset_work, bus_reset_work); - if (!(pci_resource_flags(dev, 0) & IORESOURCE_MEM) || pci_resource_len(dev, 0) < OHCI1394_REGISTER_SIZE) { ohci_err(ohci, "invalid MMIO resource\n"); @@ -3791,8 +3575,7 @@ static int pci_probe(struct pci_dev *dev, reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, ~0); ohci->ir_context_mask = ohci->ir_context_support; ohci->n_ir = hweight32(ohci->ir_context_mask); - size = sizeof(struct iso_context) * ohci->n_ir; - ohci->ir_context_list = devm_kzalloc(&dev->dev, size, GFP_KERNEL); + ohci->ir_context_list = devm_kcalloc(&dev->dev, ohci->n_ir, sizeof(struct iso_context), GFP_KERNEL); if (!ohci->ir_context_list) return -ENOMEM; @@ -3806,8 +3589,7 @@ static int pci_probe(struct pci_dev *dev, reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0); ohci->it_context_mask = ohci->it_context_support; ohci->n_it = hweight32(ohci->it_context_mask); - size = sizeof(struct iso_context) * ohci->n_it; - ohci->it_context_list = devm_kzalloc(&dev->dev, size, GFP_KERNEL); + ohci->it_context_list = devm_kcalloc(&dev->dev, ohci->n_it, sizeof(struct iso_context), GFP_KERNEL); if (!ohci->it_context_list) return -ENOMEM; @@ -3832,7 +3614,9 @@ static int pci_probe(struct pci_dev *dev, goto fail_msi; } - err = request_threaded_irq(irq, irq_handler, NULL, + // IRQF_ONESHOT is not applied so that other events are still handled by the hardIRQ + // handler while the threaded IRQ handler runs for the SelfIDComplete event. + err = request_threaded_irq(irq, irq_handler, handle_selfid_complete_event, pci_dev_msi_enabled(dev) ?
0 : IRQF_SHARED, ohci_driver_name, ohci); if (err < 0) { @@ -3876,7 +3660,6 @@ static void pci_remove(struct pci_dev *dev) reg_write(ohci, OHCI1394_IntMaskClear, ~0); flush_writes(ohci); } - cancel_work_sync(&ohci->bus_reset_work); fw_core_remove_card(&ohci->card); /* @@ -3949,17 +3732,12 @@ static struct pci_driver fw_ohci_pci_driver = { static int __init fw_ohci_init(void) { - selfid_workqueue = alloc_workqueue(KBUILD_MODNAME, WQ_MEM_RECLAIM, 0); - if (!selfid_workqueue) - return -ENOMEM; - return pci_register_driver(&fw_ohci_pci_driver); } static void __exit fw_ohci_cleanup(void) { pci_unregister_driver(&fw_ohci_pci_driver); - destroy_workqueue(selfid_workqueue); } module_init(fw_ohci_init); diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index cafc90d4caafa6..0d05eac7c72b24 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -788,7 +788,9 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry, *kernel_entry = addr + entry; - return efi_adjust_memory_range_protection(addr, kernel_text_size); + return efi_adjust_memory_range_protection(addr, kernel_text_size) ?: + efi_adjust_memory_range_protection(addr + kernel_inittext_offset, + kernel_inittext_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, diff --git a/drivers/firmware/tegra/bpmp-tegra186.c b/drivers/firmware/tegra/bpmp-tegra186.c index 7cfc5fdfa49d50..64863db7a71576 100644 --- a/drivers/firmware/tegra/bpmp-tegra186.c +++ b/drivers/firmware/tegra/bpmp-tegra186.c @@ -198,7 +198,10 @@ static int tegra186_bpmp_dram_init(struct tegra_bpmp *bpmp) err = of_reserved_mem_region_to_resource(bpmp->dev->of_node, 0, &res); if (err < 0) { - dev_warn(bpmp->dev, "failed to parse memory region: %d\n", err); + if (err != -ENODEV) + dev_warn(bpmp->dev, + "failed to parse memory region: %d\n", err); + return err; } diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index d8ac40d0eb6fb6..caeb7bee50cf84 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -303,7 +303,7 @@ config GPIO_EN7523 config GPIO_EP93XX def_bool y - depends on ARCH_EP93XX + depends on ARCH_EP93XX || COMPILE_TEST select GPIO_GENERIC select GPIOLIB_IRQCHIP @@ -408,8 +408,7 @@ config GPIO_IMX_SCU config GPIO_IXP4XX bool "Intel IXP4xx GPIO" - depends on ARCH_IXP4XX - depends on OF + depends on (ARCH_IXP4XX && OF) || COMPILE_TEST select GPIO_GENERIC select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY @@ -437,6 +436,7 @@ config GPIO_LOONGSON_64BIT depends on LOONGARCH || COMPILE_TEST depends on OF_GPIO select GPIO_GENERIC + select GPIOLIB_IRQCHIP help Say yes here to support the GPIO functionality of a number of Loongson series of chips. The Loongson GPIO controller supports @@ -485,7 +485,6 @@ config GPIO_MM_LANTIQ config GPIO_MPC5200 def_bool y depends on PPC_MPC52xx - select OF_GPIO_MM_GPIOCHIP config GPIO_MPC8XXX bool "MPC512x/MPC8xxx/QorIQ GPIO support" @@ -735,7 +734,8 @@ config GPIO_TANGIER If built as a module its name will be gpio-tangier. config GPIO_TB10X - bool + bool "Abilis Systems TB10x GPIO controller" + depends on ARC_PLAT_TB10X || COMPILE_TEST select GPIO_GENERIC select GENERIC_IRQ_CHIP select OF_GPIO @@ -884,7 +884,7 @@ config GPIO_ZYNQMP_MODEPIN config GPIO_LOONGSON1 tristate "Loongson1 GPIO support" - depends on MACH_LOONGSON32 + depends on MACH_LOONGSON32 || COMPILE_TEST select GPIO_GENERIC help Say Y or M here to support GPIO on Loongson1 SoCs. 
@@ -1194,14 +1194,18 @@ config GPIO_PCA953X 4 bits: pca9536, pca9537 8 bits: max7310, max7315, pca6107, pca9534, pca9538, pca9554, - pca9556, pca9557, pca9574, tca6408, tca9554, xra1202 + pca9556, pca9557, pca9574, tca6408, tca9554, xra1202, + pcal6408, pcal9554b, tca9538 16 bits: max7312, max7313, pca9535, pca9539, pca9555, pca9575, - tca6416 + tca6416, pca6416, pcal6416, pcal9535, pcal9555a, max7318, + tca9539 + + 18 bits: tca6418 - 24 bits: tca6424 + 24 bits: tca6424, pcal6524 - 40 bits: pca9505, pca9698 + 40 bits: pca9505, pca9698, pca9506 config GPIO_PCA953X_IRQ bool "Interrupt controller support for PCA953x" @@ -1492,6 +1496,18 @@ config GPIO_MADERA help Support for GPIOs on Cirrus Logic Madera class codecs. +config GPIO_MAX7360 + tristate "MAX7360 GPIO support" + depends on MFD_MAX7360 + select GPIO_REGMAP + select REGMAP_IRQ + help + Allows using MAX7360 I/O Expander PWM lines as GPIO and keypad COL + lines as GPO. + + This driver can also be built as a module. If so, the module will be + called gpio-max7360. + config GPIO_MAX77620 tristate "GPIO support for PMIC MAX77620 and MAX20024" depends on MFD_MAX77620 @@ -1522,6 +1538,18 @@ config GPIO_MAX77759 This driver can also be built as a module. If so, the module will be called gpio-max77759. +config GPIO_NCT6694 + tristate "Nuvoton NCT6694 GPIO controller support" + depends on MFD_NCT6694 + select GENERIC_IRQ_CHIP + select GPIOLIB_IRQCHIP + help + This driver supports 8 GPIO pins per bank that can all be interrupt + sources. + + This driver can also be built as a module. If so, the module will be + called gpio-nct6694. + config GPIO_PALMAS tristate "TI PALMAS series PMICs GPIO" depends on MFD_PALMAS @@ -1559,7 +1587,7 @@ config GPIO_SL28CPLD called gpio-sl28cpld. config GPIO_STMPE - bool "STMPE GPIOs" + tristate "STMPE GPIOs" depends on MFD_STMPE depends on OF_GPIO select GPIOLIB_IRQCHIP diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 379f55e9ed1e69..000fa2e397c2a1 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_GPIO_MAX7300) += gpio-max7300.o obj-$(CONFIG_GPIO_MAX7301) += gpio-max7301.o obj-$(CONFIG_GPIO_MAX730X) += gpio-max730x.o obj-$(CONFIG_GPIO_MAX732X) += gpio-max732x.o +obj-$(CONFIG_GPIO_MAX7360) += gpio-max7360.o obj-$(CONFIG_GPIO_MAX77620) += gpio-max77620.o obj-$(CONFIG_GPIO_MAX77650) += gpio-max77650.o obj-$(CONFIG_GPIO_MAX77759) += gpio-max77759.o @@ -128,6 +129,7 @@ obj-$(CONFIG_GPIO_MT7621) += gpio-mt7621.o obj-$(CONFIG_GPIO_MVEBU) += gpio-mvebu.o obj-$(CONFIG_GPIO_MXC) += gpio-mxc.o obj-$(CONFIG_GPIO_MXS) += gpio-mxs.o +obj-$(CONFIG_GPIO_NCT6694) += gpio-nct6694.o obj-$(CONFIG_GPIO_NOMADIK) += gpio-nomadik.o obj-$(CONFIG_GPIO_NPCM_SGPIO) += gpio-npcm-sgpio.o obj-$(CONFIG_GPIO_OCTEON) += gpio-octeon.o diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index 7a09a4f58551b5..8ed74e05903a97 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -131,11 +131,6 @@ Work items: helpers (x86 inb()/outb()) and convert port-mapped I/O drivers to use this with dry-coding and sending to maintainers to test -- Move the MMIO GPIO specific fields out of struct gpio_chip into a - dedicated structure. Currently every GPIO chip has them if gpio-mmio is - enabled in Kconfig even if it itself doesn't register with the helper - library. - ------------------------------------------------------------------------------- Generic regmap GPIO @@ -176,18 +171,6 @@ cannot be converted yet, but watch this space!
------------------------------------------------------------------------------- -Convert all GPIO chips to using the new, value returning line setters - -struct gpio_chip's set() and set_multiple() callbacks are now deprecated. They -return void and thus do not allow drivers to indicate failure to set the line -value back to the caller. - -We've now added new variants - set_rv() and set_multiple_rv() that return an -integer. Let's convert all GPIO drivers treewide to use the new callbacks, -remove the old ones and finally rename the new ones back to the old names. - -------------------------------------------------------------------------------- - Remove legacy sysfs features We have two parallel per-chip class devices and per-exported-line attribute diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index af9d8b3a711dad..37600faf4a4b72 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include #include #include "dev-sync-probe.h" @@ -244,18 +246,34 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + void *data; + unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; -#define fwd_tmp_values(fwd) &(fwd)->tmp[0] -#define fwd_tmp_descs(fwd) (void *)&(fwd)->tmp[BITS_TO_LONGS((fwd)->chip.ngpio)] +#define fwd_tmp_values(fwd) (&(fwd)->tmp[0]) +#define fwd_tmp_descs(fwd) ((void *)&(fwd)->tmp[BITS_TO_LONGS((fwd)->chip.ngpio)]) #define fwd_tmp_size(ngpios) (BITS_TO_LONGS((ngpios)) + (ngpios)) +static int gpio_fwd_request(struct gpio_chip *chip, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + + return test_bit(offset, fwd->valid_mask) ? 0 : -ENODEV; +} + static int gpio_fwd_get_direction(struct gpio_chip *chip, unsigned int offset) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + /* + * get_direction() is called during gpiochip registration, return + * -ENODEV if there is no GPIO desc for the line. + */ + if (!test_bit(offset, fwd->valid_mask)) + return -ENODEV; + return gpiod_get_direction(fwd->descs[offset]); } @@ -453,10 +471,11 @@ static int gpiochip_fwd_delay_of_xlate(struct gpio_chip *chip, return line; } -static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *chip, - struct gpiochip_fwd *fwd) +static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) { - fwd->delay_timings = devm_kcalloc(dev, chip->ngpio, + struct gpio_chip *chip = &fwd->chip; + + fwd->delay_timings = devm_kcalloc(chip->parent, chip->ngpio, sizeof(*fwd->delay_timings), GFP_KERNEL); if (!fwd->delay_timings) @@ -468,67 +487,235 @@ static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *c return 0; } #else -static int gpiochip_fwd_setup_delay_line(struct device *dev, struct gpio_chip *chip, - struct gpiochip_fwd *fwd) +static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) { return 0; } #endif /* !CONFIG_OF_GPIO */ /** - * gpiochip_fwd_create() - Create a new GPIO forwarder - * @dev: Parent device pointer - * @ngpios: Number of GPIOs in the forwarder. - * @descs: Array containing the GPIO descriptors to forward to. - * This array must contain @ngpios entries, and must not be deallocated - * before the forwarder has been destroyed again. - * @features: Bitwise ORed features as defined with FWD_FEATURE_*. 
+ * gpiochip_fwd_get_gpiochip - Get the GPIO chip for the GPIO forwarder + * @fwd: GPIO forwarder * - * This function creates a new gpiochip, which forwards all GPIO operations to - * the passed GPIO descriptors. + * Returns: The GPIO chip for the GPIO forwarder + */ +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) +{ + return &fwd->chip; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_get_data - Get driver-private data for the GPIO forwarder + * @fwd: GPIO forwarder * - * Return: An opaque object pointer, or an ERR_PTR()-encoded negative error - * code on failure. + * Returns: The driver-private data for the GPIO forwarder */ -static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, - unsigned int ngpios, - struct gpio_desc *descs[], - unsigned long features) +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd) +{ + return fwd->data; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_data, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder + * @fwd: GPIO forwarder + * @offset: the offset of the line to request + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_request(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_request, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 for output, 1 for input, or an error code in case of error. + */ +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_direction(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_direction, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_output - Set a GPIO forwarder line direction to + * output + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, unsigned int offset, + int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_output(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_output, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_input - Set a GPIO forwarder line direction to input + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_input(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_input, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get - Return a GPIO forwarder line's value + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The GPIO's logical value, i.e. taking the ACTIVE_LOW status into + * account, or negative errno on failure. 
+ */ +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_multiple - Get values for multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per line; BITS_PER_LONG bits per word defines + * which lines are to be read + * @bits: bit value array; one bit per line; BITS_PER_LONG bits per word will + * contain the read values for the lines specified by mask + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set - Assign value to a GPIO forwarder line. + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_multiple - Assign values to multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word + * defines which outputs are to be changed + * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word + * defines the values the outputs specified by mask are to be set to + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_config - Set @config for a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @config: Same packed config format as generic pinconf + * + * Returns: 0 on success, %-ENOTSUPP if the controller doesn't support setting + * the configuration. + */ +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_config(gc, offset, config); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_config, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_to_irq - Return the IRQ corresponding to a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The Linux IRQ corresponding to the passed line, or an error code in + * case of error.
+ */ +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_to_irq(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_to_irq, "GPIO_FORWARDER"); + +/** + * devm_gpiochip_fwd_alloc - Allocate and initialize a new GPIO forwarder + * @dev: Parent device pointer + * @ngpios: Number of GPIOs in the forwarder + * + * Returns: An opaque object pointer, or an ERR_PTR()-encoded negative error + * code on failure. + */ +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios) { - const char *label = dev_name(dev); struct gpiochip_fwd *fwd; struct gpio_chip *chip; - unsigned int i; - int error; - fwd = devm_kzalloc(dev, struct_size(fwd, tmp, fwd_tmp_size(ngpios)), - GFP_KERNEL); + fwd = devm_kzalloc(dev, struct_size(fwd, tmp, fwd_tmp_size(ngpios)), GFP_KERNEL); if (!fwd) return ERR_PTR(-ENOMEM); - chip = &fwd->chip; - - /* - * If any of the GPIO lines are sleeping, then the entire forwarder - * will be sleeping. - * If any of the chips support .set_config(), then the forwarder will - * support setting configs. - */ - for (i = 0; i < ngpios; i++) { - struct gpio_chip *parent = gpiod_to_chip(descs[i]); + fwd->descs = devm_kcalloc(dev, ngpios, sizeof(*fwd->descs), GFP_KERNEL); + if (!fwd->descs) + return ERR_PTR(-ENOMEM); - dev_dbg(dev, "%u => gpio %d irq %d\n", i, - desc_to_gpio(descs[i]), gpiod_to_irq(descs[i])); + fwd->valid_mask = devm_bitmap_zalloc(dev, ngpios, GFP_KERNEL); + if (!fwd->valid_mask) + return ERR_PTR(-ENOMEM); - if (gpiod_cansleep(descs[i])) - chip->can_sleep = true; - if (parent && parent->set_config) - chip->set_config = gpio_fwd_set_config; - } + chip = &fwd->chip; - chip->label = label; + chip->label = dev_name(dev); chip->parent = dev; chip->owner = THIS_MODULE; + chip->request = gpio_fwd_request; chip->get_direction = gpio_fwd_get_direction; chip->direction_input = gpio_fwd_direction_input; chip->direction_output = gpio_fwd_direction_output; @@ -539,20 +726,128 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, chip->to_irq = gpio_fwd_to_irq; chip->base = -1; chip->ngpio = ngpios; - fwd->descs = descs; + + return fwd; +} +EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_desc_add - Add a GPIO desc in the forwarder + * @fwd: GPIO forwarder + * @desc: GPIO descriptor to register + * @offset: offset for the GPIO in the forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, + unsigned int offset) +{ + struct gpio_chip *chip = &fwd->chip; + + if (offset >= chip->ngpio) + return -EINVAL; + + if (test_and_set_bit(offset, fwd->valid_mask)) + return -EEXIST; + + /* + * If any of the GPIO lines are sleeping, then the entire forwarder + * will be sleeping. 
+ */ + if (gpiod_cansleep(desc)) + chip->can_sleep = true; + + fwd->descs[offset] = desc; + + dev_dbg(chip->parent, "%u => gpio %d irq %d\n", offset, + desc_to_gpio(desc), gpiod_to_irq(desc)); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_desc_free - Remove a GPIO desc from the forwarder + * @fwd: GPIO forwarder + * @offset: offset of GPIO desc to remove + */ +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset) +{ + if (test_and_clear_bit(offset, fwd->valid_mask)) + gpiod_put(fwd->descs[offset]); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_register - Register a GPIO forwarder + * @fwd: GPIO forwarder + * @data: driver-private data associated with this forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data) +{ + struct gpio_chip *chip = &fwd->chip; + + /* + * Some gpio_descs have not been registered yet. They will be registered + * at runtime, so we must assume they can sleep. + */ + if (!bitmap_full(fwd->valid_mask, chip->ngpio)) + chip->can_sleep = true; if (chip->can_sleep) mutex_init(&fwd->mlock); else spin_lock_init(&fwd->slock); + fwd->data = data; + + return devm_gpiochip_add_data(chip->parent, chip, fwd); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_create() - Create a new GPIO forwarder + * @dev: Parent device pointer + * @ngpios: Number of GPIOs in the forwarder. + * @descs: Array containing the GPIO descriptors to forward to. + * This array must contain @ngpios entries, and can be deallocated + * afterwards, as the forwarder keeps its own copy. + * @features: Bitwise ORed features as defined with FWD_FEATURE_*. + * + * This function creates a new gpiochip, which forwards all GPIO operations to + * the passed GPIO descriptors. + * + * Return: An opaque object pointer, or an ERR_PTR()-encoded negative error + * code on failure.
+ */ +static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, + unsigned int ngpios, + struct gpio_desc *descs[], + unsigned long features) +{ + struct gpiochip_fwd *fwd; + unsigned int i; + int error; + + fwd = devm_gpiochip_fwd_alloc(dev, ngpios); + if (IS_ERR(fwd)) + return fwd; + + for (i = 0; i < ngpios; i++) { + error = gpiochip_fwd_desc_add(fwd, descs[i], i); + if (error) + return ERR_PTR(error); + } + if (features & FWD_FEATURE_DELAY) { - error = gpiochip_fwd_setup_delay_line(dev, chip, fwd); + error = gpiochip_fwd_setup_delay_line(fwd); if (error) return ERR_PTR(error); } - error = devm_gpiochip_add_data(dev, chip, fwd); + error = gpiochip_fwd_register(fwd, NULL); if (error) return ERR_PTR(error); @@ -1334,6 +1629,7 @@ static int gpio_aggregator_probe(struct platform_device *pdev) return PTR_ERR(fwd); platform_set_drvdata(pdev, fwd); + devm_kfree(dev, descs); return 0; } diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index b70036587d9c3f..8458a6949c65d3 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -24,54 +25,50 @@ #define PT_SYNC_REG 0x28 struct pt_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; }; static int pt_gpio_request(struct gpio_chip *gc, unsigned offset) { + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct pt_gpio_chip *pt_gpio = gpiochip_get_data(gc); - unsigned long flags; u32 using_pins; dev_dbg(gc->parent, "pt_gpio_request offset=%x\n", offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); using_pins = readl(pt_gpio->reg_base + PT_SYNC_REG); if (using_pins & BIT(offset)) { dev_warn(gc->parent, "PT GPIO pin %x reconfigured\n", offset); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return -EINVAL; } writel(using_pins | BIT(offset), pt_gpio->reg_base + PT_SYNC_REG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - return 0; } static void pt_gpio_free(struct gpio_chip *gc, unsigned offset) { + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct pt_gpio_chip *pt_gpio = gpiochip_get_data(gc); - unsigned long flags; u32 using_pins; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); using_pins = readl(pt_gpio->reg_base + PT_SYNC_REG); using_pins &= ~BIT(offset); writel(using_pins, pt_gpio->reg_base + PT_SYNC_REG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - dev_dbg(gc->parent, "pt_gpio_free offset=%x\n", offset); } static int pt_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct pt_gpio_chip *pt_gpio; int ret = 0; @@ -91,22 +88,27 @@ static int pt_gpio_probe(struct platform_device *pdev) return PTR_ERR(pt_gpio->reg_base); } - ret = bgpio_init(&pt_gpio->gc, dev, 4, - pt_gpio->reg_base + PT_INPUTDATA_REG, - pt_gpio->reg_base + PT_OUTPUTDATA_REG, NULL, - pt_gpio->reg_base + PT_DIRECTION_REG, NULL, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, + .set = pt_gpio->reg_base + PT_OUTPUTDATA_REG, + .dirout = pt_gpio->reg_base + PT_DIRECTION_REG, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pt_gpio->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } - 
pt_gpio->gc.owner = THIS_MODULE; - pt_gpio->gc.request = pt_gpio_request; - pt_gpio->gc.free = pt_gpio_free; - pt_gpio->gc.ngpio = (uintptr_t)device_get_match_data(dev); + pt_gpio->chip.gc.owner = THIS_MODULE; + pt_gpio->chip.gc.request = pt_gpio_request; + pt_gpio->chip.gc.free = pt_gpio_free; + pt_gpio->chip.gc.ngpio = (uintptr_t)device_get_match_data(dev); - ret = devm_gpiochip_add_data(dev, &pt_gpio->gc, pt_gpio); + ret = devm_gpiochip_add_data(dev, &pt_gpio->chip.gc, pt_gpio); if (ret) { dev_err(dev, "Failed to register GPIO lib\n"); return ret; diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c index de4cc12e5e0399..2ad9f6ac66362f 100644 --- a/drivers/gpio/gpio-ath79.c +++ b/drivers/gpio/gpio-ath79.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -28,17 +29,17 @@ #define AR71XX_GPIO_REG_INT_MASK 0x24 struct ath79_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; - raw_spinlock_t lock; unsigned long both_edges; }; static struct ath79_gpio_ctrl *irq_data_to_ath79_gpio(struct irq_data *data) { struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); - return container_of(gc, struct ath79_gpio_ctrl, gc); + return container_of(gen_gc, struct ath79_gpio_ctrl, chip); } static u32 ath79_gpio_read(struct ath79_gpio_ctrl *ctrl, unsigned reg) @@ -70,48 +71,43 @@ static void ath79_gpio_irq_unmask(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - gpiochip_enable_irq(&ctrl->gc, irqd_to_hwirq(data)); - raw_spin_lock_irqsave(&ctrl->lock, flags); + gpiochip_enable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); + + guard(gpio_generic_lock_irqsave)(&ctrl->chip); + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static void ath79_gpio_irq_mask(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); - ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); - gpiochip_disable_irq(&ctrl->gc, irqd_to_hwirq(data)); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); + + gpiochip_disable_irq(&ctrl->chip.gc, irqd_to_hwirq(data)); } static void ath79_gpio_irq_enable(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, mask); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static void ath79_gpio_irq_disable(struct irq_data *data) { struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); - unsigned long flags; - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_MASK, mask, 0); ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, 0); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); } static int ath79_gpio_irq_set_type(struct irq_data *data, @@ -120,7 +116,6 @@ static int 
ath79_gpio_irq_set_type(struct irq_data *data, struct ath79_gpio_ctrl *ctrl = irq_data_to_ath79_gpio(data); u32 mask = BIT(irqd_to_hwirq(data)); u32 type = 0, polarity = 0; - unsigned long flags; bool disabled; switch (flow_type) { @@ -142,7 +137,7 @@ static int ath79_gpio_irq_set_type(struct irq_data *data, return -EINVAL; } - raw_spin_lock_irqsave(&ctrl->lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); if (flow_type == IRQ_TYPE_EDGE_BOTH) { ctrl->both_edges |= mask; @@ -167,8 +162,6 @@ static int ath79_gpio_irq_set_type(struct irq_data *data, ath79_gpio_update_bits( ctrl, AR71XX_GPIO_REG_INT_ENABLE, mask, mask); - raw_spin_unlock_irqrestore(&ctrl->lock, flags); - return 0; } @@ -187,28 +180,27 @@ static void ath79_gpio_irq_handler(struct irq_desc *desc) { struct gpio_chip *gc = irq_desc_get_handler_data(desc); struct irq_chip *irqchip = irq_desc_get_chip(desc); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct ath79_gpio_ctrl *ctrl = - container_of(gc, struct ath79_gpio_ctrl, gc); - unsigned long flags, pending; + container_of(gen_gc, struct ath79_gpio_ctrl, chip); + unsigned long pending; u32 both_edges, state; int irq; chained_irq_enter(irqchip, desc); - raw_spin_lock_irqsave(&ctrl->lock, flags); - - pending = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_INT_PENDING); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) { + pending = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_INT_PENDING); - /* Update the polarity of the both edges irqs */ - both_edges = ctrl->both_edges & pending; - if (both_edges) { - state = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_IN); - ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_POLARITY, - both_edges, ~state); + /* Update the polarity of the both edges irqs */ + both_edges = ctrl->both_edges & pending; + if (both_edges) { + state = ath79_gpio_read(ctrl, AR71XX_GPIO_REG_IN); + ath79_gpio_update_bits(ctrl, AR71XX_GPIO_REG_INT_POLARITY, + both_edges, ~state); + } } - raw_spin_unlock_irqrestore(&ctrl->lock, flags); - for_each_set_bit(irq, &pending, gc->ngpio) generic_handle_domain_irq(gc->irq.domain, irq); @@ -224,6 +216,7 @@ MODULE_DEVICE_TABLE(of, ath79_gpio_of_match); static int ath79_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ath79_gpio_ctrl *ctrl; struct gpio_irq_chip *girq; @@ -252,22 +245,25 @@ static int ath79_gpio_probe(struct platform_device *pdev) if (IS_ERR(ctrl->base)) return PTR_ERR(ctrl->base); - raw_spin_lock_init(&ctrl->lock); - err = bgpio_init(&ctrl->gc, dev, 4, - ctrl->base + AR71XX_GPIO_REG_IN, - ctrl->base + AR71XX_GPIO_REG_SET, - ctrl->base + AR71XX_GPIO_REG_CLEAR, - oe_inverted ? NULL : ctrl->base + AR71XX_GPIO_REG_OE, - oe_inverted ? ctrl->base + AR71XX_GPIO_REG_OE : NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = ctrl->base + AR71XX_GPIO_REG_IN, + .set = ctrl->base + AR71XX_GPIO_REG_SET, + .clr = ctrl->base + AR71XX_GPIO_REG_CLEAR, + .dirout = oe_inverted ? NULL : ctrl->base + AR71XX_GPIO_REG_OE, + .dirin = oe_inverted ? 
ctrl->base + AR71XX_GPIO_REG_OE : NULL, + }; + + err = gpio_generic_chip_init(&ctrl->chip, &config); if (err) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); return err; } /* Optional interrupt setup */ if (device_property_read_bool(dev, "interrupt-controller")) { - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ath79_gpio_irqchip); girq->parent_handler = ath79_gpio_irq_handler; girq->num_parents = 1; @@ -280,7 +276,7 @@ static int ath79_gpio_probe(struct platform_device *pdev) girq->handler = handle_simple_irq; } - return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(dev, &ctrl->chip.gc, ctrl); } static struct platform_driver ath79_gpio_driver = { diff --git a/drivers/gpio/gpio-blzp1600.c b/drivers/gpio/gpio-blzp1600.c index 055cb296ae5475..0f8c826ba87612 100644 --- a/drivers/gpio/gpio-blzp1600.c +++ b/drivers/gpio/gpio-blzp1600.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,7 @@ struct blzp1600_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; int irq; }; @@ -76,7 +77,7 @@ static void blzp1600_gpio_irq_mask(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IM_REG, BIT(d->hwirq), 1); } @@ -84,7 +85,7 @@ static void blzp1600_gpio_irq_unmask(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IM_REG, BIT(d->hwirq), 0); } @@ -99,9 +100,9 @@ static void blzp1600_gpio_irq_enable(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - gpiochip_enable_irq(&chip->gc, irqd_to_hwirq(d)); + gpiochip_enable_irq(&chip->gen_gc.gc, irqd_to_hwirq(d)); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_DIR_REG, BIT(d->hwirq), 0); blzp1600_gpio_rmw(chip->base + GPIO_IEN_REG, BIT(d->hwirq), 1); } @@ -110,9 +111,9 @@ static void blzp1600_gpio_irq_disable(struct irq_data *d) { struct blzp1600_gpio *chip = get_blzp1600_gpio_from_irq_data(d); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_IEN_REG, BIT(d->hwirq), 0); - gpiochip_disable_irq(&chip->gc, irqd_to_hwirq(d)); + gpiochip_disable_irq(&chip->gen_gc.gc, irqd_to_hwirq(d)); } static int blzp1600_gpio_irq_set_type(struct irq_data *d, u32 type) @@ -121,7 +122,7 @@ static int blzp1600_gpio_irq_set_type(struct irq_data *d, u32 type) u32 edge_level, single_both, fall_rise; int mask = BIT(d->hwirq); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); edge_level = blzp1600_gpio_read(chip, GPIO_IS_REG); single_both = blzp1600_gpio_read(chip, GPIO_IBE_REG); fall_rise = blzp1600_gpio_read(chip, GPIO_IEV_REG); @@ -186,8 +187,8 @@ static void blzp1600_gpio_irqhandler(struct irq_desc *desc) chained_irq_enter(irqchip, desc); irq_status = blzp1600_gpio_read(gpio, GPIO_RIS_REG); - for_each_set_bit(hwirq, &irq_status, gpio->gc.ngpio) - generic_handle_domain_irq(gpio->gc.irq.domain, hwirq); + for_each_set_bit(hwirq, &irq_status, gpio->gen_gc.gc.ngpio) + 
generic_handle_domain_irq(gpio->gen_gc.gc.irq.domain, hwirq); chained_irq_exit(irqchip, desc); } @@ -197,7 +198,7 @@ static int blzp1600_gpio_set_debounce(struct gpio_chip *gc, unsigned int offset, { struct blzp1600_gpio *chip = gpiochip_get_data(gc); - guard(raw_spinlock_irqsave)(&chip->gc.bgpio_lock); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); blzp1600_gpio_rmw(chip->base + GPIO_DB_REG, BIT(offset), debounce); return 0; @@ -216,6 +217,7 @@ static int blzp1600_gpio_set_config(struct gpio_chip *gc, unsigned int offset, u static int blzp1600_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct blzp1600_gpio *chip; struct gpio_chip *gc; int ret; @@ -228,14 +230,21 @@ static int blzp1600_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - ret = bgpio_init(&chip->gc, &pdev->dev, 4, chip->base + GPIO_IDATA_REG, - chip->base + GPIO_SET_REG, chip->base + GPIO_CLR_REG, - chip->base + GPIO_DIR_REG, NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = chip->base + GPIO_IDATA_REG, + .set = chip->base + GPIO_SET_REG, + .clr = chip->base + GPIO_CLR_REG, + .dirout = chip->base + GPIO_DIR_REG, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to register generic gpio\n"); /* configure the gpio chip */ - gc = &chip->gc; + gc = &chip->gen_gc.gc; gc->set_config = blzp1600_gpio_set_config; if (device_property_present(&pdev->dev, "interrupt-controller")) { diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index e29a9589b3ccbd..f40c9472588bc7 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -37,7 +38,7 @@ enum gio_reg_index { struct brcmstb_gpio_bank { struct list_head node; int id; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct brcmstb_gpio_priv *parent_priv; u32 width; u32 wake_active; @@ -72,19 +73,18 @@ __brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) { void __iomem *reg_base = bank->parent_priv->reg_base; - return bank->gc.read_reg(reg_base + GIO_STAT(bank->id)) & - bank->gc.read_reg(reg_base + GIO_MASK(bank->id)); + return gpio_generic_read_reg(&bank->chip, reg_base + GIO_STAT(bank->id)) & + gpio_generic_read_reg(&bank->chip, reg_base + GIO_MASK(bank->id)); } static unsigned long brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) { unsigned long status; - unsigned long flags; - raw_spin_lock_irqsave(&bank->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&bank->chip); + status = __brcmstb_gpio_get_active_irqs(bank); - raw_spin_unlock_irqrestore(&bank->gc.bgpio_lock, flags); return status; } @@ -92,26 +92,26 @@ brcmstb_gpio_get_active_irqs(struct brcmstb_gpio_bank *bank) static int brcmstb_gpio_hwirq_to_offset(irq_hw_number_t hwirq, struct brcmstb_gpio_bank *bank) { - return hwirq - bank->gc.offset; + return hwirq - bank->chip.gc.offset; } static void brcmstb_gpio_set_imask(struct brcmstb_gpio_bank *bank, unsigned int hwirq, bool enable) { - struct gpio_chip *gc = &bank->gc; struct brcmstb_gpio_priv *priv = bank->parent_priv; u32 mask = BIT(brcmstb_gpio_hwirq_to_offset(hwirq, bank)); u32 imask; - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - imask = gc->read_reg(priv->reg_base + GIO_MASK(bank->id)); + guard(gpio_generic_lock_irqsave)(&bank->chip); + + imask = gpio_generic_read_reg(&bank->chip, + priv->reg_base + 
GIO_MASK(bank->id)); if (enable) imask |= mask; else imask &= ~mask; - gc->write_reg(priv->reg_base + GIO_MASK(bank->id), imask); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_MASK(bank->id), imask); } static int brcmstb_gpio_to_irq(struct gpio_chip *gc, unsigned offset) @@ -150,7 +150,8 @@ static void brcmstb_gpio_irq_ack(struct irq_data *d) struct brcmstb_gpio_priv *priv = bank->parent_priv; u32 mask = BIT(brcmstb_gpio_hwirq_to_offset(d->hwirq, bank)); - gc->write_reg(priv->reg_base + GIO_STAT(bank->id), mask); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_STAT(bank->id), mask); } static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) @@ -162,7 +163,6 @@ static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) u32 edge_insensitive, iedge_insensitive; u32 edge_config, iedge_config; u32 level, ilevel; - unsigned long flags; switch (type) { case IRQ_TYPE_LEVEL_LOW: @@ -194,23 +194,25 @@ static int brcmstb_gpio_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&bank->gc.bgpio_lock, flags); - - iedge_config = bank->gc.read_reg(priv->reg_base + - GIO_EC(bank->id)) & ~mask; - iedge_insensitive = bank->gc.read_reg(priv->reg_base + - GIO_EI(bank->id)) & ~mask; - ilevel = bank->gc.read_reg(priv->reg_base + - GIO_LEVEL(bank->id)) & ~mask; - - bank->gc.write_reg(priv->reg_base + GIO_EC(bank->id), - iedge_config | edge_config); - bank->gc.write_reg(priv->reg_base + GIO_EI(bank->id), - iedge_insensitive | edge_insensitive); - bank->gc.write_reg(priv->reg_base + GIO_LEVEL(bank->id), - ilevel | level); + guard(gpio_generic_lock_irqsave)(&bank->chip); + + iedge_config = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_EC(bank->id)) & ~mask; + iedge_insensitive = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_EI(bank->id)) & ~mask; + ilevel = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_LEVEL(bank->id)) & ~mask; + + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_EC(bank->id), + iedge_config | edge_config); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_EI(bank->id), + iedge_insensitive | edge_insensitive); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_LEVEL(bank->id), + ilevel | level); - raw_spin_unlock_irqrestore(&bank->gc.bgpio_lock, flags); return 0; } @@ -263,7 +265,7 @@ static void brcmstb_gpio_irq_bank_handler(struct brcmstb_gpio_bank *bank) { struct brcmstb_gpio_priv *priv = bank->parent_priv; struct irq_domain *domain = priv->irq_domain; - int hwbase = bank->gc.offset; + int hwbase = bank->chip.gc.offset; unsigned long status; while ((status = brcmstb_gpio_get_active_irqs(bank))) { @@ -303,7 +305,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank( /* banks are in descending order */ list_for_each_entry_reverse(bank, &priv->bank_list, node) { - i += bank->gc.ngpio; + i += bank->chip.gc.ngpio; if (hwirq < i) return bank; } @@ -332,7 +334,7 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, dev_dbg(&pdev->dev, "Mapping irq %d for gpio line %d (bank %d)\n", irq, (int)hwirq, bank->id); - ret = irq_set_chip_data(irq, &bank->gc); + ret = irq_set_chip_data(irq, &bank->chip.gc); if (ret < 0) return ret; irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class, @@ -394,7 +396,7 @@ static void brcmstb_gpio_remove(struct platform_device *pdev) * more important to actually perform all of the steps. 
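The interrupt paths above also show the locking conversion: open-coded raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs on the old bgpio_lock become cleanup.h-style guards on the generic chip's lock. guard(gpio_generic_lock_irqsave)(...) holds the lock until the enclosing scope ends, so early returns such as the -EINVAL paths unlock automatically, while scoped_guard(...) confines the critical section to its attached block when something, like gpiochip_disable_irq(), must run after the lock is dropped. A minimal sketch with hypothetical foo_* names (foo_update_bits() and FOO_INT_MASK are stand-ins, not part of the series):

	static void foo_irq_mask(struct irq_data *d)
	{
		struct foo_gpio *ctrl = irq_data_get_irq_chip_data(d);
		u32 mask = BIT(irqd_to_hwirq(d));

		/* Lock held only for the register update. */
		scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip)
			foo_update_bits(ctrl, FOO_INT_MASK, mask, 0);

		/* Runs with the lock already released. */
		gpiochip_disable_irq(&ctrl->chip.gc, irqd_to_hwirq(d));
	}

	static int foo_irq_set_type(struct irq_data *d, unsigned int type)
	{
		struct foo_gpio *ctrl = irq_data_get_irq_chip_data(d);

		/* Held until return, including the error path. */
		guard(gpio_generic_lock_irqsave)(&ctrl->chip);

		if (type == IRQ_TYPE_NONE)
			return -EINVAL;	/* unlocks automatically */

		/* ... program the trigger registers under the lock ... */
		return 0;
	}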
*/ list_for_each_entry(bank, &priv->bank_list, node) - gpiochip_remove(&bank->gc); + gpiochip_remove(&bank->chip.gc); } static int brcmstb_gpio_of_xlate(struct gpio_chip *gc, @@ -412,7 +414,7 @@ static int brcmstb_gpio_of_xlate(struct gpio_chip *gc, if (WARN_ON(gpiospec->args_count < gc->of_gpio_n_cells)) return -EINVAL; - offset = gpiospec->args[0] - bank->gc.offset; + offset = gpiospec->args[0] - bank->chip.gc.offset; if (offset >= gc->ngpio || offset < 0) return -EINVAL; @@ -493,19 +495,17 @@ static int brcmstb_gpio_irq_setup(struct platform_device *pdev, static void brcmstb_gpio_bank_save(struct brcmstb_gpio_priv *priv, struct brcmstb_gpio_bank *bank) { - struct gpio_chip *gc = &bank->gc; unsigned int i; for (i = 0; i < GIO_REG_STAT; i++) - bank->saved_regs[i] = gc->read_reg(priv->reg_base + - GIO_BANK_OFF(bank->id, i)); + bank->saved_regs[i] = gpio_generic_read_reg(&bank->chip, + priv->reg_base + GIO_BANK_OFF(bank->id, i)); } static void brcmstb_gpio_quiesce(struct device *dev, bool save) { struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); struct brcmstb_gpio_bank *bank; - struct gpio_chip *gc; u32 imask; /* disable non-wake interrupt */ @@ -513,8 +513,6 @@ static void brcmstb_gpio_quiesce(struct device *dev, bool save) disable_irq(priv->parent_irq); list_for_each_entry(bank, &priv->bank_list, node) { - gc = &bank->gc; - if (save) brcmstb_gpio_bank_save(priv, bank); @@ -523,8 +521,9 @@ static void brcmstb_gpio_quiesce(struct device *dev, bool save) imask = bank->wake_active; else imask = 0; - gc->write_reg(priv->reg_base + GIO_MASK(bank->id), - imask); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_MASK(bank->id), + imask); } } @@ -538,12 +537,12 @@ static void brcmstb_gpio_shutdown(struct platform_device *pdev) static void brcmstb_gpio_bank_restore(struct brcmstb_gpio_priv *priv, struct brcmstb_gpio_bank *bank) { - struct gpio_chip *gc = &bank->gc; unsigned int i; for (i = 0; i < GIO_REG_STAT; i++) - gc->write_reg(priv->reg_base + GIO_BANK_OFF(bank->id, i), - bank->saved_regs[i]); + gpio_generic_write_reg(&bank->chip, + priv->reg_base + GIO_BANK_OFF(bank->id, i), + bank->saved_regs[i]); } static int brcmstb_gpio_suspend(struct device *dev) @@ -585,6 +584,7 @@ static const struct dev_pm_ops brcmstb_gpio_pm_ops = { static int brcmstb_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; void __iomem *reg_base; @@ -630,7 +630,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * else leave I/O in little endian mode. 
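The brcmstb save/quiesce/restore code above demonstrates the other half of the API: rather than calling the gpio_chip's read_reg()/write_reg() function pointers directly, drivers use gpio_generic_read_reg()/gpio_generic_write_reg(), which dispatch through whatever accessors gpio_generic_chip_init() selected for the configured register width and byte order (for instance under GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER). A small sketch combining the accessors with the lock, again with hypothetical foo_* names:

	/* Read-modify-write a register through the generic accessors. */
	static void foo_rmw(struct foo_gpio *priv, unsigned int reg,
			    u32 mask, u32 val)
	{
		u32 tmp;

		guard(gpio_generic_lock_irqsave)(&priv->chip);

		tmp = gpio_generic_read_reg(&priv->chip, priv->base + reg);
		tmp = (tmp & ~mask) | (val & mask);
		gpio_generic_write_reg(&priv->chip, priv->base + reg, tmp);
	}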
*/ #if defined(CONFIG_MIPS) && defined(__BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #endif of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) { @@ -665,17 +665,24 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) bank->width = bank_width; } + gc = &bank->chip.gc; + /* * Regs are 4 bytes wide, have data reg, no set/clear regs, * and direction bits have 0 = output and 1 = input */ - gc = &bank->gc; - err = bgpio_init(gc, dev, 4, - reg_base + GIO_DATA(bank->id), - NULL, NULL, NULL, - reg_base + GIO_IODIR(bank->id), flags); + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = reg_base + GIO_DATA(bank->id), + .dirin = reg_base + GIO_IODIR(bank->id), + .flags = flags, + }; + + err = gpio_generic_chip_init(&bank->chip, &config); if (err) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); goto fail; } @@ -700,7 +707,8 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * be retained from S5 cold boot */ need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); - gc->write_reg(reg_base + GIO_MASK(bank->id), 0); + gpio_generic_write_reg(&bank->chip, + reg_base + GIO_MASK(bank->id), 0); err = gpiochip_add_data(gc, bank); if (err) { diff --git a/drivers/gpio/gpio-cadence.c b/drivers/gpio/gpio-cadence.c index c647953521c716..b75734ca22dd73 100644 --- a/drivers/gpio/gpio-cadence.c +++ b/drivers/gpio/gpio-cadence.c @@ -181,7 +181,7 @@ static int cdns_gpio_probe(struct platform_device *pdev) config.dat = cgpio->regs + CDNS_GPIO_INPUT_VALUE; config.set = cgpio->regs + CDNS_GPIO_OUTPUT_VALUE; config.dirin = cgpio->regs + CDNS_GPIO_DIRECTION_MODE; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; ret = gpio_generic_chip_init(&cgpio->gen_gc, &config); if (ret) { diff --git a/drivers/gpio/gpio-dwapb.c b/drivers/gpio/gpio-dwapb.c index 43b667b41f5dce..b42ff46d292bd8 100644 --- a/drivers/gpio/gpio-dwapb.c +++ b/drivers/gpio/gpio-dwapb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,7 @@ struct dwapb_gpio_port_irqchip { }; struct dwapb_gpio_port { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct dwapb_gpio_port_irqchip *pirq; struct dwapb_gpio *gpio; #ifdef CONFIG_PM_SLEEP @@ -107,8 +108,12 @@ struct dwapb_gpio_port { #endif unsigned int idx; }; -#define to_dwapb_gpio(_gc) \ - (container_of(_gc, struct dwapb_gpio_port, gc)->gpio) + +static inline struct dwapb_gpio *to_dwapb_gpio(struct gpio_chip *gc) +{ + return container_of(to_gpio_generic_chip(gc), + struct dwapb_gpio_port, chip)->gpio; +} struct dwapb_gpio { struct device *dev; @@ -148,19 +153,19 @@ static inline u32 gpio_reg_convert(struct dwapb_gpio *gpio, unsigned int offset) static inline u32 dwapb_read(struct dwapb_gpio *gpio, unsigned int offset) { - struct gpio_chip *gc = &gpio->ports[0].gc; - void __iomem *reg_base = gpio->regs; + struct gpio_generic_chip *chip = &gpio->ports[0].chip; + void __iomem *reg_base = gpio->regs; - return gc->read_reg(reg_base + gpio_reg_convert(gpio, offset)); + return gpio_generic_read_reg(chip, reg_base + gpio_reg_convert(gpio, offset)); } static inline void dwapb_write(struct dwapb_gpio *gpio, unsigned int offset, u32 val) { - struct gpio_chip *gc = &gpio->ports[0].gc; - void __iomem *reg_base = gpio->regs; + struct gpio_generic_chip *chip = &gpio->ports[0].chip; + void __iomem *reg_base = gpio->regs; - gc->write_reg(reg_base + 
gpio_reg_convert(gpio, offset), val); + gpio_generic_write_reg(chip, reg_base + gpio_reg_convert(gpio, offset), val); } static struct dwapb_gpio_port *dwapb_offs_to_port(struct dwapb_gpio *gpio, unsigned int offs) @@ -186,7 +191,7 @@ static void dwapb_toggle_trigger(struct dwapb_gpio *gpio, unsigned int offs) if (!port) return; - gc = &port->gc; + gc = &port->chip.gc; pol = dwapb_read(gpio, GPIO_INT_POLARITY); /* Just read the current value right out of the data register */ @@ -201,13 +206,13 @@ static void dwapb_toggle_trigger(struct dwapb_gpio *gpio, unsigned int offs) static u32 dwapb_do_irq(struct dwapb_gpio *gpio) { - struct gpio_chip *gc = &gpio->ports[0].gc; + struct gpio_generic_chip *gen_gc = &gpio->ports[0].chip; unsigned long irq_status; irq_hw_number_t hwirq; irq_status = dwapb_read(gpio, GPIO_INTSTATUS); for_each_set_bit(hwirq, &irq_status, DWAPB_MAX_GPIOS) { - int gpio_irq = irq_find_mapping(gc->irq.domain, hwirq); + int gpio_irq = irq_find_mapping(gen_gc->gc.irq.domain, hwirq); u32 irq_type = irq_get_trigger_type(gpio_irq); generic_handle_irq(gpio_irq); @@ -237,27 +242,27 @@ static irqreturn_t dwapb_irq_handler_mfd(int irq, void *dev_id) static void dwapb_irq_ack(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); u32 val = BIT(irqd_to_hwirq(d)); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + dwapb_write(gpio, GPIO_PORTA_EOI, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void dwapb_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); - dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, gen_gc) { + val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); + dwapb_write(gpio, GPIO_INTMASK, val); + } gpiochip_disable_irq(gc, hwirq); } @@ -265,59 +270,61 @@ static void dwapb_irq_mask(struct irq_data *d) static void dwapb_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, hwirq); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTMASK) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void dwapb_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTEN) | BIT(hwirq); dwapb_write(gpio, GPIO_INTEN, val); val = dwapb_read(gpio, GPIO_INTMASK) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void 
dwapb_irq_disable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + val = dwapb_read(gpio, GPIO_INTMASK) | BIT(hwirq); dwapb_write(gpio, GPIO_INTMASK, val); val = dwapb_read(gpio, GPIO_INTEN) & ~BIT(hwirq); dwapb_write(gpio, GPIO_INTEN, val); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int dwapb_irq_set_type(struct irq_data *d, u32 type) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = to_dwapb_gpio(gc); irq_hw_number_t bit = irqd_to_hwirq(d); - unsigned long level, polarity, flags; + unsigned long level, polarity; + + guard(gpio_generic_lock_irqsave)(gen_gc); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); level = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); polarity = dwapb_read(gpio, GPIO_INT_POLARITY); @@ -352,7 +359,6 @@ static int dwapb_irq_set_type(struct irq_data *d, u32 type) dwapb_write(gpio, GPIO_INTTYPE_LEVEL, level); if (type != IRQ_TYPE_EDGE_BOTH) dwapb_write(gpio, GPIO_INT_POLARITY, polarity); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -393,11 +399,12 @@ static int dwapb_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, unsigned debounce) { struct dwapb_gpio_port *port = gpiochip_get_data(gc); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); struct dwapb_gpio *gpio = port->gpio; - unsigned long flags, val_deb; + unsigned long val_deb; unsigned long mask = BIT(offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); val_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); if (debounce) @@ -406,8 +413,6 @@ static int dwapb_gpio_set_debounce(struct gpio_chip *gc, val_deb &= ~mask; dwapb_write(gpio, GPIO_PORTA_DEBOUNCE, val_deb); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); - return 0; } @@ -445,7 +450,7 @@ static void dwapb_configure_irqs(struct dwapb_gpio *gpio, struct dwapb_port_property *pp) { struct dwapb_gpio_port_irqchip *pirq; - struct gpio_chip *gc = &port->gc; + struct gpio_chip *gc = &port->chip.gc; struct gpio_irq_chip *girq; int err; @@ -501,6 +506,7 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, struct dwapb_port_property *pp, unsigned int offs) { + struct gpio_generic_chip_config config; struct dwapb_gpio_port *port; void __iomem *dat, *set, *dirout; int err; @@ -519,32 +525,39 @@ static int dwapb_gpio_add_port(struct dwapb_gpio *gpio, set = gpio->regs + GPIO_SWPORTA_DR + pp->idx * GPIO_SWPORT_DR_STRIDE; dirout = gpio->regs + GPIO_SWPORTA_DDR + pp->idx * GPIO_SWPORT_DDR_STRIDE; + config = (struct gpio_generic_chip_config) { + .dev = gpio->dev, + .sz = 4, + .dat = dat, + .set = set, + .dirout = dirout, + }; + /* This registers 32 GPIO lines per port */ - err = bgpio_init(&port->gc, gpio->dev, 4, dat, set, NULL, dirout, - NULL, 0); + err = gpio_generic_chip_init(&port->chip, &config); if (err) { dev_err(gpio->dev, "failed to init gpio chip for port%d\n", port->idx); return err; } - port->gc.fwnode = pp->fwnode; - port->gc.ngpio = pp->ngpio; - port->gc.base = pp->gpio_base; - port->gc.request = gpiochip_generic_request; - port->gc.free = gpiochip_generic_free; + port->chip.gc.fwnode = pp->fwnode; + port->chip.gc.ngpio = pp->ngpio; + port->chip.gc.base = 
pp->gpio_base; + port->chip.gc.request = gpiochip_generic_request; + port->chip.gc.free = gpiochip_generic_free; /* Only port A supports debounce */ if (pp->idx == 0) - port->gc.set_config = dwapb_gpio_set_config; + port->chip.gc.set_config = dwapb_gpio_set_config; else - port->gc.set_config = gpiochip_generic_config; + port->chip.gc.set_config = gpiochip_generic_config; /* Only port A can provide interrupts in all configurations of the IP */ if (pp->idx == 0) dwapb_configure_irqs(gpio, port, pp); - err = devm_gpiochip_add_data(gpio->dev, &port->gc, port); + err = devm_gpiochip_add_data(gpio->dev, &port->chip.gc, port); if (err) { dev_err(gpio->dev, "failed to register gpiochip for port%d\n", port->idx); @@ -750,38 +763,37 @@ static int dwapb_gpio_probe(struct platform_device *pdev) static int dwapb_gpio_suspend(struct device *dev) { struct dwapb_gpio *gpio = dev_get_drvdata(dev); - struct gpio_chip *gc = &gpio->ports[0].gc; - unsigned long flags; + struct gpio_generic_chip *gen_gc = &gpio->ports[0].chip; int i; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - for (i = 0; i < gpio->nr_ports; i++) { - unsigned int offset; - unsigned int idx = gpio->ports[i].idx; - struct dwapb_context *ctx = gpio->ports[i].ctx; + scoped_guard(gpio_generic_lock_irqsave, gen_gc) { + for (i = 0; i < gpio->nr_ports; i++) { + unsigned int offset; + unsigned int idx = gpio->ports[i].idx; + struct dwapb_context *ctx = gpio->ports[i].ctx; - offset = GPIO_SWPORTA_DDR + idx * GPIO_SWPORT_DDR_STRIDE; - ctx->dir = dwapb_read(gpio, offset); + offset = GPIO_SWPORTA_DDR + idx * GPIO_SWPORT_DDR_STRIDE; + ctx->dir = dwapb_read(gpio, offset); - offset = GPIO_SWPORTA_DR + idx * GPIO_SWPORT_DR_STRIDE; - ctx->data = dwapb_read(gpio, offset); + offset = GPIO_SWPORTA_DR + idx * GPIO_SWPORT_DR_STRIDE; + ctx->data = dwapb_read(gpio, offset); - offset = GPIO_EXT_PORTA + idx * GPIO_EXT_PORT_STRIDE; - ctx->ext = dwapb_read(gpio, offset); + offset = GPIO_EXT_PORTA + idx * GPIO_EXT_PORT_STRIDE; + ctx->ext = dwapb_read(gpio, offset); - /* Only port A can provide interrupts */ - if (idx == 0) { - ctx->int_mask = dwapb_read(gpio, GPIO_INTMASK); - ctx->int_en = dwapb_read(gpio, GPIO_INTEN); - ctx->int_pol = dwapb_read(gpio, GPIO_INT_POLARITY); - ctx->int_type = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); - ctx->int_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); - - /* Mask out interrupts */ - dwapb_write(gpio, GPIO_INTMASK, ~ctx->wake_en); + /* Only port A can provide interrupts */ + if (idx == 0) { + ctx->int_mask = dwapb_read(gpio, GPIO_INTMASK); + ctx->int_en = dwapb_read(gpio, GPIO_INTEN); + ctx->int_pol = dwapb_read(gpio, GPIO_INT_POLARITY); + ctx->int_type = dwapb_read(gpio, GPIO_INTTYPE_LEVEL); + ctx->int_deb = dwapb_read(gpio, GPIO_PORTA_DEBOUNCE); + + /* Mask out interrupts */ + dwapb_write(gpio, GPIO_INTMASK, ~ctx->wake_en); + } } } - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); clk_bulk_disable_unprepare(DWAPB_NR_CLOCKS, gpio->clks); @@ -791,8 +803,8 @@ static int dwapb_gpio_suspend(struct device *dev) static int dwapb_gpio_resume(struct device *dev) { struct dwapb_gpio *gpio = dev_get_drvdata(dev); - struct gpio_chip *gc = &gpio->ports[0].gc; - unsigned long flags; + struct gpio_chip *gc = &gpio->ports[0].chip.gc; + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(gc); int i, err; err = clk_bulk_prepare_enable(DWAPB_NR_CLOCKS, gpio->clks); if (err) { @@ -801,7 +813,8 @@ static int dwapb_gpio_resume(struct device *dev) return err; } - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(gen_gc); + for (i 
= 0; i < gpio->nr_ports; i++) { unsigned int offset; unsigned int idx = gpio->ports[i].idx; @@ -828,7 +841,6 @@ static int dwapb_gpio_resume(struct device *dev) dwapb_write(gpio, GPIO_PORTA_EOI, 0xffffffff); } } - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } diff --git a/drivers/gpio/gpio-ep93xx.c b/drivers/gpio/gpio-ep93xx.c index 58d2464c07bc36..1f56e44ffc9a3c 100644 --- a/drivers/gpio/gpio-ep93xx.c +++ b/drivers/gpio/gpio-ep93xx.c @@ -9,16 +9,17 @@ * linux/arch/arm/mach-ep93xx/core.c */ +#include +#include +#include #include -#include -#include #include #include #include -#include -#include -#include +#include +#include #include +#include struct ep93xx_gpio_irq_chip { void __iomem *base; @@ -31,11 +32,14 @@ struct ep93xx_gpio_irq_chip { struct ep93xx_gpio_chip { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct ep93xx_gpio_irq_chip *eic; }; -#define to_ep93xx_gpio_chip(x) container_of(x, struct ep93xx_gpio_chip, gc) +static struct ep93xx_gpio_chip *to_ep93xx_gpio_chip(struct gpio_chip *gc) +{ + return container_of(to_gpio_generic_chip(gc), struct ep93xx_gpio_chip, chip); +} static struct ep93xx_gpio_irq_chip *to_ep93xx_gpio_irq_chip(struct gpio_chip *gc) { @@ -267,7 +271,7 @@ static const struct irq_chip gpio_eic_irq_chip = { static int ep93xx_setup_irqs(struct platform_device *pdev, struct ep93xx_gpio_chip *egc) { - struct gpio_chip *gc = &egc->gc; + struct gpio_chip *gc = &egc->chip.gc; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq = &gc->irq; int ret, irq, i; @@ -327,6 +331,7 @@ static int ep93xx_setup_irqs(struct platform_device *pdev, static int ep93xx_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct ep93xx_gpio_chip *egc; struct gpio_chip *gc; void __iomem *data; @@ -345,8 +350,16 @@ static int ep93xx_gpio_probe(struct platform_device *pdev) if (IS_ERR(dir)) return PTR_ERR(dir); - gc = &egc->gc; - ret = bgpio_init(gc, &pdev->dev, 1, data, NULL, NULL, dir, NULL, 0); + gc = &egc->chip.gc; + + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 1, + .dat = data, + .dirout = dir, + }; + + ret = gpio_generic_chip_init(&egc->chip, &config); if (ret) return dev_err_probe(&pdev->dev, ret, "unable to init generic GPIO\n"); diff --git a/drivers/gpio/gpio-ftgpio010.c b/drivers/gpio/gpio-ftgpio010.c index c35eaa2851d853..11e6907c3b5401 100644 --- a/drivers/gpio/gpio-ftgpio010.c +++ b/drivers/gpio/gpio-ftgpio010.c @@ -10,12 +10,14 @@ * MXC GPIO support. 
(c) 2008 Daniel Mack * Copyright 2008 Juergen Beisert, kernel@pengutronix.de */ + +#include +#include #include -#include +#include #include +#include #include -#include -#include /* GPIO registers definition */ #define GPIO_DATA_OUT 0x00 @@ -40,13 +42,13 @@ /** * struct ftgpio_gpio - Gemini GPIO state container * @dev: containing device for this instance - * @gc: gpiochip for this instance + * @chip: generic GPIO chip for this instance * @base: remapped I/O-memory base * @clk: silicon clock */ struct ftgpio_gpio { struct device *dev; - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; struct clk *clk; }; @@ -233,6 +235,7 @@ static const struct irq_chip ftgpio_irq_chip = { static int ftgpio_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ftgpio_gpio *g; struct gpio_irq_chip *girq; @@ -261,27 +264,30 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) */ return PTR_ERR(g->clk); - ret = bgpio_init(&g->gc, dev, 4, - g->base + GPIO_DATA_IN, - g->base + GPIO_DATA_SET, - g->base + GPIO_DATA_CLR, - g->base + GPIO_DIR, - NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = g->base + GPIO_DATA_IN, + .set = g->base + GPIO_DATA_SET, + .clr = g->base + GPIO_DATA_CLR, + .dirout = g->base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&g->chip, &config); if (ret) return dev_err_probe(dev, ret, "unable to init generic GPIO\n"); - g->gc.label = dev_name(dev); - g->gc.base = -1; - g->gc.parent = dev; - g->gc.owner = THIS_MODULE; - /* ngpio is set by bgpio_init() */ + g->chip.gc.label = dev_name(dev); + g->chip.gc.base = -1; + g->chip.gc.parent = dev; + g->chip.gc.owner = THIS_MODULE; + /* ngpio is set by gpio_generic_chip_init() */ /* We need a silicon clock to do debounce */ if (!IS_ERR(g->clk)) - g->gc.set_config = ftgpio_gpio_set_config; + g->chip.gc.set_config = ftgpio_gpio_set_config; - girq = &g->gc.irq; + girq = &g->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ftgpio_irq_chip); girq->parent_handler = ftgpio_gpio_irq_handler; girq->num_parents = 1; @@ -302,7 +308,7 @@ static int ftgpio_gpio_probe(struct platform_device *pdev) /* Clear any use of debounce */ writel(0x0, g->base + GPIO_DEBOUNCE_EN); - return devm_gpiochip_add_data(dev, &g->gc, g); + return devm_gpiochip_add_data(dev, &g->chip.gc, g); } static const struct of_device_id ftgpio_gpio_of_match[] = { diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index 5dc49648d8e378..66bdff36eb615e 100644 --- a/drivers/gpio/gpio-ge.c +++ b/drivers/gpio/gpio-ge.c @@ -16,6 +16,7 @@ */ #include +#include #include #include #include @@ -51,24 +52,36 @@ MODULE_DEVICE_TABLE(of, gef_gpio_ids); static int __init gef_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; + struct gpio_generic_chip *chip; struct gpio_chip *gc; void __iomem *regs; int ret; - gc = devm_kzalloc(dev, sizeof(*gc), GFP_KERNEL); - if (!gc) + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) return -ENOMEM; regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(regs)) return PTR_ERR(regs); - ret = bgpio_init(gc, dev, 4, regs + GEF_GPIO_IN, regs + GEF_GPIO_OUT, - NULL, NULL, regs + GEF_GPIO_DIRECT, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = regs + GEF_GPIO_IN, + .set = regs + GEF_GPIO_OUT, + .dirin = regs + GEF_GPIO_DIRECT, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, + }; + + 
ret = gpio_generic_chip_init(chip, &config); if (ret) - return dev_err_probe(dev, ret, "bgpio_init failed\n"); + return dev_err_probe(dev, ret, + "failed to initialize the generic GPIO chip\n"); + + gc = &chip->gc; /* Setup pointers to chip functions */ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", dev_fwnode(dev)); diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index f3f8bab62f94ce..0c0f97fa14fc9d 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct grgpio_lirq { }; struct grgpio_priv { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; struct device *dev; @@ -91,13 +92,12 @@ struct grgpio_priv { static void grgpio_set_imask(struct grgpio_priv *priv, unsigned int offset, int val) { - struct gpio_chip *gc = &priv->gc; - if (val) priv->imask |= BIT(offset); else priv->imask &= ~BIT(offset); - gc->write_reg(priv->regs + GRGPIO_IMASK, priv->imask); + + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IMASK, priv->imask); } static int grgpio_to_irq(struct gpio_chip *gc, unsigned offset) @@ -118,7 +118,6 @@ static int grgpio_to_irq(struct gpio_chip *gc, unsigned offset) static int grgpio_irq_set_type(struct irq_data *d, unsigned int type) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); - unsigned long flags; u32 mask = BIT(d->hwirq); u32 ipol; u32 iedge; @@ -146,15 +145,13 @@ static int grgpio_irq_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&priv->chip); - ipol = priv->gc.read_reg(priv->regs + GRGPIO_IPOL) & ~mask; - iedge = priv->gc.read_reg(priv->regs + GRGPIO_IEDGE) & ~mask; + ipol = gpio_generic_read_reg(&priv->chip, priv->regs + GRGPIO_IPOL) & ~mask; + iedge = gpio_generic_read_reg(&priv->chip, priv->regs + GRGPIO_IEDGE) & ~mask; - priv->gc.write_reg(priv->regs + GRGPIO_IPOL, ipol | pol); - priv->gc.write_reg(priv->regs + GRGPIO_IEDGE, iedge | edge); - - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IPOL, ipol | pol); + gpio_generic_write_reg(&priv->chip, priv->regs + GRGPIO_IEDGE, iedge | edge); return 0; } @@ -163,29 +160,23 @@ static void grgpio_irq_mask(struct irq_data *d) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); int offset = d->hwirq; - unsigned long flags; - - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); - grgpio_set_imask(priv, offset, 0); + scoped_guard(gpio_generic_lock_irqsave, &priv->chip) + grgpio_set_imask(priv, offset, 0); - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); - - gpiochip_disable_irq(&priv->gc, d->hwirq); + gpiochip_disable_irq(&priv->chip.gc, d->hwirq); } static void grgpio_irq_unmask(struct irq_data *d) { struct grgpio_priv *priv = irq_data_get_irq_chip_data(d); int offset = d->hwirq; - unsigned long flags; - gpiochip_enable_irq(&priv->gc, d->hwirq); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpiochip_enable_irq(&priv->chip.gc, d->hwirq); - grgpio_set_imask(priv, offset, 1); + guard(gpio_generic_lock_irqsave)(&priv->chip); - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + grgpio_set_imask(priv, offset, 1); } static const struct irq_chip grgpio_irq_chip = { @@ -200,12 +191,11 @@ static const struct irq_chip grgpio_irq_chip = { static irqreturn_t grgpio_irq_handler(int irq, void *dev) { struct grgpio_priv *priv = dev; - int ngpio = 
priv->gc.ngpio; - unsigned long flags; + int ngpio = priv->chip.gc.ngpio; int i; int match = 0; - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&priv->chip); /* * For each gpio line, call its interrupt handler if its underlying @@ -221,8 +211,6 @@ static irqreturn_t grgpio_irq_handler(int irq, void *dev) } } - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); - if (!match) dev_warn(priv->dev, "No gpio line matched irq %d\n", irq); @@ -253,13 +241,18 @@ static int grgpio_irq_map(struct irq_domain *d, unsigned int irq, dev_dbg(priv->dev, "Mapping irq %d for gpio line %d\n", irq, offset); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); /* Request underlying irq if not already requested */ lirq->irq = irq; uirq = &priv->uirqs[lirq->index]; if (uirq->refcnt == 0) { - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + /* + * FIXME: This is not how locking works at all, you can't just + * release the lock for a moment to do something that can + * sleep... + */ + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); ret = request_irq(uirq->uirq, grgpio_irq_handler, 0, dev_name(priv->dev), priv); if (ret) { @@ -268,11 +261,11 @@ static int grgpio_irq_map(struct irq_domain *d, unsigned int irq, uirq->uirq); return ret; } - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); } uirq->refcnt++; - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); /* Setup irq */ irq_set_chip_data(irq, priv); @@ -290,13 +283,13 @@ static void grgpio_irq_unmap(struct irq_domain *d, unsigned int irq) struct grgpio_lirq *lirq; struct grgpio_uirq *uirq; unsigned long flags; - int ngpio = priv->gc.ngpio; + int ngpio = priv->chip.gc.ngpio; int i; irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); - raw_spin_lock_irqsave(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_lock_irqsave(&priv->chip, flags); /* Free underlying irq if last user unmapped */ index = -1; @@ -315,13 +308,13 @@ static void grgpio_irq_unmap(struct irq_domain *d, unsigned int irq) uirq = &priv->uirqs[lirq->index]; uirq->refcnt--; if (uirq->refcnt == 0) { - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); free_irq(uirq->uirq, priv); return; } } - raw_spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags); + gpio_generic_chip_unlock_irqrestore(&priv->chip, flags); } static void grgpio_irq_domain_remove(void *data) @@ -341,6 +334,7 @@ static const struct irq_domain_ops grgpio_irq_domain_ops = { static int grgpio_probe(struct platform_device *ofdev) { struct device_node *np = ofdev->dev.of_node; + struct gpio_generic_chip_config config; struct device *dev = &ofdev->dev; void __iomem *regs; struct gpio_chip *gc; @@ -359,17 +353,24 @@ static int grgpio_probe(struct platform_device *ofdev) if (IS_ERR(regs)) return PTR_ERR(regs); - gc = &priv->gc; - err = bgpio_init(gc, dev, 4, regs + GRGPIO_DATA, - regs + GRGPIO_OUTPUT, NULL, regs + GRGPIO_DIR, NULL, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = regs + GRGPIO_DATA, + .set = regs + GRGPIO_OUTPUT, + .dirout = regs + GRGPIO_DIR, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, + }; + + gc = &priv->chip.gc; + err = gpio_generic_chip_init(&priv->chip, &config); if (err) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, 
"failed to initialize the generic GPIO chip\n"); return err; } priv->regs = regs; - priv->imask = gc->read_reg(regs + GRGPIO_IMASK); + priv->imask = gpio_generic_read_reg(&priv->chip, regs + GRGPIO_IMASK); priv->dev = dev; gc->owner = THIS_MODULE; diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index ef5cc654a24e23..d26298c8351b71 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2020 HiSilicon Limited. */ + #include +#include #include #include #include @@ -33,7 +35,7 @@ #define HISI_GPIO_DRIVER_NAME "gpio-hisi" struct hisi_gpio { - struct gpio_chip chip; + struct gpio_generic_chip chip; struct device *dev; void __iomem *reg_base; unsigned int line_num; @@ -43,8 +45,8 @@ struct hisi_gpio { static inline u32 hisi_gpio_read_reg(struct gpio_chip *chip, unsigned int off) { - struct hisi_gpio *hisi_gpio = - container_of(chip, struct hisi_gpio, chip); + struct hisi_gpio *hisi_gpio = container_of(to_gpio_generic_chip(chip), + struct hisi_gpio, chip); void __iomem *reg = hisi_gpio->reg_base + off; return readl(reg); @@ -53,8 +55,8 @@ static inline u32 hisi_gpio_read_reg(struct gpio_chip *chip, static inline void hisi_gpio_write_reg(struct gpio_chip *chip, unsigned int off, u32 val) { - struct hisi_gpio *hisi_gpio = - container_of(chip, struct hisi_gpio, chip); + struct hisi_gpio *hisi_gpio = container_of(to_gpio_generic_chip(chip), + struct hisi_gpio, chip); void __iomem *reg = hisi_gpio->reg_base + off; writel(val, reg); @@ -180,14 +182,14 @@ static void hisi_gpio_irq_disable(struct irq_data *d) static void hisi_gpio_irq_handler(struct irq_desc *desc) { struct hisi_gpio *hisi_gpio = irq_desc_get_handler_data(desc); - unsigned long irq_msk = hisi_gpio_read_reg(&hisi_gpio->chip, + unsigned long irq_msk = hisi_gpio_read_reg(&hisi_gpio->chip.gc, HISI_GPIO_INTSTATUS_WX); struct irq_chip *irq_c = irq_desc_get_chip(desc); int hwirq; chained_irq_enter(irq_c, desc); for_each_set_bit(hwirq, &irq_msk, HISI_GPIO_LINE_NUM_MAX) - generic_handle_domain_irq(hisi_gpio->chip.irq.domain, + generic_handle_domain_irq(hisi_gpio->chip.gc.irq.domain, hwirq); chained_irq_exit(irq_c, desc); } @@ -206,7 +208,7 @@ static const struct irq_chip hisi_gpio_irq_chip = { static void hisi_gpio_init_irq(struct hisi_gpio *hisi_gpio) { - struct gpio_chip *chip = &hisi_gpio->chip; + struct gpio_chip *chip = &hisi_gpio->chip.gc; struct gpio_irq_chip *girq_chip = &chip->irq; gpio_irq_chip_set_chip(girq_chip, &hisi_gpio_irq_chip); @@ -264,6 +266,7 @@ static void hisi_gpio_get_pdata(struct device *dev, static int hisi_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct hisi_gpio *hisi_gpio; int port_num; @@ -289,27 +292,32 @@ static int hisi_gpio_probe(struct platform_device *pdev) hisi_gpio->dev = dev; - ret = bgpio_init(&hisi_gpio->chip, hisi_gpio->dev, 0x4, - hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, - hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - BGPIOF_NO_SET_ON_INPUT); + config = (struct gpio_generic_chip_config) { + .dev = hisi_gpio->dev, + .sz = 4, + .dat = hisi_gpio->reg_base + HISI_GPIO_EXT_PORT_WX, + .set = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_SET_WX, + .clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, + .dirout = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, + .dirin 
= hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT | + GPIO_GENERIC_UNREADABLE_REG_DIR, + }; + + ret = gpio_generic_chip_init(&hisi_gpio->chip, &config); if (ret) { dev_err(dev, "failed to init, ret = %d\n", ret); return ret; } - hisi_gpio->chip.set_config = hisi_gpio_set_config; - hisi_gpio->chip.ngpio = hisi_gpio->line_num; - hisi_gpio->chip.bgpio_dir_unreadable = 1; - hisi_gpio->chip.base = -1; + hisi_gpio->chip.gc.set_config = hisi_gpio_set_config; + hisi_gpio->chip.gc.ngpio = hisi_gpio->line_num; + hisi_gpio->chip.gc.base = -1; if (hisi_gpio->irq > 0) hisi_gpio_init_irq(hisi_gpio); - ret = devm_gpiochip_add_data(dev, &hisi_gpio->chip, hisi_gpio); + ret = devm_gpiochip_add_data(dev, &hisi_gpio->chip.gc, hisi_gpio); if (ret) { dev_err(dev, "failed to register gpiochip, ret = %d\n", ret); return ret; diff --git a/drivers/gpio/gpio-hlwd.c b/drivers/gpio/gpio-hlwd.c index 0580f6712bea9a..043ce5ef3b07e9 100644 --- a/drivers/gpio/gpio-hlwd.c +++ b/drivers/gpio/gpio-hlwd.c @@ -6,6 +6,7 @@ // Nintendo Wii (Hollywood) GPIO driver #include +#include #include #include #include @@ -48,7 +49,7 @@ #define HW_GPIO_OWNER 0x3c struct hlwd_gpio { - struct gpio_chip gpioc; + struct gpio_generic_chip gpioc; struct device *dev; void __iomem *regs; int irq; @@ -61,45 +62,44 @@ static void hlwd_gpio_irqhandler(struct irq_desc *desc) struct hlwd_gpio *hlwd = gpiochip_get_data(irq_desc_get_handler_data(desc)); struct irq_chip *chip = irq_desc_get_chip(desc); - unsigned long flags; unsigned long pending; int hwirq; u32 emulated_pending; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); - pending = ioread32be(hlwd->regs + HW_GPIOB_INTFLAG); - pending &= ioread32be(hlwd->regs + HW_GPIOB_INTMASK); + scoped_guard(gpio_generic_lock_irqsave, &hlwd->gpioc) { + pending = ioread32be(hlwd->regs + HW_GPIOB_INTFLAG); + pending &= ioread32be(hlwd->regs + HW_GPIOB_INTMASK); - /* Treat interrupts due to edge trigger emulation separately */ - emulated_pending = hlwd->edge_emulation & pending; - pending &= ~emulated_pending; - if (emulated_pending) { - u32 level, rising, falling; + /* Treat interrupts due to edge trigger emulation separately */ + emulated_pending = hlwd->edge_emulation & pending; + pending &= ~emulated_pending; + if (emulated_pending) { + u32 level, rising, falling; - level = ioread32be(hlwd->regs + HW_GPIOB_INTLVL); - rising = level & emulated_pending; - falling = ~level & emulated_pending; + level = ioread32be(hlwd->regs + HW_GPIOB_INTLVL); + rising = level & emulated_pending; + falling = ~level & emulated_pending; - /* Invert the levels */ - iowrite32be(level ^ emulated_pending, - hlwd->regs + HW_GPIOB_INTLVL); + /* Invert the levels */ + iowrite32be(level ^ emulated_pending, + hlwd->regs + HW_GPIOB_INTLVL); - /* Ack all emulated-edge interrupts */ - iowrite32be(emulated_pending, hlwd->regs + HW_GPIOB_INTFLAG); + /* Ack all emulated-edge interrupts */ + iowrite32be(emulated_pending, hlwd->regs + HW_GPIOB_INTFLAG); - /* Signal interrupts only on the correct edge */ - rising &= hlwd->rising_edge; - falling &= hlwd->falling_edge; + /* Signal interrupts only on the correct edge */ + rising &= hlwd->rising_edge; + falling &= hlwd->falling_edge; - /* Mark emulated interrupts as pending */ - pending |= rising | falling; + /* Mark emulated interrupts as pending */ + pending |= rising | falling; + } } - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); chained_irq_enter(chip, desc); for_each_set_bit(hwirq, &pending, 32) - 
generic_handle_domain_irq(hlwd->gpioc.irq.domain, hwirq); + generic_handle_domain_irq(hlwd->gpioc.gc.irq.domain, hwirq); chained_irq_exit(chip, desc); } @@ -116,30 +116,29 @@ static void hlwd_gpio_irq_mask(struct irq_data *data) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 mask; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); - mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); - mask &= ~BIT(data->hwirq); - iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); - gpiochip_disable_irq(&hlwd->gpioc, irqd_to_hwirq(data)); + scoped_guard(gpio_generic_lock_irqsave, &hlwd->gpioc) { + mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); + mask &= ~BIT(data->hwirq); + iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); + } + gpiochip_disable_irq(&hlwd->gpioc.gc, irqd_to_hwirq(data)); } static void hlwd_gpio_irq_unmask(struct irq_data *data) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 mask; - gpiochip_enable_irq(&hlwd->gpioc, irqd_to_hwirq(data)); - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); + gpiochip_enable_irq(&hlwd->gpioc.gc, irqd_to_hwirq(data)); + + guard(gpio_generic_lock_irqsave)(&hlwd->gpioc); + mask = ioread32be(hlwd->regs + HW_GPIOB_INTMASK); mask |= BIT(data->hwirq); iowrite32be(mask, hlwd->regs + HW_GPIOB_INTMASK); - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); } static void hlwd_gpio_irq_enable(struct irq_data *data) @@ -173,10 +172,9 @@ static int hlwd_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) { struct hlwd_gpio *hlwd = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - unsigned long flags; u32 level; - raw_spin_lock_irqsave(&hlwd->gpioc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&hlwd->gpioc); hlwd->edge_emulation &= ~BIT(data->hwirq); @@ -197,11 +195,9 @@ static int hlwd_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) hlwd_gpio_irq_setup_emulation(hlwd, data->hwirq, flow_type); break; default: - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); return -EINVAL; } - raw_spin_unlock_irqrestore(&hlwd->gpioc.bgpio_lock, flags); return 0; } @@ -225,6 +221,7 @@ static const struct irq_chip hlwd_gpio_irq_chip = { static int hlwd_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct hlwd_gpio *hlwd; u32 ngpios; int res; @@ -244,25 +241,31 @@ static int hlwd_gpio_probe(struct platform_device *pdev) * systems where the AHBPROT memory firewall hasn't been configured to * permit PPC access to HW_GPIO_*. * - * Note that this has to happen before bgpio_init reads the - * HW_GPIOB_OUT and HW_GPIOB_DIR, because otherwise it reads the wrong - * values. + * Note that this has to happen before gpio_generic_chip_init() reads + * the HW_GPIOB_OUT and HW_GPIOB_DIR, because otherwise it reads the + * wrong values. 
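Because the old struct gpio_chip now sits one level deeper, as the .gc member of the embedded struct gpio_generic_chip, drivers that recovered their state container via container_of() on the gpio_chip (dwapb, ep93xx and hisi above) now go through to_gpio_generic_chip() first. A minimal sketch of the embedding and the recovery helper, with a hypothetical foo_gpio struct:

	struct foo_gpio {
		struct gpio_generic_chip chip;	/* embeds struct gpio_chip as .gc */
		void __iomem *base;
	};

	/* gc -> containing gpio_generic_chip -> containing foo_gpio */
	static struct foo_gpio *to_foo_gpio(struct gpio_chip *gc)
	{
		return container_of(to_gpio_generic_chip(gc),
				    struct foo_gpio, chip);
	}

Drivers that register with devm_gpiochip_add_data(), such as blzp1600 and idt3243x in this series, can keep using gpiochip_get_data() instead and avoid the container_of() step entirely.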
*/ iowrite32be(0xffffffff, hlwd->regs + HW_GPIO_OWNER); - res = bgpio_init(&hlwd->gpioc, &pdev->dev, 4, - hlwd->regs + HW_GPIOB_IN, hlwd->regs + HW_GPIOB_OUT, - NULL, hlwd->regs + HW_GPIOB_DIR, NULL, - BGPIOF_BIG_ENDIAN_BYTE_ORDER); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = hlwd->regs + HW_GPIOB_IN, + .set = hlwd->regs + HW_GPIOB_OUT, + .dirout = hlwd->regs + HW_GPIOB_DIR, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, + }; + + res = gpio_generic_chip_init(&hlwd->gpioc, &config); if (res < 0) { - dev_warn(&pdev->dev, "bgpio_init failed: %d\n", res); + dev_warn(&pdev->dev, "failed to initialize generic GPIO chip: %d\n", res); return res; } res = of_property_read_u32(pdev->dev.of_node, "ngpios", &ngpios); if (res) ngpios = 32; - hlwd->gpioc.ngpio = ngpios; + hlwd->gpioc.gc.ngpio = ngpios; /* Mask and ack all interrupts */ iowrite32be(0, hlwd->regs + HW_GPIOB_INTMASK); @@ -282,7 +285,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) return hlwd->irq; } - girq = &hlwd->gpioc.irq; + girq = &hlwd->gpioc.gc.irq; gpio_irq_chip_set_chip(girq, &hlwd_gpio_irq_chip); girq->parent_handler = hlwd_gpio_irqhandler; girq->num_parents = 1; @@ -296,7 +299,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) girq->handler = handle_level_irq; } - return devm_gpiochip_add_data(&pdev->dev, &hlwd->gpioc, hlwd); + return devm_gpiochip_add_data(&pdev->dev, &hlwd->gpioc.gc, hlwd); } static const struct of_device_id hlwd_gpio_match[] = { diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 535f255144556e..56f1f1e57b6943 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -18,7 +19,7 @@ #define IDT_GPIO_ISTAT 0x0C struct idt_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *pic; void __iomem *gpio; u32 mask_cache; @@ -50,14 +51,13 @@ static int idt_gpio_irq_set_type(struct irq_data *d, unsigned int flow_type) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = gpiochip_get_data(gc); unsigned int sense = flow_type & IRQ_TYPE_SENSE_MASK; - unsigned long flags; u32 ilevel; /* hardware only supports level triggered */ if (sense == IRQ_TYPE_NONE || (sense & IRQ_TYPE_EDGE_BOTH)) return -EINVAL; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ilevel = readl(ctrl->gpio + IDT_GPIO_ILEVEL); if (sense & IRQ_TYPE_LEVEL_HIGH) @@ -68,7 +68,6 @@ static int idt_gpio_irq_set_type(struct irq_data *d, unsigned int flow_type) writel(ilevel, ctrl->gpio + IDT_GPIO_ILEVEL); irq_set_handler_locked(d, handle_level_irq); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -84,14 +83,11 @@ static void idt_gpio_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - - ctrl->mask_cache |= BIT(d->hwirq); - writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &ctrl->chip) { + ctrl->mask_cache |= BIT(d->hwirq); + writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); + } gpiochip_disable_irq(gc, irqd_to_hwirq(d)); } @@ -100,15 +96,13 @@ static void idt_gpio_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct idt_gpio_ctrl *ctrl = 
gpiochip_get_data(gc); - unsigned long flags; gpiochip_enable_irq(gc, irqd_to_hwirq(d)); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + + guard(gpio_generic_lock_irqsave)(&ctrl->chip); ctrl->mask_cache &= ~BIT(d->hwirq); writel(ctrl->mask_cache, ctrl->pic + IDT_PIC_IRQ_MASK); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int idt_gpio_irq_init_hw(struct gpio_chip *gc) @@ -134,6 +128,7 @@ static const struct irq_chip idt_gpio_irqchip = { static int idt_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct idt_gpio_ctrl *ctrl; @@ -150,18 +145,24 @@ static int idt_gpio_probe(struct platform_device *pdev) if (IS_ERR(ctrl->gpio)) return PTR_ERR(ctrl->gpio); - ctrl->gc.parent = dev; + ctrl->chip.gc.parent = dev; + + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = ctrl->gpio + IDT_GPIO_DATA, + .dirout = ctrl->gpio + IDT_GPIO_DIR, + }; - ret = bgpio_init(&ctrl->gc, &pdev->dev, 4, ctrl->gpio + IDT_GPIO_DATA, - NULL, NULL, ctrl->gpio + IDT_GPIO_DIR, NULL, 0); + ret = gpio_generic_chip_init(&ctrl->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } ret = device_property_read_u32(dev, "ngpios", &ngpios); if (!ret) - ctrl->gc.ngpio = ngpios; + ctrl->chip.gc.ngpio = ngpios; if (device_property_read_bool(dev, "interrupt-controller")) { ctrl->pic = devm_platform_ioremap_resource_byname(pdev, "pic"); @@ -172,7 +173,7 @@ static int idt_gpio_probe(struct platform_device *pdev) if (parent_irq < 0) return parent_irq; - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &idt_gpio_irqchip); girq->init_hw = idt_gpio_irq_init_hw; girq->parent_handler = idt_gpio_dispatch; @@ -188,7 +189,7 @@ static int idt_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; } - return devm_gpiochip_add_data(&pdev->dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(&pdev->dev, &ctrl->chip.gc, ctrl); } static const struct of_device_id idt_gpio_of_match[] = { diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 28a8a6a8f05fee..f34d87869c8b04 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -53,14 +54,14 @@ /** * struct ixp4xx_gpio - IXP4 GPIO state container + * @chip: generic GPIO chip for this instance * @dev: containing device for this instance - * @gc: gpiochip for this instance * @base: remapped I/O-memory base * @irq_edge: Each bit represents an IRQ: 1: edge-triggered, * 0: level triggered */ struct ixp4xx_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct device *dev; void __iomem *base; unsigned long long irq_edge; @@ -100,7 +101,6 @@ static int ixp4xx_gpio_irq_set_type(struct irq_data *d, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct ixp4xx_gpio *g = gpiochip_get_data(gc); int line = d->hwirq; - unsigned long flags; u32 int_style; u32 int_reg; u32 val; @@ -144,26 +144,24 @@ static int ixp4xx_gpio_irq_set_type(struct irq_data *d, unsigned int type) int_reg = IXP4XX_REG_GPIT1; } - raw_spin_lock_irqsave(&g->gc.bgpio_lock, flags); - - /* Clear the style for the appropriate pin */ - val = __raw_readl(g->base + int_reg); - val &= ~(IXP4XX_GPIO_STYLE_MASK << (line * IXP4XX_GPIO_STYLE_SIZE)); - __raw_writel(val, g->base + int_reg); - - __raw_writel(BIT(line), 
g->base + IXP4XX_REG_GPIS); + scoped_guard(gpio_generic_lock_irqsave, &g->chip) { + /* Clear the style for the appropriate pin */ + val = __raw_readl(g->base + int_reg); + val &= ~(IXP4XX_GPIO_STYLE_MASK << (line * IXP4XX_GPIO_STYLE_SIZE)); + __raw_writel(val, g->base + int_reg); - /* Set the new style */ - val = __raw_readl(g->base + int_reg); - val |= (int_style << (line * IXP4XX_GPIO_STYLE_SIZE)); - __raw_writel(val, g->base + int_reg); + __raw_writel(BIT(line), g->base + IXP4XX_REG_GPIS); - /* Force-configure this line as an input */ - val = __raw_readl(g->base + IXP4XX_REG_GPOE); - val |= BIT(d->hwirq); - __raw_writel(val, g->base + IXP4XX_REG_GPOE); + /* Set the new style */ + val = __raw_readl(g->base + int_reg); + val |= (int_style << (line * IXP4XX_GPIO_STYLE_SIZE)); + __raw_writel(val, g->base + int_reg); - raw_spin_unlock_irqrestore(&g->gc.bgpio_lock, flags); + /* Force-configure this line as an input */ + val = __raw_readl(g->base + IXP4XX_REG_GPOE); + val |= BIT(d->hwirq); + __raw_writel(val, g->base + IXP4XX_REG_GPOE); + } /* This parent only accept level high (asserted) */ return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); @@ -206,6 +204,7 @@ static int ixp4xx_gpio_child_to_parent_hwirq(struct gpio_chip *gc, static int ixp4xx_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; unsigned long flags; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; @@ -290,35 +289,38 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) * for big endian. */ #if defined(CONFIG_CPU_BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #else flags = 0; #endif + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = g->base + IXP4XX_REG_GPIN, + .set = g->base + IXP4XX_REG_GPOUT, + .dirin = g->base + IXP4XX_REG_GPOE, + .flags = flags, + }; + /* Populate and register gpio chip */ - ret = bgpio_init(&g->gc, dev, 4, - g->base + IXP4XX_REG_GPIN, - g->base + IXP4XX_REG_GPOUT, - NULL, - NULL, - g->base + IXP4XX_REG_GPOE, - flags); + ret = gpio_generic_chip_init(&g->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - g->gc.ngpio = 16; - g->gc.label = "IXP4XX_GPIO_CHIP"; + g->chip.gc.ngpio = 16; + g->chip.gc.label = "IXP4XX_GPIO_CHIP"; /* * TODO: when we have migrated to device tree and all GPIOs * are fetched using phandles, set this to -1 to get rid of * the fixed gpiochip base. 
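The locking conversion seen in this hunk repeats across the series: open-coded raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs on the old bgpio_lock become cleanup.h-style guards on the generic chip. Two forms appear: guard(gpio_generic_lock_irqsave)(&chip) holds the lock until the end of the enclosing function, while scoped_guard() confines it to a block. A sketch of the block form (driver and register names hypothetical):

static void foo_irq_mask(struct foo_gpio *g, unsigned int line)
{
	u32 val;

	scoped_guard(gpio_generic_lock_irqsave, &g->chip) {
		val = __raw_readl(g->base + FOO_INT_EN);
		val &= ~BIT(line);
		__raw_writel(val, g->base + FOO_INT_EN);
	}
	/* lock dropped automatically here, including on any early return */
}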
*/ - g->gc.base = 0; - g->gc.parent = &pdev->dev; - g->gc.owner = THIS_MODULE; + g->chip.gc.base = 0; + g->chip.gc.parent = &pdev->dev; + g->chip.gc.owner = THIS_MODULE; - girq = &g->gc.irq; + girq = &g->chip.gc.irq; gpio_irq_chip_set_chip(girq, &ixp4xx_gpio_irqchip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -326,7 +328,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; - ret = devm_gpiochip_add_data(dev, &g->gc, g); + ret = devm_gpiochip_add_data(dev, &g->chip.gc, g); if (ret) { dev_err(dev, "failed to add SoC gpiochip\n"); return ret; diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 818c606fbc5149..02f181cb219e99 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -7,12 +7,16 @@ #include #include +#include +#include #include #include #include #include +#include #include #include +#include #include enum loongson_gpio_mode { @@ -27,10 +31,18 @@ struct loongson_gpio_chip_data { unsigned int out_offset; unsigned int in_offset; unsigned int inten_offset; + unsigned int intpol_offset; + unsigned int intedge_offset; + unsigned int intclr_offset; + unsigned int intsts_offset; + unsigned int intdual_offset; + unsigned int intr_num; + irq_flow_handler_t irq_handler; + const struct irq_chip *girqchip; }; struct loongson_gpio_chip { - struct gpio_chip chip; + struct gpio_generic_chip chip; spinlock_t lock; void __iomem *reg_base; const struct loongson_gpio_chip_data *chip_data; @@ -38,7 +50,8 @@ struct loongson_gpio_chip { static inline struct loongson_gpio_chip *to_loongson_gpio_chip(struct gpio_chip *chip) { - return container_of(chip, struct loongson_gpio_chip, chip); + return container_of(to_gpio_generic_chip(chip), + struct loongson_gpio_chip, chip); } static inline void loongson_commit_direction(struct loongson_gpio_chip *lgpio, unsigned int pin, @@ -135,39 +148,184 @@ static int loongson_gpio_to_irq(struct gpio_chip *chip, unsigned int offset) return platform_get_irq(pdev, offset); } -static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgpio, +static void loongson_gpio_irq_ack(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x1, lgpio->reg_base + lgpio->chip_data->intclr_offset + hwirq); +} + +static void loongson_gpio_irq_mask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x0, lgpio->reg_base + lgpio->chip_data->inten_offset + hwirq); +} + +static void loongson_gpio_irq_unmask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + writeb(0x1, lgpio->reg_base + lgpio->chip_data->inten_offset + hwirq); +} + +static int loongson_gpio_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct loongson_gpio_chip *lgpio = to_loongson_gpio_chip(chip); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + u8 pol = 0, edge = 0, dual = 0; + + if ((type & IRQ_TYPE_SENSE_MASK) == IRQ_TYPE_EDGE_BOTH) { + edge = 1; + dual = 1; + irq_set_handler_locked(data, 
handle_edge_irq); + } else { + switch (type) { + case IRQ_TYPE_LEVEL_HIGH: + pol = 1; + fallthrough; + case IRQ_TYPE_LEVEL_LOW: + irq_set_handler_locked(data, handle_level_irq); + break; + + case IRQ_TYPE_EDGE_RISING: + pol = 1; + fallthrough; + case IRQ_TYPE_EDGE_FALLING: + edge = 1; + irq_set_handler_locked(data, handle_edge_irq); + break; + + default: + return -EINVAL; + } + } + + writeb(pol, lgpio->reg_base + lgpio->chip_data->intpol_offset + hwirq); + writeb(edge, lgpio->reg_base + lgpio->chip_data->intedge_offset + hwirq); + writeb(dual, lgpio->reg_base + lgpio->chip_data->intdual_offset + hwirq); + + return 0; +} + +static void loongson_gpio_ls2k0300_irq_handler(struct irq_desc *desc) +{ + struct loongson_gpio_chip *lgpio = irq_desc_get_handler_data(desc); + struct irq_chip *girqchip = irq_desc_get_chip(desc); + int i; + + chained_irq_enter(girqchip, desc); + + for (i = 0; i < lgpio->chip.gc.ngpio; i++) { + /* + * For the GPIO controller of LS2K0300, interrupt status bits + * may be wrongly set even if the corresponding interrupt is + * disabled. Thus interrupt enable bits are checked along with + * status bits to detect interrupts reliably. + */ + if (readb(lgpio->reg_base + lgpio->chip_data->intsts_offset + i) && + readb(lgpio->reg_base + lgpio->chip_data->inten_offset + i)) + generic_handle_domain_irq(lgpio->chip.gc.irq.domain, i); + } + + chained_irq_exit(girqchip, desc); +} + +static const struct irq_chip loongson_gpio_ls2k0300_irqchip = { + .irq_ack = loongson_gpio_irq_ack, + .irq_mask = loongson_gpio_irq_mask, + .irq_unmask = loongson_gpio_irq_unmask, + .irq_set_type = loongson_gpio_irq_set_type, + .flags = IRQCHIP_IMMUTABLE | IRQCHIP_SKIP_SET_WAKE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static int loongson_gpio_init_irqchip(struct platform_device *pdev, + struct loongson_gpio_chip *lgpio) +{ + const struct loongson_gpio_chip_data *data = lgpio->chip_data; + struct gpio_chip *chip = &lgpio->chip.gc; + int i; + + chip->irq.default_type = IRQ_TYPE_NONE; + chip->irq.handler = handle_bad_irq; + chip->irq.parent_handler = data->irq_handler; + chip->irq.parent_handler_data = lgpio; + gpio_irq_chip_set_chip(&chip->irq, data->girqchip); + + chip->irq.num_parents = data->intr_num; + chip->irq.parents = devm_kcalloc(&pdev->dev, data->intr_num, + sizeof(*chip->irq.parents), GFP_KERNEL); + if (!chip->irq.parents) + return -ENOMEM; + + for (i = 0; i < data->intr_num; i++) { + int ret; + + ret = platform_get_irq(pdev, i); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "failed to get IRQ %d\n", i); + chip->irq.parents[i] = ret; + } + + for (i = 0; i < data->intr_num; i++) { + writeb(0x0, lgpio->reg_base + data->inten_offset + i); + writeb(0x1, lgpio->reg_base + data->intclr_offset + i); + } + + return 0; +} + +static int loongson_gpio_init(struct platform_device *pdev, struct loongson_gpio_chip *lgpio, void __iomem *reg_base) { + struct gpio_generic_chip_config config; int ret; lgpio->reg_base = reg_base; if (lgpio->chip_data->mode == BIT_CTRL_MODE) { - ret = bgpio_init(&lgpio->chip, dev, 8, - lgpio->reg_base + lgpio->chip_data->in_offset, - lgpio->reg_base + lgpio->chip_data->out_offset, - NULL, NULL, - lgpio->reg_base + lgpio->chip_data->conf_offset, - 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 8, + .dat = lgpio->reg_base + lgpio->chip_data->in_offset, + .set = lgpio->reg_base + lgpio->chip_data->out_offset, + .dirin = lgpio->reg_base + lgpio->chip_data->conf_offset, + }; + + ret = gpio_generic_chip_init(&lgpio->chip, &config); if (ret) { -
dev_err(dev, "unable to init generic GPIO\n"); + dev_err(&pdev->dev, "unable to init generic GPIO\n"); return ret; } } else { - lgpio->chip.direction_input = loongson_gpio_direction_input; - lgpio->chip.get = loongson_gpio_get; - lgpio->chip.get_direction = loongson_gpio_get_direction; - lgpio->chip.direction_output = loongson_gpio_direction_output; - lgpio->chip.set = loongson_gpio_set; - lgpio->chip.parent = dev; + lgpio->chip.gc.direction_input = loongson_gpio_direction_input; + lgpio->chip.gc.get = loongson_gpio_get; + lgpio->chip.gc.get_direction = loongson_gpio_get_direction; + lgpio->chip.gc.direction_output = loongson_gpio_direction_output; + lgpio->chip.gc.set = loongson_gpio_set; + lgpio->chip.gc.parent = &pdev->dev; spin_lock_init(&lgpio->lock); } - lgpio->chip.label = lgpio->chip_data->label; - lgpio->chip.can_sleep = false; - if (lgpio->chip_data->inten_offset) - lgpio->chip.to_irq = loongson_gpio_to_irq; + lgpio->chip.gc.label = lgpio->chip_data->label; + lgpio->chip.gc.can_sleep = false; + if (lgpio->chip_data->girqchip) { + ret = loongson_gpio_init_irqchip(pdev, lgpio); + if (ret) + return dev_err_probe(&pdev->dev, ret, "failed to initialize irqchip\n"); + } else if (lgpio->chip_data->inten_offset) { + lgpio->chip.gc.to_irq = loongson_gpio_to_irq; + } - return devm_gpiochip_add_data(dev, &lgpio->chip, lgpio); + return devm_gpiochip_add_data(&pdev->dev, &lgpio->chip.gc, lgpio); } static int loongson_gpio_probe(struct platform_device *pdev) @@ -175,6 +333,7 @@ static int loongson_gpio_probe(struct platform_device *pdev) void __iomem *reg_base; struct loongson_gpio_chip *lgpio; struct device *dev = &pdev->dev; + struct reset_control *rst; lgpio = devm_kzalloc(dev, sizeof(*lgpio), GFP_KERNEL); if (!lgpio) @@ -186,7 +345,11 @@ static int loongson_gpio_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - return loongson_gpio_init(dev, lgpio, reg_base); + rst = devm_reset_control_get_optional_exclusive_deasserted(&pdev->dev, NULL); + if (IS_ERR(rst)) + return dev_err_probe(&pdev->dev, PTR_ERR(rst), "failed to get reset control\n"); + + return loongson_gpio_init(pdev, lgpio, reg_base); } static const struct loongson_gpio_chip_data loongson_gpio_ls2k_data = { @@ -198,6 +361,23 @@ static const struct loongson_gpio_chip_data loongson_gpio_ls2k_data = { .inten_offset = 0x30, }; +static const struct loongson_gpio_chip_data loongson_gpio_ls2k0300_data = { + .label = "ls2k0300_gpio", + .mode = BYTE_CTRL_MODE, + .conf_offset = 0x800, + .in_offset = 0xa00, + .out_offset = 0x900, + .inten_offset = 0xb00, + .intpol_offset = 0xc00, + .intedge_offset = 0xd00, + .intclr_offset = 0xe00, + .intsts_offset = 0xf00, + .intdual_offset = 0xf80, + .intr_num = 7, + .irq_handler = loongson_gpio_ls2k0300_irq_handler, + .girqchip = &loongson_gpio_ls2k0300_irqchip, +}; + static const struct loongson_gpio_chip_data loongson_gpio_ls2k0500_data0 = { .label = "ls2k0500_gpio", .mode = BIT_CTRL_MODE, @@ -294,6 +474,10 @@ static const struct of_device_id loongson_gpio_of_match[] = { .compatible = "loongson,ls2k-gpio", .data = &loongson_gpio_ls2k_data, }, + { + .compatible = "loongson,ls2k0300-gpio", + .data = &loongson_gpio_ls2k0300_data, + }, { .compatible = "loongson,ls2k0500-gpio0", .data = &loongson_gpio_ls2k0500_data0, diff --git a/drivers/gpio/gpio-loongson1.c b/drivers/gpio/gpio-loongson1.c index 6ca3b969db4df2..9750a7a1750817 100644 --- a/drivers/gpio/gpio-loongson1.c +++ b/drivers/gpio/gpio-loongson1.c @@ -5,10 +5,11 @@ * Copyright (C) 2015-2023 Keguang Zhang */ +#include 
#include #include +#include #include -#include /* Loongson 1 GPIO Register Definitions */ #define GPIO_CFG 0x0 @@ -17,19 +18,18 @@ #define GPIO_OUTPUT 0x30 struct ls1x_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; }; static int ls1x_gpio_request(struct gpio_chip *gc, unsigned int offset) { struct ls1x_gpio_chip *ls1x_gc = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ls1x_gc->chip); + __raw_writel(__raw_readl(ls1x_gc->reg_base + GPIO_CFG) | BIT(offset), ls1x_gc->reg_base + GPIO_CFG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); return 0; } @@ -37,16 +37,16 @@ static int ls1x_gpio_request(struct gpio_chip *gc, unsigned int offset) static void ls1x_gpio_free(struct gpio_chip *gc, unsigned int offset) { struct ls1x_gpio_chip *ls1x_gc = gpiochip_get_data(gc); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&ls1x_gc->chip); + __raw_writel(__raw_readl(ls1x_gc->reg_base + GPIO_CFG) & ~BIT(offset), ls1x_gc->reg_base + GPIO_CFG); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static int ls1x_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct ls1x_gpio_chip *ls1x_gc; int ret; @@ -59,29 +59,35 @@ static int ls1x_gpio_probe(struct platform_device *pdev) if (IS_ERR(ls1x_gc->reg_base)) return PTR_ERR(ls1x_gc->reg_base); - ret = bgpio_init(&ls1x_gc->gc, dev, 4, ls1x_gc->reg_base + GPIO_DATA, - ls1x_gc->reg_base + GPIO_OUTPUT, NULL, - NULL, ls1x_gc->reg_base + GPIO_DIR, 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = ls1x_gc->reg_base + GPIO_DATA, + .set = ls1x_gc->reg_base + GPIO_OUTPUT, + .dirin = ls1x_gc->reg_base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&ls1x_gc->chip, &config); if (ret) goto err; - ls1x_gc->gc.owner = THIS_MODULE; - ls1x_gc->gc.request = ls1x_gpio_request; - ls1x_gc->gc.free = ls1x_gpio_free; + ls1x_gc->chip.gc.owner = THIS_MODULE; + ls1x_gc->chip.gc.request = ls1x_gpio_request; + ls1x_gc->chip.gc.free = ls1x_gpio_free; /* * Clear ngpio to let gpiolib get the correct number * by reading ngpios property */ - ls1x_gc->gc.ngpio = 0; + ls1x_gc->chip.gc.ngpio = 0; - ret = devm_gpiochip_add_data(dev, &ls1x_gc->gc, ls1x_gc); + ret = devm_gpiochip_add_data(dev, &ls1x_gc->chip.gc, ls1x_gc); if (ret) goto err; platform_set_drvdata(pdev, ls1x_gc); dev_info(dev, "GPIO controller registered with %d pins\n", - ls1x_gc->gc.ngpio); + ls1x_gc->chip.gc.ngpio); return 0; err: diff --git a/drivers/gpio/gpio-max7360.c b/drivers/gpio/gpio-max7360.c new file mode 100644 index 00000000000000..db92a43776a920 --- /dev/null +++ b/drivers/gpio/gpio-max7360.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Kamel BOUHARA + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX7360_GPIO_PORT 1 +#define MAX7360_GPIO_COL 2 + +struct max7360_gpio_plat_data { + unsigned int function; +}; + +static struct max7360_gpio_plat_data max7360_gpio_port_plat = { .function = MAX7360_GPIO_PORT }; +static struct max7360_gpio_plat_data max7360_gpio_col_plat = { .function = MAX7360_GPIO_COL }; + +static int max7360_get_available_gpos(struct device *dev, unsigned int *available_gpios) +{ + u32 columns; + int ret; + + 
ret = device_property_read_u32(dev->parent, "keypad,num-columns", &columns); + if (ret) { + dev_err(dev, "Failed to read columns count\n"); + return ret; + } + + *available_gpios = min(MAX7360_MAX_GPO, MAX7360_MAX_KEY_COLS - columns); + + return 0; +} + +static int max7360_gpo_init_valid_mask(struct gpio_chip *gc, + unsigned long *valid_mask, + unsigned int ngpios) +{ + unsigned int available_gpios; + int ret; + + ret = max7360_get_available_gpos(gc->parent, &available_gpios); + if (ret) + return ret; + + bitmap_clear(valid_mask, 0, MAX7360_MAX_KEY_COLS - available_gpios); + + return 0; +} + +static int max7360_set_gpos_count(struct device *dev, struct regmap *regmap) +{ + /* + * MAX7360 COL0 to COL7 pins can be used either as keypad columns, + * general purpose output or a mix of both. + * By default, all pins are used as keypad, here we update this + * configuration to allow using some of them as GPIOs. + */ + unsigned int available_gpios; + unsigned int val; + int ret; + + ret = max7360_get_available_gpos(dev, &available_gpios); + if (ret) + return ret; + + /* + * Configure which GPIOs will be used for keypad. + * MAX7360_REG_DEBOUNCE contains configuration both for keypad debounce + * timings and gpos/keypad columns repartition. Only the latter is + * modified here. + */ + val = FIELD_PREP(MAX7360_PORTS, available_gpios); + ret = regmap_write_bits(regmap, MAX7360_REG_DEBOUNCE, MAX7360_PORTS, val); + if (ret) + dev_err(dev, "Failed to write max7360 columns/gpos configuration\n"); + + return ret; +} + +static int max7360_gpio_reg_mask_xlate(struct gpio_regmap *gpio, + unsigned int base, unsigned int offset, + unsigned int *reg, unsigned int *mask) +{ + if (base == MAX7360_REG_PWMBASE) { + /* + * GPIO output is using PWM duty cycle registers: one register + * per line, with value being either 0 or 255. + */ + *reg = base + offset; + *mask = GENMASK(7, 0); + } else { + *reg = base; + *mask = BIT(offset); + } + + return 0; +} + +static const struct regmap_irq max7360_regmap_irqs[MAX7360_MAX_GPIO] = { + REGMAP_IRQ_REG(0, 0, BIT(0)), + REGMAP_IRQ_REG(1, 0, BIT(1)), + REGMAP_IRQ_REG(2, 0, BIT(2)), + REGMAP_IRQ_REG(3, 0, BIT(3)), + REGMAP_IRQ_REG(4, 0, BIT(4)), + REGMAP_IRQ_REG(5, 0, BIT(5)), + REGMAP_IRQ_REG(6, 0, BIT(6)), + REGMAP_IRQ_REG(7, 0, BIT(7)), +}; + +static int max7360_handle_mask_sync(const int index, + const unsigned int mask_buf_def, + const unsigned int mask_buf, + void *const irq_drv_data) +{ + struct regmap *regmap = irq_drv_data; + int ret; + + for (unsigned int i = 0; i < MAX7360_MAX_GPIO; i++) { + ret = regmap_assign_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_MASK, mask_buf & BIT(i)); + if (ret) + return ret; + } + + return 0; +} + +static int max7360_gpio_probe(struct platform_device *pdev) +{ + const struct max7360_gpio_plat_data *plat_data; + struct gpio_regmap_config gpio_config = { }; + struct regmap_irq_chip *irq_chip; + struct device *dev = &pdev->dev; + struct regmap *regmap; + unsigned int outconf; + int ret; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "could not get parent regmap\n"); + + plat_data = device_get_match_data(dev); + if (plat_data->function == MAX7360_GPIO_PORT) { + if (device_property_read_bool(dev, "interrupt-controller")) { + /* + * Port GPIOs with interrupt-controller property: add IRQ + * controller.
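The reg_mask_xlate above is the subtle part of the port-GPIO variant: input (MAX7360_REG_GPIOIN) and direction (MAX7360_REG_GPIOCTRL) use one bit per line in a shared register, but output goes through one PWM duty-cycle register per line. A hypothetical walk-through for driving line 3 high, assuming gpio-regmap applies the translated reg/mask as a masked write:

unsigned int reg, mask;

max7360_gpio_reg_mask_xlate(gpio, MAX7360_REG_PWMBASE, 3, &reg, &mask);
/* reg == MAX7360_REG_PWMBASE + 3, mask == GENMASK(7, 0) */

/* full duty cycle (255) drives the line high, 0 drives it low */
regmap_update_bits(regmap, reg, mask, 0xff);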
+ */ + gpio_config.regmap_irq_flags = IRQF_ONESHOT | IRQF_SHARED; + gpio_config.regmap_irq_line = + fwnode_irq_get_byname(dev_fwnode(dev->parent), "inti"); + if (gpio_config.regmap_irq_line < 0) + return dev_err_probe(dev, gpio_config.regmap_irq_line, + "Failed to get IRQ\n"); + + /* Create custom IRQ configuration. */ + irq_chip = devm_kzalloc(dev, sizeof(*irq_chip), GFP_KERNEL); + gpio_config.regmap_irq_chip = irq_chip; + if (!irq_chip) + return -ENOMEM; + + irq_chip->name = dev_name(dev); + irq_chip->status_base = MAX7360_REG_GPIOIN; + irq_chip->status_is_level = true; + irq_chip->num_regs = 1; + irq_chip->num_irqs = MAX7360_MAX_GPIO; + irq_chip->irqs = max7360_regmap_irqs; + irq_chip->handle_mask_sync = max7360_handle_mask_sync; + irq_chip->irq_drv_data = regmap; + + for (unsigned int i = 0; i < MAX7360_MAX_GPIO; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_EDGES, + MAX7360_PORT_CFG_INTERRUPT_EDGES); + if (ret) + return dev_err_probe(dev, ret, + "Failed to enable interrupts\n"); + } + } + + /* + * Port GPIOs: set output mode configuration (constant-current or not). + * This property is optional. + */ + ret = device_property_read_u32(dev, "maxim,constant-current-disable", &outconf); + if (!ret) { + ret = regmap_write(regmap, MAX7360_REG_GPIOOUTM, outconf); + if (ret) + return dev_err_probe(dev, ret, + "Failed to set constant-current configuration\n"); + } + } + + /* Add gpio device. */ + gpio_config.parent = dev; + gpio_config.regmap = regmap; + if (plat_data->function == MAX7360_GPIO_PORT) { + gpio_config.ngpio = MAX7360_MAX_GPIO; + gpio_config.reg_dat_base = GPIO_REGMAP_ADDR(MAX7360_REG_GPIOIN); + gpio_config.reg_set_base = GPIO_REGMAP_ADDR(MAX7360_REG_PWMBASE); + gpio_config.reg_dir_out_base = GPIO_REGMAP_ADDR(MAX7360_REG_GPIOCTRL); + gpio_config.ngpio_per_reg = MAX7360_MAX_GPIO; + gpio_config.reg_mask_xlate = max7360_gpio_reg_mask_xlate; + } else { + ret = max7360_set_gpos_count(dev, regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to set GPOS pin count\n"); + + gpio_config.reg_set_base = GPIO_REGMAP_ADDR(MAX7360_REG_PORTS); + gpio_config.ngpio = MAX7360_MAX_KEY_COLS; + gpio_config.init_valid_mask = max7360_gpo_init_valid_mask; + } + + return PTR_ERR_OR_ZERO(devm_gpio_regmap_register(dev, &gpio_config)); +} + +static const struct of_device_id max7360_gpio_of_match[] = { + { + .compatible = "maxim,max7360-gpo", + .data = &max7360_gpio_col_plat + }, { + .compatible = "maxim,max7360-gpio", + .data = &max7360_gpio_port_plat + }, { + } +}; +MODULE_DEVICE_TABLE(of, max7360_gpio_of_match); + +static struct platform_driver max7360_gpio_driver = { + .driver = { + .name = "max7360-gpio", + .of_match_table = max7360_gpio_of_match, + }, + .probe = max7360_gpio_probe, +}; +module_platform_driver(max7360_gpio_driver); + +MODULE_DESCRIPTION("MAX7360 GPIO driver"); +MODULE_AUTHOR("Kamel BOUHARA "); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpio/gpio-menz127.c b/drivers/gpio/gpio-menz127.c index ebe5da4933bce7..da2bf9381cc43c 100644 --- a/drivers/gpio/gpio-menz127.c +++ b/drivers/gpio/gpio-menz127.c @@ -12,6 +12,7 @@ #include #include #include +#include #define MEN_Z127_CTRL 0x00 #define MEN_Z127_PSR 0x04 @@ -30,7 +31,7 @@ (db <= MEN_Z127_DB_MAX_US)) struct men_z127_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *reg_base; struct resource *mem; }; @@ -64,7 +65,7 @@ static int men_z127_debounce(struct gpio_chip *gc, unsigned gpio, debounce /= 50; } - 
raw_spin_lock(&gc->bgpio_lock); + guard(gpio_generic_lock)(&priv->chip); db_en = readl(priv->reg_base + MEN_Z127_DBER); @@ -79,8 +80,6 @@ static int men_z127_debounce(struct gpio_chip *gc, unsigned gpio, writel(db_en, priv->reg_base + MEN_Z127_DBER); writel(db_cnt, priv->reg_base + GPIO_TO_DBCNT_REG(gpio)); - raw_spin_unlock(&gc->bgpio_lock); - return 0; } @@ -91,7 +90,8 @@ static int men_z127_set_single_ended(struct gpio_chip *gc, struct men_z127_gpio *priv = gpiochip_get_data(gc); u32 od_en; - raw_spin_lock(&gc->bgpio_lock); + guard(gpio_generic_lock)(&priv->chip); + od_en = readl(priv->reg_base + MEN_Z127_ODER); if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN) @@ -101,7 +101,6 @@ static int men_z127_set_single_ended(struct gpio_chip *gc, od_en &= ~BIT(offset); writel(od_en, priv->reg_base + MEN_Z127_ODER); - raw_spin_unlock(&gc->bgpio_lock); return 0; } @@ -137,6 +136,7 @@ static void men_z127_release_mem(void *data) static int men_z127_probe(struct mcb_device *mdev, const struct mcb_device_id *id) { + struct gpio_generic_chip_config config; struct men_z127_gpio *men_z127_gpio; struct device *dev = &mdev->dev; int ret; @@ -163,18 +163,21 @@ static int men_z127_probe(struct mcb_device *mdev, mcb_set_drvdata(mdev, men_z127_gpio); - ret = bgpio_init(&men_z127_gpio->gc, &mdev->dev, 4, - men_z127_gpio->reg_base + MEN_Z127_PSR, - men_z127_gpio->reg_base + MEN_Z127_CTRL, - NULL, - men_z127_gpio->reg_base + MEN_Z127_GPIODR, - NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &mdev->dev, + .sz = 4, + .dat = men_z127_gpio->reg_base + MEN_Z127_PSR, + .set = men_z127_gpio->reg_base + MEN_Z127_CTRL, + .dirout = men_z127_gpio->reg_base + MEN_Z127_GPIODR, + }; + + ret = gpio_generic_chip_init(&men_z127_gpio->chip, &config); if (ret) return ret; - men_z127_gpio->gc.set_config = men_z127_set_config; + men_z127_gpio->chip.gc.set_config = men_z127_set_config; - ret = devm_gpiochip_add_data(dev, &men_z127_gpio->gc, men_z127_gpio); + ret = devm_gpiochip_add_data(dev, &men_z127_gpio->chip.gc, men_z127_gpio); if (ret) return dev_err_probe(dev, ret, "failed to register MEN 16Z127 GPIO controller"); diff --git a/drivers/gpio/gpio-mlxbf.c b/drivers/gpio/gpio-mlxbf.c index 1fa9973f55b96a..a18fedbc463e67 100644 --- a/drivers/gpio/gpio-mlxbf.c +++ b/drivers/gpio/gpio-mlxbf.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ struct mlxbf_gpio_context_save_regs { /* Device state structure. 
*/ struct mlxbf_gpio_state { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* Memory Address */ void __iomem *base; @@ -49,6 +50,7 @@ struct mlxbf_gpio_state { static int mlxbf_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct mlxbf_gpio_state *gs; struct device *dev = &pdev->dev; struct gpio_chip *gc; @@ -62,21 +64,24 @@ static int mlxbf_gpio_probe(struct platform_device *pdev) if (IS_ERR(gs->base)) return PTR_ERR(gs->base); - gc = &gs->gc; - ret = bgpio_init(gc, dev, 8, - gs->base + MLXBF_GPIO_PIN_STATE, - NULL, - NULL, - gs->base + MLXBF_GPIO_PIN_DIR_O, - gs->base + MLXBF_GPIO_PIN_DIR_I, - 0); + gc = &gs->chip.gc; + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 8, + .dat = gs->base + MLXBF_GPIO_PIN_STATE, + .dirout = gs->base + MLXBF_GPIO_PIN_DIR_O, + .dirin = gs->base + MLXBF_GPIO_PIN_DIR_I, + }; + + ret = gpio_generic_chip_init(&gs->chip, &config); if (ret) return -ENODEV; gc->owner = THIS_MODULE; gc->ngpio = MLXBF_GPIO_NR; - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); + ret = devm_gpiochip_add_data(dev, &gs->chip.gc, gs); if (ret) { dev_err(&pdev->dev, "Failed adding memory mapped gpiochip\n"); return ret; diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 390f2e74a9d819..abffce3894fc44 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -6,8 +6,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -65,7 +67,7 @@ struct mlxbf2_gpio_context_save_regs { /* BlueField-2 gpio block context structure. */ struct mlxbf2_gpio_context { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* YU GPIO blocks address */ void __iomem *gpio_io; @@ -132,7 +134,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) u32 arm_gpio_lock_val; mutex_lock(yu_arm_gpio_lock_param.lock); - raw_spin_lock(&gs->gc.bgpio_lock); + gpio_generic_chip_lock(&gs->chip); arm_gpio_lock_val = readl(yu_arm_gpio_lock_param.io); @@ -140,7 +142,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * When lock active bit[31] is set, ModeX is write enabled */ if (YU_LOCK_ACTIVE_BIT(arm_gpio_lock_val)) { - raw_spin_unlock(&gs->gc.bgpio_lock); + gpio_generic_chip_unlock(&gs->chip); mutex_unlock(yu_arm_gpio_lock_param.lock); return -EINVAL; } @@ -154,11 +156,11 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * Release the YU arm_gpio_lock after changing the direction mode. 
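The two helpers around this comment pair up across function boundaries, which is why they use the new gpio_generic_chip_lock()/gpio_generic_chip_unlock() accessors instead of a scope-based guard: the generic chip's spinlock is taken inside the YU hardware-lock mutex in one function and released in another, in reverse order. The intended calling pattern, sketched with the mode-programming step elided:

ret = mlxbf2_gpio_lock_acquire(gs);	/* mutex, then chip lock; -EINVAL if the YU lock is active */
if (ret < 0)
	return ret;

/* ... program the direction mode registers ... */

mlxbf2_gpio_lock_release(gs);		/* chip lock, then mutex */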
*/ static void mlxbf2_gpio_lock_release(struct mlxbf2_gpio_context *gs) - __releases(&gs->gc.bgpio_lock) + __releases(&gs->chip.lock) __releases(yu_arm_gpio_lock_param.lock) { writel(YU_ARM_GPIO_LOCK_RELEASE, yu_arm_gpio_lock_param.io); - raw_spin_unlock(&gs->gc.bgpio_lock); + gpio_generic_chip_unlock(&gs->chip); mutex_unlock(yu_arm_gpio_lock_param.lock); } @@ -235,11 +237,10 @@ static void mlxbf2_gpio_irq_enable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, irqd_to_hwirq(irqd)); - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gs->chip); val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_CLRCAUSE); @@ -247,7 +248,6 @@ static void mlxbf2_gpio_irq_enable(struct irq_data *irqd) val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); } static void mlxbf2_gpio_irq_disable(struct irq_data *irqd) @@ -255,21 +255,21 @@ static void mlxbf2_gpio_irq_disable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - val &= ~BIT(offset); - writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + val = readl(gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); + val &= ~BIT(offset); + writel(val, gs->gpio_io + YU_GPIO_CAUSE_OR_EVTEN0); + } + gpiochip_disable_irq(gc, irqd_to_hwirq(irqd)); } static irqreturn_t mlxbf2_gpio_irq_handler(int irq, void *ptr) { struct mlxbf2_gpio_context *gs = ptr; - struct gpio_chip *gc = &gs->gc; + struct gpio_chip *gc = &gs->chip.gc; unsigned long pending; u32 level; @@ -288,7 +288,6 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf2_gpio_context *gs = gpiochip_get_data(gc); int offset = irqd_to_hwirq(irqd); - unsigned long flags; bool fall = false; bool rise = false; u32 val; @@ -308,7 +307,8 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) return -EINVAL; } - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gs->chip); + if (fall) { val = readl(gs->gpio_io + YU_GPIO_CAUSE_FALL_EN); val |= BIT(offset); @@ -320,7 +320,6 @@ mlxbf2_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) val |= BIT(offset); writel(val, gs->gpio_io + YU_GPIO_CAUSE_RISE_EN); } - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); return 0; } @@ -347,6 +346,7 @@ static const struct irq_chip mlxbf2_gpio_irq_chip = { static int mlxbf2_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct mlxbf2_gpio_context *gs; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; @@ -369,28 +369,25 @@ mlxbf2_gpio_probe(struct platform_device *pdev) return PTR_ERR(gs->gpio_io); ret = mlxbf2_gpio_get_lock_res(pdev); - if (ret) { - dev_err(dev, "Failed to get yu_arm_gpio_lock resource\n"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "Failed to get 
yu_arm_gpio_lock resource\n"); if (device_property_read_u32(dev, "npins", &npins)) npins = MLXBF2_GPIO_MAX_PINS_PER_BLOCK; - gc = &gs->gc; + gc = &gs->chip.gc; - ret = bgpio_init(gc, dev, 4, - gs->gpio_io + YU_GPIO_DATAIN, - gs->gpio_io + YU_GPIO_DATASET, - gs->gpio_io + YU_GPIO_DATACLEAR, - NULL, - NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = gs->gpio_io + YU_GPIO_DATAIN, + .set = gs->gpio_io + YU_GPIO_DATASET, + .clr = gs->gpio_io + YU_GPIO_DATACLEAR, + }; - if (ret) { - dev_err(dev, "bgpio_init failed\n"); - return ret; - } + ret = gpio_generic_chip_init(&gs->chip, &config); + if (ret) + return dev_err_probe(dev, ret, "failed to initialize the generic GPIO chip\n"); gc->direction_input = mlxbf2_gpio_direction_input; gc->direction_output = mlxbf2_gpio_direction_output; @@ -399,7 +396,7 @@ mlxbf2_gpio_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, 0); if (irq >= 0) { - girq = &gs->gc.irq; + girq = &gs->chip.gc.irq; gpio_irq_chip_set_chip(girq, &mlxbf2_gpio_irq_chip); girq->handler = handle_simple_irq; girq->default_type = IRQ_TYPE_NONE; @@ -414,19 +411,15 @@ mlxbf2_gpio_probe(struct platform_device *pdev) */ ret = devm_request_irq(dev, irq, mlxbf2_gpio_irq_handler, IRQF_SHARED, name, gs); - if (ret) { - dev_err(dev, "failed to request IRQ"); - return ret; - } + if (ret) + return dev_err_probe(dev, ret, "failed to request IRQ"); } platform_set_drvdata(pdev, gs); - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); - if (ret) { - dev_err(dev, "Failed adding memory mapped gpiochip\n"); - return ret; - } + ret = devm_gpiochip_add_data(dev, &gs->chip.gc, gs); + if (ret) + return dev_err_probe(dev, ret, "Failed adding memory mapped gpiochip\n"); return 0; } diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index ed29b07d16c190..4770578269bae8 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,7 @@ #define MLXBF_GPIO_CLR_ALL_INTS GENMASK(31, 0) struct mlxbf3_gpio_context { - struct gpio_chip gc; + struct gpio_generic_chip chip; /* YU GPIO block address */ void __iomem *gpio_set_io; @@ -58,18 +59,17 @@ static void mlxbf3_gpio_irq_enable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; gpiochip_enable_irq(gc, offset); - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gs->chip); + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); val |= BIT(offset); writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); } static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) @@ -77,16 +77,15 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - val &= ~BIT(offset); - writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + val = readl(gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); + val &= 
~BIT(offset); + writel(val, gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_EVTEN0); - writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); + writel(BIT(offset), gs->gpio_cause_io + MLXBF_GPIO_CAUSE_OR_CLRCAUSE); + } gpiochip_disable_irq(gc, offset); } @@ -94,7 +93,7 @@ static void mlxbf3_gpio_irq_disable(struct irq_data *irqd) static irqreturn_t mlxbf3_gpio_irq_handler(int irq, void *ptr) { struct mlxbf3_gpio_context *gs = ptr; - struct gpio_chip *gc = &gs->gc; + struct gpio_chip *gc = &gs->chip.gc; unsigned long pending; u32 level; @@ -113,37 +112,33 @@ mlxbf3_gpio_irq_set_type(struct irq_data *irqd, unsigned int type) struct gpio_chip *gc = irq_data_get_irq_chip_data(irqd); struct mlxbf3_gpio_context *gs = gpiochip_get_data(gc); irq_hw_number_t offset = irqd_to_hwirq(irqd); - unsigned long flags; u32 val; - raw_spin_lock_irqsave(&gs->gc.bgpio_lock, flags); - - switch (type & IRQ_TYPE_SENSE_MASK) { - case IRQ_TYPE_EDGE_BOTH: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - break; - case IRQ_TYPE_EDGE_RISING: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); - break; - case IRQ_TYPE_EDGE_FALLING: - val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - val |= BIT(offset); - writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); - break; - default: - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); - return -EINVAL; + scoped_guard(gpio_generic_lock_irqsave, &gs->chip) { + switch (type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_BOTH: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + break; + case IRQ_TYPE_EDGE_RISING: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_RISE_EN); + break; + case IRQ_TYPE_EDGE_FALLING: + val = readl(gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + val |= BIT(offset); + writel(val, gs->gpio_io + MLXBF_GPIO_CAUSE_FALL_EN); + break; + default: + return -EINVAL; + } } - raw_spin_unlock_irqrestore(&gs->gc.bgpio_lock, flags); - irq_set_handler_locked(irqd, handle_edge_irq); return 0; @@ -186,6 +181,7 @@ static int mlxbf3_gpio_add_pin_ranges(struct gpio_chip *chip) static int mlxbf3_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct mlxbf3_gpio_context *gs; struct gpio_irq_chip *girq; @@ -211,16 +207,23 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gs->gpio_clr_io = devm_platform_ioremap_resource(pdev, 3); if (IS_ERR(gs->gpio_clr_io)) return PTR_ERR(gs->gpio_clr_io); - gc = &gs->gc; - - ret = bgpio_init(gc, dev, 4, - gs->gpio_io + MLXBF_GPIO_READ_DATA_IN, - gs->gpio_set_io + MLXBF_GPIO_FW_DATA_OUT_SET, - gs->gpio_clr_io + MLXBF_GPIO_FW_DATA_OUT_CLEAR, - gs->gpio_set_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_SET, - gs->gpio_clr_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_CLEAR, 0); + gc = &gs->chip.gc; + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = gs->gpio_io + MLXBF_GPIO_READ_DATA_IN, + .set = gs->gpio_set_io + 
MLXBF_GPIO_FW_DATA_OUT_SET, + .clr = gs->gpio_clr_io + MLXBF_GPIO_FW_DATA_OUT_CLEAR, + .dirout = gs->gpio_set_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_SET, + .dirin = gs->gpio_clr_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_CLEAR, + }; + + ret = gpio_generic_chip_init(&gs->chip, &config); if (ret) - return dev_err_probe(dev, ret, "%s: bgpio_init() failed", __func__); + return dev_err_probe(dev, ret, + "%s: failed to initialize the generic GPIO chip", + __func__); gc->request = gpiochip_generic_request; gc->free = gpiochip_generic_free; @@ -229,7 +232,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) irq = platform_get_irq_optional(pdev, 0); if (irq >= 0) { - girq = &gs->gc.irq; + girq = &gs->chip.gc.irq; gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); girq->default_type = IRQ_TYPE_NONE; /* This will let us handle the parent IRQ in the driver */ @@ -250,7 +253,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gs); - ret = devm_gpiochip_add_data(dev, &gs->gc, gs); + ret = devm_gpiochip_add_data(dev, gc, gs); if (ret) dev_err_probe(dev, ret, "Failed adding memory mapped gpiochip\n"); diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index 021ad62778c2f4..7d6dd36cf1aeff 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -57,6 +57,7 @@ o ` ~~~~\___/~~~~ ` controller in FPGA is ,.` #include #include +#include #include "gpiolib.h" @@ -124,20 +125,23 @@ static unsigned long bgpio_read32be(void __iomem *reg) static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line) { - if (gc->be_bits) - return BIT(gc->bgpio_bits - 1 - line); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (chip->be_bits) + return BIT(chip->bits - 1 - line); return BIT(line); } static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long pinmask = bgpio_line2mask(gc, gpio); - bool dir = !!(gc->bgpio_dir & pinmask); + bool dir = !!(chip->sdir & pinmask); if (dir) - return !!(gc->read_reg(gc->reg_set) & pinmask); - else - return !!(gc->read_reg(gc->reg_dat) & pinmask); + return !!(chip->read_reg(chip->reg_set) & pinmask); + + return !!(chip->read_reg(chip->reg_dat) & pinmask); } /* @@ -147,26 +151,28 @@ static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - unsigned long get_mask = 0; - unsigned long set_mask = 0; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long get_mask = 0, set_mask = 0; /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - set_mask = *mask & gc->bgpio_dir; - get_mask = *mask & ~gc->bgpio_dir; + set_mask = *mask & chip->sdir; + get_mask = *mask & ~chip->sdir; if (set_mask) - *bits |= gc->read_reg(gc->reg_set) & set_mask; + *bits |= chip->read_reg(chip->reg_set) & set_mask; if (get_mask) - *bits |= gc->read_reg(gc->reg_dat) & get_mask; + *bits |= chip->read_reg(chip->reg_dat) & get_mask; return 0; } static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) { - return !!(gc->read_reg(gc->reg_dat) & bgpio_line2mask(gc, gpio)); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + return !!(chip->read_reg(chip->reg_dat) & bgpio_line2mask(gc, gpio)); } /* @@ -175,9 +181,11 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) 
{ + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - *bits |= gc->read_reg(gc->reg_dat) & *mask; + *bits |= chip->read_reg(chip->reg_dat) & *mask; return 0; } @@ -187,6 +195,7 @@ static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long readmask = 0; unsigned long val; int bit; @@ -199,7 +208,7 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, readmask |= bgpio_line2mask(gc, bit); /* Read the register */ - val = gc->read_reg(gc->reg_dat) & readmask; + val = chip->read_reg(chip->reg_dat) & readmask; /* * Mirror the result into the "bits" result, this will give line 0 @@ -218,19 +227,20 @@ static int bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_dat, gc->bgpio_data); + chip->write_reg(chip->reg_dat, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -238,31 +248,32 @@ static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); if (val) - gc->write_reg(gc->reg_set, mask); + chip->write_reg(chip->reg_set, mask); else - gc->write_reg(gc->reg_clr, mask); + chip->write_reg(chip->reg_clr, mask); return 0; } static int bgpio_set_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = bgpio_line2mask(gc, gpio); - unsigned long flags; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long mask = bgpio_line2mask(gc, gpio), flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_set, gc->bgpio_data); + chip->write_reg(chip->reg_set, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -272,12 +283,13 @@ static void bgpio_multiple_get_masks(struct gpio_chip *gc, unsigned long *set_mask, unsigned long *clear_mask) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); int i; *set_mask = 0; *clear_mask = 0; - for_each_set_bit(i, mask, gc->bgpio_bits) { + for_each_set_bit(i, mask, chip->bits) { if (test_bit(i, bits)) *set_mask |= bgpio_line2mask(gc, i); else @@ -290,25 +302,27 @@ static void bgpio_set_multiple_single_reg(struct gpio_chip *gc, unsigned long *bits, void __iomem *reg) { - unsigned long flags; - unsigned long set_mask, clear_mask; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long flags, set_mask, clear_mask; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); 
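/*
 * Aside (not part of the patch): the shadow word is what makes these set
 * helpers correct. reg_dat or reg_set may be write-only on some hardware,
 * so the last value written is cached in chip->sdata, and every
 * read-modify-write of that cache plus the register must happen under
 * chip->lock. The guard()/scoped_guard() conversions in the drivers above
 * take this same lock; the core helpers here still open-code the
 * lock/unlock pair.
 */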
bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); - gc->bgpio_data |= set_mask; - gc->bgpio_data &= ~clear_mask; + chip->sdata |= set_mask; + chip->sdata &= ~clear_mask; - gc->write_reg(reg, gc->bgpio_data); + chip->write_reg(reg, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_dat); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_dat); return 0; } @@ -316,7 +330,9 @@ static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_set_multiple_set(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_set); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_set); return 0; } @@ -325,21 +341,24 @@ static int bgpio_set_multiple_with_clear(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long set_mask, clear_mask; bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); if (set_mask) - gc->write_reg(gc->reg_set, set_mask); + chip->write_reg(chip->reg_set, set_mask); if (clear_mask) - gc->write_reg(gc->reg_clr, clear_mask); + chip->write_reg(chip->reg_clr, clear_mask); return 0; } static int bgpio_dir_return(struct gpio_chip *gc, unsigned int gpio, bool dir_out) { - if (!gc->bgpio_pinctrl) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (!chip->pinctrl) return 0; if (dir_out) @@ -374,39 +393,42 @@ static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio, static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); + chip->sdir &= ~bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return bgpio_dir_return(gc, gpio, false); } static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Return 0 if output, 1 if input */ - if (gc->bgpio_dir_unreadable) { - if (gc->bgpio_dir & bgpio_line2mask(gc, gpio)) + if (chip->dir_unreadable) { + if (chip->sdir & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_out) { - if (gc->read_reg(gc->reg_dir_out) & bgpio_line2mask(gc, gpio)) + if (chip->reg_dir_out) { + if (chip->read_reg(chip->reg_dir_out) & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_in) - if (!(gc->read_reg(gc->reg_dir_in) & bgpio_line2mask(gc, gpio))) + if (chip->reg_dir_in) + if (!(chip->read_reg(chip->reg_dir_in) & bgpio_line2mask(gc, gpio))) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; @@ -414,18 
+436,19 @@ static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) static void bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir |= bgpio_line2mask(gc, gpio); + chip->sdir |= bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_dir_out_dir_first(struct gpio_chip *gc, unsigned int gpio, @@ -445,31 +468,30 @@ static int bgpio_dir_out_val_first(struct gpio_chip *gc, unsigned int gpio, } static int bgpio_setup_accessors(struct device *dev, - struct gpio_chip *gc, + struct gpio_generic_chip *chip, bool byte_be) { - - switch (gc->bgpio_bits) { + switch (chip->bits) { case 8: - gc->read_reg = bgpio_read8; - gc->write_reg = bgpio_write8; + chip->read_reg = bgpio_read8; + chip->write_reg = bgpio_write8; break; case 16: if (byte_be) { - gc->read_reg = bgpio_read16be; - gc->write_reg = bgpio_write16be; + chip->read_reg = bgpio_read16be; + chip->write_reg = bgpio_write16be; } else { - gc->read_reg = bgpio_read16; - gc->write_reg = bgpio_write16; + chip->read_reg = bgpio_read16; + chip->write_reg = bgpio_write16; } break; case 32: if (byte_be) { - gc->read_reg = bgpio_read32be; - gc->write_reg = bgpio_write32be; + chip->read_reg = bgpio_read32be; + chip->write_reg = bgpio_write32be; } else { - gc->read_reg = bgpio_read32; - gc->write_reg = bgpio_write32; + chip->read_reg = bgpio_read32; + chip->write_reg = bgpio_write32; } break; #if BITS_PER_LONG >= 64 @@ -479,13 +501,13 @@ static int bgpio_setup_accessors(struct device *dev, "64 bit big endian byte order unsupported\n"); return -EINVAL; } else { - gc->read_reg = bgpio_read64; - gc->write_reg = bgpio_write64; + chip->read_reg = bgpio_read64; + chip->write_reg = bgpio_write64; } break; #endif /* BITS_PER_LONG >= 64 */ default: - dev_err(dev, "unsupported data width %u bits\n", gc->bgpio_bits); + dev_err(dev, "unsupported data width %u bits\n", chip->bits); return -EINVAL; } @@ -514,27 +536,25 @@ static int bgpio_setup_accessors(struct device *dev, * - an input direction register (named "dirin") where a 1 bit indicates * the GPIO is an input. 
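 * As an illustration (hypothetical offsets, not taken from any driver in
 * this series), a controller with a data register at base + 0x0, set and
 * clear registers at base + 0x4 and base + 0x8, and an output-direction
 * register at base + 0xc would be described as:
 *
 *	config.dat = base + 0x0;
 *	config.set = base + 0x4;
 *	config.clr = base + 0x8;
 *	config.dirout = base + 0xc;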
*/ -static int bgpio_setup_io(struct gpio_chip *gc, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - unsigned long flags) +static int bgpio_setup_io(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; - gc->reg_dat = dat; - if (!gc->reg_dat) + chip->reg_dat = cfg->dat; + if (!chip->reg_dat) return -EINVAL; - if (set && clr) { - gc->reg_set = set; - gc->reg_clr = clr; + if (cfg->set && cfg->clr) { + chip->reg_set = cfg->set; + chip->reg_clr = cfg->clr; gc->set = bgpio_set_with_clear; gc->set_multiple = bgpio_set_multiple_with_clear; - } else if (set && !clr) { - gc->reg_set = set; + } else if (cfg->set && !cfg->clr) { + chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -542,10 +562,10 @@ static int bgpio_setup_io(struct gpio_chip *gc, gc->set_multiple = bgpio_set_multiple; } - if (!(flags & BGPIOF_UNREADABLE_REG_SET) && - (flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & GPIO_GENERIC_UNREADABLE_REG_SET) && + (cfg->flags & GPIO_GENERIC_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; - if (!gc->be_bits) + if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; /* * We deliberately avoid assigning the ->get_multiple() call @@ -556,7 +576,7 @@ static int bgpio_setup_io(struct gpio_chip *gc, */ } else { gc->get = bgpio_get; - if (gc->be_bits) + if (chip->be_bits) gc->get_multiple = bgpio_get_multiple_be; else gc->get_multiple = bgpio_get_multiple; @@ -565,27 +585,27 @@ static int bgpio_setup_io(struct gpio_chip *gc, return 0; } -static int bgpio_setup_direction(struct gpio_chip *gc, - void __iomem *dirout, - void __iomem *dirin, - unsigned long flags) +static int bgpio_setup_direction(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { - if (dirout || dirin) { - gc->reg_dir_out = dirout; - gc->reg_dir_in = dirin; - if (flags & BGPIOF_NO_SET_ON_INPUT) + struct gpio_chip *gc = &chip->gc; + + if (cfg->dirout || cfg->dirin) { + chip->reg_dir_out = cfg->dirout; + chip->reg_dir_in = cfg->dirin; + if (cfg->flags & GPIO_GENERIC_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (flags & BGPIOF_NO_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -594,117 +614,101 @@ static int bgpio_setup_direction(struct gpio_chip *gc, return 0; } -static int bgpio_request(struct gpio_chip *chip, unsigned gpio_pin) +static int bgpio_request(struct gpio_chip *gc, unsigned int gpio_pin) { - if (gpio_pin >= chip->ngpio) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (gpio_pin >= gc->ngpio) return -EINVAL; - if (chip->bgpio_pinctrl) - return gpiochip_generic_request(chip, gpio_pin); + if (chip->pinctrl) + return gpiochip_generic_request(gc, gpio_pin); return 0; } /** - * bgpio_init() - Initialize generic GPIO accessor functions - * @gc: the GPIO chip to set up - * @dev: the parent device of the new GPIO chip (compulsory) - * @sz: the size (width) of the MMIO registers in bytes, 
typically 1, 2 or 4 - * @dat: MMIO address for the register to READ the value of the GPIO lines, it - * is expected that a 1 in the corresponding bit in this register means the - * line is asserted - * @set: MMIO address for the register to SET the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * high. - * @clr: MMIO address for the register to CLEAR the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * low. It is allowed to leave this address as NULL, in that case the SET register - * will be assumed to also clear the GPIO lines, by actively writing the line - * with 0. - * @dirout: MMIO address for the register to set the line as OUTPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * output line. Conversely, setting the line to 0 will turn that line into - * an input. - * @dirin: MMIO address for the register to set this line as INPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * input line. Conversely, setting the line to 0 will turn that line into - * an output. - * @flags: Different flags that will affect the behaviour of the device, such as - * endianness etc. + * gpio_generic_chip_init() - Initialize a generic GPIO chip. + * @chip: Generic GPIO chip to set up. + * @cfg: Generic GPIO chip configuration. + * + * Returns 0 on success, negative error number on failure. */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags) +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; + unsigned long flags = cfg->flags; + struct device *dev = cfg->dev; int ret; - if (!is_power_of_2(sz)) + if (!is_power_of_2(cfg->sz)) return -EINVAL; - gc->bgpio_bits = sz * 8; - if (gc->bgpio_bits > BITS_PER_LONG) + chip->bits = cfg->sz * 8; + if (chip->bits > BITS_PER_LONG) return -EINVAL; - raw_spin_lock_init(&gc->bgpio_lock); + raw_spin_lock_init(&chip->lock); gc->parent = dev; gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - gc->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & GPIO_GENERIC_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) - gc->ngpio = gc->bgpio_bits; + gc->ngpio = chip->bits; - ret = bgpio_setup_io(gc, dat, set, clr, flags); + ret = bgpio_setup_io(chip, cfg); if (ret) return ret; - ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + ret = bgpio_setup_accessors(dev, chip, + flags & GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; - ret = bgpio_setup_direction(gc, dirout, dirin, flags); + ret = bgpio_setup_direction(chip, cfg); if (ret) return ret; - if (flags & BGPIOF_PINCTRL_BACKEND) { - gc->bgpio_pinctrl = true; + if (flags & GPIO_GENERIC_PINCTRL_BACKEND) { + chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; } - gc->bgpio_data = gc->read_reg(gc->reg_dat); + chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && - !(flags & BGPIOF_UNREADABLE_REG_SET)) - gc->bgpio_data = gc->read_reg(gc->reg_set); + !(flags & GPIO_GENERIC_UNREADABLE_REG_SET)) + chip->sdata = chip->read_reg(chip->reg_set); - if (flags & BGPIOF_UNREADABLE_REG_DIR) - gc->bgpio_dir_unreadable = 
true; + if (flags & GPIO_GENERIC_UNREADABLE_REG_DIR) + chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. */ - if ((gc->reg_dir_out || gc->reg_dir_in) && - !(flags & BGPIOF_UNREADABLE_REG_DIR)) { - if (gc->reg_dir_out) - gc->bgpio_dir = gc->read_reg(gc->reg_dir_out); - else if (gc->reg_dir_in) - gc->bgpio_dir = ~gc->read_reg(gc->reg_dir_in); + if ((chip->reg_dir_out || chip->reg_dir_in) && + !(flags & GPIO_GENERIC_UNREADABLE_REG_DIR)) { + if (chip->reg_dir_out) + chip->sdir = chip->read_reg(chip->reg_dir_out); + else if (chip->reg_dir_in) + chip->sdir = ~chip->read_reg(chip->reg_dir_in); /* * If we have two direction registers, synchronise * input setting to output setting, the library * can not handle a line being input and output at * the same time. */ - if (gc->reg_dir_out && gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); + if (chip->reg_dir_out && chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); } return ret; } -EXPORT_SYMBOL_GPL(bgpio_init); +EXPORT_SYMBOL_GPL(gpio_generic_chip_init); #if IS_ENABLED(CONFIG_GPIO_GENERIC_PLATFORM) @@ -730,12 +734,15 @@ static const struct of_device_id bgpio_of_match[] = { { .compatible = "brcm,bcm6345-gpio" }, { .compatible = "wd,mbl-gpio" }, { .compatible = "ni,169445-nand-gpio" }, + { .compatible = "intel,ixp4xx-expansion-bus-mmio-gpio" }, { } }; MODULE_DEVICE_TABLE(of, bgpio_of_match); static int bgpio_pdev_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; + struct gpio_generic_chip *gen_gc; struct device *dev = &pdev->dev; struct resource *r; void __iomem *dat; @@ -747,7 +754,6 @@ static int bgpio_pdev_probe(struct platform_device *pdev) unsigned long flags = 0; unsigned int base; int err; - struct gpio_chip *gc; const char *label; r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dat"); @@ -776,23 +782,34 @@ static int bgpio_pdev_probe(struct platform_device *pdev) if (IS_ERR(dirin)) return PTR_ERR(dirin); - gc = devm_kzalloc(&pdev->dev, sizeof(*gc), GFP_KERNEL); - if (!gc) + gen_gc = devm_kzalloc(&pdev->dev, sizeof(*gen_gc), GFP_KERNEL); + if (!gen_gc) return -ENOMEM; if (device_is_big_endian(dev)) - flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; if (device_property_read_bool(dev, "no-output")) - flags |= BGPIOF_NO_OUTPUT; - - err = bgpio_init(gc, dev, sz, dat, set, clr, dirout, dirin, flags); + flags |= GPIO_GENERIC_NO_OUTPUT; + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = sz, + .dat = dat, + .set = set, + .clr = clr, + .dirout = dirout, + .dirin = dirin, + .flags = flags, + }; + + err = gpio_generic_chip_init(gen_gc, &config); if (err) return err; err = device_property_read_string(dev, "label", &label); if (!err) - gc->label = label; + gen_gc->gc.label = label; /* * This property *must not* be used in device-tree sources, it's only @@ -800,11 +817,11 @@ static int bgpio_pdev_probe(struct platform_device *pdev) */ err = device_property_read_u32(dev, "gpio-mmio,base", &base); if (!err && base <= INT_MAX) - gc->base = base; + gen_gc->gc.base = base; - platform_set_drvdata(pdev, gc); + platform_set_drvdata(pdev, &gen_gc->gc); - return devm_gpiochip_add_data(&pdev->dev, gc, NULL); + return devm_gpiochip_add_data(&pdev->dev, &gen_gc->gc, NULL); } static const struct platform_device_id bgpio_id_table[] = { diff --git a/drivers/gpio/gpio-mpc5200.c b/drivers/gpio/gpio-mpc5200.c index dad0eca1ca2ead..00f209157fd0f0 100644 --- a/drivers/gpio/gpio-mpc5200.c +++ 
b/drivers/gpio/gpio-mpc5200.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -19,7 +19,8 @@ static DEFINE_SPINLOCK(gpio_lock); struct mpc52xx_gpiochip { - struct of_mm_gpio_chip mmchip; + struct gpio_chip gc; + void __iomem *regs; unsigned int shadow_dvo; unsigned int shadow_gpioe; unsigned int shadow_ddr; @@ -43,8 +44,8 @@ struct mpc52xx_gpiochip { */ static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned int ret; ret = (in_8(&regs->wkup_ival) >> (7 - gpio)) & 1; @@ -57,9 +58,8 @@ static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; if (val) chip->shadow_dvo |= 1 << (7 - gpio); @@ -87,9 +87,8 @@ mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -110,9 +109,8 @@ static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs; struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio_wkup __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -136,30 +134,41 @@ mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev) { + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; struct mpc52xx_gpiochip *chip; struct mpc52xx_gpio_wkup __iomem *regs; struct gpio_chip *gc; int ret; - chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; platform_set_drvdata(ofdev, chip); - gc = &chip->mmchip.gc; + gc = &chip->gc; + gc->base = -1; gc->ngpio = 8; gc->direction_input = mpc52xx_wkup_gpio_dir_in; gc->direction_output = mpc52xx_wkup_gpio_dir_out; gc->get = mpc52xx_wkup_gpio_get; gc->set = mpc52xx_wkup_gpio_set; - ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + ret = devm_gpiochip_add_data(dev, gc, chip); if (ret) return ret; - regs = chip->mmchip.regs; + regs = chip->regs; chip->shadow_gpioe = in_8(&regs->wkup_gpioe); chip->shadow_ddr = in_8(&regs->wkup_ddr); chip->shadow_dvo = in_8(&regs->wkup_dvo); @@ -167,13 +176,6 @@ static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev) return 0; } -static void
mpc52xx_gpiochip_remove(struct platform_device *ofdev) -{ - struct mpc52xx_gpiochip *chip = platform_get_drvdata(ofdev); - - of_mm_gpiochip_remove(&chip->mmchip); -} - static const struct of_device_id mpc52xx_wkup_gpiochip_match[] = { { .compatible = "fsl,mpc5200-gpio-wkup", }, {} @@ -185,7 +187,6 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = { .of_match_table = mpc52xx_wkup_gpiochip_match, }, .probe = mpc52xx_wkup_gpiochip_probe, - .remove = mpc52xx_gpiochip_remove, }; /* @@ -207,8 +208,8 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = { */ static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned int ret; ret = (in_be32(&regs->simple_ival) >> (31 - gpio)) & 1; @@ -219,9 +220,8 @@ static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio) static inline void __mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; if (val) chip->shadow_dvo |= 1 << (31 - gpio); @@ -248,9 +248,8 @@ mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -271,9 +270,8 @@ static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) static int mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc); - struct mpc52xx_gpio __iomem *regs = mm_gc->regs; + struct mpc52xx_gpio __iomem *regs = chip->regs; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); @@ -298,30 +296,41 @@ mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int mpc52xx_simple_gpiochip_probe(struct platform_device *ofdev) { + struct device *dev = &ofdev->dev; + struct device_node *np = dev->of_node; struct mpc52xx_gpiochip *chip; struct gpio_chip *gc; struct mpc52xx_gpio __iomem *regs; int ret; - chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; platform_set_drvdata(ofdev, chip); - gc = &chip->mmchip.gc; + gc = &chip->gc; + gc->base = -1; gc->ngpio = 32; gc->direction_input = mpc52xx_simple_gpio_dir_in; gc->direction_output = mpc52xx_simple_gpio_dir_out; gc->get = mpc52xx_simple_gpio_get; gc->set = mpc52xx_simple_gpio_set; - ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip); + gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np); + if (!gc->label) + return -ENOMEM; + + chip->regs = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(chip->regs)) + return PTR_ERR(chip->regs); + + ret = devm_gpiochip_add_data(dev, gc, chip); if (ret) return ret; - regs = chip->mmchip.regs; + regs = chip->regs; chip->shadow_gpioe = in_be32(&regs->simple_gpioe); chip->shadow_ddr =
in_be32(&regs->simple_ddr); chip->shadow_dvo = in_be32(&regs->simple_dvo); @@ -340,7 +349,6 @@ static struct platform_driver mpc52xx_simple_gpiochip_driver = { .of_match_table = mpc52xx_simple_gpiochip_match, }, .probe = mpc52xx_simple_gpiochip_probe, - .remove = mpc52xx_gpiochip_remove, }; static struct platform_driver * const drivers[] = { diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 121efdd71e451d..bfe828734ee1ba 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,7 @@ #define GPIO_IBE 0x18 struct mpc8xxx_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; raw_spinlock_t lock; @@ -66,9 +67,11 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio) struct mpc8xxx_gpio_chip *mpc8xxx_gc = gpiochip_get_data(gc); u32 out_mask, out_shadow; - out_mask = gc->read_reg(mpc8xxx_gc->regs + GPIO_DIR); - val = gc->read_reg(mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; - out_shadow = gc->bgpio_data & out_mask; + out_mask = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DIR); + val = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; + out_shadow = mpc8xxx_gc->chip.sdata & out_mask; return !!((val | out_shadow) & mpc_pin2mask(gpio)); } @@ -108,12 +111,13 @@ static int mpc8xxx_gpio_to_irq(struct gpio_chip *gc, unsigned offset) static irqreturn_t mpc8xxx_gpio_irq_cascade(int irq, void *data) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = data; - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long mask; int i; - mask = gc->read_reg(mpc8xxx_gc->regs + GPIO_IER) - & gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR); + mask = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IER) & + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR); for_each_set_bit(i, &mask, 32) generic_handle_domain_irq(mpc8xxx_gc->irq, 31 - i); @@ -124,15 +128,17 @@ static void mpc8xxx_irq_unmask(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); irq_hw_number_t hwirq = irqd_to_hwirq(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; + struct gpio_chip *gc = &mpc8xxx_gc->chip.gc; unsigned long flags; gpiochip_enable_irq(gc, hwirq); raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR) | mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); @@ -142,13 +148,14 @@ static void mpc8xxx_irq_mask(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); irq_hw_number_t hwirq = irqd_to_hwirq(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; + struct gpio_chip *gc = &mpc8xxx_gc->chip.gc; unsigned long flags; raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_IMR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IMR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR) & ~mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); @@ -159,32 +166,34 @@ static void mpc8xxx_irq_mask(struct irq_data *d) static void mpc8xxx_irq_ack(struct irq_data *d) { struct mpc8xxx_gpio_chip *mpc8xxx_gc =
irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; - gc->write_reg(mpc8xxx_gc->regs + GPIO_IER, + gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IER, mpc_pin2mask(irqd_to_hwirq(d))); } static int mpc8xxx_irq_set_type(struct irq_data *d, unsigned int flow_type) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long flags; switch (flow_type) { case IRQ_TYPE_EDGE_FALLING: case IRQ_TYPE_LEVEL_LOW: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_ICR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_ICR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR) | mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; case IRQ_TYPE_EDGE_BOTH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(mpc8xxx_gc->regs + GPIO_ICR, - gc->read_reg(mpc8xxx_gc->regs + GPIO_ICR) + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR, + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_ICR) & ~mpc_pin2mask(irqd_to_hwirq(d))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -199,7 +208,6 @@ static int mpc8xxx_irq_set_type(struct irq_data *d, unsigned int flow_type) static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) { struct mpc8xxx_gpio_chip *mpc8xxx_gc = irq_data_get_irq_chip_data(d); - struct gpio_chip *gc = &mpc8xxx_gc->gc; unsigned long gpio = irqd_to_hwirq(d); void __iomem *reg; unsigned int shift; @@ -217,7 +225,9 @@ static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) case IRQ_TYPE_EDGE_FALLING: case IRQ_TYPE_LEVEL_LOW: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift)) + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift)) | (2 << shift)); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -225,14 +235,18 @@ static int mpc512x_irq_set_type(struct irq_data *d, unsigned int flow_type) case IRQ_TYPE_EDGE_RISING: case IRQ_TYPE_LEVEL_HIGH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift)) + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift)) | (1 << shift)); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; case IRQ_TYPE_EDGE_BOTH: raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); - gc->write_reg(reg, (gc->read_reg(reg) & ~(3 << shift))); + gpio_generic_write_reg(&mpc8xxx_gc->chip, reg, + (gpio_generic_read_reg(&mpc8xxx_gc->chip, + reg) & ~(3 << shift))); raw_spin_unlock_irqrestore(&mpc8xxx_gc->lock, flags); break; @@ -309,6 +323,7 @@ static const struct of_device_id mpc8xxx_gpio_ids[] = { static int mpc8xxx_probe(struct platform_device *pdev) { const struct mpc8xxx_gpio_devtype *devtype = NULL; + struct gpio_generic_chip_config config; struct mpc8xxx_gpio_chip *mpc8xxx_gc; struct device *dev = &pdev->dev; struct fwnode_handle *fwnode; @@ -327,26 +342,28 @@ static int mpc8xxx_probe(struct platform_device *pdev) if (IS_ERR(mpc8xxx_gc->regs)) return PTR_ERR(mpc8xxx_gc->regs); - gc = &mpc8xxx_gc->gc; + gc = &mpc8xxx_gc->chip.gc; gc->parent = dev; + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = mpc8xxx_gc->regs + GPIO_DAT, + .dirout = 
mpc8xxx_gc->regs + GPIO_DIR, + .flags = GPIO_GENERIC_BIG_ENDIAN + }; + if (device_property_read_bool(dev, "little-endian")) { - ret = bgpio_init(gc, dev, 4, mpc8xxx_gc->regs + GPIO_DAT, - NULL, NULL, mpc8xxx_gc->regs + GPIO_DIR, - NULL, BGPIOF_BIG_ENDIAN); - if (ret) - return ret; dev_dbg(dev, "GPIO registers are LITTLE endian\n"); } else { - ret = bgpio_init(gc, dev, 4, mpc8xxx_gc->regs + GPIO_DAT, - NULL, NULL, mpc8xxx_gc->regs + GPIO_DIR, - NULL, BGPIOF_BIG_ENDIAN - | BGPIOF_BIG_ENDIAN_BYTE_ORDER); - if (ret) - return ret; + config.flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; dev_dbg(dev, "GPIO registers are BIG endian\n"); } + ret = gpio_generic_chip_init(&mpc8xxx_gc->chip, &config); + if (ret) + return ret; + mpc8xxx_gc->direction_output = gc->direction_output; devtype = device_get_match_data(dev); @@ -379,10 +396,14 @@ static int mpc8xxx_probe(struct platform_device *pdev) device_is_compatible(dev, "fsl,ls1028a-gpio") || device_is_compatible(dev, "fsl,ls1088a-gpio") || is_acpi_node(fwnode)) { - gc->write_reg(mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); /* Also, latch state of GPIOs configured as output by bootloader. */ - gc->bgpio_data = gc->read_reg(mpc8xxx_gc->regs + GPIO_DAT) & - gc->read_reg(mpc8xxx_gc->regs + GPIO_DIR); + mpc8xxx_gc->chip.sdata = + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DAT) & + gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_DIR); } ret = devm_gpiochip_add_data(dev, gc, mpc8xxx_gc); @@ -405,8 +426,10 @@ static int mpc8xxx_probe(struct platform_device *pdev) return 0; /* ack and mask all irqs */ - gc->write_reg(mpc8xxx_gc->regs + GPIO_IER, 0xffffffff); - gc->write_reg(mpc8xxx_gc->regs + GPIO_IMR, 0); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IER, 0xffffffff); + gpio_generic_write_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->regs + GPIO_IMR, 0); ret = devm_request_irq(dev, mpc8xxx_gc->irqn, mpc8xxx_gpio_irq_cascade, diff --git a/drivers/gpio/gpio-mpfs.c b/drivers/gpio/gpio-mpfs.c index 82d557a7e5d8d5..9468795b96348a 100644 --- a/drivers/gpio/gpio-mpfs.c +++ b/drivers/gpio/gpio-mpfs.c @@ -69,7 +69,7 @@ static int mpfs_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio_in struct mpfs_gpio_chip *mpfs_gpio = gpiochip_get_data(gc); regmap_update_bits(mpfs_gpio->regs, MPFS_GPIO_CTRL(gpio_index), - MPFS_GPIO_DIR_MASK, MPFS_GPIO_EN_IN); + MPFS_GPIO_DIR_MASK, MPFS_GPIO_EN_OUT | MPFS_GPIO_EN_OUT_BUF); regmap_update_bits(mpfs_gpio->regs, mpfs_gpio->offsets->outp, BIT(gpio_index), value << gpio_index); diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index 93facbebb80efa..91230be5158790 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -6,11 +6,11 @@ #include #include +#include #include #include #include #include -#include #define MTK_BANK_CNT 3 #define MTK_BANK_WIDTH 32 @@ -30,8 +30,7 @@ struct mtk_gc { struct irq_chip irq_chip; - struct gpio_chip chip; - spinlock_t lock; + struct gpio_generic_chip chip; int bank; u32 rising; u32 falling; @@ -59,27 +58,29 @@ struct mtk { static inline struct mtk_gc * to_mediatek_gpio(struct gpio_chip *chip) { - return container_of(chip, struct mtk_gc, chip); + struct gpio_generic_chip *gen_gc = to_gpio_generic_chip(chip); + + return container_of(gen_gc, struct mtk_gc, chip); } static inline void mtk_gpio_w32(struct mtk_gc *rg, u32 offset, u32 val) { - struct gpio_chip *gc = &rg->chip; + struct gpio_chip *gc = &rg->chip.gc; struct mtk 
*mtk = gpiochip_get_data(gc); offset = (rg->bank * GPIO_BANK_STRIDE) + offset; - gc->write_reg(mtk->base + offset, val); + gpio_generic_write_reg(&rg->chip, mtk->base + offset, val); } static inline u32 mtk_gpio_r32(struct mtk_gc *rg, u32 offset) { - struct gpio_chip *gc = &rg->chip; + struct gpio_chip *gc = &rg->chip.gc; struct mtk *mtk = gpiochip_get_data(gc); offset = (rg->bank * GPIO_BANK_STRIDE) + offset; - return gc->read_reg(mtk->base + offset); + return gpio_generic_read_reg(&rg->chip, mtk->base + offset); } static irqreturn_t @@ -108,12 +109,12 @@ mediatek_gpio_irq_unmask(struct irq_data *d) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct mtk_gc *rg = to_mediatek_gpio(gc); int pin = d->hwirq; - unsigned long flags; u32 rise, fall, high, low; gpiochip_enable_irq(gc, d->hwirq); - spin_lock_irqsave(&rg->lock, flags); + guard(gpio_generic_lock_irqsave)(&rg->chip); + rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); high = mtk_gpio_r32(rg, GPIO_REG_HLVL); @@ -122,7 +123,6 @@ mediatek_gpio_irq_unmask(struct irq_data *d) mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall | (BIT(pin) & rg->falling)); mtk_gpio_w32(rg, GPIO_REG_HLVL, high | (BIT(pin) & rg->hlevel)); mtk_gpio_w32(rg, GPIO_REG_LLVL, low | (BIT(pin) & rg->llevel)); - spin_unlock_irqrestore(&rg->lock, flags); } static void @@ -131,19 +131,18 @@ mediatek_gpio_irq_mask(struct irq_data *d) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct mtk_gc *rg = to_mediatek_gpio(gc); int pin = d->hwirq; - unsigned long flags; u32 rise, fall, high, low; - spin_lock_irqsave(&rg->lock, flags); - rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); - fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); - high = mtk_gpio_r32(rg, GPIO_REG_HLVL); - low = mtk_gpio_r32(rg, GPIO_REG_LLVL); - mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_REDGE, rise & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_HLVL, high & ~BIT(pin)); - mtk_gpio_w32(rg, GPIO_REG_LLVL, low & ~BIT(pin)); - spin_unlock_irqrestore(&rg->lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &rg->chip) { + rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); + fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); + high = mtk_gpio_r32(rg, GPIO_REG_HLVL); + low = mtk_gpio_r32(rg, GPIO_REG_LLVL); + mtk_gpio_w32(rg, GPIO_REG_FEDGE, fall & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_REDGE, rise & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_HLVL, high & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_LLVL, low & ~BIT(pin)); + } gpiochip_disable_irq(gc, d->hwirq); } @@ -220,6 +219,7 @@ static const struct irq_chip mt7621_irq_chip = { static int mediatek_gpio_bank_probe(struct device *dev, int bank) { + struct gpio_generic_chip_config config; struct mtk *mtk = dev_get_drvdata(dev); struct mtk_gc *rg; void __iomem *dat, *set, *ctrl, *diro; @@ -228,7 +228,6 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) rg = &mtk->gc_map[bank]; memset(rg, 0, sizeof(*rg)); - spin_lock_init(&rg->lock); rg->bank = bank; dat = mtk->base + GPIO_REG_DATA + (rg->bank * GPIO_BANK_STRIDE); @@ -236,21 +235,30 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) ctrl = mtk->base + GPIO_REG_DCLR + (rg->bank * GPIO_BANK_STRIDE); diro = mtk->base + GPIO_REG_CTRL + (rg->bank * GPIO_BANK_STRIDE); - ret = bgpio_init(&rg->chip, dev, 4, dat, set, ctrl, diro, NULL, - BGPIOF_NO_SET_ON_INPUT); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .clr = ctrl, + .dirout = diro, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT, + }; + + ret = gpio_generic_chip_init(&rg->chip, 
&config); if (ret) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize generic GPIO chip\n"); return ret; } - rg->chip.of_gpio_n_cells = 2; - rg->chip.of_xlate = mediatek_gpio_xlate; - rg->chip.label = devm_kasprintf(dev, GFP_KERNEL, "%s-bank%d", + rg->chip.gc.of_gpio_n_cells = 2; + rg->chip.gc.of_xlate = mediatek_gpio_xlate; + rg->chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%s-bank%d", dev_name(dev), bank); - if (!rg->chip.label) + if (!rg->chip.gc.label) return -ENOMEM; - rg->chip.offset = bank * MTK_BANK_WIDTH; + rg->chip.gc.offset = bank * MTK_BANK_WIDTH; if (mtk->gpio_irq) { struct gpio_irq_chip *girq; @@ -261,7 +269,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) */ ret = devm_request_irq(dev, mtk->gpio_irq, mediatek_gpio_irq_handler, IRQF_SHARED, - rg->chip.label, &rg->chip); + rg->chip.gc.label, &rg->chip.gc); if (ret) { dev_err(dev, "Error requesting IRQ %d: %d\n", @@ -269,7 +277,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) return ret; } - girq = &rg->chip.irq; + girq = &rg->chip.gc.irq; gpio_irq_chip_set_chip(girq, &mt7621_irq_chip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; @@ -279,17 +287,17 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) girq->handler = handle_simple_irq; } - ret = devm_gpiochip_add_data(dev, &rg->chip, mtk); + ret = devm_gpiochip_add_data(dev, &rg->chip.gc, mtk); if (ret < 0) { dev_err(dev, "Could not register gpio %d, ret=%d\n", - rg->chip.ngpio, ret); + rg->chip.gc.ngpio, ret); return ret; } /* set polarity to low for all gpios */ mtk_gpio_w32(rg, GPIO_REG_POL, 0); - dev_info(dev, "registering %d gpios\n", rg->chip.ngpio); + dev_info(dev, "registering %d gpios\n", rg->chip.gc.ngpio); return 0; } diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 5e3f54cb8bc463..ac799fced950e3 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -602,7 +602,6 @@ static const struct regmap_config mvebu_gpio_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, }; /* @@ -899,7 +898,7 @@ static void mvebu_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) msk = BIT(i); is_out = !(io_conf & msk); - seq_printf(s, " gpio-%-3d (%-20.20s)", chip->base + i, label); + seq_printf(s, " gpio-%-3d (%-20.20s)", i, label); if (is_out) { seq_printf(s, " out %s %s\n", diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 433cbadc3a4cc6..52060b3ec7458a 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -481,7 +481,7 @@ static int mxc_gpio_probe(struct platform_device *pdev) config.dat = port->base + GPIO_PSR; config.set = port->base + GPIO_DR; config.dirout = port->base + GPIO_GDIR; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; err = gpio_generic_chip_init(&port->gen_gc, &config); if (err) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index 0ea46f3d04e128..5635694bf9f448 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -7,17 +7,18 @@ // Copyright (C) 2004-2010 Freescale Semiconductor, Inc. All Rights Reserved. 
#include +#include +#include #include #include #include #include #include +#include #include #include #include #include -#include -#include #define MXS_SET 0x4 #define MXS_CLR 0x8 @@ -48,7 +49,7 @@ struct mxs_gpio_port { int id; int irq; struct irq_domain *domain; - struct gpio_chip gc; + struct gpio_generic_chip chip; struct device *dev; enum mxs_gpio_id devid; u32 both_edges; @@ -258,6 +259,7 @@ MODULE_DEVICE_TABLE(of, mxs_gpio_dt_ids); static int mxs_gpio_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; + struct gpio_generic_chip_config config; struct device_node *parent; static void __iomem *base; struct mxs_gpio_port *port; @@ -319,19 +321,24 @@ static int mxs_gpio_probe(struct platform_device *pdev) irq_set_chained_handler_and_data(port->irq, mxs_gpio_irq_handler, port); - err = bgpio_init(&port->gc, &pdev->dev, 4, - port->base + PINCTRL_DIN(port), - port->base + PINCTRL_DOUT(port) + MXS_SET, - port->base + PINCTRL_DOUT(port) + MXS_CLR, - port->base + PINCTRL_DOE(port), NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = port->base + PINCTRL_DIN(port), + .set = port->base + PINCTRL_DOUT(port) + MXS_SET, + .clr = port->base + PINCTRL_DOUT(port) + MXS_CLR, + .dirout = port->base + PINCTRL_DOE(port), + }; + + err = gpio_generic_chip_init(&port->chip, &config); if (err) goto out_irqdomain_remove; - port->gc.to_irq = mxs_gpio_to_irq; - port->gc.get_direction = mxs_gpio_get_direction; - port->gc.base = port->id * 32; + port->chip.gc.to_irq = mxs_gpio_to_irq; + port->chip.gc.get_direction = mxs_gpio_get_direction; + port->chip.gc.base = port->id * 32; - err = gpiochip_add_data(&port->gc, port); + err = gpiochip_add_data(&port->chip.gc, port); if (err) goto out_irqdomain_remove; diff --git a/drivers/gpio/gpio-nct6694.c b/drivers/gpio/gpio-nct6694.c new file mode 100644 index 00000000000000..a8607f0d99153a --- /dev/null +++ b/drivers/gpio/gpio-nct6694.c @@ -0,0 +1,499 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 GPIO controller driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 GPIO controller. + * This defines the module type used for communication with the NCT6694 + * GPIO controller over the USB interface. 
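+ *
+ * Each register access below wraps this module type in a struct
+ * nct6694_cmd_header, together with a little-endian register offset and
+ * a transfer length, and is carried out over USB through
+ * nct6694_read_msg() and nct6694_write_msg() on the parent device.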
+ */ +#define NCT6694_GPIO_MOD 0xFF + +#define NCT6694_GPIO_VER 0x90 +#define NCT6694_GPIO_VALID 0x110 +#define NCT6694_GPI_DATA 0x120 +#define NCT6694_GPO_DIR 0x170 +#define NCT6694_GPO_TYPE 0x180 +#define NCT6694_GPO_DATA 0x190 + +#define NCT6694_GPI_STS 0x130 +#define NCT6694_GPI_CLR 0x140 +#define NCT6694_GPI_FALLING 0x150 +#define NCT6694_GPI_RISING 0x160 + +#define NCT6694_NR_GPIO 8 + +struct nct6694_gpio_data { + struct nct6694 *nct6694; + struct gpio_chip gpio; + struct mutex lock; + /* Protect irq operation */ + struct mutex irq_lock; + + unsigned char reg_val; + unsigned char irq_trig_falling; + unsigned char irq_trig_rising; + + /* Current gpio group */ + unsigned char group; + int irq; +}; + +static int nct6694_get_direction(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !(BIT(offset) & data->reg_val); +} + +static int nct6694_direction_input(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_direction_output(struct gpio_chip *gpio, + unsigned int offset, int val) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + /* Set direction to output */ + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + data->reg_val |= BIT(offset); + ret = nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + /* Then set output level */ + cmd_hd.offset = cpu_to_le16(NCT6694_GPO_DATA + data->group); + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (val) + data->reg_val |= BIT(offset); + else + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_get_value(struct gpio_chip *gpio, unsigned int offset) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DIR + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (BIT(offset) & data->reg_val) { + cmd_hd.offset = cpu_to_le16(NCT6694_GPO_DATA + data->group); + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !!(BIT(offset) & data->reg_val); + } + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_DATA + data->group); + ret = 
nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + return !!(BIT(offset) & data->reg_val); +} + +static int nct6694_set_value(struct gpio_chip *gpio, unsigned int offset, + int val) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_DATA + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + if (val) + data->reg_val |= BIT(offset); + else + data->reg_val &= ~BIT(offset); + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_set_config(struct gpio_chip *gpio, unsigned int offset, + unsigned long config) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPO_TYPE + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + switch (pinconf_to_config_param(config)) { + case PIN_CONFIG_DRIVE_OPEN_DRAIN: + data->reg_val |= BIT(offset); + break; + case PIN_CONFIG_DRIVE_PUSH_PULL: + data->reg_val &= ~BIT(offset); + break; + default: + return -ENOTSUPP; + } + + return nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); +} + +static int nct6694_init_valid_mask(struct gpio_chip *gpio, + unsigned long *valid_mask, + unsigned int ngpios) +{ + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPIO_VALID + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret < 0) + return ret; + + *valid_mask = data->reg_val; + + return ret; +} + +static irqreturn_t nct6694_irq_handler(int irq, void *priv) +{ + struct nct6694_gpio_data *data = priv; + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_STS + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + unsigned char status; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->reg_val); + if (ret) + return IRQ_NONE; + + status = data->reg_val; + + while (status) { + int bit = __ffs(status); + + data->reg_val = BIT(bit); + handle_nested_irq(irq_find_mapping(data->gpio.irq.domain, bit)); + status &= ~BIT(bit); + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_CLR + data->group); + nct6694_write_msg(data->nct6694, &cmd_hd, &data->reg_val); + } + + return IRQ_HANDLED; +} + +static int nct6694_get_irq_trig(struct nct6694_gpio_data *data) +{ + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_FALLING + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + int ret; + + guard(mutex)(&data->lock); + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, &data->irq_trig_falling); + if (ret) + return ret; + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_RISING + data->group); + return nct6694_read_msg(data->nct6694, &cmd_hd, &data->irq_trig_rising); +} + +static void nct6694_irq_mask(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + 
irq_hw_number_t hwirq = irqd_to_hwirq(d); + + gpiochip_disable_irq(gpio, hwirq); +} + +static void nct6694_irq_unmask(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + + gpiochip_enable_irq(gpio, hwirq); +} + +static int nct6694_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + + guard(mutex)(&data->lock); + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + data->irq_trig_rising |= BIT(hwirq); + break; + + case IRQ_TYPE_EDGE_FALLING: + data->irq_trig_falling |= BIT(hwirq); + break; + + case IRQ_TYPE_EDGE_BOTH: + data->irq_trig_rising |= BIT(hwirq); + data->irq_trig_falling |= BIT(hwirq); + break; + + default: + return -ENOTSUPP; + } + + return 0; +} + +static void nct6694_irq_bus_lock(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + + mutex_lock(&data->irq_lock); +} + +static void nct6694_irq_bus_sync_unlock(struct irq_data *d) +{ + struct gpio_chip *gpio = irq_data_get_irq_chip_data(d); + struct nct6694_gpio_data *data = gpiochip_get_data(gpio); + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_GPIO_MOD, + .offset = cpu_to_le16(NCT6694_GPI_FALLING + data->group), + .len = cpu_to_le16(sizeof(data->reg_val)) + }; + + scoped_guard(mutex, &data->lock) { + nct6694_write_msg(data->nct6694, &cmd_hd, &data->irq_trig_falling); + + cmd_hd.offset = cpu_to_le16(NCT6694_GPI_RISING + data->group); + nct6694_write_msg(data->nct6694, &cmd_hd, &data->irq_trig_rising); + } + + mutex_unlock(&data->irq_lock); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "gpio-nct6694", + .irq_mask = nct6694_irq_mask, + .irq_unmask = nct6694_irq_unmask, + .irq_set_type = nct6694_irq_set_type, + .irq_bus_lock = nct6694_irq_bus_lock, + .irq_bus_sync_unlock = nct6694_irq_bus_sync_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static void nct6694_irq_dispose_mapping(void *d) +{ + struct nct6694_gpio_data *data = d; + + irq_dispose_mapping(data->irq); +} + +static void nct6694_gpio_ida_free(void *d) +{ + struct nct6694_gpio_data *data = d; + struct nct6694 *nct6694 = data->nct6694; + + ida_free(&nct6694->gpio_ida, data->group); +} + +static int nct6694_gpio_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6694 *nct6694 = dev_get_drvdata(dev->parent); + struct nct6694_gpio_data *data; + struct gpio_irq_chip *girq; + int ret, i; + char **names; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->nct6694 = nct6694; + + ret = ida_alloc(&nct6694->gpio_ida, GFP_KERNEL); + if (ret < 0) + return ret; + data->group = ret; + + ret = devm_add_action_or_reset(dev, nct6694_gpio_ida_free, data); + if (ret) + return ret; + + names = devm_kcalloc(dev, NCT6694_NR_GPIO, sizeof(char *), + GFP_KERNEL); + if (!names) + return -ENOMEM; + + for (i = 0; i < NCT6694_NR_GPIO; i++) { + names[i] = devm_kasprintf(dev, GFP_KERNEL, "GPIO%X%d", + data->group, i); + if (!names[i]) + return -ENOMEM; + } + + data->irq = irq_create_mapping(nct6694->domain, + NCT6694_IRQ_GPIO0 + data->group); + if (!data->irq) + return -EINVAL; + + ret = devm_add_action_or_reset(dev, nct6694_irq_dispose_mapping, data); + if (ret) + return ret; + + data->gpio.names = (const char * const*)names; + data->gpio.label 
= pdev->name; + data->gpio.direction_input = nct6694_direction_input; + data->gpio.get = nct6694_get_value; + data->gpio.direction_output = nct6694_direction_output; + data->gpio.set = nct6694_set_value; + data->gpio.get_direction = nct6694_get_direction; + data->gpio.set_config = nct6694_set_config; + data->gpio.init_valid_mask = nct6694_init_valid_mask; + data->gpio.base = -1; + data->gpio.can_sleep = false; + data->gpio.owner = THIS_MODULE; + data->gpio.ngpio = NCT6694_NR_GPIO; + + platform_set_drvdata(pdev, data); + + ret = devm_mutex_init(dev, &data->lock); + if (ret) + return ret; + + ret = devm_mutex_init(dev, &data->irq_lock); + if (ret) + return ret; + + ret = nct6694_get_irq_trig(data); + if (ret) { + dev_err_probe(dev, ret, "Failed to get irq trigger type\n"); + return ret; + } + + girq = &data->gpio.irq; + gpio_irq_chip_set_chip(girq, &nct6694_irq_chip); + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_level_irq; + girq->threaded = true; + + ret = devm_request_threaded_irq(dev, data->irq, NULL, nct6694_irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + "gpio-nct6694", data); + if (ret) { + dev_err_probe(dev, ret, "Failed to request irq\n"); + return ret; + } + + return devm_gpiochip_add_data(dev, &data->gpio, data); +} + +static struct platform_driver nct6694_gpio_driver = { + .driver = { + .name = "nct6694-gpio", + }, + .probe = nct6694_gpio_probe, +}; + +module_platform_driver(nct6694_gpio_driver); + +MODULE_DESCRIPTION("USB-GPIO controller driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-gpio"); diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c index bcf4b07dd4584f..97c5cd33279d56 100644 --- a/drivers/gpio/gpio-nomadik.c +++ b/drivers/gpio/gpio-nomadik.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include @@ -396,10 +397,12 @@ static int nmk_gpio_get_mode(struct nmk_gpio_chip *nmk_chip, int offset) } void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio) + struct gpio_chip *chip, unsigned int offset) { struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); +#ifdef CONFIG_PINCTRL_NOMADIK + struct gpio_desc *desc; +#endif int mode; bool is_out; bool data_out; @@ -425,15 +428,15 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, data_out = !!(readl(nmk_chip->addr + NMK_GPIO_DAT) & BIT(offset)); mode = nmk_gpio_get_mode(nmk_chip, offset); #ifdef CONFIG_PINCTRL_NOMADIK - if (mode == NMK_GPIO_ALT_C && pctldev) - mode = nmk_prcm_gpiocr_get_mode(pctldev, gpio); + if (mode == NMK_GPIO_ALT_C && pctldev) { + desc = gpio_device_get_desc(chip->gpiodev, offset); + mode = nmk_prcm_gpiocr_get_mode(pctldev, desc_to_gpio(desc)); + } #endif if (is_out) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s %s", - gpio, - label ?: "(none)", - str_hi_lo(data_out), + offset, label ?: "(none)", str_hi_lo(data_out), (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); @@ -445,9 +448,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, }; seq_printf(s, " gpio-%-3d (%-20.20s) in %s %s", - gpio, - label ?: "(none)", - pulls[pullidx], + offset, label ?: "(none)", pulls[pullidx], (mode < 0) ? 
"unknown" : modes[mode]); val = nmk_gpio_get_input(chip, offset); @@ -479,10 +480,10 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static void nmk_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) { - unsigned int i, gpio = chip->base; + unsigned int i; - for (i = 0; i < chip->ngpio; i++, gpio++) { - nmk_gpio_dbg_show_one(s, NULL, chip, i, gpio); + for (i = 0; i < chip->ngpio; i++) { + nmk_gpio_dbg_show_one(s, NULL, chip, i); seq_puts(s, "\n"); } } diff --git a/drivers/gpio/gpio-pisosr.c b/drivers/gpio/gpio-pisosr.c index a69b74866a1344..7ec6a46ed60071 100644 --- a/drivers/gpio/gpio-pisosr.c +++ b/drivers/gpio/gpio-pisosr.c @@ -108,11 +108,6 @@ static const struct gpio_chip template_chip = { .can_sleep = true, }; -static void pisosr_mutex_destroy(void *lock) -{ - mutex_destroy(lock); -} - static int pisosr_gpio_probe(struct spi_device *spi) { struct device *dev = &spi->dev; @@ -139,8 +134,7 @@ static int pisosr_gpio_probe(struct spi_device *spi) return dev_err_probe(dev, PTR_ERR(gpio->load_gpio), "Unable to allocate load GPIO\n"); - mutex_init(&gpio->lock); - ret = devm_add_action_or_reset(dev, pisosr_mutex_destroy, &gpio->lock); + ret = devm_mutex_init(dev, &gpio->lock); if (ret) return ret; diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index cb2f63eee2aa10..7bbc6f0ce4c8a7 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ #define RDA_GPIO_BANK_NR 32 struct rda_gpio { - struct gpio_chip chip; + struct gpio_generic_chip chip; void __iomem *base; spinlock_t lock; int irq; @@ -208,6 +209,7 @@ static const struct irq_chip rda_gpio_irq_chip = { static int rda_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct rda_gpio *rda_gpio; @@ -235,24 +237,29 @@ static int rda_gpio_probe(struct platform_device *pdev) spin_lock_init(&rda_gpio->lock); - ret = bgpio_init(&rda_gpio->chip, dev, 4, - rda_gpio->base + RDA_GPIO_VAL, - rda_gpio->base + RDA_GPIO_SET, - rda_gpio->base + RDA_GPIO_CLR, - rda_gpio->base + RDA_GPIO_OEN_SET_OUT, - rda_gpio->base + RDA_GPIO_OEN_SET_IN, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = rda_gpio->base + RDA_GPIO_VAL, + .set = rda_gpio->base + RDA_GPIO_SET, + .clr = rda_gpio->base + RDA_GPIO_CLR, + .dirout = rda_gpio->base + RDA_GPIO_OEN_SET_OUT, + .dirin = rda_gpio->base + RDA_GPIO_OEN_SET_IN, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&rda_gpio->chip, &config); if (ret) { - dev_err(dev, "bgpio_init failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } - rda_gpio->chip.label = dev_name(dev); - rda_gpio->chip.ngpio = ngpios; - rda_gpio->chip.base = -1; + rda_gpio->chip.gc.label = dev_name(dev); + rda_gpio->chip.gc.ngpio = ngpios; + rda_gpio->chip.gc.base = -1; if (rda_gpio->irq >= 0) { - girq = &rda_gpio->chip.irq; + girq = &rda_gpio->chip.gc.irq; gpio_irq_chip_set_chip(girq, &rda_gpio_irq_chip); girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; @@ -269,7 +276,7 @@ static int rda_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rda_gpio); - return devm_gpiochip_add_data(dev, &rda_gpio->chip, rda_gpio); + return devm_gpiochip_add_data(dev, &rda_gpio->chip.gc, rda_gpio); } static const struct of_device_id rda_gpio_of_match[] = { diff --git 
a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index d6418f89d3f63d..de527f4fc6c2ad 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only -#include #include +#include +#include #include #include #include @@ -41,7 +42,7 @@ /** * realtek_gpio_ctrl - Realtek Otto GPIO driver data * - * @gc: Associated gpio_chip instance + * @chip: Associated gpio_generic_chip instance * @base: Base address of the register block for a GPIO bank * @lock: Lock for accessing the IRQ registers and values * @intr_mask: Mask for interrupts lines @@ -64,7 +65,7 @@ * IMR on changes. */ struct realtek_gpio_ctrl { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; void __iomem *cpumask_base; struct cpumask cpu_irq_maskable; @@ -101,7 +102,7 @@ static struct realtek_gpio_ctrl *irq_data_to_ctrl(struct irq_data *data) { struct gpio_chip *gc = irq_data_get_irq_chip_data(data); - return container_of(gc, struct realtek_gpio_ctrl, gc); + return container_of(to_gpio_generic_chip(gc), struct realtek_gpio_ctrl, chip); } /* @@ -194,7 +195,7 @@ static void realtek_gpio_irq_unmask(struct irq_data *data) unsigned int line = irqd_to_hwirq(data); unsigned long flags; - gpiochip_enable_irq(&ctrl->gc, line); + gpiochip_enable_irq(&ctrl->chip.gc, line); raw_spin_lock_irqsave(&ctrl->lock, flags); ctrl->intr_mask[line] = REALTEK_GPIO_IMR_LINE_MASK; @@ -213,7 +214,7 @@ static void realtek_gpio_irq_mask(struct irq_data *data) realtek_gpio_update_line_imr(ctrl, line); raw_spin_unlock_irqrestore(&ctrl->lock, flags); - gpiochip_disable_irq(&ctrl->gc, line); + gpiochip_disable_irq(&ctrl->chip.gc, line); } static int realtek_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) @@ -356,8 +357,9 @@ MODULE_DEVICE_TABLE(of, realtek_gpio_of_match); static int realtek_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; - unsigned long bgpio_flags; + unsigned long gen_gc_flags; unsigned int dev_flags; struct gpio_irq_chip *girq; struct realtek_gpio_ctrl *ctrl; @@ -388,32 +390,37 @@ static int realtek_gpio_probe(struct platform_device *pdev) raw_spin_lock_init(&ctrl->lock); if (dev_flags & GPIO_PORTS_REVERSED) { - bgpio_flags = 0; + gen_gc_flags = 0; ctrl->bank_read = realtek_gpio_bank_read; ctrl->bank_write = realtek_gpio_bank_write; ctrl->line_imr_pos = realtek_gpio_line_imr_pos; } else { - bgpio_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + gen_gc_flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; ctrl->bank_read = realtek_gpio_bank_read_swapped; ctrl->bank_write = realtek_gpio_bank_write_swapped; ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; } - err = bgpio_init(&ctrl->gc, dev, 4, - ctrl->base + REALTEK_GPIO_REG_DATA, NULL, NULL, - ctrl->base + REALTEK_GPIO_REG_DIR, NULL, - bgpio_flags); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = ctrl->base + REALTEK_GPIO_REG_DATA, + .dirout = ctrl->base + REALTEK_GPIO_REG_DIR, + .flags = gen_gc_flags, + }; + + err = gpio_generic_chip_init(&ctrl->chip, &config); if (err) { dev_err(dev, "unable to init generic GPIO"); return err; } - ctrl->gc.ngpio = ngpios; - ctrl->gc.owner = THIS_MODULE; + ctrl->chip.gc.ngpio = ngpios; + ctrl->chip.gc.owner = THIS_MODULE; irq = platform_get_irq_optional(pdev, 0); if (!(dev_flags & GPIO_INTERRUPTS_DISABLED) && irq > 0) { - girq = &ctrl->gc.irq; + girq = &ctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &realtek_gpio_irq_chip); 
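/*
 * The realtek-otto conversion above follows the recipe used throughout
 * this series: the positional bgpio_init() arguments become named fields
 * of struct gpio_generic_chip_config, and registers a block does not
 * have are simply left out, defaulting to NULL. A minimal sketch with
 * hypothetical register offsets, where .sz is the register width in
 * bytes:
 *
 *	struct gpio_generic_chip_config cfg = {
 *		.dev = dev,
 *		.sz = 4,
 *		.dat = base + REG_IN,
 *		.set = base + REG_OUT,
 *		.dirout = base + REG_DIR,
 *	};
 *
 *	ret = gpio_generic_chip_init(&mychip->chip, &cfg);
 *	if (ret)
 *		return ret;
 */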
girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_bad_irq; @@ -442,7 +449,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) cpumask_set_cpu(cpu, &ctrl->cpu_irq_maskable); } - return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl); + return devm_gpiochip_add_data(dev, &ctrl->chip.gc, ctrl); } static struct platform_driver realtek_gpio_driver = { diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e8a32dfebdcb31..ab9e4077fa608c 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -32,6 +32,11 @@ struct gpio_regmap { unsigned int reg_dir_in_base; unsigned int reg_dir_out_base; +#ifdef CONFIG_REGMAP_IRQ + int regmap_irq_line; + struct regmap_irq_chip_data *irq_chip_data; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); @@ -215,6 +220,7 @@ EXPORT_SYMBOL_GPL(gpio_regmap_get_drvdata); */ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config) { + struct irq_domain *irq_domain; struct gpio_regmap *gpio; struct gpio_chip *chip; int ret; @@ -255,6 +261,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config chip->names = config->names; chip->label = config->label ?: dev_name(config->parent); chip->can_sleep = regmap_might_sleep(config->regmap); + chip->init_valid_mask = config->init_valid_mask; chip->request = gpiochip_generic_request; chip->free = gpiochip_generic_free; @@ -274,7 +281,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config if (!chip->ngpio) { ret = gpiochip_get_ngpios(chip, chip->parent); if (ret) - return ERR_PTR(ret); + goto err_free_gpio; } /* if not set, assume there is only one register */ @@ -295,8 +302,22 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config if (ret < 0) goto err_free_gpio; - if (config->irq_domain) { - ret = gpiochip_irqchip_add_domain(chip, config->irq_domain); +#ifdef CONFIG_REGMAP_IRQ + if (config->regmap_irq_chip) { + gpio->regmap_irq_line = config->regmap_irq_line; + ret = regmap_add_irq_chip_fwnode(dev_fwnode(config->parent), config->regmap, + config->regmap_irq_line, config->regmap_irq_flags, + 0, config->regmap_irq_chip, &gpio->irq_chip_data); + if (ret) + goto err_free_gpio; + + irq_domain = regmap_irq_get_domain(gpio->irq_chip_data); + } else +#endif + irq_domain = config->irq_domain; + + if (irq_domain) { + ret = gpiochip_irqchip_add_domain(chip, irq_domain); if (ret) goto err_remove_gpiochip; } @@ -317,6 +338,11 @@ EXPORT_SYMBOL_GPL(gpio_regmap_register); */ void gpio_regmap_unregister(struct gpio_regmap *gpio) { +#ifdef CONFIG_REGMAP_IRQ + if (gpio->irq_chip_data) + regmap_del_irq_chip(gpio->regmap_irq_line, gpio->irq_chip_data); +#endif + gpiochip_remove(&gpio->gpio_chip); kfree(gpio); } diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index bcfc323a8315ef..47174eb3ba76fb 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -769,7 +769,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) list_del(&cfg->head); switch (cfg->param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); if (ret) dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 067c8edb62e205..94ef2efbd14f57 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -7,6 
+7,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ struct sifive_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; struct regmap *regs; unsigned long irq_state; unsigned int trigger[SIFIVE_GPIO_MAX]; @@ -41,10 +42,10 @@ static void sifive_gpio_set_ie(struct sifive_gpio *chip, unsigned int offset) { - unsigned long flags; unsigned int trigger; - raw_spin_lock_irqsave(&chip->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&chip->gen_gc); + trigger = (chip->irq_state & BIT(offset)) ? chip->trigger[offset] : 0; regmap_update_bits(chip->regs, SIFIVE_GPIO_RISE_IE, BIT(offset), (trigger & IRQ_TYPE_EDGE_RISING) ? BIT(offset) : 0); @@ -54,7 +55,6 @@ static void sifive_gpio_set_ie(struct sifive_gpio *chip, unsigned int offset) (trigger & IRQ_TYPE_LEVEL_HIGH) ? BIT(offset) : 0); regmap_update_bits(chip->regs, SIFIVE_GPIO_LOW_IE, BIT(offset), (trigger & IRQ_TYPE_LEVEL_LOW) ? BIT(offset) : 0); - raw_spin_unlock_irqrestore(&chip->gc.bgpio_lock, flags); } static int sifive_gpio_irq_set_type(struct irq_data *d, unsigned int trigger) @@ -72,13 +72,12 @@ } static void sifive_gpio_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct sifive_gpio *chip = gpiochip_get_data(gc); irq_hw_number_t hwirq = irqd_to_hwirq(d); int offset = hwirq % SIFIVE_GPIO_MAX; u32 bit = BIT(offset); - unsigned long flags; gpiochip_enable_irq(gc, hwirq); irq_chip_enable_parent(d); @@ -86,13 +85,13 @@ static void sifive_gpio_irq_enable(struct irq_data *d) /* Switch to input */ gc->direction_input(gc, offset); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - /* Clear any sticky pending interrupts */ - regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &chip->gen_gc) { + /* Clear any sticky pending interrupts */ + regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); + } /* Enable interrupts */ assign_bit(offset, &chip->irq_state, 1); @@ -118,15 +117,14 @@ static void sifive_gpio_irq_eoi(struct irq_data *d) struct sifive_gpio *chip = gpiochip_get_data(gc); int offset = irqd_to_hwirq(d) % SIFIVE_GPIO_MAX; u32 bit = BIT(offset); - unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); - /* Clear all pending interrupts */ - regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); - regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + scoped_guard(gpio_generic_lock_irqsave, &chip->gen_gc) { + /* Clear all pending interrupts */ + regmap_write(chip->regs, SIFIVE_GPIO_RISE_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_FALL_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_HIGH_IP, bit); + regmap_write(chip->regs, SIFIVE_GPIO_LOW_IP, bit); + } irq_chip_eoi_parent(d); } @@ -174,12 +172,12 @@ static const struct regmap_config sifive_gpio_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, .disable_locking
= true, }; static int sifive_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct irq_domain *parent; struct gpio_irq_chip *girq; @@ -218,13 +216,17 @@ static int sifive_gpio_probe(struct platform_device *pdev) */ parent = irq_get_irq_data(chip->irq_number[0])->domain; - ret = bgpio_init(&chip->gc, dev, 4, - chip->base + SIFIVE_GPIO_INPUT_VAL, - chip->base + SIFIVE_GPIO_OUTPUT_VAL, - NULL, - chip->base + SIFIVE_GPIO_OUTPUT_EN, - chip->base + SIFIVE_GPIO_INPUT_EN, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = chip->base + SIFIVE_GPIO_INPUT_VAL, + .set = chip->base + SIFIVE_GPIO_OUTPUT_VAL, + .dirout = chip->base + SIFIVE_GPIO_OUTPUT_EN, + .dirin = chip->base + SIFIVE_GPIO_INPUT_EN, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; @@ -237,12 +239,12 @@ static int sifive_gpio_probe(struct platform_device *pdev) regmap_write(chip->regs, SIFIVE_GPIO_LOW_IE, 0); chip->irq_state = 0; - chip->gc.base = -1; - chip->gc.ngpio = ngpio; - chip->gc.label = dev_name(dev); - chip->gc.parent = dev; - chip->gc.owner = THIS_MODULE; - girq = &chip->gc.irq; + chip->gen_gc.gc.base = -1; + chip->gen_gc.gc.ngpio = ngpio; + chip->gen_gc.gc.label = dev_name(dev); + chip->gen_gc.gc.parent = dev; + chip->gen_gc.gc.owner = THIS_MODULE; + girq = &chip->gen_gc.gc.irq; gpio_irq_chip_set_chip(girq, &sifive_gpio_irqchip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -250,7 +252,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) girq->handler = handle_bad_irq; girq->default_type = IRQ_TYPE_NONE; - return gpiochip_add_data(&chip->gc, chip); + return gpiochip_add_data(&chip->gen_gc.gc, chip); } static const struct of_device_id sifive_gpio_match[] = { diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 050092583f799b..a83f5238427cdb 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -262,8 +262,7 @@ static void gpio_sim_dbg_show(struct seq_file *seq, struct gpio_chip *gc) guard(mutex)(&chip->lock); for_each_hwgpio(gc, i, label) - seq_printf(seq, " gpio-%-3d (%s) %s,%s\n", - gc->base + i, + seq_printf(seq, " gpio-%-3d (%s) %s,%s\n", i, label ?: "", test_bit(i, chip->direction_map) ? "input" : test_bit(i, chip->value_map) ? 
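/*
 * The sifive conversion above replaces open-coded
 * raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs on the old
 * bgpio_lock with cleanup.h-style guards on the generic chip. A minimal
 * sketch of the two forms used, assuming a struct gpio_generic_chip
 * named gen_gc:
 *
 *	guard(gpio_generic_lock_irqsave)(&gen_gc);
 *	(the lock is held until the enclosing scope ends)
 *
 *	scoped_guard(gpio_generic_lock_irqsave, &gen_gc) {
 *		(the lock is held only within this block)
 *	}
 */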
"output-high" : diff --git a/drivers/gpio/gpio-sodaville.c b/drivers/gpio/gpio-sodaville.c index abd13c79ace09d..37c1338377295f 100644 --- a/drivers/gpio/gpio-sodaville.c +++ b/drivers/gpio/gpio-sodaville.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ struct sdv_gpio_chip_data { void __iomem *gpio_pub_base; struct irq_domain *id; struct irq_chip_generic *gc; - struct gpio_chip chip; + struct gpio_generic_chip gen_gc; }; static int sdv_gpio_pub_set_type(struct irq_data *d, unsigned int type) @@ -180,6 +181,7 @@ static int sdv_register_irqsupport(struct sdv_gpio_chip_data *sd, static int sdv_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) { + struct gpio_generic_chip_config config; struct sdv_gpio_chip_data *sd; int ret; u32 mux_val; @@ -206,15 +208,21 @@ static int sdv_gpio_probe(struct pci_dev *pdev, if (!ret) writel(mux_val, sd->gpio_pub_base + GPMUXCTL); - ret = bgpio_init(&sd->chip, &pdev->dev, 4, - sd->gpio_pub_base + GPINR, sd->gpio_pub_base + GPOUTR, - NULL, sd->gpio_pub_base + GPOER, NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = sd->gpio_pub_base + GPINR, + .set = sd->gpio_pub_base + GPOUTR, + .dirout = sd->gpio_pub_base + GPOER, + }; + + ret = gpio_generic_chip_init(&sd->gen_gc, &config); if (ret) return ret; - sd->chip.ngpio = SDV_NUM_PUB_GPIOS; + sd->gen_gc.gc.ngpio = SDV_NUM_PUB_GPIOS; - ret = devm_gpiochip_add_data(&pdev->dev, &sd->chip, sd); + ret = devm_gpiochip_add_data(&pdev->dev, &sd->gen_gc.gc, sd); if (ret < 0) { dev_err(&pdev->dev, "gpiochip_add() failed.\n"); return ret; diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index 3cc75c701ec401..eb66a15c002fc3 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,7 @@ struct spacemit_gpio; struct spacemit_gpio_bank { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct spacemit_gpio *sg; void __iomem *base; u32 irq_mask; @@ -72,7 +73,7 @@ static irqreturn_t spacemit_gpio_irq_handler(int irq, void *dev_id) return IRQ_NONE; for_each_set_bit(n, &pending, BITS_PER_LONG) - handle_nested_irq(irq_find_mapping(gb->gc.irq.domain, n)); + handle_nested_irq(irq_find_mapping(gb->chip.gc.irq.domain, n)); return IRQ_HANDLED; } @@ -143,7 +144,7 @@ static void spacemit_gpio_irq_print_chip(struct irq_data *data, struct seq_file { struct spacemit_gpio_bank *gb = irq_data_get_irq_chip_data(data); - seq_printf(p, "%s-%d", dev_name(gb->gc.parent), spacemit_gpio_bank_index(gb)); + seq_printf(p, "%s-%d", dev_name(gb->chip.gc.parent), spacemit_gpio_bank_index(gb)); } static struct irq_chip spacemit_gpio_chip = { @@ -165,7 +166,7 @@ static bool spacemit_of_node_instance_match(struct gpio_chip *gc, unsigned int i if (i >= SPACEMIT_NR_BANKS) return false; - return (gc == &sg->sgb[i].gc); + return (gc == &sg->sgb[i].chip.gc); } static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, @@ -173,7 +174,8 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, int index, int irq) { struct spacemit_gpio_bank *gb = &sg->sgb[index]; - struct gpio_chip *gc = &gb->gc; + struct gpio_generic_chip_config config; + struct gpio_chip *gc = &gb->chip.gc; struct device *dev = sg->dev; struct gpio_irq_chip *girq; void __iomem *dat, *set, *clr, *dirin, *dirout; @@ -187,9 +189,20 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, dirin = gb->base + SPACEMIT_GCDR; dirout = gb->base + 
SPACEMIT_GSDR; + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .clr = clr, + .dirout = dirout, + .dirin = dirin, + .flags = GPIO_GENERIC_UNREADABLE_REG_SET | + GPIO_GENERIC_UNREADABLE_REG_DIR, + }; + /* This registers 32 GPIO lines per bank */ - ret = bgpio_init(gc, dev, 4, dat, set, clr, dirout, dirin, - BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR); + ret = gpio_generic_chip_init(&gb->chip, &config); if (ret) return dev_err_probe(dev, ret, "failed to init gpio chip\n"); @@ -221,7 +234,7 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, ret = devm_request_threaded_irq(dev, irq, NULL, spacemit_gpio_irq_handler, IRQF_ONESHOT | IRQF_SHARED, - gb->gc.label, gb); + gb->chip.gc.label, gb); if (ret < 0) return dev_err_probe(dev, ret, "failed to register IRQ\n"); diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 5dd4c21a8e601b..6faf30347a3639 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -262,9 +262,8 @@ static void stmpe_gpio_irq_unmask(struct irq_data *d) stmpe_gpio->regs[REG_IE][regoffset] |= mask; } -static void stmpe_dbg_show_one(struct seq_file *s, - struct gpio_chip *gc, - unsigned offset, unsigned gpio) +static void stmpe_dbg_show_one(struct seq_file *s, struct gpio_chip *gc, + unsigned int offset) { struct stmpe_gpio *stmpe_gpio = gpiochip_get_data(gc); struct stmpe *stmpe = stmpe_gpio->stmpe; @@ -286,7 +285,7 @@ static void stmpe_dbg_show_one(struct seq_file *s, if (dir) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s", - gpio, label ?: "(none)", str_hi_lo(val)); + offset, label ?: "(none)", str_hi_lo(val)); } else { u8 edge_det_reg; u8 rise_reg; @@ -354,7 +353,7 @@ static void stmpe_dbg_show_one(struct seq_file *s, irqen = !!(ret & mask); seq_printf(s, " gpio-%-3d (%-20.20s) in %s %13s %13s %25s %25s", - gpio, label ?: "(none)", + offset, label ?: "(none)", str_hi_lo(val), edge_det_values[edge_det], irqen ? 
"IRQ-enabled" : "IRQ-disabled", @@ -366,10 +365,9 @@ static void stmpe_dbg_show_one(struct seq_file *s, static void stmpe_dbg_show(struct seq_file *s, struct gpio_chip *gc) { unsigned i; - unsigned gpio = gc->base; - for (i = 0; i < gc->ngpio; i++, gpio++) { - stmpe_dbg_show_one(s, gc, i, gpio); + for (i = 0; i < gc->ngpio; i++) { + stmpe_dbg_show_one(s, gc, i); seq_putc(s, '\n'); } } @@ -534,10 +532,16 @@ static int stmpe_gpio_probe(struct platform_device *pdev) return devm_gpiochip_add_data(dev, &stmpe_gpio->chip, stmpe_gpio); } +static const struct of_device_id stmpe_gpio_of_matches[] = { + { .compatible = "st,stmpe-gpio", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, stmpe_gpio_of_matches); + static struct platform_driver stmpe_gpio_driver = { .driver = { - .suppress_bind_attrs = true, - .name = "stmpe-gpio", + .name = "stmpe-gpio", + .of_match_table = stmpe_gpio_of_matches, }, .probe = stmpe_gpio_probe, }; @@ -547,3 +551,13 @@ static int __init stmpe_gpio_init(void) return platform_driver_register(&stmpe_gpio_driver); } subsys_initcall(stmpe_gpio_init); + +static void __exit stmpe_gpio_exit(void) +{ + platform_driver_unregister(&stmpe_gpio_driver); +} +module_exit(stmpe_gpio_exit); + +MODULE_DESCRIPTION("STMPE expander GPIO"); +MODULE_AUTHOR("Rabin Vincent "); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index 1869ee7f9423ef..09a448ce3eec2f 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -7,20 +7,20 @@ * Christian Ruppert */ -#include -#include -#include +#include #include -#include -#include -#include +#include #include #include +#include +#include +#include +#include #include #include -#include -#include #include +#include +#include #define TB10X_GPIO_DIR_IN (0x00000000) #define TB10X_GPIO_DIR_OUT (0x00000001) @@ -36,13 +36,13 @@ * @base: register base address * @domain: IRQ domain of GPIO generated interrupts managed by this controller * @irq: Interrupt line of parent interrupt controller - * @gc: gpio_chip structure associated to this GPIO controller + * @chip: Generic GPIO chip structure associated with this GPIO controller */ struct tb10x_gpio { void __iomem *base; struct irq_domain *domain; int irq; - struct gpio_chip gc; + struct gpio_generic_chip chip; }; static inline u32 tb10x_reg_read(struct tb10x_gpio *gpio, unsigned int offs) @@ -60,16 +60,13 @@ static inline void tb10x_set_bits(struct tb10x_gpio *gpio, unsigned int offs, u32 mask, u32 val) { u32 r; - unsigned long flags; - raw_spin_lock_irqsave(&gpio->gc.bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(&gpio->chip); r = tb10x_reg_read(gpio, offs); r = (r & ~mask) | (val & mask); tb10x_reg_write(gpio, offs, r); - - raw_spin_unlock_irqrestore(&gpio->gc.bgpio_lock, flags); } static int tb10x_gpio_to_irq(struct gpio_chip *chip, unsigned offset) @@ -107,6 +104,7 @@ static irqreturn_t tb10x_gpio_irq_cascade(int irq, void *data) static int tb10x_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct tb10x_gpio *tb10x_gpio; struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; @@ -127,9 +125,9 @@ static int tb10x_gpio_probe(struct platform_device *pdev) if (IS_ERR(tb10x_gpio->base)) return PTR_ERR(tb10x_gpio->base); - tb10x_gpio->gc.label = + tb10x_gpio->chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", pdev->dev.of_node); - if (!tb10x_gpio->gc.label) + if (!tb10x_gpio->chip.gc.label) return -ENOMEM; /* @@ -137,29 +135,30 @@ static int tb10x_gpio_probe(struct platform_device *pdev) * 
the lines, no special set or clear registers and a data direction register * where 1 means "output". */ - ret = bgpio_init(&tb10x_gpio->gc, dev, 4, - tb10x_gpio->base + OFFSET_TO_REG_DATA, - NULL, - NULL, - tb10x_gpio->base + OFFSET_TO_REG_DDR, - NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = tb10x_gpio->base + OFFSET_TO_REG_DATA, + .dirout = tb10x_gpio->base + OFFSET_TO_REG_DDR, + }; + + ret = gpio_generic_chip_init(&tb10x_gpio->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - tb10x_gpio->gc.base = -1; - tb10x_gpio->gc.parent = dev; - tb10x_gpio->gc.owner = THIS_MODULE; + tb10x_gpio->chip.gc.base = -1; + tb10x_gpio->chip.gc.parent = dev; + tb10x_gpio->chip.gc.owner = THIS_MODULE; /* - * ngpio is set by bgpio_init() but we override it, this .request() - * callback also overrides the one set up by generic GPIO. + * ngpio is set by gpio_generic_chip_init() but we override it, this + * .request() callback also overrides the one set up by generic GPIO. */ - tb10x_gpio->gc.ngpio = ngpio; - tb10x_gpio->gc.request = gpiochip_generic_request; - tb10x_gpio->gc.free = gpiochip_generic_free; + tb10x_gpio->chip.gc.ngpio = ngpio; + tb10x_gpio->chip.gc.request = gpiochip_generic_request; + tb10x_gpio->chip.gc.free = gpiochip_generic_free; - ret = devm_gpiochip_add_data(dev, &tb10x_gpio->gc, tb10x_gpio); + ret = devm_gpiochip_add_data(dev, &tb10x_gpio->chip.gc, tb10x_gpio); if (ret < 0) { dev_err(dev, "Could not add gpiochip.\n"); return ret; @@ -174,7 +173,7 @@ if (ret < 0) return ret; - tb10x_gpio->gc.to_irq = tb10x_gpio_to_irq; + tb10x_gpio->chip.gc.to_irq = tb10x_gpio_to_irq; tb10x_gpio->irq = ret; ret = devm_request_irq(dev, ret, tb10x_gpio_irq_cascade, @@ -183,14 +182,15 @@ if (ret != 0) return ret; - tb10x_gpio->domain = irq_domain_create_linear(dev_fwnode(dev), tb10x_gpio->gc.ngpio, + tb10x_gpio->domain = irq_domain_create_linear(dev_fwnode(dev), + tb10x_gpio->chip.gc.ngpio, &irq_generic_chip_ops, NULL); if (!tb10x_gpio->domain) { return -ENOMEM; } ret = irq_alloc_domain_generic_chips(tb10x_gpio->domain, - tb10x_gpio->gc.ngpio, 1, tb10x_gpio->gc.label, + tb10x_gpio->chip.gc.ngpio, 1, tb10x_gpio->chip.gc.label, handle_edge_irq, IRQ_NOREQUEST, IRQ_NOPROBE, IRQ_GC_INIT_MASK_CACHE); if (ret) @@ -218,9 +218,9 @@ static void tb10x_gpio_remove(struct platform_device *pdev) { struct tb10x_gpio *tb10x_gpio = platform_get_drvdata(pdev); - if (tb10x_gpio->gc.to_irq) { + if (tb10x_gpio->chip.gc.to_irq) { irq_remove_generic_chip(tb10x_gpio->domain->gc->gc[0], - BIT(tb10x_gpio->gc.ngpio) - 1, 0, 0); + BIT(tb10x_gpio->chip.gc.ngpio) - 1, 0, 0); kfree(tb10x_gpio->domain->gc); irq_domain_remove(tb10x_gpio->domain); } diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 5fd3ec3e2c53d2..4d3db6e06eeb27 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -20,6 +20,7 @@ #include #include #include +#include /* security registers */ #define TEGRA186_GPIO_CTL_SCR 0x0c @@ -1279,6 +1280,30 @@ static const struct tegra_gpio_soc tegra241_aon_soc = { .has_vm_support = false, }; +#define TEGRA256_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + [TEGRA256_MAIN_GPIO_PORT_##_name] = { \ + .name = #_name, \ + .bank = _bank, \ + .port = _port, \ + .pins = _pins, \ + } + +static const struct tegra_gpio_port tegra256_main_ports[] = { + TEGRA256_MAIN_GPIO_PORT(A, 0, 0, 8), +
TEGRA256_MAIN_GPIO_PORT(B, 0, 1, 8), + TEGRA256_MAIN_GPIO_PORT(C, 0, 2, 8), + TEGRA256_MAIN_GPIO_PORT(D, 0, 3, 8), +}; + +static const struct tegra_gpio_soc tegra256_main_soc = { + .num_ports = ARRAY_SIZE(tegra256_main_ports), + .ports = tegra256_main_ports, + .name = "tegra256-gpio-main", + .instance = 1, + .num_irqs_per_bank = 8, + .has_vm_support = true, +}; + static const struct of_device_id tegra186_gpio_of_match[] = { { .compatible = "nvidia,tegra186-gpio", @@ -1298,6 +1323,9 @@ static const struct of_device_id tegra186_gpio_of_match[] = { }, { .compatible = "nvidia,tegra234-gpio-aon", .data = &tegra234_aon_soc + }, { + .compatible = "nvidia,tegra256-gpio", + .data = &tegra256_main_soc }, { /* sentinel */ } diff --git a/drivers/gpio/gpio-ts4800.c b/drivers/gpio/gpio-ts4800.c index 4748e3d47106cd..992ee231db9ff8 100644 --- a/drivers/gpio/gpio-ts4800.c +++ b/drivers/gpio/gpio-ts4800.c @@ -6,9 +6,10 @@ */ #include +#include #include -#include #include +#include #define DEFAULT_PIN_NUMBER 16 #define INPUT_REG_OFFSET 0x00 @@ -17,13 +18,14 @@ static int ts4800_gpio_probe(struct platform_device *pdev) { - struct device_node *node; - struct gpio_chip *chip; + struct gpio_generic_chip_config config; + struct device *dev = &pdev->dev; + struct gpio_generic_chip *chip; void __iomem *base_addr; int retval; u32 ngpios; - chip = devm_kzalloc(&pdev->dev, sizeof(struct gpio_chip), GFP_KERNEL); + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; @@ -31,29 +33,28 @@ static int ts4800_gpio_probe(struct platform_device *pdev) if (IS_ERR(base_addr)) return PTR_ERR(base_addr); - node = pdev->dev.of_node; - if (!node) - return -EINVAL; - - retval = of_property_read_u32(node, "ngpios", &ngpios); + retval = device_property_read_u32(dev, "ngpios", &ngpios); if (retval == -EINVAL) ngpios = DEFAULT_PIN_NUMBER; else if (retval) return retval; - retval = bgpio_init(chip, &pdev->dev, 2, base_addr + INPUT_REG_OFFSET, - base_addr + OUTPUT_REG_OFFSET, NULL, - base_addr + DIRECTION_REG_OFFSET, NULL, 0); - if (retval) { - dev_err(&pdev->dev, "bgpio_init failed\n"); - return retval; - } + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 2, + .dat = base_addr + INPUT_REG_OFFSET, + .set = base_addr + OUTPUT_REG_OFFSET, + .dirout = base_addr + DIRECTION_REG_OFFSET, + }; - chip->ngpio = ngpios; + retval = gpio_generic_chip_init(chip, &config); + if (retval) + return dev_err_probe(dev, retval, + "failed to initialize the generic GPIO chip\n"); - platform_set_drvdata(pdev, chip); + chip->gc.ngpio = ngpios; - return devm_gpiochip_add_data(&pdev->dev, chip, NULL); + return devm_gpiochip_add_data(dev, &chip->gc, NULL); } static const struct of_device_id ts4800_gpio_of_match[] = { diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c index a33dc7c7e7a08d..a851702befdea7 100644 --- a/drivers/gpio/gpio-twl4030.c +++ b/drivers/gpio/gpio-twl4030.c @@ -597,9 +597,7 @@ static int gpio_twl4030_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(&pdev->dev, gpio_twl4030_power_off_action, d); if (ret) - return dev_err_probe(&pdev->dev, ret, - "failed to install power off handler\n"); - + return ret; } return 0; diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index 7de0d5b53d5604..aa8586d8a787f0 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ struct fsl_gpio_soc_data { }; struct vf610_gpio_port { - struct gpio_chip gc; + 
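/*
 * The TEGRA256_MAIN_GPIO_PORT() table added above relies on designated
 * array initializers keyed by port enumerators assumed to come from the
 * SoC header. A sketch of what one entry expands to:
 *
 *	[TEGRA256_MAIN_GPIO_PORT_A] = {
 *		.name = "A",
 *		.bank = 0,
 *		.port = 0,
 *		.pins = 8,
 *	},
 */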
struct gpio_generic_chip chip; void __iomem *base; void __iomem *gpio_base; const struct fsl_gpio_soc_data *sdata; @@ -108,7 +109,7 @@ static void vf610_gpio_irq_handler(struct irq_desc *desc) for_each_set_bit(pin, &irq_isfr, VF610_GPIO_PER_PORT) { vf610_gpio_writel(BIT(pin), port->base + PORT_ISFR); - generic_handle_domain_irq(port->gc.irq.domain, pin); + generic_handle_domain_irq(port->chip.gc.irq.domain, pin); } chained_irq_exit(chip, desc); @@ -214,6 +215,7 @@ static void vf610_gpio_disable_clk(void *data) static int vf610_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct vf610_gpio_port *port; struct gpio_chip *gc; @@ -293,22 +295,27 @@ static int vf610_gpio_probe(struct platform_device *pdev) return ret; } - gc = &port->gc; - flags = BGPIOF_PINCTRL_BACKEND; + gc = &port->chip.gc; + flags = GPIO_GENERIC_PINCTRL_BACKEND; /* * We only read the output register for current value on output * lines if the direction register is available so we can switch * direction. */ if (port->sdata->have_paddr) - flags |= BGPIOF_READ_OUTPUT_REG_SET; - ret = bgpio_init(gc, dev, 4, - port->gpio_base + GPIO_PDIR, - port->gpio_base + GPIO_PDOR, - NULL, - port->sdata->have_paddr ? port->gpio_base + GPIO_PDDR : NULL, - NULL, - flags); + flags |= GPIO_GENERIC_READ_OUTPUT_REG_SET; + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = port->gpio_base + GPIO_PDIR, + .set = port->gpio_base + GPIO_PDOR, + .dirout = port->sdata->have_paddr ? + port->gpio_base + GPIO_PDDR : NULL, + .flags = flags, + }; + + ret = gpio_generic_chip_init(&port->chip, &config); if (ret) return dev_err_probe(dev, ret, "unable to init generic GPIO\n"); gc->label = dev_name(dev); diff --git a/drivers/gpio/gpio-visconti.c b/drivers/gpio/gpio-visconti.c index 5bd965c18a465f..6d5d829634ad76 100644 --- a/drivers/gpio/gpio-visconti.c +++ b/drivers/gpio/gpio-visconti.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,7 @@ struct visconti_gpio { void __iomem *base; spinlock_t lock; /* protect gpio register */ - struct gpio_chip gpio_chip; + struct gpio_generic_chip chip; struct device *dev; }; @@ -158,6 +159,7 @@ static const struct irq_chip visconti_gpio_irq_chip = { static int visconti_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct visconti_gpio *priv; struct gpio_irq_chip *girq; @@ -189,19 +191,22 @@ static int visconti_gpio_probe(struct platform_device *pdev) return -ENODEV; } - ret = bgpio_init(&priv->gpio_chip, dev, 4, - priv->base + GPIO_IDATA, - priv->base + GPIO_OSET, - priv->base + GPIO_OCLR, - priv->base + GPIO_DIR, - NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = priv->base + GPIO_IDATA, + .set = priv->base + GPIO_OSET, + .clr = priv->base + GPIO_OCLR, + .dirout = priv->base + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&priv->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; } - girq = &priv->gpio_chip.irq; + girq = &priv->chip.gc.irq; gpio_irq_chip_set_chip(girq, &visconti_gpio_irq_chip); girq->fwnode = dev_fwnode(dev); girq->parent_domain = parent; @@ -210,7 +215,7 @@ static int visconti_gpio_probe(struct platform_device *pdev) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; - return devm_gpiochip_add_data(dev, &priv->gpio_chip, priv); + return devm_gpiochip_add_data(dev, &priv->chip.gc, priv); } static const 
struct of_device_id visconti_gpio_of_match[] = { diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c index f03c0e808fab27..489479d6f32b3a 100644 --- a/drivers/gpio/gpio-wm831x.c +++ b/drivers/gpio/gpio-wm831x.c @@ -159,7 +159,6 @@ static void wm831x_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) int i, tristated; for (i = 0; i < chip->ngpio; i++) { - int gpio = i + chip->base; int reg; const char *pull, *powerdomain; @@ -175,13 +174,13 @@ static void wm831x_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) } seq_printf(s, " gpio-%-3d (%-20.20s) ", - gpio, label ?: "Unrequested"); + i, label ?: "Unrequested"); reg = wm831x_reg_read(wm831x, WM831X_GPIO1_CONTROL + i); if (reg < 0) { dev_err(wm831x->dev, "GPIO control %d read failed: %d\n", - gpio, reg); + i, reg); seq_putc(s, '\n'); continue; } diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c index df47a27f508d94..a0665cf3ff2f45 100644 --- a/drivers/gpio/gpio-wm8994.c +++ b/drivers/gpio/gpio-wm8994.c @@ -194,7 +194,6 @@ static void wm8994_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) int i; for (i = 0; i < chip->ngpio; i++) { - int gpio = i + chip->base; int reg; /* We report the GPIO even if it's not requested since @@ -208,14 +207,13 @@ static void wm8994_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) continue; } - seq_printf(s, " gpio-%-3d (%-20.20s) ", gpio, + seq_printf(s, " gpio-%-3d (%-20.20s) ", i, label ?: "Unrequested"); reg = wm8994_reg_read(wm8994, WM8994_GPIO_1 + i); if (reg < 0) { dev_err(wm8994->dev, - "GPIO control %d read failed: %d\n", - gpio, reg); + "GPIO control %d read failed: %d\n", i, reg); seq_printf(s, "\n"); continue; } diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index b51b1fa726bb5a..661259f026e191 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -21,6 +21,7 @@ #include #include +#include #include "gpiolib-acpi.h" @@ -40,7 +41,7 @@ /** * struct xgene_gpio_sb - GPIO-Standby private data structure. - * @gc: memory-mapped GPIO controllers. 
+ * @chip: Generic GPIO chip data * @regs: GPIO register base offset * @irq_domain: GPIO interrupt domain * @irq_start: GPIO pin that start support interrupt @@ -48,7 +49,7 @@ * @parent_irq_base: Start parent HWIRQ */ struct xgene_gpio_sb { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *regs; struct irq_domain *irq_domain; u16 irq_start; @@ -62,14 +63,15 @@ struct xgene_gpio_sb { static void xgene_gpio_set_bit(struct gpio_chip *gc, void __iomem *reg, u32 gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); u32 data; - data = gc->read_reg(reg); + data = gpio_generic_read_reg(chip, reg); if (val) data |= GPIO_MASK(gpio); else data &= ~GPIO_MASK(gpio); - gc->write_reg(reg, data); + gpio_generic_write_reg(chip, reg, data); } static int xgene_gpio_sb_irq_set_type(struct irq_data *d, unsigned int type) @@ -91,9 +93,9 @@ static int xgene_gpio_sb_irq_set_type(struct irq_data *d, unsigned int type) break; } - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_INT_LVL, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_INT_LVL, d->hwirq, lvl_type); /* Propagate IRQ type setting to parent */ @@ -109,14 +111,14 @@ static void xgene_gpio_sb_irq_mask(struct irq_data *d) irq_chip_mask_parent(d); - gpiochip_disable_irq(&priv->gc, d->hwirq); + gpiochip_disable_irq(&priv->chip.gc, d->hwirq); } static void xgene_gpio_sb_irq_unmask(struct irq_data *d) { struct xgene_gpio_sb *priv = irq_data_get_irq_chip_data(d); - gpiochip_enable_irq(&priv->gc, d->hwirq); + gpiochip_enable_irq(&priv->chip.gc, d->hwirq); irq_chip_unmask_parent(d); } @@ -155,15 +157,15 @@ static int xgene_gpio_sb_domain_activate(struct irq_domain *d, u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); int ret; - ret = gpiochip_lock_as_irq(&priv->gc, gpio); + ret = gpiochip_lock_as_irq(&priv->chip.gc, gpio); if (ret) { - dev_err(priv->gc.parent, + dev_err(priv->chip.gc.parent, "Unable to configure XGene GPIO standby pin %d as IRQ\n", gpio); return ret; } - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 1); return 0; } @@ -174,8 +176,8 @@ static void xgene_gpio_sb_domain_deactivate(struct irq_domain *d, struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); - gpiochip_unlock_as_irq(&priv->gc, gpio); - xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO, + gpiochip_unlock_as_irq(&priv->chip.gc, gpio); + xgene_gpio_set_bit(&priv->chip.gc, priv->regs + MPA_GPIO_SEL_LO, gpio * 2, 0); } @@ -237,6 +239,7 @@ static const struct irq_domain_ops xgene_gpio_sb_domain_ops = { static int xgene_gpio_sb_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct xgene_gpio_sb *priv; int ret; void __iomem *regs; @@ -263,14 +266,19 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) return -ENODEV; } - ret = bgpio_init(&priv->gc, &pdev->dev, 4, - regs + MPA_GPIO_IN_ADDR, - regs + MPA_GPIO_OUT_ADDR, NULL, - regs + MPA_GPIO_OE_ADDR, NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = regs + MPA_GPIO_IN_ADDR, + .set = regs + MPA_GPIO_OUT_ADDR, + .dirout = regs + MPA_GPIO_OE_ADDR, + }; + + ret = gpio_generic_chip_init(&priv->chip, &config); if (ret) return ret; - priv->gc.to_irq = xgene_gpio_sb_to_irq; + priv->chip.gc.to_irq = xgene_gpio_sb_to_irq; /* 
Retrieve start irq pin, use default if property not found */ priv->irq_start = XGENE_DFLT_IRQ_START_PIN; @@ -283,12 +291,12 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) priv->nirq = val32; /* Retrieve number gpio, use default if property not found */ - priv->gc.ngpio = XGENE_DFLT_MAX_NGPIO; + priv->chip.gc.ngpio = XGENE_DFLT_MAX_NGPIO; if (!device_property_read_u32(&pdev->dev, "apm,nr-gpios", &val32)) - priv->gc.ngpio = val32; + priv->chip.gc.ngpio = val32; dev_info(&pdev->dev, "Support %d gpios, %d irqs start from pin %d\n", - priv->gc.ngpio, priv->nirq, priv->irq_start); + priv->chip.gc.ngpio, priv->nirq, priv->irq_start); platform_set_drvdata(pdev, priv); @@ -298,9 +306,9 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) if (!priv->irq_domain) return -ENODEV; - priv->gc.irq.domain = priv->irq_domain; + priv->chip.gc.irq.domain = priv->irq_domain; - ret = devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv); + ret = devm_gpiochip_add_data(&pdev->dev, &priv->chip.gc, priv); if (ret) { dev_err(&pdev->dev, "failed to register X-Gene GPIO Standby driver\n"); @@ -311,7 +319,7 @@ static int xgene_gpio_sb_probe(struct platform_device *pdev) dev_info(&pdev->dev, "X-Gene GPIO Standby driver registered\n"); /* Register interrupt handlers for GPIO signaled ACPI Events */ - acpi_gpiochip_request_interrupts(&priv->gc); + acpi_gpiochip_request_interrupts(&priv->chip.gc); return ret; } @@ -320,7 +328,7 @@ static void xgene_gpio_sb_remove(struct platform_device *pdev) { struct xgene_gpio_sb *priv = platform_get_drvdata(pdev); - acpi_gpiochip_free_interrupts(&priv->gc); + acpi_gpiochip_free_interrupts(&priv->chip.gc); irq_domain_remove(priv->irq_domain); } diff --git a/drivers/gpio/gpio-xgs-iproc.c b/drivers/gpio/gpio-xgs-iproc.c index 93544e98ccbd3f..77eb29dcc2171a 100644 --- a/drivers/gpio/gpio-xgs-iproc.c +++ b/drivers/gpio/gpio-xgs-iproc.c @@ -3,11 +3,12 @@ * Copyright (C) 2017 Broadcom */ -#include #include #include #include #include +#include +#include #include #include #include @@ -28,7 +29,7 @@ #define IPROC_GPIO_CCA_INT_EDGE 0x24 struct iproc_gpio_chip { - struct gpio_chip gc; + struct gpio_generic_chip gen_gc; spinlock_t lock; struct device *dev; void __iomem *base; @@ -38,7 +39,7 @@ struct iproc_gpio_chip { static inline struct iproc_gpio_chip * to_iproc_gpio(struct gpio_chip *gc) { - return container_of(gc, struct iproc_gpio_chip, gc); + return container_of(to_gpio_generic_chip(gc), struct iproc_gpio_chip, gen_gc); } static void iproc_gpio_irq_ack(struct irq_data *d) @@ -213,6 +214,7 @@ static const struct irq_chip iproc_gpio_irq_chip = { static int iproc_gpio_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct device_node *dn = pdev->dev.of_node; struct iproc_gpio_chip *chip; @@ -231,21 +233,23 @@ static int iproc_gpio_probe(struct platform_device *pdev) if (IS_ERR(chip->base)) return PTR_ERR(chip->base); - ret = bgpio_init(&chip->gc, dev, 4, - chip->base + IPROC_GPIO_CCA_DIN, - chip->base + IPROC_GPIO_CCA_DOUT, - NULL, - chip->base + IPROC_GPIO_CCA_OUT_EN, - NULL, - 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = chip->base + IPROC_GPIO_CCA_DIN, + .set = chip->base + IPROC_GPIO_CCA_DOUT, + .dirout = chip->base + IPROC_GPIO_CCA_OUT_EN, + }; + + ret = gpio_generic_chip_init(&chip->gen_gc, &config); if (ret) { dev_err(dev, "unable to init GPIO chip\n"); return ret; } - chip->gc.label = dev_name(dev); + chip->gen_gc.gc.label = dev_name(dev); if 
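/*
 * The xgene_gpio_set_bit() rewrite above goes through the generic
 * chip's register accessors rather than the old gpio_chip read_reg()
 * and write_reg() hooks. A minimal read-modify-write sketch, assuming a
 * struct gpio_generic_chip pointer, a register address and a bit mask:
 *
 *	u32 val;
 *
 *	val = gpio_generic_read_reg(chip, reg);
 *	val |= mask;
 *	gpio_generic_write_reg(chip, reg, val);
 */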
(!of_property_read_u32(dn, "ngpios", &num_gpios)) - chip->gc.ngpio = num_gpios; + chip->gen_gc.gc.ngpio = num_gpios; irq = platform_get_irq(pdev, 0); if (irq > 0) { @@ -266,13 +270,13 @@ static int iproc_gpio_probe(struct platform_device *pdev) * a flow-handler because the irq is shared. */ ret = devm_request_irq(dev, irq, iproc_gpio_irq_handler, - IRQF_SHARED, chip->gc.label, &chip->gc); + IRQF_SHARED, chip->gen_gc.gc.label, &chip->gen_gc.gc); if (ret) { dev_err(dev, "Fail to request IRQ%d: %d\n", irq, ret); return ret; } - girq = &chip->gc.irq; + girq = &chip->gen_gc.gc.irq; gpio_irq_chip_set_chip(girq, &iproc_gpio_irq_chip); /* This will let us handle the parent IRQ in the driver */ girq->parent_handler = NULL; @@ -282,7 +286,7 @@ static int iproc_gpio_probe(struct platform_device *pdev) girq->handler = handle_simple_irq; } - ret = devm_gpiochip_add_data(dev, &chip->gc, chip); + ret = devm_gpiochip_add_data(dev, &chip->gen_gc.gc, chip); if (ret) { dev_err(dev, "unable to add GPIO chip\n"); return ret; diff --git a/drivers/gpio/gpio-xra1403.c b/drivers/gpio/gpio-xra1403.c index faadcb4b0b2df0..7f3c98f9f90201 100644 --- a/drivers/gpio/gpio-xra1403.c +++ b/drivers/gpio/gpio-xra1403.c @@ -135,8 +135,7 @@ static void xra1403_dbg_show(struct seq_file *s, struct gpio_chip *chip) gcr = value[XRA_GCR + 1] << 8 | value[XRA_GCR]; gsr = value[XRA_GSR + 1] << 8 | value[XRA_GSR]; for_each_requested_gpio(chip, i, label) { - seq_printf(s, " gpio-%-3d (%-12s) %s %s\n", - chip->base + i, label, + seq_printf(s, " gpio-%-3d (%-12s) %s %s\n", i, label, (gcr & BIT(i)) ? "in" : "out", str_hi_lo(gsr & BIT(i))); } diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c index 12b24a717e43f1..284e762d92c4d4 100644 --- a/drivers/gpio/gpiolib-acpi-core.c +++ b/drivers/gpio/gpiolib-acpi-core.c @@ -942,8 +942,9 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, { struct acpi_device *adev = to_acpi_device_node(fwnode); bool can_fallback = acpi_can_fallback_to_crs(adev, con_id); - struct acpi_gpio_info info; + struct acpi_gpio_info info = {}; struct gpio_desc *desc; + int ret; desc = __acpi_find_gpio(fwnode, con_id, idx, can_fallback, &info); if (IS_ERR(desc)) @@ -957,6 +958,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, acpi_gpio_update_gpiod_flags(dflags, &info); acpi_gpio_update_gpiod_lookup_flags(lookupflags, &info); + + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, info.debounce * 10); + if (ret) + return ERR_PTR(ret); + return desc; } @@ -992,7 +999,7 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int ret; for (i = 0, idx = 0; idx <= index; i++) { - struct acpi_gpio_info info; + struct acpi_gpio_info info = {}; struct gpio_desc *desc; /* Ignore -EPROBE_DEFER, it only matters if idx matches */ diff --git a/drivers/gpio/gpiolib-acpi-quirks.c b/drivers/gpio/gpiolib-acpi-quirks.c index bfb04e67c4bc87..7b95d1b0336149 100644 --- a/drivers/gpio/gpiolib-acpi-quirks.c +++ b/drivers/gpio/gpiolib-acpi-quirks.c @@ -317,6 +317,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "PNP0C50:00@8", }, }, + { + /* + * Same as G1619-04. New model. 
+ */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "PNP0C50:00@8", + }, + }, { /* * Spurious wakeups from GPIO 11 diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index e6a289fa0f8fd5..175836467f216a 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -144,17 +144,17 @@ static void linehandle_flags_to_desc_flags(u32 lflags, unsigned long *flagsp) { unsigned long flags = READ_ONCE(*flagsp); - assign_bit(FLAG_ACTIVE_LOW, &flags, + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &flags, lflags & GPIOHANDLE_REQUEST_ACTIVE_LOW); - assign_bit(FLAG_OPEN_DRAIN, &flags, + assign_bit(GPIOD_FLAG_OPEN_DRAIN, &flags, lflags & GPIOHANDLE_REQUEST_OPEN_DRAIN); - assign_bit(FLAG_OPEN_SOURCE, &flags, + assign_bit(GPIOD_FLAG_OPEN_SOURCE, &flags, lflags & GPIOHANDLE_REQUEST_OPEN_SOURCE); - assign_bit(FLAG_PULL_UP, &flags, + assign_bit(GPIOD_FLAG_PULL_UP, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_PULL_UP); - assign_bit(FLAG_PULL_DOWN, &flags, + assign_bit(GPIOD_FLAG_PULL_DOWN, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_PULL_DOWN); - assign_bit(FLAG_BIAS_DISABLE, &flags, + assign_bit(GPIOD_FLAG_BIAS_DISABLE, &flags, lflags & GPIOHANDLE_REQUEST_BIAS_DISABLE); WRITE_ONCE(*flagsp, flags); @@ -238,7 +238,7 @@ static long linehandle_ioctl(struct file *file, unsigned int cmd, * All line descriptors were created at once with the same * flags so just check if the first one is really output. */ - if (!test_bit(FLAG_IS_OUT, &lh->descs[0]->flags)) + if (!test_bit(GPIOD_FLAG_IS_OUT, &lh->descs[0]->flags)) return -EPERM; if (copy_from_user(&ghd, ip, sizeof(ghd))) @@ -599,10 +599,10 @@ static void linereq_put_event(struct linereq *lr, static u64 line_event_timestamp(struct line *line) { - if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &line->desc->flags)) + if (test_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &line->desc->flags)) return ktime_get_real_ns(); else if (IS_ENABLED(CONFIG_HTE) && - test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) + test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &line->desc->flags)) return line->timestamp_ns; return ktime_get_ns(); @@ -725,11 +725,11 @@ static int hte_edge_setup(struct line *line, u64 eflags) struct hte_ts_desc *hdesc = &line->hdesc; if (eflags & GPIO_V2_LINE_FLAG_EDGE_RISING) - flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? HTE_FALLING_EDGE_TS : HTE_RISING_EDGE_TS; if (eflags & GPIO_V2_LINE_FLAG_EDGE_FALLING) - flags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? HTE_RISING_EDGE_TS : HTE_FALLING_EDGE_TS; @@ -831,7 +831,7 @@ static bool debounced_value(struct line *line) */ value = READ_ONCE(line->level); - if (test_bit(FLAG_ACTIVE_LOW, &line->desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags)) value = !value; return value; @@ -939,7 +939,7 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) return level; if (!(IS_ENABLED(CONFIG_HTE) && - test_bit(FLAG_EVENT_CLOCK_HTE, &line->desc->flags))) { + test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &line->desc->flags))) { irq = gpiod_to_irq(line->desc); if (irq < 0) return -ENXIO; @@ -1061,10 +1061,10 @@ static int edge_detector_setup(struct line *line, return -ENXIO; if (eflags & GPIO_V2_LINE_FLAG_EDGE_RISING) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? 
+ irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; if (eflags & GPIO_V2_LINE_FLAG_EDGE_FALLING) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &line->desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &line->desc->flags) ? IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; irqflags |= IRQF_ONESHOT; @@ -1237,34 +1237,34 @@ static void gpio_v2_line_config_flags_to_desc_flags(u64 lflags, { unsigned long flags = READ_ONCE(*flagsp); - assign_bit(FLAG_ACTIVE_LOW, &flags, + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &flags, lflags & GPIO_V2_LINE_FLAG_ACTIVE_LOW); if (lflags & GPIO_V2_LINE_FLAG_OUTPUT) - set_bit(FLAG_IS_OUT, &flags); + set_bit(GPIOD_FLAG_IS_OUT, &flags); else if (lflags & GPIO_V2_LINE_FLAG_INPUT) - clear_bit(FLAG_IS_OUT, &flags); + clear_bit(GPIOD_FLAG_IS_OUT, &flags); - assign_bit(FLAG_EDGE_RISING, &flags, + assign_bit(GPIOD_FLAG_EDGE_RISING, &flags, lflags & GPIO_V2_LINE_FLAG_EDGE_RISING); - assign_bit(FLAG_EDGE_FALLING, &flags, + assign_bit(GPIOD_FLAG_EDGE_FALLING, &flags, lflags & GPIO_V2_LINE_FLAG_EDGE_FALLING); - assign_bit(FLAG_OPEN_DRAIN, &flags, + assign_bit(GPIOD_FLAG_OPEN_DRAIN, &flags, lflags & GPIO_V2_LINE_FLAG_OPEN_DRAIN); - assign_bit(FLAG_OPEN_SOURCE, &flags, + assign_bit(GPIOD_FLAG_OPEN_SOURCE, &flags, lflags & GPIO_V2_LINE_FLAG_OPEN_SOURCE); - assign_bit(FLAG_PULL_UP, &flags, + assign_bit(GPIOD_FLAG_PULL_UP, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_PULL_UP); - assign_bit(FLAG_PULL_DOWN, &flags, + assign_bit(GPIOD_FLAG_PULL_DOWN, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN); - assign_bit(FLAG_BIAS_DISABLE, &flags, + assign_bit(GPIOD_FLAG_BIAS_DISABLE, &flags, lflags & GPIO_V2_LINE_FLAG_BIAS_DISABLED); - assign_bit(FLAG_EVENT_CLOCK_REALTIME, &flags, + assign_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &flags, lflags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME); - assign_bit(FLAG_EVENT_CLOCK_HTE, &flags, + assign_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &flags, lflags & GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE); WRITE_ONCE(*flagsp, flags); @@ -2115,10 +2115,10 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) } if (eflags & GPIOEVENT_REQUEST_RISING_EDGE) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; if (eflags & GPIOEVENT_REQUEST_FALLING_EDGE) - irqflags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irqflags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; irqflags |= IRQF_ONESHOT; @@ -2253,7 +2253,7 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, scoped_guard(srcu, &desc->gdev->desc_srcu) { label = gpiod_get_label(desc); - if (label && test_bit(FLAG_REQUESTED, &dflags)) + if (label && test_bit(GPIOD_FLAG_REQUESTED, &dflags)) strscpy(info->consumer, label, sizeof(info->consumer)); } @@ -2270,10 +2270,10 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, * The definitive test that a line is available to userspace is to * request it. 
*/ - if (test_bit(FLAG_REQUESTED, &dflags) || - test_bit(FLAG_IS_HOGGED, &dflags) || - test_bit(FLAG_EXPORT, &dflags) || - test_bit(FLAG_SYSFS, &dflags) || + if (test_bit(GPIOD_FLAG_REQUESTED, &dflags) || + test_bit(GPIOD_FLAG_IS_HOGGED, &dflags) || + test_bit(GPIOD_FLAG_EXPORT, &dflags) || + test_bit(GPIOD_FLAG_SYSFS, &dflags) || !gpiochip_line_is_valid(guard.gc, info->offset)) { info->flags |= GPIO_V2_LINE_FLAG_USED; } else if (!atomic) { @@ -2281,34 +2281,34 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, info->flags |= GPIO_V2_LINE_FLAG_USED; } - if (test_bit(FLAG_IS_OUT, &dflags)) + if (test_bit(GPIOD_FLAG_IS_OUT, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OUTPUT; else info->flags |= GPIO_V2_LINE_FLAG_INPUT; - if (test_bit(FLAG_ACTIVE_LOW, &dflags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_ACTIVE_LOW; - if (test_bit(FLAG_OPEN_DRAIN, &dflags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OPEN_DRAIN; - if (test_bit(FLAG_OPEN_SOURCE, &dflags)) + if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_OPEN_SOURCE; - if (test_bit(FLAG_BIAS_DISABLE, &dflags)) + if (test_bit(GPIOD_FLAG_BIAS_DISABLE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_DISABLED; - if (test_bit(FLAG_PULL_DOWN, &dflags)) + if (test_bit(GPIOD_FLAG_PULL_DOWN, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_PULL_DOWN; - if (test_bit(FLAG_PULL_UP, &dflags)) + if (test_bit(GPIOD_FLAG_PULL_UP, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_BIAS_PULL_UP; - if (test_bit(FLAG_EDGE_RISING, &dflags)) + if (test_bit(GPIOD_FLAG_EDGE_RISING, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EDGE_RISING; - if (test_bit(FLAG_EDGE_FALLING, &dflags)) + if (test_bit(GPIOD_FLAG_EDGE_FALLING, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EDGE_FALLING; - if (test_bit(FLAG_EVENT_CLOCK_REALTIME, &dflags)) + if (test_bit(GPIOD_FLAG_EVENT_CLOCK_REALTIME, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_REALTIME; - else if (test_bit(FLAG_EVENT_CLOCK_HTE, &dflags)) + else if (test_bit(GPIOD_FLAG_EVENT_CLOCK_HTE, &dflags)) info->flags |= GPIO_V2_LINE_FLAG_EVENT_CLOCK_HTE; debounce_period_us = READ_ONCE(desc->debounce_period_us); diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 37ab78243faba2..fad4edf9cc5c0c 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -878,7 +878,7 @@ static void of_gpiochip_remove_hog(struct gpio_chip *chip, { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(chip, desc, FLAG_IS_HOGGED) + for_each_gpio_desc_with_flag(chip, desc, GPIOD_FLAG_IS_HOGGED) if (READ_ONCE(desc->hog) == hog) gpiochip_free_own_desc(desc); } diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c index b64106f1cb7b90..9a849245b35880 100644 --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c @@ -131,7 +131,7 @@ static ssize_t direction_show(struct device *dev, scoped_guard(mutex, &data->mutex) { gpiod_get_direction(desc); - value = !!test_bit(FLAG_IS_OUT, &desc->flags); + value = !!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } return sysfs_emit(buf, "%s\n", value ? "out" : "in"); @@ -226,14 +226,14 @@ static int gpio_sysfs_request_irq(struct gpiod_data *data, unsigned char flags) irq_flags = IRQF_SHARED; if (flags & GPIO_IRQF_TRIGGER_FALLING) { - irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irq_flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? 
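/*
 * As in the cdev and sysfs hunks here, an active-low line maps the
 * logical edge requested by userspace onto the opposite physical
 * trigger before it reaches the irqchip. A minimal sketch of the
 * pattern:
 *
 *	if (flags & GPIO_IRQF_TRIGGER_RISING)
 *		irq_flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ?
 *			IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING;
 */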
IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; - set_bit(FLAG_EDGE_FALLING, &desc->flags); + set_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); } if (flags & GPIO_IRQF_TRIGGER_RISING) { - irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? + irq_flags |= test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) ? IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING; - set_bit(FLAG_EDGE_RISING, &desc->flags); + set_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); } /* @@ -260,8 +260,8 @@ static int gpio_sysfs_request_irq(struct gpiod_data *data, unsigned char flags) err_unlock: gpiochip_unlock_as_irq(guard.gc, gpio_chip_hwgpio(desc)); err_clr_bits: - clear_bit(FLAG_EDGE_RISING, &desc->flags); - clear_bit(FLAG_EDGE_FALLING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); return ret; } @@ -281,8 +281,8 @@ static void gpio_sysfs_free_irq(struct gpiod_data *data) data->irq_flags = 0; free_irq(data->irq, data); gpiochip_unlock_as_irq(guard.gc, gpio_chip_hwgpio(desc)); - clear_bit(FLAG_EDGE_RISING, &desc->flags); - clear_bit(FLAG_EDGE_FALLING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &desc->flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &desc->flags); } static const char *const trigger_names[] = { @@ -347,10 +347,10 @@ static int gpio_sysfs_set_active_low(struct gpiod_data *data, int value) struct gpio_desc *desc = data->desc; int status = 0; - if (!!test_bit(FLAG_ACTIVE_LOW, &desc->flags) == !!value) + if (!!test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags) == !!value) return 0; - assign_bit(FLAG_ACTIVE_LOW, &desc->flags, value); + assign_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags, value); /* reconfigure poll(2) support if enabled on one edge only */ if (flags == GPIO_IRQF_TRIGGER_FALLING || @@ -373,7 +373,7 @@ static ssize_t active_low_show(struct device *dev, int value; scoped_guard(mutex, &data->mutex) - value = !!test_bit(FLAG_ACTIVE_LOW, &desc->flags); + value = !!test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); return sysfs_emit(buf, "%d\n", value); } @@ -418,7 +418,7 @@ static umode_t gpio_is_visible(struct kobject *kobj, struct attribute *attr, mode = 0; if (!data->direction_can_change && - test_bit(FLAG_IS_OUT, &data->desc->flags)) + test_bit(GPIOD_FLAG_IS_OUT, &data->desc->flags)) mode = 0; #endif /* CONFIG_GPIO_SYSFS_LEGACY */ } @@ -486,7 +486,7 @@ static int export_gpio_desc(struct gpio_desc *desc) } /* - * No extra locking here; FLAG_SYSFS just signifies that the + * No extra locking here; GPIOD_FLAG_SYSFS just signifies that the * request and export were done on behalf of userspace, so * they may be undone on its behalf too. */ @@ -505,7 +505,7 @@ if (ret < 0) { gpiod_free(desc); } else { - set_bit(FLAG_SYSFS, &desc->flags); + set_bit(GPIOD_FLAG_SYSFS, &desc->flags); gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED); } @@ -515,11 +515,11 @@ static int unexport_gpio_desc(struct gpio_desc *desc) { /* - * No extra locking here; FLAG_SYSFS just signifies that the + * No extra locking here; GPIOD_FLAG_SYSFS just signifies that the * request and export were done on behalf of userspace, so * they may be undone on its behalf too.
*/ - if (!test_and_clear_bit(FLAG_SYSFS, &desc->flags)) + if (!test_and_clear_bit(GPIOD_FLAG_SYSFS, &desc->flags)) return -EINVAL; gpiod_unexport(desc); @@ -748,14 +748,14 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change) if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_EXPORT, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_EXPORT, &desc->flags)) return -EPERM; gdev = desc->gdev; guard(mutex)(&sysfs_lock); - if (!test_bit(FLAG_REQUESTED, &desc->flags)) { + if (!test_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) { gpiod_dbg(desc, "%s: unavailable (not requested)\n", __func__); status = -EPERM; goto err_clear_bit; @@ -866,7 +866,7 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change) #endif /* CONFIG_GPIO_SYSFS_LEGACY */ kfree(desc_data); err_clear_bit: - clear_bit(FLAG_EXPORT, &desc->flags); + clear_bit(GPIOD_FLAG_EXPORT, &desc->flags); gpiod_dbg(desc, "%s: status %d\n", __func__, status); return status; } @@ -937,7 +937,7 @@ void gpiod_unexport(struct gpio_desc *desc) } scoped_guard(mutex, &sysfs_lock) { - if (!test_bit(FLAG_EXPORT, &desc->flags)) + if (!test_bit(GPIOD_FLAG_EXPORT, &desc->flags)) return; gdev = gpiod_to_gpio_device(desc); @@ -956,7 +956,7 @@ void gpiod_unexport(struct gpio_desc *desc) return; list_del(&desc_data->list); - clear_bit(FLAG_EXPORT, &desc->flags); + clear_bit(GPIOD_FLAG_EXPORT, &desc->flags); #if IS_ENABLED(CONFIG_GPIO_SYSFS_LEGACY) sysfs_put(desc_data->value_kn); device_unregister(desc_data->dev); @@ -1073,7 +1073,7 @@ void gpiochip_sysfs_unregister(struct gpio_device *gdev) return; /* unregister gpiod class devices owned by sysfs */ - for_each_gpio_desc_with_flag(chip, desc, FLAG_SYSFS) { + for_each_gpio_desc_with_flag(chip, desc, GPIOD_FLAG_SYSFS) { gpiod_unexport(desc); gpiod_free(desc); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0d2b470a252eeb..9952e412da505c 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -127,10 +127,10 @@ const char *gpiod_get_label(struct gpio_desc *desc) label = srcu_dereference_check(desc->label, &desc->gdev->desc_srcu, srcu_read_lock_held(&desc->gdev->desc_srcu)); - if (test_bit(FLAG_USED_AS_IRQ, &flags)) + if (test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags)) return label ? label->str : "interrupt"; - if (!test_bit(FLAG_REQUESTED, &flags)) + if (!test_bit(GPIOD_FLAG_REQUESTED, &flags)) return NULL; return label ? label->str : NULL; @@ -450,8 +450,8 @@ int gpiod_get_direction(struct gpio_desc *desc) * Open drain emulation using input mode may incorrectly report * input here, fix that up. */ - if (test_bit(FLAG_OPEN_DRAIN, &flags) && - test_bit(FLAG_IS_OUT, &flags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &flags) && + test_bit(GPIOD_FLAG_IS_OUT, &flags)) return 0; if (!guard.gc->get_direction) @@ -468,7 +468,7 @@ int gpiod_get_direction(struct gpio_desc *desc) if (ret > 0) ret = 1; - assign_bit(FLAG_IS_OUT, &flags, !ret); + assign_bit(GPIOD_FLAG_IS_OUT, &flags, !ret); WRITE_ONCE(desc->flags, flags); return ret; @@ -846,7 +846,7 @@ static void gpiochip_free_remaining_irqs(struct gpio_chip *gc) { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(gc, desc, FLAG_USED_AS_IRQ) + for_each_gpio_desc_with_flag(gc, desc, GPIOD_FLAG_USED_AS_IRQ) gpiod_free_irqs(desc); } @@ -1169,10 +1169,10 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * lock here. 
*/ if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) - assign_bit(FLAG_IS_OUT, &desc->flags, + assign_bit(GPIOD_FLAG_IS_OUT, &desc->flags, !gc->get_direction(gc, desc_index)); else - assign_bit(FLAG_IS_OUT, + assign_bit(GPIOD_FLAG_IS_OUT, &desc->flags, !gc->direction_input); } @@ -2349,11 +2349,13 @@ int gpiochip_add_pingroup_range(struct gpio_chip *gc, EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** - * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping + * gpiochip_add_pin_range_with_pins() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space + * @pins: the list of non-consecutive pins to accumulate in this range (if not + * NULL, pin_offset is ignored by pinctrl core) * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * @@ -2365,9 +2367,12 @@ EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); * Returns: * 0 on success, or a negative errno on failure. */ -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; @@ -2385,6 +2390,7 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; + pin_range->range.pins = pins; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); @@ -2394,16 +2400,21 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, kfree(pin_range); return ret; } - chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", - gpio_offset, gpio_offset + npins - 1, - pinctl_name, - pin_offset, pin_offset + npins - 1); + if (pin_range->range.pins) + chip_dbg(gc, "created GPIO range %d->%d ==> %s %d sparse PIN range { %d, ... 
}", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, npins, pins[0]); + else + chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, + pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } -EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); +EXPORT_SYMBOL_GPL(gpiochip_add_pin_range_with_pins); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings @@ -2438,7 +2449,7 @@ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_REQUESTED, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) return -EBUSY; offset = gpio_chip_hwgpio(desc); @@ -2467,7 +2478,7 @@ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) return 0; out_clear_bit: - clear_bit(FLAG_REQUESTED, &desc->flags); + clear_bit(GPIOD_FLAG_REQUESTED, &desc->flags); return ret; } @@ -2501,20 +2512,20 @@ static void gpiod_free_commit(struct gpio_desc *desc) flags = READ_ONCE(desc->flags); - if (guard.gc && test_bit(FLAG_REQUESTED, &flags)) { + if (guard.gc && test_bit(GPIOD_FLAG_REQUESTED, &flags)) { if (guard.gc->free) guard.gc->free(guard.gc, gpio_chip_hwgpio(desc)); - clear_bit(FLAG_ACTIVE_LOW, &flags); - clear_bit(FLAG_REQUESTED, &flags); - clear_bit(FLAG_OPEN_DRAIN, &flags); - clear_bit(FLAG_OPEN_SOURCE, &flags); - clear_bit(FLAG_PULL_UP, &flags); - clear_bit(FLAG_PULL_DOWN, &flags); - clear_bit(FLAG_BIAS_DISABLE, &flags); - clear_bit(FLAG_EDGE_RISING, &flags); - clear_bit(FLAG_EDGE_FALLING, &flags); - clear_bit(FLAG_IS_HOGGED, &flags); + clear_bit(GPIOD_FLAG_ACTIVE_LOW, &flags); + clear_bit(GPIOD_FLAG_REQUESTED, &flags); + clear_bit(GPIOD_FLAG_OPEN_DRAIN, &flags); + clear_bit(GPIOD_FLAG_OPEN_SOURCE, &flags); + clear_bit(GPIOD_FLAG_PULL_UP, &flags); + clear_bit(GPIOD_FLAG_PULL_DOWN, &flags); + clear_bit(GPIOD_FLAG_BIAS_DISABLE, &flags); + clear_bit(GPIOD_FLAG_EDGE_RISING, &flags); + clear_bit(GPIOD_FLAG_EDGE_FALLING, &flags); + clear_bit(GPIOD_FLAG_IS_HOGGED, &flags); #ifdef CONFIG_OF_DYNAMIC WRITE_ONCE(desc->hog, NULL); #endif @@ -2557,7 +2568,7 @@ char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset) if (IS_ERR(desc)) return NULL; - if (!test_bit(FLAG_REQUESTED, &desc->flags)) + if (!test_bit(GPIOD_FLAG_REQUESTED, &desc->flags)) return NULL; guard(srcu)(&desc->gdev->desc_srcu); @@ -2725,11 +2736,11 @@ static int gpio_set_bias(struct gpio_desc *desc) flags = READ_ONCE(desc->flags); - if (test_bit(FLAG_BIAS_DISABLE, &flags)) + if (test_bit(GPIOD_FLAG_BIAS_DISABLE, &flags)) bias = PIN_CONFIG_BIAS_DISABLE; - else if (test_bit(FLAG_PULL_UP, &flags)) + else if (test_bit(GPIOD_FLAG_PULL_UP, &flags)) bias = PIN_CONFIG_BIAS_PULL_UP; - else if (test_bit(FLAG_PULL_DOWN, &flags)) + else if (test_bit(GPIOD_FLAG_PULL_DOWN, &flags)) bias = PIN_CONFIG_BIAS_PULL_DOWN; else return 0; @@ -2871,7 +2882,7 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) } } if (ret == 0) { - clear_bit(FLAG_IS_OUT, &desc->flags); + clear_bit(GPIOD_FLAG_IS_OUT, &desc->flags); ret = gpio_set_bias(desc); } @@ -2944,7 +2955,7 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) } if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); trace_gpio_value(desc_to_gpio(desc), 0, val); trace_gpio_direction(desc_to_gpio(desc), 0, ret); return ret; @@ -3010,21 +3021,21 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int 
value) flags = READ_ONCE(desc->flags); - if (test_bit(FLAG_ACTIVE_LOW, &flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &flags)) value = !value; else value = !!value; /* GPIOs used for enabled IRQs shall not be set as output */ - if (test_bit(FLAG_USED_AS_IRQ, &flags) && - test_bit(FLAG_IRQ_IS_ENABLED, &flags)) { + if (test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags) && + test_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &flags)) { gpiod_err(desc, "%s: tried to set a GPIO tied to an IRQ as output\n", __func__); return -EIO; } - if (test_bit(FLAG_OPEN_DRAIN, &flags)) { + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &flags)) { /* First see if we can enable open drain in hardware */ ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_DRAIN); if (!ret) @@ -3032,7 +3043,7 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) /* Emulate open drain by not actively driving the line high */ if (value) goto set_output_flag; - } else if (test_bit(FLAG_OPEN_SOURCE, &flags)) { + } else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &flags)) { ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_SOURCE); if (!ret) goto set_output_value; @@ -3059,7 +3070,7 @@ int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) * set the IS_OUT flag or otherwise we won't be able to set the line * value anymore. */ - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); return 0; } @@ -3199,10 +3210,10 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { VALIDATE_DESC(desc); /* - * Handle FLAG_TRANSITORY first, enabling queries to gpiolib for + * Handle GPIOD_FLAG_TRANSITORY first, enabling queries to gpiolib for * persistence state. */ - assign_bit(FLAG_TRANSITORY, &desc->flags, transitory); + assign_bit(GPIOD_FLAG_TRANSITORY, &desc->flags, transitory); /* If the driver supports it, set the persistence state now */ return gpio_set_config_with_argument_optional(desc, @@ -3220,7 +3231,7 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) int gpiod_is_active_low(const struct gpio_desc *desc) { VALIDATE_DESC(desc); - return test_bit(FLAG_ACTIVE_LOW, &desc->flags); + return test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); } EXPORT_SYMBOL_GPL(gpiod_is_active_low); @@ -3231,7 +3242,7 @@ EXPORT_SYMBOL_GPL(gpiod_is_active_low); void gpiod_toggle_active_low(struct gpio_desc *desc) { VALIDATE_DESC_VOID(desc); - change_bit(FLAG_ACTIVE_LOW, &desc->flags); + change_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); } EXPORT_SYMBOL_GPL(gpiod_toggle_active_low); @@ -3437,7 +3448,7 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(hwgpio, bits); - if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (!raw && test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; __assign_bit(j, value_bitmap, value); trace_gpio_value(desc_to_gpio(desc), 1, value); @@ -3499,7 +3510,7 @@ int gpiod_get_value(const struct gpio_desc *desc) if (value < 0) return value; - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; @@ -3582,7 +3593,7 @@ static int gpio_set_open_drain_value_commit(struct gpio_desc *desc, bool value) } else { ret = gpiochip_direction_output(guard.gc, offset, 0); if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } trace_gpio_direction(desc_to_gpio(desc), value, ret); if (ret < 0) @@ -3609,7 +3620,7 @@ static int 
gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) if (value) { ret = gpiochip_direction_output(guard.gc, offset, 1); if (!ret) - set_bit(FLAG_IS_OUT, &desc->flags); + set_bit(GPIOD_FLAG_IS_OUT, &desc->flags); } else { ret = gpiochip_direction_input(guard.gc, offset); } @@ -3624,7 +3635,7 @@ static int gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) static int gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) { - if (unlikely(!test_bit(FLAG_IS_OUT, &desc->flags))) + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags))) return -EPERM; CLASS(gpio_chip_guard, guard)(desc); @@ -3694,7 +3705,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, WARN_ON(array_info->gdev->can_sleep); for (i = 0; i < array_size; i++) { - if (unlikely(!test_bit(FLAG_IS_OUT, + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc_array[i]->flags))) return -EPERM; } @@ -3758,7 +3769,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(i, value_bitmap); - if (unlikely(!test_bit(FLAG_IS_OUT, &desc->flags))) + if (unlikely(!test_bit(GPIOD_FLAG_IS_OUT, &desc->flags))) return -EPERM; /* @@ -3768,16 +3779,16 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, */ if (!raw && !(array_info && test_bit(i, array_info->invert_mask)) && - test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; trace_gpio_value(desc_to_gpio(desc), 0, value); /* * collect all normal outputs belonging to the same chip * open drain and open source outputs are set individually */ - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && !raw) { + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags) && !raw) { gpio_set_open_drain_value_commit(desc, value); - } else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags) && !raw) { + } else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags) && !raw) { gpio_set_open_source_value_commit(desc, value); } else { __set_bit(hwgpio, mask); @@ -3843,12 +3854,12 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_value); */ static int gpiod_set_value_nocheck(struct gpio_desc *desc, int value) { - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)) return gpio_set_open_drain_value_commit(desc, value); - else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) + else if (test_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags)) return gpio_set_open_source_value_commit(desc, value); return gpiod_set_raw_value_commit(desc, value); @@ -4052,16 +4063,16 @@ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) } /* To be valid for IRQ the line needs to be input or open drain */ - if (test_bit(FLAG_IS_OUT, &desc->flags) && - !test_bit(FLAG_OPEN_DRAIN, &desc->flags)) { + if (test_bit(GPIOD_FLAG_IS_OUT, &desc->flags) && + !test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)) { chip_err(gc, "%s: tried to flag a GPIO set as output for IRQ\n", __func__); return -EIO; } - set_bit(FLAG_USED_AS_IRQ, &desc->flags); - set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + set_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags); + set_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); return 0; } @@ -4083,8 +4094,8 @@ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset) if (IS_ERR(desc)) return; - clear_bit(FLAG_USED_AS_IRQ, &desc->flags); - clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + clear_bit(GPIOD_FLAG_USED_AS_IRQ, 
&desc->flags); + clear_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq); @@ -4093,8 +4104,8 @@ void gpiochip_disable_irq(struct gpio_chip *gc, unsigned int offset) struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && - !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) - clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + !WARN_ON(!test_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags))) + clear_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_disable_irq); @@ -4103,14 +4114,14 @@ void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset) struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && - !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) { + !WARN_ON(!test_bit(GPIOD_FLAG_USED_AS_IRQ, &desc->flags))) { /* * We must not be output when using IRQ UNLESS we are * open drain. */ - WARN_ON(test_bit(FLAG_IS_OUT, &desc->flags) && - !test_bit(FLAG_OPEN_DRAIN, &desc->flags)); - set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); + WARN_ON(test_bit(GPIOD_FLAG_IS_OUT, &desc->flags) && + !test_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags)); + set_bit(GPIOD_FLAG_IRQ_IS_ENABLED, &desc->flags); } } EXPORT_SYMBOL_GPL(gpiochip_enable_irq); @@ -4120,7 +4131,7 @@ bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_irq); @@ -4153,7 +4164,7 @@ bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_drain); @@ -4162,7 +4173,7 @@ bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return test_bit(FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); + return test_bit(GPIOD_FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_source); @@ -4171,7 +4182,7 @@ bool gpiochip_line_is_persistent(struct gpio_chip *gc, unsigned int offset) if (offset >= gc->ngpio) return false; - return !test_bit(FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); + return !test_bit(GPIOD_FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_persistent); @@ -4213,7 +4224,7 @@ int gpiod_get_value_cansleep(const struct gpio_desc *desc) if (value < 0) return value; - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + if (test_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; @@ -4604,6 +4615,23 @@ static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode, return desc; } +static struct gpio_desc *gpiod_fwnode_lookup(struct fwnode_handle *fwnode, + struct device *consumer, + const char *con_id, + unsigned int idx, + enum gpiod_flags *flags, + unsigned long *lookupflags) +{ + struct gpio_desc *desc; + + desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, flags, lookupflags); + if (gpiod_not_found(desc) && !IS_ERR_OR_NULL(fwnode)) + desc = gpiod_find_by_fwnode(fwnode->secondary, consumer, con_id, + idx, flags, lookupflags); + + return desc; +} + struct gpio_desc *gpiod_find_and_request(struct device *consumer, struct fwnode_handle *fwnode, const char *con_id, @@ -4622,8 
+4650,8 @@ struct gpio_desc *gpiod_find_and_request(struct device *consumer, int ret = 0; scoped_guard(srcu, &gpio_devices_srcu) { - desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, - &flags, &lookupflags); + desc = gpiod_fwnode_lookup(fwnode, consumer, con_id, idx, + &flags, &lookupflags); if (gpiod_not_found(desc) && platform_lookup_allowed) { /* * Either we are not using DT or ACPI, or their lookup @@ -4795,10 +4823,10 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, int ret; if (lflags & GPIO_ACTIVE_LOW) - set_bit(FLAG_ACTIVE_LOW, &desc->flags); + set_bit(GPIOD_FLAG_ACTIVE_LOW, &desc->flags); if (lflags & GPIO_OPEN_DRAIN) - set_bit(FLAG_OPEN_DRAIN, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags); else if (dflags & GPIOD_FLAGS_BIT_OPEN_DRAIN) { /* * This enforces open drain mode from the consumer side. @@ -4806,13 +4834,13 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, * should *REALLY* have specified them as open drain in the * first place, so print a little warning here. */ - set_bit(FLAG_OPEN_DRAIN, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_DRAIN, &desc->flags); gpiod_warn(desc, "enforced open drain please flag it properly in DT/ACPI DSDT/board file\n"); } if (lflags & GPIO_OPEN_SOURCE) - set_bit(FLAG_OPEN_SOURCE, &desc->flags); + set_bit(GPIOD_FLAG_OPEN_SOURCE, &desc->flags); if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) || ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) || @@ -4823,11 +4851,11 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, } if (lflags & GPIO_PULL_UP) - set_bit(FLAG_PULL_UP, &desc->flags); + set_bit(GPIOD_FLAG_PULL_UP, &desc->flags); else if (lflags & GPIO_PULL_DOWN) - set_bit(FLAG_PULL_DOWN, &desc->flags); + set_bit(GPIOD_FLAG_PULL_DOWN, &desc->flags); else if (lflags & GPIO_PULL_DISABLE) - set_bit(FLAG_BIAS_DISABLE, &desc->flags); + set_bit(GPIOD_FLAG_BIAS_DISABLE, &desc->flags); ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY)); if (ret < 0) @@ -4932,7 +4960,7 @@ int gpiod_hog(struct gpio_desc *desc, const char *name, if (!guard.gc) return -ENODEV; - if (test_and_set_bit(FLAG_IS_HOGGED, &desc->flags)) + if (test_and_set_bit(GPIOD_FLAG_IS_HOGGED, &desc->flags)) return 0; hwnum = gpio_chip_hwgpio(desc); @@ -4940,7 +4968,7 @@ int gpiod_hog(struct gpio_desc *desc, const char *name, local_desc = gpiochip_request_own_desc(guard.gc, hwnum, name, lflags, dflags); if (IS_ERR(local_desc)) { - clear_bit(FLAG_IS_HOGGED, &desc->flags); + clear_bit(GPIOD_FLAG_IS_HOGGED, &desc->flags); ret = PTR_ERR(local_desc); pr_err("requesting hog GPIO %s (chip %s, offset %d) failed, %d\n", name, gdev->label, hwnum, ret); @@ -4963,7 +4991,7 @@ static void gpiochip_free_hogs(struct gpio_chip *gc) { struct gpio_desc *desc; - for_each_gpio_desc_with_flag(gc, desc, FLAG_IS_HOGGED) + for_each_gpio_desc_with_flag(gc, desc, GPIOD_FLAG_IS_HOGGED) gpiochip_free_own_desc(desc); } @@ -5078,8 +5106,8 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, } else { dflags = READ_ONCE(desc->flags); /* Exclude open drain or open source from fast output */ - if (test_bit(FLAG_OPEN_DRAIN, &dflags) || - test_bit(FLAG_OPEN_SOURCE, &dflags) + if (test_bit(GPIOD_FLAG_OPEN_DRAIN, &dflags) || + test_bit(GPIOD_FLAG_OPEN_SOURCE, &dflags)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require inversion */ @@ -5237,12 +5265,12 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) for_each_gpio_desc(gc, desc) { 
guard(srcu)(&desc->gdev->desc_srcu); flags = READ_ONCE(desc->flags); - is_irq = test_bit(FLAG_USED_AS_IRQ, &flags); - if (is_irq || test_bit(FLAG_REQUESTED, &flags)) { + is_irq = test_bit(GPIOD_FLAG_USED_AS_IRQ, &flags); + if (is_irq || test_bit(GPIOD_FLAG_REQUESTED, &flags)) { gpiod_get_direction(desc); - is_out = test_bit(FLAG_IS_OUT, &flags); + is_out = test_bit(GPIOD_FLAG_IS_OUT, &flags); value = gpio_chip_get_value(gc, desc); - active_low = test_bit(FLAG_ACTIVE_LOW, &flags); + active_low = test_bit(GPIOD_FLAG_ACTIVE_LOW, &flags); seq_printf(s, " gpio-%-3u (%-20.20s|%-20.20s) %s %s %s%s\n", gpio, desc->name ?: "", gpiod_get_label(desc), is_out ? "out" : "in ", diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 9b74738a9ca5b1..2a003a7311e7ac 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -186,24 +186,24 @@ struct gpio_desc { struct gpio_device *gdev; unsigned long flags; /* flag symbols are bit numbers */ -#define FLAG_REQUESTED 0 -#define FLAG_IS_OUT 1 -#define FLAG_EXPORT 2 /* protected by sysfs_lock */ -#define FLAG_SYSFS 3 /* exported via /sys/class/gpio/control */ -#define FLAG_ACTIVE_LOW 6 /* value has active low */ -#define FLAG_OPEN_DRAIN 7 /* Gpio is open drain type */ -#define FLAG_OPEN_SOURCE 8 /* Gpio is open source type */ -#define FLAG_USED_AS_IRQ 9 /* GPIO is connected to an IRQ */ -#define FLAG_IRQ_IS_ENABLED 10 /* GPIO is connected to an enabled IRQ */ -#define FLAG_IS_HOGGED 11 /* GPIO is hogged */ -#define FLAG_TRANSITORY 12 /* GPIO may lose value in sleep or reset */ -#define FLAG_PULL_UP 13 /* GPIO has pull up enabled */ -#define FLAG_PULL_DOWN 14 /* GPIO has pull down enabled */ -#define FLAG_BIAS_DISABLE 15 /* GPIO has pull disabled */ -#define FLAG_EDGE_RISING 16 /* GPIO CDEV detects rising edge events */ -#define FLAG_EDGE_FALLING 17 /* GPIO CDEV detects falling edge events */ -#define FLAG_EVENT_CLOCK_REALTIME 18 /* GPIO CDEV reports REALTIME timestamps in events */ -#define FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ +#define GPIOD_FLAG_REQUESTED 0 /* GPIO is in use */ +#define GPIOD_FLAG_IS_OUT 1 /* GPIO is in output mode */ +#define GPIOD_FLAG_EXPORT 2 /* GPIO is exported to user-space */ +#define GPIOD_FLAG_SYSFS 3 /* GPIO is exported via /sys/class/gpio */ +#define GPIOD_FLAG_ACTIVE_LOW 6 /* GPIO is active-low */ +#define GPIOD_FLAG_OPEN_DRAIN 7 /* GPIO is open drain type */ +#define GPIOD_FLAG_OPEN_SOURCE 8 /* GPIO is open source type */ +#define GPIOD_FLAG_USED_AS_IRQ 9 /* GPIO is connected to an IRQ */ +#define GPIOD_FLAG_IRQ_IS_ENABLED 10 /* GPIO is connected to an enabled IRQ */ +#define GPIOD_FLAG_IS_HOGGED 11 /* GPIO is hogged */ +#define GPIOD_FLAG_TRANSITORY 12 /* GPIO may lose value in sleep or reset */ +#define GPIOD_FLAG_PULL_UP 13 /* GPIO has pull up enabled */ +#define GPIOD_FLAG_PULL_DOWN 14 /* GPIO has pull down enabled */ +#define GPIOD_FLAG_BIAS_DISABLE 15 /* GPIO has pull disabled */ +#define GPIOD_FLAG_EDGE_RISING 16 /* GPIO CDEV detects rising edge events */ +#define GPIOD_FLAG_EDGE_FALLING 17 /* GPIO CDEV detects falling edge events */ +#define GPIOD_FLAG_EVENT_CLOCK_REALTIME 18 /* GPIO CDEV reports REALTIME timestamps in events */ +#define GPIOD_FLAG_EVENT_CLOCK_HTE 19 /* GPIO CDEV reports hardware timestamps in events */ /* Connection label */ struct gpio_desc_label __rcu *label; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index fbe7616555c83f..a2879d2b7c8ec1 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -250,16 +250,24 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc) { - if (adev->kfd.dev) - kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + kgd2kfd_stop_sched_all_nodes(adev->kfd.dev); + else + kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + } } int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc) { int r = 0; - if (adev->kfd.dev) - r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + r = kgd2kfd_start_sched_all_nodes(adev->kfd.dev); + else + r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + } return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 33eb4826b58b1a..aa88bad7416bf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -426,7 +426,9 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask); int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd); void kgd2kfd_unlock_kfd(struct kfd_dev *kfd); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd); bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id); bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, bool retry_fault); @@ -516,11 +518,21 @@ static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return 0; } +static inline int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { return 0; } +static inline int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { return false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 260165bbe3736d..b16cce7c22c373 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -213,19 +213,35 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, spin_lock(&kfd_mem_limit.mem_limit_lock); if (kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit) + kfd_mem_limit.max_system_mem_limit) { pr_debug("Set no_system_mem_limit=1 if using shared memory\n"); + if (!no_system_mem_limit) { + ret = -ENOMEM; + goto release; + } + } - if ((kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) || - (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > - kfd_mem_limit.max_ttm_mem_limit) || - (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > - vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) { + if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > + kfd_mem_limit.max_ttm_mem_limit) { ret = -ENOMEM; goto release; } + /* If is_app_apu is false and apu_prefer_gtt is true, it is an APU with + * carve out < gtt. 
In that case, VRAM allocation will go to the gtt domain, so skip + * the VRAM check since the ttm_mem_limit check already covers this allocation + */ + + if (adev && xcp_id >= 0 && (!adev->apu_prefer_gtt || adev->gmc.is_app_apu)) { + uint64_t vram_available = + vram_size - reserved_for_pt - reserved_for_ras - + atomic64_read(&adev->vram_pin_size); + if (adev->kfd.vram_used[xcp_id] + vram_needed > vram_available) { + ret = -ENOMEM; + goto release; + } + } + /* Update memory accounting by decreasing available system * memory, TTM memory and GPU memory as computed above */ @@ -1626,11 +1642,15 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, uint64_t vram_available, system_mem_available, ttm_mem_available; spin_lock(&kfd_mem_limit.mem_limit_lock); - vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) - - adev->kfd.vram_used_aligned[xcp_id] - - atomic64_read(&adev->vram_pin_size) - - reserved_for_pt - - reserved_for_ras; + if (adev->apu_prefer_gtt && !adev->gmc.is_app_apu) + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id]; + else + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id] + - atomic64_read(&adev->vram_pin_size) + - reserved_for_pt + - reserved_for_ras; if (adev->apu_prefer_gtt) { system_mem_available = no_system_mem_limit ? diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 01d234cf815647..c8459337fcb898 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5136,7 +5136,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) adev->in_suspend = true; if (amdgpu_sriov_vf(adev)) { - if (!adev->in_s0ix && !adev->in_runpm) + if (!adev->in_runpm) amdgpu_amdkfd_suspend_process(adev); amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); @@ -5156,10 +5156,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) amdgpu_device_ip_suspend_phase1(adev); - if (!adev->in_s0ix) { - amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - amdgpu_userq_suspend(adev); - } + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + amdgpu_userq_suspend(adev); r = amdgpu_device_evict_resources(adev); if (r) @@ -5254,15 +5252,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) goto exit; } - if (!adev->in_s0ix) { - r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - if (r) - goto exit; + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (r) + goto exit; - r = amdgpu_userq_resume(adev); - if (r) - goto exit; - } + r = amdgpu_userq_resume(adev); + if (r) + goto exit; r = amdgpu_device_ip_late_init(adev); if (r) @@ -5275,7 +5271,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); - if (!adev->in_s0ix && !r && !adev->in_runpm) + if (!r && !adev->in_runpm) r = amdgpu_amdkfd_resume_process(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 395c6be901ce7a..dcea66aadfa335 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2665,7 +2665,7 @@ static int amdgpu_pmops_thaw(struct device *dev) struct drm_device *drm_dev = dev_get_drvdata(dev); /* do not resume device if it's normal hibernation */ - if (!pm_hibernate_is_recovering()) 
+ if (!pm_hibernate_is_recovering() && !pm_hibernation_mode_is_suspend()) return 0; return amdgpu_device_resume(drm_dev, true); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 6379bb25bf5ce3..486c3646710cc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -421,8 +421,6 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) dma_fence_put(ring->vmid_wait); ring->vmid_wait = NULL; ring->me = 0; - - ring->adev->rings[ring->idx] = NULL; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index c85de8c8f6f50b..c37527704d4332 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1654,6 +1654,21 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } } break; + case IP_VERSION(11, 0, 1): + case IP_VERSION(11, 0, 4): + adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; + adev->gfx.cleaner_shader_size = sizeof(gfx_11_0_3_cleaner_shader_hex); + if (adev->gfx.pfp_fw_version >= 102 && + adev->gfx.mec_fw_version >= 66 && + adev->mes.fw_version[0] >= 128) { + adev->gfx.enable_cleaner_shader = true; + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); + if (r) { + adev->gfx.enable_cleaner_shader = false; + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); + } + } + break; case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; diff --git a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c index a887df52041407..4258d3e0b706c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c +++ b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c @@ -29,6 +29,8 @@ #include "amdgpu.h" #include "isp_v4_1_1.h" +MODULE_FIRMWARE("amdgpu/isp_4_1_1.bin"); + #define ISP_PERFORMANCE_STATE_LOW 0 #define ISP_PERFORMANCE_STATE_HIGH 1 diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 6cc05d36e3594d..64b240b51f1aa7 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -149,12 +149,12 @@ static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) int ret; int retry_loop; - for (retry_loop = 0; retry_loop < 10; retry_loop++) { + for (retry_loop = 0; retry_loop < 20; retry_loop++) { /* Wait for bootloader to signify that it is ready by having bit 31 of C2PMSG_35 set to 1 */ ret = psp_wait_for( psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_35), - 0x80000000, 0x80000000, PSP_WAITREG_NOVERBOSE); + 0x80000000, 0x8000FFFF, PSP_WAITREG_NOVERBOSE); if (ret == 0) return 0; @@ -397,18 +397,6 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp) msleep(500); - offset = SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_33); - - ret = psp_wait_for(psp, offset, MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, - 0); - - if (ret) { - DRM_INFO("psp mode 1 reset failed!\n"); - return -EINVAL; - } - - DRM_INFO("psp mode1 reset succeed \n"); - return 0; } @@ -665,7 +653,8 @@ static const struct psp_funcs psp_v11_0_funcs = { .ring_get_wptr = psp_v11_0_ring_get_wptr, .ring_set_wptr = psp_v11_0_ring_set_wptr, .load_usbc_pd_fw = psp_v11_0_load_usbc_pd_fw, - .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw + .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw, + .wait_for_bootloader = psp_v11_0_wait_for_bootloader }; void psp_v11_0_set_psp_funcs(struct psp_context *psp) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 4b8f4407047fc0..2811226b0ea5dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -1888,15 +1888,19 @@ static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull); + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC] [AMDGPU_RING_PRIO_DEFAULT].sched; drm_sched_entity_modify_sched(job->base.entity, scheds, 1); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 1924e075b66f41..706f3b2f484f7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1808,15 +1808,19 @@ static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull); + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC] [AMDGPU_RING_PRIO_0].sched; drm_sched_entity_modify_sched(job->base.entity, scheds, 1); @@ -1907,22 +1911,16 @@ static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job, #define RADEON_VCN_ENGINE_TYPE_ENCODE (0x00000002) #define RADEON_VCN_ENGINE_TYPE_DECODE (0x00000003) - #define RADEON_VCN_ENGINE_INFO (0x30000001) -#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET 16 - #define RENCODE_ENCODE_STANDARD_AV1 2 #define RENCODE_IB_PARAM_SESSION_INIT 0x00000003 -#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET 64 -/* return the offset in ib if id is found, -1 otherwise - * to speed up the searching we only search upto max_offset - */ -static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset) +/* return the offset in ib if id is found, -1 otherwise */ +static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int start) { int i; - for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) { + for (i = start; i < ib->length_dw && ib->ptr[i] >= 8; i += ib->ptr[i] / 4) { if (ib->ptr[i + 1] == id) return i; } @@ -1937,33 +1935,29 @@ static int vcn_v4_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p, struct amdgpu_vcn_decode_buffer *decode_buffer; uint64_t addr; uint32_t val; - int idx; + int idx = 0, sidx; /* The first instance can decode anything */ if (!ring->me) return 0; - /* RADEON_VCN_ENGINE_INFO is at the top of ib block */ - idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, - RADEON_VCN_ENGINE_INFO_MAX_OFFSET); - if (idx < 0) /* engine info is missing */ - return 0; - - val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ - if (val == 
RADEON_VCN_ENGINE_TYPE_DECODE) { - decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; - - if (!(decode_buffer->valid_buf_flag & 0x1)) - return 0; - - addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | - decode_buffer->msg_buffer_address_lo; - return vcn_v4_0_dec_msg(p, job, addr); - } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { - idx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, - RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET); - if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1) - return vcn_v4_0_limit_sched(p, job); + while ((idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, idx)) >= 0) { + val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ + if (val == RADEON_VCN_ENGINE_TYPE_DECODE) { + decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; + + if (!(decode_buffer->valid_buf_flag & 0x1)) + return 0; + + addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | + decode_buffer->msg_buffer_address_lo; + return vcn_v4_0_dec_msg(p, job, addr); + } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { + sidx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, idx); + if (sidx >= 0 && ib->ptr[sidx + 2] == RENCODE_ENCODE_STANDARD_AV1) + return vcn_v4_0_limit_sched(p, job); + } + idx += ib->ptr[idx] / 4; } return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 7e749f9b6d69da..349c351e242b59 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1550,6 +1550,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return ret; } +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.unhalt(node->dqm); + if (r) { + dev_err(kfd_device, "Error in starting scheduler\n"); + return r; + } + } + return 0; +} + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; @@ -1567,6 +1586,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) return node->dqm->ops.halt(node->dqm); } +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.halt(node->dqm); + if (r) + return r; + } + return 0; +} + bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 4ec73f33535ebf..720b20e842ba43 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1587,7 +1587,8 @@ static int kfd_dev_create_p2p_links(void) break; if (!dev->gpu || !dev->gpu->adev || (dev->gpu->kfd->hive_id && - dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id)) + dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id && + amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev))) goto next; /* check if node(s) is/are peer accessible in one direction or bi-direction */ diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7808a647a306c2..ef026143dc1ca9 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2037,6 +2037,8 
@@ static int amdgpu_dm_init(struct amdgpu_device *adev) dc_hardware_init(adev->dm.dc); + adev->dm.restore_backlight = true; + adev->dm.hpd_rx_offload_wq = hpd_rx_irq_create_workqueue(adev); if (!adev->dm.hpd_rx_offload_wq) { drm_err(adev_to_drm(adev), "failed to create hpd rx offload workqueue.\n"); @@ -2913,6 +2915,17 @@ static int dm_oem_i2c_hw_init(struct amdgpu_device *adev) return 0; } +static void dm_oem_i2c_hw_fini(struct amdgpu_device *adev) +{ + struct amdgpu_display_manager *dm = &adev->dm; + + if (dm->oem_i2c) { + i2c_del_adapter(&dm->oem_i2c->base); + kfree(dm->oem_i2c); + dm->oem_i2c = NULL; + } +} + /** * dm_hw_init() - Initialize DC device * @ip_block: Pointer to the amdgpu_ip_block for this hw instance. @@ -2963,7 +2976,7 @@ static int dm_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - kfree(adev->dm.oem_i2c); + dm_oem_i2c_hw_fini(adev); amdgpu_dm_hpd_fini(adev); @@ -3127,25 +3140,6 @@ static void dm_destroy_cached_state(struct amdgpu_device *adev) dm->cached_state = NULL; } -static void dm_complete(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - dm_destroy_cached_state(adev); -} - -static int dm_prepare_suspend(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (amdgpu_in_reset(adev)) - return 0; - - WARN_ON(adev->dm.cached_state); - - return dm_cache_state(adev); -} - static int dm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -3407,6 +3401,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D0); dc_resume(dm->dc); + adev->dm.restore_backlight = true; amdgpu_dm_irq_resume_early(adev); @@ -3571,10 +3566,8 @@ static const struct amd_ip_funcs amdgpu_dm_funcs = { .early_fini = amdgpu_dm_early_fini, .hw_init = dm_hw_init, .hw_fini = dm_hw_fini, - .prepare_suspend = dm_prepare_suspend, .suspend = dm_suspend, .resume = dm_resume, - .complete = dm_complete, .is_idle = dm_is_idle, .wait_for_idle = dm_wait_for_idle, .check_soft_reset = dm_check_soft_reset, @@ -8727,7 +8720,16 @@ static int amdgpu_dm_encoder_init(struct drm_device *dev, static void manage_dm_interrupts(struct amdgpu_device *adev, struct amdgpu_crtc *acrtc, struct dm_crtc_state *acrtc_state) -{ +{ + /* + * We cannot be sure that the frontend index maps to the same + * backend index - some even map to more than one. + * So we have to go through the CRTC to find the right IRQ. 
+ */ + int irq_type = amdgpu_display_crtc_idx_to_irq_type( + adev, + acrtc->crtc_id); + struct drm_device *dev = adev_to_drm(adev); + struct drm_vblank_crtc_config config = {0}; struct dc_crtc_timing *timing; int offdelay; @@ -8780,7 +8782,35 @@ static void manage_dm_interrupts(struct amdgpu_device *adev, drm_crtc_vblank_on_config(&acrtc->base, &config); + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_get. */ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): + if (amdgpu_irq_get(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get pageflip irq!\n"); +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_get(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get vline0 irq!\n"); +#endif + } + } else { + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_put. */ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_put(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put vline0 irq!\n"); +#endif + if (amdgpu_irq_put(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put pageflip irq!\n"); + } + drm_crtc_vblank_off(&acrtc->base); } } @@ -9802,7 +9832,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, bool mode_set_reset_required = false; u32 i; struct dc_commit_streams_params params = {dc_state->streams, dc_state->stream_count}; - bool set_backlight_level = false; /* Disable writeback */ for_each_old_connector_in_state(state, connector, old_con_state, i) { @@ -9922,7 +9951,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, acrtc->hw_mode = new_crtc_state->mode; crtc->hwmode = new_crtc_state->mode; mode_set_reset_required = true; - set_backlight_level = true; } else if (modereset_required(new_crtc_state)) { drm_dbg_atomic(dev, "Atomic commit: RESET. crtc id %d:[%p]\n", @@ -9979,13 +10007,16 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state, * to fix a flicker issue. * It will cause dm->actual_brightness to not match the current panel * brightness level (dm->brightness is the correct panel level). - * So we set the backlight level with dm->brightness value after set mode + * So we set the backlight level with the dm->brightness value after the + * initial mode set. Use the restore_backlight flag to avoid setting the + * backlight level on every subsequent mode set. */ - if (set_backlight_level) { + if (dm->restore_backlight) { for (i = 0; i < dm->num_of_edps; i++) { if (dm->backlight_dev[i]) amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]); } + dm->restore_backlight = false; } } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index b937da0a4e4a00..6aae51c1beb363 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -610,6 +610,13 @@ struct amdgpu_display_manager { */ u32 actual_brightness[AMDGPU_DM_MAX_NUM_EDP]; + /** + * @restore_backlight: + * + * Flag to indicate whether to restore backlight after modeset. 
+ */ + bool restore_backlight; + /** * @aux_hpd_discon_quirk: * diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c index ebabfe3a512f49..c0dfe2d8b3becd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c @@ -821,7 +821,7 @@ int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); const struct drm_color_lut *shaper = NULL, *lut3d = NULL; uint32_t exp_size, size, dim_size = MAX_COLOR_3DLUT_SIZE; - bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut; + bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut || adev->dm.dc->caps.color.mpc.preblend; /* shaper LUT is only available if 3D LUT color caps */ exp_size = has_3dlut ? MAX_COLOR_LUT_ENTRIES : 0; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 7187d5aedf0a50..77a9d2c7d31856 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -809,6 +809,7 @@ void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm, drm_dp_aux_init(&aconnector->dm_dp_aux.aux); drm_dp_cec_register_connector(&aconnector->dm_dp_aux.aux, &aconnector->base); + drm_dp_dpcd_set_probe(&aconnector->dm_dp_aux.aux, false); if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP) return; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c index eef51652ca3560..3d2f8eedeef23b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c @@ -1633,7 +1633,7 @@ dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, drm_object_attach_property(&plane->base, dm->adev->mode_info.plane_ctm_property, 0); - if (dpp_color_caps.hw_3d_lut) { + if (dpp_color_caps.hw_3d_lut || dm->dc->caps.color.mpc.preblend) { drm_object_attach_property(&plane->base, mode_info.plane_shaper_lut_property, 0); drm_object_attach_property(&plane->base, diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index bb1ac12a2b0955..0e638bc6bf77bf 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -587,9 +587,118 @@ bool dcn35_are_clock_states_equal(struct dc_clocks *a, return true; } -static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass, +static void dcn35_save_clk_registers_internal(struct dcn35_clk_internal *internal, struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + + // read dtbclk + internal->CLK1_CLK4_CURRENT_CNT = REG_READ(CLK1_CLK4_CURRENT_CNT); + internal->CLK1_CLK4_BYPASS_CNTL = REG_READ(CLK1_CLK4_BYPASS_CNTL); + + // read dcfclk + internal->CLK1_CLK3_CURRENT_CNT = REG_READ(CLK1_CLK3_CURRENT_CNT); + internal->CLK1_CLK3_BYPASS_CNTL = REG_READ(CLK1_CLK3_BYPASS_CNTL); + + // read dcf deep sleep divider + internal->CLK1_CLK3_DS_CNTL = REG_READ(CLK1_CLK3_DS_CNTL); + internal->CLK1_CLK3_ALLOW_DS = REG_READ(CLK1_CLK3_ALLOW_DS); + + // read dppclk + internal->CLK1_CLK1_CURRENT_CNT = REG_READ(CLK1_CLK1_CURRENT_CNT); + internal->CLK1_CLK1_BYPASS_CNTL = 
REG_READ(CLK1_CLK1_BYPASS_CNTL); + + // read dprefclk + internal->CLK1_CLK2_CURRENT_CNT = REG_READ(CLK1_CLK2_CURRENT_CNT); + internal->CLK1_CLK2_BYPASS_CNTL = REG_READ(CLK1_CLK2_BYPASS_CNTL); + + // read dispclk + internal->CLK1_CLK0_CURRENT_CNT = REG_READ(CLK1_CLK0_CURRENT_CNT); + internal->CLK1_CLK0_BYPASS_CNTL = REG_READ(CLK1_CLK0_BYPASS_CNTL); +} + +static void dcn35_save_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass, struct clk_mgr_dcn35 *clk_mgr) { + struct dcn35_clk_internal internal = {0}; + char *bypass_clks[5] = {"0x0 DFS", "0x1 REFCLK", "0x2 ERROR", "0x3 400 FCH", "0x4 600 FCH"}; + + dcn35_save_clk_registers_internal(&internal, &clk_mgr->base.base); + + regs_and_bypass->dcfclk = internal.CLK1_CLK3_CURRENT_CNT / 10; + regs_and_bypass->dcf_deep_sleep_divider = internal.CLK1_CLK3_DS_CNTL / 10; + regs_and_bypass->dcf_deep_sleep_allow = internal.CLK1_CLK3_ALLOW_DS; + regs_and_bypass->dprefclk = internal.CLK1_CLK2_CURRENT_CNT / 10; + regs_and_bypass->dispclk = internal.CLK1_CLK0_CURRENT_CNT / 10; + regs_and_bypass->dppclk = internal.CLK1_CLK1_CURRENT_CNT / 10; + regs_and_bypass->dtbclk = internal.CLK1_CLK4_CURRENT_CNT / 10; + + regs_and_bypass->dppclk_bypass = internal.CLK1_CLK1_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dppclk_bypass < 0 || regs_and_bypass->dppclk_bypass > 4) + regs_and_bypass->dppclk_bypass = 0; + regs_and_bypass->dcfclk_bypass = internal.CLK1_CLK3_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dcfclk_bypass < 0 || regs_and_bypass->dcfclk_bypass > 4) + regs_and_bypass->dcfclk_bypass = 0; + regs_and_bypass->dispclk_bypass = internal.CLK1_CLK0_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dispclk_bypass < 0 || regs_and_bypass->dispclk_bypass > 4) + regs_and_bypass->dispclk_bypass = 0; + regs_and_bypass->dprefclk_bypass = internal.CLK1_CLK2_BYPASS_CNTL & 0x0007; + if (regs_and_bypass->dprefclk_bypass < 0 || regs_and_bypass->dprefclk_bypass > 4) + regs_and_bypass->dprefclk_bypass = 0; + + if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) { + DC_LOG_SMU("clk_type,clk_value,deepsleep_cntl,deepsleep_allow,bypass\n"); + + DC_LOG_SMU("dcfclk,%d,%d,%d,%s\n", + regs_and_bypass->dcfclk, + regs_and_bypass->dcf_deep_sleep_divider, + regs_and_bypass->dcf_deep_sleep_allow, + bypass_clks[(int) regs_and_bypass->dcfclk_bypass]); + + DC_LOG_SMU("dprefclk,%d,N/A,N/A,%s\n", + regs_and_bypass->dprefclk, + bypass_clks[(int) regs_and_bypass->dprefclk_bypass]); + + DC_LOG_SMU("dispclk,%d,N/A,N/A,%s\n", + regs_and_bypass->dispclk, + bypass_clks[(int) regs_and_bypass->dispclk_bypass]); + + // REGISTER VALUES + DC_LOG_SMU("reg_name,value,clk_type\n"); + + DC_LOG_SMU("CLK1_CLK3_CURRENT_CNT,%d,dcfclk\n", + internal.CLK1_CLK3_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK4_CURRENT_CNT,%d,dtbclk\n", + internal.CLK1_CLK4_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK3_DS_CNTL,%d,dcf_deep_sleep_divider\n", + internal.CLK1_CLK3_DS_CNTL); + + DC_LOG_SMU("CLK1_CLK3_ALLOW_DS,%d,dcf_deep_sleep_allow\n", + internal.CLK1_CLK3_ALLOW_DS); + + DC_LOG_SMU("CLK1_CLK2_CURRENT_CNT,%d,dprefclk\n", + internal.CLK1_CLK2_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK0_CURRENT_CNT,%d,dispclk\n", + internal.CLK1_CLK0_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK1_CURRENT_CNT,%d,dppclk\n", + internal.CLK1_CLK1_CURRENT_CNT); + + DC_LOG_SMU("CLK1_CLK3_BYPASS_CNTL,%d,dcfclk_bypass\n", + internal.CLK1_CLK3_BYPASS_CNTL); + + DC_LOG_SMU("CLK1_CLK2_BYPASS_CNTL,%d,dprefclk_bypass\n", + internal.CLK1_CLK2_BYPASS_CNTL); + + DC_LOG_SMU("CLK1_CLK0_BYPASS_CNTL,%d,dispclk_bypass\n", + internal.CLK1_CLK0_BYPASS_CNTL); + + 
DC_LOG_SMU("CLK1_CLK1_BYPASS_CNTL,%d,dppclk_bypass", + internal.CLK1_CLK1_BYPASS_CNTL); + + } } static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) @@ -623,6 +732,7 @@ static void init_clk_states(struct clk_mgr *clk_mgr) void dcn35_init_clocks(struct clk_mgr *clk_mgr) { struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); + struct clk_mgr_dcn35 *clk_mgr_dcn35 = TO_CLK_MGR_DCN35(clk_mgr_int); init_clk_states(clk_mgr); @@ -633,6 +743,13 @@ void dcn35_init_clocks(struct clk_mgr *clk_mgr) else clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; + dcn35_save_clk_registers(&clk_mgr->boot_snapshot, clk_mgr_dcn35); + + clk_mgr->clks.ref_dtbclk_khz = clk_mgr->boot_snapshot.dtbclk * 10; + if (clk_mgr->boot_snapshot.dtbclk > 59000) { + /*dtbclk enabled based on */ + clk_mgr->clks.dtbclk_en = true; + } } static struct clk_bw_params dcn35_bw_params = { .vram_type = Ddr4MemType, @@ -1323,7 +1440,7 @@ void dcn35_clk_mgr_construct( dcn35_bw_params.wm_table = ddr5_wm_table; } /* Saved clocks configured at boot for debug purposes */ - dcn35_dump_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr); + dcn35_save_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr); clk_mgr->base.base.dprefclk_khz = dcn35_smu_get_dprefclk(&clk_mgr->base); clk_mgr->base.base.clks.ref_dtbclk_khz = 600000; diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 59c07756130d5a..8c230cf8939b53 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1145,6 +1145,7 @@ struct dc_debug_options { bool enable_hblank_borrow; bool force_subvp_df_throttle; uint32_t acpi_transition_bitmasks[MAX_PIPES]; + bool enable_pg_cntl_debug_logs; }; @@ -1347,7 +1348,6 @@ union surface_update_flags { uint32_t in_transfer_func_change:1; uint32_t input_csc_change:1; uint32_t coeff_reduction_change:1; - uint32_t output_tf_change:1; uint32_t pixel_format_change:1; uint32_t plane_size_change:1; uint32_t gamut_remap_change:1; diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c index 58c84f555c0fb8..0ce9489ac6b728 100644 --- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c +++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c @@ -133,30 +133,34 @@ enum dsc_clk_source { }; -static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool enable) +static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, allow_rcg ? 
0 : 1); break; default: BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_symclk32_se_rcg( @@ -385,35 +389,34 @@ static void dccg35_set_dtbclk_p_rcg(struct dccg *dccg, int inst, bool enable) } } -static void dccg35_set_dppclk_rcg(struct dccg *dccg, - int inst, bool enable) +static void dccg35_set_dppclk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { - struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; default: BREAK_TO_DEBUGGER(); break; } - //DC_LOG_DEBUG("%s: inst(%d) DPPCLK rcg_disable: %d\n", __func__, inst, enable ? 0 : 1); + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_dpstreamclk_rcg( @@ -1177,32 +1180,34 @@ static void dccg35_update_dpp_dto(struct dccg *dccg, int dpp_inst, } static void dccg35_set_dppclk_root_clock_gating(struct dccg *dccg, - uint32_t dpp_inst, uint32_t enable) + uint32_t dpp_inst, uint32_t disallow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && !disallow_rcg) return; switch (dpp_inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, disallow_rcg); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, disallow_rcg); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, disallow_rcg); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, disallow_rcg); break; default: break; } - //DC_LOG_DEBUG("%s: dpp_inst(%d) rcg: %d\n", __func__, dpp_inst, enable); + /* Wait for clock to ramp */ + if (disallow_rcg) + udelay(10); } static void dccg35_get_pixel_rate_div( @@ -1782,8 +1787,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) //Disable DTO switch (inst) { case 0: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK0_DTO_PARAM, DSCCLK0_DTO_PHASE, 0, @@ -1791,8 +1795,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK0_EN, 1); break; case 1: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - 
REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK1_DTO_PARAM, DSCCLK1_DTO_PHASE, 0, @@ -1800,8 +1803,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK1_EN, 1); break; case 2: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK2_DTO_PARAM, DSCCLK2_DTO_PHASE, 0, @@ -1809,8 +1811,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK2_EN, 1); break; case 3: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK3_DTO_PARAM, DSCCLK3_DTO_PHASE, 0, @@ -1821,6 +1822,9 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + udelay(10); } static void dccg35_disable_dscclk(struct dccg *dccg, @@ -1864,6 +1868,9 @@ static void dccg35_disable_dscclk(struct dccg *dccg, default: return; } + + /* Wait for clock ramp */ + udelay(10); } static void dccg35_enable_symclk_se(struct dccg *dccg, uint32_t stream_enc_inst, uint32_t link_enc_inst) @@ -2349,10 +2356,7 @@ static void dccg35_disable_symclk_se_cb( void dccg35_root_gate_disable_control(struct dccg *dccg, uint32_t pipe_idx, uint32_t disable_clock_gating) { - - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) { - dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); - } + dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); } static const struct dccg_funcs dccg35_funcs_new = { diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 3207addbd4ebb3..5e57bd1a08e73d 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -955,7 +955,7 @@ enum dc_status dcn20_enable_stream_timing( return DC_ERROR_UNEXPECTED; } - fsleep(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz)); + udelay(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz)); params.vertical_total_min = stream->adjust.v_total_min; params.vertical_total_max = stream->adjust.v_total_max; @@ -1982,10 +1982,8 @@ static void dcn20_program_pipe( * updating on slave planes */ if (pipe_ctx->update_flags.bits.enable || - pipe_ctx->update_flags.bits.plane_changed || - pipe_ctx->stream->update_flags.bits.out_tf || - (pipe_ctx->plane_state && - pipe_ctx->plane_state->update_flags.bits.output_tf_change)) + pipe_ctx->update_flags.bits.plane_changed || + pipe_ctx->stream->update_flags.bits.out_tf) hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream); /* If the pipe has been enabled or has a different opp, we diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index a267f574b61937..764eff6a4ec6b7 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -113,6 +113,14 @@ static void enable_memory_low_power(struct dc *dc) } #endif +static void print_pg_status(struct dc *dc, const 
char *debug_func, const char *debug_log) +{ + if (dc->debug.enable_pg_cntl_debug_logs && dc->res_pool->pg_cntl) { + if (dc->res_pool->pg_cntl->funcs->print_pg_status) + dc->res_pool->pg_cntl->funcs->print_pg_status(dc->res_pool->pg_cntl, debug_func, debug_log); + } +} + void dcn35_set_dmu_fgcg(struct dce_hwseq *hws, bool enable) { REG_UPDATE_3(DMU_CLK_CNTL, @@ -137,6 +145,8 @@ void dcn35_init_hw(struct dc *dc) uint32_t user_level = MAX_BACKLIGHT_LEVEL; int i; + print_pg_status(dc, __func__, ": start"); + if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks) dc->clk_mgr->funcs->init_clocks(dc->clk_mgr); @@ -200,10 +210,7 @@ void dcn35_init_hw(struct dc *dc) /* we want to turn off all dp displays before doing detection */ dc->link_srv->blank_all_dp_displays(dc); -/* - if (hws->funcs.enable_power_gating_plane) - hws->funcs.enable_power_gating_plane(dc->hwseq, true); -*/ + if (res_pool->hubbub && res_pool->hubbub->funcs->dchubbub_init) res_pool->hubbub->funcs->dchubbub_init(dc->res_pool->hubbub); /* If taking control over from VBIOS, we may want to optimize our first @@ -236,6 +243,8 @@ void dcn35_init_hw(struct dc *dc) } hws->funcs.init_pipes(dc, dc->current_state); + print_pg_status(dc, __func__, ": after init_pipes"); + if (dc->res_pool->hubbub->funcs->allow_self_refresh_control && !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter) dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub, @@ -312,6 +321,7 @@ void dcn35_init_hw(struct dc *dc) if (dc->res_pool->pg_cntl->funcs->init_pg_status) dc->res_pool->pg_cntl->funcs->init_pg_status(dc->res_pool->pg_cntl); } + print_pg_status(dc, __func__, ": after init_pg_status"); } static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable) @@ -500,97 +510,6 @@ void dcn35_physymclk_root_clock_control(struct dce_hwseq *hws, unsigned int phy_ } } -void dcn35_dsc_pg_control( - struct dce_hwseq *hws, - unsigned int dsc_inst, - bool power_on) -{ - uint32_t power_gate = power_on ? 0 : 1; - uint32_t pwr_status = power_on ? 
0 : 2; - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_dsc_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - - switch (dsc_inst) { - case 0: /* DSC0 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN16_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 1: /* DSC1 */ - REG_UPDATE(DOMAIN17_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN17_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 2: /* DSC2 */ - REG_UPDATE(DOMAIN18_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN18_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 3: /* DSC3 */ - REG_UPDATE(DOMAIN19_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN19_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - default: - BREAK_TO_DEBUGGER(); - break; - } - - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 0); -} - -void dcn35_enable_power_gating_plane(struct dce_hwseq *hws, bool enable) -{ - bool force_on = true; /* disable power gating */ - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_hubp_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - /* DCHUBP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - /* DPP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - force_on = true; /* disable power gating */ - if (enable && !hws->ctx->dc->debug.disable_dsc_power_gate) - force_on = false; - - /* DCS0/1/2/3/4 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN17_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN18_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN19_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - -} - /* In headless boot cases, DIG may be turned * on which causes HW/SW discrepancies. 
* To avoid this, power down hardware on boot @@ -1453,6 +1372,8 @@ void dcn35_prepare_bandwidth( } dcn20_prepare_bandwidth(dc, context); + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_optimize_bandwidth( @@ -1461,6 +1382,8 @@ void dcn35_optimize_bandwidth( { struct pg_block_update pg_update_state; + print_pg_status(dc, __func__, ": before rcg and power up"); + dcn20_optimize_bandwidth(dc, context); if (dc->hwss.calc_blocks_to_gate) { @@ -1472,6 +1395,8 @@ void dcn35_optimize_bandwidth( if (dc->hwss.root_clock_control) dc->hwss.root_clock_control(dc, &pg_update_state, false); } + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_set_drr(struct pipe_ctx **pipe_ctx, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c index a3ccf805bd16ae..aefb7c47374158 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c @@ -115,7 +115,6 @@ static const struct hw_sequencer_funcs dcn35_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn35_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn35_calc_blocks_to_ungate, .hw_block_power_up = dcn35_hw_block_power_up, @@ -150,7 +149,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -165,7 +163,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .resync_fifo_dccg_dio = dcn314_resync_fifo_dccg_dio, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c index 58f2be2a326b89..a580a55695c3b0 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c @@ -114,7 +114,6 @@ static const struct hw_sequencer_funcs dcn351_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn351_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn351_calc_blocks_to_ungate, .hw_block_power_up = dcn351_hw_block_power_up, @@ -145,7 +144,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = 
dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -159,7 +157,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .setup_hpo_hw_control = dcn35_setup_hpo_hw_control, .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c index cc9f40d97af2fb..61167c19359d57 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c @@ -2019,10 +2019,8 @@ void dcn401_program_pipe( * updating on slave planes */ if (pipe_ctx->update_flags.bits.enable || - pipe_ctx->update_flags.bits.plane_changed || - pipe_ctx->stream->update_flags.bits.out_tf || - (pipe_ctx->plane_state && - pipe_ctx->plane_state->update_flags.bits.output_tf_change)) + pipe_ctx->update_flags.bits.plane_changed || + pipe_ctx->stream->update_flags.bits.out_tf) hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream); /* If the pipe has been enabled or has a different opp, we diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h index 44f86cc2d1d686..227e3f8d7e5f56 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h @@ -49,6 +49,7 @@ struct pg_cntl_funcs { void (*mem_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*dio_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*init_pg_status)(struct pg_cntl *pg_cntl); + void (*print_pg_status)(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log); }; #endif //__DC_PG_CNTL_H__ diff --git a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c index af21c0a27f8657..72bd43f9bbe288 100644 --- a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c @@ -79,16 +79,12 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo uint32_t power_gate = power_on ? 0 : 1; uint32_t pwr_status = power_on ? 
0 : 2; uint32_t org_ip_request_cntl = 0; - bool block_enabled; - - /*need to enable dscclk regardless DSC_PG*/ - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc && power_on) - pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); + bool block_enabled = false; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_dsc_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_dsc_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, dsc_inst); @@ -111,7 +107,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN16_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 1: /* DSC1 */ REG_UPDATE(DOMAIN17_PG_CONFIG, @@ -119,7 +115,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN17_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 2: /* DSC2 */ REG_UPDATE(DOMAIN18_PG_CONFIG, @@ -127,7 +123,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN18_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 3: /* DSC3 */ REG_UPDATE(DOMAIN19_PG_CONFIG, @@ -135,7 +131,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN19_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -144,12 +140,6 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo if (dsc_inst < MAX_PIPES) pg_cntl->pg_pipe_res_enable[PG_DSC][dsc_inst] = power_on; - - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc && !power_on) { - /*this is to disable dscclk*/ - pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); - } } static bool pg_cntl35_hubp_dpp_pg_status(struct pg_cntl *pg_cntl, unsigned int hubp_dpp_inst) @@ -189,11 +179,12 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp uint32_t pwr_status = power_on ? 
0 : 2; uint32_t org_ip_request_cntl; bool block_enabled; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_hubp_power_gate || + pg_cntl->ctx->dc->debug.disable_dpp_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_hubp_power_gate || - pg_cntl->ctx->dc->debug.disable_dpp_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, hubp_dpp_inst); @@ -213,22 +204,22 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp case 0: /* DPP0 & HUBP0 */ REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 1: /* DPP1 & HUBP1 */ REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 2: /* DPP2 & HUBP2 */ REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 3: /* DPP3 & HUBP3 */ REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -501,6 +492,36 @@ void pg_cntl35_init_pg_status(struct pg_cntl *pg_cntl) pg_cntl->pg_res_enable[PG_DWB] = block_enabled; } +static void pg_cntl35_print_pg_status(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log) +{ + int i = 0; + bool block_enabled = false; + + DC_LOG_DEBUG("%s: %s", debug_func, debug_log); + + DC_LOG_DEBUG("PG_CNTL status:\n"); + + block_enabled = pg_cntl35_io_clk_status(pg_cntl); + DC_LOG_DEBUG("ONO0=%d (DCCG, DIO, DCIO)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_mem_status(pg_cntl); + DC_LOG_DEBUG("ONO1=%d (DCHUBBUB, DCHVM, DCHUBBUBMEM)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_plane_otg_status(pg_cntl); + DC_LOG_DEBUG("ONO2=%d (MPC, OPP, OPTC, DWB)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_hpo_pg_status(pg_cntl); + DC_LOG_DEBUG("ONO3=%d (HPO)\n", block_enabled ? 1 : 0); + + for (i = 0; i < pg_cntl->ctx->dc->res_pool->pipe_count; i++) { + block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DCHUBP%d, DPP%d)\n", 4 + i * 2, block_enabled ? 1 : 0, i, i); + + block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DSC%d)\n", 5 + i * 2, block_enabled ? 
1 : 0, i); + } +} + static const struct pg_cntl_funcs pg_cntl35_funcs = { .init_pg_status = pg_cntl35_init_pg_status, .dsc_pg_control = pg_cntl35_dsc_pg_control, @@ -511,7 +532,8 @@ static const struct pg_cntl_funcs pg_cntl35_funcs = { .mpcc_pg_control = pg_cntl35_mpcc_pg_control, .opp_pg_control = pg_cntl35_opp_pg_control, .optc_pg_control = pg_cntl35_optc_pg_control, - .dwb_pg_control = pg_cntl35_dwb_pg_control + .dwb_pg_control = pg_cntl35_dwb_pg_control, + .print_pg_status = pg_cntl35_print_pg_status }; struct pg_cntl *pg_cntl35_create( diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index b47cb4a5f4887d..408f05dfab9015 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2236,7 +2236,7 @@ static int smu_resume(struct amdgpu_ip_block *ip_block) return ret; } - if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) { + if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL && smu->od_enabled) { ret = smu_od_edit_dpm_table(smu, PP_OD_COMMIT_DPM_TABLE, NULL, 0); if (ret) return ret; diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index 19c04687b0fe1f..8e650a02c5287b 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -134,7 +134,7 @@ static int ast_astdp_read_edid_block(void *data, u8 *buf, unsigned int block, si * 3. The Delays are often longer a lot when system resume from S3/S4. */ if (j) - mdelay(j + 1); + msleep(j + 1); /* Wait for EDID offset to show up in mirror register */ vgacrd7 = ast_get_index_reg(ast, AST_IO_VGACRI, 0xd7); diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c b/drivers/gpu/drm/bridge/analogix/anx7625.c index c0ad8f59e48398..8b3304dedcd998 100644 --- a/drivers/gpu/drm/bridge/analogix/anx7625.c +++ b/drivers/gpu/drm/bridge/analogix/anx7625.c @@ -2677,7 +2677,7 @@ static int anx7625_i2c_probe(struct i2c_client *client) ret = devm_request_threaded_irq(dev, platform->pdata.intp_irq, NULL, anx7625_intr_hpd_isr, IRQF_TRIGGER_FALLING | - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "anx7625-intp", platform); if (ret) { DRM_DEV_ERROR(dev, "fail to request irq\n"); @@ -2746,8 +2746,10 @@ static int anx7625_i2c_probe(struct i2c_client *client) } /* Add work function */ - if (platform->pdata.intp_irq) + if (platform->pdata.intp_irq) { + enable_irq(platform->pdata.intp_irq); queue_work(platform->workqueue, &platform->work); + } if (platform->pdata.audio_en) anx7625_register_audio(dev, platform); diff --git a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c index a614d1384f7152..38726ae1bf1504 100644 --- a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c +++ b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c @@ -1984,8 +1984,10 @@ static void cdns_mhdp_atomic_enable(struct drm_bridge *bridge, mhdp_state = to_cdns_mhdp_bridge_state(new_state); mhdp_state->current_mode = drm_mode_duplicate(bridge->dev, mode); - if (!mhdp_state->current_mode) - return; + if (!mhdp_state->current_mode) { + ret = -EINVAL; + goto out; + } drm_mode_set_name(mhdp_state->current_mode); diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c index db9b089ef62c85..86853535fb7bd7 100644 --- a/drivers/gpu/drm/drm_gpuvm.c +++ b/drivers/gpu/drm/drm_gpuvm.c @@ -2432,8 +2432,6 @@ static const struct drm_gpuvm_ops lock_ops = { * * The expected usage is:: * - * .. 
code-block:: c - * vm_bind { struct drm_exec exec; diff --git a/drivers/gpu/drm/drm_panic_qr.rs b/drivers/gpu/drm/drm_panic_qr.rs index 50c286c5cee8be..ac27e86c601c8c 100644 --- a/drivers/gpu/drm/drm_panic_qr.rs +++ b/drivers/gpu/drm/drm_panic_qr.rs @@ -968,7 +968,7 @@ pub unsafe extern "C" fn drm_panic_qr_generate( // nul-terminated string. let url_cstr: &CStr = unsafe { CStr::from_char_ptr(url) }; let segments = &[ - &Segment::Binary(url_cstr.as_bytes()), + &Segment::Binary(url_cstr.to_bytes()), &Segment::Numeric(&data_slice[0..data_len]), ]; match EncodedMsg::new(segments, tmp_slice) { diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi.c b/drivers/gpu/drm/gma500/oaktrail_hdmi.c index 1cf39436912776..c0feca58511df3 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi.c @@ -726,8 +726,8 @@ void oaktrail_hdmi_teardown(struct drm_device *dev) if (hdmi_dev) { pdev = hdmi_dev->dev; - pci_set_drvdata(pdev, NULL); oaktrail_hdmi_i2c_exit(pdev); + pci_set_drvdata(pdev, NULL); iounmap(hdmi_dev->regs); kfree(hdmi_dev); pci_dev_put(pdev); diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 0405396c7750ea..9ecbb4b99c3786 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -596,8 +596,9 @@ intel_ddi_transcoder_func_reg_val_get(struct intel_encoder *encoder, enum transcoder master; master = crtc_state->mst_master_transcoder; - drm_WARN_ON(display->drm, - master == INVALID_TRANSCODER); + if (drm_WARN_ON(display->drm, + master == INVALID_TRANSCODER)) + master = TRANSCODER_A; temp |= TRANS_DDI_MST_TRANSPORT_SELECT(master); } } else { diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 273054c2232524..c92f3e73622886 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -1172,7 +1172,7 @@ static void icl_mbus_init(struct intel_display *display) if (DISPLAY_VER(display) == 12) abox_regs |= BIT(0); - for_each_set_bit(i, &abox_regs, sizeof(abox_regs)) + for_each_set_bit(i, &abox_regs, BITS_PER_TYPE(abox_regs)) intel_de_rmw(display, MBUS_ABOX_CTL(i), mask, val); } @@ -1629,11 +1629,11 @@ static void tgl_bw_buddy_init(struct intel_display *display) if (table[config].page_mask == 0) { drm_dbg_kms(display->drm, "Unknown memory configuration; disabling address buddy logic.\n"); - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) intel_de_write(display, BW_BUDDY_CTL(i), BW_BUDDY_DISABLE); } else { - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) { + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) { intel_de_write(display, BW_BUDDY_PAGE_MASK(i), table[config].page_mask); diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c index 41228478b21c78..0a3a3f6a5f9d89 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c @@ -546,7 +546,7 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector, luminance_range->max_luminance, panel->vbt.backlight.pwm_freq_hz, intel_dp->edp_dpcd, &current_level, &current_mode, - false); + panel->backlight.edp.vesa.luminance_control_support); if (ret < 0) return ret; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index
e3d188455f6754..b9dae15c1d1667 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -514,6 +514,13 @@ static int __create_shmem(struct drm_i915_private *i915, if (IS_ERR(filp)) return PTR_ERR(filp); + /* + * Prevent -EFBIG by allowing large writes beyond MAX_NON_LFS on shmem + * objects by setting O_LARGEFILE. + */ + if (force_o_largefile()) + filp->f_flags |= O_LARGEFILE; + obj->filp = filp; return 0; } diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index f8a817689e1626..eb5537f0ac90d8 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -387,19 +387,21 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) of_id = of_match_node(mtk_drm_of_ids, node); if (!of_id) - goto next_put_node; + continue; pdev = of_find_device_by_node(node); if (!pdev) - goto next_put_node; + continue; drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match); + put_device(&pdev->dev); if (!drm_dev) - goto next_put_device_pdev_dev; + continue; temp_drm_priv = dev_get_drvdata(drm_dev); + put_device(drm_dev); if (!temp_drm_priv) - goto next_put_device_drm_dev; + continue; if (temp_drm_priv->data->main_len) all_drm_priv[CRTC_MAIN] = temp_drm_priv; @@ -411,17 +413,10 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) if (temp_drm_priv->mtk_drm_bound) cnt++; -next_put_device_drm_dev: - put_device(drm_dev); - -next_put_device_pdev_dev: - put_device(&pdev->dev); - -next_put_node: - of_node_put(node); - - if (cnt == MAX_CRTC) + if (cnt == MAX_CRTC) { + of_node_put(node); break; + } } if (drm_priv->data->mmsys_dev_num == cnt) { diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 9f345a0087175c..869d4335c0f45c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence) return ret; } -void -nouveau_fence_cancel(struct nouveau_fence *fence) -{ - struct nouveau_fence_chan *fctx = nouveau_fctx(fence); - unsigned long flags; - - spin_lock_irqsave(&fctx->lock, flags); - if (!dma_fence_is_signaled_locked(&fence->base)) { - dma_fence_set_error(&fence->base, -ECANCELED); - if (nouveau_fence_signal(fence)) - nvif_event_block(&fctx->event); - } - spin_unlock_irqrestore(&fctx->lock, flags); -} - bool nouveau_fence_done(struct nouveau_fence *fence) { diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h index 9957a919bd38e7..183dd43ecfff4a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.h +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h @@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **); int nouveau_fence_emit(struct nouveau_fence *); bool nouveau_fence_done(struct nouveau_fence *); -void nouveau_fence_cancel(struct nouveau_fence *fence); int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr); int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c index 0cc0bc9f9952b1..e60f7892f5ce9a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.c +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c @@ -11,7 +11,6 @@ #include "nouveau_exec.h" #include "nouveau_abi16.h" #include "nouveau_sched.h" -#include "nouveau_chan.h" #define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000 @@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job) { struct 
nouveau_sched *sched = job->sched; - spin_lock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); list_del(&job->entry); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); + + wake_up(&sched->job.wq); } void @@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job) } /* Submit was successful; add the job to the schedulers job list. */ - spin_lock(&sched->job_list.lock); - list_add(&job->entry, &sched->job_list.head); - spin_unlock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); + list_add(&job->entry, &sched->job.list.head); + spin_unlock(&sched->job.list.lock); drm_sched_job_arm(&job->base); job->done_fence = dma_fence_get(&job->base.s_fence->finished); @@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job) nouveau_job_fini(job); } -static void -nouveau_sched_cancel_job(struct drm_sched_job *sched_job) -{ - struct nouveau_fence *fence; - struct nouveau_job *job; - - job = to_nouveau_job(sched_job); - fence = to_nouveau_fence(job->done_fence); - - nouveau_fence_cancel(fence); -} - static const struct drm_sched_backend_ops nouveau_sched_ops = { .run_job = nouveau_sched_run_job, .timedout_job = nouveau_sched_timedout_job, .free_job = nouveau_sched_free_job, - .cancel_job = nouveau_sched_cancel_job, }; static int @@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm, goto fail_sched; mutex_init(&sched->mutex); - spin_lock_init(&sched->job_list.lock); - INIT_LIST_HEAD(&sched->job_list.head); + spin_lock_init(&sched->job.list.lock); + INIT_LIST_HEAD(&sched->job.list.head); + init_waitqueue_head(&sched->job.wq); return 0; @@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, return 0; } + static void nouveau_sched_fini(struct nouveau_sched *sched) { struct drm_gpu_scheduler *drm_sched = &sched->base; struct drm_sched_entity *entity = &sched->entity; + rmb(); /* for list_empty to work without lock */ + wait_event(sched->job.wq, list_empty(&sched->job.list.head)); + drm_sched_entity_fini(entity); drm_sched_fini(drm_sched); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h index b98c3f0bef3029..20cd1da8db73c3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.h +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h @@ -103,9 +103,12 @@ struct nouveau_sched { struct mutex mutex; struct { - struct list_head head; - spinlock_t lock; - } job_list; + struct { + struct list_head head; + spinlock_t lock; + } list; + struct wait_queue_head wq; + } job; }; int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index ddfc46bc1b3e26..48f105239f42d8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) u64 end = addr + range; again: - spin_lock(&sched->job_list.lock); - list_for_each_entry(__job, &sched->job_list.head, entry) { + spin_lock(&sched->job.list.lock); + list_for_each_entry(__job, &sched->job.list.head, entry) { struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job); list_for_each_op(op, &bind_job->ops) { @@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) if (!(end <= op_addr || addr >= op_end)) { nouveau_uvmm_bind_job_get(bind_job); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); 
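+
+			/*
+			 * The job_list -> job.list rename above comes with a
+			 * new waitqueue: completions unlink the job and then
+			 * kick sched->job.wq, so nouveau_sched_fini() can wait
+			 * for the list to drain instead of cancelling fences.
+			 * A minimal sketch of the pairing, condensed from the
+			 * nouveau_sched.c hunks above::
+			 *
+			 *	// completion side (nouveau_job_done)
+			 *	spin_lock(&sched->job.list.lock);
+			 *	list_del(&job->entry);
+			 *	spin_unlock(&sched->job.list.lock);
+			 *	wake_up(&sched->job.wq);
+			 *
+			 *	// teardown side (nouveau_sched_fini)
+			 *	wait_event(sched->job.wq,
+			 *		   list_empty(&sched->job.list.head));
+			 */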
wait_for_completion(&bind_job->complete); nouveau_uvmm_bind_job_put(bind_job); goto again; @@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) } } } - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); } static int diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index 1116f2d2826eeb..4d8e9b34702a76 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -1094,7 +1094,7 @@ static int panthor_ioctl_group_create(struct drm_device *ddev, void *data, struct drm_panthor_queue_create *queue_args; int ret; - if (!args->queues.count) + if (!args->queues.count || args->queues.count > MAX_CS_PER_CSG) return -EINVAL; ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues); diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 8f17394cc82aad..df76653e649a30 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -886,8 +886,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * if (IS_ERR_OR_NULL(queue)) return; - if (queue->entity.fence_context) - drm_sched_entity_destroy(&queue->entity); + drm_sched_entity_destroy(&queue->entity); if (queue->scheduler.ops) drm_sched_fini(&queue->scheduler); @@ -3558,11 +3557,6 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle) if (!group) return -EINVAL; - for (u32 i = 0; i < group->queue_count; i++) { - if (group->queues[i]) - drm_sched_entity_destroy(&group->queues[i]->entity); - } - mutex_lock(&sched->reset.lock); mutex_lock(&sched->lock); group->destroyed = true; diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h index 81eb046aeebfef..b9f67d7a00d879 100644 --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h @@ -117,6 +117,7 @@ enum xe_guc_action { XE_GUC_ACTION_ENTER_S_STATE = 0x501, XE_GUC_ACTION_EXIT_S_STATE = 0x502, XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE = 0x506, + XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV = 0x509, XE_GUC_ACTION_SCHED_CONTEXT = 0x1000, XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET = 0x1001, XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002, diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h index 0366a9da597751..d7719d0e36ca78 100644 --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h @@ -17,6 +17,7 @@ * | 0 | 31:16 | **KEY** - KLV key identifier | * | | | - `GuC Self Config KLVs`_ | * | | | - `GuC Opt In Feature KLVs`_ | + * | | | - `GuC Scheduling Policies KLVs`_ | * | | | - `GuC VGT Policy KLVs`_ | * | | | - `GuC VF Configuration KLVs`_ | * | | | | @@ -152,6 +153,30 @@ enum { #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_KEY 0x4003 #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u +/** + * DOC: GuC Scheduling Policies KLVs + * + * `GuC KLV`_ keys available for use with UPDATE_SCHEDULING_POLICIES_KLV. + * + * _`GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD` : 0x1001 + * Some platforms do not allow concurrent execution of RCS and CCS + * workloads from different address spaces. By default, the GuC prioritizes + * RCS submissions over CCS ones, which can lead to CCS workloads being + * significantly (or completely) starved of execution time. 
This KLV allows + * the driver to specify a quantum (in ms) and a ratio (percentage value + * between 0 and 100), and the GuC will prioritize the CCS for that + * percentage of each quantum. For example, specifying 100ms and 30% will + * make the GuC prioritize the CCS for 30ms of every 100ms. + * Note that this does not necessarily mean that RCS and CCS engines will + * only be active for their percentage of the quantum, as the restriction + * only kicks in if both classes are fully busy with non-compatible address + * spaces; i.e., if one engine is idle or running the same address space, + * a pending job on the other engine will still be submitted to the HW no + * matter what the ratio is. + */ +#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_KEY 0x1001 +#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_LEN 2u + /** * DOC: GuC VGT Policy KLVs * diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c index bb469096d072b5..7b40cc8be1c9c2 100644 --- a/drivers/gpu/drm/xe/tests/xe_bo.c +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc } xe_bo_lock(external, false); - err = xe_bo_pin_external(external); + err = xe_bo_pin_external(external, false); xe_bo_unlock(external); if (err) { KUNIT_FAIL(test, "external bo pin err=%pe\n", diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c index c53f67ce4b0aa2..121f17c112ec6a 100644 --- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c @@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported, return; } - /* - * If on different devices, the exporter is kept in system if - * possible, saving a migration step as the transfer is just - * likely as fast from system memory. - */ - if (params->mem_mask & XE_BO_FLAG_SYSTEM) - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT)); - else - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); if (params->force_different_devices) KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT)); diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 9954bb458ce12d..bae7ff2e59276c 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo, bo->placements[*c] = (struct ttm_place) { .mem_type = XE_PL_TT, + .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ? + TTM_PL_FLAG_FALLBACK : 0, }; *c += 1; } @@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) /** * xe_bo_pin_external - pin an external BO + * @bo: buffer object to be pinned + * @in_place: Pin in current placement, don't attempt to migrate. * * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD) * BO. Unique call compared to xe_bo_pin as this function has its own set of @@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) * * Returns 0 for success, negative error code otherwise.
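 *
 * A minimal usage sketch (modelled on the callers in this patch: the kunit
 * test validates as part of the pin, while the dma-buf pin path migrates
 * first and then pins in place with in_place = true)::
 *
 *	xe_bo_lock(bo, false);
 *	err = xe_bo_pin_external(bo, false);
 *	xe_bo_unlock(bo);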
*/ -int xe_bo_pin_external(struct xe_bo *bo) +int xe_bo_pin_external(struct xe_bo *bo, bool in_place) { struct xe_device *xe = xe_bo_device(bo); int err; @@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo) xe_assert(xe, xe_bo_is_user(bo)); if (!xe_bo_is_pinned(bo)) { - err = xe_bo_validate(bo, NULL, false); - if (err) - return err; + if (!in_place) { + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + } spin_lock(&xe->pinned.lock); list_add_tail(&bo->pinned_link, &xe->pinned.late.external); @@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict) }; int ret; + if (xe_bo_is_pinned(bo)) + return 0; + if (vm) { lockdep_assert_held(&vm->lock); xe_vm_assert_held(vm); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index 02e8cde4c6b201..9ce94d25201562 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo) } } -int xe_bo_pin_external(struct xe_bo *bo); +int xe_bo_pin_external(struct xe_bo *bo, bool in_place); int xe_bo_pin(struct xe_bo *bo); void xe_bo_unpin_external(struct xe_bo *bo); void xe_bo_unpin(struct xe_bo *bo); diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c index 7484ce55a303d6..d5dbc51e8612d8 100644 --- a/drivers/gpu/drm/xe/xe_bo_evict.c +++ b/drivers/gpu/drm/xe/xe_bo_evict.c @@ -158,8 +158,8 @@ int xe_bo_evict_all(struct xe_device *xe) if (ret) return ret; - ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present, - &xe->pinned.late.evicted, xe_bo_evict_pinned); + ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.external, + &xe->pinned.late.external, xe_bo_evict_pinned); if (!ret) ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present, diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c index e9b46a2d00195a..58c1f397c68c94 100644 --- a/drivers/gpu/drm/xe/xe_configfs.c +++ b/drivers/gpu/drm/xe/xe_configfs.c @@ -404,7 +404,7 @@ int __init xe_configfs_init(void) return 0; } -void __exit xe_configfs_exit(void) +void xe_configfs_exit(void) { configfs_unregister_subsystem(&xe_configfs); } diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index bd9015761aa0ed..927ee7991696ba 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -308,15 +308,19 @@ int xe_device_sysfs_init(struct xe_device *xe) return ret; } - if (xe->info.platform == XE_BATTLEMAGE) { + if (xe->info.platform == XE_BATTLEMAGE && !IS_SRIOV_VF(xe)) { ret = sysfs_create_files(&dev->kobj, auto_link_downgrade_attrs); if (ret) - return ret; + goto cleanup; ret = late_bind_create_files(dev); if (ret) - return ret; + goto cleanup; } return devm_add_action_or_reset(dev, xe_device_sysfs_fini, xe); + +cleanup: + xe_device_sysfs_fini(xe); + return ret; } diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index d4d2c6854790ca..7ceb0c90f3914c 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -553,6 +553,12 @@ struct xe_device { /** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */ struct notifier_block pm_notifier; + /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */ + struct completion pm_block; + /** @rebind_resume_list: List of wq items to kick on resume. 
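+ *
+ * A sketch of the intended life cycle of pm_block and this list (the
+ * suspend-side producer is not part of this excerpt, so the exact call
+ * sites here are assumptions)::
+ *
+ *	reinit_completion(&xe->pm_block);	// suspend prepare: block validating tasks
+ *	...
+ *	complete_all(&xe->pm_block);		// resume: unblock waiters,
+ *	// then requeue each worker parked on rebind_resume_list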
*/ + struct list_head rebind_resume_list; + /** @rebind_resume_lock: Lock to protect the rebind_resume_list */ + struct mutex rebind_resume_lock; /** @pmt: Support the PMT driver callback interface */ struct { diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 346f857f38374f..af64baf872ef7b 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach) return ret; } - ret = xe_bo_pin_external(bo); + ret = xe_bo_pin_external(bo, true); xe_assert(xe, !ret); return 0; diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 44364c042ad72d..374c831e691b2b 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -237,6 +237,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) goto err_unlock_list; } + /* + * It's OK to block interruptible here with the vm lock held, since + * on task freezing during suspend / hibernate, the call will + * return -ERESTARTSYS and the IOCTL will be rerun. + */ + err = wait_for_completion_interruptible(&xe->pm_block); + if (err) + goto err_unlock_list; + vm_exec.vm = &vm->gpuvm; vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; if (xe_vm_in_lr_mode(vm)) { diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 8991b4aed44071..c07edcda99c5ca 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -151,6 +151,16 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) return err; } +static void __xe_exec_queue_fini(struct xe_exec_queue *q) +{ + int i; + + q->ops->fini(q); + + for (i = 0; i < q->width; ++i) + xe_lrc_put(q->lrc[i]); +} + struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, u32 logical_mask, u16 width, struct xe_hw_engine *hwe, u32 flags, @@ -181,11 +191,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v if (xe_exec_queue_uses_pxp(q)) { err = xe_pxp_exec_queue_add(xe->pxp, q); if (err) - goto err_post_alloc; + goto err_post_init; } return q; +err_post_init: + __xe_exec_queue_fini(q); err_post_alloc: __xe_exec_queue_free(q); return ERR_PTR(err); @@ -283,13 +295,11 @@ void xe_exec_queue_destroy(struct kref *ref) xe_exec_queue_put(eq); } - q->ops->fini(q); + q->ops->destroy(q); } void xe_exec_queue_fini(struct xe_exec_queue *q) { - int i; - /* * Before releasing our ref to lrc and xef, accumulate our run ticks * and wakeup any waiters. @@ -298,9 +308,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q) if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal)) wake_up_var(&q->xef->exec_queue.pending_removal); - for (i = 0; i < q->width; ++i) - xe_lrc_put(q->lrc[i]); - + __xe_exec_queue_fini(q); __xe_exec_queue_free(q); } diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index cc1cffb5c87f1d..1c9d03f2a3e5da 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -166,8 +166,14 @@ struct xe_exec_queue_ops { int (*init)(struct xe_exec_queue *q); /** @kill: Kill inflight submissions for backend */ void (*kill)(struct xe_exec_queue *q); - /** @fini: Fini exec queue for submission backend */ + /** @fini: Undoes the init() for submission backend */ void (*fini)(struct xe_exec_queue *q); + /** + * @destroy: Destroy exec queue for submission backend. 
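+ * The resulting call chain, sketched here for the execlist backend from
+ * the hunks below (the GuC backend is analogous)::
+ *
+ *	xe_exec_queue_destroy()
+ *	    -> q->ops->destroy(q)	// queues destroy_async work
+ *	        -> execlist_exec_queue_destroy_async()
+ *	            -> xe_exec_queue_fini(q)
+ *	                -> __xe_exec_queue_fini(q)	// calls q->ops->fini(q)
+ *	                -> __xe_exec_queue_free(q)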
The backend + * function must call xe_exec_queue_fini() (which will in turn call the + * fini() backend function) to ensure the queue is properly cleaned up. + */ + void (*destroy)(struct xe_exec_queue *q); /** @set_priority: Set priority for exec queue */ int (*set_priority)(struct xe_exec_queue *q, enum xe_exec_queue_priority priority); diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c index 788f56b066b6ad..f83d421ac9d3d2 100644 --- a/drivers/gpu/drm/xe/xe_execlist.c +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q) return err; } -static void execlist_exec_queue_fini_async(struct work_struct *w) +static void execlist_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_execlist_exec_queue *exl = q->execlist; + + drm_sched_entity_fini(&exl->entity); + drm_sched_fini(&exl->sched); + + kfree(exl); +} + +static void execlist_exec_queue_destroy_async(struct work_struct *w) { struct xe_execlist_exec_queue *ee = - container_of(w, struct xe_execlist_exec_queue, fini_async); + container_of(w, struct xe_execlist_exec_queue, destroy_async); struct xe_exec_queue *q = ee->q; struct xe_execlist_exec_queue *exl = q->execlist; struct xe_device *xe = gt_to_xe(q->gt); @@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w) list_del(&exl->active_link); spin_unlock_irqrestore(&exl->port->lock, flags); - drm_sched_entity_fini(&exl->entity); - drm_sched_fini(&exl->sched); - kfree(exl); - xe_exec_queue_fini(q); } @@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q) /* NIY */ } -static void execlist_exec_queue_fini(struct xe_exec_queue *q) +static void execlist_exec_queue_destroy(struct xe_exec_queue *q) { - INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); - queue_work(system_unbound_wq, &q->execlist->fini_async); + INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async); + queue_work(system_unbound_wq, &q->execlist->destroy_async); } static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, @@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = { .init = execlist_exec_queue_init, .kill = execlist_exec_queue_kill, .fini = execlist_exec_queue_fini, + .destroy = execlist_exec_queue_destroy, .set_priority = execlist_exec_queue_set_priority, .set_timeslice = execlist_exec_queue_set_timeslice, .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout, diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h index 415140936f11da..92c4ba52db0cb1 100644 --- a/drivers/gpu/drm/xe/xe_execlist_types.h +++ b/drivers/gpu/drm/xe/xe_execlist_types.h @@ -42,7 +42,7 @@ struct xe_execlist_exec_queue { bool has_run; - struct work_struct fini_async; + struct work_struct destroy_async; enum xe_exec_queue_priority active_priority; struct list_head active_link; diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index c8eda36546d343..17634195cdc26a 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -41,6 +41,7 @@ #include "xe_gt_topology.h" #include "xe_guc_exec_queue_types.h" #include "xe_guc_pc.h" +#include "xe_guc_submit.h" #include "xe_hw_fence.h" #include "xe_hw_engine_class_sysfs.h" #include "xe_irq.h" @@ -97,7 +98,7 @@ void xe_gt_sanitize(struct xe_gt *gt) * FIXME: if xe_uc_sanitize is called here, on TGL driver will not * reload */ - gt->uc.guc.submission_state.enabled = false; + 
xe_guc_submit_disable(&gt->uc.guc); } static void xe_gt_enable_host_l2_vram(struct xe_gt *gt) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 494909f74eb22c..d84831a03610db 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1632,7 +1632,6 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs) u64 fair; fair = div_u64(available, num_vfs); - fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */ fair = ALIGN_DOWN(fair, alignment); #ifdef MAX_FAIR_LMEM fair = min_t(u64, MAX_FAIR_LMEM, fair); diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index b1d1d6da37581e..270fc379249366 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -880,9 +880,7 @@ int xe_guc_post_load_init(struct xe_guc *guc) return ret; } - guc->submission_state.enabled = true; - - return 0; + return xe_guc_submit_enable(guc); } int xe_guc_reset(struct xe_guc *guc) @@ -1579,7 +1577,7 @@ void xe_guc_sanitize(struct xe_guc *guc) { xe_uc_fw_sanitize(&guc->fw); xe_guc_ct_disable(&guc->ct); - guc->submission_state.enabled = false; + xe_guc_submit_disable(guc); } int xe_guc_reset_prepare(struct xe_guc *guc) diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h index a3f421e2adc03b..c30c0e3ccbbb93 100644 --- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h @@ -35,8 +35,8 @@ struct xe_guc_exec_queue { struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; /** @lr_tdr: long running TDR worker */ struct work_struct lr_tdr; - /** @fini_async: do final fini async from this worker */ - struct work_struct fini_async; + /** @destroy_async: do final destroy async from this worker */ + struct work_struct destroy_async; /** @resume_time: time of last resume */ u64 resume_time; /** @state: GuC specific state for this xe_exec_queue */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index cafb47711e9b3f..0104afbc941c84 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -32,6 +32,7 @@ #include "xe_guc_ct.h" #include "xe_guc_exec_queue_types.h" #include "xe_guc_id_mgr.h" +#include "xe_guc_klv_helpers.h" #include "xe_guc_submit_types.h" #include "xe_hw_engine.h" #include "xe_hw_fence.h" @@ -316,6 +317,71 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids) return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc); } +/* + * Given that we want to guarantee enough RCS throughput to avoid missing + * frames, we set the yield policy to 20% of each 80ms interval.
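The fini/destroy split documented in the ops table above implies a fixed ownership chain: the backend destroy() op is the entry point on final put, it may defer to a worker, and it must end by calling the common xe_exec_queue_fini(), which is the only caller of the backend fini() op. A minimal, self-contained C sketch of that contract, using hypothetical stand-in types rather than the actual xe structures:

    #include <stdio.h>

    struct queue;

    struct queue_ops {
            void (*fini)(struct queue *q);    /* undoes init(); called only via common_fini() */
            void (*destroy)(struct queue *q); /* entry point on final put */
    };

    struct queue {
            const struct queue_ops *ops;
    };

    /* Common-layer teardown: the only caller of the backend fini() op. */
    static void common_fini(struct queue *q)
    {
            q->ops->fini(q);
            /* ...common cleanup (drop LRC refs, free q) would follow here... */
    }

    static void backend_fini(struct queue *q)
    {
            printf("backend fini: scheduler and entity torn down\n");
    }

    /* destroy() may defer to a worker, but must end in common_fini(). */
    static void backend_destroy(struct queue *q)
    {
            printf("backend destroy: cancel pending work first\n");
            common_fini(q);
    }

    static const struct queue_ops ops = {
            .fini = backend_fini,
            .destroy = backend_destroy,
    };

    int main(void)
    {
            struct queue q = { .ops = &ops };

            q.ops->destroy(&q); /* final reference dropped */
            return 0;
    }

The same ordering is what lets the GuC backend push teardown onto a worker via destroy_async while keeping fini() a pure inverse of init().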
+ */ +#define RC_YIELD_DURATION 80 /* in ms */ +#define RC_YIELD_RATIO 20 /* in percent */ +static u32 *emit_render_compute_yield_klv(u32 *emit) +{ + *emit++ = PREP_GUC_KLV_TAG(SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD); + *emit++ = RC_YIELD_DURATION; + *emit++ = RC_YIELD_RATIO; + + return emit; +} + +#define SCHEDULING_POLICY_MAX_DWORDS 16 +static int guc_init_global_schedule_policy(struct xe_guc *guc) +{ + u32 data[SCHEDULING_POLICY_MAX_DWORDS]; + u32 *emit = data; + u32 count = 0; + int ret; + + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 1, 0)) + return 0; + + *emit++ = XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV; + + if (CCS_MASK(guc_to_gt(guc))) + emit = emit_render_compute_yield_klv(emit); + + count = emit - data; + if (count > 1) { + xe_assert(guc_to_xe(guc), count <= SCHEDULING_POLICY_MAX_DWORDS); + + ret = xe_guc_ct_send_block(&guc->ct, data, count); + if (ret < 0) { + xe_gt_err(guc_to_gt(guc), + "failed to enable GuC scheduling policies: %pe\n", + ERR_PTR(ret)); + return ret; + } + } + + return 0; +} + +int xe_guc_submit_enable(struct xe_guc *guc) +{ + int ret; + + ret = guc_init_global_schedule_policy(guc); + if (ret) + return ret; + + guc->submission_state.enabled = true; + + return 0; +} + +void xe_guc_submit_disable(struct xe_guc *guc) +{ + guc->submission_state.enabled = false; +} + static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa_count) { int i; @@ -1277,48 +1343,57 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) return DRM_GPU_SCHED_STAT_NO_HANG; } -static void __guc_exec_queue_fini_async(struct work_struct *w) +static void guc_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_guc_exec_queue *ge = q->guc; + struct xe_guc *guc = exec_queue_to_guc(q); + + release_guc_id(guc, q); + xe_sched_entity_fini(&ge->entity); + xe_sched_fini(&ge->sched); + + /* + * RCU free due sched being exported via DRM scheduler fences + * (timeline name). + */ + kfree_rcu(ge, rcu); +} + +static void __guc_exec_queue_destroy_async(struct work_struct *w) { struct xe_guc_exec_queue *ge = - container_of(w, struct xe_guc_exec_queue, fini_async); + container_of(w, struct xe_guc_exec_queue, destroy_async); struct xe_exec_queue *q = ge->q; struct xe_guc *guc = exec_queue_to_guc(q); xe_pm_runtime_get(guc_to_xe(guc)); trace_xe_exec_queue_destroy(q); - release_guc_id(guc, q); if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&ge->sched.base.work_tdr); - xe_sched_entity_fini(&ge->entity); - xe_sched_fini(&ge->sched); - /* - * RCU free due sched being exported via DRM scheduler fences - * (timeline name).
- */ - kfree_rcu(ge, rcu); xe_exec_queue_fini(q); + xe_pm_runtime_put(guc_to_xe(guc)); } -static void guc_exec_queue_fini_async(struct xe_exec_queue *q) +static void guc_exec_queue_destroy_async(struct xe_exec_queue *q) { struct xe_guc *guc = exec_queue_to_guc(q); struct xe_device *xe = guc_to_xe(guc); - INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); + INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async); /* We must block on kernel engines so slabs are empty on driver unload */ if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q)) - __guc_exec_queue_fini_async(&q->guc->fini_async); + __guc_exec_queue_destroy_async(&q->guc->destroy_async); else - queue_work(xe->destroy_wq, &q->guc->fini_async); + queue_work(xe->destroy_wq, &q->guc->destroy_async); } -static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) +static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q) { /* * Might be done from within the GPU scheduler, need to do async as we @@ -1327,7 +1402,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) * this we and don't really care when everything is fini'd, just that it * is. */ - guc_exec_queue_fini_async(q); + guc_exec_queue_destroy_async(q); } static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) @@ -1341,7 +1416,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) if (exec_queue_registered(q)) disable_scheduling_deregister(guc, q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) @@ -1574,14 +1649,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q, #define STATIC_MSG_CLEANUP 0 #define STATIC_MSG_SUSPEND 1 #define STATIC_MSG_RESUME 2 -static void guc_exec_queue_fini(struct xe_exec_queue *q) +static void guc_exec_queue_destroy(struct xe_exec_queue *q) { struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q)) guc_exec_queue_add_msg(q, msg, CLEANUP); else - __guc_exec_queue_fini(exec_queue_to_guc(q), q); + __guc_exec_queue_destroy(exec_queue_to_guc(q), q); } static int guc_exec_queue_set_priority(struct xe_exec_queue *q, @@ -1711,6 +1786,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = { .init = guc_exec_queue_init, .kill = guc_exec_queue_kill, .fini = guc_exec_queue_fini, + .destroy = guc_exec_queue_destroy, .set_priority = guc_exec_queue_set_priority, .set_timeslice = guc_exec_queue_set_timeslice, .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, @@ -1732,7 +1808,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else if (exec_queue_destroyed(q)) - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } if (q->guc->suspend_pending) { set_exec_queue_suspended(q); @@ -1989,7 +2065,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h index 9b71a986c6ca69..0d126b807c1041 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.h +++ 
b/drivers/gpu/drm/xe/xe_guc_submit.h @@ -13,6 +13,8 @@ struct xe_exec_queue; struct xe_guc; int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids); +int xe_guc_submit_enable(struct xe_guc *guc); +void xe_guc_submit_disable(struct xe_guc *guc); int xe_guc_submit_reset_prepare(struct xe_guc *guc); void xe_guc_submit_reset_wait(struct xe_guc *guc); diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index c17ed1ae86493c..c5b63e10bb9113 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -286,7 +286,7 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg */ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *value) { - u64 reg_val = 0, min, max; + u32 reg_val = 0; struct xe_device *xe = hwmon->xe; struct xe_reg rapl_limit, pkg_power_sku; struct xe_mmio *mmio = xe_root_tile_mmio(xe); @@ -294,7 +294,7 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, (u32 *)&reg_val); + xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, &reg_val); } else { rapl_limit = xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel); pkg_power_sku = xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU, channel); @@ -304,19 +304,21 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe /* Check if PL limits are disabled. */ if (!(reg_val & PWR_LIM_EN)) { *value = PL_DISABLE; - drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%016llx\n", + drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%08x\n", PWR_ATTR_TO_STR(attr), channel, reg_val); goto unlock; } reg_val = REG_FIELD_GET(PWR_LIM_VAL, reg_val); - *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power); + *value = mul_u32_u32(reg_val, SF_POWER) >> hwmon->scl_shift_power; /* For platforms with mailbox power limit support clamping would be done by pcode. */ if (!hwmon->xe->info.has_mbx_power_limits) { - reg_val = xe_mmio_read64_2x32(mmio, pkg_power_sku); - min = REG_FIELD_GET(PKG_MIN_PWR, reg_val); - max = REG_FIELD_GET(PKG_MAX_PWR, reg_val); + u64 pkg_pwr, min, max; + + pkg_pwr = xe_mmio_read64_2x32(mmio, pkg_power_sku); + min = REG_FIELD_GET(PKG_MIN_PWR, pkg_pwr); + max = REG_FIELD_GET(PKG_MAX_PWR, pkg_pwr); min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power); max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power); if (min && max) @@ -493,8 +495,8 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at { struct xe_hwmon *hwmon = dev_get_drvdata(dev); struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe); - u32 x, y, x_w = 2; /* 2 bits */ - u64 r, tau4, out; + u32 reg_val, x, y, x_w = 2; /* 2 bits */ + u64 tau4, out; int channel = (to_sensor_dev_attr(attr)->index % 2) ? CHANNEL_PKG : CHANNEL_CARD; u32 power_attr = (to_sensor_dev_attr(attr)->index > 1) ?
PL2_HWMON_ATTR : PL1_HWMON_ATTR; @@ -505,23 +507,24 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, (u32 *)&r); + ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, &reg_val); if (ret) { drm_err(&hwmon->xe->drm, - "power interval read fail, ch %d, attr %d, r 0%llx, ret %d\n", - channel, power_attr, r, ret); - r = 0; + "power interval read fail, ch %d, attr %d, val 0x%08x, ret %d\n", + channel, power_attr, reg_val, ret); + reg_val = 0; } } else { - r = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel)); + reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, + channel)); } mutex_unlock(&hwmon->hwmon_lock); xe_pm_runtime_put(hwmon->xe); - x = REG_FIELD_GET(PWR_LIM_TIME_X, r); - y = REG_FIELD_GET(PWR_LIM_TIME_Y, r); + x = REG_FIELD_GET(PWR_LIM_TIME_X, reg_val); + y = REG_FIELD_GET(PWR_LIM_TIME_Y, reg_val); /* * tau = (1 + (x / 4)) * power(2,y), x = bits(23:22), y = bits(21:17) diff --git a/drivers/gpu/drm/xe/xe_nvm.c b/drivers/gpu/drm/xe/xe_nvm.c index 61b0a1531a539b..2cfe9eb673913f 100644 --- a/drivers/gpu/drm/xe/xe_nvm.c +++ b/drivers/gpu/drm/xe/xe_nvm.c @@ -35,6 +35,10 @@ static const struct intel_dg_nvm_region regions[INTEL_DG_NVM_REGIONS] = { static void xe_nvm_release_dev(struct device *dev) { + struct auxiliary_device *aux = container_of(dev, struct auxiliary_device, dev); + struct intel_dg_nvm_dev *nvm = container_of(aux, struct intel_dg_nvm_dev, aux_dev); + + kfree(nvm); } static bool xe_nvm_non_posted_erase(struct xe_device *xe) @@ -162,6 +166,5 @@ void xe_nvm_fini(struct xe_device *xe) auxiliary_device_delete(&nvm->aux_dev); auxiliary_device_uninit(&nvm->aux_dev); - kfree(nvm); xe->nvm = NULL; } diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index e279b47ba03bf6..bb9b6ecad2afcd 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -24,6 +24,7 @@ #include "xe_pcode.h" #include "xe_pxp.h" #include "xe_trace.h" +#include "xe_vm.h" #include "xe_wa.h" /** @@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe) return DEFAULT_VRAM_THRESHOLD; } +static void xe_pm_wake_rebind_workers(struct xe_device *xe) +{ + struct xe_vm *vm, *next; + + mutex_lock(&xe->rebind_resume_lock); + list_for_each_entry_safe(vm, next, &xe->rebind_resume_list, + preempt.pm_activate_link) { + list_del_init(&vm->preempt.pm_activate_link); + xe_vm_resume_rebind_worker(vm); + } + mutex_unlock(&xe->rebind_resume_lock); +} + static int xe_pm_notifier_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -299,30 +313,30 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: + reinit_completion(&xe->pm_block); xe_pm_runtime_get(xe); err = xe_bo_evict_all_user(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err); - xe_pm_runtime_put(xe); - break; - } err = xe_bo_notifier_prepare_all_pinned(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err); - xe_pm_runtime_put(xe); - } + /* + * Keep the runtime pm reference until post hibernation / post suspend to + * avoid a runtime suspend interfering with evicted objects or backup + * allocations.
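The xe->pm_block completion added in this series is consumed in two ways, both visible in these hunks: sleeping paths (the exec IOCTL) block on it interruptibly, while non-sleeping paths poll it with try_wait_for_completion(). A kernel-style sketch of the gating pattern, with illustrative names rather than the driver's own:

    #include <linux/completion.h>
    #include <linux/types.h>

    static struct completion pm_gate;

    void pm_gate_setup(void)
    {
            init_completion(&pm_gate);
            complete_all(&pm_gate);         /* gate open: not suspending */
    }

    void pm_gate_close(void)                /* PM_SUSPEND_PREPARE / PM_HIBERNATION_PREPARE */
    {
            reinit_completion(&pm_gate);
    }

    void pm_gate_open(void)                 /* PM_POST_SUSPEND / PM_POST_HIBERNATION */
    {
            complete_all(&pm_gate);
    }

    /* Sleeping path: returns -ERESTARTSYS on a freezer signal, so an IOCTL reruns. */
    int pm_gate_wait(void)
    {
            return wait_for_completion_interruptible(&pm_gate);
    }

    /* Non-sleeping path: polls the gate without blocking. */
    bool pm_gate_is_open(void)
    {
            return try_wait_for_completion(&pm_gate);
    }

Because complete_all() saturates the completion's done count, the repeated try_wait_for_completion() polling never consumes the open state.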
+ */ break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: + complete_all(&xe->pm_block); + xe_pm_wake_rebind_workers(xe); xe_bo_notifier_unprepare_all_pinned(xe); xe_pm_runtime_put(xe); break; } - if (err) - return NOTIFY_BAD; - return NOTIFY_DONE; } @@ -344,6 +358,14 @@ int xe_pm_init(struct xe_device *xe) if (err) return err; + err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock); + if (err) + goto err_unregister; + + init_completion(&xe->pm_block); + complete_all(&xe->pm_block); + INIT_LIST_HEAD(&xe->rebind_resume_list); + /* For now suspend/resume is only allowed with GuC */ if (!xe_device_uc_enabled(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 41705f5d52e3a3..8f7b0add2364f8 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -41,6 +41,8 @@ * * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode * + * It is the responsibility of the user to clear the mode once firmware flash is complete. + * * Refer :ref:`xe_configfs` for more details on how to use configfs * * Survivability mode is indicated by the below admin-only readable sysfs which provides additional @@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg) struct pci_dev *pdev = to_pci_dev(xe->drm.dev); struct device *dev = &pdev->dev; - xe_configfs_clear_survivability_mode(pdev); sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); } diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c index b804234a655160..9e1236a9ec6734 100644 --- a/drivers/gpu/drm/xe/xe_tile_sysfs.c +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c @@ -44,16 +44,18 @@ int xe_tile_sysfs_init(struct xe_tile *tile) kt->tile = tile; err = kobject_add(&kt->base, &dev->kobj, "tile%d", tile->id); - if (err) { - kobject_put(&kt->base); - return err; - } + if (err) + goto err_object; tile->sysfs = &kt->base; err = xe_vram_freq_sysfs_init(tile); if (err) - return err; + goto err_object; return devm_add_action_or_reset(xe->drm.dev, tile_sysfs_fini, tile); + +err_object: + kobject_put(&kt->base); + return err; } diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index d60c4b1153043c..5146999d27fa2d 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -240,8 +240,8 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) pfence = xe_preempt_fence_create(q, q->lr.context, ++q->lr.seqno); - if (!pfence) { - err = -ENOMEM; + if (IS_ERR(pfence)) { + err = PTR_ERR(pfence); goto out_fini; } @@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind, &vm->rebind_list); + if (!try_wait_for_completion(&vm->xe->pm_block)) + return -EAGAIN; + ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false); if (ret) return ret; @@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues); } +static bool vm_suspend_rebind_worker(struct xe_vm *vm) +{ + struct xe_device *xe = vm->xe; + bool ret = false; + + mutex_lock(&xe->rebind_resume_lock); + if (!try_wait_for_completion(&vm->xe->pm_block)) { + ret = true; + list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list); + } + mutex_unlock(&xe->rebind_resume_lock); + + return ret; +} + +/** + * xe_vm_resume_rebind_worker() - Resume the rebind worker. 
+ * @vm: The vm whose preempt worker to resume. + * + * Resume a preempt worker that was previously suspended by + * vm_suspend_rebind_worker(). + */ +void xe_vm_resume_rebind_worker(struct xe_vm *vm) +{ + queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work); +} + static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); @@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w) } retry: + if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) { + up_write(&vm->lock); + return; + } + if (xe_vm_userptr_check_repin(vm)) { err = xe_vm_userptr_pin(vm); if (err) @@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef) if (flags & XE_VM_FLAG_LR_MODE) { INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); xe_pm_runtime_get_noresume(xe); + INIT_LIST_HEAD(&vm->preempt.pm_activate_link); } if (flags & XE_VM_FLAG_FAULT_MODE) { @@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm) xe_assert(xe, !vm->preempt.num_exec_queues); xe_vm_close(vm); - if (xe_vm_in_preempt_fence_mode(vm)) + if (xe_vm_in_preempt_fence_mode(vm)) { + mutex_lock(&xe->rebind_resume_lock); + list_del_init(&vm->preempt.pm_activate_link); + mutex_unlock(&xe->rebind_resume_lock); flush_work(&vm->preempt.rebind_work); + } if (xe_vm_in_fault_mode(vm)) xe_svm_close(vm); diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 2ecb417c19a280..82b1127958071f 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo, struct xe_exec_queue *q, u64 addr, enum xe_cache_level cache_lvl); +void xe_vm_resume_rebind_worker(struct xe_vm *vm); + /** * xe_vm_resv() - Return's the vm's reservation object * @vm: The vm diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 8a07feef503bad..6058cf739388bc 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -293,6 +293,11 @@ struct xe_vm { * BOs */ struct work_struct rebind_work; + /** + * @preempt.pm_activate_link: Link to list of rebind workers to be + * kicked on resume. 
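vm_suspend_rebind_worker() above, together with xe_pm_wake_rebind_workers() in xe_pm.c, forms a park/requeue handshake: a worker that finds the gate closed parks itself on a list under a mutex, and the resume path drains the list and requeues each entry. A condensed kernel-style sketch of the handshake, with hypothetical types rather than the xe ones:

    #include <linux/list.h>
    #include <linux/mutex.h>
    #include <linux/workqueue.h>

    struct parked_worker {
            struct list_head link;          /* INIT_LIST_HEAD() this at creation */
            struct work_struct work;
    };

    static LIST_HEAD(parked_list);
    static DEFINE_MUTEX(parked_lock);

    /* Worker side: park instead of proceeding while the PM gate is closed. */
    void worker_park(struct parked_worker *w)
    {
            mutex_lock(&parked_lock);
            list_move_tail(&w->link, &parked_list);
            mutex_unlock(&parked_lock);
    }

    /* Resume side: requeue every parked worker exactly once. */
    void workers_unpark_all(struct workqueue_struct *wq)
    {
            struct parked_worker *w, *next;

            mutex_lock(&parked_lock);
            list_for_each_entry_safe(w, next, &parked_list, link) {
                    list_del_init(&w->link);        /* node stays parkable */
                    queue_work(wq, &w->work);
            }
            mutex_unlock(&parked_lock);
    }

Using list_del_init() rather than list_del() is deliberate: it leaves the node in a state where it can be moved onto the list again later, which is also what lets xe_vm_close_and_put() delete the link unconditionally.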
+ */ + struct list_head pm_activate_link; } preempt; /** @um: unified memory state */ diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index e990f20eccfe3c..710f4423726c99 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -30,7 +30,8 @@ 16022287689 GRAPHICS_VERSION(2001) GRAPHICS_VERSION(2004) 13011645652 GRAPHICS_VERSION(2004) - GRAPHICS_VERSION(3001) + GRAPHICS_VERSION_RANGE(3000, 3001) + GRAPHICS_VERSION(3003) 14022293748 GRAPHICS_VERSION_RANGE(2001, 2002) GRAPHICS_VERSION(2004) GRAPHICS_VERSION_RANGE(3000, 3001) diff --git a/drivers/gpu/nova-core/Kconfig b/drivers/gpu/nova-core/Kconfig index 8726d80d6ba401..20d3e6d0d796ef 100644 --- a/drivers/gpu/nova-core/Kconfig +++ b/drivers/gpu/nova-core/Kconfig @@ -1,5 +1,6 @@ config NOVA_CORE tristate "Nova Core GPU driver" + depends on 64BIT depends on PCI depends on RUST depends on RUST_FW_LOADER_ABSTRACTIONS diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs index 274989ea1fb4a5..5d23a91f51ddbd 100644 --- a/drivers/gpu/nova-core/driver.rs +++ b/drivers/gpu/nova-core/driver.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 -use kernel::{auxiliary, bindings, c_str, device::Core, pci, prelude::*, sizes::SZ_16M, sync::Arc}; +use kernel::{ + auxiliary, c_str, + device::Core, + pci, + pci::{Class, ClassMask, Vendor}, + prelude::*, + sizes::SZ_16M, + sync::Arc, +}; use crate::gpu::Gpu; @@ -18,10 +26,25 @@ kernel::pci_device_table!( PCI_TABLE, MODULE_PCI_TABLE, <NovaCore as pci::Driver>::IdInfo, [( - pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_NVIDIA, bindings::PCI_ANY_ID as u32), - () - )] + [ + // Modern NVIDIA GPUs will show up as either VGA or 3D controllers. + ( + pci::DeviceId::from_class_and_vendor( + Class::DISPLAY_VGA, + ClassMask::ClassSubclass, + Vendor::NVIDIA + ), + () + ), + ( + pci::DeviceId::from_class_and_vendor( + Class::DISPLAY_3D, + ClassMask::ClassSubclass, + Vendor::NVIDIA + ), + () + ), + ] ); impl pci::Driver for NovaCore { diff --git a/drivers/gpu/nova-core/fb.rs b/drivers/gpu/nova-core/fb.rs index 4a702525fff4f3..e4dc74f2f90a7b 100644 --- a/drivers/gpu/nova-core/fb.rs +++ b/drivers/gpu/nova-core/fb.rs @@ -3,6 +3,7 @@ use core::ops::Range; use kernel::prelude::*; +use kernel::ptr::{Alignable, Alignment}; use kernel::sizes::*; use kernel::types::ARef; use kernel::{dev_warn, device}; @@ -130,10 +131,9 @@ impl FbLayout { }; let frts = { - const FRTS_DOWN_ALIGN: u64 = SZ_128K as u64; + const FRTS_DOWN_ALIGN: Alignment = Alignment::new::<SZ_128K>(); const FRTS_SIZE: u64 = SZ_1M as u64; - // TODO[NUMM]: replace with `align_down` once it lands. - let frts_base = (vga_workspace.start & !(FRTS_DOWN_ALIGN - 1)) - FRTS_SIZE; + let frts_base = vga_workspace.start.align_down(FRTS_DOWN_ALIGN) - FRTS_SIZE; frts_base..frts_base + FRTS_SIZE }; diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs index b5c9786619a9d4..600cc90b5fabea 100644 --- a/drivers/gpu/nova-core/gpu.rs +++ b/drivers/gpu/nova-core/gpu.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -use kernel::{device, devres::Devres, error::code::*, pci, prelude::*, sync::Arc}; +use kernel::{device, devres::Devres, error::code::*, fmt, pci, prelude::*, sync::Arc}; use crate::driver::Bar0; use crate::falcon::{gsp::Gsp, sec2::Sec2, Falcon}; @@ -12,7 +12,6 @@ use crate::gfw; use crate::regs; use crate::util; use crate::vbios::Vbios; -use core::fmt; macro_rules!
define_chipset { ({ $($variant:ident = $value:expr),* $(,)* }) => diff --git a/drivers/gpu/nova-core/regs/macros.rs b/drivers/gpu/nova-core/regs/macros.rs index a3e6de1779d413..6b9df4205f4698 100644 --- a/drivers/gpu/nova-core/regs/macros.rs +++ b/drivers/gpu/nova-core/regs/macros.rs @@ -149,10 +149,10 @@ macro_rules! register { // TODO[REGA]: display the raw hex value, then the value of all the fields. This requires // matching the fields, which will complexify the syntax considerably... - impl ::core::fmt::Debug for $name { - fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { + impl ::kernel::fmt::Debug for $name { + fn fmt(&self, f: &mut ::kernel::fmt::Formatter<'_>) -> ::kernel::fmt::Result { f.debug_tuple(stringify!($name)) - .field(&format_args!("0x{0:x}", &self.0)) + .field(&::kernel::prelude::fmt!("0x{0:x}", &self.0)) .finish() } } diff --git a/drivers/gpu/nova-core/vbios.rs b/drivers/gpu/nova-core/vbios.rs index 5b5d9f38cbb3a6..091642d6a5a158 100644 --- a/drivers/gpu/nova-core/vbios.rs +++ b/drivers/gpu/nova-core/vbios.rs @@ -10,6 +10,7 @@ use kernel::device; use kernel::error::Result; use kernel::pci; use kernel::prelude::*; +use kernel::ptr::{Alignable, Alignment}; /// The offset of the VBIOS ROM in the BAR0 space. const ROM_OFFSET: usize = 0x300000; @@ -177,8 +178,7 @@ impl<'a> Iterator for VbiosIterator<'a> { // Advance to next image (aligned to 512 bytes). self.current_offset += image_size; - // TODO[NUMM]: replace with `align_up` once it lands. - self.current_offset = self.current_offset.next_multiple_of(512); + self.current_offset = self.current_offset.align_up(Alignment::new::<512>())?; Some(Ok(full_image)) } diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 79997553d8f987..b934523593d95d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -597,8 +597,6 @@ config HID_LED config HID_LENOVO tristate "Lenovo / Thinkpad devices" - depends on ACPI - select ACPI_PLATFORM_PROFILE select NEW_LEDS select LEDS_CLASS help diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c index 0f2cbae39b2bb4..7017bfa590931b 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c @@ -39,8 +39,12 @@ int amd_sfh_get_report(struct hid_device *hid, int report_id, int report_type) struct amdtp_hid_data *hid_data = hid->driver_data; struct amdtp_cl_data *cli_data = hid_data->cli_data; struct request_list *req_list = &cli_data->req_list; + struct amd_input_data *in_data = cli_data->in_data; + struct amd_mp2_dev *mp2; int i; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); for (i = 0; i < cli_data->num_hid_devices; i++) { if (cli_data->hid_sensor_hubs[i] == hid) { struct request_list *new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -75,6 +79,8 @@ void amd_sfh_work(struct work_struct *work) u8 report_id, node_type; u8 report_size = 0; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); req_node = list_last_entry(&req_list->list, struct request_list, list); list_del(&req_node->list); current_index = req_node->current_index; @@ -83,7 +89,6 @@ void amd_sfh_work(struct work_struct *work) node_type = req_node->report_type; kfree(req_node); - mp2 = container_of(in_data, struct amd_mp2_dev, in_data); mp2_ops = mp2->mp2_ops; if (node_type == HID_FEATURE_REPORT) { report_size = mp2_ops->get_feat_rep(sensor_index, report_id, @@ -107,6 +112,8 @@ void amd_sfh_work(struct work_struct *work) cli_data->cur_hid_dev = 
current_index; cli_data->sensor_requested_cnt[current_index] = 0; amdtp_hid_wakeup(cli_data->hid_sensor_hubs[current_index]); + if (!list_empty(&req_list->list)) + schedule_delayed_work(&cli_data->work, 0); } void amd_sfh_work_buffer(struct work_struct *work) @@ -117,9 +124,10 @@ u8 report_size; int i; + mp2 = container_of(in_data, struct amd_mp2_dev, in_data); + guard(mutex)(&mp2->lock); for (i = 0; i < cli_data->num_hid_devices; i++) { if (cli_data->sensor_sts[i] == SENSOR_ENABLED) { - mp2 = container_of(in_data, struct amd_mp2_dev, in_data); report_size = mp2->mp2_ops->get_in_rep(i, cli_data->sensor_idx[i], cli_data->report_id[i], in_data); hid_input_report(cli_data->hid_sensor_hubs[i], HID_INPUT_REPORT, diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_common.h b/drivers/hid/amd-sfh-hid/amd_sfh_common.h index f44a3bb2fbd4fe..78f830c133e5cd 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_common.h +++ b/drivers/hid/amd-sfh-hid/amd_sfh_common.h @@ -10,6 +10,7 @@ #ifndef AMD_SFH_COMMON_H #define AMD_SFH_COMMON_H +#include <linux/mutex.h> #include "amd_sfh_hid.h" @@ -59,6 +60,8 @@ struct amd_mp2_dev { u32 mp2_acs; struct sfh_dev_status dev_en; struct work_struct work; + /* lock to protect mp2 data */ + struct mutex lock; u8 init_done; u8 rver; }; diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c index 2983af969579ea..1d9f955573aa43 100644 --- a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c +++ b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c @@ -466,6 +466,10 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i if (!privdata->cl_data) return -ENOMEM; + rc = devm_mutex_init(&pdev->dev, &privdata->lock); + if (rc) + return rc; + privdata->sfh1_1_ops = (const struct amd_sfh1_1_ops *)id->driver_data; if (privdata->sfh1_1_ops) { if (boot_cpu_data.x86 >= 0x1A) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index d27dcfb2b9e4e1..8db9d4e7c3b0b2 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -974,7 +974,10 @@ static int asus_input_mapping(struct hid_device *hdev, case 0xc4: asus_map_key_clear(KEY_KBDILLUMUP); break; case 0xc5: asus_map_key_clear(KEY_KBDILLUMDOWN); break; case 0xc7: asus_map_key_clear(KEY_KBDILLUMTOGGLE); break; + case 0x4e: asus_map_key_clear(KEY_FN_ESC); break; + + case 0x7e: asus_map_key_clear(KEY_EMOJI_PICKER); break; + case 0x8b: asus_map_key_clear(KEY_PROG1); break; /* ProArt Creator Hub key */ case 0x6b: asus_map_key_clear(KEY_F21); break; /* ASUS touchpad toggle */ case 0x38: asus_map_key_clear(KEY_PROG1); break; /* ROG key */ case 0xba: asus_map_key_clear(KEY_PROG2); break; /* Fn+C ASUS Splendid */ diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c index 482f62a78c4155..5a95ea3bec9805 100644 --- a/drivers/hid/hid-cp2112.c +++ b/drivers/hid/hid-cp2112.c @@ -229,10 +229,12 @@ static int cp2112_gpio_set_unlocked(struct cp2112_device *dev, ret = hid_hw_raw_request(hdev, CP2112_GPIO_SET, buf, CP2112_GPIO_SET_LENGTH, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); - if (ret < 0) + if (ret != CP2112_GPIO_SET_LENGTH) { hid_err(hdev, "error setting GPIO values: %d\n", ret); + return ret < 0 ? ret : -EIO; + } - return ret; + return 0; } static int cp2112_gpio_set(struct gpio_chip *chip, unsigned int offset, @@ -309,9 +311,7 @@ static int cp2112_gpio_direction_output(struct gpio_chip *chip, * Set gpio value when output direction is already set, * as specified in AN495, Rev. 0.2, cpt.
4.4 */ - cp2112_gpio_set_unlocked(dev, offset, value); - - return 0; + return cp2112_gpio_set_unlocked(dev, offset, value); } static int cp2112_hid_get(struct hid_device *hdev, unsigned char report_number, diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index b3121fa7a72d73..654879814f97aa 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -32,8 +32,6 @@ #include #include -#include <linux/platform_profile.h> - #include "hid-ids.h" /* Userspace expects F20 for mic-mute KEY_MICMUTE does not work */ @@ -734,7 +732,7 @@ static int lenovo_raw_event_TP_X12_tab(struct hid_device *hdev, u32 raw_data) report_key_event(input, KEY_RFKILL); return 1; } - platform_profile_cycle(); + report_key_event(input, KEY_PERFORMANCE); return 1; case TP_X12_RAW_HOTKEY_FN_F10: /* TAB1 has PICKUP Phone and TAB2 use Snipping tool*/ diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c index 854926b3cfd455..a2643ae790d6e4 100644 --- a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c +++ b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c @@ -997,6 +997,8 @@ static const struct pci_device_id quicki2c_pci_tbl[] = { { PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, { PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT1, &ptl_ddata) }, { PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, + { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT1, &ptl_ddata) }, + { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT2, &ptl_ddata) }, { } }; MODULE_DEVICE_TABLE(pci, quicki2c_pci_tbl); diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h index d412eafcf9ea48..4e60a7de4727d1 100644 --- a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h +++ b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h @@ -13,6 +13,8 @@ #define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_I2C_PORT2 0xE34A #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT1 0xE448 #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT2 0xE44A +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT1 0x4D48 +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT2 0x4D4A /* Packet size value, the unit is 16 bytes */ #define MAX_PACKET_SIZE_VALUE_LNL 256 diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c index 5e5f179dd11300..84314989dc5346 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c @@ -976,6 +976,8 @@ static const struct pci_device_id quickspi_pci_tbl[] = { {PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_SPI_PORT2, &ptl), }, {PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT1, &ptl), }, {PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT2, &ptl), }, + {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT1, &ptl), }, + {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT2, &ptl), }, {} }; MODULE_DEVICE_TABLE(pci, quickspi_pci_tbl); diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h index 6fdf674b21c5a6..f3532d866749ca 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h +++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h @@ -19,6 +19,8 @@ #define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_SPI_PORT2 0xE34B #define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT1 0xE449 #define
PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT2 0xE44B +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT1 0x4D49 +#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT2 0x4D4B /* HIDSPI special ACPI parameters DSM methods */ #define ACPI_QUICKSPI_REVISION_NUM 2 diff --git a/drivers/hsi/controllers/omap_ssi_port.c b/drivers/hsi/controllers/omap_ssi_port.c index aeb92b803a177a..50dde968febe87 100644 --- a/drivers/hsi/controllers/omap_ssi_port.c +++ b/drivers/hsi/controllers/omap_ssi_port.c @@ -362,7 +362,6 @@ static int ssi_async_break(struct hsi_msg *msg) spin_unlock_bh(&omap_port->lock); } out: - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return err; @@ -401,7 +400,6 @@ static int ssi_async(struct hsi_msg *msg) msg->status = HSI_STATUS_ERROR; } spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); dev_dbg(&port->device, "msg status %d ttype %d ch %d\n", msg->status, msg->ttype, msg->channel); @@ -504,7 +502,6 @@ static int ssi_setup(struct hsi_client *cl) omap_port->ssr.mode = cl->rx_cfg.mode; out: spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return err; @@ -570,7 +567,6 @@ static int ssi_flush(struct hsi_client *cl) pinctrl_pm_select_default_state(omap_port->pdev); spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return 0; @@ -625,7 +621,6 @@ static int ssi_stop_tx(struct hsi_client *cl) writel(SSI_WAKE(0), omap_ssi->sys + SSI_CLEAR_WAKE_REG(port->num)); spin_unlock_bh(&omap_port->wk_lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); /* Release clocks */ @@ -653,7 +648,6 @@ static void ssi_transfer(struct omap_ssi_port *omap_port, } } spin_unlock_bh(&omap_port->lock); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } @@ -683,7 +677,6 @@ static void ssi_cleanup_queues(struct hsi_client *cl) txbufstate |= (1 << i); status |= SSI_DATAACCEPT(i); /* Release the clocks writes, also GDD ones */ - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } ssi_flush_queue(&omap_port->txqueue[i], cl); @@ -739,7 +732,6 @@ static void ssi_cleanup_gdd(struct hsi_controller *ssi, struct hsi_client *cl) * ssi_cleanup_queues */ if (msg->ttype == HSI_MSG_READ) { - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } omap_ssi->gdd_trn[i].msg = NULL; @@ -936,7 +928,6 @@ static void ssi_pio_complete(struct hsi_port *port, struct list_head *queue) reg = readl(omap_ssi->sys + SSI_MPU_ENABLE_REG(port->num, 0)); if (msg->ttype == HSI_MSG_WRITE) { /* Release clocks for write transfer */ - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } reg &= ~val; @@ -981,7 +972,6 @@ static irqreturn_t ssi_pio_thread(int irq, void *ssi_port) /* TODO: sleep if we retry? 
*/ } while (status_reg); - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); return IRQ_HANDLED; @@ -1018,7 +1008,6 @@ static irqreturn_t ssi_wake_thread(int irq __maybe_unused, void *ssi_port) } hsi_event(port, HSI_EVENT_STOP_RX); if (test_and_clear_bit(SSI_WAKE_EN, &omap_port->flags)) { - pm_runtime_mark_last_busy(omap_port->pdev); pm_runtime_put_autosuspend(omap_port->pdev); } } diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 9d28fcf7cd2a6f..2760feb9f83b5d 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -769,6 +769,16 @@ config SENSORS_GL520SM This driver can also be built as a module. If so, the module will be called gl520sm. +config SENSORS_GPD + tristate "GPD handhelds" + depends on X86 && DMI && HAS_IOPORT + help + If you say yes here you get support for fan readings and + control over GPD handheld devices. + + Can also be built as a module. In that case it will be + called gpd-fan. + config SENSORS_G760A tristate "GMT G760A" depends on I2C @@ -1698,6 +1708,16 @@ config SENSORS_NCT6683 This driver can also be built as a module. If so, the module will be called nct6683. +config SENSORS_NCT6694 + tristate "Nuvoton NCT6694 Hardware Monitor support" + depends on MFD_NCT6694 + help + Say Y here to support Nuvoton NCT6694 hardware monitoring + functionality. + + This driver can also be built as a module. If so, the module + will be called nct6694-hwmon. + config SENSORS_NCT6775_CORE tristate select REGMAP @@ -1895,6 +1915,16 @@ config SENSORS_RASPBERRYPI_HWMON This driver can also be built as a module. If so, the module will be called raspberrypi-hwmon. +config SENSORS_SA67MCU + tristate "Kontron sa67mcu hardware monitoring driver" + depends on MFD_SL28CPLD || COMPILE_TEST + help + If you say yes here you get support for the voltage and temperature + monitor of the sa67 board management controller. + + This driver can also be built as a module. If so, the module + will be called sa67mcu-hwmon. + config SENSORS_SL28CPLD tristate "Kontron sl28cpld hardware monitoring driver" depends on MFD_SL28CPLD || COMPILE_TEST @@ -1930,8 +1960,8 @@ config SENSORS_SHT21 tristate "Sensiron humidity and temperature sensors. SHT21 and compat." depends on I2C help - If you say yes here you get support for the Sensiron SHT21, SHT25 - humidity and temperature sensors. + If you say yes here you get support for the Sensirion SHT20, SHT21, + SHT25 humidity and temperature sensors. This driver can also be built as a module. If so, the module will be called sht21. @@ -2252,13 +2282,14 @@ config SENSORS_INA2XX will be called ina2xx. config SENSORS_INA238 - tristate "Texas Instruments INA238" + tristate "Texas Instruments INA238 and compatibles" depends on I2C select REGMAP_I2C help - If you say yes here you get support for the INA238 power monitor - chip. This driver supports voltage, current, power and temperature - measurements as well as alarm configuration. + If you say yes here you get support for INA228, INA237, INA238, + INA700, INA780, and SQ52206 power monitor chips. This driver supports + voltage, current, power, energy, and temperature measurements as well + as alarm configuration. This driver can also be built as a module. If so, the module will be called ina238. @@ -2673,9 +2704,10 @@ config SENSORS_ASUS_EC depends on ACPI_EC help If you say yes here you get support for the ACPI embedded controller - hardware monitoring interface found in ASUS motherboards.
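On the omap_ssi hunks above: every removed pm_runtime_mark_last_busy() immediately preceded a pm_runtime_put_autosuspend(), and the cleanup relies on the PM core refreshing the last-busy timestamp inside the put itself. A sketch of the resulting idiom (do_io() is a hypothetical helper; kernel context assumed):

    #include <linux/device.h>
    #include <linux/pm_runtime.h>

    static int do_io(struct device *dev)
    {
            int ret;

            ret = pm_runtime_resume_and_get(dev);
            if (ret)
                    return ret;

            /* ... touch the hardware ... */

            /*
             * Drops the usage count; current PM cores also refresh the
             * last-busy timestamp here, so no explicit
             * pm_runtime_mark_last_busy() call is needed.
             */
            pm_runtime_put_autosuspend(dev);
            return 0;
    }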
The driver - currently supports B550/X570 boards, although other ASUS boards might - provide this monitoring interface as well. + hardware monitoring interface found in some ASUS motherboards. This + interface exposes sensors such as water flow and water temperature, + optional fans, and additional temperature sensors (T_Sensor, chipset + temperatures). This driver can also be built as a module. If so, the module will be called asus_ec_sensors. diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index cd8bc4752b4dbf..73b2abdcc6dd9c 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_SENSORS_GIGABYTE_WATERFORCE) += gigabyte_waterforce.o obj-$(CONFIG_SENSORS_GL518SM) += gl518sm.o obj-$(CONFIG_SENSORS_GL520SM) += gl520sm.o obj-$(CONFIG_SENSORS_GSC) += gsc-hwmon.o +obj-$(CONFIG_SENSORS_GPD) += gpd-fan.o obj-$(CONFIG_SENSORS_GPIO_FAN) += gpio-fan.o obj-$(CONFIG_SENSORS_GXP_FAN_CTRL) += gxp-fan-ctrl.o obj-$(CONFIG_SENSORS_HIH6130) += hih6130.o @@ -174,6 +175,7 @@ obj-$(CONFIG_SENSORS_MLXREG_FAN) += mlxreg-fan.o obj-$(CONFIG_SENSORS_MENF21BMC_HWMON) += menf21bmc_hwmon.o obj-$(CONFIG_SENSORS_MR75203) += mr75203.o obj-$(CONFIG_SENSORS_NCT6683) += nct6683.o +obj-$(CONFIG_SENSORS_NCT6694) += nct6694-hwmon.o obj-$(CONFIG_SENSORS_NCT6775_CORE) += nct6775-core.o nct6775-objs := nct6775-platform.o obj-$(CONFIG_SENSORS_NCT6775) += nct6775.o @@ -196,6 +198,7 @@ obj-$(CONFIG_SENSORS_PT5161L) += pt5161l.o obj-$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o obj-$(CONFIG_SENSORS_QNAP_MCU_HWMON) += qnap-mcu-hwmon.o obj-$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o +obj-$(CONFIG_SENSORS_SA67MCU) += sa67mcu-hwmon.o obj-$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o obj-$(CONFIG_SENSORS_SBRMI) += sbrmi.o obj-$(CONFIG_SENSORS_SCH56XX_COMMON)+= sch56xx-common.o diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c index 4ac554731e98a7..34a8f6b834c97f 100644 --- a/drivers/hwmon/asus-ec-sensors.c +++ b/drivers/hwmon/asus-ec-sensors.c @@ -49,15 +49,19 @@ static char *mutex_path_override; */ #define ASUS_EC_MAX_BANK 3 -#define ACPI_LOCK_DELAY_MS 500 +#define ACPI_LOCK_DELAY_MS 800 /* ACPI mutex for locking access to the EC for the firmware */ #define ASUS_HW_ACCESS_MUTEX_ASMX "\\AMW0.ASMX" #define ASUS_HW_ACCESS_MUTEX_RMTW_ASMX "\\RMTW.ASMX" +#define ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0 "\\_SB.PC00.LPCB.SIO1.MUT0" + #define ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0 "\\_SB_.PCI0.SBRG.SIO1.MUT0" +#define ASUS_HW_ACCESS_MUTEX_SB_PCI0_LPCB_SIO1_MUT0 "\\_SB_.PCI0.LPCB.SIO1.MUT0" + #define MAX_IDENTICAL_BOARD_VARIATIONS 3 /* Moniker for the ACPI global lock (':' is not allowed in ASL identifiers) */ @@ -115,10 +119,18 @@ enum ec_sensors { ec_sensor_fan_cpu_opt, /* VRM heat sink fan [RPM] */ ec_sensor_fan_vrm_hs, + /* VRM east heat sink fan [RPM] */ + ec_sensor_fan_vrme_hs, + /* VRM west heat sink fan [RPM] */ + ec_sensor_fan_vrmw_hs, /* Chipset fan [RPM] */ ec_sensor_fan_chipset, /* Water flow sensor reading [RPM] */ ec_sensor_fan_water_flow, + /* USB4 fan [RPM] */ + ec_sensor_fan_usb4, + /* M.2 fan [RPM] */ + ec_sensor_fan_m2, /* CPU current [A] */ ec_sensor_curr_cpu, /* "Water_In" temperature sensor reading [℃] */ @@ -148,8 +160,12 @@ enum ec_sensors { #define SENSOR_IN_CPU_CORE BIT(ec_sensor_in_cpu_core) #define SENSOR_FAN_CPU_OPT BIT(ec_sensor_fan_cpu_opt) #define SENSOR_FAN_VRM_HS BIT(ec_sensor_fan_vrm_hs) +#define SENSOR_FAN_VRME_HS BIT(ec_sensor_fan_vrme_hs) +#define SENSOR_FAN_VRMW_HS BIT(ec_sensor_fan_vrmw_hs) #define
SENSOR_FAN_CHIPSET BIT(ec_sensor_fan_chipset) #define SENSOR_FAN_WATER_FLOW BIT(ec_sensor_fan_water_flow) +#define SENSOR_FAN_USB4 BIT(ec_sensor_fan_usb4) +#define SENSOR_FAN_M2 BIT(ec_sensor_fan_m2) #define SENSOR_CURR_CPU BIT(ec_sensor_curr_cpu) #define SENSOR_TEMP_WATER_IN BIT(ec_sensor_temp_water_in) #define SENSOR_TEMP_WATER_OUT BIT(ec_sensor_temp_water_out) @@ -166,9 +182,12 @@ enum board_family { family_amd_500_series, family_amd_600_series, family_amd_800_series, + family_amd_wrx_90, + family_intel_200_series, family_intel_300_series, family_intel_400_series, - family_intel_600_series + family_intel_600_series, + family_intel_700_series }; /* @@ -275,6 +294,33 @@ static const struct ec_sensor_info sensors_family_amd_800[] = { EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), }; +static const struct ec_sensor_info sensors_family_amd_wrx_90[] = { + [ec_sensor_temp_cpu_package] = + EC_SENSOR("CPU Package", hwmon_temp, 1, 0x00, 0x31), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), + [ec_sensor_fan_vrmw_hs] = + EC_SENSOR("VRMW HS", hwmon_fan, 2, 0x00, 0xb4), + [ec_sensor_fan_usb4] = EC_SENSOR("USB4", hwmon_fan, 2, 0x00, 0xb6), + [ec_sensor_fan_vrme_hs] = + EC_SENSOR("VRME HS", hwmon_fan, 2, 0x00, 0xbc), + [ec_sensor_fan_m2] = EC_SENSOR("M.2", hwmon_fan, 2, 0x00, 0xbe), + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x04), +}; + +static const struct ec_sensor_info sensors_family_intel_200[] = { + [ec_sensor_temp_chipset] = + EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), + [ec_sensor_temp_cpu] = EC_SENSOR("CPU", hwmon_temp, 1, 0x00, 0x3b), + [ec_sensor_temp_mb] = + EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x3c), + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x00, 0x3d), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xbc), +}; + static const struct ec_sensor_info sensors_family_intel_300[] = { [ec_sensor_temp_chipset] = EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), @@ -323,6 +369,16 @@ static const struct ec_sensor_info sensors_family_intel_600[] = { EC_SENSOR("Water_Block_In", hwmon_temp, 1, 0x01, 0x02), }; +static const struct ec_sensor_info sensors_family_intel_700[] = { + [ec_sensor_temp_t_sensor] = + EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x01, 0x09), + [ec_sensor_temp_t_sensor_2] = + EC_SENSOR("T_Sensor 2", hwmon_temp, 1, 0x01, 0x05), + [ec_sensor_temp_vrm] = EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x33), + [ec_sensor_fan_cpu_opt] = + EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), +}; + /* Shortcuts for common combinations */ #define SENSOR_SET_TEMP_CHIPSET_CPU_MB \ (SENSOR_TEMP_CHIPSET | SENSOR_TEMP_CPU | SENSOR_TEMP_MB) @@ -343,6 +399,52 @@ struct ec_board_info { enum board_family family; }; +static const struct ec_board_info board_info_crosshair_viii_dark_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | + SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info board_info_crosshair_viii_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | + SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | + SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info 
board_info_crosshair_viii_impact = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | + SENSOR_IN_CPU_CORE, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_amd_500_series, +}; + +static const struct ec_board_info board_info_crosshair_x670e_gene = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + +static const struct ec_board_info board_info_crosshair_x670e_hero = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_SET_TEMP_WATER, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_maximus_vi_hero = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | @@ -352,6 +454,22 @@ static const struct ec_board_info board_info_maximus_vi_hero = { .family = family_intel_300_series, }; +static const struct ec_board_info board_info_maximus_xi_hero = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | + SENSOR_TEMP_T_SENSOR | + SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW, + .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, + .family = family_intel_300_series, +}; + +static const struct ec_board_info board_info_maximus_z690_formula = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_SET_TEMP_WATER | SENSOR_FAN_WATER_FLOW, + .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, + .family = family_intel_600_series, +}; + static const struct ec_board_info board_info_prime_x470_pro = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | @@ -376,28 +494,11 @@ static const struct ec_board_info board_info_prime_x670e_pro_wifi = { .family = family_amd_600_series, }; -static const struct ec_board_info board_info_pro_art_x570_creator_wifi = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | - SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT | - SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - -static const struct ec_board_info board_info_pro_art_x670E_creator_wifi = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM | - SENSOR_TEMP_T_SENSOR, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_600_series, -}; - -static const struct ec_board_info board_info_pro_art_x870E_creator_wifi = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM | +static const struct ec_board_info board_info_prime_z270_a = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_800_series, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_LPCB_SIO1_MUT0, + .family = family_intel_200_series, }; static const struct ec_board_info board_info_pro_art_b550_creator = { @@ -408,72 +509,43 @@ static const struct ec_board_info board_info_pro_art_b550_creator = { .family = family_amd_500_series, }; -static const struct ec_board_info board_info_pro_ws_x570_ace = { +static const struct ec_board_info board_info_pro_art_x570_creator_wifi = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | - SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | + SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT | SENSOR_CURR_CPU | 
SENSOR_IN_CPU_CORE, .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, .family = family_amd_500_series, }; -static const struct ec_board_info board_info_crosshair_x670e_hero = { +static const struct ec_board_info board_info_pro_art_x670E_creator_wifi = { .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_MB | SENSOR_TEMP_VRM | - SENSOR_SET_TEMP_WATER, + SENSOR_TEMP_T_SENSOR, .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, .family = family_amd_600_series, }; -static const struct ec_board_info board_info_crosshair_x670e_gene = { +static const struct ec_board_info board_info_pro_art_x870E_creator_wifi = { .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = family_amd_600_series, -}; - -static const struct ec_board_info board_info_crosshair_viii_dark_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | - SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - -static const struct ec_board_info board_info_crosshair_viii_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | - SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | - SENSOR_IN_CPU_CORE, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_amd_500_series, -}; - -static const struct ec_board_info board_info_maximus_xi_hero = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | - SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | - SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW, - .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, - .family = family_intel_300_series, + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_800_series, }; -static const struct ec_board_info board_info_maximus_z690_formula = { - .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | - SENSOR_SET_TEMP_WATER | SENSOR_FAN_WATER_FLOW, +static const struct ec_board_info board_info_pro_ws_wrx90e_sage_se = { + /* Board also has a nct6798 with 7 more fans and temperatures */ + .sensors = SENSOR_TEMP_CPU_PACKAGE | SENSOR_TEMP_T_SENSOR | + SENSOR_FAN_CPU_OPT | SENSOR_FAN_USB4 | SENSOR_FAN_M2 | + SENSOR_FAN_VRME_HS | SENSOR_FAN_VRMW_HS, .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, - .family = family_intel_600_series, + .family = family_amd_wrx_90, }; -static const struct ec_board_info board_info_crosshair_viii_impact = { - .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | - SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | - SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | - SENSOR_IN_CPU_CORE, +static const struct ec_board_info board_info_pro_ws_x570_ace = { + .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | + SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | + SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, .family = family_amd_500_series, }; @@ -495,6 +567,20 @@ static const struct ec_board_info board_info_strix_b550_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_b650e_i_gaming = { + .sensors = SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | + SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_IN_CPU_CORE, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + +static 
const struct ec_board_info board_info_strix_b850_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_800_series, +}; + static const struct ec_board_info board_info_strix_x570_e_gaming = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | @@ -528,6 +614,35 @@ static const struct ec_board_info board_info_strix_x570_i_gaming = { .family = family_amd_500_series, }; +static const struct ec_board_info board_info_strix_x670e_e_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_600_series, +}; + +static const struct ec_board_info board_info_strix_x670e_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + +static const struct ec_board_info board_info_strix_x870_i_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_800_series, +}; + +static const struct ec_board_info board_info_strix_x870e_e_gaming_wifi = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, + .family = family_amd_800_series, +}; + static const struct ec_board_info board_info_strix_z390_f_gaming = { .sensors = SENSOR_TEMP_CHIPSET | SENSOR_TEMP_VRM | SENSOR_TEMP_T_SENSOR | @@ -554,6 +669,35 @@ static const struct ec_board_info board_info_strix_z690_a_gaming_wifi_d4 = { .family = family_intel_600_series, }; +static const struct ec_board_info board_info_strix_z690_e_gaming_wifi = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, + .family = family_intel_600_series, +}; + +static const struct ec_board_info board_info_strix_z790_e_gaming_wifi_ii = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | + SENSOR_FAN_CPU_OPT, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0, + .family = family_intel_700_series, +}; + +static const struct ec_board_info board_info_strix_z790_i_gaming_wifi = { + .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_T_SENSOR_2 | + SENSOR_TEMP_VRM, + .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PC00_LPCB_SIO1_MUT0, + .family = family_intel_700_series, +}; + +static const struct ec_board_info board_info_tuf_gaming_x670e_plus = { + .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | + SENSOR_TEMP_MB | SENSOR_TEMP_VRM | + SENSOR_TEMP_WATER_IN | SENSOR_TEMP_WATER_OUT | + SENSOR_FAN_CPU_OPT, + .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, + .family = family_amd_600_series, +}; + static const struct ec_board_info board_info_zenith_ii_extreme = { .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | @@ -566,15 +710,6 @@ static const struct ec_board_info board_info_zenith_ii_extreme = { .family = family_amd_500_series, }; -static const struct ec_board_info board_info_tuf_gaming_x670e_plus = { - .sensors = SENSOR_TEMP_CPU | SENSOR_TEMP_CPU_PACKAGE | - SENSOR_TEMP_MB | SENSOR_TEMP_VRM | - SENSOR_TEMP_WATER_IN | SENSOR_TEMP_WATER_OUT | - SENSOR_FAN_CPU_OPT, - .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, - .family = 
family_amd_600_series, -}; - #define DMI_EXACT_MATCH_ASUS_BOARD_NAME(name, board_info) \ { \ .matches = { \ @@ -594,14 +729,18 @@ static const struct dmi_system_id dmi_table[] = { &board_info_prime_x570_pro), DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME X670E-PRO WIFI", &board_info_prime_x670e_pro_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME Z270-A", + &board_info_prime_z270_a), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", + &board_info_pro_art_b550_creator), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X570-CREATOR WIFI", &board_info_pro_art_x570_creator_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X670E-CREATOR WIFI", &board_info_pro_art_x670E_creator_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X870E-CREATOR WIFI", &board_info_pro_art_x870E_creator_wifi), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt B550-CREATOR", - &board_info_pro_art_b550_creator), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS WRX90E-SAGE SE", + &board_info_pro_ws_wrx90e_sage_se), DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS X570-ACE", &board_info_pro_ws_x570_ace), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII DARK HERO", @@ -612,22 +751,26 @@ static const struct dmi_system_id dmi_table[] = { &board_info_crosshair_viii_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII HERO (WI-FI)", &board_info_crosshair_viii_hero), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E HERO", - &board_info_crosshair_x670e_hero), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII IMPACT", + &board_info_crosshair_viii_impact), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E GENE", &board_info_crosshair_x670e_gene), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR X670E HERO", + &board_info_crosshair_x670e_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO", &board_info_maximus_xi_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO (WI-FI)", &board_info_maximus_xi_hero), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS Z690 FORMULA", &board_info_maximus_z690_formula), - DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII IMPACT", - &board_info_crosshair_viii_impact), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-E GAMING", &board_info_strix_b550_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING", &board_info_strix_b550_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B650E-I GAMING WIFI", + &board_info_strix_b650e_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B850-I GAMING WIFI", + &board_info_strix_b850_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING", &board_info_strix_x570_e_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING WIFI II", @@ -636,18 +779,34 @@ static const struct dmi_system_id dmi_table[] = { &board_info_strix_x570_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-I GAMING", &board_info_strix_x570_i_gaming), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-E GAMING WIFI", + &board_info_strix_x670e_e_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X670E-I GAMING WIFI", + &board_info_strix_x670e_i_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870-I GAMING WIFI", + &board_info_strix_x870_i_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X870E-E GAMING WIFI", + &board_info_strix_x870e_e_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z390-F GAMING", &board_info_strix_z390_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z490-F GAMING", &board_info_strix_z490_f_gaming), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-A GAMING WIFI D4", 
&board_info_strix_z690_a_gaming_wifi_d4), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-E GAMING WIFI", + &board_info_strix_z690_e_gaming_wifi), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-E GAMING WIFI II", + &board_info_strix_z790_e_gaming_wifi_ii), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z790-I GAMING WIFI", + &board_info_strix_z790_i_gaming_wifi), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME", &board_info_zenith_ii_extreme), DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME ALPHA", &board_info_zenith_ii_extreme), DMI_EXACT_MATCH_ASUS_BOARD_NAME("TUF GAMING X670E-PLUS", &board_info_tuf_gaming_x670e_plus), + DMI_EXACT_MATCH_ASUS_BOARD_NAME("TUF GAMING X670E-PLUS WIFI", + &board_info_tuf_gaming_x670e_plus), {}, }; @@ -1115,6 +1274,12 @@ static int asus_ec_probe(struct platform_device *pdev) case family_amd_800_series: ec_data->sensors_info = sensors_family_amd_800; break; + case family_amd_wrx_90: + ec_data->sensors_info = sensors_family_amd_wrx_90; + break; + case family_intel_200_series: + ec_data->sensors_info = sensors_family_intel_200; + break; case family_intel_300_series: ec_data->sensors_info = sensors_family_intel_300; break; @@ -1124,6 +1289,9 @@ static int asus_ec_probe(struct platform_device *pdev) case family_intel_600_series: ec_data->sensors_info = sensors_family_intel_600; break; + case family_intel_700_series: + ec_data->sensors_info = sensors_family_intel_700; + break; default: dev_err(dev, "Unknown board family: %d", ec_data->board_info->family); diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 1b9203b20d7099..ad79db5a183e05 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -122,29 +122,29 @@ static const struct tjmax tjmax_table[] = { }; struct tjmax_model { - u8 model; - u8 mask; + u32 vfm; + u8 stepping_mask; int tjmax; }; #define ANY 0xff static const struct tjmax_model tjmax_model_table[] = { - { 0x1c, 10, 100000 }, /* D4xx, K4xx, N4xx, D5xx, K5xx, N5xx */ - { 0x1c, ANY, 90000 }, /* Z5xx, N2xx, possibly others - * Note: Also matches 230 and 330, - * which are covered by tjmax_table - */ - { 0x26, ANY, 90000 }, /* Atom Tunnel Creek (Exx), Lincroft (Z6xx) - * Note: TjMax for E6xxT is 110C, but CPU type - * is undetectable by software - */ - { 0x27, ANY, 90000 }, /* Atom Medfield (Z2460) */ - { 0x35, ANY, 90000 }, /* Atom Clover Trail/Cloverview (Z27x0) */ - { 0x36, ANY, 100000 }, /* Atom Cedar Trail/Cedarview (N2xxx, D2xxx) - * Also matches S12x0 (stepping 9), covered by - * PCI table - */ + { INTEL_ATOM_BONNELL, 10, 100000 }, /* D4xx, K4xx, N4xx, D5xx, K5xx, N5xx */ + { INTEL_ATOM_BONNELL, ANY, 90000 }, /* Z5xx, N2xx, possibly others + * Note: Also matches 230 and 330, + * which are covered by tjmax_table + */ + { INTEL_ATOM_BONNELL_MID, ANY, 90000 }, /* Atom Tunnel Creek (Exx), Lincroft (Z6xx) + * Note: TjMax for E6xxT is 110C, but CPU type + * is undetectable by software + */ + { INTEL_ATOM_SALTWELL_MID, ANY, 90000 }, /* Atom Medfield (Z2460) */ + { INTEL_ATOM_SALTWELL_TABLET, ANY, 90000 }, /* Atom Clover Trail/Cloverview (Z27x0) */ + { INTEL_ATOM_SALTWELL, ANY, 100000 }, /* Atom Cedar Trail/Cedarview (N2xxx, D2xxx) + * Also matches S12x0 (stepping 9), covered by + * PCI table + */ }; static bool is_pkg_temp_data(struct temp_data *tdata) @@ -180,6 +180,11 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) } pci_dev_put(host_bridge); + /* + * This is literally looking for "CPU XXX" in the model string. + * Not checking it against the model as well. 
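For reference, a minimal userspace sketch of the packed vendor/family/model matching that the reworked tjmax_model_table above relies on. The bit layout (model in bits 0-7, family in bits 8-15, vendor in bits 16-23) mirrors the kernel's VFM_MAKE() helpers but is an assumption of this sketch; SKETCH_VFM_MAKE, VENDOR_INTEL and lookup() are invented names, not driver code.

/* Standalone sketch: one integer comparison replaces separate model/stepping checks. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_VFM_MAKE(vendor, family, model) \
	(((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (model))

#define VENDOR_INTEL	0
#define ANY_STEPPING	0xff

struct sketch_tjmax {
	uint32_t vfm;
	uint8_t stepping_mask;
	int tjmax_mC;
};

static const struct sketch_tjmax table[] = {
	{ SKETCH_VFM_MAKE(VENDOR_INTEL, 6, 0x1c), 10,           100000 },
	{ SKETCH_VFM_MAKE(VENDOR_INTEL, 6, 0x1c), ANY_STEPPING,  90000 },
};

static int lookup(uint32_t vfm, uint8_t stepping)
{
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].vfm == vfm &&
		    (table[i].stepping_mask == ANY_STEPPING ||
		     table[i].stepping_mask == stepping))
			return table[i].tjmax_mC;
	return -1;
}

int main(void)
{
	/* Stepping 10 hits the exact row, stepping 2 falls through to the wildcard. */
	printf("%d %d\n", lookup(SKETCH_VFM_MAKE(VENDOR_INTEL, 6, 0x1c), 10),
	       lookup(SKETCH_VFM_MAKE(VENDOR_INTEL, 6, 0x1c), 2));
	return 0;
}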
Just purely a + * string search. + */ for (i = 0; i < ARRAY_SIZE(tjmax_table); i++) { if (strstr(c->x86_model_id, tjmax_table[i].id)) return tjmax_table[i].tjmax; @@ -187,17 +192,18 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) for (i = 0; i < ARRAY_SIZE(tjmax_model_table); i++) { const struct tjmax_model *tm = &tjmax_model_table[i]; - if (c->x86_model == tm->model && - (tm->mask == ANY || c->x86_stepping == tm->mask)) + if (c->x86_vfm == tm->vfm && + (tm->stepping_mask == ANY || + tm->stepping_mask == c->x86_stepping)) return tm->tjmax; } /* Early chips have no MSR for TjMax */ - if (c->x86_model == 0xf && c->x86_stepping < 4) + if (c->x86_vfm == INTEL_CORE2_MEROM && c->x86_stepping < 4) usemsr_ee = 0; - if (c->x86_model > 0xe && usemsr_ee) { + if (c->x86_vfm > INTEL_CORE_YONAH && usemsr_ee) { u8 platform_id; /* @@ -211,7 +217,8 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) "Unable to access MSR 0x17, assuming desktop" " CPU\n"); usemsr_ee = 0; - } else if (c->x86_model < 0x17 && !(eax & 0x10000000)) { + } else if (c->x86_vfm < INTEL_CORE2_PENRYN && + !(eax & 0x10000000)) { /* * Trust bit 28 up to Penryn, I could not find any * documentation on that; if you happen to know @@ -226,7 +233,7 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) * Mobile Penryn CPU seems to be platform ID 7 or 5 * (guesswork) */ - if (c->x86_model == 0x17 && + if (c->x86_vfm == INTEL_CORE2_PENRYN && (platform_id == 5 || platform_id == 7)) { /* * If MSR EE bit is set, set it to 90 degrees C, @@ -258,18 +265,6 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev) return tjmax; } -static bool cpu_has_tjmax(struct cpuinfo_x86 *c) -{ - u8 model = c->x86_model; - - return model > 0xe && - model != 0x1c && - model != 0x26 && - model != 0x27 && - model != 0x35 && - model != 0x36; -} - static int get_tjmax(struct temp_data *tdata, struct device *dev) { struct cpuinfo_x86 *c = &cpu_data(tdata->cpu); @@ -287,8 +282,7 @@ static int get_tjmax(struct temp_data *tdata, struct device *dev) */ err = rdmsr_safe_on_cpu(tdata->cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx); if (err) { - if (cpu_has_tjmax(c)) - dev_warn(dev, "Unable to read TjMax from CPU %u\n", tdata->cpu); + dev_warn_once(dev, "Unable to read TjMax from CPU %u\n", tdata->cpu); } else { val = (eax >> 16) & 0xff; if (val) @@ -460,7 +454,7 @@ static int chk_ucode_version(unsigned int cpu) * Readings might stop update when processor visited too deep sleep, * fixed for stepping D0 (6EC). */ - if (c->x86_model == 0xe && c->x86_stepping < 0xc && c->microcode < 0x39) { + if (c->x86_vfm == INTEL_CORE_YONAH && c->x86_stepping < 0xc && c->microcode < 0x39) { pr_err("Errata AE18 not fixed, update BIOS or microcode of the CPU!\n"); return -ENODEV; } @@ -580,7 +574,7 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, * MSR_IA32_TEMPERATURE_TARGET register. Atoms don't have the register * at all. */ - if (c->x86_model > 0xe && c->x86_model != 0x1c) + if (c->x86_vfm > INTEL_CORE_YONAH && c->x86_vfm != INTEL_ATOM_BONNELL) if (get_ttarget(tdata, &pdev->dev) >= 0) tdata->attr_size++; @@ -793,7 +787,9 @@ static int __init coretemp_init(void) /* * CPUID.06H.EAX[0] indicates whether the CPU has thermal * sensors. We check this bit only, all the early CPUs - * without thermal sensors will be filtered out. + * without thermal sensors will be filtered out. 
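The coretemp_init() comment above refers to CPUID.06H:EAX[0], the digital thermal sensor feature bit. A userspace sketch of the same check, on x86 with GCC/Clang's <cpuid.h> (not kernel infrastructure):

/* Sketch: probe the DTS feature bit the driver's comment describes. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	if (!__get_cpuid(6, &eax, &ebx, &ecx, &edx))
		return 1;

	/* Bit 0 of EAX: digital temperature sensor present. */
	printf("DTS %ssupported\n", (eax & 1) ? "" : "not ");
	return 0;
}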
This + * includes all the Family 5 and Family 15 (Pentium 4) + * models, since they never set the CPUID bit. */ if (!x86_match_cpu(coretemp_ids)) return -ENODEV; diff --git a/drivers/hwmon/cros_ec_hwmon.c b/drivers/hwmon/cros_ec_hwmon.c index 9991c3fa020ac8..48331703f2f50d 100644 --- a/drivers/hwmon/cros_ec_hwmon.c +++ b/drivers/hwmon/cros_ec_hwmon.c @@ -7,20 +7,34 @@ #include #include +#include #include #include #include #include #include +#include #include #include #define DRV_NAME "cros-ec-hwmon" +#define CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION 0 +#define CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION 1 +#define CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION 2 + struct cros_ec_hwmon_priv { struct cros_ec_device *cros_ec; const char *temp_sensor_names[EC_TEMP_SENSOR_ENTRIES + EC_TEMP_SENSOR_B_ENTRIES]; u8 usable_fans; + bool fan_control_supported; + u8 manual_fans; /* bits to indicate whether the fan is set to manual */ + u8 manual_fan_pwm[EC_FAN_SPEED_ENTRIES]; +}; + +struct cros_ec_hwmon_cooling_priv { + struct cros_ec_hwmon_priv *hwmon_priv; + u8 index; }; static int cros_ec_hwmon_read_fan_speed(struct cros_ec_device *cros_ec, u8 index, u16 *speed) @@ -36,6 +50,42 @@ static int cros_ec_hwmon_read_fan_speed(struct cros_ec_device *cros_ec, u8 index return 0; } +static int cros_ec_hwmon_read_pwm_value(struct cros_ec_device *cros_ec, u8 index, u8 *pwm_value) +{ + struct ec_params_pwm_get_fan_duty req = { + .fan_idx = index, + }; + struct ec_response_pwm_get_fan_duty resp; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION, + EC_CMD_PWM_GET_FAN_DUTY, &req, sizeof(req), &resp, sizeof(resp)); + if (ret < 0) + return ret; + + *pwm_value = (u8)DIV_ROUND_CLOSEST(le32_to_cpu(resp.percent) * 255, 100); + return 0; +} + +static int cros_ec_hwmon_read_pwm_enable(struct cros_ec_device *cros_ec, u8 index, + u8 *control_method) +{ + struct ec_params_auto_fan_ctrl_v2 req = { + .cmd = EC_AUTO_FAN_CONTROL_CMD_GET, + .fan_idx = index, + }; + struct ec_response_auto_fan_control resp; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION, + EC_CMD_THERMAL_AUTO_FAN_CTRL, &req, sizeof(req), &resp, sizeof(resp)); + if (ret < 0) + return ret; + + *control_method = resp.is_auto ? 
2 : 1; + return 0; +} + static int cros_ec_hwmon_read_temp(struct cros_ec_device *cros_ec, u8 index, u8 *temp) { unsigned int offset; @@ -75,6 +125,8 @@ static int cros_ec_hwmon_read(struct device *dev, enum hwmon_sensor_types type, { struct cros_ec_hwmon_priv *priv = dev_get_drvdata(dev); int ret = -EOPNOTSUPP; + u8 control_method; + u8 pwm_value; u16 speed; u8 temp; @@ -92,6 +144,17 @@ static int cros_ec_hwmon_read(struct device *dev, enum hwmon_sensor_types type, if (ret == 0) *val = cros_ec_hwmon_is_error_fan(speed); } + } else if (type == hwmon_pwm) { + if (attr == hwmon_pwm_enable) { + ret = cros_ec_hwmon_read_pwm_enable(priv->cros_ec, channel, + &control_method); + if (ret == 0) + *val = control_method; + } else if (attr == hwmon_pwm_input) { + ret = cros_ec_hwmon_read_pwm_value(priv->cros_ec, channel, &pwm_value); + if (ret == 0) + *val = pwm_value; + } } else if (type == hwmon_temp) { if (attr == hwmon_temp_input) { ret = cros_ec_hwmon_read_temp(priv->cros_ec, channel, &temp); @@ -124,6 +187,74 @@ static int cros_ec_hwmon_read_string(struct device *dev, enum hwmon_sensor_types return -EOPNOTSUPP; } +static int cros_ec_hwmon_set_fan_pwm_val(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + struct ec_params_pwm_set_fan_duty_v1 req = { + .fan_idx = index, + .percent = DIV_ROUND_CLOSEST((uint32_t)val * 100, 255), + }; + int ret; + + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION, + EC_CMD_PWM_SET_FAN_DUTY, &req, sizeof(req), NULL, 0); + if (ret < 0) + return ret; + return 0; +} + +static int cros_ec_hwmon_write_pwm_input(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + u8 control_method; + int ret; + + ret = cros_ec_hwmon_read_pwm_enable(cros_ec, index, &control_method); + if (ret) + return ret; + if (control_method != 1) + return -EOPNOTSUPP; + + return cros_ec_hwmon_set_fan_pwm_val(cros_ec, index, val); +} + +static int cros_ec_hwmon_write_pwm_enable(struct cros_ec_device *cros_ec, u8 index, u8 val) +{ + struct ec_params_auto_fan_ctrl_v2 req = { + .fan_idx = index, + .cmd = EC_AUTO_FAN_CONTROL_CMD_SET, + }; + int ret; + + /* No CrOS EC supports no fan speed control */ + if (val == 0) + return -EOPNOTSUPP; + + req.set_auto = (val != 1) ? 
true : false; + ret = cros_ec_cmd(cros_ec, CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION, + EC_CMD_THERMAL_AUTO_FAN_CTRL, &req, sizeof(req), NULL, 0); + if (ret < 0) + return ret; + return 0; +} + +static int cros_ec_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + struct cros_ec_hwmon_priv *priv = dev_get_drvdata(dev); + + if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_input: + return cros_ec_hwmon_write_pwm_input(priv->cros_ec, channel, val); + case hwmon_pwm_enable: + return cros_ec_hwmon_write_pwm_enable(priv->cros_ec, channel, val); + default: + return -EOPNOTSUPP; + } + } + + return -EOPNOTSUPP; +} + static umode_t cros_ec_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, int channel) { @@ -132,6 +263,9 @@ static umode_t cros_ec_hwmon_is_visible(const void *data, enum hwmon_sensor_type if (type == hwmon_fan) { if (priv->usable_fans & BIT(channel)) return 0444; + } else if (type == hwmon_pwm) { + if (priv->fan_control_supported && priv->usable_fans & BIT(channel)) + return 0644; } else if (type == hwmon_temp) { if (priv->temp_sensor_names[channel]) return 0444; @@ -147,6 +281,11 @@ static const struct hwmon_channel_info * const cros_ec_hwmon_info[] = { HWMON_F_INPUT | HWMON_F_FAULT, HWMON_F_INPUT | HWMON_F_FAULT, HWMON_F_INPUT | HWMON_F_FAULT), + HWMON_CHANNEL_INFO(pwm, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE), HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_FAULT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_FAULT | HWMON_T_LABEL, @@ -175,9 +314,46 @@ static const struct hwmon_channel_info * const cros_ec_hwmon_info[] = { NULL }; +static int cros_ec_hwmon_cooling_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *val) +{ + *val = 255; + return 0; +} + +static int cros_ec_hwmon_cooling_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *val) +{ + const struct cros_ec_hwmon_cooling_priv *priv = cdev->devdata; + u8 read_val; + int ret; + + ret = cros_ec_hwmon_read_pwm_value(priv->hwmon_priv->cros_ec, priv->index, &read_val); + if (ret) + return ret; + + *val = read_val; + return 0; +} + +static int cros_ec_hwmon_cooling_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long val) +{ + const struct cros_ec_hwmon_cooling_priv *priv = cdev->devdata; + + return cros_ec_hwmon_write_pwm_input(priv->hwmon_priv->cros_ec, priv->index, val); +} + +static const struct thermal_cooling_device_ops cros_ec_thermal_cooling_ops = { + .get_max_state = cros_ec_hwmon_cooling_get_max_state, + .get_cur_state = cros_ec_hwmon_cooling_get_cur_state, + .set_cur_state = cros_ec_hwmon_cooling_set_cur_state, +}; + static const struct hwmon_ops cros_ec_hwmon_ops = { .read = cros_ec_hwmon_read, .read_string = cros_ec_hwmon_read_string, + .write = cros_ec_hwmon_write, .is_visible = cros_ec_hwmon_is_visible, }; @@ -233,6 +409,65 @@ static void cros_ec_hwmon_probe_fans(struct cros_ec_hwmon_priv *priv) } } +static inline bool is_cros_ec_cmd_available(struct cros_ec_device *cros_ec, + u16 cmd, u8 version) +{ + int ret; + + ret = cros_ec_get_cmd_versions(cros_ec, cmd); + return ret >= 0 && (ret & EC_VER_MASK(version)); +} + +static bool cros_ec_hwmon_probe_fan_control_supported(struct cros_ec_device *cros_ec) +{ + return is_cros_ec_cmd_available(cros_ec, EC_CMD_PWM_GET_FAN_DUTY, + CROS_EC_HWMON_PWM_GET_FAN_DUTY_CMD_VERSION) && + is_cros_ec_cmd_available(cros_ec, 
EC_CMD_PWM_SET_FAN_DUTY, + CROS_EC_HWMON_PWM_SET_FAN_DUTY_CMD_VERSION) && + is_cros_ec_cmd_available(cros_ec, EC_CMD_THERMAL_AUTO_FAN_CTRL, + CROS_EC_HWMON_THERMAL_AUTO_FAN_CTRL_CMD_VERSION); +} + +static void cros_ec_hwmon_register_fan_cooling_devices(struct device *dev, + struct cros_ec_hwmon_priv *priv) +{ + struct cros_ec_hwmon_cooling_priv *cpriv; + struct thermal_cooling_device *cdev; + const char *type; + size_t i; + + if (!IS_ENABLED(CONFIG_THERMAL)) + return; + + if (!priv->fan_control_supported) + return; + + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->usable_fans & BIT(i))) + continue; + + cpriv = devm_kzalloc(dev, sizeof(*cpriv), GFP_KERNEL); + if (!cpriv) + continue; + + type = devm_kasprintf(dev, GFP_KERNEL, "%s-fan%zu", dev_name(dev), i); + if (!type) { + dev_warn(dev, "no memory to compose cooling device type for fan %zu\n", i); + continue; + } + + cpriv->hwmon_priv = priv; + cpriv->index = i; + cdev = devm_thermal_of_cooling_device_register(dev, NULL, type, cpriv, + &cros_ec_thermal_cooling_ops); + if (IS_ERR(cdev)) { + dev_warn(dev, "failed to register fan %zu as a cooling device: %pe\n", i, + cdev); + continue; + } + } +} + static int cros_ec_hwmon_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -259,13 +494,89 @@ static int cros_ec_hwmon_probe(struct platform_device *pdev) cros_ec_hwmon_probe_temp_sensors(dev, priv, thermal_version); cros_ec_hwmon_probe_fans(priv); + priv->fan_control_supported = cros_ec_hwmon_probe_fan_control_supported(priv->cros_ec); + cros_ec_hwmon_register_fan_cooling_devices(dev, priv); hwmon_dev = devm_hwmon_device_register_with_info(dev, "cros_ec", priv, &cros_ec_hwmon_chip_info, NULL); + platform_set_drvdata(pdev, priv); return PTR_ERR_OR_ZERO(hwmon_dev); } +static int cros_ec_hwmon_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct cros_ec_hwmon_priv *priv = platform_get_drvdata(pdev); + u8 control_method; + size_t i; + int ret; + + if (!priv->fan_control_supported) + return 0; + + /* The EC reverts fan control to automatic across suspend, so store the settings before suspending. */ + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->usable_fans & BIT(i))) + continue; + + ret = cros_ec_hwmon_read_pwm_enable(priv->cros_ec, i, &control_method); + if (ret) { + dev_warn(&pdev->dev, "failed to get mode setting for fan %zu: %d\n", i, + ret); + continue; + } + + if (control_method != 1) { + priv->manual_fans &= ~BIT(i); + continue; + } + priv->manual_fans |= BIT(i); + + ret = cros_ec_hwmon_read_pwm_value(priv->cros_ec, i, &priv->manual_fan_pwm[i]); + /* + * If reading the PWM value failed, invalidate the stored mode by marking + * the fan as auto-controlled; the EC switches that fan to automatic mode + * across suspend anyway. + */ + if (ret) { + dev_warn(&pdev->dev, "failed to get PWM setting for fan %zu: %pe\n", i, + ERR_PTR(ret)); + priv->manual_fans &= ~BIT(i); + } + } + + return 0; +} + +static int cros_ec_hwmon_resume(struct platform_device *pdev) +{ + const struct cros_ec_hwmon_priv *priv = platform_get_drvdata(pdev); + size_t i; + int ret; + + if (!priv->fan_control_supported) + return 0; + + /* The EC reverts fan control to automatic across suspend; restore the settings from before the suspend. */ + for (i = 0; i < EC_FAN_SPEED_ENTRIES; i++) { + if (!(priv->manual_fans & BIT(i))) + continue; + + /* + * Writing a fan PWM value to the EC also switches that fan to manual mode, + * so we do not need to issue a separate mode-change command. 
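A standalone sketch of the save/restore pattern the suspend and resume handlers above implement: one bit per fan records "was in manual mode", plus a saved PWM value to replay after resume. All names here (sketch_fan_state, sketch_save, sketch_restore) are invented for illustration.

/* Sketch only; mode encoding follows hwmon convention: 1 = manual, 2 = automatic. */
#include <stdint.h>

#define SKETCH_NUM_FANS 4

struct sketch_fan_state {
	uint8_t manual_fans;                  /* bit i set => fan i was manual */
	uint8_t manual_fan_pwm[SKETCH_NUM_FANS];
};

static void sketch_save(struct sketch_fan_state *s,
			const uint8_t mode[SKETCH_NUM_FANS],
			const uint8_t pwm[SKETCH_NUM_FANS])
{
	for (unsigned int i = 0; i < SKETCH_NUM_FANS; i++) {
		if (mode[i] != 1) {
			s->manual_fans &= ~(1u << i);
			continue;
		}
		s->manual_fans |= 1u << i;
		s->manual_fan_pwm[i] = pwm[i];
	}
}

static void sketch_restore(const struct sketch_fan_state *s,
			   void (*set_pwm)(unsigned int fan, uint8_t val))
{
	/*
	 * Only fans that were manual are touched; per the driver comment,
	 * writing a PWM value implicitly returns a fan to manual mode.
	 */
	for (unsigned int i = 0; i < SKETCH_NUM_FANS; i++)
		if (s->manual_fans & (1u << i))
			set_pwm(i, s->manual_fan_pwm[i]);
}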
+ */ + ret = cros_ec_hwmon_set_fan_pwm_val(priv->cros_ec, i, priv->manual_fan_pwm[i]); + if (ret) + dev_warn(&pdev->dev, "failed to restore settings for fan %zu: %pe\n", i, + ERR_PTR(ret)); + } + + return 0; +} + static const struct platform_device_id cros_ec_hwmon_id[] = { { DRV_NAME, 0 }, {} @@ -274,6 +585,8 @@ static const struct platform_device_id cros_ec_hwmon_id[] = { static struct platform_driver cros_ec_hwmon_driver = { .driver.name = DRV_NAME, .probe = cros_ec_hwmon_probe, + .suspend = pm_ptr(cros_ec_hwmon_suspend), + .resume = pm_ptr(cros_ec_hwmon_resume), .id_table = cros_ec_hwmon_id, }; module_platform_driver(cros_ec_hwmon_driver); diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 1e2c8e2840015a..cbe1a74a3deea3 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -446,7 +447,6 @@ static int i8k_set_fan(const struct dell_smm_data *data, u8 fan, int speed) if (disallow_fan_support) return -EINVAL; - speed = (speed < 0) ? 0 : ((speed > data->i8k_fan_max) ? data->i8k_fan_max : speed); regs.ebx = fan | (speed << 8); return dell_smm_call(data->ops, ®s); @@ -637,6 +637,8 @@ static long i8k_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) if (copy_from_user(&speed, argp + 1, sizeof(int))) return -EFAULT; + speed = clamp_val(speed, 0, data->i8k_fan_max); + mutex_lock(&data->i8k_mutex); err = i8k_set_fan(data, val, speed); if (err < 0) @@ -762,6 +764,13 @@ static int dell_smm_get_cur_state(struct thermal_cooling_device *dev, unsigned l if (ret < 0) return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > cdata->data->i8k_fan_max) + return -ENODATA; + *state = ret; return 0; @@ -849,7 +858,14 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types break; case hwmon_pwm_enable: - if (auto_fan) + if (auto_fan) { + /* + * The setting affects all fans, so only create a + * single attribute. + */ + if (channel != 1) + return 0; + /* * There is no command for retrieve the current status * from BIOS, and userspace/firmware itself can change @@ -857,6 +873,10 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types * Thus we can only provide write-only access for now. */ return 0200; + } + + if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO) + return 0644; break; default: @@ -926,14 +946,28 @@ static int dell_smm_read(struct device *dev, enum hwmon_sensor_types type, u32 a } break; case hwmon_pwm: + ret = i8k_get_fan_status(data, channel); + if (ret < 0) + return ret; + switch (attr) { case hwmon_pwm_input: - ret = i8k_get_fan_status(data, channel); - if (ret < 0) - return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. 
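Restating the mapping the dell-smm hunks around this point implement: raw fan state translates to the standard hwmon pwmN_enable encoding (1 = manual, 2 = automatic), and a state above i8k_fan_max is treated as "probably automatic". A compact sketch, with sketch_* names invented here:

#include <errno.h>

static int sketch_pwm_enable_from_state(int state, int i8k_fan_auto)
{
	return state == i8k_fan_auto ? 2 : 1;	/* hwmon: 2 = auto, 1 = manual */
}

static int sketch_pwm_input_from_state(int state, int i8k_fan_max, int pwm_mult)
{
	if (state > i8k_fan_max)	/* likely automatic mode, no usable duty */
		return -ENODATA;
	state *= pwm_mult;
	return state > 255 ? 255 : state;
}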
+ */ + if (ret > data->i8k_fan_max) + return -ENODATA; *val = clamp_val(ret * data->i8k_pwm_mult, 0, 255); + return 0; + case hwmon_pwm_enable: + if (ret == I8K_FAN_AUTO) + *val = 2; + else + *val = 1; + return 0; default: break; @@ -1020,16 +1054,32 @@ static int dell_smm_write(struct device *dev, enum hwmon_sensor_types type, u32 return 0; case hwmon_pwm_enable: - if (!val) - return -EINVAL; - - if (val == 1) + switch (val) { + case 1: enable = false; - else + break; + case 2: enable = true; + break; + default: + return -EINVAL; + } mutex_lock(&data->i8k_mutex); - err = i8k_enable_fan_auto_mode(data, enable); + if (auto_fan) { + err = i8k_enable_fan_auto_mode(data, enable); + } else { + /* + * When putting the fan into manual control mode we have to ensure + * that the device does not overheat until the userspace fan control + * software takes over. Because of this we set the fan speed to + * i8k_fan_max when disabling automatic fan control. + */ + if (enable) + err = i8k_set_fan(data, channel, I8K_FAN_AUTO); + else + err = i8k_set_fan(data, channel, data->i8k_fan_max); + } mutex_unlock(&data->i8k_mutex); if (err < 0) @@ -1080,9 +1130,9 @@ static const struct hwmon_channel_info * const dell_smm_info[] = { ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE ), NULL }; @@ -1280,6 +1330,13 @@ static const struct dmi_system_id i8k_dmi_table[] __initconst = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7050"), }, }, + { + .ident = "Dell OptiPlex 7040", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7040"), + }, + }, { .ident = "Dell Precision", .matches = { @@ -1331,7 +1388,6 @@ struct i8k_config_data { enum i8k_configs { DELL_LATITUDE_D520, - DELL_PRECISION_490, DELL_STUDIO, DELL_XPS, }; @@ -1341,10 +1397,6 @@ static const struct i8k_config_data i8k_config_data[] __initconst = { .fan_mult = 1, .fan_max = I8K_FAN_TURBO, }, - [DELL_PRECISION_490] = { - .fan_mult = 1, - .fan_max = I8K_FAN_TURBO, - }, [DELL_STUDIO] = { .fan_mult = 1, .fan_max = I8K_FAN_HIGH, @@ -1364,15 +1416,6 @@ static const struct dmi_system_id i8k_config_dmi_table[] __initconst = { }, .driver_data = (void *)&i8k_config_data[DELL_LATITUDE_D520], }, - { - .ident = "Dell Precision 490", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, - "Precision WorkStation 490"), - }, - .driver_data = (void *)&i8k_config_data[DELL_PRECISION_490], - }, { .ident = "Dell Studio", .matches = { diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c new file mode 100644 index 00000000000000..644dc3ca9df7da --- /dev/null +++ b/drivers/hwmon/gpd-fan.c @@ -0,0 +1,715 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* Platform driver for GPD devices that expose fan control via hwmon sysfs. + * + * Fan control is provided via pwm interface in the range [0-255]. + * Each model has a different range in the EC, the written value is scaled to + * accommodate for that. 
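The header comment above says the user-visible [0-255] PWM range is rescaled to each model's EC range. A worked example of that round trip, mirroring gpd_cast_pwm_range() and gpd_wm2_read_pwm() further down (to_ec/from_ec are sketch names, and the DIV_ROUND_CLOSEST here is a simplified non-negative variant of the kernel macro):

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

static unsigned int to_ec(unsigned int val, unsigned int pwm_max)
{
	/* [0-255] -> [1-pwm_max], as in gpd_cast_pwm_range() */
	return DIV_ROUND_CLOSEST(val * (pwm_max - 1), 255) + 1;
}

static unsigned int from_ec(unsigned int reg, unsigned int pwm_max)
{
	/* inverse mapping, as in gpd_wm2_read_pwm() */
	return DIV_ROUND_CLOSEST((reg - 1) * 255, pwm_max - 1);
}

int main(void)
{
	/*
	 * wm2 uses pwm_max = 184: 255 -> 184 and 0 -> 1, and a mid-range
	 * value such as 128 survives the round trip via rounding.
	 */
	printf("%u %u %u\n", to_ec(255, 184), to_ec(0, 184),
	       from_ec(to_ec(128, 184), 184));
	return 0;
}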
+ * + * Based on this repo: + * https://github.com/Cryolitia/gpd-fan-driver + * + * Copyright (c) 2024 Cryolitia PukNgae + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "gpdfan" +#define GPD_PWM_CTR_OFFSET 0x1841 + +static char *gpd_fan_board = ""; +module_param(gpd_fan_board, charp, 0444); + +// EC read/write locker, protecting a sequence of EC operations +static DEFINE_MUTEX(gpd_fan_sequence_lock); + +enum gpd_board { + win_mini, + win4_6800u, + win_max_2, + duo, +}; + +enum FAN_PWM_ENABLE { + DISABLE = 0, + MANUAL = 1, + AUTOMATIC = 2, +}; + +static struct { + enum FAN_PWM_ENABLE pwm_enable; + u8 pwm_value; + + const struct gpd_fan_drvdata *drvdata; +} gpd_driver_priv; + +struct gpd_fan_drvdata { + const char *board_name; // Board name for module param comparison + const enum gpd_board board; + + const u8 addr_port; + const u8 data_port; + const u16 manual_control_enable; + const u16 rpm_read; + const u16 pwm_write; + const u16 pwm_max; +}; + +static struct gpd_fan_drvdata gpd_win_mini_drvdata = { + .board_name = "win_mini", + .board = win_mini, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x047A, + .rpm_read = 0x0478, + .pwm_write = 0x047A, + .pwm_max = 244, +}; + +static struct gpd_fan_drvdata gpd_duo_drvdata = { + .board_name = "duo", + .board = duo, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x047A, + .rpm_read = 0x0478, + .pwm_write = 0x047A, + .pwm_max = 244, +}; + +static struct gpd_fan_drvdata gpd_win4_drvdata = { + .board_name = "win4", + .board = win4_6800u, + + .addr_port = 0x2E, + .data_port = 0x2F, + .manual_control_enable = 0xC311, + .rpm_read = 0xC880, + .pwm_write = 0xC311, + .pwm_max = 127, +}; + +static struct gpd_fan_drvdata gpd_wm2_drvdata = { + .board_name = "wm2", + .board = win_max_2, + + .addr_port = 0x4E, + .data_port = 0x4F, + .manual_control_enable = 0x0275, + .rpm_read = 0x0218, + .pwm_write = 0x1809, + .pwm_max = 184, +}; + +static const struct dmi_system_id dmi_table[] = { + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen 8840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-01") + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-02") + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Win Mini + // GPD Win Mini with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1617-02-L") + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Win 4 with AMD Ryzen 6800U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Default string"), + }, + .driver_data = &gpd_win4_drvdata, + }, + { + // GPD Win 4 with Ryzen 7840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Ver. 
1.0"), + }, + // Since 7840U, win4 uses the same drvdata as wm2 + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win 4 with Ryzen 7840U (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1618-04"), + DMI_MATCH(DMI_BOARD_VERSION, "Ver.1.0"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win Max 2 with Ryzen 6800U + // GPD Win Max 2 2023 with Ryzen 7840U + // GPD Win Max 2 2024 with Ryzen 8840U + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-04"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Win Max 2 with AMD Ryzen HX370 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"), + }, + .driver_data = &gpd_wm2_drvdata, + }, + { + // GPD Duo + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1622-01"), + }, + .driver_data = &gpd_duo_drvdata, + }, + { + // GPD Duo (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1622-01-L"), + }, + .driver_data = &gpd_duo_drvdata, + }, + { + // GPD Pocket 4 + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1628-04"), + }, + .driver_data = &gpd_win_mini_drvdata, + }, + { + // GPD Pocket 4 (another) + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1628-04-L"), + }, + .driver_data = &gpd_win_mini_drvdata, + }, + {} +}; + +static const struct gpd_fan_drvdata *gpd_module_drvdata[] = { + &gpd_win_mini_drvdata, &gpd_win4_drvdata, &gpd_wm2_drvdata, NULL +}; + +// Helper functions to handle EC read/write +static void gpd_ecram_read(u16 offset, u8 *val) +{ + u16 addr_port = gpd_driver_priv.drvdata->addr_port; + u16 data_port = gpd_driver_priv.drvdata->data_port; + + outb(0x2E, addr_port); + outb(0x11, data_port); + outb(0x2F, addr_port); + outb((u8)((offset >> 8) & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x10, data_port); + outb(0x2F, addr_port); + outb((u8)(offset & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x12, data_port); + outb(0x2F, addr_port); + *val = inb(data_port); +} + +static void gpd_ecram_write(u16 offset, u8 value) +{ + u16 addr_port = gpd_driver_priv.drvdata->addr_port; + u16 data_port = gpd_driver_priv.drvdata->data_port; + + outb(0x2E, addr_port); + outb(0x11, data_port); + outb(0x2F, addr_port); + outb((u8)((offset >> 8) & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x10, data_port); + outb(0x2F, addr_port); + outb((u8)(offset & 0xFF), data_port); + + outb(0x2E, addr_port); + outb(0x12, data_port); + outb(0x2F, addr_port); + outb(value, data_port); +} + +static int gpd_generic_read_rpm(void) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 high, low; + + gpd_ecram_read(drvdata->rpm_read, &high); + gpd_ecram_read(drvdata->rpm_read + 1, &low); + + return (u16)high << 8 | low; +} + +static void gpd_win4_init_ec(void) +{ + u8 chip_id, chip_ver; + + gpd_ecram_read(0x2000, &chip_id); + + if (chip_id == 0x55) { + gpd_ecram_read(0x1060, &chip_ver); + gpd_ecram_write(0x1060, chip_ver | 0x80); + } +} + +static int gpd_win4_read_rpm(void) +{ + int ret; + + ret = gpd_generic_read_rpm(); + + if (ret == 0) + // Re-init EC when speed is 0 + gpd_win4_init_ec(); + + return ret; +} + +static int gpd_wm2_read_rpm(void) +{ + for (u16 pwm_ctr_offset = GPD_PWM_CTR_OFFSET; + pwm_ctr_offset <= GPD_PWM_CTR_OFFSET + 2; pwm_ctr_offset++) { + u8 PWMCTR; + + gpd_ecram_read(pwm_ctr_offset, &PWMCTR); + + if (PWMCTR != 0xB8) + 
gpd_ecram_write(pwm_ctr_offset, 0xB8); + } + + return gpd_generic_read_rpm(); +} + +// Read value for fan1_input +static int gpd_read_rpm(void) +{ + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case duo: + return gpd_generic_read_rpm(); + case win4_6800u: + return gpd_win4_read_rpm(); + case win_max_2: + return gpd_wm2_read_rpm(); + } + + return 0; +} + +static int gpd_wm2_read_pwm(void) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 var; + + gpd_ecram_read(drvdata->pwm_write, &var); + + // Match gpd_generic_write_pwm(u8) below + return DIV_ROUND_CLOSEST((var - 1) * 255, (drvdata->pwm_max - 1)); +} + +// Read value for pwm1 +static int gpd_read_pwm(void) +{ + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case duo: + case win4_6800u: + switch (gpd_driver_priv.pwm_enable) { + case DISABLE: + return 255; + case MANUAL: + return gpd_driver_priv.pwm_value; + case AUTOMATIC: + return -EOPNOTSUPP; + } + break; + case win_max_2: + return gpd_wm2_read_pwm(); + } + return 0; +} + +// PWM value's range in EC is 1 - pwm_max, cast 0 - 255 to it. +static inline u8 gpd_cast_pwm_range(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + + return DIV_ROUND_CLOSEST(val * (drvdata->pwm_max - 1), 255) + 1; +} + +static void gpd_generic_write_pwm(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 pwm_reg; + + pwm_reg = gpd_cast_pwm_range(val); + gpd_ecram_write(drvdata->pwm_write, pwm_reg); +} + +static void gpd_duo_write_pwm(u8 val) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + u8 pwm_reg; + + pwm_reg = gpd_cast_pwm_range(val); + gpd_ecram_write(drvdata->pwm_write, pwm_reg); + gpd_ecram_write(drvdata->pwm_write + 1, pwm_reg); +} + +// Write value for pwm1 +static int gpd_write_pwm(u8 val) +{ + if (gpd_driver_priv.pwm_enable != MANUAL) + return -EPERM; + + switch (gpd_driver_priv.drvdata->board) { + case duo: + gpd_duo_write_pwm(val); + break; + case win_mini: + case win4_6800u: + case win_max_2: + gpd_generic_write_pwm(val); + break; + } + + return 0; +} + +static void gpd_win_mini_set_pwm_enable(enum FAN_PWM_ENABLE pwm_enable) +{ + switch (pwm_enable) { + case DISABLE: + gpd_generic_write_pwm(255); + break; + case MANUAL: + gpd_generic_write_pwm(gpd_driver_priv.pwm_value); + break; + case AUTOMATIC: + gpd_ecram_write(gpd_driver_priv.drvdata->pwm_write, 0); + break; + } +} + +static void gpd_duo_set_pwm_enable(enum FAN_PWM_ENABLE pwm_enable) +{ + switch (pwm_enable) { + case DISABLE: + gpd_duo_write_pwm(255); + break; + case MANUAL: + gpd_duo_write_pwm(gpd_driver_priv.pwm_value); + break; + case AUTOMATIC: + gpd_ecram_write(gpd_driver_priv.drvdata->pwm_write, 0); + break; + } +} + +static void gpd_wm2_set_pwm_enable(enum FAN_PWM_ENABLE enable) +{ + const struct gpd_fan_drvdata *const drvdata = gpd_driver_priv.drvdata; + + switch (enable) { + case DISABLE: + gpd_generic_write_pwm(255); + gpd_ecram_write(drvdata->manual_control_enable, 1); + break; + case MANUAL: + gpd_generic_write_pwm(gpd_driver_priv.pwm_value); + gpd_ecram_write(drvdata->manual_control_enable, 1); + break; + case AUTOMATIC: + gpd_ecram_write(drvdata->manual_control_enable, 0); + break; + } +} + +// Write value for pwm1_enable +static void gpd_set_pwm_enable(enum FAN_PWM_ENABLE enable) +{ + if (enable == MANUAL) + // Set pwm_value to max firstly when switching to manual mode, in + // consideration of device safety. 
+ gpd_driver_priv.pwm_value = 255; + + switch (gpd_driver_priv.drvdata->board) { + case win_mini: + case win4_6800u: + gpd_win_mini_set_pwm_enable(enable); + break; + case duo: + gpd_duo_set_pwm_enable(enable); + break; + case win_max_2: + gpd_wm2_set_pwm_enable(enable); + break; + } +} + +static umode_t gpd_fan_hwmon_is_visible(__always_unused const void *drvdata, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel) +{ + if (type == hwmon_fan && attr == hwmon_fan_input) { + return 0444; + } else if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + case hwmon_pwm_input: + return 0644; + default: + return 0; + } + } + return 0; +} + +static int gpd_fan_hwmon_read(__always_unused struct device *dev, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel, long *val) +{ + int ret; + + ret = mutex_lock_interruptible(&gpd_fan_sequence_lock); + if (ret) + return ret; + + if (type == hwmon_fan) { + if (attr == hwmon_fan_input) { + ret = gpd_read_rpm(); + + if (ret < 0) + goto out; + + *val = ret; + ret = 0; + goto out; + } + } else if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + *val = gpd_driver_priv.pwm_enable; + ret = 0; + goto out; + case hwmon_pwm_input: + ret = gpd_read_pwm(); + + if (ret < 0) + goto out; + + *val = ret; + ret = 0; + goto out; + } + } + + ret = -EOPNOTSUPP; + +out: + mutex_unlock(&gpd_fan_sequence_lock); + return ret; +} + +static int gpd_fan_hwmon_write(__always_unused struct device *dev, + enum hwmon_sensor_types type, u32 attr, + __always_unused int channel, long val) +{ + int ret; + + ret = mutex_lock_interruptible(&gpd_fan_sequence_lock); + if (ret) + return ret; + + if (type == hwmon_pwm) { + switch (attr) { + case hwmon_pwm_enable: + if (!in_range(val, 0, 3)) { + ret = -EINVAL; + goto out; + } + + gpd_driver_priv.pwm_enable = val; + + gpd_set_pwm_enable(gpd_driver_priv.pwm_enable); + ret = 0; + goto out; + case hwmon_pwm_input: + if (!in_range(val, 0, 256)) { + ret = -ERANGE; + goto out; + } + + gpd_driver_priv.pwm_value = val; + + ret = gpd_write_pwm(val); + goto out; + } + } + + ret = -EOPNOTSUPP; + +out: + mutex_unlock(&gpd_fan_sequence_lock); + return ret; +} + +static const struct hwmon_ops gpd_fan_ops = { + .is_visible = gpd_fan_hwmon_is_visible, + .read = gpd_fan_hwmon_read, + .write = gpd_fan_hwmon_write, +}; + +static const struct hwmon_channel_info *gpd_fan_hwmon_channel_info[] = { + HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT), + HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE), + NULL +}; + +static struct hwmon_chip_info gpd_fan_chip_info = { + .ops = &gpd_fan_ops, + .info = gpd_fan_hwmon_channel_info +}; + +static int gpd_fan_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + const struct resource *region; + const struct resource *res; + const struct device *hwdev; + + res = platform_get_resource(pdev, IORESOURCE_IO, 0); + if (!res) + return dev_err_probe(dev, -ENODEV, + "Failed to get platform resource\n"); + + region = devm_request_region(dev, res->start, + resource_size(res), DRIVER_NAME); + if (!region) + return dev_err_probe(dev, -EBUSY, + "Failed to request region\n"); + + hwdev = devm_hwmon_device_register_with_info(dev, + DRIVER_NAME, + NULL, + &gpd_fan_chip_info, + NULL); + if (IS_ERR(hwdev)) + return dev_err_probe(dev, PTR_ERR(hwdev), + "Failed to register hwmon device\n"); + + return 0; +} + +static void gpd_fan_remove(__always_unused struct platform_device *pdev) +{ + gpd_driver_priv.pwm_enable = AUTOMATIC; + 
gpd_set_pwm_enable(AUTOMATIC); +} + +static struct platform_driver gpd_fan_driver = { + .probe = gpd_fan_probe, + .remove = gpd_fan_remove, + .driver = { + .name = KBUILD_MODNAME, + }, +}; + +static struct platform_device *gpd_fan_platform_device; + +static int __init gpd_fan_init(void) +{ + const struct gpd_fan_drvdata *match = NULL; + + for (const struct gpd_fan_drvdata **p = gpd_module_drvdata; *p; p++) { + if (strcmp(gpd_fan_board, (*p)->board_name) == 0) { + match = *p; + break; + } + } + + if (!match) { + const struct dmi_system_id *dmi_match = + dmi_first_match(dmi_table); + if (dmi_match) + match = dmi_match->driver_data; + } + + if (!match) + return -ENODEV; + + gpd_driver_priv.pwm_enable = AUTOMATIC; + gpd_driver_priv.pwm_value = 255; + gpd_driver_priv.drvdata = match; + + struct resource gpd_fan_resources[] = { + { + .start = match->addr_port, + .end = match->data_port, + .flags = IORESOURCE_IO, + }, + }; + + gpd_fan_platform_device = platform_create_bundle(&gpd_fan_driver, + gpd_fan_probe, + gpd_fan_resources, + 1, NULL, 0); + + if (IS_ERR(gpd_fan_platform_device)) { + pr_warn("Failed to create platform device\n"); + return PTR_ERR(gpd_fan_platform_device); + } + + return 0; +} + +static void __exit gpd_fan_exit(void) +{ + platform_device_unregister(gpd_fan_platform_device); + platform_driver_unregister(&gpd_fan_driver); +} + +MODULE_DEVICE_TABLE(dmi, dmi_table); + +module_init(gpd_fan_init); +module_exit(gpd_fan_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Cryolitia PukNgae "); +MODULE_DESCRIPTION("GPD Devices fan control driver"); diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 1688c210888abf..0b4bdcd33c7b8d 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct hwmon_device { const char *label; struct device dev; const struct hwmon_chip_info *chip; + struct mutex lock; struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; @@ -165,6 +167,8 @@ static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp) int ret; long t; + guard(mutex)(&hwdev->lock); + ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) @@ -193,6 +197,8 @@ static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int if (!info[i]) return 0; + guard(mutex)(&hwdev->lock); + if (info[i]->config[tdata->index] & HWMON_T_MIN) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_min, tdata->index, low); @@ -330,8 +336,6 @@ static int hwmon_attr_base(enum hwmon_sensor_types type) * attached to an i2c client device. */ -static DEFINE_MUTEX(hwmon_pec_mutex); - static int hwmon_match_device(struct device *dev, const void *data) { return dev->class == &hwmon_class; @@ -362,17 +366,16 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, if (!hdev) return -ENODEV; - mutex_lock(&hwmon_pec_mutex); - /* * If there is no write function, we assume that chip specific * handling is not required. 
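The hwmon core hunks around this point replace the global PEC mutex with a per-hwmon-device lock taken via guard(mutex), so the unlock is tied to scope exit. A plain-C approximation of that pattern using the GCC/Clang cleanup attribute; the kernel's real implementation lives in linux/cleanup.h, and SKETCH_GUARD and the pthread types are stand-ins for this sketch only:

#include <pthread.h>

static void sketch_unlock(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

#define SKETCH_GUARD(m) \
	pthread_mutex_t *__guard __attribute__((cleanup(sketch_unlock))) = (m); \
	pthread_mutex_lock(__guard)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long shared_value;

static long read_value(void)
{
	SKETCH_GUARD(&lock);
	/* Every return path from here on unlocks automatically. */
	return shared_value;
}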
*/ hwdev = to_hwmon_device(hdev); + guard(mutex)(&hwdev->lock); if (hwdev->chip->ops->write) { err = hwdev->chip->ops->write(hdev, hwmon_chip, hwmon_chip_pec, 0, val); if (err && err != -EOPNOTSUPP) - goto unlock; + goto put; } if (!val) @@ -381,8 +384,7 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, client->flags |= I2C_CLIENT_PEC; err = count; -unlock: - mutex_unlock(&hwmon_pec_mutex); +put: put_device(hdev); return err; @@ -426,18 +428,25 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); + s64 val64; long val; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, - &val); + (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) return ret; + if (hattr->type != hwmon_energy64) + val64 = val; + trace_hwmon_attr_show(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, val64); - return sprintf(buf, "%ld\n", val); + return sprintf(buf, "%lld\n", val64); } static ssize_t hwmon_attr_show_string(struct device *dev, @@ -445,10 +454,13 @@ static ssize_t hwmon_attr_show_string(struct device *dev, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); enum hwmon_sensor_types type = hattr->type; const char *s; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, hattr->index, &s); if (ret < 0) @@ -465,6 +477,7 @@ static ssize_t hwmon_attr_store(struct device *dev, const char *buf, size_t count) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); long val; int ret; @@ -472,13 +485,15 @@ static ssize_t hwmon_attr_store(struct device *dev, if (ret < 0) return ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->write(dev, hattr->type, hattr->attr, hattr->index, val); if (ret < 0) return ret; trace_hwmon_attr_store(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, (s64)val); return count; } @@ -734,6 +749,7 @@ static const char * const *__templates[] = { [hwmon_curr] = hwmon_curr_attr_templates, [hwmon_power] = hwmon_power_attr_templates, [hwmon_energy] = hwmon_energy_attr_templates, + [hwmon_energy64] = hwmon_energy_attr_templates, [hwmon_humidity] = hwmon_humidity_attr_templates, [hwmon_fan] = hwmon_fan_attr_templates, [hwmon_pwm] = hwmon_pwm_attr_templates, @@ -747,6 +763,7 @@ static const int __templates_size[] = { [hwmon_curr] = ARRAY_SIZE(hwmon_curr_attr_templates), [hwmon_power] = ARRAY_SIZE(hwmon_power_attr_templates), [hwmon_energy] = ARRAY_SIZE(hwmon_energy_attr_templates), + [hwmon_energy64] = ARRAY_SIZE(hwmon_energy_attr_templates), [hwmon_humidity] = ARRAY_SIZE(hwmon_humidity_attr_templates), [hwmon_fan] = ARRAY_SIZE(hwmon_fan_attr_templates), [hwmon_pwm] = ARRAY_SIZE(hwmon_pwm_attr_templates), @@ -785,6 +802,22 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, } EXPORT_SYMBOL_GPL(hwmon_notify_event); +void hwmon_lock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_lock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_lock); + +void hwmon_unlock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_unlock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_unlock); + static int 
hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; @@ -945,6 +978,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, tdev = tdev->parent; hdev->of_node = tdev ? tdev->of_node : NULL; hwdev->chip = chip; + mutex_init(&hwdev->lock); dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c index 59a2c8889fa2b1..356d19b7675cac 100644 --- a/drivers/hwmon/ina238.c +++ b/drivers/hwmon/ina238.c @@ -16,8 +16,6 @@ #include #include -#include - /* INA238 register definitions */ #define INA238_CONFIG 0x0 #define INA238_ADC_CONFIG 0x1 @@ -53,7 +51,7 @@ #define INA238_REGISTERS 0x20 -#define INA238_RSHUNT_DEFAULT 10000 /* uOhm */ +#define INA238_RSHUNT_DEFAULT 2500 /* uOhm */ /* Default configuration of device on reset. */ #define INA238_CONFIG_DEFAULT 0 @@ -62,6 +60,7 @@ #define INA238_ADC_CONFIG_DEFAULT 0xfb6a /* Configure alerts to be based on averaged value (SLOWALERT) */ #define INA238_DIAG_ALERT_DEFAULT 0x2000 +#define INA238_DIAG_ALERT_APOL BIT(12) /* * This driver uses a fixed calibration value in order to scale current/power * based on a fixed shunt resistor value. This allows for conversion within the @@ -69,46 +68,32 @@ * relative to the shunt resistor value within the driver. This is similar to * how the ina2xx driver handles current/power scaling. * - * The end result of this is that increasing shunt values (from a fixed 20 mOhm - * shunt) increase the effective current/power accuracy whilst limiting the - * range and decreasing shunt values decrease the effective accuracy but - * increase the range. - * - * The value of the Current register is calculated given the following: - * Current (A) = (shunt voltage register * 5) * calibration / 81920 - * - * The maximum shunt voltage is 163.835 mV (0x7fff, ADC_RANGE = 0, gain = 4). - * With the maximum current value of 0x7fff and a fixed shunt value results in - * a calibration value of 16384 (0x4000). - * - * 0x7fff = (0x7fff * 5) * calibration / 81920 - * calibration = 0x4000 - * - * Equivalent calibration is applied for the Power register (maximum value for - * bus voltage is 102396.875 mV, 0x7fff), where the maximum power that can - * occur is ~16776192 uW (register value 0x147a8): - * - * This scaling means the resulting values for Current and Power registers need - * to be scaled by the difference between the fixed shunt resistor and the - * actual shunt resistor: + * To achieve the best possible dynamic range, the value of the shunt voltage + * register should match the value of the current register. With that, the + * full-scale shunt voltage register value of 0x7fff = 32,767 (163,835 uV at + * 5 uV/LSB) matches the maximum current, and no accuracy is lost. Experiments + * with a real chip show that this is achieved by setting the SHUNT_CAL + * register to a value of 0x1000 = 4,096. + * Per datasheet, + * SHUNT_CAL = 819.2 x 10^6 x CURRENT_LSB x Rshunt + * = 819,200,000 x CURRENT_LSB x Rshunt + * With SHUNT_CAL set to 4,096, we get + * CURRENT_LSB = 4,096 / (819,200,000 x Rshunt) + * Assuming an Rshunt value of 5 mOhm, we get + * CURRENT_LSB = 4,096 / (819,200,000 x 0.005) = 1mA + * and thus a dynamic range of 1mA ... 32,767mA, which is sufficient for most + * applications. The actual dynamic range is of course determined by the actual + * shunt resistor value. 
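A numeric check of the SHUNT_CAL derivation in the comment above: CURRENT_LSB = SHUNT_CAL / (819,200,000 x Rshunt). With SHUNT_CAL fixed at 4096 and Rshunt expressed in microohms, this reduces to the microamp-per-LSB value the driver scales with (current_lsb_uA is a name invented for this sketch):

#include <stdio.h>

static double current_lsb_uA(unsigned int shunt_cal, unsigned int rshunt_uohm)
{
	/* 819.2e6 * R[ohm] = 819.2 * R[uohm]; result converted to uA */
	return (double)shunt_cal * 1e6 / (819.2 * rshunt_uohm);
}

int main(void)
{
	/*
	 * 5 mohm gives 1000 uA = 1 mA per LSB, as the comment states; the
	 * new 2500 uohm default gives 2 mA per LSB and a larger range.
	 */
	printf("%.0f uA/LSB  %.0f uA/LSB\n",
	       current_lsb_uA(4096, 5000), current_lsb_uA(4096, 2500));
	return 0;
}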
* - * shunt = 0x4000 / (819.2 * 10^6) / 0.001 = 20000 uOhms (with 1mA/lsb) - * - * Current (mA) = register value * 20000 / rshunt / 4 * gain - * Power (mW) = 0.2 * register value * 20000 / rshunt / 4 * gain - * (Specific for SQ52206) - * Power (mW) = 0.24 * register value * 20000 / rshunt / 4 * gain - * Energy (uJ) = 16 * 0.24 * register value * 20000 / rshunt / 4 * gain * 1000 + * Power and energy values are scaled accordingly. */ -#define INA238_CALIBRATION_VALUE 16384 -#define INA238_FIXED_SHUNT 20000 +#define INA238_CALIBRATION_VALUE 4096 +#define INA238_FIXED_SHUNT 5000 + +#define INA238_SHUNT_VOLTAGE_LSB 5000 /* 5 uV/lsb, in nV */ +#define INA238_BUS_VOLTAGE_LSB 3125000 /* 3.125 mV/lsb, in nV */ +#define SQ52206_BUS_VOLTAGE_LSB 3750000 /* 3.75 mV/lsb, in nV */ -#define INA238_SHUNT_VOLTAGE_LSB 5 /* 5 uV/lsb */ -#define INA238_BUS_VOLTAGE_LSB 3125 /* 3.125 mV/lsb */ -#define INA238_DIE_TEMP_LSB 1250000 /* 125.0000 mC/lsb */ -#define SQ52206_BUS_VOLTAGE_LSB 3750 /* 3.75 mV/lsb */ -#define SQ52206_DIE_TEMP_LSB 78125 /* 7.8125 mC/lsb */ -#define INA228_DIE_TEMP_LSB 78125 /* 7.8125 mC/lsb */ +#define NUNIT_PER_MUNIT 1000000 /* n[AV] -> m[AV] */ static const struct regmap_config ina238_regmap_config = { .max_register = INA238_REGISTERS, @@ -116,17 +101,17 @@ static const struct regmap_config ina238_regmap_config = { .val_bits = 16, }; -enum ina238_ids { ina238, ina237, sq52206, ina228 }; +enum ina238_ids { ina228, ina237, ina238, ina700, ina780, sq52206 }; struct ina238_config { bool has_20bit_voltage_current; /* vshunt, vbus and current are 20-bit fields */ bool has_power_highest; /* chip detection power peak */ bool has_energy; /* chip detection energy */ - u8 temp_shift; /* fixed parameters for temp calculate */ - u32 power_calculate_factor; /* fixed parameters for power calculate */ + u8 temp_resolution; /* temperature register resolution in bit */ u16 config_default; /* Power-on default state */ - int bus_voltage_lsb; /* use for temperature calculate, uV/lsb */ - int temp_lsb; /* use for temperature calculate */ + u32 power_calculate_factor; /* fixed parameter for power calculation, from datasheet */ + u32 bus_voltage_lsb; /* bus voltage LSB, in nV */ + int current_lsb; /* current LSB, in uA */ }; struct ina238_data { @@ -136,48 +121,68 @@ struct ina238_data { struct regmap *regmap; u32 rshunt; int gain; + u32 voltage_lsb[2]; /* shunt, bus voltage LSB, in nV */ + int current_lsb; /* current LSB, in uA */ + int power_lsb; /* power LSB, in uW */ + int energy_lsb; /* energy LSB, in uJ */ }; static const struct ina238_config ina238_config[] = { - [ina238] = { + [ina228] = { + .has_20bit_voltage_current = true, + .has_energy = true, + .has_power_highest = false, + .power_calculate_factor = 20, + .config_default = INA238_CONFIG_DEFAULT, + .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, + .temp_resolution = 16, + }, + [ina237] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, - .temp_shift = 4, .power_calculate_factor = 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA238_DIE_TEMP_LSB, + .temp_resolution = 12, }, - [ina237] = { + [ina238] = { .has_20bit_voltage_current = false, .has_energy = false, .has_power_highest = false, - .temp_shift = 4, .power_calculate_factor = 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA238_DIE_TEMP_LSB, + .temp_resolution = 12, }, - [sq52206] = { + [ina700] = { .has_20bit_voltage_current = false, .has_energy = true, 
- .has_power_highest = true, - .temp_shift = 0, - .power_calculate_factor = 24, - .config_default = SQ52206_CONFIG_DEFAULT, - .bus_voltage_lsb = SQ52206_BUS_VOLTAGE_LSB, - .temp_lsb = SQ52206_DIE_TEMP_LSB, + .has_power_highest = false, + .power_calculate_factor = 20, + .config_default = INA238_CONFIG_DEFAULT, + .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, + .temp_resolution = 12, + .current_lsb = 480, }, - [ina228] = { - .has_20bit_voltage_current = true, + [ina780] = { + .has_20bit_voltage_current = false, .has_energy = true, .has_power_highest = false, - .temp_shift = 0, .power_calculate_factor = 20, .config_default = INA238_CONFIG_DEFAULT, .bus_voltage_lsb = INA238_BUS_VOLTAGE_LSB, - .temp_lsb = INA228_DIE_TEMP_LSB, + .temp_resolution = 12, + .current_lsb = 2400, + }, + [sq52206] = { + .has_20bit_voltage_current = false, + .has_energy = true, + .has_power_highest = true, + .power_calculate_factor = 24, + .config_default = SQ52206_CONFIG_DEFAULT, + .bus_voltage_lsb = SQ52206_BUS_VOLTAGE_LSB, + .temp_resolution = 16, }, }; @@ -232,45 +237,28 @@ static int ina238_read_field_s20(const struct i2c_client *client, u8 reg, s32 *v return 0; } -static int ina228_read_shunt_voltage(struct device *dev, u32 attr, int channel, - long *val) +static int ina228_read_voltage(struct ina238_data *data, int channel, long *val) { - struct ina238_data *data = dev_get_drvdata(dev); - int regval; - int err; + int reg = channel ? INA238_BUS_VOLTAGE : INA238_CURRENT; + u32 lsb = data->voltage_lsb[channel]; + u32 factor = NUNIT_PER_MUNIT; + int err, regval; - err = ina238_read_field_s20(data->client, INA238_SHUNT_VOLTAGE, ®val); - if (err) - return err; - - /* - * gain of 1 -> LSB / 4 - * This field has 16 bit on ina238. ina228 adds another 4 bits of - * precision. ina238 conversion factors can still be applied when - * dividing by 16. - */ - *val = (regval * INA238_SHUNT_VOLTAGE_LSB) * data->gain / (1000 * 4) / 16; - return 0; -} - -static int ina228_read_bus_voltage(struct device *dev, u32 attr, int channel, - long *val) -{ - struct ina238_data *data = dev_get_drvdata(dev); - int regval; - int err; - - err = ina238_read_field_s20(data->client, INA238_BUS_VOLTAGE, ®val); - if (err) - return err; + if (data->config->has_20bit_voltage_current) { + err = ina238_read_field_s20(data->client, reg, ®val); + if (err) + return err; + /* Adjust accuracy: LSB in units of 500 pV */ + lsb /= 8; + factor *= 2; + } else { + err = regmap_read(data->regmap, reg, ®val); + if (err) + return err; + regval = (s16)regval; + } - /* - * gain of 1 -> LSB / 4 - * This field has 16 bit on ina238. ina228 adds another 4 bits of - * precision. ina238 conversion factors can still be applied when - * dividing by 16. 
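Why the 20-bit path in ina228_read_voltage() above does "lsb /= 8; factor *= 2" rather than a plain divide-by-16: the 16-bit shunt LSB of 5000 nV divided by 16 would be 312.5 nV, which is not an integer, but 5000 / 8 = 625 stays exact (now in 500 pV units) and doubling the divisor supplies the remaining factor of two. A quick check of the arithmetic (values as in the driver; the truncating division here stands in for the driver's rounding division):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t regval = 0x7ffff;	/* full-scale 20-bit sample */
	uint32_t lsb = 5000;		/* shunt LSB in nV, 16-bit scale */
	uint32_t factor = 1000000;	/* nV -> mV */

	lsb /= 8;			/* 625, i.e. units of 500 pV */
	factor *= 2;

	/* Same scaling as regval * 5000 / 16 / 1000000, with no fractional LSB. */
	printf("%lld mV\n", (long long)regval * lsb / factor);
	return 0;
}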
- */ - *val = (regval * data->config->bus_voltage_lsb) / 1000 / 16; + *val = DIV_S64_ROUND_CLOSEST((s64)regval * lsb, factor); return 0; } @@ -278,18 +266,16 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, long *val) { struct ina238_data *data = dev_get_drvdata(dev); - int reg, mask; + int reg, mask = 0; int regval; int err; + if (attr == hwmon_in_input) + return ina228_read_voltage(data, channel, val); + switch (channel) { case 0: switch (attr) { - case hwmon_in_input: - if (data->config->has_20bit_voltage_current) - return ina228_read_shunt_voltage(dev, attr, channel, val); - reg = INA238_SHUNT_VOLTAGE; - break; case hwmon_in_max: reg = INA238_SHUNT_OVER_VOLTAGE; break; @@ -310,11 +296,6 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, break; case 1: switch (attr) { - case hwmon_in_input: - if (data->config->has_20bit_voltage_current) - return ina228_read_bus_voltage(dev, attr, channel, val); - reg = INA238_BUS_VOLTAGE; - break; case hwmon_in_max: reg = INA238_BUS_OVER_VOLTAGE; break; @@ -341,112 +322,126 @@ static int ina238_read_in(struct device *dev, u32 attr, int channel, if (err < 0) return err; - switch (attr) { - case hwmon_in_input: - case hwmon_in_max: - case hwmon_in_min: - /* signed register, value in mV */ - regval = (s16)regval; - if (channel == 0) - /* gain of 1 -> LSB / 4 */ - *val = (regval * INA238_SHUNT_VOLTAGE_LSB) * - data->gain / (1000 * 4); - else - *val = (regval * data->config->bus_voltage_lsb) / 1000; - break; - case hwmon_in_max_alarm: - case hwmon_in_min_alarm: + if (mask) *val = !!(regval & mask); - break; - } + else + *val = DIV_S64_ROUND_CLOSEST((s64)(s16)regval * data->voltage_lsb[channel], + NUNIT_PER_MUNIT); return 0; } -static int ina238_write_in(struct device *dev, u32 attr, int channel, - long val) +static int ina238_write_in(struct device *dev, u32 attr, int channel, long val) { struct ina238_data *data = dev_get_drvdata(dev); + static const int low_limits[2] = {-164, 0}; + static const int high_limits[2] = {164, 150000}; + static const u8 low_regs[2] = {INA238_SHUNT_UNDER_VOLTAGE, INA238_BUS_UNDER_VOLTAGE}; + static const u8 high_regs[2] = {INA238_SHUNT_OVER_VOLTAGE, INA238_BUS_OVER_VOLTAGE}; int regval; - if (attr != hwmon_in_max && attr != hwmon_in_min) - return -EOPNOTSUPP; - - /* convert decimal to register value */ - switch (channel) { - case 0: - /* signed value, clamp to max range +/-163 mV */ - regval = clamp_val(val, -163, 163); - regval = (regval * 1000 * 4) / - (INA238_SHUNT_VOLTAGE_LSB * data->gain); - regval = clamp_val(regval, S16_MIN, S16_MAX) & 0xffff; + /* Initial clamp to avoid overflows */ + val = clamp_val(val, low_limits[channel], high_limits[channel]); + val = DIV_S64_ROUND_CLOSEST((s64)val * NUNIT_PER_MUNIT, data->voltage_lsb[channel]); + /* Final clamp to register limits */ + regval = clamp_val(val, S16_MIN, S16_MAX) & 0xffff; - switch (attr) { - case hwmon_in_max: - return regmap_write(data->regmap, - INA238_SHUNT_OVER_VOLTAGE, regval); - case hwmon_in_min: - return regmap_write(data->regmap, - INA238_SHUNT_UNDER_VOLTAGE, regval); - default: - return -EOPNOTSUPP; - } - case 1: - /* signed value, positive values only. 
Clamp to max 102.396 V */
-		regval = clamp_val(val, 0, 102396);
-		regval = (regval * 1000) / data->config->bus_voltage_lsb;
-		regval = clamp_val(regval, 0, S16_MAX);
-
-		switch (attr) {
-		case hwmon_in_max:
-			return regmap_write(data->regmap,
-					    INA238_BUS_OVER_VOLTAGE, regval);
-		case hwmon_in_min:
-			return regmap_write(data->regmap,
-					    INA238_BUS_UNDER_VOLTAGE, regval);
-		default:
-			return -EOPNOTSUPP;
-		}
+	switch (attr) {
+	case hwmon_in_min:
+		return regmap_write(data->regmap, low_regs[channel], regval);
+	case hwmon_in_max:
+		return regmap_write(data->regmap, high_regs[channel], regval);
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
-static int ina238_read_current(struct device *dev, u32 attr, long *val)
+static int __ina238_read_curr(struct ina238_data *data, long *val)
+{
+	u32 lsb = data->current_lsb;
+	int err, regval;
+
+	if (data->config->has_20bit_voltage_current) {
+		err = ina238_read_field_s20(data->client, INA238_CURRENT, &regval);
+		if (err)
+			return err;
+		lsb /= 16;	/* Adjust accuracy */
+	} else {
+		err = regmap_read(data->regmap, INA238_CURRENT, &regval);
+		if (err)
+			return err;
+		regval = (s16)regval;
+	}
+
+	*val = DIV_S64_ROUND_CLOSEST((s64)regval * lsb, 1000);
+	return 0;
+}
+
+static int ina238_read_curr(struct device *dev, u32 attr, long *val)
 {
 	struct ina238_data *data = dev_get_drvdata(dev);
+	int reg, mask = 0;
 	int regval;
 	int err;
 
-	switch (attr) {
-	case hwmon_curr_input:
-		if (data->config->has_20bit_voltage_current) {
-			err = ina238_read_field_s20(data->client, INA238_CURRENT, &regval);
-			if (err)
-				return err;
-		} else {
-			err = regmap_read(data->regmap, INA238_CURRENT, &regval);
-			if (err < 0)
-				return err;
-			/* sign-extend */
-			regval = (s16)regval;
-		}
-
-		/* Signed register, fixed 1mA current lsb. result in mA */
-		*val = div_s64((s64)regval * INA238_FIXED_SHUNT * data->gain,
-			       data->rshunt * 4);
+	if (attr == hwmon_curr_input)
+		return __ina238_read_curr(data, val);
 
-		/* Account for 4 bit offset */
-		if (data->config->has_20bit_voltage_current)
-			*val /= 16;
+	switch (attr) {
+	case hwmon_curr_min:
+		reg = INA238_SHUNT_UNDER_VOLTAGE;
+		break;
+	case hwmon_curr_min_alarm:
+		reg = INA238_DIAG_ALERT;
+		mask = INA238_DIAG_ALERT_SHNTUL;
+		break;
+	case hwmon_curr_max:
+		reg = INA238_SHUNT_OVER_VOLTAGE;
+		break;
+	case hwmon_curr_max_alarm:
+		reg = INA238_DIAG_ALERT;
+		mask = INA238_DIAG_ALERT_SHNTOL;
 		break;
 	default:
 		return -EOPNOTSUPP;
 	}
 
+	err = regmap_read(data->regmap, reg, &regval);
+	if (err < 0)
+		return err;
+
+	if (mask)
+		*val = !!(regval & mask);
+	else
+		*val = DIV_S64_ROUND_CLOSEST((s64)(s16)regval * data->current_lsb, 1000);
+
 	return 0;
 }
 
+static int ina238_write_curr(struct device *dev, u32 attr, long val)
+{
+	struct ina238_data *data = dev_get_drvdata(dev);
+	int regval;
+
+	/* Set baseline range to avoid over/underflows */
+	val = clamp_val(val, -1000000, 1000000);
+	/* Scale */
+	val = DIV_ROUND_CLOSEST(val * 1000, data->current_lsb);
+	/* Clamp to register size */
+	regval = clamp_val(val, S16_MIN, S16_MAX) & 0xffff;
+
+	switch (attr) {
+	case hwmon_curr_min:
+		return regmap_write(data->regmap, INA238_SHUNT_UNDER_VOLTAGE,
+				    regval);
+	case hwmon_curr_max:
+		return regmap_write(data->regmap, INA238_SHUNT_OVER_VOLTAGE,
+				    regval);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int ina238_read_power(struct device *dev, u32 attr, long *val)
 {
 	struct ina238_data *data = dev_get_drvdata(dev);
@@ -460,9 +455,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val)
 		if (err)
 			return err;
 
-		/* Fixed 1mA lsb, scaled by 1000000 to have result in uW */
-		power
= div_u64(regval * 1000ULL * INA238_FIXED_SHUNT * data->gain *
-				data->config->power_calculate_factor, 4 * 100 * data->rshunt);
+		power = (long long)regval * data->power_lsb;
 		/* Clamp value to maximum value of long */
 		*val = clamp_val(power, 0, LONG_MAX);
 		break;
@@ -471,9 +464,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val)
 		if (err)
 			return err;
 
-		/* Fixed 1mA lsb, scaled by 1000000 to have result in uW */
-		power = div_u64(regval * 1000ULL * INA238_FIXED_SHUNT * data->gain *
-				data->config->power_calculate_factor, 4 * 100 * data->rshunt);
+		power = (long long)regval * data->power_lsb;
 		/* Clamp value to maximum value of long */
 		*val = clamp_val(power, 0, LONG_MAX);
 		break;
@@ -486,8 +477,7 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val)
 		 * Truncated 24-bit compare register, lower 8 bits are
 		 * truncated. Same conversion to/from uW as POWER register.
 		 */
-		power = div_u64((regval << 8) * 1000ULL * INA238_FIXED_SHUNT * data->gain *
-				data->config->power_calculate_factor, 4 * 100 * data->rshunt);
+		power = ((long long)regval << 8) * data->power_lsb;
 		/* Clamp value to maximum value of long */
 		*val = clamp_val(power, 0, LONG_MAX);
 		break;
@@ -505,13 +495,9 @@ static int ina238_read_power(struct device *dev, u32 attr, long *val)
 	return 0;
 }
 
-static int ina238_write_power(struct device *dev, u32 attr, long val)
+static int ina238_write_power_max(struct device *dev, long val)
 {
 	struct ina238_data *data = dev_get_drvdata(dev);
-	long regval;
-
-	if (attr != hwmon_power_max)
-		return -EOPNOTSUPP;
 
 	/*
	 * Unsigned positive values. Compared against the 24-bit power register,
@@ -519,12 +505,16 @@
 	 * register.
 	 * The first clamp_val() is to establish a baseline to avoid overflows.
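+	 * Example (hypothetical 2 mOhm shunt at gain 4): current_lsb works
+	 * out to 2500 uA and power_lsb to 500 uW, so a 50 W limit is
+	 * 50000000 / 500 = 100000 counts, stored as 100000 >> 8 = 390.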
 */
-	regval = clamp_val(val, 0, LONG_MAX / 2);
-	regval = div_u64(regval * 4 * 100 * data->rshunt, data->config->power_calculate_factor *
-			 1000ULL * INA238_FIXED_SHUNT * data->gain);
-	regval = clamp_val(regval >> 8, 0, U16_MAX);
+	val = clamp_val(val, 0, LONG_MAX / 2);
+	val = DIV_ROUND_CLOSEST(val, data->power_lsb);
+	val = clamp_val(val >> 8, 0, U16_MAX);
+
+	return regmap_write(data->regmap, INA238_POWER_LIMIT, val);
+}
 
-	return regmap_write(data->regmap, INA238_POWER_LIMIT, regval);
+static int ina238_temp_from_reg(s16 regval, u8 resolution)
+{
+	return ((regval >> (16 - resolution)) * 1000) >> (resolution - 9);
 }
 
 static int ina238_read_temp(struct device *dev, u32 attr, long *val)
@@ -538,17 +528,14 @@ static int ina238_read_temp(struct device *dev, u32 attr, long *val)
 		err = regmap_read(data->regmap, INA238_DIE_TEMP, &regval);
 		if (err)
 			return err;
-		/* Signed, result in mC */
-		*val = div_s64(((s64)((s16)regval) >> data->config->temp_shift) *
-			       (s64)data->config->temp_lsb, 10000);
+		*val = ina238_temp_from_reg(regval, data->config->temp_resolution);
 		break;
 	case hwmon_temp_max:
 		err = regmap_read(data->regmap, INA238_TEMP_LIMIT, &regval);
 		if (err)
 			return err;
 		/* Signed, result in mC */
-		*val = div_s64(((s64)((s16)regval) >> data->config->temp_shift) *
-			       (s64)data->config->temp_lsb, 10000);
+		*val = ina238_temp_from_reg(regval, data->config->temp_resolution);
 		break;
 	case hwmon_temp_max_alarm:
 		err = regmap_read(data->regmap, INA238_DIAG_ALERT, &regval);
@@ -564,39 +551,37 @@ static int ina238_read_temp(struct device *dev, u32 attr, long *val)
 	return 0;
 }
 
-static int ina238_write_temp(struct device *dev, u32 attr, long val)
+static u16 ina238_temp_to_reg(long val, u8 resolution)
 {
-	struct ina238_data *data = dev_get_drvdata(dev);
-	int regval;
+	int fraction = 1000 - DIV_ROUND_CLOSEST(1000, BIT(resolution - 9));
 
-	if (attr != hwmon_temp_max)
-		return -EOPNOTSUPP;
+	val = clamp_val(val, -255000 - fraction, 255000 + fraction);
+
+	return (DIV_ROUND_CLOSEST(val << (resolution - 9), 1000) << (16 - resolution)) & 0xffff;
+}
 
-	/* Signed */
-	val = clamp_val(val, -40000, 125000);
-	regval = div_s64(val * 10000, data->config->temp_lsb) << data->config->temp_shift;
-	regval = clamp_val(regval, S16_MIN, S16_MAX) & (0xffff << data->config->temp_shift);
+static int ina238_write_temp_max(struct device *dev, long val)
+{
+	struct ina238_data *data = dev_get_drvdata(dev);
+	int regval;
 
+	regval = ina238_temp_to_reg(val, data->config->temp_resolution);
 	return regmap_write(data->regmap, INA238_TEMP_LIMIT, regval);
 }
 
-static ssize_t energy1_input_show(struct device *dev,
-				  struct device_attribute *da, char *buf)
+static int ina238_read_energy(struct device *dev, s64 *energy)
 {
 	struct ina238_data *data = dev_get_drvdata(dev);
-	int ret;
 	u64 regval;
-	u64 energy;
+	int ret;
 
 	ret = ina238_read_reg40(data->client, SQ52206_ENERGY, &regval);
 	if (ret)
 		return ret;
 
 	/* result in uJ */
-	energy = div_u64(regval * INA238_FIXED_SHUNT * data->gain * 16 * 10 *
-			 data->config->power_calculate_factor, 4 * data->rshunt);
-
-	return sysfs_emit(buf, "%llu\n", energy);
+	*energy = regval * data->energy_lsb;
+	return 0;
 }
 
 static int ina238_read(struct device *dev, enum hwmon_sensor_types type,
@@ -606,9 +591,11 @@ static int ina238_read(struct device *dev, enum hwmon_sensor_types type,
 	case hwmon_in:
 		return ina238_read_in(dev, attr, channel, val);
 	case hwmon_curr:
-		return ina238_read_current(dev, attr, val);
+		return ina238_read_curr(dev, attr, val);
 	case hwmon_power:
 		return ina238_read_power(dev, attr, val);
+	case hwmon_energy64:
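+		/* 40-bit energy accumulator; result reported in uJ via energy_lsb */
+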
return ina238_read_energy(dev, (s64 *)val); case hwmon_temp: return ina238_read_temp(dev, attr, val); default: @@ -629,11 +616,14 @@ static int ina238_write(struct device *dev, enum hwmon_sensor_types type, case hwmon_in: err = ina238_write_in(dev, attr, channel, val); break; + case hwmon_curr: + err = ina238_write_curr(dev, attr, val); + break; case hwmon_power: - err = ina238_write_power(dev, attr, val); + err = ina238_write_power_max(dev, val); break; case hwmon_temp: - err = ina238_write_temp(dev, attr, val); + err = ina238_write_temp_max(dev, val); break; default: err = -EOPNOTSUPP; @@ -650,6 +640,7 @@ static umode_t ina238_is_visible(const void *drvdata, { const struct ina238_data *data = drvdata; bool has_power_highest = data->config->has_power_highest; + bool has_energy = data->config->has_energy; switch (type) { case hwmon_in: @@ -667,7 +658,12 @@ static umode_t ina238_is_visible(const void *drvdata, case hwmon_curr: switch (attr) { case hwmon_curr_input: + case hwmon_curr_max_alarm: + case hwmon_curr_min_alarm: return 0444; + case hwmon_curr_max: + case hwmon_curr_min: + return 0644; default: return 0; } @@ -685,6 +681,11 @@ static umode_t ina238_is_visible(const void *drvdata, default: return 0; } + case hwmon_energy64: + /* hwmon_energy_input */ + if (has_energy) + return 0444; + return 0; case hwmon_temp: switch (attr) { case hwmon_temp_input: @@ -712,11 +713,14 @@ static const struct hwmon_channel_info * const ina238_info[] = { INA238_HWMON_IN_CONFIG), HWMON_CHANNEL_INFO(curr, /* 0: current through shunt */ - HWMON_C_INPUT), + HWMON_C_INPUT | HWMON_C_MIN | HWMON_C_MIN_ALARM | + HWMON_C_MAX | HWMON_C_MAX_ALARM), HWMON_CHANNEL_INFO(power, /* 0: power */ HWMON_P_INPUT | HWMON_P_MAX | HWMON_P_MAX_ALARM | HWMON_P_INPUT_HIGHEST), + HWMON_CHANNEL_INFO(energy64, + HWMON_E_INPUT), HWMON_CHANNEL_INFO(temp, /* 0: die temperature */ HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MAX_ALARM), @@ -734,18 +738,8 @@ static const struct hwmon_chip_info ina238_chip_info = { .info = ina238_info, }; -/* energy attributes are 5 bytes wide so we need u64 */ -static DEVICE_ATTR_RO(energy1_input); - -static struct attribute *ina238_attrs[] = { - &dev_attr_energy1_input.attr, - NULL, -}; -ATTRIBUTE_GROUPS(ina238); - static int ina238_probe(struct i2c_client *client) { - struct ina2xx_platform_data *pdata = dev_get_platdata(&client->dev); struct device *dev = &client->dev; struct device *hwmon_dev; struct ina238_data *data; @@ -771,33 +765,48 @@ static int ina238_probe(struct i2c_client *client) return PTR_ERR(data->regmap); } - /* load shunt value */ - data->rshunt = INA238_RSHUNT_DEFAULT; - if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0 && pdata) - data->rshunt = pdata->shunt_uohms; - if (data->rshunt == 0) { - dev_err(dev, "invalid shunt resister value %u\n", data->rshunt); - return -EINVAL; - } - - /* load shunt gain value */ - if (device_property_read_u32(dev, "ti,shunt-gain", &data->gain) < 0) - data->gain = 4; /* Default of ADCRANGE = 0 */ - if (data->gain != 1 && data->gain != 2 && data->gain != 4) { - dev_err(dev, "invalid shunt gain value %u\n", data->gain); - return -EINVAL; - } - /* Setup CONFIG register */ config = data->config->config_default; - if (chip == sq52206) { - if (data->gain == 1) - config |= SQ52206_CONFIG_ADCRANGE_HIGH; /* ADCRANGE = 10/11 is /1 */ - else if (data->gain == 2) - config |= SQ52206_CONFIG_ADCRANGE_LOW; /* ADCRANGE = 01 is /2 */ - } else if (data->gain == 1) { - config |= INA238_CONFIG_ADCRANGE; /* ADCRANGE = 1 is /1 */ + if 
(data->config->current_lsb) {
+		data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB;
+		data->current_lsb = data->config->current_lsb;
+	} else {
+		/* load shunt value */
+		if (device_property_read_u32(dev, "shunt-resistor", &data->rshunt) < 0)
+			data->rshunt = INA238_RSHUNT_DEFAULT;
+		if (data->rshunt == 0) {
+			dev_err(dev, "invalid shunt resistor value %u\n", data->rshunt);
+			return -EINVAL;
+		}
+
+		/* load shunt gain value */
+		if (device_property_read_u32(dev, "ti,shunt-gain", &data->gain) < 0)
+			data->gain = 4;	/* Default of ADCRANGE = 0 */
+		if (data->gain != 1 && data->gain != 2 && data->gain != 4) {
+			dev_err(dev, "invalid shunt gain value %u\n", data->gain);
+			return -EINVAL;
+		}
+
+		/* Setup SHUNT_CALIBRATION register with fixed value */
+		ret = regmap_write(data->regmap, INA238_SHUNT_CALIBRATION,
+				   INA238_CALIBRATION_VALUE);
+		if (ret < 0) {
+			dev_err(dev, "error configuring the device: %d\n", ret);
+			return -ENODEV;
+		}
+		if (chip == sq52206) {
+			if (data->gain == 1)		/* ADCRANGE = 10/11 is /1 */
+				config |= SQ52206_CONFIG_ADCRANGE_HIGH;
+			else if (data->gain == 2)	/* ADCRANGE = 01 is /2 */
+				config |= SQ52206_CONFIG_ADCRANGE_LOW;
+		} else if (data->gain == 1) {		/* ADCRANGE = 1 is /1 */
+			config |= INA238_CONFIG_ADCRANGE;
+		}
+		data->voltage_lsb[0] = INA238_SHUNT_VOLTAGE_LSB * data->gain / 4;
+		data->current_lsb = DIV_U64_ROUND_CLOSEST(250ULL * INA238_FIXED_SHUNT * data->gain,
+							  data->rshunt);
+	}
+
 	ret = regmap_write(data->regmap, INA238_CONFIG, config);
 	if (ret < 0) {
 		dev_err(dev, "error configuring the device: %d\n", ret);
@@ -812,31 +821,33 @@ static int ina238_probe(struct i2c_client *client)
 		return -ENODEV;
 	}
 
-	/* Setup SHUNT_CALIBRATION register with fixed value */
-	ret = regmap_write(data->regmap, INA238_SHUNT_CALIBRATION,
-			   INA238_CALIBRATION_VALUE);
-	if (ret < 0) {
-		dev_err(dev, "error configuring the device: %d\n", ret);
-		return -ENODEV;
-	}
-
 	/* Setup alert/alarm configuration */
-	ret = regmap_write(data->regmap, INA238_DIAG_ALERT,
-			   INA238_DIAG_ALERT_DEFAULT);
+	config = INA238_DIAG_ALERT_DEFAULT;
+	if (device_property_read_bool(dev, "ti,alert-polarity-active-high"))
+		config |= INA238_DIAG_ALERT_APOL;
+
+	ret = regmap_write(data->regmap, INA238_DIAG_ALERT, config);
 	if (ret < 0) {
 		dev_err(dev, "error configuring the device: %d\n", ret);
 		return -ENODEV;
 	}
 
+	data->voltage_lsb[1] = data->config->bus_voltage_lsb;
+
+	data->power_lsb = DIV_ROUND_CLOSEST(data->current_lsb *
+					    data->config->power_calculate_factor,
+					    100);
+
+	data->energy_lsb = data->power_lsb * 16;
+
 	hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data,
-							 &ina238_chip_info,
-							 data->config->has_energy ?
- ina238_groups : NULL); + &ina238_chip_info, NULL); if (IS_ERR(hwmon_dev)) return PTR_ERR(hwmon_dev); - dev_info(dev, "power monitor %s (Rshunt = %u uOhm, gain = %u)\n", - client->name, data->rshunt, data->gain); + if (data->rshunt) + dev_info(dev, "power monitor %s (Rshunt = %u uOhm, gain = %u)\n", + client->name, data->rshunt, data->gain); return 0; } @@ -845,6 +856,8 @@ static const struct i2c_device_id ina238_id[] = { { "ina228", ina228 }, { "ina237", ina237 }, { "ina238", ina238 }, + { "ina700", ina700 }, + { "ina780", ina780 }, { "sq52206", sq52206 }, { } }; @@ -863,6 +876,14 @@ static const struct of_device_id __maybe_unused ina238_of_match[] = { .compatible = "ti,ina238", .data = (void *)ina238 }, + { + .compatible = "ti,ina700", + .data = (void *)ina700 + }, + { + .compatible = "ti,ina780", + .data = (void *)ina780 + }, { .compatible = "silergy,sq52206", .data = (void *)sq52206 diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index babf2413d666f7..b98d5ec72c4ff1 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -84,6 +84,13 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); */ #define AMD_I3255_STR "3255" +/* + * PCI Device IDs for AMD's Family 1Ah-based SOCs. + * Defining locally as IDs are not shared. + */ +#define PCI_DEVICE_ID_AMD_1AH_M50H_DF_F3 0x12cb +#define PCI_DEVICE_ID_AMD_1AH_M90H_DF_F3 0x127b + struct k10temp_data { struct pci_dev *pdev; void (*read_htcreg)(struct pci_dev *pdev, u32 *regval); @@ -556,7 +563,10 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M50H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M90H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --git a/drivers/hwmon/lenovo-ec-sensors.c b/drivers/hwmon/lenovo-ec-sensors.c index 143fb79713f7d3..8681bbf6665b1e 100644 --- a/drivers/hwmon/lenovo-ec-sensors.c +++ b/drivers/hwmon/lenovo-ec-sensors.c @@ -66,7 +66,7 @@ enum systems { LENOVO_P8, }; -static int px_temp_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static int px_temp_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 32}; static const char * const lenovo_px_ec_temp_label[] = { "CPU1", @@ -84,9 +84,29 @@ static const char * const lenovo_px_ec_temp_label[] = { "PCI_Z3", "PCI_Z4", "AMB", + "PSU1", + "PSU2", }; -static int gen_temp_map[] = {0, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static int p8_temp_map[] = {0, 1, 2, 8, 9, 13, 14, 15, 16, 17, 19, 20, 33}; + +static const char * const lenovo_p8_ec_temp_label[] = { + "CPU1", + "CPU_DIMM_BANK1", + "CPU_DIMM_BANK2", + "M2_Z2R", + "M2_Z3R", + "DIMM_RIGHT", + "DIMM_LEFT", + "PCI_Z1", + "PCI_Z2", + "PCI_Z3", + "AMB", + "REAR_VR", + "PSU", +}; + +static int gen_temp_map[] = {0, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31}; static const char * const lenovo_gen_ec_temp_label[] = { "CPU1", @@ -101,6 +121,7 @@ static const char * const lenovo_gen_ec_temp_label[] = { "PCI_Z3", "PCI_Z4", "AMB", + "PSU", }; static int px_fan_map[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; @@ -293,6 +314,8 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_px[] = { HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL, + HWMON_T_INPUT | 
HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL),
 	HWMON_CHANNEL_INFO(fan,
 			   HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX,
@@ -327,6 +350,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p8[] = {
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL),
 	HWMON_CHANNEL_INFO(fan,
 			   HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX,
@@ -359,6 +383,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p7[] = {
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL),
 	HWMON_CHANNEL_INFO(fan,
 			   HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX,
@@ -388,6 +413,7 @@ static const struct hwmon_channel_info *lenovo_ec_hwmon_info_p5[] = {
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
 			   HWMON_T_INPUT | HWMON_T_LABEL),
 	HWMON_CHANNEL_INFO(fan,
 			   HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MAX,
@@ -545,9 +571,9 @@ static int lenovo_ec_probe(struct platform_device *pdev)
 		break;
 	case 3:
 		ec_data->fan_labels = p8_ec_fan_label;
-		ec_data->temp_labels = lenovo_gen_ec_temp_label;
+		ec_data->temp_labels = lenovo_p8_ec_temp_label;
 		ec_data->fan_map = p8_fan_map;
-		ec_data->temp_map = gen_temp_map;
+		ec_data->temp_map = p8_temp_map;
 		lenovo_ec_chip_info.info = lenovo_ec_hwmon_info_p8;
 		break;
 	default:
diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
index 9b4875e2fd8d84..3c23b6e8e1bf5c 100644
--- a/drivers/hwmon/lm75.c
+++ b/drivers/hwmon/lm75.c
@@ -39,6 +39,7 @@ enum lm75_type {	/* keep sorted in alphabetical order */
 	max6626,
 	max31725,
 	mcp980x,
+	p3t1750,
 	p3t1755,
 	pct2075,
 	stds75,
@@ -222,6 +223,13 @@ static const struct lm75_params device_params[] = {
 		.default_resolution = 9,
 		.default_sample_time = MSEC_PER_SEC / 18,
 	},
+	[p3t1750] = {
+		.clr_mask = 1 << 1 | 1 << 7,	/* disable SMBAlert and one-shot */
+		.default_resolution = 12,
+		.default_sample_time = 55,
+		.num_sample_times = 4,
+		.sample_times = (unsigned int []){ 28, 55, 110, 220 },
+	},
 	[p3t1755] = {
 		.clr_mask = 1 << 1 | 1 << 7,	/* disable SMBAlert and one-shot */
 		.default_resolution = 12,
@@ -805,6 +813,7 @@ static const struct i2c_device_id lm75_i2c_ids[] = {
 	{ "max31725", max31725, },
 	{ "max31726", max31725, },
 	{ "mcp980x", mcp980x, },
+	{ "p3t1750", p3t1750, },
 	{ "p3t1755", p3t1755, },
 	{ "pct2075", pct2075, },
 	{ "stds75", stds75, },
@@ -916,6 +925,10 @@ static const struct of_device_id __maybe_unused lm75_of_match[] = {
 		.compatible = "maxim,mcp980x",
 		.data = (void *)mcp980x
 	},
+	{
+		.compatible = "nxp,p3t1750",
+		.data = (void *)p3t1750
+	},
 	{
 		.compatible = "nxp,p3t1755",
 		.data = (void *)p3t1755
diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c
index dbb30abcd343f3..1d664a2d7b3cb9 100644
--- a/drivers/hwmon/ltc4282.c
+++ b/drivers/hwmon/ltc4282.c
@@ -1693,8 +1693,7 @@ static int ltc4282_probe(struct i2c_client *i2c)
 
 	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
 	if (!st)
-		return dev_err_probe(dev, -ENOMEM,
-				     "Failed to allocate memory\n");
+		return -ENOMEM;
 
 	st->map = devm_regmap_init_i2c(i2c, &ltc4282_regmap_config);
 	if (IS_ERR(st->map))
diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c
index c25a54d5b39ad5..137a90dd207523 100644
--- a/drivers/hwmon/mlxreg-fan.c
+++ b/drivers/hwmon/mlxreg-fan.c
@@ -63,12 +63,14 @@ struct mlxreg_fan;
  * @reg: register offset;
  * @mask: fault mask;
  * @prsnt: present register
offset;
+ * @shift: tacho presence bit shift;
  */
 struct mlxreg_fan_tacho {
 	bool connected;
 	u32 reg;
 	u32 mask;
 	u32 prsnt;
+	u32 shift;
 };
 
 /*
@@ -113,8 +115,8 @@ struct mlxreg_fan {
 	int divider;
 };
 
-static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
-				    unsigned long state);
+static int _mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
+				     unsigned long state, bool thermal);
 
 static int
 mlxreg_fan_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
@@ -143,8 +145,10 @@ mlxreg_fan_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
 			/*
 			 * Map channel to presence bit - drawer can be equipped with
 			 * one or few FANs, while presence is indicated per drawer.
+			 * Shift channel value if necessary to align with register value.
 			 */
-			if (BIT(channel / fan->tachos_per_drwr) & regval) {
+			if (BIT(rol32(channel, tacho->shift) / fan->tachos_per_drwr) &
+			    regval) {
 				/* FAN is not connected - return zero for FAN speed. */
 				*val = 0;
 				return 0;
@@ -224,8 +228,9 @@ mlxreg_fan_write(struct device *dev, enum hwmon_sensor_types type, u32 attr,
 			 * last thermal state.
 			 */
 			if (pwm->last_hwmon_state >= pwm->last_thermal_state)
-				return mlxreg_fan_set_cur_state(pwm->cdev,
-								pwm->last_hwmon_state);
+				return _mlxreg_fan_set_cur_state(pwm->cdev,
+								 pwm->last_hwmon_state,
+								 false);
 			return 0;
 		}
 		return regmap_write(fan->regmap, pwm->reg, val);
@@ -357,9 +362,8 @@ static int mlxreg_fan_get_cur_state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 
-static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
-				    unsigned long state)
-
+static int _mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
+				     unsigned long state, bool thermal)
 {
 	struct mlxreg_fan_pwm *pwm = cdev->devdata;
 	struct mlxreg_fan *fan = pwm->fan;
@@ -369,7 +373,8 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
 		return -EINVAL;
 
 	/* Save thermal state. */
-	pwm->last_thermal_state = state;
+	if (thermal)
+		pwm->last_thermal_state = state;
 
 	state = max_t(unsigned long, state, pwm->last_hwmon_state);
 	err = regmap_write(fan->regmap, pwm->reg,
@@ -381,6 +386,13 @@ static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 
+static int mlxreg_fan_set_cur_state(struct thermal_cooling_device *cdev,
+				    unsigned long state)
+{
+	return _mlxreg_fan_set_cur_state(cdev, state, true);
+}
+
 static const struct thermal_cooling_device_ops mlxreg_fan_cooling_ops = {
 	.get_max_state = mlxreg_fan_get_max_state,
 	.get_cur_state = mlxreg_fan_get_cur_state,
@@ -400,7 +412,7 @@ static int mlxreg_fan_connect_verify(struct mlxreg_fan *fan,
 			return err;
 	}
 
-	return !!(regval & data->bit);
+	return data->slot ? (data->slot <= regval ? 1 : 0) : !!(regval & data->bit);
 }
 
 static int mlxreg_pwm_connect_verify(struct mlxreg_fan *fan,
@@ -537,7 +549,15 @@ static int mlxreg_fan_config(struct mlxreg_fan *fan,
 			return err;
 		}
 
-		drwr_avail = hweight32(regval);
+		/*
+		 * The number of drawers can be reported either as a counter
+		 * (newer systems) or as a bitmask of the present drawers
+		 * (older systems). When the data is provided as a counter,
+		 * this is indicated by the platform data 'version' field.
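+		 * For example, four populated drawers read back as regval = 4
+		 * in counter mode, but as a bitmask such as 0x0f
+		 * (hweight32() = 4) on older systems.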
+		 */
+		if (pdata->version)
+			drwr_avail = regval;
+		else
+			drwr_avail = hweight32(regval);
 		if (!tacho_avail || !drwr_avail || tacho_avail < drwr_avail) {
 			dev_err(fan->dev, "Configuration is invalid: drawers num %d tachos num %d\n",
 				drwr_avail, tacho_avail);
diff --git a/drivers/hwmon/nct6694-hwmon.c b/drivers/hwmon/nct6694-hwmon.c
new file mode 100644
index 00000000000000..6dcf22ca5018a5
--- /dev/null
+++ b/drivers/hwmon/nct6694-hwmon.c
@@ -0,0 +1,949 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Nuvoton NCT6694 HWMON driver based on USB interface.
+ *
+ * Copyright (C) 2025 Nuvoton Technology Corp.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mfd/nct6694.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+
+/*
+ * USB command module type for NCT6694 report channel.
+ * This defines the module type used for communication with the NCT6694
+ * report channel over the USB interface.
+ */
+#define NCT6694_RPT_MOD		0xFF
+
+/* Report channel */
+/*
+ * The report channel is used to report the status of the hardware monitor
+ * devices, such as voltage, temperature, fan speed, and PWM.
+ */
+#define NCT6694_VIN_IDX(x)	(0x00 + (x))
+#define NCT6694_TIN_IDX(x)				\
+	({ typeof(x) (_x) = (x);			\
+	   ((_x) < 10) ? (0x10 + ((_x) * 2)) :		\
+			 (0x30 + (((_x) - 10) * 2)); })
+#define NCT6694_FIN_IDX(x)	(0x50 + ((x) * 2))
+#define NCT6694_PWM_IDX(x)	(0x70 + (x))
+#define NCT6694_VIN_STS(x)	(0x68 + (x))
+#define NCT6694_TIN_STS(x)	(0x6A + (x))
+#define NCT6694_FIN_STS(x)	(0x6E + (x))
+
+/*
+ * USB command module type for NCT6694 HWMON controller.
+ * This defines the module type used for communication with the NCT6694
+ * HWMON controller over the USB interface.
+ */
+#define NCT6694_HWMON_MOD	0x00
+
+/* Command 00h - Hardware Monitor Control */
+#define NCT6694_HWMON_CONTROL		0x00
+#define NCT6694_HWMON_CONTROL_SEL	0x00
+
+/* Command 02h - Alarm Control */
+#define NCT6694_HWMON_ALARM		0x02
+#define NCT6694_HWMON_ALARM_SEL		0x00
+
+/*
+ * USB command module type for NCT6694 PWM controller.
+ * This defines the module type used for communication with the NCT6694
+ * PWM controller over the USB interface.
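+ * Manual duty-cycle control is issued through command 01h
+ * (NCT6694_PWM_CONTROL), one byte per channel in mal_val[].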
+ */ +#define NCT6694_PWM_MOD 0x01 + +/* PWM Command - Manual Control */ +#define NCT6694_PWM_CONTROL 0x01 +#define NCT6694_PWM_CONTROL_SEL 0x00 + +#define NCT6694_FREQ_FROM_REG(reg) ((reg) * 25000 / 255) +#define NCT6694_FREQ_TO_REG(val) \ + (DIV_ROUND_CLOSEST(clamp_val((val), 100, 25000) * 255, 25000)) + +#define NCT6694_LSB_REG_MASK GENMASK(7, 5) +#define NCT6694_TIN_HYST_MASK GENMASK(7, 5) + +enum nct6694_hwmon_temp_mode { + NCT6694_HWMON_TWOTIME_IRQ = 0, + NCT6694_HWMON_ONETIME_IRQ, + NCT6694_HWMON_REALTIME_IRQ, + NCT6694_HWMON_COMPARE_IRQ, +}; + +struct __packed nct6694_hwmon_control { + u8 vin_en[2]; + u8 tin_en[2]; + u8 fin_en[2]; + u8 pwm_en[2]; + u8 reserved1[40]; + u8 pwm_freq[10]; + u8 reserved2[6]; +}; + +struct __packed nct6694_hwmon_alarm { + u8 smi_ctrl; + u8 reserved1[15]; + struct { + u8 hl; + u8 ll; + } vin_limit[16]; + struct { + u8 hyst; + s8 hl; + } tin_cfg[32]; + __be16 fin_ll[10]; + u8 reserved2[4]; +}; + +struct __packed nct6694_pwm_control { + u8 mal_en[2]; + u8 mal_val[10]; + u8 reserved[12]; +}; + +union __packed nct6694_hwmon_rpt { + u8 vin; + struct { + u8 msb; + u8 lsb; + } tin; + __be16 fin; + u8 pwm; + u8 status; +}; + +union __packed nct6694_hwmon_msg { + struct nct6694_hwmon_alarm hwmon_alarm; + struct nct6694_pwm_control pwm_ctrl; +}; + +struct nct6694_hwmon_data { + struct nct6694 *nct6694; + struct mutex lock; + struct nct6694_hwmon_control hwmon_en; + union nct6694_hwmon_rpt *rpt; + union nct6694_hwmon_msg *msg; +}; + +static inline long in_from_reg(u8 reg) +{ + return reg * 16; +} + +static inline u8 in_to_reg(long val) +{ + return DIV_ROUND_CLOSEST(val, 16); +} + +static inline long temp_from_reg(s8 reg) +{ + return reg * 1000; +} + +static inline s8 temp_to_reg(long val) +{ + return DIV_ROUND_CLOSEST(val, 1000); +} + +#define NCT6694_HWMON_IN_CONFIG (HWMON_I_INPUT | HWMON_I_ENABLE | \ + HWMON_I_MAX | HWMON_I_MIN | \ + HWMON_I_ALARM) +#define NCT6694_HWMON_TEMP_CONFIG (HWMON_T_INPUT | HWMON_T_ENABLE | \ + HWMON_T_MAX | HWMON_T_MAX_HYST | \ + HWMON_T_MAX_ALARM) +#define NCT6694_HWMON_FAN_CONFIG (HWMON_F_INPUT | HWMON_F_ENABLE | \ + HWMON_F_MIN | HWMON_F_MIN_ALARM) +#define NCT6694_HWMON_PWM_CONFIG (HWMON_PWM_INPUT | HWMON_PWM_ENABLE | \ + HWMON_PWM_FREQ) +static const struct hwmon_channel_info *nct6694_info[] = { + HWMON_CHANNEL_INFO(in, + NCT6694_HWMON_IN_CONFIG, /* VIN0 */ + NCT6694_HWMON_IN_CONFIG, /* VIN1 */ + NCT6694_HWMON_IN_CONFIG, /* VIN2 */ + NCT6694_HWMON_IN_CONFIG, /* VIN3 */ + NCT6694_HWMON_IN_CONFIG, /* VIN5 */ + NCT6694_HWMON_IN_CONFIG, /* VIN6 */ + NCT6694_HWMON_IN_CONFIG, /* VIN7 */ + NCT6694_HWMON_IN_CONFIG, /* VIN14 */ + NCT6694_HWMON_IN_CONFIG, /* VIN15 */ + NCT6694_HWMON_IN_CONFIG, /* VIN16 */ + NCT6694_HWMON_IN_CONFIG, /* VBAT */ + NCT6694_HWMON_IN_CONFIG, /* VSB */ + NCT6694_HWMON_IN_CONFIG, /* AVSB */ + NCT6694_HWMON_IN_CONFIG, /* VCC */ + NCT6694_HWMON_IN_CONFIG, /* VHIF */ + NCT6694_HWMON_IN_CONFIG), /* VTT */ + + HWMON_CHANNEL_INFO(temp, + NCT6694_HWMON_TEMP_CONFIG, /* THR1 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR2 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR14 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR15 */ + NCT6694_HWMON_TEMP_CONFIG, /* THR16 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP0 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP1 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP2 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP3 */ + NCT6694_HWMON_TEMP_CONFIG, /* TDP4 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN0 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN1 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN2 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN3 */ + NCT6694_HWMON_TEMP_CONFIG, /* 
DTIN4 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN5 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN6 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN7 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN8 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN9 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN10 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN11 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN12 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN13 */ + NCT6694_HWMON_TEMP_CONFIG, /* DTIN14 */ + NCT6694_HWMON_TEMP_CONFIG), /* DTIN15 */ + + HWMON_CHANNEL_INFO(fan, + NCT6694_HWMON_FAN_CONFIG, /* FIN0 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN1 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN2 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN3 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN4 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN5 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN6 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN7 */ + NCT6694_HWMON_FAN_CONFIG, /* FIN8 */ + NCT6694_HWMON_FAN_CONFIG), /* FIN9 */ + + HWMON_CHANNEL_INFO(pwm, + NCT6694_HWMON_PWM_CONFIG, /* PWM0 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM1 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM2 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM3 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM4 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM5 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM6 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM7 */ + NCT6694_HWMON_PWM_CONFIG, /* PWM8 */ + NCT6694_HWMON_PWM_CONFIG), /* PWM9 */ + NULL +}; + +static int nct6694_in_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char vin_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_in_enable: + vin_en = data->hwmon_en.vin_en[(channel / 8)]; + *val = !!(vin_en & BIT(channel % 8)); + + return 0; + case hwmon_in_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_VIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->vin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->vin); + if (ret) + return ret; + + *val = in_from_reg(data->rpt->vin); + + return 0; + case hwmon_in_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = in_from_reg(data->msg->hwmon_alarm.vin_limit[channel].hl); + + return 0; + case hwmon_in_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = in_from_reg(data->msg->hwmon_alarm.vin_limit[channel].ll); + + return 0; + case hwmon_in_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_VIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_temp_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char temp_en, temp_hyst; + signed char temp_max; + int ret, temp_raw; + + guard(mutex)(&data->lock); + + 
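+	/*
+	 * TIN readings are 11-bit signed values with 0.125 degC resolution:
+	 * bits 10..3 come from the MSB register and bits 2..0 come from the
+	 * top three bits of the LSB register (NCT6694_LSB_REG_MASK).
+	 */
+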
switch (attr) { + case hwmon_temp_enable: + temp_en = data->hwmon_en.tin_en[channel / 8]; + *val = !!(temp_en & BIT(channel % 8)); + + return 0; + case hwmon_temp_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_TIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->tin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->tin); + if (ret) + return ret; + + temp_raw = data->rpt->tin.msb << 3; + temp_raw |= FIELD_GET(NCT6694_LSB_REG_MASK, data->rpt->tin.lsb); + + /* Real temperature(milli degrees Celsius) = temp_raw * 1000 * 0.125 */ + *val = sign_extend32(temp_raw, 10) * 125; + + return 0; + case hwmon_temp_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = temp_from_reg(data->msg->hwmon_alarm.tin_cfg[channel].hl); + + return 0; + case hwmon_temp_max_hyst: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + temp_max = data->msg->hwmon_alarm.tin_cfg[channel].hl; + temp_hyst = FIELD_GET(NCT6694_TIN_HYST_MASK, + data->msg->hwmon_alarm.tin_cfg[channel].hyst); + *val = temp_from_reg(temp_max - temp_hyst); + + return 0; + case hwmon_temp_max_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_TIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_fan_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char fanin_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_fan_enable: + fanin_en = data->hwmon_en.fin_en[channel / 8]; + *val = !!(fanin_en & BIT(channel % 8)); + + return 0; + case hwmon_fan_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_FIN_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->fin)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->fin); + if (ret) + return ret; + + *val = be16_to_cpu(data->rpt->fin); + + return 0; + case hwmon_fan_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + *val = be16_to_cpu(data->msg->hwmon_alarm.fin_ll[channel]); + + return 0; + case hwmon_fan_min_alarm: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_FIN_STS(channel / 8)), + .len = cpu_to_le16(sizeof(data->rpt->status)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->status); + if (ret) + return ret; + + *val = !!(data->rpt->status & BIT(channel % 8)); + + return 0; + default: + return -EOPNOTSUPP; + 
} +} + +static int nct6694_pwm_read(struct device *dev, u32 attr, int channel, + long *val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char pwm_en; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_pwm_enable: + pwm_en = data->hwmon_en.pwm_en[channel / 8]; + *val = !!(pwm_en & BIT(channel % 8)); + + return 0; + case hwmon_pwm_input: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_RPT_MOD, + .offset = cpu_to_le16(NCT6694_PWM_IDX(channel)), + .len = cpu_to_le16(sizeof(data->rpt->pwm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->rpt->pwm); + if (ret) + return ret; + + *val = data->rpt->pwm; + + return 0; + case hwmon_pwm_freq: + *val = NCT6694_FREQ_FROM_REG(data->hwmon_en.pwm_freq[channel]); + + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_in_write(struct device *dev, u32 attr, int channel, + long val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_in_enable: + if (val == 0) + data->hwmon_en.vin_en[channel / 8] &= ~BIT(channel % 8); + else if (val == 1) + data->hwmon_en.vin_en[channel / 8] |= BIT(channel % 8); + else + return -EINVAL; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + case hwmon_in_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + val = clamp_val(val, 0, 2032); + data->msg->hwmon_alarm.vin_limit[channel].hl = in_to_reg(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + case hwmon_in_min: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + val = clamp_val(val, 0, 2032); + data->msg->hwmon_alarm.vin_limit[channel].ll = in_to_reg(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_temp_write(struct device *dev, u32 attr, int channel, + long val) +{ + struct nct6694_hwmon_data *data = dev_get_drvdata(dev); + struct nct6694_cmd_header cmd_hd; + unsigned char temp_hyst; + signed char temp_max; + int ret; + + guard(mutex)(&data->lock); + + switch (attr) { + case hwmon_temp_enable: + if (val == 0) + data->hwmon_en.tin_en[channel / 8] &= ~BIT(channel % 8); + else if (val == 1) + data->hwmon_en.tin_en[channel / 8] |= BIT(channel % 8); + else + return -EINVAL; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + case hwmon_temp_max: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = 
cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, -127000, 127000);
+		data->msg->hwmon_alarm.tin_cfg[channel].hl = temp_to_reg(val);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	case hwmon_temp_max_hyst:
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_ALARM,
+			.sel = NCT6694_HWMON_ALARM_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, -127000, 127000);
+		temp_max = data->msg->hwmon_alarm.tin_cfg[channel].hl;
+		temp_hyst = temp_max - temp_to_reg(val);
+		temp_hyst = clamp_val(temp_hyst, 0, 7);
+		data->msg->hwmon_alarm.tin_cfg[channel].hyst =
+			(data->msg->hwmon_alarm.tin_cfg[channel].hyst & ~NCT6694_TIN_HYST_MASK) |
+			FIELD_PREP(NCT6694_TIN_HYST_MASK, temp_hyst);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int nct6694_fan_write(struct device *dev, u32 attr, int channel,
+			     long val)
+{
+	struct nct6694_hwmon_data *data = dev_get_drvdata(dev);
+	struct nct6694_cmd_header cmd_hd;
+	int ret;
+
+	guard(mutex)(&data->lock);
+
+	switch (attr) {
+	case hwmon_fan_enable:
+		if (val == 0)
+			data->hwmon_en.fin_en[channel / 8] &= ~BIT(channel % 8);
+		else if (val == 1)
+			data->hwmon_en.fin_en[channel / 8] |= BIT(channel % 8);
+		else
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_CONTROL,
+			.sel = NCT6694_HWMON_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->hwmon_en))
+		};
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->hwmon_en);
+	case hwmon_fan_min:
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_ALARM,
+			.sel = NCT6694_HWMON_ALARM_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->hwmon_alarm))
+		};
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->hwmon_alarm);
+		if (ret)
+			return ret;
+
+		val = clamp_val(val, 1, 65535);
+		data->msg->hwmon_alarm.fin_ll[channel] = cpu_to_be16(val);
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->msg->hwmon_alarm);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int nct6694_pwm_write(struct device *dev, u32 attr, int channel,
+			     long val)
+{
+	struct nct6694_hwmon_data *data = dev_get_drvdata(dev);
+	struct nct6694_cmd_header cmd_hd;
+	int ret;
+
+	guard(mutex)(&data->lock);
+
+	switch (attr) {
+	case hwmon_pwm_enable:
+		if (val == 0)
+			data->hwmon_en.pwm_en[channel / 8] &= ~BIT(channel % 8);
+		else if (val == 1)
+			data->hwmon_en.pwm_en[channel / 8] |= BIT(channel % 8);
+		else
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_HWMON_MOD,
+			.cmd = NCT6694_HWMON_CONTROL,
+			.sel = NCT6694_HWMON_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->hwmon_en))
+		};
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+					 &data->hwmon_en);
+	case hwmon_pwm_input:
+		if (val < 0 || val > 255)
+			return -EINVAL;
+
+		cmd_hd = (struct nct6694_cmd_header) {
+			.mod = NCT6694_PWM_MOD,
+			.cmd = NCT6694_PWM_CONTROL,
+			.sel = NCT6694_PWM_CONTROL_SEL,
+			.len = cpu_to_le16(sizeof(data->msg->pwm_ctrl))
+		};
+
+		ret = nct6694_read_msg(data->nct6694, &cmd_hd,
+				       &data->msg->pwm_ctrl);
+		if (ret)
+			return ret;
+
+		data->msg->pwm_ctrl.mal_val[channel] = val;
+
+		return nct6694_write_msg(data->nct6694, &cmd_hd,
+
&data->msg->pwm_ctrl); + case hwmon_pwm_freq: + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + + data->hwmon_en.pwm_freq[channel] = NCT6694_FREQ_TO_REG(val); + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + switch (type) { + case hwmon_in: + /* in mV */ + return nct6694_in_read(dev, attr, channel, val); + case hwmon_temp: + /* in mC */ + return nct6694_temp_read(dev, attr, channel, val); + case hwmon_fan: + /* in RPM */ + return nct6694_fan_read(dev, attr, channel, val); + case hwmon_pwm: + /* in value 0~255 */ + return nct6694_pwm_read(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static int nct6694_write(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + switch (type) { + case hwmon_in: + return nct6694_in_write(dev, attr, channel, val); + case hwmon_temp: + return nct6694_temp_write(dev, attr, channel, val); + case hwmon_fan: + return nct6694_fan_write(dev, attr, channel, val); + case hwmon_pwm: + return nct6694_pwm_write(dev, attr, channel, val); + default: + return -EOPNOTSUPP; + } +} + +static umode_t nct6694_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_enable: + case hwmon_in_max: + case hwmon_in_min: + return 0644; + case hwmon_in_alarm: + case hwmon_in_input: + return 0444; + default: + return 0; + } + case hwmon_temp: + switch (attr) { + case hwmon_temp_enable: + case hwmon_temp_max: + case hwmon_temp_max_hyst: + return 0644; + case hwmon_temp_input: + case hwmon_temp_max_alarm: + return 0444; + default: + return 0; + } + case hwmon_fan: + switch (attr) { + case hwmon_fan_enable: + case hwmon_fan_min: + return 0644; + case hwmon_fan_input: + case hwmon_fan_min_alarm: + return 0444; + default: + return 0; + } + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_enable: + case hwmon_pwm_freq: + case hwmon_pwm_input: + return 0644; + default: + return 0; + } + default: + return 0; + } +} + +static const struct hwmon_ops nct6694_hwmon_ops = { + .is_visible = nct6694_is_visible, + .read = nct6694_read, + .write = nct6694_write, +}; + +static const struct hwmon_chip_info nct6694_chip_info = { + .ops = &nct6694_hwmon_ops, + .info = nct6694_info, +}; + +static int nct6694_hwmon_init(struct nct6694_hwmon_data *data) +{ + struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_CONTROL, + .sel = NCT6694_HWMON_CONTROL_SEL, + .len = cpu_to_le16(sizeof(data->hwmon_en)) + }; + int ret; + + /* + * Record each Hardware Monitor Channel enable status + * and PWM frequency register + */ + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->hwmon_en); + if (ret) + return ret; + + cmd_hd = (struct nct6694_cmd_header) { + .mod = NCT6694_HWMON_MOD, + .cmd = NCT6694_HWMON_ALARM, + .sel = NCT6694_HWMON_ALARM_SEL, + .len = cpu_to_le16(sizeof(data->msg->hwmon_alarm)) + }; + + /* Select hwmon device alarm mode */ + ret = nct6694_read_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); + if (ret) + return ret; + + data->msg->hwmon_alarm.smi_ctrl = NCT6694_HWMON_REALTIME_IRQ; + + return nct6694_write_msg(data->nct6694, &cmd_hd, + &data->msg->hwmon_alarm); +} + +static int 
nct6694_hwmon_probe(struct platform_device *pdev) +{ + struct nct6694_hwmon_data *data; + struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent); + struct device *hwmon_dev; + int ret; + + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->rpt = devm_kzalloc(&pdev->dev, sizeof(union nct6694_hwmon_rpt), + GFP_KERNEL); + if (!data->rpt) + return -ENOMEM; + + data->msg = devm_kzalloc(&pdev->dev, sizeof(union nct6694_hwmon_msg), + GFP_KERNEL); + if (!data->msg) + return -ENOMEM; + + data->nct6694 = nct6694; + ret = devm_mutex_init(&pdev->dev, &data->lock); + if (ret) + return ret; + + ret = nct6694_hwmon_init(data); + if (ret) + return ret; + + /* Register hwmon device to HWMON framework */ + hwmon_dev = devm_hwmon_device_register_with_info(&pdev->dev, + "nct6694", data, + &nct6694_chip_info, + NULL); + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static struct platform_driver nct6694_hwmon_driver = { + .driver = { + .name = "nct6694-hwmon", + }, + .probe = nct6694_hwmon_probe, +}; + +module_platform_driver(nct6694_hwmon_driver); + +MODULE_DESCRIPTION("USB-HWMON driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-hwmon"); diff --git a/drivers/hwmon/nct6775-platform.c b/drivers/hwmon/nct6775-platform.c index 0a040364b5127e..407945d2cd6a80 100644 --- a/drivers/hwmon/nct6775-platform.c +++ b/drivers/hwmon/nct6775-platform.c @@ -167,7 +167,8 @@ static inline int nct6775_asuswmi_write(u8 bank, u8 reg, u8 val) static inline int nct6775_asuswmi_read(u8 bank, u8 reg, u8 *val) { - u32 ret, tmp = 0; + u32 tmp = 0; + int ret; ret = nct6775_asuswmi_evaluate_method(ASUSWMI_METHODID_RHWM, bank, reg, 0, &tmp); diff --git a/drivers/hwmon/nzxt-smart2.c b/drivers/hwmon/nzxt-smart2.c index c2d1173f42fefb..58ef9fa0184be4 100644 --- a/drivers/hwmon/nzxt-smart2.c +++ b/drivers/hwmon/nzxt-smart2.c @@ -721,11 +721,6 @@ static int __maybe_unused nzxt_smart2_hid_reset_resume(struct hid_device *hdev) return init_device(drvdata, drvdata->update_interval); } -static void mutex_fini(void *lock) -{ - mutex_destroy(lock); -} - static int nzxt_smart2_hid_probe(struct hid_device *hdev, const struct hid_device_id *id) { @@ -741,8 +736,7 @@ static int nzxt_smart2_hid_probe(struct hid_device *hdev, init_waitqueue_head(&drvdata->wq); - mutex_init(&drvdata->mutex); - ret = devm_add_action_or_reset(&hdev->dev, mutex_fini, &drvdata->mutex); + ret = devm_mutex_init(&hdev->dev, &drvdata->mutex); if (ret) return ret; diff --git a/drivers/hwmon/pmbus/Kconfig b/drivers/hwmon/pmbus/Kconfig index 55e492452ce811..da04ff6df28bd1 100644 --- a/drivers/hwmon/pmbus/Kconfig +++ b/drivers/hwmon/pmbus/Kconfig @@ -52,7 +52,8 @@ config SENSORS_ADM1275 help If you say yes here you get hardware monitoring support for Analog Devices ADM1075, ADM1272, ADM1273, ADM1275, ADM1276, ADM1278, ADM1281, - ADM1293, and ADM1294 Hot-Swap Controller and Digital Power Monitors. + ADM1293, ADM1294 and SQ24905C Hot-Swap Controller and + Digital Power Monitors. This driver can also be built as a module. If so, the module will be called adm1275. @@ -373,6 +374,15 @@ config SENSORS_MP2856 This driver can also be built as a module. If so, the module will be called mp2856. +config SENSORS_MP2869 + tristate "MPS MP2869" + help + If you say yes here you get hardware monitoring support for MPS + MP2869 Dual Loop Digital Multi-Phase Controller. + + This driver can also be built as a module. If so, the module will + be called mp2869. 
+ config SENSORS_MP2888 tristate "MPS MP2888" help @@ -391,6 +401,15 @@ config SENSORS_MP2891 This driver can also be built as a module. If so, the module will be called mp2891. +config SENSORS_MP29502 + tristate "MPS MP29502" + help + If you say yes here you get hardware monitoring support for MPS + MP29502 Dual Loop Digital Multi-Phase Controller. + + This driver can also be built as a module. If so, the module will + be called mp29502. + config SENSORS_MP2975 tristate "MPS MP2975" help diff --git a/drivers/hwmon/pmbus/Makefile b/drivers/hwmon/pmbus/Makefile index 29cd8a3317d29f..4c5ff3f32c5ecb 100644 --- a/drivers/hwmon/pmbus/Makefile +++ b/drivers/hwmon/pmbus/Makefile @@ -37,8 +37,10 @@ obj-$(CONFIG_SENSORS_MAX31785) += max31785.o obj-$(CONFIG_SENSORS_MAX34440) += max34440.o obj-$(CONFIG_SENSORS_MAX8688) += max8688.o obj-$(CONFIG_SENSORS_MP2856) += mp2856.o +obj-$(CONFIG_SENSORS_MP2869) += mp2869.o obj-$(CONFIG_SENSORS_MP2888) += mp2888.o obj-$(CONFIG_SENSORS_MP2891) += mp2891.o +obj-$(CONFIG_SENSORS_MP29502) += mp29502.o obj-$(CONFIG_SENSORS_MP2975) += mp2975.o obj-$(CONFIG_SENSORS_MP2993) += mp2993.o obj-$(CONFIG_SENSORS_MP5023) += mp5023.o diff --git a/drivers/hwmon/pmbus/adm1275.c b/drivers/hwmon/pmbus/adm1275.c index 7d175baa5de2fb..bc2a6a07dc3e2e 100644 --- a/drivers/hwmon/pmbus/adm1275.c +++ b/drivers/hwmon/pmbus/adm1275.c @@ -18,7 +18,8 @@ #include #include "pmbus.h" -enum chips { adm1075, adm1272, adm1273, adm1275, adm1276, adm1278, adm1281, adm1293, adm1294 }; +enum chips { adm1075, adm1272, adm1273, adm1275, adm1276, adm1278, adm1281, + adm1293, adm1294, sq24905c }; #define ADM1275_MFR_STATUS_IOUT_WARN2 BIT(0) #define ADM1293_MFR_STATUS_VAUX_UV_WARN BIT(5) @@ -486,6 +487,7 @@ static const struct i2c_device_id adm1275_id[] = { { "adm1281", adm1281 }, { "adm1293", adm1293 }, { "adm1294", adm1294 }, + { "mc09c", sq24905c }, { } }; MODULE_DEVICE_TABLE(i2c, adm1275_id); @@ -532,7 +534,8 @@ static int adm1275_probe(struct i2c_client *client) dev_err(&client->dev, "Failed to read Manufacturer ID\n"); return ret; } - if (ret != 3 || strncmp(block_buffer, "ADI", 3)) { + if ((ret != 3 || strncmp(block_buffer, "ADI", 3)) && + (ret != 2 || strncmp(block_buffer, "SY", 2))) { dev_err(&client->dev, "Unsupported Manufacturer ID\n"); return -ENODEV; } @@ -558,7 +561,8 @@ static int adm1275_probe(struct i2c_client *client) if (mid->driver_data == adm1272 || mid->driver_data == adm1273 || mid->driver_data == adm1278 || mid->driver_data == adm1281 || - mid->driver_data == adm1293 || mid->driver_data == adm1294) + mid->driver_data == adm1293 || mid->driver_data == adm1294 || + mid->driver_data == sq24905c) config_read_fn = i2c_smbus_read_word_data; else config_read_fn = i2c_smbus_read_byte_data; @@ -708,6 +712,7 @@ static int adm1275_probe(struct i2c_client *client) break; case adm1278: case adm1281: + case sq24905c: data->have_vout = true; data->have_pin_max = true; data->have_temp_max = true; diff --git a/drivers/hwmon/pmbus/isl68137.c b/drivers/hwmon/pmbus/isl68137.c index c52c55d2e7f48d..52cf62e45a86f1 100644 --- a/drivers/hwmon/pmbus/isl68137.c +++ b/drivers/hwmon/pmbus/isl68137.c @@ -61,6 +61,8 @@ enum chips { raa228004, raa228006, raa228228, + raa228244, + raa228246, raa229001, raa229004, raa229621, @@ -464,6 +466,8 @@ static const struct i2c_device_id raa_dmpvr_id[] = { {"raa228004", raa_dmpvr2_hv}, {"raa228006", raa_dmpvr2_hv}, {"raa228228", raa_dmpvr2_2rail_nontc}, + {"raa228244", raa_dmpvr2_2rail_nontc}, + {"raa228246", raa_dmpvr2_2rail_nontc}, {"raa229001", raa_dmpvr2_2rail}, 
{"raa229004", raa_dmpvr2_2rail}, {"raa229621", raa_dmpvr2_2rail}, @@ -512,6 +516,8 @@ static const struct of_device_id isl68137_of_match[] = { { .compatible = "renesas,raa228004", .data = (void *)raa_dmpvr2_hv }, { .compatible = "renesas,raa228006", .data = (void *)raa_dmpvr2_hv }, { .compatible = "renesas,raa228228", .data = (void *)raa_dmpvr2_2rail_nontc }, + { .compatible = "renesas,raa228244", .data = (void *)raa_dmpvr2_2rail_nontc }, + { .compatible = "renesas,raa228246", .data = (void *)raa_dmpvr2_2rail_nontc }, { .compatible = "renesas,raa229001", .data = (void *)raa_dmpvr2_2rail }, { .compatible = "renesas,raa229004", .data = (void *)raa_dmpvr2_2rail }, { .compatible = "renesas,raa229621", .data = (void *)raa_dmpvr2_2rail }, diff --git a/drivers/hwmon/pmbus/mp2869.c b/drivers/hwmon/pmbus/mp2869.c new file mode 100644 index 00000000000000..cc69a1e91dfe8a --- /dev/null +++ b/drivers/hwmon/pmbus/mp2869.c @@ -0,0 +1,659 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hardware monitoring driver for MPS Multi-phase Digital VR Controllers(MP2869) + */ + +#include +#include +#include +#include +#include "pmbus.h" + +/* + * Vender specific registers, the register MFR_SVI3_IOUT_PRT(0x67), + * READ_PIN_EST(0x94)and READ_IIN_EST(0x95) redefine the standard + * PMBUS register. The MFR_VOUT_LOOP_CTRL(0x29) is used to identify + * the vout scale and the MFR_SVI3_IOUT_PRT(0x67) is used to identify + * the iout scale. The READ_PIN_EST(0x94) is used to read input power + * per rail. The MP2891 does not have standard READ_IIN register(0x89), + * the iin telemetry can be obtained through the vendor redefined + * register READ_IIN_EST(0x95). + */ +#define MFR_SVI3_IOUT_PRT 0x67 +#define MFR_READ_PIN_EST 0x94 +#define MFR_READ_IIN_EST 0x95 +#define MFR_TSNS_FLT_SET 0xBB + +#define MP2869_VIN_OV_FAULT_GAIN 4 +#define MP2869_READ_VOUT_DIV 1024 +#define MP2869_READ_IOUT_DIV 32 +#define MP2869_OVUV_LIMIT_SCALE 10 +#define MP2869_OVUV_DELTA_SCALE 50 +#define MP2869_TEMP_LIMIT_OFFSET 40 +#define MP2869_IOUT_LIMIT_UINT 8 +#define MP2869_POUT_OP_GAIN 2 + +#define MP2869_PAGE_NUM 2 + +#define MP2869_RAIL1_FUNC (PMBUS_HAVE_VIN | PMBUS_HAVE_VOUT | \ + PMBUS_HAVE_IOUT | PMBUS_HAVE_POUT | \ + PMBUS_HAVE_TEMP | PMBUS_HAVE_PIN | \ + PMBUS_HAVE_IIN | \ + PMBUS_HAVE_STATUS_VOUT | \ + PMBUS_HAVE_STATUS_IOUT | \ + PMBUS_HAVE_STATUS_TEMP | \ + PMBUS_HAVE_STATUS_INPUT) + +#define MP2869_RAIL2_FUNC (PMBUS_HAVE_VOUT | PMBUS_HAVE_IOUT | \ + PMBUS_HAVE_POUT | PMBUS_HAVE_TEMP | \ + PMBUS_HAVE_PIN | PMBUS_HAVE_IIN | \ + PMBUS_HAVE_STATUS_VOUT | \ + PMBUS_HAVE_STATUS_IOUT | \ + PMBUS_HAVE_STATUS_TEMP | \ + PMBUS_HAVE_STATUS_INPUT) + +struct mp2869_data { + struct pmbus_driver_info info; + bool mfr_thwn_flt_en; + int vout_scale[MP2869_PAGE_NUM]; + int iout_scale[MP2869_PAGE_NUM]; +}; + +static const int mp2869_vout_sacle[8] = {6400, 5120, 2560, 2048, 1024, + 4, 2, 1}; +static const int mp2869_iout_sacle[8] = {32, 1, 2, 4, 8, 16, 32, 64}; + +#define to_mp2869_data(x) container_of(x, struct mp2869_data, info) + +static u16 mp2869_reg2data_linear11(u16 word) +{ + s16 exponent; + s32 mantissa; + s64 val; + + exponent = ((s16)word) >> 11; + mantissa = ((s16)((word & 0x7ff) << 5)) >> 5; + val = mantissa; + + if (exponent >= 0) + val <<= exponent; + else + val >>= -exponent; + + return val; +} + +static int +mp2869_identify_thwn_flt(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + 
if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + data->mfr_thwn_flt_en = FIELD_GET(GENMASK(13, 13), ret); + + return 0; +} + +static int +mp2869_identify_vout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, PMBUS_VOUT_SCALE_LOOP); + if (ret < 0) + return ret; + + /* + * The output voltage is equal to the READ_VOUT(0x8B) register value multiplied + * by vout_scale. + * Obtain vout scale from the register PMBUS_VOUT_SCALE_LOOP, bits 12-10 + * PMBUS_VOUT_SCALE_LOOP[12:10]: + * 000b - 6.25mV/LSB, 001b - 5mV/LSB, 010b - 2.5mV/LSB, 011b - 2mV/LSB + * 100b - 1mV/Lsb, 101b - (1/256)mV/LSB, 110b - (1/512)mV/LSB, + * 111b - (1/1024)mV/LSB + */ + data->vout_scale[page] = mp2869_vout_scale[FIELD_GET(GENMASK(12, 10), ret)]; + + return 0; +} + +static int +mp2869_identify_iout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SVI3_IOUT_PRT); + if (ret < 0) + return ret; + + /* + * The output current is equal to the READ_IOUT(0x8C) register value + * multiplied by iout_scale. + * Obtain iout_scale from the register MFR_SVI3_IOUT_PRT[2:0]. + * The value is selected as below: + * 000b - 1A/LSB, 001b - (1/32)A/LSB, 010b - (1/16)A/LSB, + * 011b - (1/8)A/LSB, 100b - (1/4)A/LSB, 101b - (1/2)A/LSB + * 110b - 1A/LSB, 111b - 2A/LSB + */ + data->iout_scale[page] = mp2869_iout_scale[FIELD_GET(GENMASK(2, 0), ret)]; + + return 0; +} + +static int mp2869_read_byte_data(struct i2c_client *client, int page, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_VOUT_MODE: + /* + * The calculation of vout in this driver is based on direct format. + * As a result, the format of vout is enforced to direct. + */ + ret = PB_VOUT_MODE_DIRECT; + break; + case PMBUS_STATUS_BYTE: + /* + * If the tsns digital fault is enabled, the TEMPERATURE flag + * of PMBUS_STATUS_BYTE should come from STATUS_MFR_SPECIFIC + * register bit1. + */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_byte_data(client, page, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(2, 2)) | + FIELD_PREP(GENMASK(2, 2), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + case PMBUS_STATUS_TEMPERATURE: + /* + * If the tsns digital fault is enabled, the OT Fault and OT Warning + * flags of PMBUS_STATUS_TEMPERATURE should come from STATUS_MFR_SPECIFIC + * register bit1.
+ */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_byte_data(client, page, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(7, 6)) | + FIELD_PREP(GENMASK(6, 6), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))) | + FIELD_PREP(GENMASK(7, 7), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + default: + ret = -ENODATA; + break; + } + + return ret; +} + +static int mp2869_read_word_data(struct i2c_client *client, int page, int phase, + int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_STATUS_WORD: + /* + * If the tsns digital fault is enabled, the OT Fault flag + * of PMBUS_STATUS_WORD should come from STATUS_MFR_SPECIFIC + * register bit1. + */ + if (!data->mfr_thwn_flt_en) + return -ENODATA; + + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & ~GENMASK(2, 2)) | + FIELD_PREP(GENMASK(2, 2), + FIELD_GET(GENMASK(1, 1), + pmbus_read_byte_data(client, page, + PMBUS_STATUS_MFR_SPECIFIC))); + break; + case PMBUS_READ_VIN: + /* + * The MP2869 PMBUS_READ_VIN[10:0] is the vin value, and the vin scale is + * 31.25mV/LSB. Since the same 31.25mV/Lsb scale is set (using the r/m/b scale) + * in the MP2869 pmbus_driver_info struct, the word data bit0-bit10 can be + * returned to the pmbus core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_READ_IIN: + /* + * The MP2869 redefines the standard 0x95 register as iin telemetry + * per rail. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_IIN_EST); + if (ret < 0) + return ret; + + break; + case PMBUS_READ_PIN: + /* + * The MP2869 redefines the standard 0x94 register as pin telemetry + * per rail. The MP2869 MFR_READ_PIN_EST register is linear11 format, + * but the pin scale is set to 1W/Lsb(using r/m/b scale). As a result, + * the pin read from MP2869 should be converted to W before the result + * is returned to the pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_PIN_EST); + if (ret < 0) + return ret; + + ret = mp2869_reg2data_linear11(ret); + break; + case PMBUS_READ_VOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(11, 0)) * data->vout_scale[page], + MP2869_READ_VOUT_DIV); + break; + case PMBUS_READ_IOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(10, 0)) * data->iout_scale[page], + MP2869_READ_IOUT_DIV); + break; + case PMBUS_READ_POUT: + /* + * The MP2869 PMBUS_READ_POUT register is linear11 format, but the pout + * scale is set to 1W/Lsb(using r/m/b scale). As a result, the pout read + * from MP2869 should be converted to W before the result is returned to + * the pmbus core.
+ */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = mp2869_reg2data_linear11(ret); + break; + case PMBUS_READ_TEMPERATURE_1: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE + + (FIELD_GET(GENMASK(12, 9), ret) + 1) * MP2869_OVUV_DELTA_SCALE; + else + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE; + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE - + (FIELD_GET(GENMASK(12, 9), ret) + 1) * MP2869_OVUV_DELTA_SCALE; + else + ret = FIELD_GET(GENMASK(8, 0), ret) * MP2869_OVUV_LIMIT_SCALE; + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The scale of MP2869 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * is 1°C/LSB, and they have a 40°C offset. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) - MP2869_TEMP_LIMIT_OFFSET; + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) * MP2869_VIN_OV_FAULT_GAIN; + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(9, 0), ret); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(7, 0)) * data->iout_scale[page] * + MP2869_IOUT_LIMIT_UINT, MP2869_READ_IOUT_DIV); + break; + case PMBUS_POUT_OP_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) * MP2869_POUT_OP_GAIN; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp2869_write_word_data(struct i2c_client *client, int page, int reg, + u16 word) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp2869_data *data = to_mp2869_data(info); + int ret; + + switch (reg) { + case PMBUS_VOUT_UV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VOUT_UV_FAULT_LIMIT[8:0] is the limit value, + * and bit9-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word + + (FIELD_GET(GENMASK(12, 9), + ret) + 1) * + MP2869_OVUV_DELTA_SCALE, + MP2869_OVUV_LIMIT_SCALE))); + else + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word, + MP2869_OVUV_LIMIT_SCALE))); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VOUT_OV_FAULT_LIMIT[8:0] is the limit value, + * and bit9-bit15 should not be changed.
+ */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + if (FIELD_GET(GENMASK(12, 9), ret)) + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word - + (FIELD_GET(GENMASK(12, 9), + ret) + 1) * + MP2869_OVUV_DELTA_SCALE, + MP2869_OVUV_LIMIT_SCALE))); + else + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word, + MP2869_OVUV_LIMIT_SCALE))); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * If the tsns digital fault is enabled, the PMBUS_OT_FAULT_LIMIT and + * PMBUS_OT_WARN_LIMIT cannot be written. + */ + if (data->mfr_thwn_flt_en) + return -EINVAL; + + /* + * The MP2869 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * have a 40°C offset. Bit0-bit7 is the limit value, and bit8-bit15 + * should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + word + MP2869_TEMP_LIMIT_OFFSET)); + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The MP2869 PMBUS_VIN_OV_FAULT_LIMIT[7:0] is the limit value, and bit8-bit15 + * should not be changed. The scale of PMBUS_VIN_OV_FAULT_LIMIT is 125mV/Lsb, + * but the vin scale is set to 31.25mV/Lsb(using r/m/b scale), so the word data + * should be divided by MP2869_VIN_OV_FAULT_GAIN(4). + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + DIV_ROUND_CLOSEST(word, + MP2869_VIN_OV_FAULT_GAIN))); + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The PMBUS_VIN_UV_LIMIT[9:0] is the limit value, and bit10-bit15 should + * not be changed. The scale of PMBUS_VIN_UV_LIMIT is 31.25mV/Lsb, and the + * vin scale is set to 31.25mV/Lsb(using r/m/b scale), so the word data can + * be written directly. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(9, 0)) | + FIELD_PREP(GENMASK(9, 0), + word)); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_write_word_data(client, page, reg, + DIV_ROUND_CLOSEST(word * MP2869_READ_IOUT_DIV, + MP2869_IOUT_LIMIT_UINT * + data->iout_scale[page])); + break; + case PMBUS_POUT_OP_WARN_LIMIT: + /* + * The POUT_OP_WARN_LIMIT[11:0] is the limit value, and bit12-bit15 should + * not be changed. The scale of POUT_OP_WARN_LIMIT is 2W/Lsb. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(11, 0)) | + FIELD_PREP(GENMASK(11, 0), + DIV_ROUND_CLOSEST(word, + MP2869_POUT_OP_GAIN))); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp2869_identify(struct i2c_client *client, struct pmbus_driver_info *info) +{ + int ret; + + /* Identify whether tsns digital fault is enabled */ + ret = mp2869_identify_thwn_flt(client, info, 1); + if (ret < 0) + return 0; + + /* Identify vout scale for rail 1. */ + ret = mp2869_identify_vout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify vout scale for rail 2.
*/ + ret = mp2869_identify_vout_scale(client, info, 1); + if (ret < 0) + return ret; + + /* Identify iout scale for rail 1. */ + ret = mp2869_identify_iout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify iout scale for rail 2. */ + return mp2869_identify_iout_scale(client, info, 1); +} + +static const struct pmbus_driver_info mp2869_info = { + .pages = MP2869_PAGE_NUM, + .format[PSC_VOLTAGE_IN] = direct, + .format[PSC_CURRENT_IN] = linear, + .format[PSC_CURRENT_OUT] = direct, + .format[PSC_TEMPERATURE] = direct, + .format[PSC_POWER] = direct, + .format[PSC_VOLTAGE_OUT] = direct, + + .m[PSC_VOLTAGE_IN] = 32, + .R[PSC_VOLTAGE_IN] = 0, + .b[PSC_VOLTAGE_IN] = 0, + + .m[PSC_VOLTAGE_OUT] = 1, + .R[PSC_VOLTAGE_OUT] = 3, + .b[PSC_VOLTAGE_OUT] = 0, + + .m[PSC_CURRENT_OUT] = 1, + .R[PSC_CURRENT_OUT] = 0, + .b[PSC_CURRENT_OUT] = 0, + + .m[PSC_TEMPERATURE] = 1, + .R[PSC_TEMPERATURE] = 0, + .b[PSC_TEMPERATURE] = 0, + + .m[PSC_POWER] = 1, + .R[PSC_POWER] = 0, + .b[PSC_POWER] = 0, + + .func[0] = MP2869_RAIL1_FUNC, + .func[1] = MP2869_RAIL2_FUNC, + .read_word_data = mp2869_read_word_data, + .write_word_data = mp2869_write_word_data, + .read_byte_data = mp2869_read_byte_data, + .identify = mp2869_identify, +}; + +static int mp2869_probe(struct i2c_client *client) +{ + struct pmbus_driver_info *info; + struct mp2869_data *data; + + data = devm_kzalloc(&client->dev, sizeof(struct mp2869_data), + GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->info, &mp2869_info, sizeof(*info)); + info = &data->info; + + return pmbus_do_probe(client, info); +} + +static const struct i2c_device_id mp2869_id[] = { + {"mp2869", 0}, + {"mp29608", 1}, + {"mp29612", 2}, + {"mp29816", 3}, + {} +}; +MODULE_DEVICE_TABLE(i2c, mp2869_id); + +static const struct of_device_id __maybe_unused mp2869_of_match[] = { + {.compatible = "mps,mp2869", .data = (void *)0}, + {.compatible = "mps,mp29608", .data = (void *)1}, + {.compatible = "mps,mp29612", .data = (void *)2}, + {.compatible = "mps,mp29816", .data = (void *)3}, + {} +}; +MODULE_DEVICE_TABLE(of, mp2869_of_match); + +static struct i2c_driver mp2869_driver = { + .driver = { + .name = "mp2869", + .of_match_table = mp2869_of_match, + }, + .probe = mp2869_probe, + .id_table = mp2869_id, +}; + +module_i2c_driver(mp2869_driver); + +MODULE_AUTHOR("Wensheng Wang "); +MODULE_DESCRIPTION("PMBus driver for MPS MP2869"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("PMBUS"); diff --git a/drivers/hwmon/pmbus/mp29502.c b/drivers/hwmon/pmbus/mp29502.c new file mode 100644 index 00000000000000..7241373f155770 --- /dev/null +++ b/drivers/hwmon/pmbus/mp29502.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hardware monitoring driver for MPS Multi-phase Digital VR Controllers (MP29502) + */ + +#include <linux/bitfield.h> +#include <linux/i2c.h> +#include <linux/module.h> +#include <linux/of_device.h> +#include "pmbus.h" + +#define MFR_VOUT_SCALE_LOOP 0x29 +#define MFR_SVI3_IOUT_PRT 0x67 +#define MFR_READ_PIN_EST 0x94 +#define MFR_READ_IIN_EST 0x95 +#define MFR_VOUT_PROT1 0x3D +#define MFR_VOUT_PROT2 0x51 +#define MFR_SLOPE_CNT_SET 0xA8 +#define MFR_TSNS_FLT_SET 0xBB + +#define MP29502_VIN_OV_GAIN 4 +#define MP29502_TEMP_LIMIT_OFFSET 40 +#define MP29502_READ_VOUT_DIV 1024 +#define MP29502_READ_IOUT_DIV 32 +#define MP29502_IOUT_LIMIT_UINT 8 +#define MP29502_OVUV_LIMIT_SCALE 10 +#define MP29502_VOUT_OV_GAIN 512 +#define MP29502_VOUT_OV_SCALE 40 +#define MP29502_VOUT_UV_OFFSET 36 +#define MP29502_PIN_GAIN 2 +#define MP29502_IIN_DIV 2 + +#define MP29502_PAGE_NUM 1 + +#define MP29502_RAIL_FUNC (PMBUS_HAVE_VIN | 
PMBUS_HAVE_VOUT | \ + PMBUS_HAVE_IOUT | PMBUS_HAVE_POUT | \ + PMBUS_HAVE_TEMP | PMBUS_HAVE_PIN | \ + PMBUS_HAVE_IIN | \ + PMBUS_HAVE_STATUS_VOUT | \ + PMBUS_HAVE_STATUS_IOUT | \ + PMBUS_HAVE_STATUS_TEMP | \ + PMBUS_HAVE_STATUS_INPUT) + +struct mp29502_data { + struct pmbus_driver_info info; + int vout_scale; + int vout_bottom_div; + int vout_top_div; + int ovp_div; + int iout_scale; +}; + +#define to_mp29502_data(x) container_of(x, struct mp29502_data, info) + +static u16 mp29502_reg2data_linear11(u16 word) +{ + s16 exponent; + s32 mantissa; + s64 val; + + exponent = ((s16)word) >> 11; + mantissa = ((s16)((word & 0x7ff) << 5)) >> 5; + val = mantissa; + + if (exponent >= 0) + val <<= exponent; + else + val >>= -exponent; + + return val; +} + +static int +mp29502_identify_vout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_VOUT_SCALE_LOOP); + if (ret < 0) + return ret; + + switch (FIELD_GET(GENMASK(12, 10), ret)) { + case 0: + data->vout_scale = 6400; + break; + case 1: + data->vout_scale = 5120; + break; + case 2: + data->vout_scale = 2560; + break; + case 3: + data->vout_scale = 2048; + break; + case 4: + data->vout_scale = 1024; + break; + case 5: + data->vout_scale = 4; + break; + case 6: + data->vout_scale = 2; + break; + case 7: + data->vout_scale = 1; + break; + default: + data->vout_scale = 1; + break; + } + + return 0; +} + +static int +mp29502_identify_vout_divider(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_VOUT_PROT1); + if (ret < 0) + return ret; + + data->vout_bottom_div = FIELD_GET(GENMASK(11, 0), ret); + + ret = i2c_smbus_read_word_data(client, MFR_VOUT_PROT2); + if (ret < 0) + return ret; + + data->vout_top_div = FIELD_GET(GENMASK(14, 0), ret); + + return 0; +} + +static int +mp29502_identify_ovp_divider(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SLOPE_CNT_SET); + if (ret < 0) + return ret; + + data->ovp_div = FIELD_GET(GENMASK(9, 0), ret); + + return 0; +} + +static int +mp29502_identify_iout_scale(struct i2c_client *client, struct pmbus_driver_info *info, + int page) +{ + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, page); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_SVI3_IOUT_PRT); + if (ret < 0) + return ret; + + switch (ret & GENMASK(2, 0)) { + case 0: + case 6: + data->iout_scale = 32; + break; + case 1: + data->iout_scale = 1; + break; + case 2: + data->iout_scale = 2; + break; + case 3: + data->iout_scale = 4; + break; + case 4: + data->iout_scale = 8; + break; + case 5: + data->iout_scale = 16; + break; + default: + data->iout_scale = 64; + break; + } + + return 0; +} + +static int mp29502_read_vout_ov_limit(struct i2c_client *client, struct mp29502_data *data) +{ + int ret; + int ov_value; + + /* + * This is because the vout ov fault limit value comes from + * page1 
MFR_TSNS_FLT_SET reg, while other telemetry and limit + * values come from page0 regs. So the page should be set back to + * 0 after reading the vout ov limit. + */ + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 1); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + ov_value = DIV_ROUND_CLOSEST(FIELD_GET(GENMASK(12, 7), ret) * + MP29502_VOUT_OV_GAIN * MP29502_VOUT_OV_SCALE, + data->ovp_div); + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + return ov_value; +} + +static int mp29502_write_vout_ov_limit(struct i2c_client *client, u16 word, + struct mp29502_data *data) +{ + int ret; + + /* + * The vout ov fault limit value comes from the + * page1 MFR_TSNS_FLT_SET reg, while other telemetry and limit + * values come from page0 regs. So the page should be set back to + * 0 after writing the vout ov limit. + */ + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 1); + if (ret < 0) + return ret; + + ret = i2c_smbus_read_word_data(client, MFR_TSNS_FLT_SET); + if (ret < 0) + return ret; + + ret = i2c_smbus_write_word_data(client, MFR_TSNS_FLT_SET, + (ret & ~GENMASK(12, 7)) | + FIELD_PREP(GENMASK(12, 7), + DIV_ROUND_CLOSEST(word * data->ovp_div, + MP29502_VOUT_OV_GAIN * MP29502_VOUT_OV_SCALE))); + + return i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); +} + +static int mp29502_read_byte_data(struct i2c_client *client, int page, int reg) +{ + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + switch (reg) { + case PMBUS_VOUT_MODE: + ret = PB_VOUT_MODE_DIRECT; + break; + default: + ret = -ENODATA; + break; + } + + return ret; +} + +static int mp29502_read_word_data(struct i2c_client *client, int page, + int phase, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + switch (reg) { + case PMBUS_STATUS_WORD: + ret = -ENODATA; + break; + case PMBUS_READ_VIN: + /* + * The MP29502 PMBUS_READ_VIN[10:0] is the vin value, and the vin scale is + * 125mV/LSB. Since the same 125mV/Lsb scale is set (using the r/m/b scale) + * in the MP29502 pmbus_driver_info struct, the word data bit0-bit10 can + * be returned to the pmbus core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_READ_VOUT: + /* + * The MP29502 PMBUS_READ_VOUT[11:0] is the vout value, and the vout + * value is calculated based on the vout scale and vout divider. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(11, 0)) * + data->vout_scale * + (data->vout_bottom_div + + 4 * data->vout_top_div), + MP29502_READ_VOUT_DIV * + data->vout_bottom_div); + break; + case PMBUS_READ_IIN: + /* + * The MP29502 MFR_READ_IIN_EST register is linear11 format, and the + * exponent is not a constant value. But the iin scale is set to + * 1A/Lsb(using r/m/b scale). As a result, the iin read from MP29502 + * should be converted to A before the result is returned to the pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_IIN_EST); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST(mp29502_reg2data_linear11(ret), + MP29502_IIN_DIV); + break; + case PMBUS_READ_PIN: + /* + * The MP29502 MFR_READ_PIN_EST register is linear11 format, and the + * exponent is not a constant value.
But the pin scale is set to + * 1W/Lsb(using r/m/b scale). As a result, the pin read from MP29502 + * should be converted to W before the result is returned to the pmbus core. + */ + ret = pmbus_read_word_data(client, page, phase, MFR_READ_PIN_EST); + if (ret < 0) + return ret; + + ret = mp29502_reg2data_linear11(ret) * MP29502_PIN_GAIN; + break; + case PMBUS_READ_POUT: + /* + * The MP29502 PMBUS_READ_POUT register is linear11 format, and the + * exponent is not a constant value. But the pout scale is set to + * 1W/Lsb(using r/m/b scale). As a result, the pout read from MP29502 + * should be converted to W before the result is returned to the pmbus core. + * The pout is also calculated based on the vout divider. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST(mp29502_reg2data_linear11(ret) * + (data->vout_bottom_div + + 4 * data->vout_top_div), + data->vout_bottom_div); + break; + case PMBUS_READ_IOUT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(10, 0)) * data->iout_scale, + MP29502_READ_IOUT_DIV); + break; + case PMBUS_READ_TEMPERATURE_1: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(10, 0), ret); + break; + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The MP29502 PMBUS_VIN_OV_FAULT_LIMIT is 500mV/Lsb, but + * the vin scale is set to 125mV/Lsb(using r/m/b scale), + * so the word data should be multiplied by 4. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(7, 0), ret) * MP29502_VIN_OV_GAIN; + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The MP29502 PMBUS_VIN_UV_WARN_LIMIT and PMBUS_VIN_UV_FAULT_LIMIT + * scale is 125mV/Lsb, and the vin scale is set to 125mV/Lsb(using + * r/m/b scale), so the word data bit0-bit9 can be returned to pmbus + * core directly. + */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = FIELD_GET(GENMASK(9, 0), ret); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + /* + * The MP29502 vout ov fault limit value comes from + * page1 MFR_TSNS_FLT_SET[12:7]. + */ + ret = mp29502_read_vout_ov_limit(client, data); + if (ret < 0) + return ret; + + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((FIELD_GET(GENMASK(8, 0), ret) * + MP29502_OVUV_LIMIT_SCALE - + MP29502_VOUT_UV_OFFSET) * + (data->vout_bottom_div + + 4 * data->vout_top_div), + data->vout_bottom_div); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = DIV_ROUND_CLOSEST((ret & GENMASK(7, 0)) * + data->iout_scale * + MP29502_IOUT_LIMIT_UINT, + MP29502_READ_IOUT_DIV); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The scale of MP29502 PMBUS_OT_FAULT_LIMIT and PMBUS_OT_WARN_LIMIT + * is 1°C/LSB and they have a 40°C offset.
+ */ + ret = pmbus_read_word_data(client, page, phase, reg); + if (ret < 0) + return ret; + + ret = (ret & GENMASK(7, 0)) - MP29502_TEMP_LIMIT_OFFSET; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp29502_write_word_data(struct i2c_client *client, int page, int reg, + u16 word) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct mp29502_data *data = to_mp29502_data(info); + int ret; + + ret = i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0); + if (ret < 0) + return ret; + + switch (reg) { + case PMBUS_VIN_OV_FAULT_LIMIT: + /* + * The PMBUS_VIN_OV_FAULT_LIMIT[7:0] is the limit value, + * and bit8-bit15 should not be changed. The scale of + * PMBUS_VIN_OV_FAULT_LIMIT is 500mV/Lsb, but the vin + * scale is set to 125mV/Lsb(using r/m/b scale), so + * the word data should be divided by 4. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + DIV_ROUND_CLOSEST(word, + MP29502_VIN_OV_GAIN))); + break; + case PMBUS_VIN_UV_WARN_LIMIT: + case PMBUS_VIN_UV_FAULT_LIMIT: + /* + * The PMBUS_VIN_UV_WARN_LIMIT[9:0] and PMBUS_VIN_UV_FAULT_LIMIT[9:0] + * are the limit values, and bit10-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(9, 0)) | + FIELD_PREP(GENMASK(9, 0), + word)); + break; + case PMBUS_VOUT_OV_FAULT_LIMIT: + ret = mp29502_write_vout_ov_limit(client, word, data); + if (ret < 0) + return ret; + + break; + case PMBUS_VOUT_UV_FAULT_LIMIT: + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(8, 0)) | + FIELD_PREP(GENMASK(8, 0), + DIV_ROUND_CLOSEST(word * + data->vout_bottom_div + + MP29502_VOUT_UV_OFFSET * + (data->vout_bottom_div + + 4 * data->vout_top_div), + MP29502_OVUV_LIMIT_SCALE * + (data->vout_bottom_div + + 4 * data->vout_top_div)))); + break; + case PMBUS_IOUT_OC_FAULT_LIMIT: + case PMBUS_IOUT_OC_WARN_LIMIT: + ret = pmbus_write_word_data(client, page, reg, + DIV_ROUND_CLOSEST(word * + MP29502_READ_IOUT_DIV, + MP29502_IOUT_LIMIT_UINT * + data->iout_scale)); + break; + case PMBUS_OT_FAULT_LIMIT: + case PMBUS_OT_WARN_LIMIT: + /* + * The PMBUS_OT_FAULT_LIMIT[7:0] and PMBUS_OT_WARN_LIMIT[7:0] + * are the limit values, and bit8-bit15 should not be changed. + */ + ret = pmbus_read_word_data(client, page, 0xff, reg); + if (ret < 0) + return ret; + + ret = pmbus_write_word_data(client, page, reg, + (ret & ~GENMASK(7, 0)) | + FIELD_PREP(GENMASK(7, 0), + word + MP29502_TEMP_LIMIT_OFFSET)); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mp29502_identify(struct i2c_client *client, struct pmbus_driver_info *info) +{ + int ret; + + /* Identify vout scale. */ + ret = mp29502_identify_vout_scale(client, info, 0); + if (ret < 0) + return ret; + + /* Identify vout divider. */ + ret = mp29502_identify_vout_divider(client, info, 1); + if (ret < 0) + return ret; + + /* Identify ovp divider.
*/ + ret = mp29502_identify_ovp_divider(client, info, 1); + if (ret < 0) + return ret; + + /* Identify iout scale. */ + return mp29502_identify_iout_scale(client, info, 0); +} + +static const struct pmbus_driver_info mp29502_info = { + .pages = MP29502_PAGE_NUM, + .format[PSC_VOLTAGE_IN] = direct, + .format[PSC_TEMPERATURE] = direct, + .format[PSC_CURRENT_IN] = direct, + .format[PSC_CURRENT_OUT] = direct, + .format[PSC_VOLTAGE_OUT] = direct, + .format[PSC_POWER] = direct, + + .m[PSC_VOLTAGE_IN] = 8, + .R[PSC_VOLTAGE_IN] = 0, + .b[PSC_VOLTAGE_IN] = 0, + + .m[PSC_VOLTAGE_OUT] = 1, + .R[PSC_VOLTAGE_OUT] = 3, + .b[PSC_VOLTAGE_OUT] = 0, + + .m[PSC_TEMPERATURE] = 1, + .R[PSC_TEMPERATURE] = 0, + .b[PSC_TEMPERATURE] = 0, + + .m[PSC_CURRENT_IN] = 1, + .R[PSC_CURRENT_IN] = 0, + .b[PSC_CURRENT_IN] = 0, + + .m[PSC_CURRENT_OUT] = 1, + .R[PSC_CURRENT_OUT] = 0, + .b[PSC_CURRENT_OUT] = 0, + + .m[PSC_POWER] = 1, + .R[PSC_POWER] = 0, + .b[PSC_POWER] = 0, + + .func[0] = MP29502_RAIL_FUNC, + .read_word_data = mp29502_read_word_data, + .read_byte_data = mp29502_read_byte_data, + .write_word_data = mp29502_write_word_data, + .identify = mp29502_identify, +}; + +static int mp29502_probe(struct i2c_client *client) +{ + struct pmbus_driver_info *info; + struct mp29502_data *data; + + data = devm_kzalloc(&client->dev, sizeof(struct mp29502_data), + GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->info, &mp29502_info, sizeof(*info)); + info = &data->info; + + return pmbus_do_probe(client, info); +} + +static const struct i2c_device_id mp29502_id[] = { + {"mp29502", 0}, + {} +}; +MODULE_DEVICE_TABLE(i2c, mp29502_id); + +static const struct of_device_id __maybe_unused mp29502_of_match[] = { + {.compatible = "mps,mp29502"}, + {} +}; +MODULE_DEVICE_TABLE(of, mp29502_of_match); + +static struct i2c_driver mp29502_driver = { + .driver = { + .name = "mp29502", + .of_match_table = mp29502_of_match, + }, + .probe = mp29502_probe, + .id_table = mp29502_id, +}; + +module_i2c_driver(mp29502_driver); + +MODULE_AUTHOR("Wensheng Wang "); +MODULE_DESCRIPTION("PMBus driver for MPS MP29502"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("PMBUS"); diff --git a/drivers/hwmon/pmbus/mp5990.c b/drivers/hwmon/pmbus/mp5990.c #include "pmbus.h" +enum chips { mp5990, mp5998 }; + #define MP5990_EFUSE_CFG (0xC4) #define MP5990_VOUT_FORMAT BIT(9) @@ -110,10 +112,53 @@ static struct pmbus_driver_info mp5990_info = { .read_word_data = mp5990_read_word_data, }; +static struct pmbus_driver_info mp5998_info = { + .pages = 1, + .format[PSC_VOLTAGE_IN] = direct, + .format[PSC_VOLTAGE_OUT] = direct, + .format[PSC_CURRENT_IN] = direct, + .format[PSC_CURRENT_OUT] = direct, + .format[PSC_POWER] = direct, + .format[PSC_TEMPERATURE] = direct, + .m[PSC_VOLTAGE_IN] = 64, + .b[PSC_VOLTAGE_IN] = 0, + .R[PSC_VOLTAGE_IN] = 0, + .m[PSC_VOLTAGE_OUT] = 64, + .b[PSC_VOLTAGE_OUT] = 0, + .R[PSC_VOLTAGE_OUT] = 0, + .m[PSC_CURRENT_IN] = 16, + .b[PSC_CURRENT_IN] = 0, + .R[PSC_CURRENT_IN] = 0, + .m[PSC_CURRENT_OUT] = 16, + .b[PSC_CURRENT_OUT] = 0, + .R[PSC_CURRENT_OUT] = 0, + .m[PSC_POWER] = 2, + .b[PSC_POWER] = 0, + .R[PSC_POWER] = 0, + .m[PSC_TEMPERATURE] = 1, + .b[PSC_TEMPERATURE] = 0, + .R[PSC_TEMPERATURE] = 0, + .func[0] = + PMBUS_HAVE_VIN | PMBUS_HAVE_VOUT | PMBUS_HAVE_IOUT | + PMBUS_HAVE_IIN | PMBUS_HAVE_PIN | PMBUS_HAVE_POUT | + PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_IOUT | + PMBUS_HAVE_STATUS_INPUT | PMBUS_HAVE_STATUS_TEMP, + .read_byte_data = mp5990_read_byte_data, + .read_word_data = mp5990_read_word_data, +}; + +static const struct i2c_device_id mp5990_id[] = { + {"mp5990", mp5990}, + {"mp5998", mp5998}, + { } +}; +MODULE_DEVICE_TABLE(i2c, mp5990_id); + static int mp5990_probe(struct i2c_client *client) { struct pmbus_driver_info 
*info; struct mp5990_data *data; + enum chips chip; int ret; data = devm_kzalloc(&client->dev, sizeof(struct mp5990_data), @@ -121,7 +166,15 @@ static int mp5990_probe(struct i2c_client *client) if (!data) return -ENOMEM; - memcpy(&data->info, &mp5990_info, sizeof(*info)); + if (client->dev.of_node) + chip = (uintptr_t)of_device_get_match_data(&client->dev); + else + chip = i2c_match_id(mp5990_id, client)->driver_data; + + if (chip == mp5990) + memcpy(&data->info, &mp5990_info, sizeof(*info)); + else + memcpy(&data->info, &mp5998_info, sizeof(*info)); info = &data->info; /* Read Vout Config */ @@ -140,6 +193,9 @@ static int mp5990_probe(struct i2c_client *client) data->info.format[PSC_VOLTAGE_OUT] = linear; data->info.format[PSC_CURRENT_OUT] = linear; data->info.format[PSC_POWER] = linear; + if (chip == mp5998) + data->info.format[PSC_CURRENT_IN] = linear; + ret = i2c_smbus_read_word_data(client, PMBUS_READ_VOUT); if (ret < 0) { dev_err(&client->dev, "Can't get vout exponent."); @@ -153,16 +209,11 @@ static int mp5990_probe(struct i2c_client *client) } static const struct of_device_id mp5990_of_match[] = { - { .compatible = "mps,mp5990" }, + { .compatible = "mps,mp5990", .data = (void *)mp5990 }, + { .compatible = "mps,mp5998", .data = (void *)mp5998 }, {} }; -static const struct i2c_device_id mp5990_id[] = { - {"mp5990"}, - { } -}; -MODULE_DEVICE_TABLE(i2c, mp5990_id); - static struct i2c_driver mp5990_driver = { .driver = { .name = "mp5990", diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index d0fe53451bdf8b..37269db2de84fb 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -64,6 +64,7 @@ struct pwm_fan_ctx { u64 pwm_duty_cycle_from_stopped; u32 pwm_usec_from_stopped; + u8 pwm_shutdown; }; /* This handler assumes self resetting edge triggered interrupt. */ @@ -484,9 +485,14 @@ static void pwm_fan_cleanup(void *__ctx) struct pwm_fan_ctx *ctx = __ctx; timer_delete_sync(&ctx->rpm_timer); - /* Switch off everything */ - ctx->enable_mode = pwm_disable_reg_disable; - pwm_fan_power_off(ctx, true); + if (ctx->pwm_shutdown) { + ctx->enable_mode = pwm_enable_reg_enable; + __set_pwm(ctx, ctx->pwm_shutdown); + } else { + /* Switch off everything */ + ctx->enable_mode = pwm_disable_reg_disable; + pwm_fan_power_off(ctx, true); + } } static int pwm_fan_probe(struct platform_device *pdev) @@ -498,6 +504,7 @@ static int pwm_fan_probe(struct platform_device *pdev) int ret; const struct hwmon_channel_info **channels; u32 initial_pwm, pwm_min_from_stopped = 0; + u32 pwm_shutdown_percent = 0; u32 *fan_channel_config; int channel_count = 1; /* We always have a PWM channel. 
*/ int i; @@ -648,6 +655,11 @@ static int pwm_fan_probe(struct platform_device *pdev) channels[1] = &ctx->fan_channel; } + + ret = device_property_read_u32(dev, "fan-shutdown-percent", + &pwm_shutdown_percent); + if (!ret && pwm_shutdown_percent) + ctx->pwm_shutdown = (clamp(pwm_shutdown_percent, 0, 100) * 255) / 100; + ret = device_property_read_u32(dev, "fan-stop-to-start-percent", &pwm_min_from_stopped); if (!ret && pwm_min_from_stopped) { diff --git a/drivers/hwmon/sa67mcu-hwmon.c b/drivers/hwmon/sa67mcu-hwmon.c new file mode 100644 index 00000000000000..22f703b7b25609 --- /dev/null +++ b/drivers/hwmon/sa67mcu-hwmon.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sa67mcu hardware monitoring driver + * + * Copyright 2025 Kontron Europe GmbH + */ + +#include <linux/bitops.h> +#include <linux/hwmon.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/property.h> +#include <linux/regmap.h> +#include <linux/types.h> + +#define SA67MCU_VOLTAGE(n) (0x00 + ((n) * 2)) +#define SA67MCU_TEMP(n) (0x04 + ((n) * 2)) + +struct sa67mcu_hwmon { + struct regmap *regmap; + u32 offset; +}; + +static int sa67mcu_hwmon_read(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, long *input) +{ + struct sa67mcu_hwmon *hwmon = dev_get_drvdata(dev); + unsigned int offset; + u8 reg[2]; + int ret; + + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_input: + offset = hwmon->offset + SA67MCU_VOLTAGE(channel); + break; + default: + return -EOPNOTSUPP; + } + break; + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + offset = hwmon->offset + SA67MCU_TEMP(channel); + break; + default: + return -EOPNOTSUPP; + } + break; + default: + return -EOPNOTSUPP; + } + + /* Reading the low byte will capture the value */ + ret = regmap_bulk_read(hwmon->regmap, offset, reg, ARRAY_SIZE(reg)); + if (ret) + return ret; + + *input = reg[1] << 8 | reg[0]; + + /* Temperatures are s16 and in 0.1degC steps.
*/ + if (type == hwmon_temp) + *input = sign_extend32(*input, 15) * 100; + + return 0; +} + +static const struct hwmon_channel_info * const sa67mcu_hwmon_info[] = { + HWMON_CHANNEL_INFO(in, + HWMON_I_INPUT | HWMON_I_LABEL, + HWMON_I_INPUT | HWMON_I_LABEL), + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), + NULL +}; + +static const char *const sa67mcu_hwmon_in_labels[] = { + "VDDIN", + "VDD_RTC", +}; + +static int sa67mcu_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + switch (type) { + case hwmon_in: + switch (attr) { + case hwmon_in_label: + *str = sa67mcu_hwmon_in_labels[channel]; + return 0; + default: + return -EOPNOTSUPP; + } + default: + return -EOPNOTSUPP; + } +} + +static const struct hwmon_ops sa67mcu_hwmon_ops = { + .visible = 0444, + .read = sa67mcu_hwmon_read, + .read_string = sa67mcu_hwmon_read_string, +}; + +static const struct hwmon_chip_info sa67mcu_hwmon_chip_info = { + .ops = &sa67mcu_hwmon_ops, + .info = sa67mcu_hwmon_info, +}; + +static int sa67mcu_hwmon_probe(struct platform_device *pdev) +{ + struct sa67mcu_hwmon *hwmon; + struct device *hwmon_dev; + int ret; + + if (!pdev->dev.parent) + return -ENODEV; + + hwmon = devm_kzalloc(&pdev->dev, sizeof(*hwmon), GFP_KERNEL); + if (!hwmon) + return -ENOMEM; + + hwmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!hwmon->regmap) + return -ENODEV; + + ret = device_property_read_u32(&pdev->dev, "reg", &hwmon->offset); + if (ret) + return -EINVAL; + + hwmon_dev = devm_hwmon_device_register_with_info(&pdev->dev, + "sa67mcu_hwmon", hwmon, + &sa67mcu_hwmon_chip_info, + NULL); + if (IS_ERR(hwmon_dev)) + dev_err(&pdev->dev, "failed to register as hwmon device\n"); + + return PTR_ERR_OR_ZERO(hwmon_dev); +} + +static const struct of_device_id sa67mcu_hwmon_of_match[] = { + { .compatible = "kontron,sa67mcu-hwmon", }, + {} +}; +MODULE_DEVICE_TABLE(of, sa67mcu_hwmon_of_match); + +static struct platform_driver sa67mcu_hwmon_driver = { + .probe = sa67mcu_hwmon_probe, + .driver = { + .name = "sa67mcu-hwmon", + .of_match_table = sa67mcu_hwmon_of_match, + }, +}; +module_platform_driver(sa67mcu_hwmon_driver); + +MODULE_DESCRIPTION("sa67mcu Hardware Monitoring Driver"); +MODULE_AUTHOR("Michael Walle "); +MODULE_LICENSE("GPL"); diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c index 3c839f56c46038..a6c439e376ff7c 100644 --- a/drivers/hwmon/sbtsi_temp.c +++ b/drivers/hwmon/sbtsi_temp.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/bitfield.h> /* * SB-TSI registers only support SMBus byte data access. "_INT" registers are @@ -29,8 +30,22 @@ #define SBTSI_REG_TEMP_HIGH_DEC 0x13 /* RW */ #define SBTSI_REG_TEMP_LOW_DEC 0x14 /* RW */ +/* + * Bit selecting the temperature measurement range used for reported values. + * bit == 0: Use default temperature range (0C to 255.875C). + * bit == 1: Use extended temperature range (-49C to +206.875C). + */ +#define SBTSI_CONFIG_EXT_RANGE_SHIFT 2 +/* + * ReadOrder bit specifies the reading order of integer and decimal part of + * CPU temperature for atomic reads. If bit == 0, reading integer part triggers + * latching of the decimal part, so integer part should be read first. + * If bit == 1, read order should be reversed.
+ */ #define SBTSI_CONFIG_READ_ORDER_SHIFT 5 +#define SBTSI_TEMP_EXT_RANGE_ADJ 49000 + #define SBTSI_TEMP_MIN 0 #define SBTSI_TEMP_MAX 255875 @@ -38,6 +53,8 @@ struct sbtsi_data { struct i2c_client *client; struct mutex lock; + bool ext_range_mode; + bool read_order; }; /* @@ -74,23 +91,11 @@ static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type, { struct sbtsi_data *data = dev_get_drvdata(dev); s32 temp_int, temp_dec; - int err; switch (attr) { case hwmon_temp_input: - /* - * ReadOrder bit specifies the reading order of integer and - * decimal part of CPU temp for atomic reads. If bit == 0, - * reading integer part triggers latching of the decimal part, - * so integer part should be read first. If bit == 1, read - * order should be reversed. - */ - err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG); - if (err < 0) - return err; - mutex_lock(&data->lock); - if (err & BIT(SBTSI_CONFIG_READ_ORDER_SHIFT)) { + if (data->read_order) { temp_dec = i2c_smbus_read_byte_data(data->client, SBTSI_REG_TEMP_DEC); temp_int = i2c_smbus_read_byte_data(data->client, SBTSI_REG_TEMP_INT); } else { @@ -122,6 +127,8 @@ static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type, return temp_dec; *val = sbtsi_reg_to_mc(temp_int, temp_dec); + if (data->ext_range_mode) + *val -= SBTSI_TEMP_EXT_RANGE_ADJ; return 0; } @@ -146,6 +153,8 @@ static int sbtsi_write(struct device *dev, enum hwmon_sensor_types type, return -EINVAL; } + if (data->ext_range_mode) + val += SBTSI_TEMP_EXT_RANGE_ADJ; val = clamp_val(val, SBTSI_TEMP_MIN, SBTSI_TEMP_MAX); sbtsi_mc_to_reg(val, &temp_int, &temp_dec); @@ -203,6 +212,7 @@ static int sbtsi_probe(struct i2c_client *client) struct device *dev = &client->dev; struct device *hwmon_dev; struct sbtsi_data *data; + int err; data = devm_kzalloc(dev, sizeof(struct sbtsi_data), GFP_KERNEL); if (!data) @@ -211,8 +221,14 @@ static int sbtsi_probe(struct i2c_client *client) data->client = client; mutex_init(&data->lock); - hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, &sbtsi_chip_info, - NULL); + err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG); + if (err < 0) + return err; + data->ext_range_mode = FIELD_GET(BIT(SBTSI_CONFIG_EXT_RANGE_SHIFT), err); + data->read_order = FIELD_GET(BIT(SBTSI_CONFIG_READ_ORDER_SHIFT), err); + + hwmon_dev = devm_hwmon_device_register_with_info(dev, client->name, data, + &sbtsi_chip_info, NULL); return PTR_ERR_OR_ZERO(hwmon_dev); } diff --git a/drivers/hwmon/sch56xx-common.c b/drivers/hwmon/sch56xx-common.c index 71941b1bb57328..98e075e54e9dee 100644 --- a/drivers/hwmon/sch56xx-common.c +++ b/drivers/hwmon/sch56xx-common.c @@ -544,10 +544,8 @@ void sch56xx_watchdog_register(struct device *parent, u16 addr, u32 revision, watchdog_set_drvdata(&data->wddev, data); err = devm_watchdog_register_device(parent, &data->wddev); - if (err) { - pr_err("Registering watchdog chardev: %d\n", err); + if (err) devm_kfree(parent, data); - } } EXPORT_SYMBOL(sch56xx_watchdog_register); diff --git a/drivers/hwmon/sht21.c b/drivers/hwmon/sht21.c index 97327313529b46..627d35070a420a 100644 --- a/drivers/hwmon/sht21.c +++ b/drivers/hwmon/sht21.c @@ -275,13 +275,26 @@ static int sht21_probe(struct i2c_client *client) /* Device ID table */ static const struct i2c_device_id sht21_id[] = { + { "sht20" }, { "sht21" }, + { "sht25" }, { } }; MODULE_DEVICE_TABLE(i2c, sht21_id); +static const struct of_device_id sht21_of_match[] = { + { .compatible = "sensirion,sht20" }, + { .compatible = "sensirion,sht21" }, + { 
.compatible = "sensirion,sht25" }, + { } +}; +MODULE_DEVICE_TABLE(of, sht21_of_match); + static struct i2c_driver sht21_driver = { - .driver.name = "sht21", + .driver = { + .name = "sht21", + .of_match_table = sht21_of_match, + }, .probe = sht21_probe, .id_table = sht21_id, }; diff --git a/drivers/hwmon/sy7636a-hwmon.c b/drivers/hwmon/sy7636a-hwmon.c index ed110884786b48..a12fc0ce70e76e 100644 --- a/drivers/hwmon/sy7636a-hwmon.c +++ b/drivers/hwmon/sy7636a-hwmon.c @@ -104,3 +104,4 @@ module_platform_driver(sy7636a_sensor_driver); MODULE_DESCRIPTION("SY7636A sensor driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:sy7636a-temperature"); diff --git a/drivers/hwmon/tmp102.c b/drivers/hwmon/tmp102.c index a02daa496c9c44..376e0eac8cc1c9 100644 --- a/drivers/hwmon/tmp102.c +++ b/drivers/hwmon/tmp102.c @@ -53,6 +53,7 @@ #define CONVERSION_TIME_MS 35 /* in milli-seconds */ struct tmp102 { + const char *label; struct regmap *regmap; u16 config_orig; unsigned long ready_time; @@ -70,6 +71,16 @@ static inline u16 tmp102_mC_to_reg(int val) return (val * 128) / 1000; } +static int tmp102_read_string(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, const char **str) +{ + struct tmp102 *tmp102 = dev_get_drvdata(dev); + + *str = tmp102->label; + + return 0; +} + static int tmp102_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *temp) { @@ -128,12 +139,18 @@ static int tmp102_write(struct device *dev, enum hwmon_sensor_types type, static umode_t tmp102_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, int channel) { + const struct tmp102 *tmp102 = data; + if (type != hwmon_temp) return 0; switch (attr) { case hwmon_temp_input: return 0444; + case hwmon_temp_label: + if (tmp102->label) + return 0444; + return 0; case hwmon_temp_max_hyst: case hwmon_temp_max: return 0644; @@ -146,12 +163,13 @@ static const struct hwmon_channel_info * const tmp102_info[] = { HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ), HWMON_CHANNEL_INFO(temp, - HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MAX_HYST), + HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_MAX | HWMON_T_MAX_HYST), NULL }; static const struct hwmon_ops tmp102_hwmon_ops = { .is_visible = tmp102_is_visible, + .read_string = tmp102_read_string, .read = tmp102_read, .write = tmp102_write, }; @@ -213,6 +231,8 @@ static int tmp102_probe(struct i2c_client *client) if (!tmp102) return -ENOMEM; + of_property_read_string(dev->of_node, "label", &tmp102->label); + i2c_set_clientdata(client, tmp102); tmp102->regmap = devm_regmap_init_i2c(client, &tmp102_regmap_config); diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 8267dd1a2130d3..8f426f94e32a15 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -23,7 +23,8 @@ #include "coresight-self-hosted-trace.h" #include "coresight-trbe.h" -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ + ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* * A padding packet that will help the user space tools diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 070d014fdc5d5d..e0c2c2ab0aa373 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -165,6 +165,7 @@ config I2C_I801 Birch Stream (SOC) Arrow Lake (SOC) Panther Lake (SOC) + Wildcat Lake (SOC) This driver can also be built as a module. If so, the module will be called i2c-i801. 
@@ -1357,6 +1358,16 @@ config I2C_LJCA This driver can also be built as a module. If so, the module will be called i2c-ljca. +config I2C_NCT6694 + tristate "Nuvoton NCT6694 I2C adapter support" + depends on MFD_NCT6694 + help + If you say yes to this option, support will be included for Nuvoton + NCT6694, a USB to I2C interface. + + This driver can also be built as a module. If so, the module will + be called i2c-nct6694. + config I2C_CP2615 tristate "Silicon Labs CP2615 USB sound card and I2C adapter" depends on USB diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 04db855fdfd66f..fe8cf6325fc989 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -135,6 +135,7 @@ obj-$(CONFIG_I2C_GXP) += i2c-gxp.o obj-$(CONFIG_I2C_DIOLAN_U2C) += i2c-diolan-u2c.o obj-$(CONFIG_I2C_DLN2) += i2c-dln2.o obj-$(CONFIG_I2C_LJCA) += i2c-ljca.o +obj-$(CONFIG_I2C_NCT6694) += i2c-nct6694.o obj-$(CONFIG_I2C_CP2615) += i2c-cp2615.o obj-$(CONFIG_I2C_PARPORT) += i2c-parport.o obj-$(CONFIG_I2C_PCI1XXXX) += i2c-mchp-pci1xxxx.o diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index cbd88ffa561010..c7a72c28786c2b 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -1068,11 +1068,10 @@ int i2c_dw_probe_master(struct dw_i2c_dev *dev) if (!(dev->flags & ACCESS_POLLING)) { ret = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr, irq_flags, dev_name(dev->dev), dev); - if (ret) { - dev_err(dev->dev, "failure requesting irq %i: %d\n", - dev->irq, ret); - return ret; - } + if (ret) + return dev_err_probe(dev->dev, ret, + "failure requesting irq %i: %d\n", + dev->irq, ret); } ret = i2c_dw_init_recovery_info(dev); diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index a35e4c64a1d46f..34d881572351cc 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -238,7 +238,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) dev->rst = devm_reset_control_get_optional_exclusive(device, NULL); if (IS_ERR(dev->rst)) - return PTR_ERR(dev->rst); + return dev_err_probe(device, PTR_ERR(dev->rst), "failed to acquire reset\n"); reset_control_deassert(dev->rst); @@ -247,21 +247,23 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) goto exit_reset; ret = i2c_dw_probe_lock_support(dev); - if (ret) + if (ret) { + ret = dev_err_probe(device, ret, "failed to probe lock support\n"); goto exit_reset; + } i2c_dw_configure(dev); /* Optional interface clock */ dev->pclk = devm_clk_get_optional(device, "pclk"); if (IS_ERR(dev->pclk)) { - ret = PTR_ERR(dev->pclk); + ret = dev_err_probe(device, PTR_ERR(dev->pclk), "failed to acquire pclk\n"); goto exit_reset; } dev->clk = devm_clk_get_optional(device, NULL); if (IS_ERR(dev->clk)) { - ret = PTR_ERR(dev->clk); + ret = dev_err_probe(device, PTR_ERR(dev->clk), "failed to acquire clock\n"); goto exit_reset; } @@ -314,6 +316,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) exit_probe: dw_i2c_plat_pm_cleanup(dev); + i2c_dw_prepare_clk(dev, false); exit_reset: reset_control_assert(dev->rst); return ret; @@ -331,9 +334,11 @@ static void dw_i2c_plat_remove(struct platform_device *pdev) i2c_dw_disable(dev); pm_runtime_dont_use_autosuspend(device); - pm_runtime_put_sync(device); + pm_runtime_put_noidle(device); dw_i2c_plat_pm_cleanup(dev); + i2c_dw_prepare_clk(dev, false); + i2c_dw_remove_lock_support(dev); 
reset_control_assert(dev->rst); diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index b936a240db0a93..6eb16b7d75a6d0 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -266,11 +266,10 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev) ret = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr_slave, IRQF_SHARED, dev_name(dev->dev), dev); - if (ret) { - dev_err(dev->dev, "failure requesting IRQ %i: %d\n", - dev->irq, ret); - return ret; - } + if (ret) + return dev_err_probe(dev->dev, ret, + "failure requesting IRQ %i: %d\n", + dev->irq, ret); ret = i2c_add_numbered_adapter(adap); if (ret) diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c index 370f329747637c..5358f5ddf924b8 100644 --- a/drivers/i2c/busses/i2c-hix5hd2.c +++ b/drivers/i2c/busses/i2c-hix5hd2.c @@ -339,7 +339,7 @@ static int hix5hd2_i2c_xfer_msg(struct hix5hd2_i2c_priv *priv, ret = priv->state; /* - * If this is the last message to be transfered (stop == 1) + * If this is the last message to be transferred (stop == 1) * Then check if the bus can be brought back to idle. */ if (priv->state == HIX5I2C_STAT_RW_SUCCESS && stop) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index e94ac746a741af..cba992fa655791 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -83,6 +83,7 @@ * Arrow Lake-H (SOC) 0x7722 32 hard yes yes yes * Panther Lake-H (SOC) 0xe322 32 hard yes yes yes * Panther Lake-P (SOC) 0xe422 32 hard yes yes yes + * Wildcat Lake-U (SOC) 0x4d22 32 hard yes yes yes * * Features supported by this driver: * Software PEC no @@ -236,6 +237,7 @@ #define PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_TIGERLAKE_H_SMBUS 0x43a3 #define PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS 0x4b23 +#define PCI_DEVICE_ID_INTEL_WILDCAT_LAKE_U_SMBUS 0x4d22 #define PCI_DEVICE_ID_INTEL_JASPER_LAKE_SMBUS 0x4da3 #define PCI_DEVICE_ID_INTEL_ALDER_LAKE_P_SMBUS 0x51a3 #define PCI_DEVICE_ID_INTEL_ALDER_LAKE_M_SMBUS 0x54a3 @@ -1056,6 +1058,7 @@ static const struct pci_device_id i801_ids[] = { { PCI_DEVICE_DATA(INTEL, ARROW_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_P_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, + { PCI_DEVICE_DATA(INTEL, WILDCAT_LAKE_U_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) }, { 0, } }; diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index b68a21fff0b56b..6b918770e612e0 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -3,6 +3,7 @@ * Copyright (C) 2024-2025 Troy Mitchell */ +#include #include #include #include @@ -14,6 +15,7 @@ #define SPACEMIT_ICR 0x0 /* Control register */ #define SPACEMIT_ISR 0x4 /* Status register */ #define SPACEMIT_IDBR 0xc /* Data buffer register */ +#define SPACEMIT_IRCR 0x18 /* Reset cycle counter */ #define SPACEMIT_IBMR 0x1c /* Bus monitor register */ /* SPACEMIT_ICR register fields */ @@ -25,7 +27,8 @@ #define SPACEMIT_CR_MODE_FAST BIT(8) /* bus mode (master operation) */ /* Bit 9 is reserved */ #define SPACEMIT_CR_UR BIT(10) /* unit reset */ -/* Bits 11-12 are reserved */ +#define SPACEMIT_CR_RSTREQ BIT(11) /* i2c bus reset request */ +/* Bit 12 is reserved */ #define SPACEMIT_CR_SCLE BIT(13) /* master clock enable */ #define SPACEMIT_CR_IUE BIT(14) /* unit enable */ /* Bits 15-17 are reserved */ @@ -76,6 +79,10 @@ 
 			 SPACEMIT_SR_GCAD | SPACEMIT_SR_IRF | SPACEMIT_SR_ITE | \
 			 SPACEMIT_SR_ALD)
 
+#define SPACEMIT_RCR_SDA_GLITCH_NOFIX	BIT(7) /* bypass the SDA glitch fix */
+/* the cycles of SCL during bus reset */
+#define SPACEMIT_RCR_FIELD_RST_CYC	GENMASK(3, 0)
+
 /* SPACEMIT_IBMR register fields */
 #define SPACEMIT_BMR_SDA		BIT(0)	/* SDA line level */
 #define SPACEMIT_BMR_SCL		BIT(1)	/* SCL line level */
@@ -88,6 +95,8 @@
 #define SPACEMIT_SR_ERR (SPACEMIT_SR_BED | SPACEMIT_SR_RXOV | SPACEMIT_SR_ALD)
 
+#define SPACEMIT_BUS_RESET_CLK_CNT_MAX	9
+
 enum spacemit_i2c_state {
 	SPACEMIT_STATE_IDLE,
 	SPACEMIT_STATE_START,
@@ -160,6 +169,7 @@ static int spacemit_i2c_handle_err(struct spacemit_i2c_dev *i2c)
 static void spacemit_i2c_conditionally_reset_bus(struct spacemit_i2c_dev *i2c)
 {
 	u32 status;
+	u8 clk_cnt;
 
 	/* if bus is locked, reset unit. 0: locked */
 	status = readl(i2c->base + SPACEMIT_IBMR);
@@ -169,9 +179,21 @@ static void spacemit_i2c_conditionally_reset_bus(struct spacemit_i2c_dev *i2c)
 	spacemit_i2c_reset(i2c);
 	usleep_range(10, 20);
 
-	/* check scl status again */
+	for (clk_cnt = 0; clk_cnt < SPACEMIT_BUS_RESET_CLK_CNT_MAX; clk_cnt++) {
+		status = readl(i2c->base + SPACEMIT_IBMR);
+		if (status & SPACEMIT_BMR_SDA)
+			return;
+
+		/* SDA is still stuck low: request one more SCL reset cycle */
+		writel(FIELD_PREP(SPACEMIT_RCR_FIELD_RST_CYC, 1),
+		       i2c->base + SPACEMIT_IRCR);
+		writel(SPACEMIT_CR_RSTREQ, i2c->base + SPACEMIT_ICR);
+		usleep_range(20, 30);
+	}
+
+	/* check SDA status again */
 	status = readl(i2c->base + SPACEMIT_IBMR);
-	if (!(status & SPACEMIT_BMR_SCL))
+	if (!(status & SPACEMIT_BMR_SDA))
 		dev_warn_ratelimited(i2c->dev, "unit reset failed\n");
 }
 
@@ -237,6 +259,14 @@ static void spacemit_i2c_init(struct spacemit_i2c_dev *i2c)
 		val |= SPACEMIT_CR_MSDE | SPACEMIT_CR_MSDIE;
 
 	writel(val, i2c->base + SPACEMIT_ICR);
+
+	/*
+	 * The glitch fix in the K1 I2C controller introduces a delay
+	 * on restart signals, so we disable the fix here.
+ */ + val = readl(i2c->base + SPACEMIT_IRCR); + val |= SPACEMIT_RCR_SDA_GLITCH_NOFIX; + writel(val, i2c->base + SPACEMIT_IRCR); } static inline void @@ -267,19 +297,6 @@ static void spacemit_i2c_start(struct spacemit_i2c_dev *i2c) writel(val, i2c->base + SPACEMIT_ICR); } -static void spacemit_i2c_stop(struct spacemit_i2c_dev *i2c) -{ - u32 val; - - val = readl(i2c->base + SPACEMIT_ICR); - val |= SPACEMIT_CR_STOP | SPACEMIT_CR_ALDIE | SPACEMIT_CR_TB; - - if (i2c->read) - val |= SPACEMIT_CR_ACKNAK; - - writel(val, i2c->base + SPACEMIT_ICR); -} - static int spacemit_i2c_xfer_msg(struct spacemit_i2c_dev *i2c) { unsigned long time_left; @@ -412,7 +429,6 @@ static irqreturn_t spacemit_i2c_irq_handler(int irq, void *devid) val = readl(i2c->base + SPACEMIT_ICR); val &= ~(SPACEMIT_CR_TB | SPACEMIT_CR_ACKNAK | SPACEMIT_CR_STOP | SPACEMIT_CR_START); - writel(val, i2c->base + SPACEMIT_ICR); switch (i2c->state) { case SPACEMIT_STATE_START: @@ -429,14 +445,16 @@ static irqreturn_t spacemit_i2c_irq_handler(int irq, void *devid) } if (i2c->state != SPACEMIT_STATE_IDLE) { + val |= SPACEMIT_CR_TB | SPACEMIT_CR_ALDIE; + if (spacemit_i2c_is_last_msg(i2c)) { /* trigger next byte with stop */ - spacemit_i2c_stop(i2c); - } else { - /* trigger next byte */ - val |= SPACEMIT_CR_ALDIE | SPACEMIT_CR_TB; - writel(val, i2c->base + SPACEMIT_ICR); + val |= SPACEMIT_CR_STOP; + + if (i2c->read) + val |= SPACEMIT_CR_ACKNAK; } + writel(val, i2c->base + SPACEMIT_ICR); } err_out: @@ -476,12 +494,13 @@ static int spacemit_i2c_xfer(struct i2c_adapter *adapt, struct i2c_msg *msgs, in spacemit_i2c_enable(i2c); ret = spacemit_i2c_wait_bus_idle(i2c); - if (!ret) + if (!ret) { ret = spacemit_i2c_xfer_msg(i2c); - else if (ret < 0) - dev_dbg(i2c->dev, "i2c transfer error: %d\n", ret); - else + if (ret < 0) + dev_dbg(i2c->dev, "i2c transfer error: %d\n", ret); + } else { spacemit_i2c_check_bus_release(i2c); + } spacemit_i2c_disable(i2c); diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index ab456c3717db18..dee40704825cb4 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -1243,6 +1243,7 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, { int ret; int left_num = num; + bool write_then_read_en = false; struct mtk_i2c *i2c = i2c_get_adapdata(adap); ret = clk_bulk_enable(I2C_MT65XX_CLK_MAX, i2c->clocks); @@ -1256,6 +1257,7 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, if (!(msgs[0].flags & I2C_M_RD) && (msgs[1].flags & I2C_M_RD) && msgs[0].addr == msgs[1].addr) { i2c->auto_restart = 0; + write_then_read_en = true; } } @@ -1280,12 +1282,10 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, else i2c->op = I2C_MASTER_WR; - if (!i2c->auto_restart) { - if (num > 1) { - /* combined two messages into one transaction */ - i2c->op = I2C_MASTER_WRRD; - left_num--; - } + if (write_then_read_en) { + /* combined two messages into one transaction */ + i2c->op = I2C_MASTER_WRRD; + left_num--; } /* always use DMA mode. 
*/ @@ -1293,7 +1293,10 @@ static int mtk_i2c_transfer(struct i2c_adapter *adap, if (ret < 0) goto err_exit; - msgs++; + if (i2c->op == I2C_MASTER_WRRD) + msgs += 2; + else + msgs++; } /* the return value is number of executed messages */ ret = num; diff --git a/drivers/i2c/busses/i2c-nct6694.c b/drivers/i2c/busses/i2c-nct6694.c new file mode 100644 index 00000000000000..1413ab6f94628b --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6694.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 I2C adapter driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 I2C controller. + * This defines the module type used for communication with the NCT6694 + * I2C controller over the USB interface. + */ +#define NCT6694_I2C_MOD 0x03 + +/* Command 00h - I2C Deliver */ +#define NCT6694_I2C_DELIVER 0x00 +#define NCT6694_I2C_DELIVER_SEL 0x00 + +#define NCT6694_I2C_MAX_XFER_SIZE 64 +#define NCT6694_I2C_MAX_DEVS 6 + +static unsigned char br_reg[NCT6694_I2C_MAX_DEVS] = {[0 ... (NCT6694_I2C_MAX_DEVS - 1)] = 0xFF}; + +module_param_array(br_reg, byte, NULL, 0644); +MODULE_PARM_DESC(br_reg, + "I2C Baudrate register per adapter: (0=25K, 1=50K, 2=100K, 3=200K, 4=400K, 5=800K, 6=1M), default=2"); + +enum nct6694_i2c_baudrate { + NCT6694_I2C_BR_25K = 0, + NCT6694_I2C_BR_50K, + NCT6694_I2C_BR_100K, + NCT6694_I2C_BR_200K, + NCT6694_I2C_BR_400K, + NCT6694_I2C_BR_800K, + NCT6694_I2C_BR_1M +}; + +struct __packed nct6694_i2c_deliver { + u8 port; + u8 br; + u8 addr; + u8 w_cnt; + u8 r_cnt; + u8 rsv[11]; + u8 write_data[NCT6694_I2C_MAX_XFER_SIZE]; + u8 read_data[NCT6694_I2C_MAX_XFER_SIZE]; +}; + +struct nct6694_i2c_data { + struct device *dev; + struct nct6694 *nct6694; + struct i2c_adapter adapter; + struct nct6694_i2c_deliver deliver; + unsigned char port; + unsigned char br; +}; + +static int nct6694_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) +{ + struct nct6694_i2c_data *data = adap->algo_data; + struct nct6694_i2c_deliver *deliver = &data->deliver; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_I2C_MOD, + .cmd = NCT6694_I2C_DELIVER, + .sel = NCT6694_I2C_DELIVER_SEL, + .len = cpu_to_le16(sizeof(*deliver)) + }; + int ret, i; + + for (i = 0; i < num; i++) { + struct i2c_msg *msg_temp = &msgs[i]; + + memset(deliver, 0, sizeof(*deliver)); + + deliver->port = data->port; + deliver->br = data->br; + deliver->addr = i2c_8bit_addr_from_msg(msg_temp); + if (msg_temp->flags & I2C_M_RD) { + deliver->r_cnt = msg_temp->len; + ret = nct6694_write_msg(data->nct6694, &cmd_hd, deliver); + if (ret < 0) + return ret; + + memcpy(msg_temp->buf, deliver->read_data, msg_temp->len); + } else { + deliver->w_cnt = msg_temp->len; + memcpy(deliver->write_data, msg_temp->buf, msg_temp->len); + ret = nct6694_write_msg(data->nct6694, &cmd_hd, deliver); + if (ret < 0) + return ret; + } + } + + return num; +} + +static u32 nct6694_i2c_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; +} + +static const struct i2c_adapter_quirks nct6694_i2c_quirks = { + .max_read_len = NCT6694_I2C_MAX_XFER_SIZE, + .max_write_len = NCT6694_I2C_MAX_XFER_SIZE, +}; + +static const struct i2c_algorithm nct6694_i2c_algo = { + .xfer = nct6694_i2c_xfer, + .functionality = nct6694_i2c_func, +}; + +static int nct6694_i2c_set_baudrate(struct nct6694_i2c_data *data) +{ + if (data->port >= NCT6694_I2C_MAX_DEVS) { + dev_err(data->dev, "Invalid I2C 
port index %d\n", data->port); + return -EINVAL; + } + + if (br_reg[data->port] > NCT6694_I2C_BR_1M) { + dev_warn(data->dev, "Invalid baudrate %d for I2C%d, using 100K\n", + br_reg[data->port], data->port); + br_reg[data->port] = NCT6694_I2C_BR_100K; + } + + data->br = br_reg[data->port]; + + return 0; +} + +static void nct6694_i2c_ida_free(void *d) +{ + struct nct6694_i2c_data *data = d; + struct nct6694 *nct6694 = data->nct6694; + + ida_free(&nct6694->i2c_ida, data->port); +} + +static int nct6694_i2c_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct nct6694 *nct6694 = dev_get_drvdata(dev->parent); + struct nct6694_i2c_data *data; + int ret; + + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->dev = dev; + data->nct6694 = nct6694; + + ret = ida_alloc(&nct6694->i2c_ida, GFP_KERNEL); + if (ret < 0) + return ret; + data->port = ret; + + ret = devm_add_action_or_reset(dev, nct6694_i2c_ida_free, data); + if (ret) + return ret; + + ret = nct6694_i2c_set_baudrate(data); + if (ret) + return ret; + + sprintf(data->adapter.name, "NCT6694 I2C Adapter %d", data->port); + data->adapter.owner = THIS_MODULE; + data->adapter.algo = &nct6694_i2c_algo; + data->adapter.quirks = &nct6694_i2c_quirks; + data->adapter.dev.parent = dev; + data->adapter.algo_data = data; + + platform_set_drvdata(pdev, data); + + return devm_i2c_add_adapter(dev, &data->adapter); +} + +static struct platform_driver nct6694_i2c_driver = { + .driver = { + .name = "nct6694-i2c", + }, + .probe = nct6694_i2c_probe, +}; + +module_platform_driver(nct6694_i2c_driver); + +MODULE_DESCRIPTION("USB-I2C adapter driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-i2c"); diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c index 9c164a4b9bb91c..b0ee9ac45a976d 100644 --- a/drivers/i2c/busses/i2c-riic.c +++ b/drivers/i2c/busses/i2c-riic.c @@ -386,7 +386,7 @@ static int riic_init_hw(struct riic_dev *riic) */ total_ticks = DIV_ROUND_UP(rate, t->bus_freq_hz ?: 1); - for (cks = 0; cks < 7; cks++) { + for (cks = 0; cks <= 7; cks++) { /* * 60% low time must be less than BRL + 2 + 1 * BRL max register value is 0x1F. 
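The i2c-riic hunk above is an off-by-one fix: CKS is a 3-bit divider select, so the reference clock rate/2^cks has eight valid settings and the scan must cover cks = 0..7 inclusive. The following standalone sketch illustrates the scan under the rate/2^cks divider model the loop implies; the parent clock value and all names here are illustrative assumptions, not the driver's.

#include <stdio.h>

int main(void)
{
	unsigned long rate = 50000000;		/* assumed parent clock, Hz */
	unsigned long bus_freq_hz = 100000;	/* target I2C rate */
	/* same rounding as DIV_ROUND_UP(rate, bus_freq_hz) */
	unsigned long total_ticks = (rate + bus_freq_hz - 1) / bus_freq_hz;
	int cks;

	/* a "cks < 7" bound would silently skip the slowest divider */
	for (cks = 0; cks <= 7; cks++)
		printf("cks=%d: %lu ticks of rate/2^%d per SCL period\n",
		       cks, total_ticks >> cks, cks);

	return 0;
}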
diff --git a/drivers/i2c/busses/i2c-rtl9300.c b/drivers/i2c/busses/i2c-rtl9300.c index 9e1f71fed0feac..af991b28e4f835 100644 --- a/drivers/i2c/busses/i2c-rtl9300.c +++ b/drivers/i2c/busses/i2c-rtl9300.c @@ -307,8 +307,7 @@ static int rtl9300_i2c_smbus_xfer(struct i2c_adapter *adap, u16 addr, unsigned s static u32 rtl9300_i2c_func(struct i2c_adapter *a) { return I2C_FUNC_SMBUS_BYTE | I2C_FUNC_SMBUS_BYTE_DATA | - I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA | - I2C_FUNC_SMBUS_I2C_BLOCK; + I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA; } static const struct i2c_algorithm rtl9300_i2c_algo = { diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index f4fa4703acbd0e..8138f5ef40f06b 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -138,7 +138,6 @@ static void i2c_s3c_irq_nextbyte(struct s3c24xx_i2c *i2c, unsigned long iicstat) #ifdef CONFIG_OF static const struct of_device_id s3c24xx_i2c_match[] = { - { .compatible = "samsung,s3c2410-i2c", .data = (void *)0 }, { .compatible = "samsung,s3c2440-i2c", .data = (void *)QUIRK_S3C2440 }, { .compatible = "samsung,s3c2440-hdmiphy-i2c", .data = (void *)(QUIRK_S3C2440 | QUIRK_HDMIPHY | QUIRK_NO_GPIO) }, diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 56b2e5c5fb49aa..26ec34b19ad51a 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -425,7 +425,7 @@ static irqreturn_t sprd_i2c_isr(int irq, void *dev_id) * If we did not get one ACK from target when writing data, then we * should finish this transmission since we got some errors. * - * When writing data, if i2c_tran == 0 which means we have writen + * When writing data, if i2c_tran == 0 which means we have written * done all data, then we can finish this transmission. 
* * When reading data, if conut < rx fifo full threshold, which diff --git a/drivers/i2c/busses/i2c-st.c b/drivers/i2c/busses/i2c-st.c index bf28f8e3ee6bda..97d70e66722706 100644 --- a/drivers/i2c/busses/i2c-st.c +++ b/drivers/i2c/busses/i2c-st.c @@ -152,7 +152,7 @@ struct st_i2c_timings { /** * struct st_i2c_client - client specific data * @addr: 8-bit target addr, including r/w bit - * @count: number of bytes to be transfered + * @count: number of bytes to be transferred * @xfered: number of bytes already transferred * @buf: data buffer * @result: result of the transfer diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 4eb31b913c1a7d..e533460bccc39e 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1649,7 +1649,33 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_interface_timing_reg = true, }; +static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 7, + .clk_divisor_std_mode = 0x7a, + .clk_divisor_fast_mode = 0x40, + .clk_divisor_fast_plus_mode = 0x19, + .has_config_load_reg = true, + .has_multi_master_mode = true, + .has_slcg_override_reg = true, + .has_mst_fifo = true, + .has_mst_reset = true, + .quirks = &tegra194_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = false, + .tlow_std_mode = 0x8, + .thigh_std_mode = 0x7, + .tlow_fast_fastplus_mode = 0x3, + .thigh_fast_fastplus_mode = 0x3, + .setup_hold_time_std_mode = 0x08080808, + .setup_hold_time_fast_fast_plus_mode = 0x02020202, + .setup_hold_time_hs_mode = 0x090909, + .has_interface_timing_reg = true, +}; + static const struct of_device_id tegra_i2c_of_match[] = { + { .compatible = "nvidia,tegra256-i2c", .data = &tegra256_i2c_hw, }, { .compatible = "nvidia,tegra194-i2c", .data = &tegra194_i2c_hw, }, { .compatible = "nvidia,tegra186-i2c", .data = &tegra186_i2c_hw, }, #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) diff --git a/drivers/i2c/busses/i2c-viperboard.c b/drivers/i2c/busses/i2c-viperboard.c index 1bd602852e35c1..f596efcc291c22 100644 --- a/drivers/i2c/busses/i2c-viperboard.c +++ b/drivers/i2c/busses/i2c-viperboard.c @@ -204,7 +204,7 @@ static int vprbrd_i2c_read(struct vprbrd *vb, struct i2c_msg *msg) /* copy the received data */ memcpy(msg->buf + start, rmsg, len1); - /* second read transfer if neccessary */ + /* second read transfer if necessary */ if (len2 > 0) { ret = vprbrd_i2c_receive(vb->usb_dev, rmsg, len2); if (ret < 0) diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 4df8ad092df383..338800321f8b66 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num; +int __i2c_first_dynamic_bus_num __ro_after_init; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied. 
*/ -int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index ecca8c006b0203..ae7e9c8b65a65c 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -573,7 +573,8 @@ static int i2c_device_probe(struct device *dev) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); - status = dev_pm_domain_attach(&client->dev, do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0); + status = dev_pm_domain_attach(&client->dev, PD_FLAG_DETACH_POWER_OFF | + (do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0)); if (status) goto err_clear_wakeup_irq; @@ -581,7 +582,7 @@ static int i2c_device_probe(struct device *dev) GFP_KERNEL); if (!client->devres_group_id) { status = -ENOMEM; - goto err_detach_pm_domain; + goto err_clear_wakeup_irq; } client->debugfs = debugfs_create_dir(dev_name(&client->dev), @@ -608,8 +609,6 @@ static int i2c_device_probe(struct device *dev) err_release_driver_resources: debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); -err_detach_pm_domain: - dev_pm_domain_detach(&client->dev, do_power_on); err_clear_wakeup_irq: dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); @@ -636,8 +635,6 @@ static void i2c_device_remove(struct device *dev) devres_release_group(&client->dev, client->devres_group_id); - dev_pm_domain_detach(&client->dev, true); - dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); diff --git a/drivers/i2c/i2c-core-slave.c b/drivers/i2c/i2c-core-slave.c index 7ee6b992b835f5..02ca55c2246bcf 100644 --- a/drivers/i2c/i2c-core-slave.c +++ b/drivers/i2c/i2c-core-slave.c @@ -112,10 +112,9 @@ bool i2c_detect_slave_mode(struct device *dev) struct fwnode_handle *fwnode = dev_fwnode(dev); if (is_of_node(fwnode)) { - struct fwnode_handle *child __free(fwnode_handle) = NULL; u32 reg; - fwnode_for_each_child_node(fwnode, child) { + fwnode_for_each_child_node_scoped(fwnode, child) { fwnode_property_read_u32(child, "reg", ®); if (reg & I2C_OWN_SLAVE_ADDRESS) return true; diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 4d8690981a55dc..d59644e50f14d0 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -241,12 +241,9 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, muxc->parent = parent; muxc->dev = dev; - if (flags & I2C_MUX_LOCKED) - muxc->mux_locked = true; - if (flags & I2C_MUX_ARBITRATOR) - muxc->arbitrator = true; - if (flags & I2C_MUX_GATE) - muxc->gate = true; + muxc->mux_locked = !!(flags & I2C_MUX_LOCKED); + muxc->arbitrator = !!(flags & I2C_MUX_ARBITRATOR); + muxc->gate = !!(flags & I2C_MUX_GATE); muxc->select = select; muxc->deselect = deselect; muxc->max_adapters = max_adapters; diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 8663c8a7c26936..3d8002caf7031b 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -63,10 +63,6 @@ #define mybus(x) (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS) #define busoff(x) (!((x) & BUSON) || ((x) & BUSON) == BUSON) -/* arbitration timeouts, in jiffies */ -#define ARB_TIMEOUT (HZ / 8) /* 125 ms until forcing bus ownership */ -#define ARB2_TIMEOUT (HZ / 4) /* 250 ms until acquisition failure */ - /* arbitration retry delays, in us */ #define SELECT_DELAY_SHORT 50 #define 
SELECT_DELAY_LONG 1000 @@ -229,6 +225,9 @@ static int pca9541_arbitrate(struct i2c_client *client) */ data->select_timeout = SELECT_DELAY_LONG; if (time_is_before_eq_jiffies(data->arb_timeout)) { + dev_warn(&client->dev, + "Arbitration timeout on I2C bus, forcing bus ownership\n"); + /* Time is up, take the bus and reset it. */ pca9541_reg_write(client, PCA9541_CONTROL, @@ -251,10 +250,10 @@ static int pca9541_select_chan(struct i2c_mux_core *muxc, u32 chan) struct pca9541 *data = i2c_mux_priv(muxc); struct i2c_client *client = data->client; int ret; - unsigned long timeout = jiffies + ARB2_TIMEOUT; + unsigned long timeout = jiffies + (2 * client->adapter->timeout); /* give up after this time */ - data->arb_timeout = jiffies + ARB_TIMEOUT; + data->arb_timeout = jiffies + client->adapter->timeout; /* force bus ownership after this time */ do { @@ -267,6 +266,7 @@ static int pca9541_select_chan(struct i2c_mux_core *muxc, u32 chan) else msleep(data->select_timeout / 1000); } while (time_is_after_eq_jiffies(timeout)); + dev_warn(&client->dev, "Failed to acquire I2C bus, timed out\n"); return -ETIMEDOUT; } diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index b9f370c9f018c2..75c8d08fa24e56 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -118,7 +118,6 @@ struct pca954x { raw_spinlock_t lock; struct regulator *supply; - struct gpio_desc *reset_gpio; struct reset_control *reset_cont; }; @@ -316,6 +315,25 @@ static u8 pca954x_regval(struct pca954x *data, u8 chan) return 1 << chan; } +static void pca954x_reset_assert(struct pca954x *data) +{ + if (data->reset_cont) + reset_control_assert(data->reset_cont); +} + +static void pca954x_reset_deassert(struct pca954x *data) +{ + if (data->reset_cont) + reset_control_deassert(data->reset_cont); +} + +static void pca954x_reset_mux(struct pca954x *data) +{ + pca954x_reset_assert(data); + udelay(1); + pca954x_reset_deassert(data); +} + static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan) { struct pca954x *data = i2c_mux_priv(muxc); @@ -329,6 +347,8 @@ static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan) ret = pca954x_reg_write(muxc->parent, client, regval); data->last_chan = ret < 0 ? 
0 : regval;
 	}
+	if (ret == -ETIMEDOUT && data->reset_cont)
+		pca954x_reset_mux(data);
 
 	return ret;
 }
@@ -338,6 +358,7 @@ static int pca954x_deselect_mux(struct i2c_mux_core *muxc, u32 chan)
 	struct pca954x *data = i2c_mux_priv(muxc);
 	struct i2c_client *client = data->client;
 	s32 idle_state;
+	int ret = 0;
 
 	idle_state = READ_ONCE(data->idle_state);
 	if (idle_state >= 0)
@@ -347,8 +368,10 @@
 	if (idle_state == MUX_IDLE_DISCONNECT) {
 		/* Deselect active channel */
 		data->last_chan = 0;
-		return pca954x_reg_write(muxc->parent, client,
-					 data->last_chan);
+		ret = pca954x_reg_write(muxc->parent, client,
+					data->last_chan);
+		if (ret == -ETIMEDOUT && data->reset_cont)
+			pca954x_reset_mux(data);
 	}
 
 	/* otherwise leave as-is */
@@ -527,29 +550,10 @@ static int pca954x_get_reset(struct device *dev, struct pca954x *data)
 	if (IS_ERR(data->reset_cont))
 		return dev_err_probe(dev, PTR_ERR(data->reset_cont),
 				     "Failed to get reset\n");
-	else if (data->reset_cont)
-		return 0;
-
-	/*
-	 * fallback to legacy reset-gpios
-	 */
-	data->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
-	if (IS_ERR(data->reset_gpio)) {
-		return dev_err_probe(dev, PTR_ERR(data->reset_gpio),
-				     "Failed to get reset gpio");
-	}
 
 	return 0;
 }
 
-static void pca954x_reset_deassert(struct pca954x *data)
-{
-	if (data->reset_cont)
-		reset_control_deassert(data->reset_cont);
-	else
-		gpiod_set_value_cansleep(data->reset_gpio, 0);
-}
-
 /*
  * I2C init/probing/exit functions
  */
@@ -589,7 +593,7 @@ static int pca954x_probe(struct i2c_client *client)
 		if (ret)
 			goto fail_cleanup;
 
-	if (data->reset_cont || data->reset_gpio) {
+	if (data->reset_cont) {
 		udelay(1);
 		pca954x_reset_deassert(data);
 		/* Give the chip some time to recover. */
diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h
index 0d857cc68cc5d4..79ceaa5f5afd6f 100644
--- a/drivers/i3c/internals.h
+++ b/drivers/i3c/internals.h
@@ -38,7 +38,11 @@ static inline void i3c_writel_fifo(void __iomem *addr, const void *buf,
 		u32 tmp = 0;
 
 		memcpy(&tmp, buf + (nbytes & ~3), nbytes & 3);
-		writel(tmp, addr);
+		/*
+		 * writesl() instead of writel() to keep FIFO
+		 * byteorder on big-endian targets
+		 */
+		writesl(addr, &tmp, 1);
 	}
 }
 
@@ -55,7 +59,11 @@ static inline void i3c_readl_fifo(const void __iomem *addr, void *buf,
 	if (nbytes & 3) {
 		u32 tmp;
 
-		tmp = readl(addr);
+		/*
+		 * readsl() instead of readl() to keep FIFO
+		 * byteorder on big-endian targets
+		 */
+		readsl(addr, &tmp, 1);
 		memcpy(buf + (nbytes & ~3), &tmp, nbytes & 3);
 	}
 }
diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 2ef898a8fd8065..d946db75df7068 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1727,6 +1728,79 @@ int i3c_master_do_daa(struct i3c_master_controller *master)
 }
 EXPORT_SYMBOL_GPL(i3c_master_do_daa);
 
+/**
+ * i3c_master_dma_map_single() - Map buffer for single DMA transfer
+ * @dev: device object of a device doing DMA
+ * @buf: destination/source buffer for DMA
+ * @len: length of transfer
+ * @force_bounce: true: force use of a bounce buffer;
+ *		  false: automatically check whether a bounce buffer is required
+ * @dir: DMA direction
+ *
+ * Map a buffer for a DMA transfer and allocate a bounce buffer if required.
+ *
+ * Return: I3C DMA transfer descriptor or NULL in case of error.
+ */ +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *buf, + size_t len, bool force_bounce, enum dma_data_direction dir) +{ + struct i3c_dma *dma_xfer __free(kfree) = NULL; + void *bounce __free(kfree) = NULL; + void *dma_buf = buf; + + dma_xfer = kzalloc(sizeof(*dma_xfer), GFP_KERNEL); + if (!dma_xfer) + return NULL; + + dma_xfer->dev = dev; + dma_xfer->buf = buf; + dma_xfer->dir = dir; + dma_xfer->len = len; + dma_xfer->map_len = len; + + if (is_vmalloc_addr(buf)) + force_bounce = true; + + if (force_bounce) { + dma_xfer->map_len = ALIGN(len, cache_line_size()); + if (dir == DMA_FROM_DEVICE) + bounce = kzalloc(dma_xfer->map_len, GFP_KERNEL); + else + bounce = kmemdup(buf, dma_xfer->map_len, GFP_KERNEL); + if (!bounce) + return NULL; + dma_buf = bounce; + } + + dma_xfer->addr = dma_map_single(dev, dma_buf, dma_xfer->map_len, dir); + if (dma_mapping_error(dev, dma_xfer->addr)) + return NULL; + + dma_xfer->bounce_buf = no_free_ptr(bounce); + return no_free_ptr(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_map_single); + +/** + * i3c_master_dma_unmap_single() - Unmap buffer after DMA + * @dma_xfer: DMA transfer and mapping descriptor + * + * Unmap buffer and cleanup DMA transfer descriptor. + */ +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer) +{ + dma_unmap_single(dma_xfer->dev, dma_xfer->addr, + dma_xfer->map_len, dma_xfer->dir); + if (dma_xfer->bounce_buf) { + if (dma_xfer->dir == DMA_FROM_DEVICE) + memcpy(dma_xfer->buf, dma_xfer->bounce_buf, + dma_xfer->len); + kfree(dma_xfer->bounce_buf); + } + kfree(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_unmap_single); + /** * i3c_master_set_info() - set master device information * @master: master used to send frames on the bus @@ -2490,9 +2564,7 @@ static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master) adap->owner = master->dev.parent->driver->owner; adap->algo = &i3c_master_i2c_algo; strscpy(adap->name, dev_name(master->dev.parent), sizeof(adap->name)); - - /* FIXME: Should we allow i3c masters to override these values? */ - adap->timeout = 1000; + adap->timeout = HZ; adap->retries = 3; id = of_alias_get_id(master->dev.of_node, "i2c"); diff --git a/drivers/i3c/master/Kconfig b/drivers/i3c/master/Kconfig index 13df2944f2ec9a..82cf330778d5a7 100644 --- a/drivers/i3c/master/Kconfig +++ b/drivers/i3c/master/Kconfig @@ -1,4 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only +config ADI_I3C_MASTER + tristate "Analog Devices I3C master driver" + depends on HAS_IOMEM + help + Support for Analog Devices I3C Controller IP, an AXI-interfaced IP + core that supports I3C and I2C devices, multiple speed-grades and I3C + IBIs. + + This driver can also be built as a module. If so, the module will be + called adi-i3c-master. 
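The i3c_master_dma_map_single()/i3c_master_dma_unmap_single() helpers added to drivers/i3c/master.c above are easiest to read through a usage sketch. Only the two helpers, their kdoc'd signatures, and the dma_xfer->addr/bounce-buffer behavior come from the patch; example_rx_dma(), the include paths, and the controller-programming step are assumptions for illustration, not part of the series.

#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/errno.h>
#include <linux/i3c/master.h>	/* assumed location of the declarations */

/* Hypothetical receive path in a controller driver. */
static int example_rx_dma(struct device *dev, void *buf, size_t len)
{
	struct i3c_dma *dma_xfer;

	/* force_bounce=false: the helper still bounces vmalloc()ed buffers */
	dma_xfer = i3c_master_dma_map_single(dev, buf, len, false,
					     DMA_FROM_DEVICE);
	if (!dma_xfer)
		return -ENOMEM;

	/*
	 * ... program the controller with dma_xfer->addr and run the
	 * transfer; hardware-specific and omitted here ...
	 */

	/* unmap; for DMA_FROM_DEVICE this copies bounce data back to buf */
	i3c_master_dma_unmap_single(dma_xfer);

	return 0;
}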
+ config CDNS_I3C_MASTER tristate "Cadence I3C master driver" depends on HAS_IOMEM diff --git a/drivers/i3c/master/Makefile b/drivers/i3c/master/Makefile index aac74f3e385144..816a227b6f7acf 100644 --- a/drivers/i3c/master/Makefile +++ b/drivers/i3c/master/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ADI_I3C_MASTER) += adi-i3c-master.o obj-$(CONFIG_CDNS_I3C_MASTER) += i3c-master-cdns.o obj-$(CONFIG_DW_I3C_MASTER) += dw-i3c-master.o obj-$(CONFIG_AST2600_I3C_MASTER) += ast2600-i3c-master.o diff --git a/drivers/i3c/master/adi-i3c-master.c b/drivers/i3c/master/adi-i3c-master.c new file mode 100644 index 00000000000000..82ac0b3d057abd --- /dev/null +++ b/drivers/i3c/master/adi-i3c-master.c @@ -0,0 +1,1019 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * I3C Controller driver + * Copyright 2025 Analog Devices Inc. + * Author: Jorge Marques + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internals.h" + +#define ADI_MAX_DEVS 16 +#define ADI_HAS_MDB_FROM_BCR(x) (FIELD_GET(BIT(2), (x))) + +#define REG_ENABLE 0x040 + +#define REG_PID_L 0x054 +#define REG_PID_H 0x058 +#define REG_DCR_BCR_DA 0x05c +#define REG_DCR_BCR_DA_GET_DA(x) FIELD_GET(GENMASK(22, 16), (x)) +#define REG_DCR_BCR_DA_GET_BCR(x) FIELD_GET(GENMASK(15, 8), (x)) +#define REG_DCR_BCR_DA_GET_DCR(x) FIELD_GET(GENMASK(7, 0), (x)) + +#define REG_IRQ_MASK 0x080 +#define REG_IRQ_PENDING 0x084 +#define REG_IRQ_PENDING_DAA BIT(7) +#define REG_IRQ_PENDING_IBI BIT(6) +#define REG_IRQ_PENDING_CMDR BIT(5) + +#define REG_CMD_FIFO 0x0d4 +#define REG_CMD_FIFO_0_IS_CCC BIT(22) +#define REG_CMD_FIFO_0_BCAST BIT(21) +#define REG_CMD_FIFO_0_SR BIT(20) +#define REG_CMD_FIFO_0_LEN(l) FIELD_PREP(GENMASK(19, 8), (l)) +#define REG_CMD_FIFO_0_DEV_ADDR(a) FIELD_PREP(GENMASK(7, 1), (a)) +#define REG_CMD_FIFO_0_RNW BIT(0) +#define REG_CMD_FIFO_1_CCC(id) FIELD_PREP(GENMASK(7, 0), (id)) + +#define REG_CMD_FIFO_ROOM 0x0c0 +#define REG_CMDR_FIFO 0x0d8 +#define REG_CMDR_FIFO_UDA_ERROR 8 +#define REG_CMDR_FIFO_NACK_RESP 6 +#define REG_CMDR_FIFO_CE2_ERROR 4 +#define REG_CMDR_FIFO_CE0_ERROR 1 +#define REG_CMDR_FIFO_NO_ERROR 0 +#define REG_CMDR_FIFO_ERROR(x) FIELD_GET(GENMASK(23, 20), (x)) +#define REG_CMDR_FIFO_XFER_BYTES(x) FIELD_GET(GENMASK(19, 8), (x)) + +#define REG_SDO_FIFO 0x0dc +#define REG_SDO_FIFO_ROOM 0x0c8 +#define REG_SDI_FIFO 0x0e0 +#define REG_IBI_FIFO 0x0e4 +#define REG_FIFO_STATUS 0x0e8 +#define REG_FIFO_STATUS_CMDR_EMPTY BIT(0) +#define REG_FIFO_STATUS_IBI_EMPTY BIT(1) + +#define REG_OPS 0x100 +#define REG_OPS_PP_SG_MASK GENMASK(6, 5) +#define REG_OPS_SET_SG(x) FIELD_PREP(REG_OPS_PP_SG_MASK, (x)) + +#define REG_IBI_CONFIG 0x140 +#define REG_IBI_CONFIG_ENABLE BIT(0) +#define REG_IBI_CONFIG_LISTEN BIT(1) + +#define REG_DEV_CHAR 0x180 +#define REG_DEV_CHAR_IS_I2C BIT(0) +#define REG_DEV_CHAR_IS_ATTACHED BIT(1) +#define REG_DEV_CHAR_BCR_IBI(x) FIELD_PREP(GENMASK(3, 2), (x)) +#define REG_DEV_CHAR_WEN BIT(8) +#define REG_DEV_CHAR_ADDR(x) FIELD_PREP(GENMASK(15, 9), (x)) + +enum speed_grade {PP_SG_UNSET, PP_SG_1MHZ, PP_SG_3MHZ, PP_SG_6MHZ, PP_SG_12MHZ}; + +struct adi_i3c_cmd { + u32 cmd0; + u32 cmd1; + u32 tx_len; + const void *tx_buf; + u32 rx_len; + void *rx_buf; + u32 error; +}; + +struct adi_i3c_xfer { + struct list_head node; + struct completion comp; + int ret; + unsigned int ncmds; + unsigned int ncmds_comp; + struct adi_i3c_cmd cmds[] __counted_by(ncmds); +}; + +struct adi_i3c_master { + struct i3c_master_controller base; + 
u32 free_rr_slots; + struct { + unsigned int num_slots; + struct i3c_dev_desc **slots; + spinlock_t lock; /* Protect IBI slot access */ + } ibi; + struct { + struct list_head list; + struct adi_i3c_xfer *cur; + spinlock_t lock; /* Protect transfer */ + } xferqueue; + void __iomem *regs; + struct clk *clk; + unsigned long i3c_scl_lim; + struct { + u8 addrs[ADI_MAX_DEVS]; + u8 index; + } daa; +}; + +static inline struct adi_i3c_master *to_adi_i3c_master(struct i3c_master_controller *master) +{ + return container_of(master, struct adi_i3c_master, base); +} + +static void adi_i3c_master_wr_to_tx_fifo(struct adi_i3c_master *master, + const u8 *buf, unsigned int nbytes) +{ + unsigned int n, m; + + n = readl(master->regs + REG_SDO_FIFO_ROOM); + m = min(n, nbytes); + i3c_writel_fifo(master->regs + REG_SDO_FIFO, buf, m); +} + +static void adi_i3c_master_rd_from_rx_fifo(struct adi_i3c_master *master, + u8 *buf, unsigned int nbytes) +{ + i3c_readl_fifo(master->regs + REG_SDI_FIFO, buf, nbytes); +} + +static bool adi_i3c_master_supports_ccc_cmd(struct i3c_master_controller *m, + const struct i3c_ccc_cmd *cmd) +{ + if (cmd->ndests > 1) + return false; + + switch (cmd->id) { + case I3C_CCC_ENEC(true): + case I3C_CCC_ENEC(false): + case I3C_CCC_DISEC(true): + case I3C_CCC_DISEC(false): + case I3C_CCC_RSTDAA(true): + case I3C_CCC_RSTDAA(false): + case I3C_CCC_ENTDAA: + case I3C_CCC_SETDASA: + case I3C_CCC_SETNEWDA: + case I3C_CCC_GETMWL: + case I3C_CCC_GETMRL: + case I3C_CCC_GETPID: + case I3C_CCC_GETBCR: + case I3C_CCC_GETDCR: + case I3C_CCC_GETSTATUS: + case I3C_CCC_GETHDRCAP: + return true; + default: + break; + } + + return false; +} + +static int adi_i3c_master_disable(struct adi_i3c_master *master) +{ + writel(0, master->regs + REG_IBI_CONFIG); + + return 0; +} + +static struct adi_i3c_xfer *adi_i3c_master_alloc_xfer(struct adi_i3c_master *master, + unsigned int ncmds) +{ + struct adi_i3c_xfer *xfer; + + xfer = kzalloc(struct_size(xfer, cmds, ncmds), GFP_KERNEL); + if (!xfer) + return NULL; + + INIT_LIST_HEAD(&xfer->node); + xfer->ncmds = ncmds; + xfer->ret = -ETIMEDOUT; + + return xfer; +} + +static void adi_i3c_master_start_xfer_locked(struct adi_i3c_master *master) +{ + struct adi_i3c_xfer *xfer = master->xferqueue.cur; + unsigned int i, n, m; + + if (!xfer) + return; + + for (i = 0; i < xfer->ncmds; i++) { + struct adi_i3c_cmd *cmd = &xfer->cmds[i]; + + if (!(cmd->cmd0 & REG_CMD_FIFO_0_RNW)) + adi_i3c_master_wr_to_tx_fifo(master, cmd->tx_buf, cmd->tx_len); + } + + n = readl(master->regs + REG_CMD_FIFO_ROOM); + for (i = 0; i < xfer->ncmds; i++) { + struct adi_i3c_cmd *cmd = &xfer->cmds[i]; + + m = cmd->cmd0 & REG_CMD_FIFO_0_IS_CCC ? 
2 : 1; + if (m > n) + break; + writel(cmd->cmd0, master->regs + REG_CMD_FIFO); + if (cmd->cmd0 & REG_CMD_FIFO_0_IS_CCC) + writel(cmd->cmd1, master->regs + REG_CMD_FIFO); + n -= m; + } +} + +static void adi_i3c_master_end_xfer_locked(struct adi_i3c_master *master, + u32 pending) +{ + struct adi_i3c_xfer *xfer = master->xferqueue.cur; + int i, ret = 0; + + if (!xfer) + return; + + while (!(readl(master->regs + REG_FIFO_STATUS) & REG_FIFO_STATUS_CMDR_EMPTY)) { + struct adi_i3c_cmd *cmd; + u32 cmdr, rx_len; + + cmdr = readl(master->regs + REG_CMDR_FIFO); + + cmd = &xfer->cmds[xfer->ncmds_comp++]; + if (cmd->cmd0 & REG_CMD_FIFO_0_RNW) { + rx_len = min_t(u32, REG_CMDR_FIFO_XFER_BYTES(cmdr), cmd->rx_len); + adi_i3c_master_rd_from_rx_fifo(master, cmd->rx_buf, rx_len); + } + cmd->error = REG_CMDR_FIFO_ERROR(cmdr); + } + + for (i = 0; i < xfer->ncmds_comp; i++) { + switch (xfer->cmds[i].error) { + case REG_CMDR_FIFO_NO_ERROR: + break; + + case REG_CMDR_FIFO_CE0_ERROR: + case REG_CMDR_FIFO_CE2_ERROR: + case REG_CMDR_FIFO_NACK_RESP: + case REG_CMDR_FIFO_UDA_ERROR: + ret = -EIO; + break; + + default: + ret = -EINVAL; + break; + } + } + + xfer->ret = ret; + + if (xfer->ncmds_comp != xfer->ncmds) + return; + + complete(&xfer->comp); + + xfer = list_first_entry_or_null(&master->xferqueue.list, + struct adi_i3c_xfer, node); + if (xfer) + list_del_init(&xfer->node); + + master->xferqueue.cur = xfer; + adi_i3c_master_start_xfer_locked(master); +} + +static void adi_i3c_master_queue_xfer(struct adi_i3c_master *master, + struct adi_i3c_xfer *xfer) +{ + init_completion(&xfer->comp); + guard(spinlock_irqsave)(&master->xferqueue.lock); + if (master->xferqueue.cur) { + list_add_tail(&xfer->node, &master->xferqueue.list); + } else { + master->xferqueue.cur = xfer; + adi_i3c_master_start_xfer_locked(master); + } +} + +static void adi_i3c_master_unqueue_xfer(struct adi_i3c_master *master, + struct adi_i3c_xfer *xfer) +{ + guard(spinlock_irqsave)(&master->xferqueue.lock); + if (master->xferqueue.cur == xfer) + master->xferqueue.cur = NULL; + else + list_del_init(&xfer->node); + + writel(0x01, master->regs + REG_ENABLE); + writel(0x00, master->regs + REG_ENABLE); + writel(REG_IRQ_PENDING_CMDR, master->regs + REG_IRQ_MASK); +} + +static enum i3c_error_code adi_i3c_cmd_get_err(struct adi_i3c_cmd *cmd) +{ + switch (cmd->error) { + case REG_CMDR_FIFO_CE0_ERROR: + return I3C_ERROR_M0; + + case REG_CMDR_FIFO_CE2_ERROR: + case REG_CMDR_FIFO_NACK_RESP: + return I3C_ERROR_M2; + + default: + break; + } + + return I3C_ERROR_UNKNOWN; +} + +static int adi_i3c_master_send_ccc_cmd(struct i3c_master_controller *m, + struct i3c_ccc_cmd *cmd) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + struct adi_i3c_cmd *ccmd; + + xfer = adi_i3c_master_alloc_xfer(master, 1); + if (!xfer) + return -ENOMEM; + + ccmd = xfer->cmds; + ccmd->cmd1 = REG_CMD_FIFO_1_CCC(cmd->id); + ccmd->cmd0 = REG_CMD_FIFO_0_IS_CCC | + REG_CMD_FIFO_0_LEN(cmd->dests[0].payload.len); + + if (cmd->id & I3C_CCC_DIRECT) + ccmd->cmd0 |= REG_CMD_FIFO_0_DEV_ADDR(cmd->dests[0].addr); + + if (cmd->rnw) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = cmd->dests[0].payload.data; + ccmd->rx_len = cmd->dests[0].payload.len; + } else { + ccmd->tx_buf = cmd->dests[0].payload.data; + ccmd->tx_len = cmd->dests[0].payload.len; + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, msecs_to_jiffies(1000))) + adi_i3c_master_unqueue_xfer(master, xfer); + + cmd->err = 
adi_i3c_cmd_get_err(&xfer->cmds[0]); + + return 0; +} + +static int adi_i3c_master_priv_xfers(struct i3c_dev_desc *dev, + struct i3c_priv_xfer *xfers, + int nxfers) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + int i, ret; + + if (!nxfers) + return 0; + + xfer = adi_i3c_master_alloc_xfer(master, nxfers); + if (!xfer) + return -ENOMEM; + + for (i = 0; i < nxfers; i++) { + struct adi_i3c_cmd *ccmd = &xfer->cmds[i]; + + ccmd->cmd0 = REG_CMD_FIFO_0_DEV_ADDR(dev->info.dyn_addr); + + if (xfers[i].rnw) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = xfers[i].data.in; + ccmd->rx_len = xfers[i].len; + } else { + ccmd->tx_buf = xfers[i].data.out; + ccmd->tx_len = xfers[i].len; + } + + ccmd->cmd0 |= REG_CMD_FIFO_0_LEN(xfers[i].len); + + if (i < nxfers - 1) + ccmd->cmd0 |= REG_CMD_FIFO_0_SR; + + if (!i) + ccmd->cmd0 |= REG_CMD_FIFO_0_BCAST; + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, + msecs_to_jiffies(1000))) + adi_i3c_master_unqueue_xfer(master, xfer); + + ret = xfer->ret; + + for (i = 0; i < nxfers; i++) + xfers[i].err = adi_i3c_cmd_get_err(&xfer->cmds[i]); + + return ret; +} + +struct adi_i3c_i2c_dev_data { + struct i3c_generic_ibi_pool *ibi_pool; + u16 id; + s16 ibi; +}; + +static int adi_i3c_master_get_rr_slot(struct adi_i3c_master *master, + u8 dyn_addr) +{ + if (!master->free_rr_slots) + return -ENOSPC; + + return ffs(master->free_rr_slots) - 1; +} + +static int adi_i3c_master_reattach_i3c_dev(struct i3c_dev_desc *dev, u8 dyn_addr) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + u8 addr; + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(dyn_addr), master->regs + REG_DEV_CHAR); + writel((readl(master->regs + REG_DEV_CHAR) & + ~REG_DEV_CHAR_IS_ATTACHED) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static int adi_i3c_master_attach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + int slot; + u8 addr; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + slot = adi_i3c_master_get_rr_slot(master, dev->info.dyn_addr); + if (slot < 0) { + kfree(data); + return slot; + } + + data->id = slot; + i3c_dev_set_master_data(dev, data); + master->free_rr_slots &= ~BIT(slot); + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static void adi_i3c_master_sync_dev_char(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_dev_desc *i3cdev; + u32 bcr_ibi; + u8 addr; + + i3c_bus_for_each_i3cdev(&m->bus, i3cdev) { + addr = i3cdev->info.dyn_addr ? 
+ i3cdev->info.dyn_addr : i3cdev->info.static_addr; + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + bcr_ibi = FIELD_GET(I3C_BCR_IBI_PAYLOAD | I3C_BCR_IBI_REQ_CAP, (i3cdev->info.bcr)); + writel(readl(master->regs + REG_DEV_CHAR) | + REG_DEV_CHAR_BCR_IBI(bcr_ibi) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + } +} + +static void adi_i3c_master_detach_i3c_dev(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev); + u8 addr; + + addr = dev->info.dyn_addr ? dev->info.dyn_addr : dev->info.static_addr; + + writel(REG_DEV_CHAR_ADDR(addr), master->regs + REG_DEV_CHAR); + writel((readl(master->regs + REG_DEV_CHAR) & + ~REG_DEV_CHAR_IS_ATTACHED) | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + i3c_dev_set_master_data(dev, NULL); + master->free_rr_slots |= BIT(data->id); + kfree(data); +} + +static int adi_i3c_master_attach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + int slot; + + slot = adi_i3c_master_get_rr_slot(master, 0); + if (slot < 0) + return slot; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->id = slot; + master->free_rr_slots &= ~BIT(slot); + i2c_dev_set_master_data(dev, data); + + writel(REG_DEV_CHAR_ADDR(dev->addr) | + REG_DEV_CHAR_IS_I2C | REG_DEV_CHAR_IS_ATTACHED | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + return 0; +} + +static void adi_i3c_master_detach_i2c_dev(struct i2c_dev_desc *dev) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data = i2c_dev_get_master_data(dev); + + writel(REG_DEV_CHAR_ADDR(dev->addr) | + REG_DEV_CHAR_IS_I2C | REG_DEV_CHAR_WEN, + master->regs + REG_DEV_CHAR); + + i2c_dev_set_master_data(dev, NULL); + master->free_rr_slots |= BIT(data->id); + kfree(data); +} + +static void adi_i3c_master_bus_cleanup(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + + adi_i3c_master_disable(master); +} + +static void adi_i3c_master_upd_i3c_scl_lim(struct adi_i3c_master *master) +{ + struct i3c_master_controller *m = &master->base; + struct i3c_bus *bus = i3c_master_get_bus(m); + u8 i3c_scl_lim = 0; + struct i3c_dev_desc *dev; + u8 pp_sg; + + i3c_bus_for_each_i3cdev(bus, dev) { + u8 max_fscl; + + max_fscl = max(I3C_CCC_MAX_SDR_FSCL(dev->info.max_read_ds), + I3C_CCC_MAX_SDR_FSCL(dev->info.max_write_ds)); + + switch (max_fscl) { + case I3C_SDR1_FSCL_8MHZ: + max_fscl = PP_SG_6MHZ; + break; + case I3C_SDR2_FSCL_6MHZ: + max_fscl = PP_SG_3MHZ; + break; + case I3C_SDR3_FSCL_4MHZ: + max_fscl = PP_SG_3MHZ; + break; + case I3C_SDR4_FSCL_2MHZ: + max_fscl = PP_SG_1MHZ; + break; + case I3C_SDR0_FSCL_MAX: + default: + max_fscl = PP_SG_12MHZ; + break; + } + + if (max_fscl && + (i3c_scl_lim > max_fscl || !i3c_scl_lim)) + i3c_scl_lim = max_fscl; + } + + if (!i3c_scl_lim) + return; + + master->i3c_scl_lim = i3c_scl_lim - 1; + + pp_sg = readl(master->regs + REG_OPS) & ~REG_OPS_PP_SG_MASK; + pp_sg |= REG_OPS_SET_SG(master->i3c_scl_lim); + + writel(pp_sg, master->regs + REG_OPS); +} + +static void adi_i3c_master_get_features(struct adi_i3c_master *master, + unsigned int slot, + struct i3c_device_info *info) +{ + u32 buf; + + /* Dynamic address and PID are for 
identification only */ + memset(info, 0, sizeof(*info)); + buf = readl(master->regs + REG_DCR_BCR_DA); + info->dyn_addr = REG_DCR_BCR_DA_GET_DA(buf); + info->dcr = REG_DCR_BCR_DA_GET_DCR(buf); + info->bcr = REG_DCR_BCR_DA_GET_BCR(buf); + info->pid = readl(master->regs + REG_PID_L); + info->pid |= (u64)readl(master->regs + REG_PID_H) << 32; +} + +static int adi_i3c_master_do_daa(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + int ret, addr = 0; + u32 irq_mask; + + for (u8 i = 0; i < ADI_MAX_DEVS; i++) { + addr = i3c_master_get_free_addr(m, addr); + if (addr < 0) + return addr; + master->daa.addrs[i] = addr; + } + + irq_mask = readl(master->regs + REG_IRQ_MASK); + writel(irq_mask | REG_IRQ_PENDING_DAA, + master->regs + REG_IRQ_MASK); + + master->daa.index = 0; + ret = i3c_master_entdaa_locked(&master->base); + + writel(irq_mask, master->regs + REG_IRQ_MASK); + + /* DAA always finishes with CE2_ERROR or NACK_RESP */ + if (ret && ret != I3C_ERROR_M2) + return ret; + + /* Add I3C devices discovered */ + for (u8 i = 0; i < master->daa.index; i++) + i3c_master_add_i3c_dev_locked(m, master->daa.addrs[i]); + /* Sync retrieved devs info with the IP */ + adi_i3c_master_sync_dev_char(m); + + i3c_master_defslvs_locked(&master->base); + + adi_i3c_master_upd_i3c_scl_lim(master); + + return 0; +} + +static int adi_i3c_master_bus_init(struct i3c_master_controller *m) +{ + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_device_info info = { }; + int ret; + + ret = i3c_master_get_free_addr(m, 0); + if (ret < 0) + return ret; + + adi_i3c_master_get_features(master, 0, &info); + ret = i3c_master_set_info(&master->base, &info); + if (ret) + return ret; + + writel(REG_IBI_CONFIG_LISTEN, + master->regs + REG_IBI_CONFIG); + + return 0; +} + +static void adi_i3c_master_handle_ibi(struct adi_i3c_master *master, + u32 raw) +{ + struct adi_i3c_i2c_dev_data *data; + struct i3c_ibi_slot *slot; + struct i3c_dev_desc *dev; + u8 da, id, mdb, len; + u8 *buf; + + da = FIELD_GET(GENMASK(23, 17), raw); + mdb = FIELD_GET(GENMASK(15, 8), raw); + for (id = 0; id < master->ibi.num_slots; id++) { + if (master->ibi.slots[id] && + master->ibi.slots[id]->info.dyn_addr == da) + break; + } + + if (id == master->ibi.num_slots) + return; + + dev = master->ibi.slots[id]; + len = ADI_HAS_MDB_FROM_BCR(dev->info.bcr); + data = i3c_dev_get_master_data(dev); + + guard(spinlock)(&master->ibi.lock); + slot = i3c_generic_ibi_get_free_slot(data->ibi_pool); + if (!slot) + return; + + slot->len = len; + buf = slot->data; + buf[0] = mdb; + i3c_master_queue_ibi(dev, slot); +} + +static void adi_i3c_master_demux_ibis(struct adi_i3c_master *master) +{ + while (!(readl(master->regs + REG_FIFO_STATUS) & REG_FIFO_STATUS_IBI_EMPTY)) { + u32 raw = readl(master->regs + REG_IBI_FIFO); + + adi_i3c_master_handle_ibi(master, raw); + } +} + +static void adi_i3c_master_handle_da_req(struct adi_i3c_master *master) +{ + u8 payload0[8]; + u32 addr; + + adi_i3c_master_rd_from_rx_fifo(master, payload0, 6); + addr = master->daa.addrs[master->daa.index++]; + addr = (addr << 1) | (parity8(addr) ? 
0 : 1); + + writel(addr, master->regs + REG_SDO_FIFO); +} + +static irqreturn_t adi_i3c_master_irq(int irq, void *data) +{ + struct adi_i3c_master *master = data; + u32 pending; + + pending = readl(master->regs + REG_IRQ_PENDING); + writel(pending, master->regs + REG_IRQ_PENDING); + if (pending & REG_IRQ_PENDING_CMDR) { + scoped_guard(spinlock_irqsave, &master->xferqueue.lock) { + adi_i3c_master_end_xfer_locked(master, pending); + } + } + if (pending & REG_IRQ_PENDING_IBI) + adi_i3c_master_demux_ibis(master); + if (pending & REG_IRQ_PENDING_DAA) + adi_i3c_master_handle_da_req(master); + + return IRQ_HANDLED; +} + +static int adi_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, + struct i2c_msg *xfers, + int nxfers) +{ + struct i3c_master_controller *m = i2c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_xfer *xfer __free(kfree) = NULL; + int i; + + if (!nxfers) + return 0; + for (i = 0; i < nxfers; i++) { + if (xfers[i].flags & I2C_M_TEN) + return -EOPNOTSUPP; + } + xfer = adi_i3c_master_alloc_xfer(master, nxfers); + if (!xfer) + return -ENOMEM; + + for (i = 0; i < nxfers; i++) { + struct adi_i3c_cmd *ccmd = &xfer->cmds[i]; + + ccmd->cmd0 = REG_CMD_FIFO_0_DEV_ADDR(xfers[i].addr); + + if (xfers[i].flags & I2C_M_RD) { + ccmd->cmd0 |= REG_CMD_FIFO_0_RNW; + ccmd->rx_buf = xfers[i].buf; + ccmd->rx_len = xfers[i].len; + } else { + ccmd->tx_buf = xfers[i].buf; + ccmd->tx_len = xfers[i].len; + } + + ccmd->cmd0 |= REG_CMD_FIFO_0_LEN(xfers[i].len); + } + + adi_i3c_master_queue_xfer(master, xfer); + if (!wait_for_completion_timeout(&xfer->comp, + m->i2c.timeout)) + adi_i3c_master_unqueue_xfer(master, xfer); + + return xfer->ret; +} + +static int adi_i3c_master_disable_ibi(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct i3c_dev_desc *i3cdev; + u32 enabled = 0; + int ret; + + ret = i3c_master_disec_locked(m, dev->info.dyn_addr, + I3C_CCC_EVENT_SIR); + + i3c_bus_for_each_i3cdev(&m->bus, i3cdev) { + if (dev != i3cdev && i3cdev->ibi) + enabled |= i3cdev->ibi->enabled; + } + if (!enabled) { + writel(REG_IBI_CONFIG_LISTEN, + master->regs + REG_IBI_CONFIG); + writel(readl(master->regs + REG_IRQ_MASK) & ~REG_IRQ_PENDING_IBI, + master->regs + REG_IRQ_MASK); + } + + return ret; +} + +static int adi_i3c_master_enable_ibi(struct i3c_dev_desc *dev) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + + writel(REG_IBI_CONFIG_LISTEN | REG_IBI_CONFIG_ENABLE, + master->regs + REG_IBI_CONFIG); + + writel(readl(master->regs + REG_IRQ_MASK) | REG_IRQ_PENDING_IBI, + master->regs + REG_IRQ_MASK); + + return i3c_master_enec_locked(m, dev->info.dyn_addr, + I3C_CCC_EVENT_SIR); +} + +static int adi_i3c_master_request_ibi(struct i3c_dev_desc *dev, + const struct i3c_ibi_setup *req) +{ + struct i3c_master_controller *m = i3c_dev_get_master(dev); + struct adi_i3c_master *master = to_adi_i3c_master(m); + struct adi_i3c_i2c_dev_data *data; + unsigned int i; + + data = i3c_dev_get_master_data(dev); + data->ibi_pool = i3c_generic_ibi_alloc_pool(dev, req); + if (IS_ERR(data->ibi_pool)) + return PTR_ERR(data->ibi_pool); + + scoped_guard(spinlock_irqsave, &master->ibi.lock) { + for (i = 0; i < master->ibi.num_slots; i++) { + if (!master->ibi.slots[i]) { + data->ibi = i; + master->ibi.slots[i] = dev; + break; + } + } + } + + if (i < master->ibi.num_slots) + return 0; + + i3c_generic_ibi_free_pool(data->ibi_pool); + 
data->ibi_pool = NULL;
+
+	return -ENOSPC;
+}
+
+static void adi_i3c_master_free_ibi(struct i3c_dev_desc *dev)
+{
+	struct i3c_master_controller *m = i3c_dev_get_master(dev);
+	struct adi_i3c_master *master = to_adi_i3c_master(m);
+	struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev);
+
+	scoped_guard(spinlock_irqsave, &master->ibi.lock) {
+		master->ibi.slots[data->ibi] = NULL;
+	}
+
+	i3c_generic_ibi_free_pool(data->ibi_pool);
+}
+
+static void adi_i3c_master_recycle_ibi_slot(struct i3c_dev_desc *dev,
+					    struct i3c_ibi_slot *slot)
+{
+	struct adi_i3c_i2c_dev_data *data = i3c_dev_get_master_data(dev);
+
+	i3c_generic_ibi_recycle_slot(data->ibi_pool, slot);
+}
+
+static const struct i3c_master_controller_ops adi_i3c_master_ops = {
+	.bus_init = adi_i3c_master_bus_init,
+	.bus_cleanup = adi_i3c_master_bus_cleanup,
+	.attach_i3c_dev = adi_i3c_master_attach_i3c_dev,
+	.reattach_i3c_dev = adi_i3c_master_reattach_i3c_dev,
+	.detach_i3c_dev = adi_i3c_master_detach_i3c_dev,
+	.attach_i2c_dev = adi_i3c_master_attach_i2c_dev,
+	.detach_i2c_dev = adi_i3c_master_detach_i2c_dev,
+	.do_daa = adi_i3c_master_do_daa,
+	.supports_ccc_cmd = adi_i3c_master_supports_ccc_cmd,
+	.send_ccc_cmd = adi_i3c_master_send_ccc_cmd,
+	.priv_xfers = adi_i3c_master_priv_xfers,
+	.i2c_xfers = adi_i3c_master_i2c_xfers,
+	.request_ibi = adi_i3c_master_request_ibi,
+	.enable_ibi = adi_i3c_master_enable_ibi,
+	.disable_ibi = adi_i3c_master_disable_ibi,
+	.free_ibi = adi_i3c_master_free_ibi,
+	.recycle_ibi_slot = adi_i3c_master_recycle_ibi_slot,
+};
+
+static const struct of_device_id adi_i3c_master_of_match[] = {
+	{ .compatible = "adi,i3c-master-v1" },
+	{}
+};
+
+static int adi_i3c_master_probe(struct platform_device *pdev)
+{
+	struct adi_i3c_master *master;
+	struct clk_bulk_data *clk;
+	unsigned int version;
+	int ret, irq;
+
+	master = devm_kzalloc(&pdev->dev, sizeof(*master), GFP_KERNEL);
+	if (!master)
+		return -ENOMEM;
+
+	master->regs = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(master->regs))
+		return PTR_ERR(master->regs);
+
+	ret = devm_clk_bulk_get_all_enabled(&pdev->dev, &clk);
+	if (ret < 0)
+		return dev_err_probe(&pdev->dev, ret,
+				     "Failed to get clocks\n");
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return irq;
+
+	version = readl(master->regs + ADI_AXI_REG_VERSION);
+	if (ADI_AXI_PCORE_VER_MAJOR(version) != 1)
+		return dev_err_probe(&pdev->dev, -ENODEV, "Unsupported peripheral version %u.%u.%u\n",
+				     ADI_AXI_PCORE_VER_MAJOR(version),
+				     ADI_AXI_PCORE_VER_MINOR(version),
+				     ADI_AXI_PCORE_VER_PATCH(version));
+
+	writel(0x00, master->regs + REG_ENABLE);
+	writel(0x00, master->regs + REG_IRQ_MASK);
+
+	ret = devm_request_irq(&pdev->dev, irq, adi_i3c_master_irq, 0,
+			       dev_name(&pdev->dev), master);
+	if (ret)
+		return ret;
+
+	platform_set_drvdata(pdev, master);
+
+	master->free_rr_slots = GENMASK(ADI_MAX_DEVS, 1);
+
+	writel(REG_IRQ_PENDING_CMDR, master->regs + REG_IRQ_MASK);
+
+	spin_lock_init(&master->ibi.lock);
+	master->ibi.num_slots = 15;
+	master->ibi.slots = devm_kcalloc(&pdev->dev, master->ibi.num_slots,
+					 sizeof(*master->ibi.slots),
+					 GFP_KERNEL);
+	if (!master->ibi.slots)
+		return -ENOMEM;
+
+	spin_lock_init(&master->xferqueue.lock);
+	INIT_LIST_HEAD(&master->xferqueue.list);
+
+	return i3c_master_register(&master->base, &pdev->dev,
+				   &adi_i3c_master_ops, false);
+}
+
+static void adi_i3c_master_remove(struct platform_device *pdev)
+{
+	struct adi_i3c_master *master = platform_get_drvdata(pdev);
+
+	writel(0xff, master->regs + REG_IRQ_PENDING);
+	writel(0x00, master->regs + 
REG_IRQ_MASK); + writel(0x01, master->regs + REG_ENABLE); + + i3c_master_unregister(&master->base); +} + +static struct platform_driver adi_i3c_master = { + .probe = adi_i3c_master_probe, + .remove = adi_i3c_master_remove, + .driver = { + .name = "adi-i3c-master", + .of_match_table = adi_i3c_master_of_match, + }, +}; +module_platform_driver(adi_i3c_master); + +MODULE_AUTHOR("Jorge Marques "); +MODULE_DESCRIPTION("Analog Devices I3C master driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 974122b2d20ee5..9ceedf09c3b6a6 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1737,6 +1737,28 @@ static const struct dev_pm_ops dw_i3c_pm_ops = { SET_RUNTIME_PM_OPS(dw_i3c_master_runtime_suspend, dw_i3c_master_runtime_resume, NULL) }; +static void dw_i3c_shutdown(struct platform_device *pdev) +{ + struct dw_i3c_master *master = platform_get_drvdata(pdev); + int ret; + + ret = pm_runtime_resume_and_get(master->dev); + if (ret < 0) { + dev_err(master->dev, + "<%s> cannot resume i3c bus master, err: %d\n", + __func__, ret); + return; + } + + cancel_work_sync(&master->hj_work); + + /* Disable interrupts */ + writel((u32)~INTR_ALL, master->regs + INTR_STATUS_EN); + writel((u32)~INTR_ALL, master->regs + INTR_SIGNAL_EN); + + pm_runtime_put_autosuspend(master->dev); +} + static const struct of_device_id dw_i3c_master_of_match[] = { { .compatible = "snps,dw-i3c-master-1.00a", }, {}, @@ -1752,6 +1774,7 @@ MODULE_DEVICE_TABLE(acpi, amd_i3c_device_match); static struct platform_driver dw_i3c_driver = { .probe = dw_i3c_probe, .remove = dw_i3c_remove, + .shutdown = dw_i3c_shutdown, .driver = { .name = "dw-i3c-master", .of_match_table = dw_i3c_master_of_match, diff --git a/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c b/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c index dd636094b07f59..eb8a3ae2990d77 100644 --- a/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c +++ b/drivers/i3c/master/mipi-i3c-hci/cmd_v1.c @@ -317,7 +317,9 @@ static int hci_cmd_v1_daa(struct i3c_hci *hci) break; next_addr = ret; - DBG("next_addr = 0x%02x, DAA using DAT %d", next_addr, dat_idx); + dev_dbg(&hci->master.dev, + "next_addr = 0x%02x, DAA using DAT %d", + next_addr, dat_idx); mipi_i3c_hci_dat_v1.set_dynamic_addr(hci, dat_idx, next_addr); mipi_i3c_hci_dct_index_reset(hci); @@ -349,8 +351,9 @@ static int hci_cmd_v1_daa(struct i3c_hci *hci) } i3c_hci_dct_get_val(hci, 0, &pid, &dcr, &bcr); - DBG("assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", - next_addr, pid, dcr, bcr); + dev_dbg(&hci->master.dev, + "assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", + next_addr, pid, dcr, bcr); mipi_i3c_hci_dat_v1.free_entry(hci, dat_idx); dat_idx = -1; diff --git a/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c b/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c index 4493b2b067cbce..efb4326a25b73e 100644 --- a/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c +++ b/drivers/i3c/master/mipi-i3c-hci/cmd_v2.c @@ -261,7 +261,7 @@ static int hci_cmd_v2_daa(struct i3c_hci *hci) if (ret < 0) break; next_addr = ret; - DBG("next_addr = 0x%02x", next_addr); + dev_dbg(&hci->master.dev, "next_addr = 0x%02x", next_addr); xfer[0].cmd_tid = hci_get_tid(); xfer[0].cmd_desc[0] = CMD_0_ATTR_A | @@ -293,8 +293,9 @@ static int hci_cmd_v2_daa(struct i3c_hci *hci) pid = (pid << 32) | device_id[0]; bcr = FIELD_GET(W1_MASK(55, 48), device_id[1]); dcr = FIELD_GET(W1_MASK(63, 56), device_id[1]); - DBG("assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", - next_addr, pid, dcr, 
bcr); + dev_dbg(&hci->master.dev, + "assigned address %#x to device PID=0x%llx DCR=%#x BCR=%#x", + next_addr, pid, dcr, bcr); /* * TODO: Extend the subsystem layer to allow for registering * new device and provide BCR/DCR/PID at the same time. diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 60f1175f1f37cc..47e42cb4dbe71e 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -121,8 +121,6 @@ static int i3c_hci_bus_init(struct i3c_master_controller *m) struct i3c_device_info info; int ret; - DBG(""); - if (hci->cmd == &mipi_i3c_hci_cmd_v1) { ret = mipi_i3c_hci_dat_v1.init(hci); if (ret) @@ -149,7 +147,7 @@ static int i3c_hci_bus_init(struct i3c_master_controller *m) amd_set_resp_buf_thld(hci); reg_set(HC_CONTROL, HC_CONTROL_BUS_ENABLE); - DBG("HC_CONTROL = %#x", reg_read(HC_CONTROL)); + dev_dbg(&hci->master.dev, "HC_CONTROL = %#x", reg_read(HC_CONTROL)); return 0; } @@ -159,8 +157,6 @@ static void i3c_hci_bus_cleanup(struct i3c_master_controller *m) struct i3c_hci *hci = to_i3c_hci(m); struct platform_device *pdev = to_platform_device(m->dev.parent); - DBG(""); - reg_clear(HC_CONTROL, HC_CONTROL_BUS_ENABLE); synchronize_irq(platform_get_irq(pdev, 0)); hci->io->cleanup(hci); @@ -196,8 +192,8 @@ static int i3c_hci_send_ccc_cmd(struct i3c_master_controller *m, DECLARE_COMPLETION_ONSTACK(done); int i, last, ret = 0; - DBG("cmd=%#x rnw=%d ndests=%d data[0].len=%d", - ccc->id, ccc->rnw, ccc->ndests, ccc->dests[0].payload.len); + dev_dbg(&hci->master.dev, "cmd=%#x rnw=%d ndests=%d data[0].len=%d", + ccc->id, ccc->rnw, ccc->ndests, ccc->dests[0].payload.len); xfer = hci_alloc_xfer(nxfers); if (!xfer) @@ -255,8 +251,8 @@ static int i3c_hci_send_ccc_cmd(struct i3c_master_controller *m, } if (ccc->rnw) - DBG("got: %*ph", - ccc->dests[0].payload.len, ccc->dests[0].payload.data); + dev_dbg(&hci->master.dev, "got: %*ph", + ccc->dests[0].payload.len, ccc->dests[0].payload.data); out: hci_free_xfer(xfer, nxfers); @@ -267,39 +263,9 @@ static int i3c_hci_daa(struct i3c_master_controller *m) { struct i3c_hci *hci = to_i3c_hci(m); - DBG(""); - return hci->cmd->perform_daa(hci); } -static int i3c_hci_alloc_safe_xfer_buf(struct i3c_hci *hci, - struct hci_xfer *xfer) -{ - if (hci->io != &mipi_i3c_hci_dma || - xfer->data == NULL || !is_vmalloc_addr(xfer->data)) - return 0; - - if (xfer->rnw) - xfer->bounce_buf = kzalloc(xfer->data_len, GFP_KERNEL); - else - xfer->bounce_buf = kmemdup(xfer->data, - xfer->data_len, GFP_KERNEL); - - return xfer->bounce_buf == NULL ? 
-ENOMEM : 0; -} - -static void i3c_hci_free_safe_xfer_buf(struct i3c_hci *hci, - struct hci_xfer *xfer) -{ - if (hci->io != &mipi_i3c_hci_dma || xfer->bounce_buf == NULL) - return; - - if (xfer->rnw) - memcpy(xfer->data, xfer->bounce_buf, xfer->data_len); - - kfree(xfer->bounce_buf); -} - static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, struct i3c_priv_xfer *i3c_xfers, int nxfers) @@ -311,7 +277,7 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, unsigned int size_limit; int i, last, ret = 0; - DBG("nxfers = %d", nxfers); + dev_dbg(&hci->master.dev, "nxfers = %d", nxfers); xfer = hci_alloc_xfer(nxfers); if (!xfer) @@ -333,9 +299,6 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, } hci->cmd->prep_i3c_xfer(hci, dev, &xfer[i]); xfer[i].cmd_desc[0] |= CMD_0_ROC; - ret = i3c_hci_alloc_safe_xfer_buf(hci, &xfer[i]); - if (ret) - goto out; } last = i - 1; xfer[last].cmd_desc[0] |= CMD_0_TOC; @@ -359,9 +322,6 @@ static int i3c_hci_priv_xfers(struct i3c_dev_desc *dev, } out: - for (i = 0; i < nxfers; i++) - i3c_hci_free_safe_xfer_buf(hci, &xfer[i]); - hci_free_xfer(xfer, nxfers); return ret; } @@ -375,14 +335,14 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev, DECLARE_COMPLETION_ONSTACK(done); int i, last, ret = 0; - DBG("nxfers = %d", nxfers); + dev_dbg(&hci->master.dev, "nxfers = %d", nxfers); xfer = hci_alloc_xfer(nxfers); if (!xfer) return -ENOMEM; for (i = 0; i < nxfers; i++) { - xfer[i].data = i2c_get_dma_safe_msg_buf(&i2c_xfers[i], 1); + xfer[i].data = i2c_xfers[i].buf; xfer[i].data_len = i2c_xfers[i].len; xfer[i].rnw = i2c_xfers[i].flags & I2C_M_RD; hci->cmd->prep_i2c_xfer(hci, dev, &xfer[i]); @@ -408,10 +368,6 @@ static int i3c_hci_i2c_xfers(struct i2c_dev_desc *dev, } out: - for (i = 0; i < nxfers; i++) - i2c_put_dma_safe_msg_buf(xfer[i].data, &i2c_xfers[i], - ret ? 
false : true); - hci_free_xfer(xfer, nxfers); return ret; } @@ -423,8 +379,6 @@ static int i3c_hci_attach_i3c_dev(struct i3c_dev_desc *dev) struct i3c_hci_dev_data *dev_data; int ret; - DBG(""); - dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); if (!dev_data) return -ENOMEM; @@ -448,8 +402,6 @@ static int i3c_hci_reattach_i3c_dev(struct i3c_dev_desc *dev, u8 old_dyn_addr) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i3c_dev_get_master_data(dev); - DBG(""); - if (hci->cmd == &mipi_i3c_hci_cmd_v1) mipi_i3c_hci_dat_v1.set_dynamic_addr(hci, dev_data->dat_idx, dev->info.dyn_addr); @@ -462,8 +414,6 @@ static void i3c_hci_detach_i3c_dev(struct i3c_dev_desc *dev) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i3c_dev_get_master_data(dev); - DBG(""); - i3c_dev_set_master_data(dev, NULL); if (hci->cmd == &mipi_i3c_hci_cmd_v1) mipi_i3c_hci_dat_v1.free_entry(hci, dev_data->dat_idx); @@ -477,8 +427,6 @@ static int i3c_hci_attach_i2c_dev(struct i2c_dev_desc *dev) struct i3c_hci_dev_data *dev_data; int ret; - DBG(""); - if (hci->cmd != &mipi_i3c_hci_cmd_v1) return 0; dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); @@ -502,8 +450,6 @@ static void i3c_hci_detach_i2c_dev(struct i2c_dev_desc *dev) struct i3c_hci *hci = to_i3c_hci(m); struct i3c_hci_dev_data *dev_data = i2c_dev_get_master_data(dev); - DBG(""); - if (dev_data) { i2c_dev_set_master_data(dev, NULL); if (hci->cmd == &mipi_i3c_hci_cmd_v1) @@ -591,7 +537,7 @@ static irqreturn_t i3c_hci_irq_handler(int irq, void *dev_id) val = reg_read(INTR_STATUS); reg_write(INTR_STATUS, val); - DBG("INTR_STATUS = %#x", val); + dev_dbg(&hci->master.dev, "INTR_STATUS %#x", val); if (val) result = IRQ_HANDLED; @@ -641,7 +587,7 @@ static int i3c_hci_init(struct i3c_hci *hci) } hci->caps = reg_read(HC_CAPABILITIES); - DBG("caps = %#x", hci->caps); + dev_dbg(&hci->master.dev, "caps = %#x", hci->caps); size_in_dwords = hci->version_major < 1 || (hci->version_major == 1 && hci->version_minor < 1); diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c index 491dfe70b66002..c401a9425cdc59 100644 --- a/drivers/i3c/master/mipi-i3c-hci/dma.c +++ b/drivers/i3c/master/mipi-i3c-hci/dma.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "hci.h" #include "cmd.h" @@ -76,7 +77,6 @@ #define INTR_TRANSFER_COMPLETION BIT(11) #define INTR_RING_OP BIT(10) #define INTR_TRANSFER_ERR BIT(9) -#define INTR_WARN_INS_STOP_MODE BIT(7) #define INTR_IBI_RING_FULL BIT(6) #define INTR_TRANSFER_ABORT BIT(5) @@ -138,6 +138,7 @@ struct hci_rh_data { }; struct hci_rings_data { + struct device *sysdev; unsigned int total; struct hci_rh_data headers[] __counted_by(total); }; @@ -165,20 +166,20 @@ static void hci_dma_cleanup(struct i3c_hci *hci) rh_reg_write(IBI_SETUP, 0); if (rh->xfer) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->xfer_struct_sz * rh->xfer_entries, rh->xfer, rh->xfer_dma); if (rh->resp) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->resp_struct_sz * rh->xfer_entries, rh->resp, rh->resp_dma); kfree(rh->src_xfers); if (rh->ibi_status) - dma_free_coherent(&hci->master.dev, + dma_free_coherent(rings->sysdev, rh->ibi_status_sz * rh->ibi_status_entries, rh->ibi_status, rh->ibi_status_dma); if (rh->ibi_data_dma) - dma_unmap_single(&hci->master.dev, rh->ibi_data_dma, + dma_unmap_single(rings->sysdev, rh->ibi_data_dma, rh->ibi_chunk_sz * rh->ibi_chunks_total, DMA_FROM_DEVICE); kfree(rh->ibi_data); @@ -194,11 +195,23 @@ static 
int hci_dma_init(struct i3c_hci *hci) { struct hci_rings_data *rings; struct hci_rh_data *rh; + struct device *sysdev; u32 regval; unsigned int i, nr_rings, xfers_sz, resps_sz; unsigned int ibi_status_ring_sz, ibi_data_ring_sz; int ret; + /* + * Set pointer to a physical device that does DMA and has IOMMU setup + * done for it in case of enabled IOMMU and use it with the DMA API. + * Here such device is either + * "mipi-i3c-hci" platform device (OF/ACPI enumeration) parent or + * grandparent (PCI enumeration). + */ + sysdev = hci->master.dev.parent; + if (sysdev->parent && dev_is_pci(sysdev->parent)) + sysdev = sysdev->parent; + regval = rhs_reg_read(CONTROL); nr_rings = FIELD_GET(MAX_HEADER_COUNT_CAP, regval); dev_info(&hci->master.dev, "%d DMA rings available\n", nr_rings); @@ -213,6 +226,7 @@ static int hci_dma_init(struct i3c_hci *hci) return -ENOMEM; hci->io_data = rings; rings->total = nr_rings; + rings->sysdev = sysdev; regval = FIELD_PREP(MAX_HEADER_COUNT, rings->total); rhs_reg_write(CONTROL, regval); @@ -234,14 +248,15 @@ static int hci_dma_init(struct i3c_hci *hci) regval = rh_reg_read(CR_SETUP); rh->xfer_struct_sz = FIELD_GET(CR_XFER_STRUCT_SIZE, regval); rh->resp_struct_sz = FIELD_GET(CR_RESP_STRUCT_SIZE, regval); - DBG("xfer_struct_sz = %d, resp_struct_sz = %d", - rh->xfer_struct_sz, rh->resp_struct_sz); + dev_dbg(&hci->master.dev, + "xfer_struct_sz = %d, resp_struct_sz = %d", + rh->xfer_struct_sz, rh->resp_struct_sz); xfers_sz = rh->xfer_struct_sz * rh->xfer_entries; resps_sz = rh->resp_struct_sz * rh->xfer_entries; - rh->xfer = dma_alloc_coherent(&hci->master.dev, xfers_sz, + rh->xfer = dma_alloc_coherent(rings->sysdev, xfers_sz, &rh->xfer_dma, GFP_KERNEL); - rh->resp = dma_alloc_coherent(&hci->master.dev, resps_sz, + rh->resp = dma_alloc_coherent(rings->sysdev, resps_sz, &rh->resp_dma, GFP_KERNEL); rh->src_xfers = kmalloc_array(rh->xfer_entries, sizeof(*rh->src_xfers), @@ -263,7 +278,6 @@ static int hci_dma_init(struct i3c_hci *hci) INTR_TRANSFER_COMPLETION | INTR_RING_OP | INTR_TRANSFER_ERR | - INTR_WARN_INS_STOP_MODE | INTR_IBI_RING_FULL | INTR_TRANSFER_ABORT); @@ -295,16 +309,16 @@ static int hci_dma_init(struct i3c_hci *hci) ibi_data_ring_sz = rh->ibi_chunk_sz * rh->ibi_chunks_total; rh->ibi_status = - dma_alloc_coherent(&hci->master.dev, ibi_status_ring_sz, + dma_alloc_coherent(rings->sysdev, ibi_status_ring_sz, &rh->ibi_status_dma, GFP_KERNEL); rh->ibi_data = kmalloc(ibi_data_ring_sz, GFP_KERNEL); ret = -ENOMEM; if (!rh->ibi_status || !rh->ibi_data) goto err_out; rh->ibi_data_dma = - dma_map_single(&hci->master.dev, rh->ibi_data, + dma_map_single(rings->sysdev, rh->ibi_data, ibi_data_ring_sz, DMA_FROM_DEVICE); - if (dma_mapping_error(&hci->master.dev, rh->ibi_data_dma)) { + if (dma_mapping_error(rings->sysdev, rh->ibi_data_dma)) { rh->ibi_data_dma = 0; ret = -ENOMEM; goto err_out; @@ -349,9 +363,7 @@ static void hci_dma_unmap_xfer(struct i3c_hci *hci, xfer = xfer_list + i; if (!xfer->data) continue; - dma_unmap_single(&hci->master.dev, - xfer->data_dma, xfer->data_len, - xfer->rnw ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + i3c_master_dma_unmap_single(xfer->dma); } } @@ -362,7 +374,6 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, struct hci_rh_data *rh; unsigned int i, ring, enqueue_ptr; u32 op1_val, op2_val; - void *buf; /* For now we only use ring 0 */ ring = 0; @@ -373,6 +384,9 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, for (i = 0; i < n; i++) { struct hci_xfer *xfer = xfer_list + i; u32 *ring_data = rh->xfer + rh->xfer_struct_sz * enqueue_ptr; + enum dma_data_direction dir = xfer->rnw ? DMA_FROM_DEVICE : + DMA_TO_DEVICE; + bool need_bounce; /* store cmd descriptor */ *ring_data++ = xfer->cmd_desc[0]; @@ -391,21 +405,20 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci, /* 2nd and 3rd words of Data Buffer Descriptor Structure */ if (xfer->data) { - buf = xfer->bounce_buf ? xfer->bounce_buf : xfer->data; - xfer->data_dma = - dma_map_single(&hci->master.dev, - buf, - xfer->data_len, - xfer->rnw ? - DMA_FROM_DEVICE : - DMA_TO_DEVICE); - if (dma_mapping_error(&hci->master.dev, - xfer->data_dma)) { + need_bounce = device_iommu_mapped(rings->sysdev) && + xfer->rnw && + xfer->data_len != ALIGN(xfer->data_len, 4); + xfer->dma = i3c_master_dma_map_single(rings->sysdev, + xfer->data, + xfer->data_len, + need_bounce, + dir); + if (!xfer->dma) { hci_dma_unmap_xfer(hci, xfer_list, i); return -ENOMEM; } - *ring_data++ = lower_32_bits(xfer->data_dma); - *ring_data++ = upper_32_bits(xfer->data_dma); + *ring_data++ = lower_32_bits(xfer->dma->addr); + *ring_data++ = upper_32_bits(xfer->dma->addr); } else { *ring_data++ = 0; *ring_data++ = 0; @@ -511,11 +524,11 @@ static void hci_dma_xfer_done(struct i3c_hci *hci, struct hci_rh_data *rh) ring_resp = rh->resp + rh->resp_struct_sz * done_ptr; resp = *ring_resp; tid = RESP_TID(resp); - DBG("resp = 0x%08x", resp); + dev_dbg(&hci->master.dev, "resp = 0x%08x", resp); xfer = rh->src_xfers[done_ptr]; if (!xfer) { - DBG("orphaned ring entry"); + dev_dbg(&hci->master.dev, "orphaned ring entry"); } else { hci_dma_unmap_xfer(hci, xfer, 1); xfer->ring_entry = -1; @@ -586,6 +599,7 @@ static void hci_dma_recycle_ibi_slot(struct i3c_hci *hci, static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) { + struct hci_rings_data *rings = hci->io_data; struct i3c_dev_desc *dev; struct i3c_hci_dev_data *dev_data; struct hci_dma_dev_ibi_data *dev_ibi; @@ -617,7 +631,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) ring_ibi_status = rh->ibi_status + rh->ibi_status_sz * ptr; ibi_status = *ring_ibi_status; - DBG("status = %#x", ibi_status); + dev_dbg(&hci->master.dev, "status = %#x", ibi_status); if (ibi_status_error) { /* we no longer care */ @@ -645,7 +659,9 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) if (last_ptr == -1) { /* this IBI sequence is not yet complete */ - DBG("no LAST_STATUS available (e=%d d=%d)", enq_ptr, deq_ptr); + dev_dbg(&hci->master.dev, + "no LAST_STATUS available (e=%d d=%d)", + enq_ptr, deq_ptr); return; } deq_ptr = last_ptr + 1; @@ -696,7 +712,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) * rh->ibi_chunk_sz; if (first_part > ibi_size) first_part = ibi_size; - dma_sync_single_for_cpu(&hci->master.dev, ring_ibi_data_dma, + dma_sync_single_for_cpu(rings->sysdev, ring_ibi_data_dma, first_part, DMA_FROM_DEVICE); memcpy(slot->data, ring_ibi_data, first_part); @@ -705,7 +721,7 @@ static void hci_dma_process_ibi(struct i3c_hci *hci, struct hci_rh_data *rh) /* we wrap back to the start and copy remaining 
data */ ring_ibi_data = rh->ibi_data; ring_ibi_data_dma = rh->ibi_data_dma; - dma_sync_single_for_cpu(&hci->master.dev, ring_ibi_data_dma, + dma_sync_single_for_cpu(rings->sysdev, ring_ibi_data_dma, ibi_size - first_part, DMA_FROM_DEVICE); memcpy(slot->data + first_part, ring_ibi_data, ibi_size - first_part); @@ -745,7 +761,8 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci) rh = &rings->headers[i]; status = rh_reg_read(INTR_STATUS); - DBG("rh%d status: %#x", i, status); + dev_dbg(&hci->master.dev, "Ring %d: RH_INTR_STATUS %#x", + i, status); if (!status) continue; rh_reg_write(INTR_STATUS, status); @@ -761,7 +778,7 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci) u32 ring_status; dev_notice_ratelimited(&hci->master.dev, - "ring %d: Transfer Aborted\n", i); + "Ring %d: Transfer Aborted\n", i); mipi_i3c_hci_resume(hci); ring_status = rh_reg_read(RING_STATUS); if (!(ring_status & RING_STATUS_RUNNING) && @@ -779,12 +796,9 @@ static bool hci_dma_irq_handler(struct i3c_hci *hci) RING_CTRL_RUN_STOP); } } - if (status & INTR_WARN_INS_STOP_MODE) - dev_warn_ratelimited(&hci->master.dev, - "ring %d: Inserted Stop on Mode Change\n", i); if (status & INTR_IBI_RING_FULL) dev_err_ratelimited(&hci->master.dev, - "ring %d: IBI Ring Full Condition\n", i); + "Ring %d: IBI Ring Full Condition\n", i); handled = true; } diff --git a/drivers/i3c/master/mipi-i3c-hci/ext_caps.c b/drivers/i3c/master/mipi-i3c-hci/ext_caps.c index 2e9b23efdc45da..7714f00ea9cc09 100644 --- a/drivers/i3c/master/mipi-i3c-hci/ext_caps.c +++ b/drivers/i3c/master/mipi-i3c-hci/ext_caps.c @@ -35,7 +35,7 @@ static int hci_extcap_hardware_id(struct i3c_hci *hci, void __iomem *base) switch (hci->vendor_mipi_id) { case MIPI_VENDOR_NXP: hci->quirks |= HCI_QUIRK_RAW_CCC; - DBG("raw CCC quirks set"); + dev_dbg(&hci->master.dev, "raw CCC quirks set"); break; } @@ -77,7 +77,8 @@ static int hci_extcap_xfer_modes(struct i3c_hci *hci, void __iomem *base) for (index = 0; index < entries; index++) { u32 mode_entry = readl(base); - DBG("mode %d: 0x%08x", index, mode_entry); + dev_dbg(&hci->master.dev, "mode %d: 0x%08x", + index, mode_entry); /* TODO: will be needed when I3C core does more than SDR */ base += 4; } @@ -97,7 +98,8 @@ static int hci_extcap_xfer_rates(struct i3c_hci *hci, void __iomem *base) dev_info(&hci->master.dev, "available data rates:\n"); for (index = 0; index < entries; index++) { rate_entry = readl(base); - DBG("entry %d: 0x%08x", index, rate_entry); + dev_dbg(&hci->master.dev, "entry %d: 0x%08x", + index, rate_entry); rate = FIELD_GET(XFERRATE_ACTUAL_RATE_KHZ, rate_entry); rate_id = FIELD_GET(XFERRATE_RATE_ID, rate_entry); mode_id = FIELD_GET(XFERRATE_MODE_ID, rate_entry); @@ -268,7 +270,8 @@ int i3c_hci_parse_ext_caps(struct i3c_hci *hci) cap_header = readl(curr_cap); cap_id = FIELD_GET(CAP_HEADER_ID, cap_header); cap_length = FIELD_GET(CAP_HEADER_LENGTH, cap_header); - DBG("id=0x%02x length=%d", cap_id, cap_length); + dev_dbg(&hci->master.dev, "id=0x%02x length=%d", + cap_id, cap_length); if (!cap_length) break; if (curr_cap + cap_length * 4 >= end) { diff --git a/drivers/i3c/master/mipi-i3c-hci/hci.h b/drivers/i3c/master/mipi-i3c-hci/hci.h index 69ea1d10414b8c..249ccb13c90928 100644 --- a/drivers/i3c/master/mipi-i3c-hci/hci.h +++ b/drivers/i3c/master/mipi-i3c-hci/hci.h @@ -12,9 +12,6 @@ #include -/* Handy logging macro to save on line length */ -#define DBG(x, ...) 
pr_devel("%s: " x "\n", __func__, ##__VA_ARGS__) - /* 32-bit word aware bit and mask macros */ #define W0_MASK(h, l) GENMASK((h) - 0, (l) - 0) #define W1_MASK(h, l) GENMASK((h) - 32, (l) - 32) @@ -94,8 +91,7 @@ struct hci_xfer { }; struct { /* DMA specific */ - dma_addr_t data_dma; - void *bounce_buf; + struct i3c_dma *dma; int ring_number; int ring_entry; }; diff --git a/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c b/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c index c6c3a3ec11eae3..08e6cbdf89cead 100644 --- a/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c +++ b/drivers/i3c/master/mipi-i3c-hci/mipi-i3c-hci-pci.c @@ -124,6 +124,9 @@ static void mipi_i3c_hci_pci_remove(struct pci_dev *pci) } static const struct pci_device_id mipi_i3c_hci_pci_devices[] = { + /* Wildcat Lake-U */ + { PCI_VDEVICE(INTEL, 0x4d7c), (kernel_ulong_t)&intel_info}, + { PCI_VDEVICE(INTEL, 0x4d6f), (kernel_ulong_t)&intel_info}, /* Panther Lake-H */ { PCI_VDEVICE(INTEL, 0xe37c), (kernel_ulong_t)&intel_info}, { PCI_VDEVICE(INTEL, 0xe36f), (kernel_ulong_t)&intel_info}, diff --git a/drivers/i3c/master/mipi-i3c-hci/pio.c b/drivers/i3c/master/mipi-i3c-hci/pio.c index 2fc71e6969111a..710faa46a00faa 100644 --- a/drivers/i3c/master/mipi-i3c-hci/pio.c +++ b/drivers/i3c/master/mipi-i3c-hci/pio.c @@ -213,8 +213,8 @@ static void hci_pio_cleanup(struct i3c_hci *hci) pio_reg_write(INTR_SIGNAL_ENABLE, 0x0); if (pio) { - DBG("status = %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status = %#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); BUG_ON(pio->curr_xfer); BUG_ON(pio->curr_rx); BUG_ON(pio->curr_tx); @@ -226,13 +226,17 @@ static void hci_pio_cleanup(struct i3c_hci *hci) static void hci_pio_write_cmd(struct i3c_hci *hci, struct hci_xfer *xfer) { - DBG("cmd_desc[%d] = 0x%08x", 0, xfer->cmd_desc[0]); - DBG("cmd_desc[%d] = 0x%08x", 1, xfer->cmd_desc[1]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 0, xfer->cmd_desc[0]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 1, xfer->cmd_desc[1]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[0]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[1]); if (hci->cmd == &mipi_i3c_hci_cmd_v2) { - DBG("cmd_desc[%d] = 0x%08x", 2, xfer->cmd_desc[2]); - DBG("cmd_desc[%d] = 0x%08x", 3, xfer->cmd_desc[3]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 2, xfer->cmd_desc[2]); + dev_dbg(&hci->master.dev, "cmd_desc[%d] = 0x%08x", + 3, xfer->cmd_desc[3]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[2]); pio_reg_write(COMMAND_QUEUE_PORT, xfer->cmd_desc[3]); } @@ -254,7 +258,8 @@ static bool hci_pio_do_rx(struct i3c_hci *hci, struct hci_pio_data *pio) nr_words = min(xfer->data_left / 4, pio->rx_thresh_size); /* extract data from FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, xfer->data_left); while (nr_words--) *p++ = pio_reg_read(XFER_DATA_PORT); } @@ -269,7 +274,7 @@ static void hci_pio_do_trailing_rx(struct i3c_hci *hci, struct hci_xfer *xfer = pio->curr_rx; u32 *p; - DBG("%d remaining", count); + dev_dbg(&hci->master.dev, "%d remaining", count); p = xfer->data; p += (xfer->data_len - xfer->data_left) / 4; @@ -278,7 +283,8 @@ static void hci_pio_do_trailing_rx(struct i3c_hci *hci, unsigned int nr_words = count / 4; /* extract data from FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d 
left %d", + nr_words * 4, xfer->data_left); while (nr_words--) *p++ = pio_reg_read(XFER_DATA_PORT); } @@ -321,7 +327,8 @@ static bool hci_pio_do_tx(struct i3c_hci *hci, struct hci_pio_data *pio) nr_words = min(xfer->data_left / 4, pio->tx_thresh_size); /* push data into the FIFO */ xfer->data_left -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, xfer->data_left); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, xfer->data_left); while (nr_words--) pio_reg_write(XFER_DATA_PORT, *p++); } @@ -336,7 +343,7 @@ static bool hci_pio_do_tx(struct i3c_hci *hci, struct hci_pio_data *pio) */ if (!(pio_reg_read(INTR_STATUS) & STAT_TX_THLD)) return false; - DBG("trailing %d", xfer->data_left); + dev_dbg(&hci->master.dev, "trailing %d", xfer->data_left); pio_reg_write(XFER_DATA_PORT, *p); xfer->data_left = 0; } @@ -481,7 +488,7 @@ static bool hci_pio_process_resp(struct i3c_hci *hci, struct hci_pio_data *pio) u32 resp = pio_reg_read(RESPONSE_QUEUE_PORT); unsigned int tid = RESP_TID(resp); - DBG("resp = 0x%08x", resp); + dev_dbg(&hci->master.dev, "resp = 0x%08x", resp); if (tid != xfer->cmd_tid) { dev_err(&hci->master.dev, "response tid=%d when expecting %d\n", @@ -522,14 +529,15 @@ static bool hci_pio_process_resp(struct i3c_hci *hci, struct hci_pio_data *pio) * still exists. */ if (pio->curr_rx == xfer) { - DBG("short RX ?"); + dev_dbg(&hci->master.dev, "short RX ?"); pio->curr_rx = pio->curr_rx->next_data; } else if (pio->curr_tx == xfer) { - DBG("short TX ?"); + dev_dbg(&hci->master.dev, "short TX ?"); pio->curr_tx = pio->curr_tx->next_data; } else if (xfer->data_left) { - DBG("PIO xfer count = %d after response", - xfer->data_left); + dev_dbg(&hci->master.dev, + "PIO xfer count = %d after response", + xfer->data_left); } pio->curr_resp = xfer->next_resp; @@ -591,7 +599,7 @@ static int hci_pio_queue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int n) struct hci_xfer *prev_queue_tail; int i; - DBG("n = %d", n); + dev_dbg(&hci->master.dev, "n = %d", n); /* link xfer instances together and initialize data count */ for (i = 0; i < n; i++) { @@ -611,8 +619,9 @@ static int hci_pio_queue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int n) if (!hci_pio_process_cmd(hci, pio)) pio->enabled_irqs |= STAT_CMD_QUEUE_READY; pio_reg_write(INTR_SIGNAL_ENABLE, pio->enabled_irqs); - DBG("status = %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status = %#x/%#x", + pio_reg_read(INTR_STATUS), + pio_reg_read(INTR_SIGNAL_ENABLE)); } spin_unlock_irq(&pio->lock); return 0; @@ -686,10 +695,10 @@ static bool hci_pio_dequeue_xfer(struct i3c_hci *hci, struct hci_xfer *xfer, int int ret; spin_lock_irq(&pio->lock); - DBG("n=%d status=%#x/%#x", n, - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); - DBG("main_status = %#x/%#x", - readl(hci->base_regs + 0x20), readl(hci->base_regs + 0x28)); + dev_dbg(&hci->master.dev, "n=%d status=%#x/%#x", n, + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "main_status = %#x/%#x", + readl(hci->base_regs + 0x20), readl(hci->base_regs + 0x28)); ret = hci_pio_dequeue_xfer_common(hci, pio, xfer, n); spin_unlock_irq(&pio->lock); @@ -733,8 +742,8 @@ static void hci_pio_err(struct i3c_hci *hci, struct hci_pio_data *pio, mipi_i3c_hci_pio_reset(hci); mipi_i3c_hci_resume(hci); - DBG("status=%#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "status=%#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); } 
static void hci_pio_set_ibi_thresh(struct i3c_hci *hci, @@ -749,7 +758,7 @@ static void hci_pio_set_ibi_thresh(struct i3c_hci *hci, if (regval != pio->reg_queue_thresh) { pio_reg_write(QUEUE_THLD_CTRL, regval); pio->reg_queue_thresh = regval; - DBG("%d", thresh_val); + dev_dbg(&hci->master.dev, "%d", thresh_val); } } @@ -773,7 +782,8 @@ static bool hci_pio_get_ibi_segment(struct i3c_hci *hci, /* extract the data from the IBI port */ nr_words = thresh_val; ibi->seg_cnt -= nr_words * 4; - DBG("now %d left %d", nr_words * 4, ibi->seg_cnt); + dev_dbg(&hci->master.dev, "now %d left %d", + nr_words * 4, ibi->seg_cnt); while (nr_words--) *p++ = pio_reg_read(IBI_PORT); } @@ -791,7 +801,7 @@ static bool hci_pio_get_ibi_segment(struct i3c_hci *hci, hci_pio_set_ibi_thresh(hci, pio, 1); if (!(pio_reg_read(INTR_STATUS) & STAT_IBI_STATUS_THLD)) return false; - DBG("trailing %d", ibi->seg_cnt); + dev_dbg(&hci->master.dev, "trailing %d", ibi->seg_cnt); data = pio_reg_read(IBI_PORT); data = (__force u32) cpu_to_le32(data); while (ibi->seg_cnt--) { @@ -820,7 +830,7 @@ static bool hci_pio_prep_new_ibi(struct i3c_hci *hci, struct hci_pio_data *pio) */ ibi_status = pio_reg_read(IBI_PORT); - DBG("status = %#x", ibi_status); + dev_dbg(&hci->master.dev, "status = %#x", ibi_status); ibi->addr = FIELD_GET(IBI_TARGET_ADDR, ibi_status); if (ibi_status & IBI_ERROR) { dev_err(&hci->master.dev, "IBI error from %#x\n", ibi->addr); @@ -986,7 +996,8 @@ static bool hci_pio_irq_handler(struct i3c_hci *hci) spin_lock(&pio->lock); status = pio_reg_read(INTR_STATUS); - DBG("(in) status: %#x/%#x", status, pio->enabled_irqs); + dev_dbg(&hci->master.dev, "PIO_INTR_STATUS %#x/%#x", + status, pio->enabled_irqs); status &= pio->enabled_irqs | STAT_LATENCY_WARNINGS; if (!status) { spin_unlock(&pio->lock); @@ -1023,8 +1034,8 @@ static bool hci_pio_irq_handler(struct i3c_hci *hci) pio->enabled_irqs &= ~STAT_CMD_QUEUE_READY; pio_reg_write(INTR_SIGNAL_ENABLE, pio->enabled_irqs); - DBG("(out) status: %#x/%#x", - pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); + dev_dbg(&hci->master.dev, "PIO_INTR_STATUS %#x/%#x", + pio_reg_read(INTR_STATUS), pio_reg_read(INTR_SIGNAL_ENABLE)); spin_unlock(&pio->lock); return true; } diff --git a/drivers/i3c/master/renesas-i3c.c b/drivers/i3c/master/renesas-i3c.c index 174d3dc5d2764e..275f7b9242886e 100644 --- a/drivers/i3c/master/renesas-i3c.c +++ b/drivers/i3c/master/renesas-i3c.c @@ -679,7 +679,7 @@ static int renesas_i3c_daa(struct i3c_master_controller *m) i3c_master_add_i3c_dev_locked(m, i3c->addrs[pos]); } - return ret < 0 ? 
ret : 0; + return 0; } static bool renesas_i3c_supports_ccc_cmd(struct i3c_master_controller *m, diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 701ae165b25b79..9641e66a4e5f2d 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -417,6 +417,7 @@ static int svc_i3c_master_handle_ibi(struct svc_i3c_master *master, SVC_I3C_MSTATUS_COMPLETE(val), 0, 1000); if (ret) { dev_err(master->dev, "Timeout when polling for COMPLETE\n"); + i3c_generic_ibi_recycle_slot(data->ibi_pool, slot); return ret; } @@ -517,9 +518,24 @@ static void svc_i3c_master_ibi_isr(struct svc_i3c_master *master) */ writel(SVC_I3C_MINT_IBIWON, master->regs + SVC_I3C_MSTATUS); - /* Acknowledge the incoming interrupt with the AUTOIBI mechanism */ - writel(SVC_I3C_MCTRL_REQUEST_AUTO_IBI | - SVC_I3C_MCTRL_IBIRESP_AUTO, + /* + * Write REQUEST_START_ADDR request to emit broadcast address for arbitration, + * instend of using AUTO_IBI. + * + * Using AutoIBI request may cause controller to remain in AutoIBI state when + * there is a glitch on SDA line (high->low->high). + * 1. SDA high->low, raising an interrupt to execute IBI isr. + * 2. SDA low->high. + * 3. IBI isr writes an AutoIBI request. + * 4. The controller will not start AutoIBI process because SDA is not low. + * 5. IBIWON polling times out. + * 6. Controller reamins in AutoIBI state and doesn't accept EmitStop request. + */ + writel(SVC_I3C_MCTRL_REQUEST_START_ADDR | + SVC_I3C_MCTRL_TYPE_I3C | + SVC_I3C_MCTRL_IBIRESP_MANUAL | + SVC_I3C_MCTRL_DIR(SVC_I3C_MCTRL_DIR_WRITE) | + SVC_I3C_MCTRL_ADDR(I3C_BROADCAST_ADDR), master->regs + SVC_I3C_MCTRL); /* Wait for IBIWON, should take approximately 100us */ @@ -539,10 +555,15 @@ static void svc_i3c_master_ibi_isr(struct svc_i3c_master *master) switch (ibitype) { case SVC_I3C_MSTATUS_IBITYPE_IBI: dev = svc_i3c_master_dev_from_addr(master, ibiaddr); - if (!dev || !is_events_enabled(master, SVC_I3C_EVENT_IBI)) + if (!dev || !is_events_enabled(master, SVC_I3C_EVENT_IBI)) { svc_i3c_master_nack_ibi(master); - else + } else { + if (dev->info.bcr & I3C_BCR_IBI_PAYLOAD) + svc_i3c_master_ack_ibi(master, true); + else + svc_i3c_master_ack_ibi(master, false); svc_i3c_master_handle_ibi(master, dev); + } break; case SVC_I3C_MSTATUS_IBITYPE_HOT_JOIN: if (is_events_enabled(master, SVC_I3C_EVENT_HOTJOIN)) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 028d9f031ddee3..8b506417ad2fbe 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -233,6 +233,7 @@ static u16 get_legacy_obj_type(u16 opcode) { switch (opcode) { case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RMP: return MLX5_EVENT_QUEUE_TYPE_RQ; case MLX5_CMD_OP_CREATE_QP: return MLX5_EVENT_QUEUE_TYPE_QP; diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 4c94297e17e66c..d72e89c25e5031 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -422,6 +422,7 @@ static const struct xpad_device { { 0x3537, 0x1010, "GameSir G7 SE", 0, XTYPE_XBOXONE }, { 0x366c, 0x0005, "ByoWave Proteus Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE, FLAG_DELAY_INIT }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, + { 0x37d7, 0x2501, "Flydigi Apex 5", 0, XTYPE_XBOX360 }, { 0x413d, 0x2104, "Black Shark Green Ghost Gamepad", 0, XTYPE_XBOX360 }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -578,6 
+579,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x366c), /* ByoWave controllers */ + XPAD_XBOX360_VENDOR(0x37d7), /* Flydigi Controllers */ XPAD_XBOX360_VENDOR(0x413d), /* Black Shark Green Ghost Controller */ { } }; diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 7c4f309a4cb63a..1b10528b7ca3eb 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -422,6 +422,18 @@ config KEYBOARD_MAX7359 To compile this driver as a module, choose M here: the module will be called max7359_keypad. +config KEYBOARD_MAX7360 + tristate "Maxim MAX7360 Key Switch Controller" + select INPUT_MATRIXKMAP + depends on I2C + depends on MFD_MAX7360 + help + If you say yes here you get support for the keypad controller on the + Maxim MAX7360 I/O Expander. + + To compile this driver as a module, choose M here: the module will be + called max7360_keypad. + config KEYBOARD_MPR121 tristate "Freescale MPR121 Touchkey" depends on I2C diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 8bc20ab2b103b0..636367cd1042c4 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_KEYBOARD_LPC32XX) += lpc32xx-keys.o obj-$(CONFIG_KEYBOARD_MAPLE) += maple_keyb.o obj-$(CONFIG_KEYBOARD_MATRIX) += matrix_keypad.o obj-$(CONFIG_KEYBOARD_MAX7359) += max7359_keypad.o +obj-$(CONFIG_KEYBOARD_MAX7360) += max7360-keypad.o obj-$(CONFIG_KEYBOARD_MPR121) += mpr121_touchkey.o obj-$(CONFIG_KEYBOARD_MT6779) += mt6779-keypad.o obj-$(CONFIG_KEYBOARD_MTK_PMIC) += mtk-pmic-keys.o diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index c1e53d87c8a759..f7209c8ebbccd1 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -705,6 +705,12 @@ static int cros_ec_keyb_probe(struct platform_device *pdev) ec = dev_get_drvdata(pdev->dev.parent); if (!ec) return -EPROBE_DEFER; + /* + * Even if the cros_ec_device pointer is available, still need to check + * if the device is fully registered before using it. 
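+	 * Returning -EPROBE_DEFER below asks the driver core to retry this
+	 * probe later, once the EC core has completed its registration.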
+ */ + if (!cros_ec_device_registered(ec)) + return -EPROBE_DEFER; ckdev = devm_kzalloc(dev, sizeof(*ckdev), GFP_KERNEL); if (!ckdev) diff --git a/drivers/input/keyboard/max7360-keypad.c b/drivers/input/keyboard/max7360-keypad.c new file mode 100644 index 00000000000000..503be952b0a668 --- /dev/null +++ b/drivers/input/keyboard/max7360-keypad.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct max7360_keypad { + struct input_dev *input; + unsigned int rows; + unsigned int cols; + unsigned int debounce_ms; + int irq; + struct regmap *regmap; + unsigned short keycodes[MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS]; +}; + +static irqreturn_t max7360_keypad_irq(int irq, void *data) +{ + struct max7360_keypad *max7360_keypad = data; + struct device *dev = max7360_keypad->input->dev.parent; + unsigned int val; + unsigned int row, col; + unsigned int release; + unsigned int code; + int error; + + error = regmap_read(max7360_keypad->regmap, MAX7360_REG_KEYFIFO, &val); + if (error) { + dev_err(dev, "Failed to read MAX7360 FIFO"); + return IRQ_NONE; + } + + /* FIFO overflow: ignore it and get next event. */ + if (val == MAX7360_FIFO_OVERFLOW) { + dev_warn(dev, "max7360 FIFO overflow"); + error = regmap_read_poll_timeout(max7360_keypad->regmap, MAX7360_REG_KEYFIFO, + val, val != MAX7360_FIFO_OVERFLOW, 0, 1000); + if (error) { + dev_err(dev, "Failed to empty MAX7360 FIFO"); + return IRQ_NONE; + } + } + + if (val == MAX7360_FIFO_EMPTY) { + dev_dbg(dev, "Got a spurious interrupt"); + + return IRQ_NONE; + } + + row = FIELD_GET(MAX7360_FIFO_ROW, val); + col = FIELD_GET(MAX7360_FIFO_COL, val); + release = val & MAX7360_FIFO_RELEASE; + + code = MATRIX_SCAN_CODE(row, col, get_count_order(max7360_keypad->cols)); + + dev_dbg(dev, "key[%d:%d] %s\n", row, col, release ? "release" : "press"); + + input_event(max7360_keypad->input, EV_MSC, MSC_SCAN, code); + input_report_key(max7360_keypad->input, max7360_keypad->keycodes[code], !release); + input_sync(max7360_keypad->input); + + return IRQ_HANDLED; +} + +static int max7360_keypad_open(struct input_dev *pdev) +{ + struct max7360_keypad *max7360_keypad = input_get_drvdata(pdev); + struct device *dev = max7360_keypad->input->dev.parent; + int error; + + /* Somebody is using the device: get out of sleep. */ + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_CONFIG, + MAX7360_CFG_SLEEP, MAX7360_CFG_SLEEP); + if (error) + dev_err(dev, "Failed to write max7360 configuration: %d\n", error); + + return error; +} + +static void max7360_keypad_close(struct input_dev *pdev) +{ + struct max7360_keypad *max7360_keypad = input_get_drvdata(pdev); + struct device *dev = max7360_keypad->input->dev.parent; + int error; + + /* Nobody is using the device anymore: go to sleep. 
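+	 * Clearing MAX7360_CFG_SLEEP mirrors the wake-up write in
+	 * max7360_keypad_open() and lets the controller drop back into its
+	 * low-power state until the input device is opened again.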
*/ + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_CONFIG, MAX7360_CFG_SLEEP, 0); + if (error) + dev_err(dev, "Failed to write max7360 configuration: %d\n", error); +} + +static int max7360_keypad_hw_init(struct max7360_keypad *max7360_keypad) +{ + struct device *dev = max7360_keypad->input->dev.parent; + unsigned int val; + int error; + + val = max7360_keypad->debounce_ms - MAX7360_DEBOUNCE_MIN; + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_DEBOUNCE, + MAX7360_DEBOUNCE, + FIELD_PREP(MAX7360_DEBOUNCE, val)); + if (error) + return dev_err_probe(dev, error, + "Failed to write max7360 debounce configuration\n"); + + error = regmap_write_bits(max7360_keypad->regmap, MAX7360_REG_INTERRUPT, + MAX7360_INTERRUPT_TIME_MASK, + FIELD_PREP(MAX7360_INTERRUPT_TIME_MASK, 1)); + if (error) + return dev_err_probe(dev, error, + "Failed to write max7360 keypad interrupt configuration\n"); + + return 0; +} + +static int max7360_keypad_build_keymap(struct max7360_keypad *max7360_keypad) +{ + struct input_dev *input_dev = max7360_keypad->input; + struct device *dev = input_dev->dev.parent->parent; + struct matrix_keymap_data keymap_data; + const char *propname = "linux,keymap"; + unsigned int max_keys; + int error; + int size; + + size = device_property_count_u32(dev, propname); + if (size <= 0) { + dev_err(dev, "missing or malformed property %s: %d\n", propname, size); + return size < 0 ? size : -EINVAL; + } + + max_keys = max7360_keypad->cols * max7360_keypad->rows; + if (size > max_keys) { + dev_err(dev, "%s size overflow (%d vs max %u)\n", propname, size, max_keys); + return -EINVAL; + } + + u32 *keys __free(kfree) = kmalloc_array(size, sizeof(*keys), GFP_KERNEL); + if (!keys) + return -ENOMEM; + + error = device_property_read_u32_array(dev, propname, keys, size); + if (error) { + dev_err(dev, "failed to read %s property: %d\n", propname, error); + return error; + } + + keymap_data.keymap = keys; + keymap_data.keymap_size = size; + error = matrix_keypad_build_keymap(&keymap_data, NULL, + max7360_keypad->rows, max7360_keypad->cols, + max7360_keypad->keycodes, max7360_keypad->input); + if (error) + return error; + + return 0; +} + +static int max7360_keypad_parse_fw(struct device *dev, + struct max7360_keypad *max7360_keypad, + bool *autorepeat) +{ + int error; + + error = matrix_keypad_parse_properties(dev->parent, &max7360_keypad->rows, + &max7360_keypad->cols); + if (error) + return error; + + if (!max7360_keypad->rows || !max7360_keypad->cols || + max7360_keypad->rows > MAX7360_MAX_KEY_ROWS || + max7360_keypad->cols > MAX7360_MAX_KEY_COLS) { + dev_err(dev, "Invalid number of columns or rows (%ux%u)\n", + max7360_keypad->cols, max7360_keypad->rows); + return -EINVAL; + } + + *autorepeat = device_property_read_bool(dev->parent, "autorepeat"); + + max7360_keypad->debounce_ms = MAX7360_DEBOUNCE_MIN; + error = device_property_read_u32(dev->parent, "keypad-debounce-delay-ms", + &max7360_keypad->debounce_ms); + if (error == -EINVAL) { + dev_info(dev, "Using default keypad-debounce-delay-ms: %u\n", + max7360_keypad->debounce_ms); + } else if (error < 0) { + dev_err(dev, "Failed to read keypad-debounce-delay-ms property\n"); + return error; + } + + if (!in_range(max7360_keypad->debounce_ms, MAX7360_DEBOUNCE_MIN, + MAX7360_DEBOUNCE_MAX - MAX7360_DEBOUNCE_MIN + 1)) { + dev_err(dev, "Invalid keypad-debounce-delay-ms: %u, should be between %u and %u.\n", + max7360_keypad->debounce_ms, MAX7360_DEBOUNCE_MIN, MAX7360_DEBOUNCE_MAX); + return -EINVAL; + } + + return 0; +} + +static int 
max7360_keypad_probe(struct platform_device *pdev) +{ + struct max7360_keypad *max7360_keypad; + struct device *dev = &pdev->dev; + struct input_dev *input; + struct regmap *regmap; + bool autorepeat; + int error; + int irq; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + irq = fwnode_irq_get_byname(dev_fwnode(dev->parent), "intk"); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get IRQ\n"); + + max7360_keypad = devm_kzalloc(dev, sizeof(*max7360_keypad), GFP_KERNEL); + if (!max7360_keypad) + return -ENOMEM; + + max7360_keypad->regmap = regmap; + + error = max7360_keypad_parse_fw(dev, max7360_keypad, &autorepeat); + if (error) + return error; + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + + max7360_keypad->input = input; + + input->id.bustype = BUS_I2C; + input->name = pdev->name; + input->open = max7360_keypad_open; + input->close = max7360_keypad_close; + + error = max7360_keypad_build_keymap(max7360_keypad); + if (error) + return dev_err_probe(dev, error, "Failed to build keymap\n"); + + input_set_capability(input, EV_MSC, MSC_SCAN); + if (autorepeat) + __set_bit(EV_REP, input->evbit); + + input_set_drvdata(input, max7360_keypad); + + error = devm_request_threaded_irq(dev, irq, NULL, max7360_keypad_irq, + IRQF_ONESHOT, + "max7360-keypad", max7360_keypad); + if (error) + return dev_err_probe(dev, error, "Failed to register interrupt\n"); + + error = input_register_device(input); + if (error) + return dev_err_probe(dev, error, "Could not register input device\n"); + + error = max7360_keypad_hw_init(max7360_keypad); + if (error) + return dev_err_probe(dev, error, "Failed to initialize max7360 keypad\n"); + + device_init_wakeup(dev, true); + error = dev_pm_set_wake_irq(dev, irq); + if (error) + dev_warn(dev, "Failed to set up wakeup irq: %d\n", error); + + return 0; +} + +static void max7360_keypad_remove(struct platform_device *pdev) +{ + dev_pm_clear_wake_irq(&pdev->dev); + device_init_wakeup(&pdev->dev, false); +} + +static struct platform_driver max7360_keypad_driver = { + .driver = { + .name = "max7360-keypad", + }, + .probe = max7360_keypad_probe, + .remove = max7360_keypad_remove, +}; +module_platform_driver(max7360_keypad_driver); + +MODULE_DESCRIPTION("MAX7360 Keypad driver"); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c index 50e2e792c91d26..c78d9f6d97c4f7 100644 --- a/drivers/input/keyboard/mtk-pmic-keys.c +++ b/drivers/input/keyboard/mtk-pmic-keys.c @@ -55,6 +55,7 @@ struct mtk_pmic_regs { const struct mtk_pmic_keys_regs keys_regs[MTK_PMIC_MAX_KEY_COUNT]; u32 pmic_rst_reg; u32 rst_lprst_mask; /* Long-press reset timeout bitmask */ + bool key_release_irq; }; static const struct mtk_pmic_regs mt6397_regs = { @@ -116,6 +117,7 @@ static const struct mtk_pmic_regs mt6358_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6358_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; static const struct mtk_pmic_regs mt6359_regs = { @@ -129,6 +131,7 @@ static const struct mtk_pmic_regs mt6359_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6359_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; struct mtk_pmic_keys_info { @@ -368,7 +371,7 @@ static int mtk_pmic_keys_probe(struct platform_device *pdev) if (keys->keys[index].irq < 0) return keys->keys[index].irq; - if 
(of_device_is_compatible(node, "mediatek,mt6358-keys")) { + if (mtk_pmic_regs->key_release_irq) { keys->keys[index].irq_r = platform_get_irq_byname(pdev, irqnames_r[index]); diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 0fb21c99a5e3d1..0e6b49fb54bc4b 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -230,6 +230,16 @@ config INPUT_M68K_BEEP tristate "M68k Beeper support" depends on M68K +config INPUT_MAX7360_ROTARY + tristate "Maxim MAX7360 Rotary Encoder" + depends on MFD_MAX7360 + help + If you say yes here you get support for the rotary encoder on the + Maxim MAX7360 I/O Expander. + + To compile this driver as a module, choose M here: the module will be + called max7360_rotary. + config INPUT_MAX77650_ONKEY tristate "Maxim MAX77650 ONKEY support" depends on MFD_MAX77650 @@ -506,6 +516,16 @@ config INPUT_TPS65219_PWRBUTTON To compile this driver as a module, choose M here. The module will be called tps65219-pwrbutton. +config INPUT_TPS6594_PWRBUTTON + tristate "TPS6594 Power button driver" + depends on MFD_TPS6594 + help + Say Y here if you want to enable power button reporting for + TPS6594 Power Management IC devices. + + To compile this driver as a module, choose M here. The module will + be called tps6594-pwrbutton. + config INPUT_AXP20X_PEK tristate "X-Powers AXP20X power button driver" depends on MFD_AXP20X diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index d468c8140b93da..ae857c24f48ed2 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_INPUT_IQS7222) += iqs7222.o obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o +obj-$(CONFIG_INPUT_MAX7360_ROTARY) += max7360-rotary.o obj-$(CONFIG_INPUT_MAX77650_ONKEY) += max77650-onkey.o obj-$(CONFIG_INPUT_MAX77693_HAPTIC) += max77693-haptic.o obj-$(CONFIG_INPUT_MAX8925_ONKEY) += max8925_onkey.o @@ -83,6 +84,7 @@ obj-$(CONFIG_INPUT_SPARCSPKR) += sparcspkr.o obj-$(CONFIG_INPUT_STPMIC1_ONKEY) += stpmic1_onkey.o obj-$(CONFIG_INPUT_TPS65218_PWRBUTTON) += tps65218-pwrbutton.o obj-$(CONFIG_INPUT_TPS65219_PWRBUTTON) += tps65219-pwrbutton.o +obj-$(CONFIG_INPUT_TPS6594_PWRBUTTON) += tps6594-pwrbutton.o obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON) += twl4030-pwrbutton.o obj-$(CONFIG_INPUT_TWL4030_VIBRA) += twl4030-vibra.o obj-$(CONFIG_INPUT_TWL6040_VIBRA) += twl6040-vibra.o diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index 6fac31c0d99f2b..ff23219a582ab8 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -2427,6 +2427,9 @@ static int iqs7222_parse_chan(struct iqs7222_private *iqs7222, if (error) return error; + if (!iqs7222->kp_type[chan_index][i]) + continue; + if (!dev_desc->event_offset) continue; diff --git a/drivers/input/misc/max7360-rotary.c b/drivers/input/misc/max7360-rotary.c new file mode 100644 index 00000000000000..385831ef34b633 --- /dev/null +++ b/drivers/input/misc/max7360-rotary.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX7360_ROTARY_DEFAULT_STEPS 24 + +struct max7360_rotary { + struct input_dev *input; + struct regmap *regmap; + unsigned int debounce_ms; + + unsigned int pos; + + u32 steps; + u32 axis; + bool relative_axis; + bool rollover; +}; + 
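+/*
+ * max7360_rotary_report_event() below either forwards the raw step delta
+ * on a relative axis, or folds it into ->pos for an absolute axis: the
+ * delta is first reduced modulo ->steps so that (pos + steps) stays
+ * non-negative, then the position wraps around when rollover is set, or
+ * saturates at the [0; steps - 1] bounds otherwise. Worked example: with
+ * steps = 24, pos = 1 and a delta of -3, rollover gives
+ * (1 - 3 + 24) % 24 = 22, while the clamped mode gives 0.
+ */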
+static void max7360_rotary_report_event(struct max7360_rotary *max7360_rotary, int steps) +{ + if (max7360_rotary->relative_axis) { + input_report_rel(max7360_rotary->input, max7360_rotary->axis, steps); + } else { + int pos = max7360_rotary->pos; + int maxval = max7360_rotary->steps; + + /* + * Add steps to the position. + * Make sure added steps are always in ]-maxval; maxval[ + * interval, so (pos + maxval) is always >= 0. + * Then set back pos to the [0; maxval[ interval. + */ + pos += steps % maxval; + if (max7360_rotary->rollover) + pos = (pos + maxval) % maxval; + else + pos = clamp(pos, 0, maxval - 1); + + max7360_rotary->pos = pos; + input_report_abs(max7360_rotary->input, max7360_rotary->axis, max7360_rotary->pos); + } + + input_sync(max7360_rotary->input); +} + +static irqreturn_t max7360_rotary_irq(int irq, void *data) +{ + struct max7360_rotary *max7360_rotary = data; + struct device *dev = max7360_rotary->input->dev.parent; + unsigned int val; + int error; + + error = regmap_read(max7360_rotary->regmap, MAX7360_REG_RTR_CNT, &val); + if (error < 0) { + dev_err(dev, "Failed to read rotary counter\n"); + return IRQ_NONE; + } + + if (val == 0) + return IRQ_NONE; + + max7360_rotary_report_event(max7360_rotary, sign_extend32(val, 7)); + + return IRQ_HANDLED; +} + +static int max7360_rotary_hw_init(struct max7360_rotary *max7360_rotary) +{ + struct device *dev = max7360_rotary->input->dev.parent; + int val; + int error; + + val = FIELD_PREP(MAX7360_ROT_DEBOUNCE, max7360_rotary->debounce_ms) | + FIELD_PREP(MAX7360_ROT_INTCNT, 1) | MAX7360_ROT_INTCNT_DLY; + error = regmap_write(max7360_rotary->regmap, MAX7360_REG_RTRCFG, val); + if (error) + dev_err(dev, "Failed to set max7360 rotary encoder configuration\n"); + + return error; +} + +static int max7360_rotary_probe(struct platform_device *pdev) +{ + struct max7360_rotary *max7360_rotary; + struct device *dev = &pdev->dev; + struct input_dev *input; + struct regmap *regmap; + int irq; + int error; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + irq = fwnode_irq_get_byname(dev_fwnode(dev->parent), "inti"); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get IRQ\n"); + + max7360_rotary = devm_kzalloc(dev, sizeof(*max7360_rotary), GFP_KERNEL); + if (!max7360_rotary) + return -ENOMEM; + + max7360_rotary->regmap = regmap; + + device_property_read_u32(dev->parent, "linux,axis", &max7360_rotary->axis); + max7360_rotary->rollover = device_property_read_bool(dev->parent, + "rotary-encoder,rollover"); + max7360_rotary->relative_axis = + device_property_read_bool(dev->parent, "rotary-encoder,relative-axis"); + + error = device_property_read_u32(dev->parent, "rotary-encoder,steps", + &max7360_rotary->steps); + if (error) + max7360_rotary->steps = MAX7360_ROTARY_DEFAULT_STEPS; + + device_property_read_u32(dev->parent, "rotary-debounce-delay-ms", + &max7360_rotary->debounce_ms); + if (max7360_rotary->debounce_ms > MAX7360_ROT_DEBOUNCE_MAX) + return dev_err_probe(dev, -EINVAL, "Invalid debounce timing: %u\n", + max7360_rotary->debounce_ms); + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + + max7360_rotary->input = input; + + input->id.bustype = BUS_I2C; + input->name = pdev->name; + + if (max7360_rotary->relative_axis) + input_set_capability(input, EV_REL, max7360_rotary->axis); + else + input_set_abs_params(input, max7360_rotary->axis, 0, max7360_rotary->steps, 0, 1); + + error = devm_request_threaded_irq(dev, irq, 
NULL, max7360_rotary_irq, + IRQF_ONESHOT | IRQF_SHARED, + "max7360-rotary", max7360_rotary); + if (error) + return dev_err_probe(dev, error, "Failed to register interrupt\n"); + + error = input_register_device(input); + if (error) + return dev_err_probe(dev, error, "Could not register input device\n"); + + error = max7360_rotary_hw_init(max7360_rotary); + if (error) + return dev_err_probe(dev, error, "Failed to initialize max7360 rotary\n"); + + device_init_wakeup(dev, true); + error = dev_pm_set_wake_irq(dev, irq); + if (error) + dev_warn(dev, "Failed to set up wakeup irq: %d\n", error); + + return 0; +} + +static void max7360_rotary_remove(struct platform_device *pdev) +{ + dev_pm_clear_wake_irq(&pdev->dev); + device_init_wakeup(&pdev->dev, false); +} + +static struct platform_driver max7360_rotary_driver = { + .driver = { + .name = "max7360-rotary", + }, + .probe = max7360_rotary_probe, + .remove = max7360_rotary_remove, +}; +module_platform_driver(max7360_rotary_driver); + +MODULE_DESCRIPTION("MAX7360 Rotary driver"); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); diff --git a/drivers/input/misc/mc13783-pwrbutton.c b/drivers/input/misc/mc13783-pwrbutton.c index 1c7faa9b7afe04..b83d762ae2e9f0 100644 --- a/drivers/input/misc/mc13783-pwrbutton.c +++ b/drivers/input/misc/mc13783-pwrbutton.c @@ -57,7 +57,6 @@ static irqreturn_t button_irq(int irq, void *_priv) struct mc13783_pwrb *priv = _priv; int val; - mc13xxx_irq_ack(priv->mc13783, irq); mc13xxx_reg_read(priv->mc13783, MC13783_REG_INTERRUPT_SENSE_1, &val); switch (irq) { diff --git a/drivers/input/misc/tps6594-pwrbutton.c b/drivers/input/misc/tps6594-pwrbutton.c new file mode 100644 index 00000000000000..cd039b3866dc21 --- /dev/null +++ b/drivers/input/misc/tps6594-pwrbutton.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * power button driver for TI TPS6594 PMICs + * + * Copyright (C) 2025 Critical Link LLC - https://www.criticallink.com/ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct tps6594_pwrbutton { + struct device *dev; + struct input_dev *idev; + char phys[32]; +}; + +static irqreturn_t tps6594_pb_push_irq(int irq, void *_pwr) +{ + struct tps6594_pwrbutton *pwr = _pwr; + + input_report_key(pwr->idev, KEY_POWER, 1); + pm_wakeup_event(pwr->dev, 0); + input_sync(pwr->idev); + + return IRQ_HANDLED; +} + +static irqreturn_t tps6594_pb_release_irq(int irq, void *_pwr) +{ + struct tps6594_pwrbutton *pwr = _pwr; + + input_report_key(pwr->idev, KEY_POWER, 0); + input_sync(pwr->idev); + + return IRQ_HANDLED; +} + +static int tps6594_pb_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct tps6594_pwrbutton *pwr; + struct input_dev *idev; + int error; + int push_irq; + int release_irq; + + pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL); + if (!pwr) + return -ENOMEM; + + idev = devm_input_allocate_device(dev); + if (!idev) + return -ENOMEM; + + idev->name = pdev->name; + snprintf(pwr->phys, sizeof(pwr->phys), "%s/input0", + pdev->name); + idev->phys = pwr->phys; + idev->id.bustype = BUS_I2C; + + input_set_capability(idev, EV_KEY, KEY_POWER); + + pwr->dev = dev; + pwr->idev = idev; + device_init_wakeup(dev, true); + + push_irq = platform_get_irq(pdev, 0); + if (push_irq < 0) + return -EINVAL; + + release_irq = platform_get_irq(pdev, 1); + if (release_irq < 0) + return -EINVAL; + + error = devm_request_threaded_irq(dev, push_irq, NULL, + tps6594_pb_push_irq, + IRQF_ONESHOT, + pdev->resource[0].name, pwr); + if (error) { 
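+		/* Everything above is devm-managed; just log the failure and bail. */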
+ dev_err(dev, "failed to request push IRQ #%d: %d\n", push_irq, + error); + return error; + } + + error = devm_request_threaded_irq(dev, release_irq, NULL, + tps6594_pb_release_irq, + IRQF_ONESHOT, + pdev->resource[1].name, pwr); + if (error) { + dev_err(dev, "failed to request release IRQ #%d: %d\n", + release_irq, error); + return error; + } + + error = input_register_device(idev); + if (error) { + dev_err(dev, "Can't register power button: %d\n", error); + return error; + } + + return 0; +} + +static const struct platform_device_id tps6594_pwrbtn_id_table[] = { + { "tps6594-pwrbutton", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, tps6594_pwrbtn_id_table); + +static struct platform_driver tps6594_pb_driver = { + .probe = tps6594_pb_probe, + .driver = { + .name = "tps6594_pwrbutton", + }, + .id_table = tps6594_pwrbtn_id_table, +}; +module_platform_driver(tps6594_pb_driver); + +MODULE_DESCRIPTION("TPS6594 Power Button"); +MODULE_LICENSE("GPL"); diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 6ed9fc34948cbe..1caa6c4ca435c7 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1155,6 +1155,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxHP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxKK4NAx_XxSP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, /* * A lot of modern Clevo barebones have touchpad and/or keyboard issues * after suspend fixable with the forcenorestore quirk. diff --git a/drivers/input/touchscreen/mc13783_ts.c b/drivers/input/touchscreen/mc13783_ts.c index 33635da8507999..47b8da00027fd5 100644 --- a/drivers/input/touchscreen/mc13783_ts.c +++ b/drivers/input/touchscreen/mc13783_ts.c @@ -42,8 +42,6 @@ static irqreturn_t mc13783_ts_handler(int irq, void *data) { struct mc13783_ts_priv *priv = data; - mc13xxx_irq_ack(priv->mc13xxx, irq); - /* * Kick off reading coordinates. 
Note that if work happens already * be queued for future execution (it rearms itself) it will not @@ -137,8 +135,6 @@ static int mc13783_ts_open(struct input_dev *dev) mc13xxx_lock(priv->mc13xxx); - mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TS); - ret = mc13xxx_irq_request(priv->mc13xxx, MC13XXX_IRQ_TS, mc13783_ts_handler, MC13783_TS_NAME, priv); if (ret) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 5219d7ddfdaa8b..95f63c5f6159f7 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -555,6 +555,7 @@ struct gcr3_tbl_info { }; struct amd_io_pgtable { + seqcount_t seqcount; /* Protects root/mode update */ struct io_pgtable pgtbl; int mode; u64 *root; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 8de689b2c5ed57..ba9e582a8bbe5d 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1455,12 +1455,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, PCI_FUNC(e->devid)); devid = e->devid; - for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { + for (dev_i = devid_start; dev_i <= devid; ++dev_i) pci_seg->alias_table[dev_i] = devid_to; + set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); } set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags); - set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); break; case IVHD_DEV_SPECIAL: { u8 handle, type; @@ -3067,7 +3067,8 @@ static int __init early_amd_iommu_init(void) if (!boot_cpu_has(X86_FEATURE_CX16)) { pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } /* diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index a91e71f981efb9..70c2f5b1631b05 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -130,8 +131,11 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); + write_seqcount_begin(&pgtable->seqcount); pgtable->root = pte; pgtable->mode += 1; + write_seqcount_end(&pgtable->seqcount); + amd_iommu_update_and_flush_device_table(domain); pte = NULL; @@ -153,6 +157,7 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, { unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; + unsigned int seqcount; int level, end_lvl; u64 *pte, *page; @@ -170,8 +175,14 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, } - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); + + address = PAGE_SIZE_ALIGN(address, page_size); end_lvl = PAGE_SIZE_LEVEL(page_size); @@ -249,6 +260,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, unsigned long *page_size) { int level; + unsigned int seqcount; u64 *pte; *page_size = 0; @@ -256,8 +268,12 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, if (address > PM_LEVEL_SIZE(pgtable->mode)) return NULL; - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while 
(read_seqcount_retry(&pgtable->seqcount, seqcount)); + *page_size = PTE_LEVEL_PAGE_SIZE(level); while (level > 0) { @@ -541,6 +557,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo if (!pgtable->root) return NULL; pgtable->mode = PAGE_MODE_3_LEVEL; + seqcount_init(&pgtable->seqcount); cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9c3ab9d9f69a3e..dff2d895b8abd7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1575,6 +1575,10 @@ static void switch_to_super_page(struct dmar_domain *domain, unsigned long lvl_pages = lvl_to_nr_pages(level); struct dma_pte *pte = NULL; + if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) || + !IS_ALIGNED(end_pfn + 1, lvl_pages))) + return; + while (start_pfn <= end_pfn) { if (!pte) pte = pfn_to_dma_pte(domain, start_pfn, &level, @@ -1650,7 +1654,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, nr_pages, + pages_to_remove = min_t(unsigned long, + round_down(nr_pages, lvl_pages), nr_pte_to_next_page(pte) * lvl_pages); end_pfn = iov_pfn + pages_to_remove - 1; switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 65fbd098f9e98f..4c842368289f08 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -711,6 +711,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); mutex_unlock(&igroup->lock); + iommufd_hw_pagetable_put(idev->ictx, hwpt); + /* Caller must destroy hwpt */ return hwpt; } @@ -1057,7 +1059,6 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) hwpt = iommufd_hw_pagetable_detach(idev, pasid); if (!hwpt) return; - iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD"); diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c index fc4de63b0bce64..e23d9ee4fe3806 100644 --- a/drivers/iommu/iommufd/eventq.c +++ b/drivers/iommu/iommufd/eventq.c @@ -393,12 +393,12 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, const struct file_operations *fops) { struct file *filep; - int fdno; spin_lock_init(&eventq->lock); INIT_LIST_HEAD(&eventq->deliver); init_waitqueue_head(&eventq->wait_queue); + /* The filep is fput() by the core code during failure */ filep = anon_inode_getfile(name, fops, eventq, O_RDWR); if (IS_ERR(filep)) return PTR_ERR(filep); @@ -408,10 +408,7 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, eventq->filep = filep; refcount_inc(&eventq->obj.users); - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) - fput(filep); - return fdno; + return get_unused_fd_flags(O_CLOEXEC); } static const struct file_operations iommufd_fault_fops = @@ -452,7 +449,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) return 0; out_put_fdno: put_unused_fd(fdno); - fput(fault->common.filep); return rc; } @@ -536,7 +532,6 @@ int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) out_put_fdno: put_unused_fd(fdno); - fput(veventq->common.filep); out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); out_unlock_veventqs: diff --git a/drivers/iommu/iommufd/iommufd_private.h 
b/drivers/iommu/iommufd/iommufd_private.h index 0da2a81eedfa8b..627f9b78483a0e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -454,9 +454,8 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); - lockdep_assert_not_held(&hwpt_paging->ioas->mutex); - if (hwpt_paging->auto_domain) { + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 15af7ced0501d6..ce775fbbae94e7 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -23,6 +23,7 @@ #include "iommufd_test.h" struct iommufd_object_ops { + size_t file_offset; void (*pre_destroy)(struct iommufd_object *obj); void (*destroy)(struct iommufd_object *obj); void (*abort)(struct iommufd_object *obj); @@ -121,6 +122,10 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) old = xas_store(&xas, NULL); xa_unlock(&ictx->objects); WARN_ON(old != XA_ZERO_ENTRY); + + if (WARN_ON(!refcount_dec_and_test(&obj->users))) + return; + kfree(obj); } @@ -131,10 +136,30 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - if (iommufd_object_ops[obj->type].abort) - iommufd_object_ops[obj->type].abort(obj); + const struct iommufd_object_ops *ops = &iommufd_object_ops[obj->type]; + + if (ops->file_offset) { + struct file **filep = ((void *)obj) + ops->file_offset; + + /* + * A file should hold a users refcount while the file is open + * and put it back in its release. The file should hold a + * pointer to obj in its private data. Normal fput() is + * deferred to a workqueue and can get out of order with the + * following kfree(obj). Using the sync version ensures the + * release happens immediately. During abort we require that the + * file refcount be one at this point - meaning the object alloc + * function cannot do anything to allow another thread to take a + * refcount prior to a guaranteed success. + */ + if (*filep) + __fput_sync(*filep); + } + + if (ops->abort) + ops->abort(obj); else - iommufd_object_ops[obj->type].destroy(obj); + ops->destroy(obj); iommufd_object_abort(ictx, obj); } @@ -550,16 +575,23 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_EXEC) return -EPERM; + mtree_lock(&ictx->mt_mmap); /* vma->vm_pgoff carries a page-shifted start position to an immap */ immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT); - if (!immap) + if (!immap || !refcount_inc_not_zero(&immap->owner->users)) { + mtree_unlock(&ictx->mt_mmap); return -ENXIO; + } + mtree_unlock(&ictx->mt_mmap); + /* * mtree_load() returns the immap for any contained mmio_addr, so only * allow the exact immap to be mapped */ - if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) - return -ENXIO; + if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) { + rc = -ENXIO; + goto err_refcount; + } vma->vm_pgoff = 0; vma->vm_private_data = immap; @@ -570,10 +602,11 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) immap->mmio_addr >> PAGE_SHIFT, length, vma->vm_page_prot); if (rc) - return rc; + goto err_refcount; + return 0; - /* vm_ops.open won't be called for mmap itself.
*/ - refcount_inc(&immap->owner->users); +err_refcount: + refcount_dec(&immap->owner->users); return rc; } @@ -651,6 +684,12 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) } EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); +#define IOMMUFD_FILE_OFFSET(_struct, _filep, _obj) \ + .file_offset = (offsetof(_struct, _filep) + \ + BUILD_BUG_ON_ZERO(!__same_type( \ + struct file *, ((_struct *)NULL)->_filep)) + \ + BUILD_BUG_ON_ZERO(offsetof(_struct, _obj))) + static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_ACCESS] = { .destroy = iommufd_access_destroy_object, @@ -661,6 +700,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { }, [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, + IOMMUFD_FILE_OFFSET(struct iommufd_fault, common.filep, common.obj), }, [IOMMUFD_OBJ_HW_QUEUE] = { .destroy = iommufd_hw_queue_destroy, @@ -683,6 +723,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VEVENTQ] = { .destroy = iommufd_veventq_destroy, .abort = iommufd_veventq_abort, + IOMMUFD_FILE_OFFSET(struct iommufd_veventq, common.filep, common.obj), }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 9c80d61deb2c0b..aa576736d60baa 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -612,6 +612,23 @@ static u64 get_iota_region_flag(struct s390_domain *domain) } } +static bool reg_ioat_propagate_error(int cc, u8 status) +{ + /* + * If the device is in the error state the reset routine + * will register the IOAT of the newly set domain on re-enable + */ + if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return false; + /* + * If the device was removed treat registration as success + * and let the subsequent error event trigger tear down. + */ + if (cc == ZPCI_CC_INVAL_HANDLE) + return false; + return cc != ZPCI_CC_OK; +} + static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev, struct iommu_domain *domain, u8 *status) { @@ -696,7 +713,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev->dma_table = s390_domain->dma_table; zdev_s390_domain_update(zdev, domain); @@ -1032,7 +1049,8 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) lockdep_assert_held(&zdev->dom_lock); - if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED || + zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) return NULL; s390_domain = to_s390_domain(zdev->s390_domain); @@ -1123,12 +1141,7 @@ static int s390_attach_dev_identity(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - - /* - * If the device is undergoing error recovery the reset code - * will re-establish the new domain. 
- */ - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev_s390_domain_update(zdev, domain); diff --git a/drivers/irqchip/irq-aspeed-scu-ic.c b/drivers/irqchip/irq-aspeed-scu-ic.c index 1c7045467c4860..5584e0f82cce83 100644 --- a/drivers/irqchip/irq-aspeed-scu-ic.c +++ b/drivers/irqchip/irq-aspeed-scu-ic.c @@ -1,61 +1,78 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Aspeed AST24XX, AST25XX, and AST26XX SCU Interrupt Controller + * Aspeed AST24XX, AST25XX, AST26XX, and AST27XX SCU Interrupt Controller * Copyright 2019 IBM Corporation * * Eddie James */ #include +#include #include #include #include #include -#include +#include #include -#include -#define ASPEED_SCU_IC_REG 0x018 -#define ASPEED_SCU_IC_SHIFT 0 -#define ASPEED_SCU_IC_ENABLE GENMASK(15, ASPEED_SCU_IC_SHIFT) -#define ASPEED_SCU_IC_NUM_IRQS 7 #define ASPEED_SCU_IC_STATUS GENMASK(28, 16) #define ASPEED_SCU_IC_STATUS_SHIFT 16 +#define AST2700_SCU_IC_STATUS GENMASK(15, 0) + +struct aspeed_scu_ic_variant { + const char *compatible; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; + unsigned long ier; + unsigned long isr; +}; -#define ASPEED_AST2600_SCU_IC0_REG 0x560 -#define ASPEED_AST2600_SCU_IC0_SHIFT 0 -#define ASPEED_AST2600_SCU_IC0_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC0_SHIFT) -#define ASPEED_AST2600_SCU_IC0_NUM_IRQS 6 +#define SCU_VARIANT(_compat, _shift, _enable, _num, _ier, _isr) { \ + .compatible = _compat, \ + .irq_shift = _shift, \ + .irq_enable = _enable, \ + .num_irqs = _num, \ + .ier = _ier, \ + .isr = _isr, \ +} -#define ASPEED_AST2600_SCU_IC1_REG 0x570 -#define ASPEED_AST2600_SCU_IC1_SHIFT 4 -#define ASPEED_AST2600_SCU_IC1_ENABLE \ - GENMASK(5, ASPEED_AST2600_SCU_IC1_SHIFT) -#define ASPEED_AST2600_SCU_IC1_NUM_IRQS 2 +static const struct aspeed_scu_ic_variant scu_ic_variants[] __initconst = { + SCU_VARIANT("aspeed,ast2400-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2500-scu-ic", 0, GENMASK(15, 0), 7, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic0", 0, GENMASK(5, 0), 6, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2600-scu-ic1", 4, GENMASK(5, 4), 2, 0x00, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic0", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic1", 0, GENMASK(3, 0), 4, 0x00, 0x04), + SCU_VARIANT("aspeed,ast2700-scu-ic2", 0, GENMASK(3, 0), 4, 0x04, 0x00), + SCU_VARIANT("aspeed,ast2700-scu-ic3", 0, GENMASK(1, 0), 2, 0x04, 0x00), +}; struct aspeed_scu_ic { - unsigned long irq_enable; - unsigned long irq_shift; - unsigned int num_irqs; - unsigned int reg; - struct regmap *scu; - struct irq_domain *irq_domain; + unsigned long irq_enable; + unsigned long irq_shift; + unsigned int num_irqs; + void __iomem *base; + struct irq_domain *irq_domain; + unsigned long ier; + unsigned long isr; }; -static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) +static inline bool scu_has_split_isr(struct aspeed_scu_ic *scu) +{ + return scu->ier != scu->isr; +} + +static void aspeed_scu_ic_irq_handler_combined(struct irq_desc *desc) { - unsigned int sts; - unsigned long bit; - unsigned long enabled; - unsigned long max; - unsigned long status; struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - unsigned int mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; + unsigned long bit, enabled, max, status; + unsigned int sts, mask; chained_irq_enter(chip, desc); + mask = scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT; /* * The 
SCU IC has just one register to control its operation and read * status. The interrupt enable bits occupy the lower 16 bits of the @@ -66,7 +83,7 @@ static void aspeed_scu_ic_irq_handler(struct irq_desc *desc) * shifting the status down to get the mapping and then back up to * clear the bit. */ - regmap_read(scu_ic->scu, scu_ic->reg, &sts); + sts = readl(scu_ic->base); enabled = sts & scu_ic->irq_enable; status = (sts >> ASPEED_SCU_IC_STATUS_SHIFT) & enabled; @@ -74,43 +91,83 @@ max = scu_ic->num_irqs + bit; for_each_set_bit_from(bit, &status, max) { - generic_handle_domain_irq(scu_ic->irq_domain, - bit - scu_ic->irq_shift); + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + writel((readl(scu_ic->base) & ~mask) | BIT(bit + ASPEED_SCU_IC_STATUS_SHIFT), + scu_ic->base); + } + + chained_irq_exit(chip, desc); +} + +static void aspeed_scu_ic_irq_handler_split(struct irq_desc *desc) +{ + struct aspeed_scu_ic *scu_ic = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long bit, enabled, max, status; + unsigned int sts, mask; + chained_irq_enter(chip, desc); + + mask = scu_ic->irq_enable; + sts = readl(scu_ic->base + scu_ic->ier); + enabled = sts & mask; + sts = readl(scu_ic->base + scu_ic->isr); + status = sts & enabled; + + bit = scu_ic->irq_shift; + max = scu_ic->num_irqs + bit; + + for_each_set_bit_from(bit, &status, max) { + generic_handle_domain_irq(scu_ic->irq_domain, bit - scu_ic->irq_shift); + /* Clear interrupt */ + writel(BIT(bit), scu_ic->base + scu_ic->isr); } chained_irq_exit(chip, desc); } -static void aspeed_scu_ic_irq_mask(struct irq_data *data) +static void aspeed_scu_ic_irq_mask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); - unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift) | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the mask * operation from clearing the status bits, they should be under the * mask and written with 0. */ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, 0); + writel(readl(scu_ic->base) & ~mask, scu_ic->base); } -static void aspeed_scu_ic_irq_unmask(struct irq_data *data) +static void aspeed_scu_ic_irq_unmask_combined(struct irq_data *data) { struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); - unsigned int mask = bit | - (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); + unsigned int mask = bit | (scu_ic->irq_enable << ASPEED_SCU_IC_STATUS_SHIFT); /* * Status bits are cleared by writing 1. In order to prevent the unmask * operation from clearing the status bits, they should be under the * mask and written with 0.
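 *
 * Worked example (combined layout, assuming hwirq 2 and irq_shift 0):
 * bit = BIT(2) selects the enable bit in [15:0], while [31:16] are
 * write-1-to-clear status bits. mask covers bit 2 plus every enabled
 * status bit, so the readl()/writel() sequence below sets only the
 * enable bit and writes 0 to all status bits, acking nothing by
 * accident.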
*/ - regmap_update_bits(scu_ic->scu, scu_ic->reg, mask, bit); + writel((readl(scu_ic->base) & ~mask) | bit, scu_ic->base); +} + +static void aspeed_scu_ic_irq_mask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int mask = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base + scu_ic->ier) & ~mask, scu_ic->base + scu_ic->ier); +} + +static void aspeed_scu_ic_irq_unmask_split(struct irq_data *data) +{ + struct aspeed_scu_ic *scu_ic = irq_data_get_irq_chip_data(data); + unsigned int bit = BIT(data->hwirq + scu_ic->irq_shift); + + writel(readl(scu_ic->base + scu_ic->ier) | bit, scu_ic->base + scu_ic->ier); } static int aspeed_scu_ic_irq_set_affinity(struct irq_data *data, @@ -120,17 +177,29 @@ return -EINVAL; } -static struct irq_chip aspeed_scu_ic_chip = { +static struct irq_chip aspeed_scu_ic_chip_combined = { .name = "aspeed-scu-ic", - .irq_mask = aspeed_scu_ic_irq_mask, - .irq_unmask = aspeed_scu_ic_irq_unmask, - .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, + .irq_mask = aspeed_scu_ic_irq_mask_combined, + .irq_unmask = aspeed_scu_ic_irq_unmask_combined, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, +}; + +static struct irq_chip aspeed_scu_ic_chip_split = { + .name = "ast2700-scu-ic", + .irq_mask = aspeed_scu_ic_irq_mask_split, + .irq_unmask = aspeed_scu_ic_irq_unmask_split, + .irq_set_affinity = aspeed_scu_ic_irq_set_affinity, }; static int aspeed_scu_ic_map(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq) { - irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip, handle_level_irq); + struct aspeed_scu_ic *scu_ic = domain->host_data; + + if (scu_has_split_isr(scu_ic)) + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_split, handle_level_irq); + else + irq_set_chip_and_handler(irq, &aspeed_scu_ic_chip_combined, handle_level_irq); irq_set_chip_data(irq, domain->host_data); return 0; @@ -143,21 +212,21 @@ static const struct irq_domain_ops aspeed_scu_ic_domain_ops = { static int aspeed_scu_ic_of_init_common(struct aspeed_scu_ic *scu_ic, struct device_node *node) { - int irq; - int rc = 0; + int irq, rc = 0; - if (!node->parent) { - rc = -ENODEV; + scu_ic->base = of_iomap(node, 0); + if (!scu_ic->base) { + rc = -ENOMEM; goto err; } - scu_ic->scu = syscon_node_to_regmap(node->parent); - if (IS_ERR(scu_ic->scu)) { - rc = PTR_ERR(scu_ic->scu); - goto err; + if (scu_has_split_isr(scu_ic)) { + writel(AST2700_SCU_IC_STATUS, scu_ic->base + scu_ic->isr); + writel(0, scu_ic->base + scu_ic->ier); + } else { + writel(ASPEED_SCU_IC_STATUS, scu_ic->base); + writel(0, scu_ic->base); } - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_STATUS, ASPEED_SCU_IC_STATUS); - regmap_write_bits(scu_ic->scu, scu_ic->reg, ASPEED_SCU_IC_ENABLE, 0); irq = irq_of_parse_and_map(node, 0); if (!irq) { @@ -166,75 +235,60 @@ } scu_ic->irq_domain = irq_domain_create_linear(of_fwnode_handle(node), scu_ic->num_irqs, - &aspeed_scu_ic_domain_ops, - scu_ic); + &aspeed_scu_ic_domain_ops, scu_ic); if (!scu_ic->irq_domain) { rc = -ENOMEM; goto err; } - irq_set_chained_handler_and_data(irq, aspeed_scu_ic_irq_handler, + irq_set_chained_handler_and_data(irq, scu_has_split_isr(scu_ic) ?
+ aspeed_scu_ic_irq_handler_split : + aspeed_scu_ic_irq_handler_combined, scu_ic); return 0; err: kfree(scu_ic); - return rc; } -static int __init aspeed_scu_ic_of_init(struct device_node *node, - struct device_node *parent) +static const struct aspeed_scu_ic_variant *aspeed_scu_ic_find_variant(struct device_node *np) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); - - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_SCU_IC_ENABLE; - scu_ic->irq_shift = ASPEED_SCU_IC_SHIFT; - scu_ic->num_irqs = ASPEED_SCU_IC_NUM_IRQS; - scu_ic->reg = ASPEED_SCU_IC_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); + for (int i = 0; i < ARRAY_SIZE(scu_ic_variants); i++) { + if (of_device_is_compatible(np, scu_ic_variants[i].compatible)) + return &scu_ic_variants[i]; + } + return NULL; } -static int __init aspeed_ast2600_scu_ic0_of_init(struct device_node *node, - struct device_node *parent) +static int __init aspeed_scu_ic_of_init(struct device_node *node, struct device_node *parent) { - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); + const struct aspeed_scu_ic_variant *variant; + struct aspeed_scu_ic *scu_ic; - if (!scu_ic) - return -ENOMEM; - - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC0_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC0_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC0_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC0_REG; - - return aspeed_scu_ic_of_init_common(scu_ic, node); -} - -static int __init aspeed_ast2600_scu_ic1_of_init(struct device_node *node, - struct device_node *parent) -{ - struct aspeed_scu_ic *scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); + variant = aspeed_scu_ic_find_variant(node); + if (!variant) + return -ENODEV; + scu_ic = kzalloc(sizeof(*scu_ic), GFP_KERNEL); if (!scu_ic) return -ENOMEM; - scu_ic->irq_enable = ASPEED_AST2600_SCU_IC1_ENABLE; - scu_ic->irq_shift = ASPEED_AST2600_SCU_IC1_SHIFT; - scu_ic->num_irqs = ASPEED_AST2600_SCU_IC1_NUM_IRQS; - scu_ic->reg = ASPEED_AST2600_SCU_IC1_REG; + scu_ic->irq_enable = variant->irq_enable; + scu_ic->irq_shift = variant->irq_shift; + scu_ic->num_irqs = variant->num_irqs; + scu_ic->ier = variant->ier; + scu_ic->isr = variant->isr; return aspeed_scu_ic_of_init_common(scu_ic, node); } IRQCHIP_DECLARE(ast2400_scu_ic, "aspeed,ast2400-scu-ic", aspeed_scu_ic_of_init); IRQCHIP_DECLARE(ast2500_scu_ic, "aspeed,ast2500-scu-ic", aspeed_scu_ic_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", - aspeed_ast2600_scu_ic0_of_init); -IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", - aspeed_ast2600_scu_ic1_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic0, "aspeed,ast2600-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2600_scu_ic1, "aspeed,ast2600-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic0, "aspeed,ast2700-scu-ic0", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic1, "aspeed,ast2700-scu-ic1", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic2, "aspeed,ast2700-scu-ic2", aspeed_scu_ic_of_init); +IRQCHIP_DECLARE(ast2700_scu_ic3, "aspeed,ast2700-scu-ic3", aspeed_scu_ic_of_init); diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 24ef5af569fe4e..8a3410c2b7b575 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -153,14 +153,19 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, { msi_alloc_info_t *info = args; struct v2m_data *v2m = NULL, *tmp; - int hwirq, offset, i, err = 0; + int hwirq, i, err = 0; + unsigned long 
offset; + unsigned long align_mask = nr_irqs - 1; spin_lock(&v2m_lock); list_for_each_entry(tmp, &v2m_nodes, entry) { - offset = bitmap_find_free_region(tmp->bm, tmp->nr_spis, - get_count_order(nr_irqs)); - if (offset >= 0) { + unsigned long align_off = tmp->spi_start - (tmp->spi_start & ~align_mask); + + offset = bitmap_find_next_zero_area_off(tmp->bm, tmp->nr_spis, 0, + nr_irqs, align_mask, align_off); + if (offset < tmp->nr_spis) { v2m = tmp; + bitmap_set(v2m->bm, offset, nr_irqs); break; } } diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dbeb85677b08cb..3de351e66ee844 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1766,8 +1766,9 @@ static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - unsigned int type, ret, ppi_idx; + unsigned int type, ppi_idx; irq_hw_number_t hwirq; + int ret; /* Not for us */ if (fwspec->fwnode != d->fwnode) diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index 13c035727e32b0..ce2732d649a3e9 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -571,7 +571,7 @@ static void __init gicv5_irs_init_bases(struct gicv5_irs_chip_data *irs_data, FIELD_PREP(GICV5_IRS_CR1_IST_RA, GICV5_NO_READ_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_IC, GICV5_NON_CACHE) | FIELD_PREP(GICV5_IRS_CR1_OC, GICV5_NON_CACHE); - irs_data->flags |= IRS_FLAGS_NON_COHERENT; + irs_data->flags |= IRS_FLAGS_NON_COHERENT; } else { cr1 = FIELD_PREP(GICV5_IRS_CR1_VPED_WA, GICV5_WRITE_ALLOC) | FIELD_PREP(GICV5_IRS_CR1_VPED_RA, GICV5_READ_ALLOC) | diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 9290ac741949ca..554485f0be1fbf 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -191,9 +191,9 @@ static int gicv5_its_create_itt_two_level(struct gicv5_its_chip_data *its, unsigned int num_events) { unsigned int l1_bits, l2_bits, span, events_per_l2_table; - unsigned int i, complete_tables, final_span, num_ents; + unsigned int complete_tables, final_span, num_ents; __le64 *itt_l1, *itt_l2, **l2ptrs; - int ret; + int i, ret; u64 val; ret = gicv5_its_l2sz_to_l2_bits(itt_l2sz); @@ -768,8 +768,6 @@ static struct gicv5_its_dev *gicv5_its_alloc_device(struct gicv5_its_chip_data * goto out_dev_free; } - gicv5_its_device_cache_inv(its, its_dev); - its_dev->its_node = its; its_dev->event_map = (unsigned long *)bitmap_zalloc(its_dev->num_events, GFP_KERNEL); @@ -949,15 +947,18 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - lpi = gicv5_alloc_lpi(); + ret = gicv5_alloc_lpi(); if (ret < 0) { pr_debug("Failed to find free LPI!\n"); - goto out_eventid; + goto out_free_irqs; } + lpi = ret; ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) - goto out_free_lpi; + if (ret) { + gicv5_free_lpi(lpi); + goto out_free_irqs; + } /* * Store eventid and deviceid into the hwirq for later use. 
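 *
 * Sketch of the encoding (illustrative masks, not the driver's real
 * macros): with a 64-bit hwirq the two IDs can be packed as
 *
 *   irqd->hwirq = FIELD_PREP(GENMASK_ULL(63, 32), device_id) |
 *                 FIELD_PREP(GENMASK_ULL(31, 0), event_id_base + i);
 *
 * so later paths can recover both with FIELD_GET() instead of a
 * separate lookup, much as the error path below pulls the LPI back
 * out of parent_data->hwirq.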
@@ -977,8 +978,13 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi return 0; -out_free_lpi: - gicv5_free_lpi(lpi); +out_free_irqs: + while (--i >= 0) { + irqd = irq_domain_get_irq_data(domain, virq + i); + gicv5_free_lpi(irqd->parent_data->hwirq); + irq_domain_reset_irq_data(irqd); + irq_domain_free_irqs_parent(domain, virq + i, 1); + } out_eventid: gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs); return ret; diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index b2860eb2d32c55..39e5a72ccd3c83 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -46,6 +46,7 @@ #define EIOINTC_ALL_ENABLE_VEC_MASK(vector) (EIOINTC_ALL_ENABLE & ~BIT(vector & 0x1f)) #define EIOINTC_REG_ENABLE_VEC(vector) (EIOINTC_REG_ENABLE + ((vector >> 5) << 2)) #define EIOINTC_USE_CPU_ENCODE BIT(0) +#define EIOINTC_ROUTE_MULT_IP BIT(1) #define MAX_EIO_NODES (NR_CPUS / CORES_PER_EIO_NODE) @@ -59,6 +60,14 @@ #define EIOINTC_REG_ROUTE_VEC_MASK(vector) (0xff << EIOINTC_REG_ROUTE_VEC_SHIFT(vector)) static int nr_pics; +struct eiointc_priv; + +struct eiointc_ip_route { + struct eiointc_priv *priv; + /* ISR register window routed to this destination IP */ + int start; + int end; +}; struct eiointc_priv { u32 node; @@ -68,6 +77,8 @@ struct eiointc_priv { struct fwnode_handle *domain_handle; struct irq_domain *eiointc_domain; int flags; + irq_hw_number_t parent_hwirq; + struct eiointc_ip_route route_info[VEC_REG_COUNT]; }; static struct eiointc_priv *eiointc_priv[MAX_IO_PICS]; @@ -188,6 +199,7 @@ static int eiointc_router_init(unsigned int cpu) { int i, bit, cores, index, node; unsigned int data; + int hwirq, mask; node = cpu_to_eio_node(cpu); index = eiointc_index(node); @@ -197,6 +209,13 @@ static int eiointc_router_init(unsigned int cpu) return -EINVAL; } + /* Enable the cpu interrupt pins used by the eiointc */ + hwirq = eiointc_priv[index]->parent_hwirq; + mask = BIT(hwirq); + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) + mask |= BIT(hwirq + 1) | BIT(hwirq + 2) | BIT(hwirq + 3); + set_csr_ecfg(mask); + if (!(eiointc_priv[index]->flags & EIOINTC_USE_CPU_ENCODE)) cores = CORES_PER_EIO_NODE; else @@ -211,8 +230,31 @@ } for (i = 0; i < eiointc_priv[0]->vec_count / 32 / 4; i++) { - bit = BIT(1 + index); /* Route to IP[1 + index] */ - data = bit | (bit << 8) | (bit << 16) | (bit << 24); + /* + * Route to an interrupt pin; a relative offset is used here. + * Offset 0 means routing to IP0 and so on. + * + * If EIOINTC_ROUTE_MULT_IP is set in flags, + * every 64 vectors route to a different consecutive + * IP, otherwise all vectors route to the same IP + */ + if (eiointc_priv[index]->flags & EIOINTC_ROUTE_MULT_IP) { + /* The first 64 vectors route to hwirq */ + bit = BIT(hwirq++ - INT_HWI0); + data = bit | (bit << 8); + + /* The second 64 vectors route to hwirq + 1 */ + bit = BIT(hwirq++ - INT_HWI0); + data |= (bit << 16) | (bit << 24); + + /* + * Route to hwirq + 2/hwirq + 3 separately + * in the next loop iteration + */ + } else { + bit = BIT(hwirq - INT_HWI0); + data = bit | (bit << 8) | (bit << 16) | (bit << 24); + } iocsr_write32(data, EIOINTC_REG_IPMAP + i * 4); } @@ -241,15 +283,22 @@ static void eiointc_irq_dispatch(struct irq_desc *desc) { - int i; - u64 pending; - bool handled = false; + struct eiointc_ip_route *info = irq_desc_get_handler_data(desc); struct irq_chip *chip = irq_desc_get_chip(desc); - struct eiointc_priv *priv =
irq_desc_get_handler_data(desc); + bool handled = false; + u64 pending; + int i; chained_irq_enter(chip, desc); - for (i = 0; i < eiointc_priv[0]->vec_count / VEC_COUNT_PER_REG; i++) { + /* + * If EIOINTC_ROUTE_MULT_IP is set, every 64 interrupt vectors in + * the eiointc interrupt controller route to a different cpu interrupt pin. + * + * Every cpu interrupt pin has its own irq handler, so it is ok to + * read the ISR for just these 64 interrupt vectors rather than all vectors + */ + for (i = info->start; i < info->end; i++) { pending = iocsr_read64(EIOINTC_REG_ISR + (i << 3)); /* Skip handling if pending bitmap is zero */ @@ -262,7 +311,7 @@ static void eiointc_irq_dispatch(struct irq_desc *desc) int bit = __ffs(pending); int irq = bit + VEC_COUNT_PER_REG * i; - generic_handle_domain_irq(priv->eiointc_domain, irq); + generic_handle_domain_irq(info->priv->eiointc_domain, irq); pending &= ~BIT(bit); handled = true; } @@ -462,8 +511,33 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq, } eiointc_priv[nr_pics++] = priv; + /* + * Only the first eiointc device on a VM supports routing to + * different CPU interrupt pins. Later eiointc devices fall back + * to the generic method if there are multiple eiointc devices + * in the future + */ + if (cpu_has_hypervisor && (nr_pics == 1)) { + priv->flags |= EIOINTC_ROUTE_MULT_IP; + priv->parent_hwirq = INT_HWI0; + } + + if (priv->flags & EIOINTC_ROUTE_MULT_IP) { + for (i = 0; i < priv->vec_count / VEC_COUNT_PER_REG; i++) { + priv->route_info[i].start = priv->parent_hwirq - INT_HWI0 + i; + priv->route_info[i].end = priv->route_info[i].start + 1; + priv->route_info[i].priv = priv; + parent_irq = get_percpu_irq(priv->parent_hwirq + i); + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[i]); + } + } else { + priv->route_info[0].start = 0; + priv->route_info[0].end = priv->vec_count / VEC_COUNT_PER_REG; + priv->route_info[0].priv = priv; + irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, + &priv->route_info[0]); + } eiointc_router_init(0); - irq_set_chained_handler_and_data(parent_irq, eiointc_irq_dispatch, priv); if (nr_pics == 1) { register_syscore_ops(&eiointc_syscore_ops); @@ -495,7 +569,7 @@ int __init eiointc_acpi_init(struct irq_domain *parent, priv->vec_count = VEC_COUNT; priv->node = acpi_eiointc->node; - + priv->parent_hwirq = acpi_eiointc->cascade; parent_irq = irq_create_mapping(parent, acpi_eiointc->cascade); ret = eiointc_init(priv, parent_irq, acpi_eiointc->node_map); @@ -527,8 +601,9 @@ int __init eiointc_acpi_init(struct irq_domain *parent, static int __init eiointc_of_init(struct device_node *of_node, struct device_node *parent) { - int parent_irq, ret; struct eiointc_priv *priv; + struct irq_data *irq_data; + int parent_irq, ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -544,6 +619,12 @@ static int __init eiointc_of_init(struct device_node *of_node, if (ret < 0) goto out_free_priv; + irq_data = irq_get_irq_data(parent_irq); + if (!irq_data) { + ret = -ENODEV; + goto out_free_priv; + } + /* * In particular, the number of devices supported by the LS2K0500 * extended I/O interrupt vector is 128.
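 *
 * As a worked example of the routing above (illustrative numbers): with
 * EIOINTC_ROUTE_MULT_IP set and parent_hwirq == INT_HWI0, a 128-vector
 * controller ends up with two route_info windows - vectors 0-63 (ISR
 * register 0) dispatched from INT_HWI0 and vectors 64-127 (ISR
 * register 1) from INT_HWI1 - so each chained handler polls only the
 * register its own pin can actually raise.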
@@ -552,7 +633,7 @@ static int __init eiointc_of_init(struct device_node *of_node, priv->vec_count = 128; else priv->vec_count = VEC_COUNT; - + priv->parent_hwirq = irqd_to_hwirq(irq_data); priv->node = 0; priv->domain_handle = of_fwnode_handle(of_node); diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c index 2d4c3ec128b8f2..912bf50a5c7ca7 100644 --- a/drivers/irqchip/irq-loongson-pch-lpc.c +++ b/drivers/irqchip/irq-loongson-pch-lpc.c @@ -200,8 +200,13 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent, goto iounmap_base; } - priv->lpc_domain = irq_domain_create_linear(irq_handle, LPC_COUNT, - &pch_lpc_domain_ops, priv); + /* + * The LPC interrupt controller is a legacy i8259-compatible device, + * which requires a static 1:1 mapping for IRQs 0-15. + * Use irq_domain_create_legacy to establish this static mapping early. + */ + priv->lpc_domain = irq_domain_create_legacy(irq_handle, LPC_COUNT, 0, 0, + &pch_lpc_domain_ops, priv); if (!priv->lpc_domain) { pr_err("Failed to create IRQ domain\n"); goto free_irq_handle; diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index 908944009c210b..d5eefc3d72155c 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -112,6 +112,20 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, */ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY)) chip->irq_set_affinity = msi_domain_set_affinity; + + /* + * If the parent domain insists on being in charge of masking, obey + * blindly. The interrupt is un-masked at the PCI level on startup + * and masked on shutdown to prevent rogue interrupts after the + * driver freed the interrupt. Not masking it at the PCI level + * speeds up operation for disable/enable_irq() as it avoids + * getting all the way out to the PCI device. 
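+	 *
+	 * A parent domain opts in to this by setting the flag in the
+	 * required_flags of its msi_parent_ops, e.g. (sketch, mirroring
+	 * what the SG2042/SG2044 driver further below does):
+	 *
+	 *   #define MY_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+	 *                                  MSI_FLAG_PCI_MSI_MASK_PARENT)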
+ */ + if (info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT) { + chip->irq_mask = irq_chip_mask_parent; + chip->irq_unmask = irq_chip_unmask_parent; + } + return true; } EXPORT_SYMBOL_GPL(msi_lib_init_dev_msi_info); diff --git a/drivers/irqchip/irq-nvic.c b/drivers/irqchip/irq-nvic.c index 76e11cac96318c..2191a2b795785b 100644 --- a/drivers/irqchip/irq-nvic.c +++ b/drivers/irqchip/irq-nvic.c @@ -73,8 +73,9 @@ static int __init nvic_of_init(struct device_node *node, struct device_node *parent) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; - unsigned int irqs, i, ret, numbanks; + unsigned int irqs, i, numbanks; void __iomem *nvic_base; + int ret; numbanks = (readl_relaxed(V7M_SCS_ICTR) & V7M_SCS_ICTR_INTLINESNUM_MASK) + 1; diff --git a/drivers/irqchip/irq-renesas-rza1.c b/drivers/irqchip/irq-renesas-rza1.c index a697eb55ac90e4..6047a524ac77e2 100644 --- a/drivers/irqchip/irq-renesas-rza1.c +++ b/drivers/irqchip/irq-renesas-rza1.c @@ -142,11 +142,12 @@ static const struct irq_domain_ops rza1_irqc_domain_ops = { static int rza1_irqc_parse_map(struct rza1_irqc_priv *priv, struct device_node *gic_node) { - unsigned int imaplen, i, j, ret; struct device *dev = priv->dev; + unsigned int imaplen, i, j; struct device_node *ipar; const __be32 *imap; u32 intsize; + int ret; imap = of_get_property(dev->of_node, "interrupt-map", &imaplen); if (!imap) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 360d88687e4f58..2a54adeb4cc714 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -578,7 +578,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { pm_runtime_put(dev); - return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); + return -ENOMEM; } register_syscore_ops(&rzg2l_irqc_syscore_ops); diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c index bcfddc51bc6a18..f7cf0dc72eabbf 100644 --- a/drivers/irqchip/irq-sg2042-msi.c +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -30,6 +30,7 @@ struct sg204x_msi_chip_info { * @doorbell_addr: see TRM, 10.1.32, GP_INTR0_SET * @irq_first: First vector number at which MSIs start * @num_irqs: Number of vectors for MSIs + * @irq_type: IRQ type for MSIs * @msi_map: mapping for allocated MSI vectors.
* @msi_map_lock: Lock for msi_map * @chip_info: chip specific information @@ -41,6 +42,7 @@ struct sg204x_msi_chipdata { u32 irq_first; u32 num_irqs; + unsigned int irq_type; unsigned long *msi_map; struct mutex msi_map_lock; @@ -85,6 +87,8 @@ static void sg2042_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static const struct irq_chip sg2042_msi_middle_irq_chip = { .name = "SG2042 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2042_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -114,6 +118,8 @@ static void sg2044_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *m static struct irq_chip sg2044_msi_middle_irq_chip = { .name = "SG2044 MSI", + .irq_startup = irq_chip_startup_parent, + .irq_shutdown = irq_chip_shutdown_parent, .irq_ack = sg2044_msi_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, @@ -133,14 +139,14 @@ static int sg204x_msi_parent_domain_alloc(struct irq_domain *domain, unsigned in fwspec.fwnode = domain->parent->fwnode; fwspec.param_count = 2; fwspec.param[0] = data->irq_first + hwirq; - fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + fwspec.param[1] = data->irq_type; ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); if (ret) return ret; d = irq_domain_get_irq_data(domain->parent, virq); - return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + return d->chip->irq_set_type(d, data->irq_type); } static int sg204x_msi_middle_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -185,8 +191,10 @@ static const struct irq_domain_ops sg204x_msi_middle_domain_ops = { .select = msi_lib_irq_domain_select, }; -#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) #define SG2042_MSI_FLAGS_SUPPORTED MSI_GENERIC_FLAGS_MASK @@ -200,10 +208,13 @@ static const struct msi_parent_ops sg2042_msi_parent_ops = { .init_dev_msi_info = msi_lib_init_dev_msi_info, }; -#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ - MSI_FLAG_USE_DEF_CHIP_OPS) +#define SG2044_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT | \ + MSI_FLAG_PCI_MSI_STARTUP_PARENT) -#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ +#define SG2044_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_MULTI_PCI_MSI | \ MSI_FLAG_PCI_MSIX) static const struct msi_parent_ops sg2044_msi_parent_ops = { @@ -289,6 +300,7 @@ static int sg2042_msi_probe(struct platform_device *pdev) } data->irq_first = (u32)args.args[0]; + data->irq_type = (unsigned int)args.args[1]; data->num_irqs = (u32)args.args[args.nargs - 1]; mutex_init(&data->msi_map_lock); diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index bf69a4802b71e7..559fda8fb3a86a 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -179,12 +179,14 @@ static int plic_set_affinity(struct irq_data *d, if (cpu >= nr_cpu_ids) return -EINVAL; - plic_irq_disable(d); + /* Invalidate the original routing entry */ + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 0); irq_data_update_effective_affinity(d, cpumask_of(cpu)); + /* Set the new routing entry if the irq is enabled */ if (!irqd_irq_disabled(d)) - plic_irq_enable(d); +
plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 1); return IRQ_SET_MASK_OK_DONE; } @@ -257,7 +259,7 @@ static int plic_irq_suspend(void) readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) @@ -289,7 +291,7 @@ static void plic_irq_resume(void) priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); } - for_each_cpu(cpu, cpu_present_mask) { + for_each_present_cpu(cpu) { struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); if (!handler->present) diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 6e3dce7e35a490..06e6291be11b2f 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -674,7 +674,7 @@ config LEDS_BD2606MVV help This option enables support for BD2606MVV LED driver chips accessed via the I2C bus. It supports setting brightness, with - the limitiation that there are groups of two channels sharing + the limitation that there are groups of two channels sharing a brightness setting, but not the on/off setting. To compile this driver as a module, choose M here: the module will diff --git a/drivers/leds/flash/leds-qcom-flash.c b/drivers/leds/flash/leds-qcom-flash.c index 89cf5120f5d55b..b03a6833e3e3a0 100644 --- a/drivers/leds/flash/leds-qcom-flash.c +++ b/drivers/leds/flash/leds-qcom-flash.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2022, 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2022, 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved. */ #include @@ -114,36 +114,55 @@ enum { REG_THERM_THRSH1, REG_THERM_THRSH2, REG_THERM_THRSH3, + REG_TORCH_CLAMP, REG_MAX_COUNT, }; +static const struct reg_field mvflash_3ch_pmi8998_regs[REG_MAX_COUNT] = { + [REG_STATUS1] = REG_FIELD(0x08, 0, 5), + [REG_STATUS2] = REG_FIELD(0x09, 0, 7), + [REG_STATUS3] = REG_FIELD(0x0a, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x40, 0, 7, 3, 1), + [REG_ITARGET] = REG_FIELD_ID(0x43, 0, 6, 3, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x47, 0, 5), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x49, 0, 2, 3, 1), + [REG_CHAN_EN] = REG_FIELD(0x4c, 0, 2), + [REG_THERM_THRSH1] = REG_FIELD(0x56, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x57, 0, 2), + [REG_THERM_THRSH3] = REG_FIELD(0x58, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xea, 0, 6), +}; + static const struct reg_field mvflash_3ch_regs[REG_MAX_COUNT] = { - REG_FIELD(0x08, 0, 7), /* status1 */ - REG_FIELD(0x09, 0, 7), /* status2 */ - REG_FIELD(0x0a, 0, 7), /* status3 */ - REG_FIELD_ID(0x40, 0, 7, 3, 1), /* chan_timer */ - REG_FIELD_ID(0x43, 0, 6, 3, 1), /* itarget */ - REG_FIELD(0x46, 7, 7), /* module_en */ - REG_FIELD(0x47, 0, 5), /* iresolution */ - REG_FIELD_ID(0x49, 0, 2, 3, 1), /* chan_strobe */ - REG_FIELD(0x4c, 0, 2), /* chan_en */ - REG_FIELD(0x56, 0, 2), /* therm_thrsh1 */ - REG_FIELD(0x57, 0, 2), /* therm_thrsh2 */ - REG_FIELD(0x58, 0, 2), /* therm_thrsh3 */ + [REG_STATUS1] = REG_FIELD(0x08, 0, 7), + [REG_STATUS2] = REG_FIELD(0x09, 0, 7), + [REG_STATUS3] = REG_FIELD(0x0a, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x40, 0, 7, 3, 1), + [REG_ITARGET] = REG_FIELD_ID(0x43, 0, 6, 3, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x47, 0, 5), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x49, 0, 2, 3, 1), + [REG_CHAN_EN] = REG_FIELD(0x4c, 0, 2), + [REG_THERM_THRSH1] = REG_FIELD(0x56, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x57, 0, 2), + 
[REG_THERM_THRSH3] = REG_FIELD(0x58, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xec, 0, 6), }; static const struct reg_field mvflash_4ch_regs[REG_MAX_COUNT] = { - REG_FIELD(0x06, 0, 7), /* status1 */ - REG_FIELD(0x07, 0, 6), /* status2 */ - REG_FIELD(0x09, 0, 7), /* status3 */ - REG_FIELD_ID(0x3e, 0, 7, 4, 1), /* chan_timer */ - REG_FIELD_ID(0x42, 0, 6, 4, 1), /* itarget */ - REG_FIELD(0x46, 7, 7), /* module_en */ - REG_FIELD(0x49, 0, 3), /* iresolution */ - REG_FIELD_ID(0x4a, 0, 6, 4, 1), /* chan_strobe */ - REG_FIELD(0x4e, 0, 3), /* chan_en */ - REG_FIELD(0x7a, 0, 2), /* therm_thrsh1 */ - REG_FIELD(0x78, 0, 2), /* therm_thrsh2 */ + [REG_STATUS1] = REG_FIELD(0x06, 0, 7), + [REG_STATUS2] = REG_FIELD(0x07, 0, 6), + [REG_STATUS3] = REG_FIELD(0x09, 0, 7), + [REG_CHAN_TIMER] = REG_FIELD_ID(0x3e, 0, 7, 4, 1), + [REG_ITARGET] = REG_FIELD_ID(0x42, 0, 6, 4, 1), + [REG_MODULE_EN] = REG_FIELD(0x46, 7, 7), + [REG_IRESOLUTION] = REG_FIELD(0x49, 0, 3), + [REG_CHAN_STROBE] = REG_FIELD_ID(0x4a, 0, 6, 4, 1), + [REG_CHAN_EN] = REG_FIELD(0x4e, 0, 3), + [REG_THERM_THRSH1] = REG_FIELD(0x7a, 0, 2), + [REG_THERM_THRSH2] = REG_FIELD(0x78, 0, 2), + [REG_TORCH_CLAMP] = REG_FIELD(0xed, 0, 6), }; struct qcom_flash_data { @@ -156,6 +175,7 @@ struct qcom_flash_data { u8 max_channels; u8 chan_en_bits; u8 revision; + u8 torch_clamp; }; struct qcom_flash_led { @@ -702,6 +722,7 @@ static int qcom_flash_register_led_device(struct device *dev, u32 current_ua, timeout_us; u32 channels[4]; int i, rc, count; + u8 torch_clamp; count = fwnode_property_count_u32(node, "led-sources"); if (count <= 0) { @@ -751,6 +772,12 @@ static int qcom_flash_register_led_device(struct device *dev, current_ua = min_t(u32, current_ua, TORCH_CURRENT_MAX_UA * led->chan_count); led->max_torch_current_ma = current_ua / UA_PER_MA; + torch_clamp = (current_ua / led->chan_count) / TORCH_IRES_UA; + if (torch_clamp != 0) + torch_clamp--; + + flash_data->torch_clamp = max_t(u8, flash_data->torch_clamp, torch_clamp); + if (fwnode_property_present(node, "flash-max-microamp")) { flash->led_cdev.flags |= LED_DEV_CAP_FLASH; @@ -851,13 +878,20 @@ static int qcom_flash_led_probe(struct platform_device *pdev) return rc; } - if (val == FLASH_SUBTYPE_3CH_PM8150_VAL || val == FLASH_SUBTYPE_3CH_PMI8998_VAL) { + if (val == FLASH_SUBTYPE_3CH_PM8150_VAL) { flash_data->hw_type = QCOM_MVFLASH_3CH; flash_data->max_channels = 3; regs = devm_kmemdup(dev, mvflash_3ch_regs, sizeof(mvflash_3ch_regs), GFP_KERNEL); if (!regs) return -ENOMEM; + } else if (val == FLASH_SUBTYPE_3CH_PMI8998_VAL) { + flash_data->hw_type = QCOM_MVFLASH_3CH; + flash_data->max_channels = 3; + regs = devm_kmemdup(dev, mvflash_3ch_pmi8998_regs, + sizeof(mvflash_3ch_pmi8998_regs), GFP_KERNEL); + if (!regs) + return -ENOMEM; } else if (val == FLASH_SUBTYPE_4CH_VAL) { flash_data->hw_type = QCOM_MVFLASH_4CH; flash_data->max_channels = 4; @@ -917,8 +951,7 @@ static int qcom_flash_led_probe(struct platform_device *pdev) flash_data->leds_count++; } - return 0; - + return regmap_field_write(flash_data->r_fields[REG_TORCH_CLAMP], flash_data->torch_clamp); release: while (flash_data->v4l2_flash[flash_data->leds_count] && flash_data->leds_count) v4l2_flash_release(flash_data->v4l2_flash[flash_data->leds_count--]); diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 15633fbf3c166a..f3faf37f9a08ac 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -252,15 +252,23 @@ static const struct class leds_class = { * of_led_get() - request a LED device via the LED framework * @np: device node 
to get the LED device from * @index: the index of the LED + * @name: the name of the LED used to map it to its function, if present * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ -static struct led_classdev *of_led_get(struct device_node *np, int index) +static struct led_classdev *of_led_get(struct device_node *np, int index, + const char *name) { struct device *led_dev; struct device_node *led_node; + /* + * For named LEDs, first look up the name in the "led-names" property. + * If it cannot be found, then of_parse_phandle() will propagate the error. + */ + if (name) + index = of_property_match_string(np, "led-names", name); led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); @@ -324,7 +332,7 @@ struct led_classdev *__must_check devm_of_led_get(struct device *dev, if (!dev) return ERR_PTR(-EINVAL); - led = of_led_get(dev->of_node, index); + led = of_led_get(dev->of_node, index, NULL); if (IS_ERR(led)) return led; @@ -342,9 +350,14 @@ EXPORT_SYMBOL_GPL(devm_of_led_get); struct led_classdev *led_get(struct device *dev, char *con_id) { struct led_lookup_data *lookup; + struct led_classdev *led_cdev; const char *provider = NULL; struct device *led_dev; + led_cdev = of_led_get(dev->of_node, -1, con_id); + if (!IS_ERR(led_cdev) || PTR_ERR(led_cdev) != -ENOENT) + return led_cdev; + mutex_lock(&leds_lookup_lock); list_for_each_entry(lookup, &leds_lookup_list, list) { if (!strcmp(lookup->dev_id, dev_name(dev)) && diff --git a/drivers/leds/leds-is31fl319x.c b/drivers/leds/leds-is31fl319x.c index 27bfab3da47984..e411cee06dabde 100644 --- a/drivers/leds/leds-is31fl319x.c +++ b/drivers/leds/leds-is31fl319x.c @@ -483,11 +483,6 @@ static inline int is31fl3196_db_to_gain(u32 dezibel) return dezibel / IS31FL3196_AUDIO_GAIN_DB_STEP; } -static void is31f1319x_mutex_destroy(void *lock) -{ - mutex_destroy(lock); -} - static int is31fl319x_probe(struct i2c_client *client) { struct is31fl319x_chip *is31; @@ -503,8 +498,7 @@ static int is31fl319x_probe(struct i2c_client *client) if (!is31) return -ENOMEM; - mutex_init(&is31->lock); - err = devm_add_action_or_reset(dev, is31f1319x_mutex_destroy, &is31->lock); + err = devm_mutex_init(dev, &is31->lock); if (err) return err; diff --git a/drivers/leds/leds-is31fl32xx.c b/drivers/leds/leds-is31fl32xx.c index 8793330dd4142f..dc9349f9d3501b 100644 --- a/drivers/leds/leds-is31fl32xx.c +++ b/drivers/leds/leds-is31fl32xx.c @@ -32,6 +32,8 @@ #define IS31FL3216_CONFIG_SSD_ENABLE BIT(7) #define IS31FL3216_CONFIG_SSD_DISABLE 0 +#define IS31FL32XX_PWM_FREQUENCY_22KHZ 0x01 + struct is31fl32xx_priv; struct is31fl32xx_led_data { struct led_classdev cdev; @@ -53,6 +55,7 @@ struct is31fl32xx_priv { * @pwm_update_reg : address of PWM Update register * @global_control_reg : address of Global Control register (optional) * @reset_reg : address of Reset register (optional) + * @output_frequency_setting_reg: address of output frequency register (optional) * @pwm_register_base : address of first PWM register * @pwm_registers_reversed: : true if PWM registers count down instead of up * @led_control_register_base : address of first LED control register (optional) @@ -76,6 +79,7 @@ struct is31fl32xx_chipdef { u8 pwm_update_reg; u8 global_control_reg; u8 reset_reg; + u8 output_frequency_setting_reg; u8 pwm_register_base; bool pwm_registers_reversed; u8 led_control_register_base; @@ -90,6 +94,19 @@ static const struct is31fl32xx_chipdef is31fl3236_cdef = { 
.pwm_update_reg = 0x25, .global_control_reg = 0x4a, .reset_reg = 0x4f, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, + .pwm_register_base = 0x01, + .led_control_register_base = 0x26, + .enable_bits_per_led_control_register = 1, +}; + +static const struct is31fl32xx_chipdef is31fl3236a_cdef = { + .channels = 36, + .shutdown_reg = 0x00, + .pwm_update_reg = 0x25, + .global_control_reg = 0x4a, + .reset_reg = 0x4f, + .output_frequency_setting_reg = 0x4b, .pwm_register_base = 0x01, .led_control_register_base = 0x26, .enable_bits_per_led_control_register = 1, @@ -101,6 +118,7 @@ static const struct is31fl32xx_chipdef is31fl3235_cdef = { .pwm_update_reg = 0x25, .global_control_reg = 0x4a, .reset_reg = 0x4f, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x05, .led_control_register_base = 0x2a, .enable_bits_per_led_control_register = 1, @@ -112,6 +130,7 @@ static const struct is31fl32xx_chipdef is31fl3218_cdef = { .pwm_update_reg = 0x16, .global_control_reg = IS31FL32XX_REG_NONE, .reset_reg = 0x17, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x01, .led_control_register_base = 0x13, .enable_bits_per_led_control_register = 6, @@ -126,6 +145,7 @@ static const struct is31fl32xx_chipdef is31fl3216_cdef = { .pwm_update_reg = 0xB0, .global_control_reg = IS31FL32XX_REG_NONE, .reset_reg = IS31FL32XX_REG_NONE, + .output_frequency_setting_reg = IS31FL32XX_REG_NONE, .pwm_register_base = 0x10, .pwm_registers_reversed = true, .led_control_register_base = 0x01, @@ -363,8 +383,21 @@ static struct is31fl32xx_led_data *is31fl32xx_find_led_data( static int is31fl32xx_parse_dt(struct device *dev, struct is31fl32xx_priv *priv) { + const struct is31fl32xx_chipdef *cdef = priv->cdef; int ret = 0; + if ((cdef->output_frequency_setting_reg != IS31FL32XX_REG_NONE) && + of_property_read_bool(dev_of_node(dev), "issi,22khz-pwm")) { + + ret = is31fl32xx_write(priv, cdef->output_frequency_setting_reg, + IS31FL32XX_PWM_FREQUENCY_22KHZ); + + if (ret) { + dev_err(dev, "Failed to write output PWM frequency register\n"); + return ret; + } + } + for_each_available_child_of_node_scoped(dev_of_node(dev), child) { struct led_init_data init_data = {}; struct is31fl32xx_led_data *led_data = @@ -404,12 +437,13 @@ static int is31fl32xx_parse_dt(struct device *dev, } static const struct of_device_id of_is31fl32xx_match[] = { - { .compatible = "issi,is31fl3236", .data = &is31fl3236_cdef, }, - { .compatible = "issi,is31fl3235", .data = &is31fl3235_cdef, }, - { .compatible = "issi,is31fl3218", .data = &is31fl3218_cdef, }, - { .compatible = "si-en,sn3218", .data = &is31fl3218_cdef, }, - { .compatible = "issi,is31fl3216", .data = &is31fl3216_cdef, }, - { .compatible = "si-en,sn3216", .data = &is31fl3216_cdef, }, + { .compatible = "issi,is31fl3236", .data = &is31fl3236_cdef, }, + { .compatible = "issi,is31fl3236a", .data = &is31fl3236a_cdef, }, + { .compatible = "issi,is31fl3235", .data = &is31fl3235_cdef, }, + { .compatible = "issi,is31fl3218", .data = &is31fl3218_cdef, }, + { .compatible = "si-en,sn3218", .data = &is31fl3218_cdef, }, + { .compatible = "issi,is31fl3216", .data = &is31fl3216_cdef, }, + { .compatible = "si-en,sn3216", .data = &is31fl3216_cdef, }, {}, }; @@ -466,6 +500,7 @@ static void is31fl32xx_remove(struct i2c_client *client) */ static const struct i2c_device_id is31fl32xx_id[] = { { "is31fl3236" }, + { "is31fl3236a" }, { "is31fl3235" }, { "is31fl3218" }, { "sn3218" }, diff --git a/drivers/leds/leds-lp55xx-common.c b/drivers/leds/leds-lp55xx-common.c index 
e71456a56ab8da..fd447eb7eb15e2 100644 --- a/drivers/leds/leds-lp55xx-common.c +++ b/drivers/leds/leds-lp55xx-common.c @@ -212,7 +212,7 @@ int lp55xx_update_program_memory(struct lp55xx_chip *chip, * For LED chip that support page, PAGE is already set in load_engine. */ if (!cfg->pages_per_engine) - start_addr += LP55xx_BYTES_PER_PAGE * idx; + start_addr += LP55xx_BYTES_PER_PAGE * (idx - 1); for (page = 0; page < program_length / LP55xx_BYTES_PER_PAGE; page++) { /* Write to the next page each 32 bytes (if supported) */ diff --git a/drivers/leds/leds-max77705.c b/drivers/leds/leds-max77705.c index 933cb4f19be9bc..b7403b3fcf5e72 100644 --- a/drivers/leds/leds-max77705.c +++ b/drivers/leds/leds-max77705.c @@ -180,7 +180,7 @@ static int max77705_add_led(struct device *dev, struct regmap *regmap, struct fw ret = fwnode_property_read_u32(np, "reg", ®); if (ret || reg >= MAX77705_LED_NUM_LEDS) - ret = -EINVAL; + return -EINVAL; info = devm_kcalloc(dev, num_channels, sizeof(*info), GFP_KERNEL); if (!info) diff --git a/drivers/leds/leds-qnap-mcu.c b/drivers/leds/leds-qnap-mcu.c index 4e470945626170..6df110e33ac9c4 100644 --- a/drivers/leds/leds-qnap-mcu.c +++ b/drivers/leds/leds-qnap-mcu.c @@ -104,9 +104,9 @@ static int qnap_mcu_register_err_led(struct device *dev, struct qnap_mcu *mcu, i } enum qnap_mcu_usb_led_mode { - QNAP_MCU_USB_LED_ON = 1, - QNAP_MCU_USB_LED_OFF = 3, - QNAP_MCU_USB_LED_BLINK = 2, + QNAP_MCU_USB_LED_ON = 0, + QNAP_MCU_USB_LED_OFF = 2, + QNAP_MCU_USB_LED_BLINK = 1, }; struct qnap_mcu_usb_led { @@ -137,7 +137,7 @@ static int qnap_mcu_usb_led_set(struct led_classdev *led_cdev, * Byte 3 is shared between the usb led target on/off/blink * and also the buzzer control (in the input driver) */ - cmd[2] = 'D' + usb_led->mode; + cmd[2] = 'E' + usb_led->mode; return qnap_mcu_exec_with_ack(usb_led->mcu, cmd, sizeof(cmd)); } @@ -161,7 +161,7 @@ static int qnap_mcu_usb_led_blink_set(struct led_classdev *led_cdev, * Byte 3 is shared between the USB LED target on/off/blink * and also the buzzer control (in the input driver) */ - cmd[2] = 'D' + usb_led->mode; + cmd[2] = 'E' + usb_led->mode; return qnap_mcu_exec_with_ack(usb_led->mcu, cmd, sizeof(cmd)); } @@ -190,6 +190,166 @@ static int qnap_mcu_register_usb_led(struct device *dev, struct qnap_mcu *mcu) return qnap_mcu_usb_led_set(&usb_led->cdev, 0); } +enum qnap_mcu_status_led_mode { + QNAP_MCU_STATUS_LED_OFF = 0, + QNAP_MCU_STATUS_LED_ON = 1, + QNAP_MCU_STATUS_LED_BLINK_FAST = 2, /* 500ms / 500ms */ + QNAP_MCU_STATUS_LED_BLINK_SLOW = 3, /* 1s / 1s */ +}; + +struct qnap_mcu_status_led { + struct led_classdev cdev; + struct qnap_mcu_status_led *red; + u8 mode; +}; + +struct qnap_mcu_status { + struct qnap_mcu *mcu; + struct qnap_mcu_status_led red; + struct qnap_mcu_status_led green; +}; + +static inline struct qnap_mcu_status_led *cdev_to_qnap_mcu_status_led(struct led_classdev *led_cdev) +{ + return container_of(led_cdev, struct qnap_mcu_status_led, cdev); +} + +static inline struct qnap_mcu_status *statusled_to_qnap_mcu_status(struct qnap_mcu_status_led *led) +{ + return container_of(led->red, struct qnap_mcu_status, red); +} + +static u8 qnap_mcu_status_led_encode(struct qnap_mcu_status *status) +{ + if (status->red.mode == QNAP_MCU_STATUS_LED_OFF) { + switch (status->green.mode) { + case QNAP_MCU_STATUS_LED_OFF: + return '9'; + case QNAP_MCU_STATUS_LED_ON: + return '6'; + case QNAP_MCU_STATUS_LED_BLINK_FAST: + return '5'; + case QNAP_MCU_STATUS_LED_BLINK_SLOW: + return 'A'; + } + } else if (status->green.mode == QNAP_MCU_STATUS_LED_OFF) { 
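+		/*
+		 * Green is off, so pick the red-only encoding: '7' solid,
+		 * '4' fast blink, 'B' slow blink ('9' covers both LEDs off,
+		 * exactly as in the green-only branch above).
+		 */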
+ switch (status->red.mode) { + case QNAP_MCU_STATUS_LED_OFF: + return '9'; + case QNAP_MCU_STATUS_LED_ON: + return '7'; + case QNAP_MCU_STATUS_LED_BLINK_FAST: + return '4'; + case QNAP_MCU_STATUS_LED_BLINK_SLOW: + return 'B'; + } + } else if (status->green.mode == QNAP_MCU_STATUS_LED_ON && + status->red.mode == QNAP_MCU_STATUS_LED_ON) { + return 'D'; + } else if (status->green.mode == QNAP_MCU_STATUS_LED_BLINK_SLOW && + status->red.mode == QNAP_MCU_STATUS_LED_BLINK_SLOW) { + return 'C'; + } + + /* + * Here both LEDs are on in some fashion, either both blinking fast, + * or in different speeds, so default to fast blinking for both. + */ + return '8'; +} + +static int qnap_mcu_status_led_update(struct qnap_mcu *mcu, + struct qnap_mcu_status *status) +{ + u8 cmd[] = { '@', 'C', 0 }; + + cmd[2] = qnap_mcu_status_led_encode(status); + + return qnap_mcu_exec_with_ack(mcu, cmd, sizeof(cmd)); +} + +static int qnap_mcu_status_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct qnap_mcu_status_led *status_led = cdev_to_qnap_mcu_status_led(led_cdev); + struct qnap_mcu_status *base = statusled_to_qnap_mcu_status(status_led); + + /* Don't disturb a possible set blink-mode if LED stays on */ + if (brightness != 0 && status_led->mode >= QNAP_MCU_STATUS_LED_BLINK_FAST) + return 0; + + status_led->mode = brightness ? QNAP_MCU_STATUS_LED_ON : + QNAP_MCU_STATUS_LED_OFF; + + return qnap_mcu_status_led_update(base->mcu, base); +} + +static int qnap_mcu_status_led_blink_set(struct led_classdev *led_cdev, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct qnap_mcu_status_led *status_led = cdev_to_qnap_mcu_status_led(led_cdev); + struct qnap_mcu_status *base = statusled_to_qnap_mcu_status(status_led); + + if (status_led->mode == QNAP_MCU_STATUS_LED_OFF) + return 0; + + if (*delay_on <= 500) { + *delay_on = 500; + *delay_off = 500; + status_led->mode = QNAP_MCU_STATUS_LED_BLINK_FAST; + } else { + *delay_on = 1000; + *delay_off = 1000; + status_led->mode = QNAP_MCU_STATUS_LED_BLINK_SLOW; + } + + return qnap_mcu_status_led_update(base->mcu, base); +} + +static int qnap_mcu_register_status_leds(struct device *dev, struct qnap_mcu *mcu) +{ + struct qnap_mcu_status *status; + int ret; + + status = devm_kzalloc(dev, sizeof(*status), GFP_KERNEL); + if (!status) + return -ENOMEM; + + status->mcu = mcu; + + /* + * point to the red led, so that statusled_to_qnap_mcu_status + * can resolve the main status struct containing both leds + */ + status->red.red = &status->red; + status->green.red = &status->red; + + status->red.mode = QNAP_MCU_STATUS_LED_OFF; + status->red.cdev.name = "red:status"; + status->red.cdev.brightness_set_blocking = qnap_mcu_status_led_set; + status->red.cdev.blink_set = qnap_mcu_status_led_blink_set; + status->red.cdev.brightness = 0; + status->red.cdev.max_brightness = 1; + + status->green.mode = QNAP_MCU_STATUS_LED_OFF; + status->green.cdev.name = "green:status"; + status->green.cdev.brightness_set_blocking = qnap_mcu_status_led_set; + status->green.cdev.blink_set = qnap_mcu_status_led_blink_set; + status->green.cdev.brightness = 0; + status->green.cdev.max_brightness = 1; + + ret = devm_led_classdev_register(dev, &status->red.cdev); + if (ret) + return ret; + + ret = devm_led_classdev_register(dev, &status->green.cdev); + if (ret) + return ret; + + return qnap_mcu_status_led_update(status->mcu, status); +} + static int qnap_mcu_leds_probe(struct platform_device *pdev) { struct qnap_mcu *mcu = dev_get_drvdata(pdev->dev.parent); @@ -210,6 +370,11 @@ 
static int qnap_mcu_leds_probe(struct platform_device *pdev) "failed to register USB LED\n"); } + ret = qnap_mcu_register_status_leds(&pdev->dev, mcu); + if (ret) + return dev_err_probe(&pdev->dev, ret, + "failed to register status LEDs\n"); + return 0; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index efeee0a873c064..ab96b692e5a3eb 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -133,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 79ea85d18e24e5..f4b904e2432853 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 58902091bf79b9..1461dc740dae6c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; limits->chunk_sectors = sc->chunk_size; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7d3..6e9a0045f0ffca 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -979,7 +979,7 @@ static int join(struct mddev *mddev, int nodes) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b081153757e..3e1f165c2d20f6 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542ae2..419139ad7663cc 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,6 
+382,7 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bf44878ec640e5..d30b82beeb92fb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3211,6 +3211,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c794..9832eefb2f157b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4008,6 +4008,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476f3..e385ef1355e8b3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7732,6 +7732,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c index 84533fdd61aa6c..047472dc9244af 100644 --- a/drivers/media/rc/pwm-ir-tx.c +++ b/drivers/media/rc/pwm-ir-tx.c @@ -117,7 +117,6 @@ static int pwm_ir_tx_atomic(struct rc_dev *dev, unsigned int *txbuf, static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) { struct pwm_ir *pwm_ir = container_of(timer, struct pwm_ir, timer); - ktime_t now; /* * If we happen to hit an odd latency spike, loop through the @@ -139,9 +138,7 @@ static enum hrtimer_restart pwm_ir_timer(struct hrtimer *timer) hrtimer_add_expires_ns(timer, ns); pwm_ir->txbuf_index++; - - now = timer->base->get_time(); - } while (hrtimer_get_expires_tv64(timer) < now); + } while (hrtimer_expires_remaining(timer) > 0); return HRTIMER_RESTART; } diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index e4275f8ee5db8a..acafc910bbaccf 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -370,7 +370,9 @@ int memstick_set_rw_addr(struct memstick_dev *card) { card->next_request = h_memstick_set_rw_addr; memstick_new_req(card->host); - wait_for_completion(&card->mrq_complete); + if (!wait_for_completion_timeout(&card->mrq_complete, + msecs_to_jiffies(500))) + card->current_mrq.error = -ETIMEDOUT; return card->current_mrq.error; } @@ -404,7 +406,9 @@ static struct memstick_dev *memstick_alloc_card(struct memstick_host *host) card->next_request = h_memstick_read_dev_id; memstick_new_req(host); - wait_for_completion(&card->mrq_complete); + if (!wait_for_completion_timeout(&card->mrq_complete, + msecs_to_jiffies(500))) + card->current_mrq.error = -ETIMEDOUT; if (card->current_mrq.error) goto err_out; diff --git 
a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c index 5b5e9354fb2e4f..beadc389f15f39 100644 --- a/drivers/memstick/host/rtsx_usb_ms.c +++ b/drivers/memstick/host/rtsx_usb_ms.c @@ -216,7 +216,10 @@ static int ms_power_off(struct rtsx_usb_ms *host) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_CLK_EN, MS_CLK_EN, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, MS_OUTPUT_EN, 0); - + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK, POWER_OFF); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK | LDO3318_PWR_MASK, POWER_OFF | LDO_SUSPEND); err = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (err < 0) return err; diff --git a/drivers/mfd/88pm886.c b/drivers/mfd/88pm886.c index 39dd9a818b0f0e..e411d8dee55420 100644 --- a/drivers/mfd/88pm886.c +++ b/drivers/mfd/88pm886.c @@ -35,6 +35,7 @@ static const struct resource pm886_onkey_resources[] = { }; static const struct mfd_cell pm886_devs[] = { + MFD_CELL_NAME("88pm886-gpadc"), MFD_CELL_RES("88pm886-onkey", pm886_onkey_resources), MFD_CELL_NAME("88pm886-regulator"), MFD_CELL_NAME("88pm886-rtc"), diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1e7..67b54e0fd4527a 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -129,6 +129,7 @@ config MFD_AAT2870_CORE select MFD_CORE depends on I2C=y depends on GPIOLIB || COMPILE_TEST + depends on GPIOLIB_LEGACY help If you say yes here you get support for the AAT2870. This driver provides common support for accessing the device, @@ -1134,6 +1135,21 @@ config MFD_MENF21BMC This driver can also be built as a module. If so the module will be called menf21bmc. +config MFD_NCT6694 + tristate "Nuvoton NCT6694 support" + select MFD_CORE + depends on USB + help + This enables support for the Nuvoton USB device NCT6694, which shares + peripherals. + The Nuvoton NCT6694 is a peripheral expander with 16 GPIO chips, + 6 I2C controllers, 2 CANfd controllers, 2 Watchdog timers, ADC, + PWM, and RTC. + This driver provides core APIs to access the NCT6694 hardware + monitoring and control features. + Additional drivers must be enabled to utilize the specific + functionalities of the device. + config MFD_OCELOT tristate "Microsemi Ocelot External Control Support" depends on SPI_MASTER @@ -1238,6 +1254,19 @@ config MFD_QCOM_RPM Say M here if you want to include support for the Qualcomm RPM as a module. This will build a module called "qcom_rpm". +config MFD_SPACEMIT_P1 + tristate "SpacemiT P1 PMIC" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on I2C + select I2C_K1 + select MFD_SIMPLE_MFD_I2C + help + This option supports the I2C-based SpacemiT P1 PMIC, which + contains regulators, a power switch, GPIOs, an RTC, and more. + This option is selected when any of the supported sub-devices + is configured. The basic functionality is implemented by the + simple MFD I2C driver. + config MFD_SPMI_PMIC tristate "Qualcomm SPMI PMICs" depends on ARCH_QCOM || COMPILE_TEST @@ -1411,6 +1440,7 @@ config MFD_SEC_I2C config MFD_SI476X_CORE tristate "Silicon Laboratories 4761/64/68 AM/FM radio." depends on I2C + depends on GPIOLIB_LEGACY select MFD_CORE select REGMAP_I2C help @@ -1539,8 +1569,8 @@ config MFD_DB8500_PRCMU through a register map. 
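Several of the Kconfig entries above (MFD_NCT6694, MFD_SPACEMIT_P1, and more below) describe the same split: a small core driver owns the transport and registers each peripheral block as an MFD cell, and separately enabled function drivers bind to those cells. As a hedged sketch of that pattern only — the cell names and the probe function are invented for illustration, not taken from any driver in this series:

	#include <linux/i2c.h>
	#include <linux/mfd/core.h>
	#include <linux/module.h>

	static const struct mfd_cell demo_cells[] = {
		{ .name = "demo-hwmon" },	/* hypothetical function driver */
		{ .name = "demo-gpio" },	/* hypothetical function driver */
	};

	static int demo_core_probe(struct i2c_client *client)
	{
		/*
		 * Each cell becomes a platform device with this device as its
		 * parent; the matching function driver can then be enabled (or
		 * left out) independently of the core.
		 */
		return devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
					    demo_cells, ARRAY_SIZE(demo_cells),
					    NULL, 0, NULL);
	}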
config MFD_STMPE
-	bool "STMicroelectronics STMPE"
-	depends on I2C=y || SPI_MASTER=y
+	tristate "STMicroelectronics STMPE"
+	depends on I2C || SPI_MASTER
 	depends on OF
 	select MFD_CORE
 	help
@@ -1568,14 +1598,14 @@ menu "STMicroelectronics STMPE Interface Drivers"
 	depends on MFD_STMPE

 config STMPE_I2C
-	bool "STMicroelectronics STMPE I2C Interface"
-	depends on I2C=y
+	tristate "STMicroelectronics STMPE I2C Interface"
+	depends on I2C
 	default y
 	help
 	  This is used to enable I2C interface of STMPE

 config STMPE_SPI
-	bool "STMicroelectronics STMPE SPI Interface"
+	tristate "STMicroelectronics STMPE SPI Interface"
 	depends on SPI_MASTER
 	help
 	  This is used to enable SPI interface of STMPE
@@ -1641,6 +1671,17 @@ config MFD_TI_LMU
 	  LM36274. It consists of backlight, LED and regulator driver.
 	  It provides consistent device controls for lighting functions.

+config MFD_BQ257XX
+	tristate "TI BQ257XX Buck/Boost Charge Controller"
+	depends on I2C
+	select MFD_CORE
+	select REGMAP_I2C
+	help
+	  Support for the Texas Instruments BQ25703 buck/boost converter with
+	  charge controller. It consists of regulators that provide
+	  system voltage and OTG voltage, and a charger manager for
+	  batteries containing one or more cells.
+
 config MFD_OMAP_USB_HOST
 	bool "TI OMAP USBHS core and TLL driver"
 	depends on USB_EHCI_HCD_OMAP || USB_OHCI_HCD_OMAP3
@@ -1977,7 +2018,7 @@ config MFD_TIMBERDALE
 	  multifunction device which exposes numerous platform devices.

 	  The timberdale FPGA can be found on the Intel Atom development board
-	  for in-vehicle infontainment, called Russellville.
+	  for in-vehicle infotainment, called Russellville.

 config MFD_TC3589X
 	bool "Toshiba TC35892 and variants"
@@ -2428,6 +2469,30 @@ config MFD_INTEL_M10_BMC_PMCI
 	  additional drivers must be enabled in order to use the functionality
 	  of the device.

+config MFD_LOONGSON_SE
+	tristate "Loongson Security Engine chip controller driver"
+	depends on LOONGARCH && ACPI
+	select MFD_CORE
+	help
+	  The Loongson Security Engine chip supports RNG, SM2, SM3 and
+	  SM4 accelerator engines. Each engine has its own DMA buffer
+	  provided by the controller. The kernel cannot directly send
+	  commands to the engine and must first send them to the controller,
+	  which will forward them to the corresponding engine.
+
+config MFD_LS2K_BMC_CORE
+	bool "Loongson-2K Board Management Controller Support"
+	depends on PCI && ACPI_GENERIC_GSI
+	select MFD_CORE
+	help
+	  Say yes here to add support for the Loongson-2K BMC, which is a Board
+	  Management Controller connected to the PCIe bus. The device supports
+	  multiple sub-devices like display and IPMI. This driver provides common
+	  support for accessing the devices.
+
+	  The display is enabled by default in the driver, while the IPMI interface
+	  is enabled independently through the IPMI_LS2K option in the IPMI section.
+
 config MFD_QNAP_MCU
 	tristate "QNAP microcontroller unit core driver"
 	depends on SERIAL_DEV_BUS
@@ -2481,5 +2546,19 @@ config MFD_UPBOARD_FPGA
 	  To compile this driver as a module, choose M here: the module
 	  will be called upboard-fpga.

+config MFD_MAX7360
+	tristate "Maxim MAX7360 I2C IO Expander"
+	depends on I2C
+	select MFD_CORE
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	help
+	  Say yes here to add support for the Maxim MAX7360 device, embedding
+	  keypad, rotary encoder, PWM and GPIO features.
+
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the functionality
+	  of the device.
+ endmenu endif diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d16..865e9f12faff02 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_ARCH_BCM2835) += bcm2835-pm.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o +obj-$(CONFIG_MFD_BQ257XX) += bq257xx.o obj-$(CONFIG_MFD_CGBC) += cgbc-core.o obj-$(CONFIG_MFD_CROS_EC_DEV) += cros_ec_dev.o obj-$(CONFIG_MFD_CS42L43) += cs42l43.o @@ -121,6 +122,8 @@ obj-$(CONFIG_MFD_MC13XXX) += mc13xxx-core.o obj-$(CONFIG_MFD_MC13XXX_SPI) += mc13xxx-spi.o obj-$(CONFIG_MFD_MC13XXX_I2C) += mc13xxx-i2c.o +obj-$(CONFIG_MFD_NCT6694) += nct6694.o + obj-$(CONFIG_MFD_CORE) += mfd-core.o ocelot-soc-objs := ocelot-core.o ocelot-spi.o @@ -163,6 +166,7 @@ obj-$(CONFIG_MFD_DA9063) += da9063.o obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o +obj-$(CONFIG_MFD_MAX7360) += max7360.o obj-$(CONFIG_MFD_MAX77541) += max77541.o obj-$(CONFIG_MFD_MAX77620) += max77620.o obj-$(CONFIG_MFD_MAX77650) += max77650.o @@ -286,6 +290,8 @@ obj-$(CONFIG_MFD_INTEL_M10_BMC_CORE) += intel-m10-bmc-core.o obj-$(CONFIG_MFD_INTEL_M10_BMC_SPI) += intel-m10-bmc-spi.o obj-$(CONFIG_MFD_INTEL_M10_BMC_PMCI) += intel-m10-bmc-pmci.o +obj-$(CONFIG_MFD_LS2K_BMC_CORE) += ls2k-bmc-core.o + obj-$(CONFIG_MFD_ATC260X) += atc260x-core.o obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x-i2c.o @@ -295,3 +301,5 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +obj-$(CONFIG_MFD_LOONGSON_SE) += loongson-se.o diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c index 58f7cebe2ea4f2..46b3ce3d7bae89 100644 --- a/drivers/mfd/adp5585.c +++ b/drivers/mfd/adp5585.c @@ -432,7 +432,6 @@ static int adp5585_reset_ev_parse(struct adp5585_dev *adp5585) "Invalid value(%u) for adi,reset-pulse-width-us\n", prop_val); } - return ret; } return 0; diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 3f8622ee0e59ca..544016d420fe2a 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -136,7 +136,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) dev_err(arizona->dev, "Failed to read main IRQ status: %d\n", ret); } - +#ifdef CONFIG_GPIOLIB_LEGACY /* * Poll the IRQ pin status to see if we're really done * if the interrupt controller can't do it for us. 
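The CONFIG_GPIOLIB_LEGACY guard added above protects the legacy-GPIO fallback that emulates an edge-triggered interrupt; the polling loop itself follows in the next hunk. Reduced to its core, the idea looks roughly like the hedged sketch below — all names here are hypothetical, and the real arizona handler also walks the chip's per-block IRQ status registers:

	#include <linux/gpio.h>
	#include <linux/interrupt.h>

	struct demo_chip {
		int irq_gpio;	/* 0 if the interrupt controller handles levels */
	};

	static void demo_handle_pending_irqs(struct demo_chip *chip)
	{
		/* dispatch per-block interrupt status here */
	}

	static irqreturn_t demo_irq_thread(int irq, void *data)
	{
		struct demo_chip *chip = data;
		bool poll;

		do {
			poll = false;
			demo_handle_pending_irqs(chip);

			/*
			 * If the (active-low) line is still asserted, an edge was
			 * consumed while handling; run another pass rather than
			 * waiting for an edge that will never arrive.
			 */
			if (chip->irq_gpio &&
			    !gpio_get_value_cansleep(chip->irq_gpio))
				poll = true;
		} while (poll);

		return IRQ_HANDLED;
	}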
@@ -150,6 +150,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) !gpio_get_value_cansleep(arizona->pdata.irq_gpio)) { poll = true; } +#endif } while (poll); pm_runtime_put_autosuspend(arizona->dev); @@ -349,6 +350,7 @@ int arizona_irq_init(struct arizona *arizona) goto err_map_main_irq; } +#ifdef CONFIG_GPIOLIB_LEGACY /* Used to emulate edge trigger and to work around broken pinmux */ if (arizona->pdata.irq_gpio) { if (gpio_to_irq(arizona->pdata.irq_gpio) != arizona->irq) { @@ -368,6 +370,7 @@ int arizona_irq_init(struct arizona *arizona) arizona->pdata.irq_gpio = 0; } } +#endif ret = request_threaded_irq(arizona->irq, NULL, arizona_irq_thread, flags, "arizona", arizona); diff --git a/drivers/mfd/bq257xx.c b/drivers/mfd/bq257xx.c new file mode 100644 index 00000000000000..e9d49dac0a1670 --- /dev/null +++ b/drivers/mfd/bq257xx.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Core Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include + +static const struct regmap_range bq25703_readonly_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_MANUFACT_DEV_ID), +}; + +static const struct regmap_access_table bq25703_writeable_regs = { + .no_ranges = bq25703_readonly_reg_ranges, + .n_no_ranges = ARRAY_SIZE(bq25703_readonly_reg_ranges), +}; + +static const struct regmap_range bq25703_volatile_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGE_OPTION_0, BQ25703_IIN_HOST), + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_ADC_OPTION), +}; + +static const struct regmap_access_table bq25703_volatile_regs = { + .yes_ranges = bq25703_volatile_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(bq25703_volatile_reg_ranges), +}; + +static const struct regmap_config bq25703_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = BQ25703_ADC_OPTION, + .cache_type = REGCACHE_MAPLE, + .wr_table = &bq25703_writeable_regs, + .volatile_table = &bq25703_volatile_regs, + .val_format_endian = REGMAP_ENDIAN_LITTLE, +}; + +static const struct mfd_cell cells[] = { + MFD_CELL_NAME("bq257xx-regulator"), + MFD_CELL_NAME("bq257xx-charger"), +}; + +static int bq257xx_probe(struct i2c_client *client) +{ + struct bq257xx_device *ddata; + int ret; + + ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + ddata->client = client; + + ddata->regmap = devm_regmap_init_i2c(client, &bq25703_regmap_config); + if (IS_ERR(ddata->regmap)) { + return dev_err_probe(&client->dev, PTR_ERR(ddata->regmap), + "Failed to allocate register map\n"); + } + + i2c_set_clientdata(client, ddata); + + ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO, + cells, ARRAY_SIZE(cells), NULL, 0, NULL); + if (ret) + return dev_err_probe(&client->dev, ret, + "Failed to register child devices\n"); + + return ret; +} + +static const struct i2c_device_id bq257xx_i2c_ids[] = { + { "bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(i2c, bq257xx_i2c_ids); + +static const struct of_device_id bq257xx_of_match[] = { + { .compatible = "ti,bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(of, bq257xx_of_match); + +static struct i2c_driver bq257xx_driver = { + .driver = { + .name = "bq257xx", + .of_match_table = bq257xx_of_match, + }, + .probe = bq257xx_probe, + .id_table = bq257xx_i2c_ids, +}; +module_i2c_driver(bq257xx_driver); + +MODULE_DESCRIPTION("bq257xx buck/boost/charger driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 
07c8f1b8183eec..107cfb983fec41 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -1117,24 +1117,6 @@ EXPORT_SYMBOL_NS_GPL(cs42l43_dev_probe, "MFD_CS42L43"); static int cs42l43_suspend(struct device *dev) { struct cs42l43 *cs42l43 = dev_get_drvdata(dev); - static const struct reg_sequence mask_all[] = { - { CS42L43_DECIM_MASK, 0xFFFFFFFF, }, - { CS42L43_EQ_MIX_MASK, 0xFFFFFFFF, }, - { CS42L43_ASP_MASK, 0xFFFFFFFF, }, - { CS42L43_PLL_MASK, 0xFFFFFFFF, }, - { CS42L43_SOFT_MASK, 0xFFFFFFFF, }, - { CS42L43_SWIRE_MASK, 0xFFFFFFFF, }, - { CS42L43_MSM_MASK, 0xFFFFFFFF, }, - { CS42L43_ACC_DET_MASK, 0xFFFFFFFF, }, - { CS42L43_I2C_TGT_MASK, 0xFFFFFFFF, }, - { CS42L43_SPI_MSTR_MASK, 0xFFFFFFFF, }, - { CS42L43_SW_TO_SPI_BRIDGE_MASK, 0xFFFFFFFF, }, - { CS42L43_OTP_MASK, 0xFFFFFFFF, }, - { CS42L43_CLASS_D_AMP_MASK, 0xFFFFFFFF, }, - { CS42L43_GPIO_INT_MASK, 0xFFFFFFFF, }, - { CS42L43_ASRC_MASK, 0xFFFFFFFF, }, - { CS42L43_HPOUT_MASK, 0xFFFFFFFF, }, - }; int ret; ret = pm_runtime_resume_and_get(dev); @@ -1143,13 +1125,7 @@ static int cs42l43_suspend(struct device *dev) return ret; } - /* The IRQs will be re-enabled on resume by the cache sync */ - ret = regmap_multi_reg_write_bypassed(cs42l43->regmap, - mask_all, ARRAY_SIZE(mask_all)); - if (ret) { - dev_err(cs42l43->dev, "Failed to mask IRQs: %d\n", ret); - return ret; - } + disable_irq(cs42l43->irq); ret = pm_runtime_force_suspend(dev); if (ret) { @@ -1164,8 +1140,6 @@ static int cs42l43_suspend(struct device *dev) if (ret) return ret; - disable_irq(cs42l43->irq); - return 0; } @@ -1196,14 +1170,14 @@ static int cs42l43_resume(struct device *dev) if (ret) return ret; - enable_irq(cs42l43->irq); - ret = pm_runtime_force_resume(dev); if (ret) { dev_err(cs42l43->dev, "Failed to force resume: %d\n", ret); return ret; } + enable_irq(cs42l43->irq); + return 0; } diff --git a/drivers/mfd/da9063-i2c.c b/drivers/mfd/da9063-i2c.c index c6235cd0dbdc40..1ec9ab56442dfc 100644 --- a/drivers/mfd/da9063-i2c.c +++ b/drivers/mfd/da9063-i2c.c @@ -37,9 +37,13 @@ enum da9063_page_sel_buf_fmt { DA9063_PAGE_SEL_BUF_SIZE, }; +enum da9063_page_sel_msgs { + DA9063_PAGE_SEL_MSG = 0, + DA9063_PAGE_SEL_CNT, +}; + enum da9063_paged_read_msgs { - DA9063_PAGED_READ_MSG_PAGE_SEL = 0, - DA9063_PAGED_READ_MSG_REG_SEL, + DA9063_PAGED_READ_MSG_REG_SEL = 0, DA9063_PAGED_READ_MSG_DATA, DA9063_PAGED_READ_MSG_CNT, }; @@ -65,10 +69,21 @@ static int da9063_i2c_blockreg_read(struct i2c_client *client, u16 addr, (page_num << DA9063_I2C_PAGE_SEL_SHIFT) & DA9063_REG_PAGE_MASK; /* Write reg address, page selection */ - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].addr = client->addr; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].flags = 0; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].len = DA9063_PAGE_SEL_BUF_SIZE; - xfer[DA9063_PAGED_READ_MSG_PAGE_SEL].buf = page_sel_buf; + xfer[DA9063_PAGE_SEL_MSG].addr = client->addr; + xfer[DA9063_PAGE_SEL_MSG].flags = 0; + xfer[DA9063_PAGE_SEL_MSG].len = DA9063_PAGE_SEL_BUF_SIZE; + xfer[DA9063_PAGE_SEL_MSG].buf = page_sel_buf; + + ret = i2c_transfer(client->adapter, xfer, DA9063_PAGE_SEL_CNT); + if (ret < 0) { + dev_err(&client->dev, "Page switch failed: %d\n", ret); + return ret; + } + + if (ret != DA9063_PAGE_SEL_CNT) { + dev_err(&client->dev, "Page switch failed to complete\n"); + return -EIO; + } /* Select register address */ xfer[DA9063_PAGED_READ_MSG_REG_SEL].addr = client->addr; diff --git a/drivers/mfd/exynos-lpass.c b/drivers/mfd/exynos-lpass.c index 44797001a4322b..9bb2687c28355d 100644 --- a/drivers/mfd/exynos-lpass.c +++ b/drivers/mfd/exynos-lpass.c @@ 
-101,7 +101,6 @@ static const struct regmap_config exynos_lpass_reg_conf = { .reg_stride = 4, .val_bits = 32, .max_register = 0xfc, - .fast_io = true, }; static void exynos_lpass_disable_lpass(void *data) diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c index 0aab6428e04232..467b1a23faeb24 100644 --- a/drivers/mfd/fsl-imx25-tsadc.c +++ b/drivers/mfd/fsl-imx25-tsadc.c @@ -17,7 +17,6 @@ #include static const struct regmap_config mx25_tsadc_regmap_config = { - .fast_io = true, .max_register = 8, .reg_bits = 32, .val_bits = 32, diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index 1a5b8b13f8d0b2..8d92c895d3aeff 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -367,6 +367,19 @@ static const struct pci_device_id intel_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&ehl_i2c_info }, { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&ehl_i2c_info }, { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&ehl_i2c_info }, + /* WCL */ + { PCI_VDEVICE(INTEL, 0x4d25), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d26), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d27), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d30), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d46), (kernel_ulong_t)&tgl_spi_info }, + { PCI_VDEVICE(INTEL, 0x4d50), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d51), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d52), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0x4d78), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d79), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d7a), (kernel_ulong_t)&ehl_i2c_info }, + { PCI_VDEVICE(INTEL, 0x4d7b), (kernel_ulong_t)&ehl_i2c_info }, /* JSL */ { PCI_VDEVICE(INTEL, 0x4da8), (kernel_ulong_t)&spt_uart_info }, { PCI_VDEVICE(INTEL, 0x4da9), (kernel_ulong_t)&spt_uart_info }, diff --git a/drivers/mfd/intel_soc_pmic_chtdc_ti.c b/drivers/mfd/intel_soc_pmic_chtdc_ti.c index 4c1a68c9f5750f..6daf33e07ea0a8 100644 --- a/drivers/mfd/intel_soc_pmic_chtdc_ti.c +++ b/drivers/mfd/intel_soc_pmic_chtdc_ti.c @@ -82,6 +82,8 @@ static const struct regmap_config chtdc_ti_regmap_config = { .reg_bits = 8, .val_bits = 8, .max_register = 0xff, + /* The hardware does not support reading multiple registers at once */ + .use_single_read = true, }; static const struct regmap_irq chtdc_ti_irqs[] = { diff --git a/drivers/mfd/kempld-core.c b/drivers/mfd/kempld-core.c index c5bfb6440a930f..c2008d2dc95aa9 100644 --- a/drivers/mfd/kempld-core.c +++ b/drivers/mfd/kempld-core.c @@ -141,10 +141,8 @@ static int kempld_create_platform_device(const struct kempld_platform_data *pdat }; kempld_pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(kempld_pdev)) - return PTR_ERR(kempld_pdev); - return 0; + return PTR_ERR_OR_ZERO(kempld_pdev); } /** @@ -779,22 +777,26 @@ MODULE_DEVICE_TABLE(dmi, kempld_dmi_table); static int __init kempld_init(void) { const struct dmi_system_id *id; - int ret = -ENODEV; - - for (id = dmi_first_match(kempld_dmi_table); id; id = dmi_first_match(id + 1)) { - /* Check, if user asked for the exact device ID match */ - if (force_device_id[0] && !strstr(id->ident, force_device_id)) - continue; - ret = kempld_create_platform_device(&kempld_platform_data_generic); - if (ret) - continue; - - break; + /* + * This custom DMI iteration allows the driver to be initialized in three ways: + * - When a forced_device_id string matches any ident in the kempld_dmi_table, + 
*   regardless of whether the DMI device is present in the system DMI table.
+ * - When a matching entry is present in the DMI system table.
+ * - Through alternative mechanisms like ACPI.
+ */
+	if (force_device_id[0]) {
+		for (id = kempld_dmi_table; id->matches[0].slot != DMI_NONE; id++)
+			if (strstr(id->ident, force_device_id))
+				if (!kempld_create_platform_device(&kempld_platform_data_generic))
+					break;
+		if (id->matches[0].slot == DMI_NONE)
+			return -ENODEV;
+	} else {
+		for (id = dmi_first_match(kempld_dmi_table); id; id = dmi_first_match(id + 1))
+			if (kempld_create_platform_device(&kempld_platform_data_generic))
+				break;
+	}
-	if (ret)
-		return ret;
-
 	return platform_driver_register(&kempld_driver);
 }
diff --git a/drivers/mfd/loongson-se.c b/drivers/mfd/loongson-se.c
new file mode 100644
index 00000000000000..3902ba377d6908
--- /dev/null
+++ b/drivers/mfd/loongson-se.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2025 Loongson Technology Corporation Limited
+ *
+ * Author: Yinggang Gu
+ * Author: Qunqin Zhao
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct loongson_se {
+	void __iomem *base;
+	spinlock_t dev_lock;
+	struct completion cmd_completion;
+
+	void *dmam_base;
+	int dmam_size;
+
+	struct mutex engine_init_lock;
+	struct loongson_se_engine engines[SE_ENGINE_MAX];
+};
+
+struct loongson_se_controller_cmd {
+	u32 command_id;
+	u32 info[7];
+};
+
+static int loongson_se_poll(struct loongson_se *se, u32 int_bit)
+{
+	u32 status;
+	int err;
+
+	spin_lock_irq(&se->dev_lock);
+
+	/* Notify the controller that the engine needs to be started */
+	writel(int_bit, se->base + SE_L2SINT_SET);
+
+	/* Poll until the controller has forwarded the engine command */
+	err = readl_relaxed_poll_timeout_atomic(se->base + SE_L2SINT_STAT, status,
+						!(status & int_bit),
+						1, LOONGSON_ENGINE_CMD_TIMEOUT_US);
+
+	spin_unlock_irq(&se->dev_lock);
+
+	return err;
+}
+
+static int loongson_se_send_controller_cmd(struct loongson_se *se,
+					   struct loongson_se_controller_cmd *cmd)
+{
+	u32 *send_cmd = (u32 *)cmd;
+	int err, i;
+
+	for (i = 0; i < SE_SEND_CMD_REG_LEN; i++)
+		writel(send_cmd[i], se->base + SE_SEND_CMD_REG + i * 4);
+
+	err = loongson_se_poll(se, SE_INT_CONTROLLER);
+	if (err)
+		return err;
+
+	return wait_for_completion_interruptible(&se->cmd_completion);
+}
+
+int loongson_se_send_engine_cmd(struct loongson_se_engine *engine)
+{
+	/*
+	 * After engine initialization, the controller already knows
+	 * where to obtain engine commands from. Now all we need to
+	 * do is notify the controller that the engine needs to be started.
+	 */
+	int err = loongson_se_poll(engine->se, BIT(engine->id));
+
+	if (err)
+		return err;
+
+	return wait_for_completion_interruptible(&engine->completion);
+}
+EXPORT_SYMBOL_GPL(loongson_se_send_engine_cmd);
+
+struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id)
+{
+	struct loongson_se *se = dev_get_drvdata(dev);
+	struct loongson_se_engine *engine = &se->engines[id];
+	struct loongson_se_controller_cmd cmd;
+
+	engine->se = se;
+	engine->id = id;
+	init_completion(&engine->completion);
+
+	/* Divide DMA memory equally among all engines */
+	engine->buffer_size = se->dmam_size / SE_ENGINE_MAX;
+	engine->buffer_off = (se->dmam_size / SE_ENGINE_MAX) * id;
+	engine->data_buffer = se->dmam_base + engine->buffer_off;
+
+	/*
+	 * There is no engine0, so use its data buffer as the command buffer for other
+	 * engines.
The DMA memory size is obtained from the ACPI table, which + * ensures that the data buffer size of engine0 is larger than the + * command buffer size of all engines. + */ + engine->command = se->dmam_base + id * (2 * SE_ENGINE_CMD_SIZE); + engine->command_ret = engine->command + SE_ENGINE_CMD_SIZE; + + mutex_lock(&se->engine_init_lock); + + /* Tell the controller where to find engine command */ + cmd.command_id = SE_CMD_SET_ENGINE_CMDBUF; + cmd.info[0] = id; + cmd.info[1] = engine->command - se->dmam_base; + cmd.info[2] = 2 * SE_ENGINE_CMD_SIZE; + + if (loongson_se_send_controller_cmd(se, &cmd)) + engine = NULL; + + mutex_unlock(&se->engine_init_lock); + + return engine; +} +EXPORT_SYMBOL_GPL(loongson_se_init_engine); + +static irqreturn_t se_irq_handler(int irq, void *dev_id) +{ + struct loongson_se *se = dev_id; + u32 int_status; + int id; + + spin_lock(&se->dev_lock); + + int_status = readl(se->base + SE_S2LINT_STAT); + + /* For controller */ + if (int_status & SE_INT_CONTROLLER) { + complete(&se->cmd_completion); + int_status &= ~SE_INT_CONTROLLER; + writel(SE_INT_CONTROLLER, se->base + SE_S2LINT_CL); + } + + /* For engines */ + while (int_status) { + id = __ffs(int_status); + complete(&se->engines[id].completion); + int_status &= ~BIT(id); + writel(BIT(id), se->base + SE_S2LINT_CL); + } + + spin_unlock(&se->dev_lock); + + return IRQ_HANDLED; +} + +static int loongson_se_init(struct loongson_se *se, dma_addr_t addr, int size) +{ + struct loongson_se_controller_cmd cmd; + int err; + + cmd.command_id = SE_CMD_START; + err = loongson_se_send_controller_cmd(se, &cmd); + if (err) + return err; + + cmd.command_id = SE_CMD_SET_DMA; + cmd.info[0] = lower_32_bits(addr); + cmd.info[1] = upper_32_bits(addr); + cmd.info[2] = size; + + return loongson_se_send_controller_cmd(se, &cmd); +} + +static const struct mfd_cell engines[] = { + { .name = "loongson-rng" }, + { .name = "tpm_loongson" }, +}; + +static int loongson_se_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct loongson_se *se; + int nr_irq, irq, err, i; + dma_addr_t paddr; + + se = devm_kmalloc(dev, sizeof(*se), GFP_KERNEL); + if (!se) + return -ENOMEM; + + dev_set_drvdata(dev, se); + init_completion(&se->cmd_completion); + spin_lock_init(&se->dev_lock); + mutex_init(&se->engine_init_lock); + + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (device_property_read_u32(dev, "dmam_size", &se->dmam_size)) + return -ENODEV; + + se->dmam_base = dmam_alloc_coherent(dev, se->dmam_size, &paddr, GFP_KERNEL); + if (!se->dmam_base) + return -ENOMEM; + + se->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(se->base)) + return PTR_ERR(se->base); + + writel(SE_INT_ALL, se->base + SE_S2LINT_EN); + + nr_irq = platform_irq_count(pdev); + if (nr_irq <= 0) + return -ENODEV; + + for (i = 0; i < nr_irq; i++) { + irq = platform_get_irq(pdev, i); + err = devm_request_irq(dev, irq, se_irq_handler, 0, "loongson-se", se); + if (err) + dev_err(dev, "failed to request IRQ: %d\n", irq); + } + + err = loongson_se_init(se, paddr, se->dmam_size); + if (err) + return err; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, engines, + ARRAY_SIZE(engines), NULL, 0, NULL); +} + +static const struct acpi_device_id loongson_se_acpi_match[] = { + { "LOON0011", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, loongson_se_acpi_match); + +static struct platform_driver loongson_se_driver = { + .probe = loongson_se_probe, + .driver = { + .name = "loongson-se", + .acpi_match_table = loongson_se_acpi_match, + }, +}; 
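For context, the consumer side of the two symbols exported above (loongson_se_init_engine() and loongson_se_send_engine_cmd()) might look as follows in one of the sub-device drivers. This is a hedged sketch: the engine id, the command words, and the assumed header name are placeholders, not the actual protocol of the RNG or TPM cells.

	#include <linux/mfd/loongson-se.h>	/* assumed location of the API */
	#include <linux/platform_device.h>

	static int demo_engine_xfer(struct platform_device *pdev)
	{
		struct loongson_se_engine *eng;
		u32 *cmd;
		int err;

		/* The MFD cell's parent is the loongson-se controller device. */
		eng = loongson_se_init_engine(pdev->dev.parent, 1 /* placeholder id */);
		if (!eng)
			return -ENODEV;

		/* Build an engine-specific command in the per-engine command slot. */
		cmd = eng->command;
		cmd[0] = 0;	/* placeholder command words */

		/* Kick the controller, then sleep until the engine IRQ completes us. */
		err = loongson_se_send_engine_cmd(eng);
		if (err)
			return err;

		/* Results are now in eng->command_ret and eng->data_buffer. */
		return 0;
	}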
+module_platform_driver(loongson_se_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Security Engine chip controller driver"); diff --git a/drivers/mfd/ls2k-bmc-core.c b/drivers/mfd/ls2k-bmc-core.c new file mode 100644 index 00000000000000..e162b3c7c9f821 --- /dev/null +++ b/drivers/mfd/ls2k-bmc-core.c @@ -0,0 +1,528 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Loongson-2K Board Management Controller (BMC) Core Driver. + * + * Copyright (C) 2024-2025 Loongson Technology Corporation Limited. + * + * Authors: + * Chong Qiao + * Binbin Zhou + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* LS2K BMC resources */ +#define LS2K_DISPLAY_RES_START (SZ_16M + SZ_2M) +#define LS2K_IPMI_RES_SIZE 0x1C +#define LS2K_IPMI0_RES_START (SZ_16M + 0xF00000) +#define LS2K_IPMI1_RES_START (LS2K_IPMI0_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI2_RES_START (LS2K_IPMI1_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI3_RES_START (LS2K_IPMI2_RES_START + LS2K_IPMI_RES_SIZE) +#define LS2K_IPMI4_RES_START (LS2K_IPMI3_RES_START + LS2K_IPMI_RES_SIZE) + +#define LS7A_PCI_CFG_SIZE 0x100 + +/* LS7A bridge registers */ +#define LS7A_PCIE_PORT_CTL0 0x0 +#define LS7A_PCIE_PORT_STS1 0xC +#define LS7A_GEN2_CTL 0x80C +#define LS7A_SYMBOL_TIMER 0x71C + +/* Bits of LS7A_PCIE_PORT_CTL0 */ +#define LS2K_BMC_PCIE_LTSSM_ENABLE BIT(3) + +/* Bits of LS7A_PCIE_PORT_STS1 */ +#define LS2K_BMC_PCIE_LTSSM_STS GENMASK(5, 0) +#define LS2K_BMC_PCIE_CONNECTED 0x11 + +#define LS2K_BMC_PCIE_DELAY_US 1000 +#define LS2K_BMC_PCIE_TIMEOUT_US 1000000 + +/* Bits of LS7A_GEN2_CTL */ +#define LS7A_GEN2_SPEED_CHANG BIT(17) +#define LS7A_CONF_PHY_TX BIT(18) + +/* Bits of LS7A_SYMBOL_TIMER */ +#define LS7A_MASK_LEN_MATCH BIT(26) + +/* Interval between interruptions */ +#define LS2K_BMC_INT_INTERVAL (60 * HZ) + +/* Maximum time to wait for U-Boot and DDR to be ready with ms. 
*/ +#define LS2K_BMC_RESET_WAIT_TIME 10000 + +/* It's an experience value */ +#define LS7A_BAR0_CHECK_MAX_TIMES 2000 + +#define PCI_REG_STRIDE 0x4 + +#define LS2K_BMC_RESET_GPIO 14 +#define LOONGSON_GPIO_REG_BASE 0x1FE00500 +#define LOONGSON_GPIO_REG_SIZE 0x18 +#define LOONGSON_GPIO_OEN 0x0 +#define LOONGSON_GPIO_FUNC 0x4 +#define LOONGSON_GPIO_INTPOL 0x10 +#define LOONGSON_GPIO_INTEN 0x14 + +#define LOONGSON_IO_INT_BASE 16 +#define LS2K_BMC_RESET_GPIO_INT_VEC (LS2K_BMC_RESET_GPIO % 8) +#define LS2K_BMC_RESET_GPIO_GSI (LOONGSON_IO_INT_BASE + LS2K_BMC_RESET_GPIO_INT_VEC) + +enum { + LS2K_BMC_DISPLAY, + LS2K_BMC_IPMI0, + LS2K_BMC_IPMI1, + LS2K_BMC_IPMI2, + LS2K_BMC_IPMI3, + LS2K_BMC_IPMI4, +}; + +static struct resource ls2k_display_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_DISPLAY_RES_START, SZ_4M, "simpledrm-res"), +}; + +static struct resource ls2k_ipmi0_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI0_RES_START, LS2K_IPMI_RES_SIZE, "ipmi0-res"), +}; + +static struct resource ls2k_ipmi1_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI1_RES_START, LS2K_IPMI_RES_SIZE, "ipmi1-res"), +}; + +static struct resource ls2k_ipmi2_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI2_RES_START, LS2K_IPMI_RES_SIZE, "ipmi2-res"), +}; + +static struct resource ls2k_ipmi3_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI3_RES_START, LS2K_IPMI_RES_SIZE, "ipmi3-res"), +}; + +static struct resource ls2k_ipmi4_resources[] = { + DEFINE_RES_MEM_NAMED(LS2K_IPMI4_RES_START, LS2K_IPMI_RES_SIZE, "ipmi4-res"), +}; + +static struct mfd_cell ls2k_bmc_cells[] = { + [LS2K_BMC_DISPLAY] = { + .name = "simple-framebuffer", + .num_resources = ARRAY_SIZE(ls2k_display_resources), + .resources = ls2k_display_resources + }, + [LS2K_BMC_IPMI0] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi0_resources), + .resources = ls2k_ipmi0_resources + }, + [LS2K_BMC_IPMI1] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi1_resources), + .resources = ls2k_ipmi1_resources + }, + [LS2K_BMC_IPMI2] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi2_resources), + .resources = ls2k_ipmi2_resources + }, + [LS2K_BMC_IPMI3] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi3_resources), + .resources = ls2k_ipmi3_resources + }, + [LS2K_BMC_IPMI4] = { + .name = "ls2k-ipmi-si", + .num_resources = ARRAY_SIZE(ls2k_ipmi4_resources), + .resources = ls2k_ipmi4_resources + }, +}; + +/* Index of the BMC PCI configuration space to be restored at BMC reset. */ +struct ls2k_bmc_pci_data { + u32 pci_command; + u32 base_address0; + u32 interrupt_line; +}; + +/* Index of the parent PCI configuration space to be restored at BMC reset. */ +struct ls2k_bmc_bridge_pci_data { + u32 pci_command; + u32 base_address[6]; + u32 rom_addreess; + u32 interrupt_line; + u32 msi_hi; + u32 msi_lo; + u32 devctl; + u32 linkcap; + u32 linkctl_sts; + u32 symbol_timer; + u32 gen2_ctrl; +}; + +struct ls2k_bmc_ddata { + struct device *dev; + struct work_struct bmc_reset_work; + struct ls2k_bmc_pci_data bmc_pci_data; + struct ls2k_bmc_bridge_pci_data bridge_pci_data; +}; + +static bool ls2k_bmc_bar0_addr_is_set(struct pci_dev *pdev) +{ + u32 addr; + + pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &addr); + + return addr & PCI_BASE_ADDRESS_MEM_MASK ? 
true : false; +} + +static bool ls2k_bmc_pcie_is_connected(struct pci_dev *parent, struct ls2k_bmc_ddata *ddata) +{ + void __iomem *base; + int val, ret; + + base = pci_iomap(parent, 0, LS7A_PCI_CFG_SIZE); + if (!base) + return false; + + val = readl(base + LS7A_PCIE_PORT_CTL0); + writel(val | LS2K_BMC_PCIE_LTSSM_ENABLE, base + LS7A_PCIE_PORT_CTL0); + + ret = readl_poll_timeout_atomic(base + LS7A_PCIE_PORT_STS1, val, + (val & LS2K_BMC_PCIE_LTSSM_STS) == LS2K_BMC_PCIE_CONNECTED, + LS2K_BMC_PCIE_DELAY_US, LS2K_BMC_PCIE_TIMEOUT_US); + if (ret) { + pci_iounmap(parent, base); + dev_err(ddata->dev, "PCI-E training failed status=0x%x\n", val); + return false; + } + + pci_iounmap(parent, base); + return true; +} + +static void ls2k_bmc_restore_bridge_pci_data(struct pci_dev *parent, struct ls2k_bmc_ddata *ddata) +{ + int base, i = 0; + + pci_write_config_dword(parent, PCI_COMMAND, ddata->bridge_pci_data.pci_command); + + for (base = PCI_BASE_ADDRESS_0; base <= PCI_BASE_ADDRESS_5; base += PCI_REG_STRIDE, i++) + pci_write_config_dword(parent, base, ddata->bridge_pci_data.base_address[i]); + + pci_write_config_dword(parent, PCI_ROM_ADDRESS, ddata->bridge_pci_data.rom_addreess); + pci_write_config_dword(parent, PCI_INTERRUPT_LINE, ddata->bridge_pci_data.interrupt_line); + + pci_write_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_LO, + ddata->bridge_pci_data.msi_lo); + pci_write_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_HI, + ddata->bridge_pci_data.msi_hi); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_DEVCTL, + ddata->bridge_pci_data.devctl); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCAP, + ddata->bridge_pci_data.linkcap); + pci_write_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCTL, + ddata->bridge_pci_data.linkctl_sts); + + pci_write_config_dword(parent, LS7A_GEN2_CTL, ddata->bridge_pci_data.gen2_ctrl); + pci_write_config_dword(parent, LS7A_SYMBOL_TIMER, ddata->bridge_pci_data.symbol_timer); +} + +static int ls2k_bmc_recover_pci_data(void *data) +{ + struct ls2k_bmc_ddata *ddata = data; + struct pci_dev *pdev = to_pci_dev(ddata->dev); + struct pci_dev *parent = pdev->bus->self; + u32 i; + + /* + * Clear the bus, io and mem resources of the PCI-E bridge to zero, so that + * the processor can not access the LS2K PCI-E port, to avoid crashing due to + * the lack of return signal from accessing the LS2K PCI-E port. + */ + pci_write_config_dword(parent, PCI_BASE_ADDRESS_2, 0); + pci_write_config_dword(parent, PCI_BASE_ADDRESS_3, 0); + pci_write_config_dword(parent, PCI_BASE_ADDRESS_4, 0); + + /* + * When the LS2K BMC is reset, the LS7A PCI-E port is also reset, and its PCI + * BAR0 register is cleared. Due to the time gap between the GPIO interrupt + * generation and the LS2K BMC reset, the LS7A PCI BAR0 register is read to + * determine whether the reset has begun. 
+	 */
+	for (i = LS7A_BAR0_CHECK_MAX_TIMES; i > 0; i--) {
+		if (!ls2k_bmc_bar0_addr_is_set(parent))
+			break;
+		mdelay(1);
+	}
+
+	if (i == 0)
+		return -ETIMEDOUT;
+
+	ls2k_bmc_restore_bridge_pci_data(parent, ddata);
+
+	/* Check if PCI-E is connected */
+	if (!ls2k_bmc_pcie_is_connected(parent, ddata))
+		return -ENODEV;
+
+	/* Wait for U-Boot and DDR to be ready */
+	mdelay(LS2K_BMC_RESET_WAIT_TIME);
+	if (!ls2k_bmc_bar0_addr_is_set(parent))
+		return -ENODEV;
+
+	/* Restore LS2K BMC PCI-E config data */
+	pci_write_config_dword(pdev, PCI_COMMAND, ddata->bmc_pci_data.pci_command);
+	pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, ddata->bmc_pci_data.base_address0);
+	pci_write_config_dword(pdev, PCI_INTERRUPT_LINE, ddata->bmc_pci_data.interrupt_line);
+
+	return 0;
+}
+
+static void ls2k_bmc_events_fn(struct work_struct *work)
+{
+	struct ls2k_bmc_ddata *ddata = container_of(work, struct ls2k_bmc_ddata, bmc_reset_work);
+
+	/*
+	 * The PCI-E is lost when the BMC resets, at which point access to the PCI-E
+	 * from other CPUs is suspended to prevent a crash.
+	 */
+	stop_machine(ls2k_bmc_recover_pci_data, ddata, NULL);
+
+	if (IS_ENABLED(CONFIG_VT)) {
+		/* Re-push the display due to previous PCI-E loss. */
+		set_console(vt_move_to_console(MAX_NR_CONSOLES - 1, 1));
+	}
+}
+
+static irqreturn_t ls2k_bmc_interrupt(int irq, void *arg)
+{
+	struct ls2k_bmc_ddata *ddata = arg;
+	static unsigned long last_jiffies;
+
+	if (system_state != SYSTEM_RUNNING)
+		return IRQ_HANDLED;
+
+	/* Handle at most one reset event per LS2K_BMC_INT_INTERVAL */
+	if (time_after(jiffies, last_jiffies + LS2K_BMC_INT_INTERVAL)) {
+		schedule_work(&ddata->bmc_reset_work);
+		last_jiffies = jiffies;
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Saves the BMC parent device (LS7A) and its own PCI configuration space registers
+ * that need to be restored after BMC reset.
+ */ +static void ls2k_bmc_save_pci_data(struct pci_dev *pdev, struct ls2k_bmc_ddata *ddata) +{ + struct pci_dev *parent = pdev->bus->self; + int base, i = 0; + + pci_read_config_dword(parent, PCI_COMMAND, &ddata->bridge_pci_data.pci_command); + + for (base = PCI_BASE_ADDRESS_0; base <= PCI_BASE_ADDRESS_5; base += PCI_REG_STRIDE, i++) + pci_read_config_dword(parent, base, &ddata->bridge_pci_data.base_address[i]); + + pci_read_config_dword(parent, PCI_ROM_ADDRESS, &ddata->bridge_pci_data.rom_addreess); + pci_read_config_dword(parent, PCI_INTERRUPT_LINE, &ddata->bridge_pci_data.interrupt_line); + + pci_read_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_LO, + &ddata->bridge_pci_data.msi_lo); + pci_read_config_dword(parent, parent->msi_cap + PCI_MSI_ADDRESS_HI, + &ddata->bridge_pci_data.msi_hi); + + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_DEVCTL, + &ddata->bridge_pci_data.devctl); + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCAP, + &ddata->bridge_pci_data.linkcap); + pci_read_config_dword(parent, parent->pcie_cap + PCI_EXP_LNKCTL, + &ddata->bridge_pci_data.linkctl_sts); + + pci_read_config_dword(parent, LS7A_GEN2_CTL, &ddata->bridge_pci_data.gen2_ctrl); + ddata->bridge_pci_data.gen2_ctrl |= FIELD_PREP(LS7A_GEN2_SPEED_CHANG, 0x1) | + FIELD_PREP(LS7A_CONF_PHY_TX, 0x0); + + pci_read_config_dword(parent, LS7A_SYMBOL_TIMER, &ddata->bridge_pci_data.symbol_timer); + ddata->bridge_pci_data.symbol_timer |= LS7A_MASK_LEN_MATCH; + + pci_read_config_dword(pdev, PCI_COMMAND, &ddata->bmc_pci_data.pci_command); + pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &ddata->bmc_pci_data.base_address0); + pci_read_config_dword(pdev, PCI_INTERRUPT_LINE, &ddata->bmc_pci_data.interrupt_line); +} + +static int ls2k_bmc_init(struct ls2k_bmc_ddata *ddata) +{ + struct pci_dev *pdev = to_pci_dev(ddata->dev); + void __iomem *gpio_base; + int gpio_irq, ret, val; + + ls2k_bmc_save_pci_data(pdev, ddata); + + INIT_WORK(&ddata->bmc_reset_work, ls2k_bmc_events_fn); + + ret = devm_request_irq(&pdev->dev, pdev->irq, ls2k_bmc_interrupt, + IRQF_SHARED | IRQF_TRIGGER_FALLING, "ls2kbmc pcie", ddata); + if (ret) { + dev_err(ddata->dev, "Failed to request LS2KBMC PCI-E IRQ %d.\n", pdev->irq); + return ret; + } + + gpio_base = ioremap(LOONGSON_GPIO_REG_BASE, LOONGSON_GPIO_REG_SIZE); + if (!gpio_base) + return -ENOMEM; + + /* Disable GPIO output */ + val = readl(gpio_base + LOONGSON_GPIO_OEN); + writel(val | BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_OEN); + + /* Enable GPIO functionality */ + val = readl(gpio_base + LOONGSON_GPIO_FUNC); + writel(val & ~BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_FUNC); + + /* Set GPIO interrupts to low-level active */ + val = readl(gpio_base + LOONGSON_GPIO_INTPOL); + writel(val & ~BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_INTPOL); + + /* Enable GPIO interrupts */ + val = readl(gpio_base + LOONGSON_GPIO_INTEN); + writel(val | BIT(LS2K_BMC_RESET_GPIO), gpio_base + LOONGSON_GPIO_INTEN); + + iounmap(gpio_base); + + /* + * Since gpio_chip->to_irq is not implemented in the Loongson-3 GPIO driver, + * acpi_register_gsi() is used to obtain the GPIO IRQ. The GPIO interrupt is a + * watchdog interrupt that is triggered when the BMC resets. 
+	 */
+	gpio_irq = acpi_register_gsi(NULL, LS2K_BMC_RESET_GPIO_GSI, ACPI_EDGE_SENSITIVE,
+				     ACPI_ACTIVE_LOW);
+	if (gpio_irq < 0)
+		return gpio_irq;
+
+	ret = devm_request_irq(ddata->dev, gpio_irq, ls2k_bmc_interrupt,
+			       IRQF_SHARED | IRQF_TRIGGER_FALLING, "ls2kbmc gpio", ddata);
+	if (ret)
+		dev_err(ddata->dev, "Failed to request LS2KBMC GPIO IRQ %d.\n", gpio_irq);
+
+	acpi_unregister_gsi(LS2K_BMC_RESET_GPIO_GSI);
+	return ret;
+}
+
+/*
+ * Currently the Loongson-2K BMC hardware has no I2C interface over which the
+ * display resolution could be queried. Instead, the resolution is preset as a
+ * string such as "video=1280x1024-16@2M" in BMC memory.
+ */
+static int ls2k_bmc_parse_mode(struct pci_dev *pdev, struct simplefb_platform_data *pd)
+{
+	char *mode;
+	int depth, ret;
+
+	/* The last 16M of PCI BAR0 is used to store the resolution string. */
+	mode = devm_ioremap(&pdev->dev, pci_resource_start(pdev, 0) + SZ_16M, SZ_16M);
+	if (!mode)
+		return -ENOMEM;
+
+	/* The resolution field starts with the flag "video=". */
+	if (!strncmp(mode, "video=", 6))
+		mode = mode + 6;
+
+	ret = kstrtoint(strsep(&mode, "x"), 10, &pd->width);
+	if (ret)
+		return ret;
+
+	ret = kstrtoint(strsep(&mode, "-"), 10, &pd->height);
+	if (ret)
+		return ret;
+
+	ret = kstrtoint(strsep(&mode, "@"), 10, &depth);
+	if (ret)
+		return ret;
+
+	pd->stride = pd->width * depth / 8;
+	pd->format = depth == 32 ? "a8r8g8b8" : "r5g6b5";
+
+	return 0;
+}
+
+static int ls2k_bmc_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	struct simplefb_platform_data pd;
+	struct ls2k_bmc_ddata *ddata;
+	resource_size_t base;
+	int ret;
+
+	ret = pci_enable_device(dev);
+	if (ret)
+		return ret;
+
+	ddata = devm_kzalloc(&dev->dev, sizeof(*ddata), GFP_KERNEL);
+	if (!ddata) {
+		ret = -ENOMEM;
+		goto disable_pci;
+	}
+
+	ddata->dev = &dev->dev;
+
+	ret = ls2k_bmc_init(ddata);
+	if (ret)
+		goto disable_pci;
+
+	ret = ls2k_bmc_parse_mode(dev, &pd);
+	if (ret)
+		goto disable_pci;
+
+	ls2k_bmc_cells[LS2K_BMC_DISPLAY].platform_data = &pd;
+	ls2k_bmc_cells[LS2K_BMC_DISPLAY].pdata_size = sizeof(pd);
+	base = dev->resource[0].start + LS2K_DISPLAY_RES_START;
+
+	/* Remove conflicting efifb device */
+	ret = aperture_remove_conflicting_devices(base, SZ_4M, "simple-framebuffer");
+	if (ret) {
+		dev_err(&dev->dev, "Failed to remove firmware framebuffers: %d\n", ret);
+		goto disable_pci;
+	}
+
+	return devm_mfd_add_devices(&dev->dev, PLATFORM_DEVID_AUTO,
+				    ls2k_bmc_cells, ARRAY_SIZE(ls2k_bmc_cells),
+				    &dev->resource[0], 0, NULL);
+
+disable_pci:
+	pci_disable_device(dev);
+	return ret;
+}
+
+static void ls2k_bmc_remove(struct pci_dev *dev)
+{
+	pci_disable_device(dev);
+}
+
+static const struct pci_device_id ls2k_bmc_devices[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_LOONGSON, 0x1a05) },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, ls2k_bmc_devices);
+
+static struct pci_driver ls2k_bmc_driver = {
+	.name = "ls2k-bmc",
+	.id_table = ls2k_bmc_devices,
+	.probe = ls2k_bmc_probe,
+	.remove = ls2k_bmc_remove,
+};
+module_pci_driver(ls2k_bmc_driver);
+
+MODULE_DESCRIPTION("Loongson-2K Board Management Controller (BMC) Core driver");
+MODULE_AUTHOR("Loongson Technology Corporation Limited");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c
index 870c8b2028a8fc..e6cdae221f1d4e 100644
--- a/drivers/mfd/macsmc.c
+++ b/drivers/mfd/macsmc.c
@@ -429,7 +429,7 @@ static int apple_smc_probe(struct platform_device *pdev)
 	ret = devm_add_action_or_reset(dev, apple_smc_rtkit_shutdown, smc);
 	if (ret)
-		return dev_err_probe(dev, ret, "Failed to register rtkit shutdown
action"); + return ret; ret = apple_rtkit_start_ep(smc->rtk, SMC_ENDPOINT); if (ret) @@ -465,7 +465,7 @@ static int apple_smc_probe(struct platform_device *pdev) apple_smc_write_flag(smc, SMC_KEY(NTAP), true); ret = devm_add_action_or_reset(dev, apple_smc_disable_notifications, smc); if (ret) - return dev_err_probe(dev, ret, "Failed to register notification disable action"); + return ret; ret = devm_mfd_add_devices(smc->dev, PLATFORM_DEVID_NONE, apple_smc_devs, ARRAY_SIZE(apple_smc_devs), @@ -478,6 +478,7 @@ static int apple_smc_probe(struct platform_device *pdev) } static const struct of_device_id apple_smc_of_match[] = { + { .compatible = "apple,t8103-smc" }, { .compatible = "apple,smc" }, {}, }; diff --git a/drivers/mfd/madera-core.c b/drivers/mfd/madera-core.c index bdbd5bfc971456..2f74a8c644a32a 100644 --- a/drivers/mfd/madera-core.c +++ b/drivers/mfd/madera-core.c @@ -456,7 +456,7 @@ int madera_dev_init(struct madera *madera) struct device *dev = madera->dev; unsigned int hwid; int (*patch_fn)(struct madera *) = NULL; - const struct mfd_cell *mfd_devs; + const struct mfd_cell *mfd_devs = NULL; int n_devs = 0; int i, ret; @@ -670,7 +670,7 @@ int madera_dev_init(struct madera *madera) goto err_reset; } - if (!n_devs) { + if (!n_devs || !mfd_devs) { dev_err(madera->dev, "Device ID 0x%x not a %s\n", hwid, madera->type_name); ret = -ENODEV; diff --git a/drivers/mfd/max7360.c b/drivers/mfd/max7360.c new file mode 100644 index 00000000000000..5ee459c490ecd5 --- /dev/null +++ b/drivers/mfd/max7360.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Maxim MAX7360 Core Driver + * + * Copyright 2025 Bootlin + * + * Authors: + * Kamel Bouhara + * Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell max7360_cells[] = { + { .name = "max7360-pinctrl" }, + { .name = "max7360-pwm" }, + { .name = "max7360-keypad" }, + { .name = "max7360-rotary" }, + { + .name = "max7360-gpo", + .of_compatible = "maxim,max7360-gpo", + }, + { + .name = "max7360-gpio", + .of_compatible = "maxim,max7360-gpio", + }, +}; + +static const struct regmap_range max7360_volatile_ranges[] = { + regmap_reg_range(MAX7360_REG_KEYFIFO, MAX7360_REG_KEYFIFO), + regmap_reg_range(MAX7360_REG_I2C_TIMEOUT, MAX7360_REG_RTR_CNT), +}; + +static const struct regmap_access_table max7360_volatile_table = { + .yes_ranges = max7360_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(max7360_volatile_ranges), +}; + +static const struct regmap_config max7360_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX7360_REG_PWMCFG(MAX7360_PORT_PWM_COUNT - 1), + .volatile_table = &max7360_volatile_table, + .cache_type = REGCACHE_MAPLE, +}; + +static int max7360_mask_irqs(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + unsigned int val; + int ret; + + /* + * GPIO/PWM interrupts are not masked on reset: as the MAX7360 "INTI" + * interrupt line is shared between GPIOs and rotary encoder, this could + * result in repeated spurious interrupts on the rotary encoder driver + * if the GPIO driver is not loaded. Mask them now to avoid this + * situation. 
+ */ + for (unsigned int i = 0; i < MAX7360_PORT_PWM_COUNT; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_MASK, + MAX7360_PORT_CFG_INTERRUPT_MASK); + if (ret) + return dev_err_probe(dev, ret, + "Failed to write MAX7360 port configuration\n"); + } + + /* Read GPIO in register, to ACK any pending IRQ. */ + ret = regmap_read(regmap, MAX7360_REG_GPIOIN, &val); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO values\n"); + + return 0; +} + +static int max7360_reset(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + int ret; + + ret = regmap_write(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_GPIO_RST); + if (ret) { + dev_err(dev, "Failed to reset GPIO configuration: %x\n", ret); + return ret; + } + + ret = regcache_drop_region(regmap, MAX7360_REG_GPIOCFG, MAX7360_REG_GPIO_LAST); + if (ret) { + dev_err(dev, "Failed to drop regmap cache: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_SLEEP, 0); + if (ret) { + dev_err(dev, "Failed to reset autosleep configuration: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_DEBOUNCE, 0); + if (ret) + dev_err(dev, "Failed to reset GPO port count: %x\n", ret); + + return ret; +} + +static int max7360_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct regmap *regmap; + int ret; + + regmap = devm_regmap_init_i2c(client, &max7360_regmap_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "Failed to initialise regmap\n"); + + ret = max7360_reset(regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to reset device\n"); + + /* Get the device out of shutdown mode. */ + ret = regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, + MAX7360_GPIO_CFG_GPIO_EN, + MAX7360_GPIO_CFG_GPIO_EN); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable GPIO and PWM module\n"); + + ret = max7360_mask_irqs(regmap); + if (ret) + return dev_err_probe(dev, ret, "Could not mask interrupts\n"); + + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + max7360_cells, ARRAY_SIZE(max7360_cells), + NULL, 0, NULL); + if (ret) + return dev_err_probe(dev, ret, "Failed to register child devices\n"); + + return 0; +} + +static const struct of_device_id max7360_dt_match[] = { + { .compatible = "maxim,max7360" }, + {} +}; +MODULE_DEVICE_TABLE(of, max7360_dt_match); + +static struct i2c_driver max7360_driver = { + .driver = { + .name = "max7360", + .of_match_table = max7360_dt_match, + }, + .probe = max7360_probe, +}; +module_i2c_driver(max7360_driver); + +MODULE_DESCRIPTION("Maxim MAX7360 I2C IO Expander core driver"); +MODULE_AUTHOR("Kamel Bouhara "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/max77705.c b/drivers/mfd/max77705.c index 6b263bacb8c28d..e1a9bfd6585603 100644 --- a/drivers/mfd/max77705.c +++ b/drivers/mfd/max77705.c @@ -61,21 +61,21 @@ static const struct regmap_config max77705_regmap_config = { .max_register = MAX77705_PMIC_REG_USBC_RESET, }; -static const struct regmap_irq max77705_topsys_irqs[] = { - { .mask = MAX77705_SYSTEM_IRQ_BSTEN_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_SYSUVLO_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_SYSOVLO_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_TSHDN_INT, }, - { .mask = MAX77705_SYSTEM_IRQ_TM_INT, }, +static const struct regmap_irq max77705_irqs[] = { + { .mask = MAX77705_SRC_IRQ_CHG, }, + { .mask = MAX77705_SRC_IRQ_TOP, }, + { .mask = MAX77705_SRC_IRQ_FG, }, + { .mask = MAX77705_SRC_IRQ_USBC, }, }; -static const struct regmap_irq_chip 
max77705_topsys_irq_chip = { - .name = "max77705-topsys", - .status_base = MAX77705_PMIC_REG_SYSTEM_INT, - .mask_base = MAX77705_PMIC_REG_SYSTEM_INT_MASK, +static const struct regmap_irq_chip max77705_irq_chip = { + .name = "max77705", + .status_base = MAX77705_PMIC_REG_INTSRC, + .ack_base = MAX77705_PMIC_REG_INTSRC, + .mask_base = MAX77705_PMIC_REG_INTSRC_MASK, .num_regs = 1, - .irqs = max77705_topsys_irqs, - .num_irqs = ARRAY_SIZE(max77705_topsys_irqs), + .irqs = max77705_irqs, + .num_irqs = ARRAY_SIZE(max77705_irqs), }; static int max77705_i2c_probe(struct i2c_client *i2c) @@ -108,21 +108,17 @@ static int max77705_i2c_probe(struct i2c_client *i2c) if (pmic_rev != MAX77705_PASS3) return dev_err_probe(dev, -ENODEV, "Rev.0x%x is not tested\n", pmic_rev); + /* Active Discharge Enable */ + regmap_update_bits(max77705->regmap, MAX77705_PMIC_REG_MAINCTRL1, 1, 1); + ret = devm_regmap_add_irq_chip(dev, max77705->regmap, i2c->irq, - IRQF_ONESHOT | IRQF_SHARED, 0, - &max77705_topsys_irq_chip, + IRQF_ONESHOT, 0, + &max77705_irq_chip, &irq_data); if (ret) return dev_err_probe(dev, ret, "Failed to add IRQ chip\n"); - /* Unmask interrupts from all blocks in interrupt source register */ - ret = regmap_update_bits(max77705->regmap, - MAX77705_PMIC_REG_INTSRC_MASK, - MAX77705_SRC_IRQ_ALL, (unsigned int)~MAX77705_SRC_IRQ_ALL); - if (ret < 0) - return dev_err_probe(dev, ret, "Could not unmask interrupts in INTSRC\n"); - domain = regmap_irq_get_domain(irq_data); ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index ffe96b40368e12..7ba8ed1dfde3ea 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -438,7 +438,7 @@ static int max8997_suspend(struct device *dev) disable_irq(max8997->irq); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8997->irq, 1); + enable_irq_wake(max8997->irq); return 0; } @@ -448,7 +448,7 @@ static int max8997_resume(struct device *dev) struct max8997_dev *max8997 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8997->irq, 0); + disable_irq_wake(max8997->irq); enable_irq(max8997->irq); return max8997_irq_resume(max8997); } diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 6ba27171da28b7..eb13bbaeda5528 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -234,7 +234,7 @@ static int max8998_suspend(struct device *dev) struct max8998_dev *max8998 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8998->irq, 1); + enable_irq_wake(max8998->irq); return 0; } @@ -244,7 +244,7 @@ static int max8998_resume(struct device *dev) struct max8998_dev *max8998 = i2c_get_clientdata(i2c); if (device_may_wakeup(dev)) - irq_set_irq_wake(max8998->irq, 0); + disable_irq_wake(max8998->irq); /* * In LP3974, if IRQ registers are not "read & clear" * when it's set during sleep, the interrupt becomes diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index 76bd316a50afc5..7d14a1e7631ee8 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -131,6 +131,7 @@ static int mfd_match_of_node_to_dev(struct platform_device *pdev, of_entry->np = np; list_add_tail(&of_entry->list, &mfd_of_node_list); + of_node_get(np); device_set_node(&pdev->dev, of_fwnode_handle(np)); #endif return 0; diff --git a/drivers/mfd/nct6694.c b/drivers/mfd/nct6694.c new file mode 100644 index 00000000000000..308b2fda3055c5 --- /dev/null +++ b/drivers/mfd/nct6694.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Nuvoton Technology 
Corp. + * + * Nuvoton NCT6694 core driver using USB interface to provide + * access to the NCT6694 hardware monitoring and control features. + * + * The NCT6694 is an integrated controller that provides GPIO, I2C, + * CAN, WDT, HWMON and RTC management. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell nct6694_devs[] = { + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + + MFD_CELL_NAME("nct6694-canfd"), + MFD_CELL_NAME("nct6694-canfd"), + + MFD_CELL_NAME("nct6694-wdt"), + MFD_CELL_NAME("nct6694-wdt"), + + MFD_CELL_NAME("nct6694-hwmon"), + + MFD_CELL_NAME("nct6694-rtc"), +}; + +static int nct6694_response_err_handling(struct nct6694 *nct6694, unsigned char err_status) +{ + switch (err_status) { + case NCT6694_NO_ERROR: + return 0; + case NCT6694_NOT_SUPPORT_ERROR: + dev_err(nct6694->dev, "Command is not supported!\n"); + break; + case NCT6694_NO_RESPONSE_ERROR: + dev_warn(nct6694->dev, "Command received no response!\n"); + break; + case NCT6694_TIMEOUT_ERROR: + dev_warn(nct6694->dev, "Command timed out!\n"); + break; + case NCT6694_PENDING: + dev_err(nct6694->dev, "Command is pending!\n"); + break; + default: + return -EINVAL; + } + + return -EIO; +} + +/** + * nct6694_read_msg() - Read message from NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer to store the response data + * + * Sends a command to the NCT6694 device and reads the response. + * The command header is specified in @cmd_hd, and the response + * data is stored in @buf. + * + * Return: Negative value on error or 0 on success. 
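+ *
+ * Context: holds nct6694->access_lock for the whole command/response/data
+ * exchange, so transactions from different callers cannot interleave.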
+ */ +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_GET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected received length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_read_msg); + +/** + * nct6694_write_msg() - Write message to NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer containing the data to be sent + * + * Sends a command to the NCT6694 device and writes the data + * from @buf. The command header is specified in @cmd_hd. + * + * Return: Negative value on error or 0 on success. + */ +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_SET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Send data packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), buf, + le16_to_cpu(cmd_hd->len), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected transmitted length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_write_msg); + +static void usb_int_callback(struct urb *urb) +{ + struct nct6694 *nct6694 = urb->context; + __le32 *status_le = urb->transfer_buffer; + u32 int_status; + int ret; + + switch (urb->status) { + case 0: + break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + return; + default: + goto resubmit; + } + + int_status = le32_to_cpu(*status_le); + + while (int_status) { + int irq = __ffs(int_status); + + 
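+		/* Hand this status bit to the virq mapped in our IRQ domain. */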
generic_handle_irq_safe(irq_find_mapping(nct6694->domain, irq)); + int_status &= ~BIT(irq); + } + +resubmit: + ret = usb_submit_urb(urb, GFP_ATOMIC); + if (ret) + dev_warn(nct6694->dev, "Failed to resubmit urb, status %pe", ERR_PTR(ret)); +} + +static void nct6694_irq_enable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable |= BIT(hwirq); +} + +static void nct6694_irq_disable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable &= ~BIT(hwirq); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "nct6694-irq", + .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_enable = nct6694_irq_enable, + .irq_disable = nct6694_irq_disable, +}; + +static int nct6694_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + struct nct6694 *nct6694 = d->host_data; + + irq_set_chip_data(irq, nct6694); + irq_set_chip_and_handler(irq, &nct6694_irq_chip, handle_simple_irq); + + return 0; +} + +static void nct6694_irq_domain_unmap(struct irq_domain *d, unsigned int irq) +{ + irq_set_chip_and_handler(irq, NULL, NULL); + irq_set_chip_data(irq, NULL); +} + +static const struct irq_domain_ops nct6694_irq_domain_ops = { + .map = nct6694_irq_domain_map, + .unmap = nct6694_irq_domain_unmap, +}; + +static int nct6694_usb_probe(struct usb_interface *iface, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(iface); + struct usb_endpoint_descriptor *int_endpoint; + struct usb_host_interface *interface; + struct device *dev = &iface->dev; + struct nct6694 *nct6694; + int ret; + + nct6694 = devm_kzalloc(dev, sizeof(*nct6694), GFP_KERNEL); + if (!nct6694) + return -ENOMEM; + + nct6694->usb_msg = devm_kzalloc(dev, sizeof(union nct6694_usb_msg), GFP_KERNEL); + if (!nct6694->usb_msg) + return -ENOMEM; + + nct6694->int_buffer = devm_kzalloc(dev, sizeof(*nct6694->int_buffer), GFP_KERNEL); + if (!nct6694->int_buffer) + return -ENOMEM; + + nct6694->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!nct6694->int_in_urb) + return -ENOMEM; + + nct6694->domain = irq_domain_create_simple(NULL, NCT6694_NR_IRQS, 0, + &nct6694_irq_domain_ops, + nct6694); + if (!nct6694->domain) { + ret = -ENODEV; + goto err_urb; + } + + nct6694->dev = dev; + nct6694->udev = udev; + + ida_init(&nct6694->gpio_ida); + ida_init(&nct6694->i2c_ida); + ida_init(&nct6694->canfd_ida); + ida_init(&nct6694->wdt_ida); + + spin_lock_init(&nct6694->irq_lock); + + ret = devm_mutex_init(dev, &nct6694->access_lock); + if (ret) + goto err_ida; + + interface = iface->cur_altsetting; + + int_endpoint = &interface->endpoint[0].desc; + if (!usb_endpoint_is_int_in(int_endpoint)) { + ret = -ENODEV; + goto err_ida; + } + + usb_fill_int_urb(nct6694->int_in_urb, udev, usb_rcvintpipe(udev, NCT6694_INT_IN_EP), + nct6694->int_buffer, sizeof(*nct6694->int_buffer), usb_int_callback, + nct6694, int_endpoint->bInterval); + + ret = usb_submit_urb(nct6694->int_in_urb, GFP_KERNEL); + if (ret) + goto err_ida; + + usb_set_intfdata(iface, nct6694); + + ret = mfd_add_hotplug_devices(dev, nct6694_devs, ARRAY_SIZE(nct6694_devs)); + if (ret) + goto err_mfd; + + return 0; + +err_mfd: + usb_kill_urb(nct6694->int_in_urb); +err_ida: + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + 
ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); +err_urb: + usb_free_urb(nct6694->int_in_urb); + return ret; +} + +static void nct6694_usb_disconnect(struct usb_interface *iface) +{ + struct nct6694 *nct6694 = usb_get_intfdata(iface); + + mfd_remove_devices(nct6694->dev); + usb_kill_urb(nct6694->int_in_urb); + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); + usb_free_urb(nct6694->int_in_urb); +} + +static const struct usb_device_id nct6694_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(NCT6694_VENDOR_ID, NCT6694_PRODUCT_ID, 0xFF, 0x00, 0x00) }, + { } +}; +MODULE_DEVICE_TABLE(usb, nct6694_ids); + +static struct usb_driver nct6694_usb_driver = { + .name = "nct6694", + .id_table = nct6694_ids, + .probe = nct6694_usb_probe, + .disconnect = nct6694_usb_disconnect, +}; +module_usb_driver(nct6694_usb_driver); + +MODULE_DESCRIPTION("Nuvoton NCT6694 core driver"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/qnap-mcu.c b/drivers/mfd/qnap-mcu.c index 89a8a1913d42dd..4ec1f4cf902f60 100644 --- a/drivers/mfd/qnap-mcu.c +++ b/drivers/mfd/qnap-mcu.c @@ -150,40 +150,40 @@ int qnap_mcu_exec(struct qnap_mcu *mcu, size_t length = reply_data_size + QNAP_MCU_CHECKSUM_SIZE; struct qnap_mcu_reply *reply = &mcu->reply; int ret = 0; + u8 crc; if (length > sizeof(rx)) { dev_err(&mcu->serdev->dev, "expected data too big for receive buffer"); return -EINVAL; } - mutex_lock(&mcu->bus_lock); + guard(mutex)(&mcu->bus_lock); reply->data = rx; reply->length = length; reply->received = 0; reinit_completion(&reply->done); - qnap_mcu_write(mcu, cmd_data, cmd_data_size); + ret = qnap_mcu_write(mcu, cmd_data, cmd_data_size); + if (ret < 0) + return ret; serdev_device_wait_until_sent(mcu->serdev, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS)); if (!wait_for_completion_timeout(&reply->done, msecs_to_jiffies(QNAP_MCU_TIMEOUT_MS))) { dev_err(&mcu->serdev->dev, "Command timeout\n"); - ret = -ETIMEDOUT; - } else { - u8 crc = qnap_mcu_csum(rx, reply_data_size); - - if (crc != rx[reply_data_size]) { - dev_err(&mcu->serdev->dev, - "Invalid Checksum received\n"); - ret = -EIO; - } else { - memcpy(reply_data, rx, reply_data_size); - } + return -ETIMEDOUT; } - mutex_unlock(&mcu->bus_lock); - return ret; + crc = qnap_mcu_csum(rx, reply_data_size); + if (crc != rx[reply_data_size]) { + dev_err(&mcu->serdev->dev, "Invalid Checksum received\n"); + return -EIO; + } + + memcpy(reply_data, rx, reply_data_size); + + return 0; } EXPORT_SYMBOL_GPL(qnap_mcu_exec); @@ -247,6 +247,14 @@ static int qnap_mcu_power_off(struct sys_off_data *data) return NOTIFY_DONE; } +static const struct qnap_mcu_variant qnap_ts233_mcu = { + .baud_rate = 115200, + .num_drives = 2, + .fan_pwm_min = 51, /* Specified in original model.conf */ + .fan_pwm_max = 255, + .usb_led = true, +}; + static const struct qnap_mcu_variant qnap_ts433_mcu = { .baud_rate = 115200, .num_drives = 4, @@ -319,6 +327,7 @@ static int qnap_mcu_probe(struct serdev_device *serdev) } static const struct of_device_id qnap_mcu_dt_ids[] = { + { .compatible = "qnap,ts233-mcu", .data = &qnap_ts233_mcu }, { .compatible = "qnap,ts433-mcu", .data = &qnap_ts433_mcu }, { /* sentinel */ } }; diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c index a14b7aa69c3c61..84a64c3b9c9f52 100644 --- a/drivers/mfd/rohm-bd71828.c +++ b/drivers/mfd/rohm-bd71828.c @@ -45,8 +45,8 @@ static const struct resource bd71828_rtc_irqs[] = { static 
const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_RMV, "bd71815-dcin-rmv"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-clps-out"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-clps-in"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-dcin-clps-out"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-dcin-clps-in"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_RES, "bd71815-dcin-ovp-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_DET, "bd71815-dcin-ovp-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_RES, "bd71815-dcin-mon-res"), @@ -56,7 +56,7 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_RES, "bd71815-vsys-low-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_DET, "bd71815-vsys-low-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_DET, "bd71815-vsys-mon-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TEMP, "bd71815-chg-wdg-temp"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TIME, "bd71815-chg-wdg"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_RES, "bd71815-rechg-res"), @@ -87,10 +87,10 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_DET, "bd71815-bat-oc2-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_RES, "bd71815-bat-oc3-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_DET, "bd71815-bat-oc3-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-bat-low-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-bat-low-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-bat-hi-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-bat-hi-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-temp-bat-low-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-temp-bat-low-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-temp-bat-hi-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-temp-bat-hi-det"), }; static const struct mfd_cell bd71815_mfd_cells[] = { @@ -109,7 +109,30 @@ static const struct mfd_cell bd71815_mfd_cells[] = { }, }; -static const struct mfd_cell bd71828_mfd_cells[] = { +static const struct resource bd71828_power_irqs[] = { + DEFINE_RES_IRQ_NAMED(BD71828_INT_CHG_TOPOFF_TO_DONE, + "bd71828-chg-done"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_DET, "bd71828-pwr-dcin-in"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_RMV, "bd71828-pwr-dcin-out"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_RES, + "bd71828-vbat-normal"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_DET, "bd71828-vbat-low"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_DET, "bd71828-btemp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_RES, "bd71828-btemp-cool"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_DET, "bd71828-btemp-lo"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_RES, + "bd71828-btemp-warm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_DET, + "bd71828-temp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_RES, + "bd71828-temp-norm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_DET, + "bd71828-temp-125-over"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_RES, + "bd71828-temp-125-under"), +}; + +static struct mfd_cell bd71828_mfd_cells[] = { { .name = "bd71828-pmic", }, { .name = "bd71828-gpio", }, { .name = 
"bd71828-led", .of_compatible = "rohm,bd71828-leds" }, @@ -118,8 +141,11 @@ static const struct mfd_cell bd71828_mfd_cells[] = { * BD70528 clock gate are the register address and mask. */ { .name = "bd71828-clk", }, - { .name = "bd71827-power", }, { + .name = "bd71828-power", + .resources = bd71828_power_irqs, + .num_resources = ARRAY_SIZE(bd71828_power_irqs), + }, { .name = "bd71828-rtc", .resources = bd71828_rtc_irqs, .num_resources = ARRAY_SIZE(bd71828_rtc_irqs), diff --git a/drivers/mfd/rz-mtu3.c b/drivers/mfd/rz-mtu3.c index f3dac4a29a8324..9cdfef610398f3 100644 --- a/drivers/mfd/rz-mtu3.c +++ b/drivers/mfd/rz-mtu3.c @@ -32,7 +32,7 @@ static const unsigned long rz_mtu3_8bit_ch_reg_offs[][13] = { [RZ_MTU3_CHAN_2] = MTU_8BIT_CH_1_2(0x204, 0x092, 0x205, 0x200, 0x20c, 0x201, 0x202), [RZ_MTU3_CHAN_3] = MTU_8BIT_CH_3_4_6_7(0x008, 0x093, 0x02c, 0x000, 0x04c, 0x002, 0x004, 0x005, 0x038), [RZ_MTU3_CHAN_4] = MTU_8BIT_CH_3_4_6_7(0x009, 0x094, 0x02d, 0x001, 0x04d, 0x003, 0x006, 0x007, 0x039), - [RZ_MTU3_CHAN_5] = MTU_8BIT_CH_5(0xab2, 0x1eb, 0xab4, 0xab6, 0xa84, 0xa85, 0xa86, 0xa94, 0xa95, 0xa96, 0xaa4, 0xaa5, 0xaa6), + [RZ_MTU3_CHAN_5] = MTU_8BIT_CH_5(0xab2, 0x895, 0xab4, 0xab6, 0xa84, 0xa85, 0xa86, 0xa94, 0xa95, 0xa96, 0xaa4, 0xaa5, 0xaa6), [RZ_MTU3_CHAN_6] = MTU_8BIT_CH_3_4_6_7(0x808, 0x893, 0x82c, 0x800, 0x84c, 0x802, 0x804, 0x805, 0x838), [RZ_MTU3_CHAN_7] = MTU_8BIT_CH_3_4_6_7(0x809, 0x894, 0x82d, 0x801, 0x84d, 0x803, 0x806, 0x807, 0x839), [RZ_MTU3_CHAN_8] = MTU_8BIT_CH_8(0x404, 0x098, 0x400, 0x406, 0x401, 0x402, 0x403) diff --git a/drivers/mfd/simple-mfd-i2c.c b/drivers/mfd/simple-mfd-i2c.c index 22159913bea034..0a607a1e3ca1de 100644 --- a/drivers/mfd/simple-mfd-i2c.c +++ b/drivers/mfd/simple-mfd-i2c.c @@ -93,12 +93,32 @@ static const struct simple_mfd_data maxim_mon_max77705 = { .mfd_cell_size = ARRAY_SIZE(max77705_sensor_cells), }; +static const struct regmap_config spacemit_p1_regmap_config = { + .reg_bits = 8, + .val_bits = 8, +}; + +static const struct mfd_cell spacemit_p1_cells[] = { + { .name = "spacemit-p1-regulator", }, + { .name = "spacemit-p1-rtc", }, +}; + +static const struct simple_mfd_data spacemit_p1 = { + .regmap_config = &spacemit_p1_regmap_config, + .mfd_cell = spacemit_p1_cells, + .mfd_cell_size = ARRAY_SIZE(spacemit_p1_cells), +}; + static const struct of_device_id simple_mfd_i2c_of_match[] = { + { .compatible = "fsl,ls1028aqds-fpga" }, + { .compatible = "fsl,lx2160aqds-fpga" }, + { .compatible = "fsl,lx2160ardb-fpga" }, { .compatible = "kontron,sl28cpld" }, - { .compatible = "silergy,sy7636a", .data = &silergy_sy7636a}, { .compatible = "maxim,max5970", .data = &maxim_max5970}, { .compatible = "maxim,max5978", .data = &maxim_max5970}, { .compatible = "maxim,max77705-battery", .data = &maxim_mon_max77705}, + { .compatible = "silergy,sy7636a", .data = &silergy_sy7636a}, + { .compatible = "spacemit,p1", .data = &spacemit_p1, }, {} }; MODULE_DEVICE_TABLE(of, simple_mfd_i2c_of_match); diff --git a/drivers/mfd/stm32-lptimer.c b/drivers/mfd/stm32-lptimer.c index 09073dbc9c8049..123659178cc2b6 100644 --- a/drivers/mfd/stm32-lptimer.c +++ b/drivers/mfd/stm32-lptimer.c @@ -19,7 +19,6 @@ static const struct regmap_config stm32_lptimer_regmap_cfg = { .val_bits = 32, .reg_stride = sizeof(u32), .max_register = STM32_LPTIM_MAX_REGISTER, - .fast_io = true, }; static int stm32_lptimer_detect_encoder(struct stm32_lptimer *ddata) diff --git a/drivers/mfd/stmpe-i2c.c b/drivers/mfd/stmpe-i2c.c index fe018bedab9837..943fa363efc35a 100644 --- a/drivers/mfd/stmpe-i2c.c +++ 
b/drivers/mfd/stmpe-i2c.c @@ -122,18 +122,8 @@ static struct i2c_driver stmpe_i2c_driver = { .remove = stmpe_i2c_remove, .id_table = stmpe_i2c_id, }; - -static int __init stmpe_init(void) -{ - return i2c_add_driver(&stmpe_i2c_driver); -} -subsys_initcall(stmpe_init); - -static void __exit stmpe_exit(void) -{ - i2c_del_driver(&stmpe_i2c_driver); -} -module_exit(stmpe_exit); +module_i2c_driver(stmpe_i2c_driver); MODULE_DESCRIPTION("STMPE MFD I2C Interface Driver"); MODULE_AUTHOR("Rabin Vincent "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c index b9cc85ea2c4019..dea31efface6ec 100644 --- a/drivers/mfd/stmpe-spi.c +++ b/drivers/mfd/stmpe-spi.c @@ -141,18 +141,8 @@ static struct spi_driver stmpe_spi_driver = { .remove = stmpe_spi_remove, .id_table = stmpe_spi_id, }; - -static int __init stmpe_init(void) -{ - return spi_register_driver(&stmpe_spi_driver); -} -subsys_initcall(stmpe_init); - -static void __exit stmpe_exit(void) -{ - spi_unregister_driver(&stmpe_spi_driver); -} -module_exit(stmpe_exit); +module_spi_driver(stmpe_spi_driver); MODULE_DESCRIPTION("STMPE MFD SPI Interface Driver"); MODULE_AUTHOR("Viresh Kumar "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c index 819d19dc9b4a91..3c5c2f157f5293 100644 --- a/drivers/mfd/stmpe.c +++ b/drivers/mfd/stmpe.c @@ -1482,9 +1482,13 @@ int stmpe_probe(struct stmpe_client_info *ci, enum stmpe_partnum partnum) return ret; } +EXPORT_SYMBOL_GPL(stmpe_probe); void stmpe_remove(struct stmpe *stmpe) { + if (stmpe->domain) + irq_domain_remove(stmpe->domain); + if (!IS_ERR(stmpe->vio) && regulator_is_enabled(stmpe->vio)) regulator_disable(stmpe->vio); if (!IS_ERR(stmpe->vcc) && regulator_is_enabled(stmpe->vcc)) @@ -1494,6 +1498,7 @@ void stmpe_remove(struct stmpe *stmpe) mfd_remove_devices(stmpe->dev); } +EXPORT_SYMBOL_GPL(stmpe_remove); static int stmpe_suspend(struct device *dev) { @@ -1517,3 +1522,7 @@ static int stmpe_resume(struct device *dev) EXPORT_GPL_SIMPLE_DEV_PM_OPS(stmpe_dev_pm_ops, stmpe_suspend, stmpe_resume); + +MODULE_DESCRIPTION("STMPE Core driver"); +MODULE_AUTHOR("Rabin Vincent "); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/sun4i-gpadc.c b/drivers/mfd/sun4i-gpadc.c index 3029d48e982cfd..bf2f6fdaf8bf9d 100644 --- a/drivers/mfd/sun4i-gpadc.c +++ b/drivers/mfd/sun4i-gpadc.c @@ -72,7 +72,6 @@ static const struct regmap_config sun4i_gpadc_regmap_config = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, }; static const struct of_device_id sun4i_gpadc_of_match[] = { diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c index c16c37e3661787..8b26c412747279 100644 --- a/drivers/mfd/tps6594-core.c +++ b/drivers/mfd/tps6594-core.c @@ -10,16 +10,20 @@ * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ */ +#include #include #include #include #include #include +#include #include #include #define TPS6594_CRC_SYNC_TIMEOUT_MS 150 +#define TPS65224_EN_SEL_PB 1 +#define TPS65224_GPIO3_SEL_PB 3 /* Completion to synchronize CRC feature enabling on all PMICs */ static DECLARE_COMPLETION(tps6594_crc_comp); @@ -128,6 +132,12 @@ static const struct resource tps6594_rtc_resources[] = { DEFINE_RES_IRQ_NAMED(TPS6594_IRQ_POWER_UP, TPS6594_IRQ_NAME_POWERUP), }; +static const struct resource tps6594_pwrbutton_resources[] = { + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_FALL, TPS65224_IRQ_NAME_PB_FALL), + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_RISE, TPS65224_IRQ_NAME_PB_RISE), + DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_SHORT, 
TPS65224_IRQ_NAME_PB_SHORT), +}; + static const struct mfd_cell tps6594_common_cells[] = { MFD_CELL_RES("tps6594-regulator", tps6594_regulator_resources), MFD_CELL_RES("tps6594-pinctrl", tps6594_pinctrl_resources), @@ -318,8 +328,6 @@ static const struct resource tps65224_pfsm_resources[] = { DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_REG_UNLOCK, TPS65224_IRQ_NAME_REG_UNLOCK), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_TWARN, TPS65224_IRQ_NAME_TWARN), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_LONG, TPS65224_IRQ_NAME_PB_LONG), - DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_FALL, TPS65224_IRQ_NAME_PB_FALL), - DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_PB_RISE, TPS65224_IRQ_NAME_PB_RISE), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_TSD_ORD, TPS65224_IRQ_NAME_TSD_ORD), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_BIST_FAIL, TPS65224_IRQ_NAME_BIST_FAIL), DEFINE_RES_IRQ_NAMED(TPS65224_IRQ_REG_CRC_ERR, TPS65224_IRQ_NAME_REG_CRC_ERR), @@ -347,6 +355,12 @@ static const struct mfd_cell tps65224_common_cells[] = { MFD_CELL_RES("tps6594-regulator", tps65224_regulator_resources), }; +static const struct mfd_cell tps6594_pwrbutton_cell = { + .name = "tps6594-pwrbutton", + .resources = tps6594_pwrbutton_resources, + .num_resources = ARRAY_SIZE(tps6594_pwrbutton_resources), +}; + static const struct regmap_irq tps65224_irqs[] = { /* INT_BUCK register */ REGMAP_IRQ_REG(TPS65224_IRQ_BUCK1_UVOV, 0, TPS65224_BIT_BUCK1_UVOV_INT), @@ -676,11 +690,25 @@ static int tps6594_enable_crc(struct tps6594 *tps) return ret; } +static int tps6594_power_off_handler(struct sys_off_data *data) +{ + struct tps6594 *tps = data->cb_data; + int ret; + + ret = regmap_update_bits(tps->regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(0), TPS6594_BIT_TRIGGER_I2C(0)); + if (ret) + return notifier_from_errno(ret); + + return NOTIFY_DONE; +} + int tps6594_device_init(struct tps6594 *tps, bool enable_crc) { struct device *dev = tps->dev; int ret; struct regmap_irq_chip *irq_chip; + unsigned int pwr_on, gpio3_cfg; const struct mfd_cell *cells; int n_cells; @@ -727,6 +755,27 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc) if (ret) return dev_err_probe(dev, ret, "Failed to add common child devices\n"); + /* If either the PB/EN/VSENSE or GPIO3 is configured as PB, register a driver for it */ + if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1) { + ret = regmap_read(tps->regmap, TPS6594_REG_NPWRON_CONF, &pwr_on); + if (ret) + return dev_err_probe(dev, ret, "Failed to read PB/EN/VSENSE config\n"); + + ret = regmap_read(tps->regmap, TPS6594_REG_GPIOX_CONF(2), &gpio3_cfg); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO3 config\n"); + + if (FIELD_GET(TPS65224_MASK_EN_PB_VSENSE_CONFIG, pwr_on) == TPS65224_EN_SEL_PB || + FIELD_GET(TPS65224_MASK_GPIO_SEL, gpio3_cfg) == TPS65224_GPIO3_SEL_PB) { + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, + &tps6594_pwrbutton_cell, 1, NULL, 0, + regmap_irq_get_domain(tps->irq_data)); + if (ret) + return dev_err_probe(dev, ret, + "Failed to add power button device.\n"); + } + } + /* No RTC for LP8764, TPS65224 and TPS652G1 */ if (tps->chip_id != LP8764 && tps->chip_id != TPS65224 && tps->chip_id != TPS652G1) { ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, tps6594_rtc_cells, @@ -736,6 +785,12 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc) return dev_err_probe(dev, ret, "Failed to add RTC child device\n"); } + if (of_device_is_system_power_controller(dev->of_node)) { + ret = devm_register_power_off_handler(tps->dev, tps6594_power_off_handler, tps); + if (ret) + return dev_err_probe(dev, 
ret, "Failed to register power-off handler\n"); + } + return 0; } EXPORT_SYMBOL_GPL(tps6594_device_init); diff --git a/drivers/mfd/vexpress-sysreg.c b/drivers/mfd/vexpress-sysreg.c index fc2daffc4352cc..f49cee91f71cc2 100644 --- a/drivers/mfd/vexpress-sysreg.c +++ b/drivers/mfd/vexpress-sysreg.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -96,9 +97,11 @@ static struct mfd_cell vexpress_sysreg_cells[] = { static int vexpress_sysreg_probe(struct platform_device *pdev) { + struct gpio_generic_chip *mmc_gpio_chip; + struct gpio_generic_chip_config config; struct resource *mem; void __iomem *base; - struct gpio_chip *mmc_gpio_chip; + int ret; mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!mem) @@ -116,10 +119,22 @@ static int vexpress_sysreg_probe(struct platform_device *pdev) GFP_KERNEL); if (!mmc_gpio_chip) return -ENOMEM; - bgpio_init(mmc_gpio_chip, &pdev->dev, 0x4, base + SYS_MCI, - NULL, NULL, NULL, NULL, 0); - mmc_gpio_chip->ngpio = 2; - devm_gpiochip_add_data(&pdev->dev, mmc_gpio_chip, NULL); + + config = (struct gpio_generic_chip_config) { + .dev = &pdev->dev, + .sz = 4, + .dat = base + SYS_MCI, + }; + + ret = gpio_generic_chip_init(mmc_gpio_chip, &config); + if (ret) + return ret; + + mmc_gpio_chip->gc.ngpio = 2; + + ret = devm_gpiochip_add_data(&pdev->dev, &mmc_gpio_chip->gc, NULL); + if (ret) + return ret; return devm_mfd_add_devices(&pdev->dev, PLATFORM_DEVID_AUTO, vexpress_sysreg_cells, diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index d007a4455ce5ba..1830e9ed252165 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -552,6 +552,10 @@ static int rtsx_usb_reset_chip(struct rtsx_ucr *ucr) ret = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (ret) return ret; + /* config OCP */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_DETECT_EN, MS_OCP_DETECT_EN); + rtsx_usb_write_register(ucr, OCPPARA1, 0xF0, 0x50); + rtsx_usb_write_register(ucr, OCPPARA2, 0x7, 0x3); /* config non-crystal mode */ rtsx_usb_read_register(ucr, CFG_MODE, &val); @@ -722,6 +726,9 @@ static int rtsx_usb_suspend(struct usb_interface *intf, pm_message_t message) if (val & (SD_CD | MS_CD)) { device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child); return -EAGAIN; + } else { + /* if the card does not exists, clear OCP status */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); } } else { /* There is an ongoing operation*/ diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index c44de892a61ec4..5372ed2a363ecd 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc) static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations; diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c index 6a33889d0902af..c3971f7caa65ed 100644 --- a/drivers/misc/lkdtm/cfi.c +++ b/drivers/misc/lkdtm/cfi.c @@ -43,7 +43,7 @@ static void lkdtm_CFI_FORWARD_PROTO(void) lkdtm_indirect_call((void *)lkdtm_increment_int); pr_err("FAIL: survived mismatched prototype function call!\n"); - pr_expected_config(CONFIG_CFI_CLANG); + pr_expected_config(CONFIG_CFI); } /* diff --git a/drivers/misc/lkdtm/fortify.c b/drivers/misc/lkdtm/fortify.c index 0159276656780d..00ed2147113e69 100644 --- a/drivers/misc/lkdtm/fortify.c +++ 
b/drivers/misc/lkdtm/fortify.c @@ -44,6 +44,9 @@ static void lkdtm_FORTIFY_STR_MEMBER(void) char *src; src = kmalloc(size, GFP_KERNEL); + if (!src) + return; + strscpy(src, "over ten bytes", size); size = strlen(src) + 1; @@ -109,6 +112,9 @@ static void lkdtm_FORTIFY_MEM_MEMBER(void) char *src; src = kmalloc(size, GFP_KERNEL); + if (!src) + return; + strscpy(src, "over ten bytes", size); size = strlen(src) + 1; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9cc47bf94804b6..a74e75df93b061 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -121,6 +121,10 @@ struct rpmb_frame { #define RPMB_READ_DATA 0x4 /* Read data from RPMB partition */ #define RPMB_RESULT_READ 0x5 /* Read result request (Internal) */ +#define RPMB_FRAME_SIZE sizeof(struct rpmb_frame) +#define CHECK_SIZE_NEQ(val) ((val) != sizeof(struct rpmb_frame)) +#define CHECK_SIZE_ALIGNED(val) IS_ALIGNED((val), sizeof(struct rpmb_frame)) + static DEFINE_MUTEX(block_mutex); /* @@ -1768,8 +1772,7 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, * these, while retaining features like reliable writes. */ if ((md->flags & MMC_BLK_CMD23) && mmc_op_multi(brq->cmd.opcode) && - (do_rel_wr || !(card->quirks & MMC_QUIRK_BLK_NO_CMD23) || - do_data_tag)) { + (do_rel_wr || !mmc_card_blk_no_cmd23(card) || do_data_tag)) { brq->sbc.opcode = MMC_SET_BLOCK_COUNT; brq->sbc.arg = brq->data.blocks | (do_rel_wr ? (1 << 31) : 0) | @@ -2618,13 +2621,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, */ md->read_only = mmc_blk_readonly(card); - if (mmc_host_can_cmd23(card->host)) { - if ((mmc_card_mmc(card) && - card->csd.mmca_vsn >= CSD_SPEC_VER_3) || - (mmc_card_sd(card) && !mmc_card_ult_capacity(card) && - card->scr.cmds & SD_SCR_CMD23_SUPPORT)) - md->flags |= MMC_BLK_CMD23; - } + if (mmc_host_can_cmd23(card->host) && mmc_card_can_cmd23(card)) + md->flags |= MMC_BLK_CMD23; if (md->flags & MMC_BLK_CMD23 && ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || @@ -2864,12 +2862,12 @@ static void set_idata(struct mmc_blk_ioc_data *idata, u32 opcode, * The size of an RPMB frame must match what's expected by the * hardware. 
*/ - BUILD_BUG_ON(sizeof(struct rpmb_frame) != 512); + static_assert(!CHECK_SIZE_NEQ(512), "RPMB frame size must be 512 bytes"); idata->ic.opcode = opcode; idata->ic.flags = MMC_RSP_R1 | MMC_CMD_ADTC; idata->ic.write_flag = write_flag; - idata->ic.blksz = sizeof(struct rpmb_frame); + idata->ic.blksz = RPMB_FRAME_SIZE; idata->ic.blocks = buf_bytes / idata->ic.blksz; idata->buf = buf; idata->buf_bytes = buf_bytes; @@ -2893,32 +2891,28 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, if (IS_ERR(md->queue.card)) return PTR_ERR(md->queue.card); - if (req_len < sizeof(*frm)) + if (req_len < RPMB_FRAME_SIZE) return -EINVAL; req_type = be16_to_cpu(frm->req_resp); switch (req_type) { case RPMB_PROGRAM_KEY: - if (req_len != sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = true; break; case RPMB_GET_WRITE_COUNTER: - if (req_len != sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = false; break; case RPMB_WRITE_DATA: - if (req_len % sizeof(struct rpmb_frame) || - resp_len != sizeof(struct rpmb_frame)) + if (!CHECK_SIZE_ALIGNED(req_len) || CHECK_SIZE_NEQ(resp_len)) return -EINVAL; write = true; break; case RPMB_READ_DATA: - if (req_len != sizeof(struct rpmb_frame) || - resp_len % sizeof(struct rpmb_frame)) + if (CHECK_SIZE_NEQ(req_len) || !CHECK_SIZE_ALIGNED(resp_len)) return -EINVAL; write = false; break; @@ -2926,25 +2920,23 @@ static int mmc_route_rpmb_frames(struct device *dev, u8 *req, return -EINVAL; } - if (write) - cmd_count = 3; - else - cmd_count = 2; + /* Write operations require 3 commands, read operations require 2 */ + cmd_count = write ? 3 : 2; idata = alloc_idata(rpmb, cmd_count); if (!idata) return -ENOMEM; if (write) { - struct rpmb_frame *frm = (struct rpmb_frame *)resp; + struct rpmb_frame *resp_frm = (struct rpmb_frame *)resp; /* Send write request frame(s) */ set_idata(idata[0], MMC_WRITE_MULTIPLE_BLOCK, 1 | MMC_CMD23_ARG_REL_WR, req, req_len); /* Send result request frame */ - memset(frm, 0, sizeof(*frm)); - frm->req_resp = cpu_to_be16(RPMB_RESULT_READ); + memset(resp_frm, 0, RPMB_FRAME_SIZE); + resp_frm->req_resp = cpu_to_be16(RPMB_RESULT_READ); set_idata(idata[1], MMC_WRITE_MULTIPLE_BLOCK, 1, resp, resp_len); diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 1cf64e0952fbe2..ec4f3462bf8092 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -19,6 +19,7 @@ #include #include +#include #include "core.h" #include "card.h" @@ -383,6 +384,14 @@ int mmc_add_card(struct mmc_card *card) mmc_card_set_present(card); + /* + * Register for undervoltage notification if the card supports + * power-off notification, enabling emergency shutdowns. 
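+	 * The notifier is unregistered again in mmc_remove_card().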
+ */ + if (mmc_card_mmc(card) && + card->ext_csd.power_off_notification == EXT_CSD_POWER_ON) + mmc_regulator_register_undervoltage_notifier(card->host); + return 0; } @@ -394,6 +403,9 @@ void mmc_remove_card(struct mmc_card *card) { struct mmc_host *host = card->host; + if (mmc_card_present(card)) + mmc_regulator_unregister_undervoltage_notifier(host); + mmc_remove_card_debugfs(card); if (mmc_card_present(card)) { diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index 9cbdd240c3a7d4..1200951bab08c2 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -245,14 +245,19 @@ static inline int mmc_blksz_for_byte_mode(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BLKSZ_FOR_BYTE_MODE; } +static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF; +} + static inline int mmc_card_disable_cd(const struct mmc_card *c) { return c->quirks & MMC_QUIRK_DISABLE_CD; } -static inline int mmc_card_nonstd_func_interface(const struct mmc_card *c) +static inline int mmc_card_blk_no_cmd23(const struct mmc_card *c) { - return c->quirks & MMC_QUIRK_NONSTD_FUNC_IF; + return c->quirks & MMC_QUIRK_BLK_NO_CMD23; } static inline int mmc_card_broken_byte_mode_512(const struct mmc_card *c) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 874c6fe92855e3..860378bea557b3 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1398,6 +1398,29 @@ void mmc_power_cycle(struct mmc_host *host, u32 ocr) mmc_power_up(host, ocr); } +/** + * mmc_handle_undervoltage - Handle an undervoltage event on the MMC bus + * @host: The MMC host that detected the undervoltage condition + * + * This function is called when an undervoltage event is detected on one of + * the MMC regulators. + * + * Returns: 0 on success or a negative error code on failure. + */ +int mmc_handle_undervoltage(struct mmc_host *host) +{ + /* Stop the host to prevent races with card removal */ + __mmc_stop_host(host); + + if (!host->bus_ops || !host->bus_ops->handle_undervoltage) + return 0; + + dev_warn(mmc_dev(host), "%s: Undervoltage detected, initiating emergency stop\n", + mmc_hostname(host)); + + return host->bus_ops->handle_undervoltage(host); +} + /* * Assign a mmc bus handler to a host. Only one bus handler may control a * host at any given time. 
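The undervoltage plumbing spread across these mmc/core hunks is easiest to read end to end: the regulator core fires REGULATOR_EVENT_UNDER_VOLTAGE, the notifier queues uv_work on the high-priority workqueue, and the work item lands in mmc_handle_undervoltage(), which stops the host and defers to the bus ops. A minimal sketch of what a .handle_undervoltage implementation has to do, distilled from the eMMC handler added later in this series; the function below is illustrative only (only the MMC bus ops actually gain the hook):

	/*
	 * Condensed flow (as wired up by this series):
	 *
	 *   REGULATOR_EVENT_UNDER_VOLTAGE              (atomic notifier context)
	 *     -> mmc_handle_regulator_event()
	 *     -> queue_work(system_highpri_wq, &host->supply.uv_work)
	 *     -> mmc_undervoltage_workfn()             (process context)
	 *     -> mmc_handle_undervoltage()             (stops the host)
	 *     -> host->bus_ops->handle_undervoltage()
	 */

	/* Illustrative skeleton, modelled on _mmc_handle_undervoltage(). */
	static int example_handle_undervoltage(struct mmc_host *host)
	{
		int err;

		/* Host is already stopped; do not claim or release it here. */
		err = _mmc_suspend(host, MMC_POWEROFF_UNDERVOLTAGE); /* skips cache flush */

		/* No further I/O may reach the card after this point. */
		mmc_card_set_removed(host->card);

		return err;
	}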
@@ -1875,6 +1898,15 @@ bool mmc_card_can_secure_erase_trim(struct mmc_card *card) } EXPORT_SYMBOL(mmc_card_can_secure_erase_trim); +bool mmc_card_can_cmd23(struct mmc_card *card) +{ + return ((mmc_card_mmc(card) && + card->csd.mmca_vsn >= CSD_SPEC_VER_3) || + (mmc_card_sd(card) && !mmc_card_ult_capacity(card) && + card->scr.cmds & SD_SCR_CMD23_SUPPORT)); +} +EXPORT_SYMBOL(mmc_card_can_cmd23); + int mmc_erase_group_aligned(struct mmc_card *card, sector_t from, unsigned int nr) { diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 622085cd766f91..a028b48be16447 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -31,6 +31,7 @@ struct mmc_bus_ops { int (*sw_reset)(struct mmc_host *); bool (*cache_enabled)(struct mmc_host *); int (*flush_cache)(struct mmc_host *); + int (*handle_undervoltage)(struct mmc_host *host); }; void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops); @@ -59,6 +60,10 @@ void mmc_power_off(struct mmc_host *host); void mmc_power_cycle(struct mmc_host *host, u32 ocr); void mmc_set_initial_state(struct mmc_host *host); u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max); +int mmc_handle_undervoltage(struct mmc_host *host); +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host); +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host); +void mmc_undervoltage_workfn(struct work_struct *work); static inline void mmc_delay(unsigned int ms) { @@ -123,6 +128,7 @@ bool mmc_card_can_trim(struct mmc_card *card); bool mmc_card_can_discard(struct mmc_card *card); bool mmc_card_can_sanitize(struct mmc_card *card); bool mmc_card_can_secure_erase_trim(struct mmc_card *card); +bool mmc_card_can_cmd23(struct mmc_card *card); int mmc_erase_group_aligned(struct mmc_card *card, sector_t from, unsigned int nr); unsigned int mmc_calc_max_discard(struct mmc_card *card); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index f14671ea571628..88c95dbfd9cfd5 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -302,6 +302,8 @@ int mmc_of_parse(struct mmc_host *host) /* f_max is obtained from the optional "max-frequency" property */ device_property_read_u32(dev, "max-frequency", &host->f_max); + device_property_read_u32(dev, "max-sd-hs-hz", &host->max_sd_hs_hz); + /* * Configure CD and WP pins. They are both by default active low to * match the SDHCI spec. If GPIOs are provided for CD and / or WP, the @@ -564,6 +566,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) INIT_WORK(&host->sdio_irq_work, sdio_irq_work); timer_setup(&host->retune_timer, mmc_retune_timer, 0); + INIT_WORK(&host->supply.uv_work, mmc_undervoltage_workfn); + /* * By default, hosts do not support SGIO or large requests. * They have to set these according to their abilities. diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 5be9b42d5057eb..3e7d9437477c76 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -36,6 +36,7 @@ enum mmc_poweroff_type { MMC_POWEROFF_SUSPEND, MMC_POWEROFF_SHUTDOWN, + MMC_POWEROFF_UNDERVOLTAGE, MMC_POWEROFF_UNBIND, }; @@ -2132,9 +2133,15 @@ static int _mmc_suspend(struct mmc_host *host, enum mmc_poweroff_type pm_type) if (mmc_card_suspended(host->card)) goto out; - err = _mmc_flush_cache(host); - if (err) - goto out; + /* + * For the undervoltage case, we care more about device integrity. + * Avoid cache flush and notify the device to power off quickly. 
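+	 * A full cache flush can take a long time, possibly more than the
+	 * collapsing supply leaves us; a lost cache is preferable to cutting
+	 * power mid-flush.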
+ */ + if (pm_type != MMC_POWEROFF_UNDERVOLTAGE) { + err = _mmc_flush_cache(host); + if (err) + goto out; + } if (mmc_card_can_poweroff_notify(host->card) && mmc_host_can_poweroff_notify(host, pm_type)) @@ -2212,6 +2219,13 @@ static int mmc_shutdown(struct mmc_host *host) { int err = 0; + /* + * In case of undervoltage, the card will be powered off (removed) by + * _mmc_handle_undervoltage() + */ + if (mmc_card_removed(host->card)) + return 0; + /* * If the card remains suspended at this point and it was done by using * the sleep-cmd (CMD5), we may need to re-initialize it first, to allow @@ -2302,6 +2316,55 @@ static int _mmc_hw_reset(struct mmc_host *host) return mmc_init_card(host, card->ocr, card); } +/** + * _mmc_handle_undervoltage - Handle an undervoltage event for MMC/eMMC devices + * @host: MMC host structure + * + * This function is triggered when an undervoltage condition is detected. + * It attempts to transition the device into a low-power or safe state to + * prevent data corruption. + * + * Steps performed: + * - Perform an emergency suspend using EXT_CSD_POWER_OFF_SHORT if possible. + * - If power-off notify is not supported, fallback mechanisms like sleep or + * deselecting the card are attempted. + * - Cache flushing is skipped to reduce execution time. + * - Mark the card as removed to prevent further interactions after + * undervoltage. + * + * Note: This function does not handle host claiming or releasing. The caller + * must ensure that the host is properly claimed before calling this + * function and released afterward. + * + * Returns: 0 on success, or a negative error code if any step fails. + */ +static int _mmc_handle_undervoltage(struct mmc_host *host) +{ + struct mmc_card *card = host->card; + int err; + + /* + * Perform an emergency suspend to power off the eMMC quickly. + * This ensures the device enters a safe state before power is lost. + * We first attempt EXT_CSD_POWER_OFF_SHORT, but if power-off notify + * is not supported, we fall back to sleep mode or deselecting the card. + * Cache flushing is skipped to minimize delay. + */ + err = _mmc_suspend(host, MMC_POWEROFF_UNDERVOLTAGE); + if (err) + pr_err("%s: undervoltage suspend failed: %pe\n", + mmc_hostname(host), ERR_PTR(err)); + + /* + * Mark the card as removed to prevent further operations. + * This ensures the system does not attempt to access the device + * after an undervoltage event, avoiding potential corruption. + */ + mmc_card_set_removed(card); + + return err; +} + static const struct mmc_bus_ops mmc_ops = { .remove = mmc_remove, .detect = mmc_detect, @@ -2314,6 +2377,7 @@ static const struct mmc_bus_ops mmc_ops = { .hw_reset = _mmc_hw_reset, .cache_enabled = _mmc_cache_enabled, .flush_cache = _mmc_flush_cache, + .handle_undervoltage = _mmc_handle_undervoltage, }; /* diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 66283825513cb4..a952cc8265af8f 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -1077,3 +1077,75 @@ int mmc_sanitize(struct mmc_card *card, unsigned int timeout_ms) return err; } EXPORT_SYMBOL_GPL(mmc_sanitize); + +/** + * mmc_read_tuning() - read data blocks from the mmc + * @host: mmc host doing the read + * @blksz: data block size + * @blocks: number of blocks to read + * + * Read one or more blocks of data from the beginning of the mmc. This is a + * low-level helper for tuning operation. It is assumed that CMD23 can be used + * for multi-block read if the host supports it. 
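+ * The read always targets block address 0 and is used purely to exercise
+ * the bus at the current timing settings.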
+ * + * Note: Allocate and free a temporary buffer to store the data read. The data + * is not available outside of the function, only the status of the read + * operation. + * + * Return: 0 in case of success, otherwise -EIO / -ENOMEM / -E2BIG + */ +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks) +{ + struct mmc_request mrq = {}; + struct mmc_command sbc = {}; + struct mmc_command cmd = {}; + struct mmc_command stop = {}; + struct mmc_data data = {}; + struct scatterlist sg; + void *buf; + unsigned int len; + + if (blocks > 1) { + if (mmc_host_can_cmd23(host)) { + mrq.sbc = &sbc; + sbc.opcode = MMC_SET_BLOCK_COUNT; + sbc.arg = blocks; + sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; + } + cmd.opcode = MMC_READ_MULTIPLE_BLOCK; + mrq.stop = &stop; + stop.opcode = MMC_STOP_TRANSMISSION; + stop.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; + } else { + cmd.opcode = MMC_READ_SINGLE_BLOCK; + } + + mrq.cmd = &cmd; + cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + + mrq.data = &data; + data.flags = MMC_DATA_READ; + data.blksz = blksz; + data.blocks = blocks; + data.blk_addr = 0; + data.sg = &sg; + data.sg_len = 1; + data.timeout_ns = 1000000000; + + if (check_mul_overflow(blksz, blocks, &len)) + return -E2BIG; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + sg_init_one(&sg, buf, len); + + mmc_wait_for_req(host, &mrq); + kfree(buf); + + if (sbc.error || cmd.error || data.error) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(mmc_read_tuning); diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index 80e5d87a5e50be..67d4a301895c90 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -180,20 +180,14 @@ static int mmc_test_set_blksize(struct mmc_test_card *test, unsigned size) return mmc_set_blocklen(test->card, size); } -static bool mmc_test_card_cmd23(struct mmc_card *card) -{ - return mmc_card_mmc(card) || - (mmc_card_sd(card) && card->scr.cmds & SD_SCR_CMD23_SUPPORT); -} - static void mmc_test_prepare_sbc(struct mmc_test_card *test, struct mmc_request *mrq, unsigned int blocks) { struct mmc_card *card = test->card; if (!mrq->sbc || !mmc_host_can_cmd23(card->host) || - !mmc_test_card_cmd23(card) || !mmc_op_multi(mrq->cmd->opcode) || - (card->quirks & MMC_QUIRK_BLK_NO_CMD23)) { + !mmc_card_can_cmd23(card) || !mmc_op_multi(mrq->cmd->opcode) || + mmc_card_blk_no_cmd23(card)) { mrq->sbc = NULL; return; } diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c index 3dae2e9b797813..a85179f1a4de6d 100644 --- a/drivers/mmc/core/regulator.c +++ b/drivers/mmc/core/regulator.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -262,6 +263,82 @@ static inline int mmc_regulator_get_ocrmask(struct regulator *supply) #endif /* CONFIG_REGULATOR */ +/* To be called from a high-priority workqueue */ +void mmc_undervoltage_workfn(struct work_struct *work) +{ + struct mmc_supply *supply; + struct mmc_host *host; + + supply = container_of(work, struct mmc_supply, uv_work); + host = container_of(supply, struct mmc_host, supply); + + mmc_handle_undervoltage(host); +} + +static int mmc_handle_regulator_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mmc_supply *supply = container_of(nb, struct mmc_supply, + vmmc_nb); + struct mmc_host *host = container_of(supply, struct mmc_host, supply); + unsigned long flags; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + spin_lock_irqsave(&host->lock, flags); + if (host->undervoltage) { + 
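+			/* A previous event already queued the shutdown work. */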
spin_unlock_irqrestore(&host->lock, flags); + return NOTIFY_OK; + } + + host->undervoltage = true; + spin_unlock_irqrestore(&host->lock, flags); + + queue_work(system_highpri_wq, &host->supply.uv_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * mmc_regulator_register_undervoltage_notifier - Register for undervoltage + * events + * @host: MMC host + * + * To be called by a bus driver when a card supporting graceful shutdown + * is attached. + */ +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host) +{ + int ret; + + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + host->supply.vmmc_nb.notifier_call = mmc_handle_regulator_event; + ret = regulator_register_notifier(host->supply.vmmc, + &host->supply.vmmc_nb); + if (ret) + dev_warn(mmc_dev(host), "Failed to register vmmc notifier: %d\n", ret); +} + +/** + * mmc_regulator_unregister_undervoltage_notifier - Unregister undervoltage + * notifier + * @host: MMC host + */ +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host) +{ + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + regulator_unregister_notifier(host->supply.vmmc, &host->supply.vmmc_nb); + cancel_work_sync(&host->supply.uv_work); +} + /** * mmc_regulator_get_supply - try to get VMMC and VQMMC regulators for a host * @mmc: the host to regulate diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index ec02067f03c5c5..67cd6300482976 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -359,7 +359,7 @@ static int mmc_read_switch(struct mmc_card *card) } if (status[13] & SD_MODE_HIGH_SPEED) - card->sw_caps.hs_max_dtr = HIGH_SPEED_MAX_DTR; + card->sw_caps.hs_max_dtr = card->host->max_sd_hs_hz ?: HIGH_SPEED_MAX_DTR; if (card->scr.sda_spec3) { card->sw_caps.sd3_bus_mode = status[13]; diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 0f753367aec1c1..83085e76486aa8 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -945,7 +945,11 @@ static void mmc_sdio_remove(struct mmc_host *host) */ static int mmc_sdio_alive(struct mmc_host *host) { - return mmc_select_card(host->card); + if (!mmc_host_is_spi(host)) + return mmc_select_card(host->card); + else + return mmc_io_rw_direct(host->card, 0, 0, SDIO_CCCR_CCCR, 0, + NULL); } /* diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 656601754966b2..10799772494a2c 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -200,7 +200,6 @@ static int sdio_bus_probe(struct device *dev) atomic_dec(&func->card->sdio_funcs_probed); if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) pm_runtime_put_noidle(dev); - dev_pm_domain_detach(dev, false); return ret; } @@ -231,8 +230,6 @@ static void sdio_bus_remove(struct device *dev) /* Then undo the runtime PM settings in sdio_bus_probe() */ if (func->card->host->caps & MMC_CAP_POWER_OFF_CARD) pm_runtime_put_sync(dev); - - dev_pm_domain_detach(dev, false); } static const struct dev_pm_ops sdio_bus_pm_ops = { diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 7232de1c068873..2c963cb6724b9e 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -56,7 +56,7 @@ config MMC_STM32_SDMMC config MMC_PXA tristate "Intel PXA25x/26x/27x Multimedia Card Interface support" - depends on ARCH_PXA + depends on ARCH_PXA || COMPILE_TEST help This selects the Intel(R) PXA(R) Multimedia card Interface. 
If you have a PXA(R) platform with a Multimedia Card slot, @@ -359,7 +359,7 @@ config MMC_SDHCI_S3C depends on PLAT_SAMSUNG || ARCH_S5PV210 || ARCH_EXYNOS || COMPILE_TEST help This selects the Secure Digital Host Controller Interface (SDHCI) - often referrered to as the HSMMC block in some of the Samsung + often referred to as the HSMMC block in some of the Samsung S3C6410, S5Pv210 and Exynos (Exynos4210, Exynos4412) SoCs. If you have a controller with this interface (therefore you build for @@ -401,7 +401,7 @@ config MMC_SDHCI_SPEAR depends on OF help This selects the Secure Digital Host Controller Interface (SDHCI) - often referrered to as the HSMMC block in some of the ST SPEAR range + often referred to as the HSMMC block in some of the ST SPEAR range of SoC If you have a controller with this interface, say Y or M here. @@ -608,7 +608,7 @@ config MMC_MXC tristate "Freescale i.MX21/27/31 or MPC512x Multimedia Card support" - depends on ARCH_MXC || PPC_MPC512x + depends on ARCH_MXC || PPC_MPC512x || COMPILE_TEST help This selects the Freescale i.MX21, i.MX27, i.MX31 or MPC512x Multimedia Card Interface. If you have an i.MX or MPC512x platform @@ -866,7 +866,8 @@ config MMC_DW_PCI config MMC_DW_ROCKCHIP tristate "Rockchip specific extensions for Synopsys DW Memory Card Interface" - depends on MMC_DW && ARCH_ROCKCHIP + depends on MMC_DW + depends on ARCH_ROCKCHIP || COMPILE_TEST select MMC_DW_PLTFM help This selects support for Rockchip SoC specific extensions to the @@ -948,7 +949,7 @@ config MMC_USHC config MMC_WMT tristate "Wondermedia SD/MMC Host Controller support" - depends on ARCH_VT8500 + depends on ARCH_VT8500 || COMPILE_TEST default y help This selects support for the SD/MMC Host Controller on @@ -1115,6 +1116,7 @@ config MMC_LOONGSON2 tristate "Loongson-2K SD/SDIO/eMMC Host Interface support" depends on LOONGARCH || COMPILE_TEST depends on HAS_DMA + select REGMAP_MMIO help This selects support for the SD/SDIO/eMMC Host Controller on Loongson-2K series CPUs.
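The host driver hunks that follow repeatedly apply the same conversion: #ifdef CONFIG_PM / CONFIG_PM_SLEEP guards are dropped, the dev_pm_ops are declared with DEFINE_SIMPLE_DEV_PM_OPS() (or with SYSTEM_SLEEP_PM_OPS()/RUNTIME_PM_OPS() where runtime PM is involved), and the driver's .pm pointer is wrapped in pm_sleep_ptr() or pm_ptr(). A minimal sketch of the resulting shape, using a hypothetical foo_mmc platform driver rather than any driver touched below:

#include <linux/platform_device.h>
#include <linux/pm.h>

static int foo_mmc_suspend(struct device *dev)
{
	/* save controller state */
	return 0;
}

static int foo_mmc_resume(struct device *dev)
{
	/* restore controller state */
	return 0;
}

/*
 * No #ifdef CONFIG_PM_SLEEP: the callbacks are always compiled (so they
 * cannot bit-rot unnoticed), and pm_sleep_ptr() resolves to NULL when
 * CONFIG_PM_SLEEP is disabled, letting the optimizer drop the then
 * unreferenced functions.
 */
static DEFINE_SIMPLE_DEV_PM_OPS(foo_mmc_pm_ops, foo_mmc_suspend, foo_mmc_resume);

static struct platform_driver foo_mmc_driver = {
	.driver = {
		.name = "foo-mmc",
		.pm = pm_sleep_ptr(&foo_mmc_pm_ops),
	},
	/* .probe/.remove omitted for brevity */
};

Drivers with only system sleep callbacks take pm_sleep_ptr(); drivers that also define runtime PM callbacks take pm_ptr(), which is compiled out only when CONFIG_PM itself is disabled.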
diff --git a/drivers/mmc/host/alcor.c b/drivers/mmc/host/alcor.c index 288c3a91a0aff7..721db54739c165 100644 --- a/drivers/mmc/host/alcor.c +++ b/drivers/mmc/host/alcor.c @@ -1129,7 +1129,6 @@ static void alcor_pci_sdmmc_drv_remove(struct platform_device *pdev) mmc_remove_host(mmc); } -#ifdef CONFIG_PM_SLEEP static int alcor_pci_sdmmc_suspend(struct device *dev) { struct alcor_sdmmc_host *host = dev_get_drvdata(dev); @@ -1150,10 +1149,9 @@ static int alcor_pci_sdmmc_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(alcor_mmc_pm_ops, alcor_pci_sdmmc_suspend, - alcor_pci_sdmmc_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(alcor_mmc_pm_ops, alcor_pci_sdmmc_suspend, + alcor_pci_sdmmc_resume); static const struct platform_device_id alcor_pci_sdmmc_ids[] = { { @@ -1171,7 +1169,7 @@ static struct platform_driver alcor_pci_sdmmc_driver = { .driver = { .name = DRV_NAME_ALCOR_PCI_SDMMC, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &alcor_mmc_pm_ops + .pm = pm_sleep_ptr(&alcor_mmc_pm_ops), }, }; module_platform_driver(alcor_pci_sdmmc_driver); diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 777342fb76576f..d1fbc6811563a3 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -2622,7 +2622,6 @@ static void atmci_remove(struct platform_device *pdev) pm_runtime_put_noidle(dev); } -#ifdef CONFIG_PM static int atmci_runtime_suspend(struct device *dev) { struct atmel_mci *host = dev_get_drvdata(dev); @@ -2642,12 +2641,10 @@ static int atmci_runtime_resume(struct device *dev) return clk_prepare_enable(host->mck); } -#endif static const struct dev_pm_ops atmci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(atmci_runtime_suspend, atmci_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(atmci_runtime_suspend, atmci_runtime_resume, NULL) }; static struct platform_driver atmci_driver = { @@ -2657,7 +2654,7 @@ static struct platform_driver atmci_driver = { .name = "atmel_mci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = atmci_dt_ids, - .pm = &atmci_dev_pm_ops, + .pm = pm_ptr(&atmci_dev_pm_ops), }, }; module_platform_driver(atmci_driver); diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c index 85470773650d68..cc6e05f9b96faa 100644 --- a/drivers/mmc/host/au1xmmc.c +++ b/drivers/mmc/host/au1xmmc.c @@ -1150,10 +1150,9 @@ static void au1xmmc_remove(struct platform_device *pdev) } } -#ifdef CONFIG_PM -static int au1xmmc_suspend(struct platform_device *pdev, pm_message_t state) +static int au1xmmc_suspend(struct device *dev) { - struct au1xmmc_host *host = platform_get_drvdata(pdev); + struct au1xmmc_host *host = dev_get_drvdata(dev); __raw_writel(0, HOST_CONFIG2(host)); __raw_writel(0, HOST_CONFIG(host)); @@ -1164,27 +1163,24 @@ static int au1xmmc_suspend(struct platform_device *pdev, pm_message_t state) return 0; } -static int au1xmmc_resume(struct platform_device *pdev) +static int au1xmmc_resume(struct device *dev) { - struct au1xmmc_host *host = platform_get_drvdata(pdev); + struct au1xmmc_host *host = dev_get_drvdata(dev); au1xmmc_reset_controller(host); return 0; } -#else -#define au1xmmc_suspend NULL -#define au1xmmc_resume NULL -#endif + +static DEFINE_SIMPLE_DEV_PM_OPS(au1xmmc_pmops, au1xmmc_suspend, au1xmmc_resume); static struct platform_driver au1xmmc_driver = { .probe = au1xmmc_probe, .remove = au1xmmc_remove, - .suspend = au1xmmc_suspend, - 
.resume = au1xmmc_resume, .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .pm = pm_sleep_ptr(&au1xmmc_pmops), }, }; diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c index 448d2f9159eab4..31daec78749521 100644 --- a/drivers/mmc/host/cb710-mmc.c +++ b/drivers/mmc/host/cb710-mmc.c @@ -664,25 +664,25 @@ static const struct mmc_host_ops cb710_mmc_host = { .get_cd = cb710_mmc_get_cd, }; -#ifdef CONFIG_PM - -static int cb710_mmc_suspend(struct platform_device *pdev, pm_message_t state) +static int cb710_mmc_suspend(struct device *dev) { + struct platform_device *pdev = to_platform_device(dev); struct cb710_slot *slot = cb710_pdev_to_slot(pdev); cb710_mmc_enable_irq(slot, 0, ~0); return 0; } -static int cb710_mmc_resume(struct platform_device *pdev) +static int cb710_mmc_resume(struct device *dev) { + struct platform_device *pdev = to_platform_device(dev); struct cb710_slot *slot = cb710_pdev_to_slot(pdev); cb710_mmc_enable_irq(slot, 0, ~0); return 0; } -#endif /* CONFIG_PM */ +static DEFINE_SIMPLE_DEV_PM_OPS(cb710_mmc_pmops, cb710_mmc_suspend, cb710_mmc_resume); static int cb710_mmc_init(struct platform_device *pdev) { @@ -767,13 +767,12 @@ static void cb710_mmc_exit(struct platform_device *pdev) } static struct platform_driver cb710_mmc_driver = { - .driver.name = "cb710-mmc", + .driver = { + .name = "cb710-mmc", + .pm = pm_sleep_ptr(&cb710_mmc_pmops), + }, .probe = cb710_mmc_init, .remove = cb710_mmc_exit, -#ifdef CONFIG_PM - .suspend = cb710_mmc_suspend, - .resume = cb710_mmc_resume, -#endif }; module_platform_driver(cb710_mmc_driver); diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index c691f1b603953f..2b7d6d9bcde514 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -588,7 +588,7 @@ static void mmc_davinci_request(struct mmc_host *mmc, struct mmc_request *req) cpu_relax(); } if (mmcst1 & MMCST1_BUSY) { - dev_err(mmc_dev(host->mmc), "still BUSY? bad ... \n"); + dev_err(mmc_dev(host->mmc), "still BUSY? 
bad ...\n"); req->cmd->error = -ETIMEDOUT; mmc_request_done(mmc, req); return; @@ -1347,7 +1347,6 @@ static void davinci_mmcsd_remove(struct platform_device *pdev) clk_disable_unprepare(host->clk); } -#ifdef CONFIG_PM static int davinci_mmcsd_suspend(struct device *dev) { struct mmc_davinci_host *host = dev_get_drvdata(dev); @@ -1373,21 +1372,14 @@ static int davinci_mmcsd_resume(struct device *dev) return 0; } -static const struct dev_pm_ops davinci_mmcsd_pm = { - .suspend = davinci_mmcsd_suspend, - .resume = davinci_mmcsd_resume, -}; - -#define davinci_mmcsd_pm_ops (&davinci_mmcsd_pm) -#else -#define davinci_mmcsd_pm_ops NULL -#endif +static DEFINE_SIMPLE_DEV_PM_OPS(davinci_mmcsd_pm_ops, + davinci_mmcsd_suspend, davinci_mmcsd_resume); static struct platform_driver davinci_mmcsd_driver = { .driver = { .name = "davinci_mmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = davinci_mmcsd_pm_ops, + .pm = pm_sleep_ptr(&davinci_mmcsd_pm_ops), .of_match_table = davinci_mmc_dt_ids, }, .probe = davinci_mmcsd_probe, diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index e3548408ca392c..384609671a9ab9 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -189,7 +189,6 @@ static void dw_mci_exynos_set_clksel_timing(struct dw_mci *host, u32 timing) set_bit(DW_MMC_CARD_NO_USE_HOLD, &host->slot->flags); } -#ifdef CONFIG_PM static int dw_mci_exynos_runtime_resume(struct device *dev) { struct dw_mci *host = dev_get_drvdata(dev); @@ -203,9 +202,7 @@ static int dw_mci_exynos_runtime_resume(struct device *dev) return ret; } -#endif /* CONFIG_PM */ -#ifdef CONFIG_PM_SLEEP /** * dw_mci_exynos_suspend_noirq - Exynos-specific suspend code * @dev: Device to suspend (this device) @@ -265,7 +262,6 @@ static int dw_mci_exynos_resume_noirq(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ static void dw_mci_exynos_config_hs400(struct dw_mci *host, u32 timing) { @@ -712,11 +708,8 @@ static void dw_mci_exynos_remove(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_exynos_pmops = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(dw_mci_exynos_suspend_noirq, - dw_mci_exynos_resume_noirq) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_exynos_runtime_resume, - NULL) + NOIRQ_SYSTEM_SLEEP_PM_OPS(dw_mci_exynos_suspend_noirq, dw_mci_exynos_resume_noirq) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_exynos_runtime_resume, NULL) }; static struct platform_driver dw_mci_exynos_pltfm_driver = { @@ -726,7 +719,7 @@ static struct platform_driver dw_mci_exynos_pltfm_driver = { .name = "dwmmc_exynos", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_exynos_match, - .pm = &dw_mci_exynos_pmops, + .pm = pm_ptr(&dw_mci_exynos_pmops), }, }; diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c index 0311a37dd4abfa..ad6aa1aea54915 100644 --- a/drivers/mmc/host/dw_mmc-k3.c +++ b/drivers/mmc/host/dw_mmc-k3.c @@ -461,11 +461,8 @@ static int dw_mci_k3_probe(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_k3_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static struct platform_driver dw_mci_k3_pltfm_driver = { @@ -475,7 +472,7 @@ static struct platform_driver dw_mci_k3_pltfm_driver = { .name = "dwmmc_k3", .probe_type = 
PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_k3_match, - .pm = &dw_mci_k3_dev_pm_ops, + .pm = pm_ptr(&dw_mci_k3_dev_pm_ops), }, }; diff --git a/drivers/mmc/host/dw_mmc-pci.c b/drivers/mmc/host/dw_mmc-pci.c index e7ab699f488e67..092cc99175af0f 100644 --- a/drivers/mmc/host/dw_mmc-pci.c +++ b/drivers/mmc/host/dw_mmc-pci.c @@ -75,11 +75,8 @@ static void dw_mci_pci_remove(struct pci_dev *pdev) } static const struct dev_pm_ops dw_mci_pci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static const struct pci_device_id dw_mci_pci_id[] = { @@ -94,7 +91,7 @@ static struct pci_driver dw_mci_pci_driver = { .probe = dw_mci_pci_probe, .remove = dw_mci_pci_remove, .driver = { - .pm = &dw_mci_pci_dev_pm_ops, + .pm = pm_ptr(&dw_mci_pci_dev_pm_ops), }, }; diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index baa23b51773127..d2aec6cf9773dc 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -568,11 +568,8 @@ static void dw_mci_rockchip_remove(struct platform_device *pdev) } static const struct dev_pm_ops dw_mci_rockchip_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(dw_mci_runtime_suspend, - dw_mci_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(dw_mci_runtime_suspend, dw_mci_runtime_resume, NULL) }; static struct platform_driver dw_mci_rockchip_pltfm_driver = { @@ -582,7 +579,7 @@ static struct platform_driver dw_mci_rockchip_pltfm_driver = { .name = "dwmmc_rockchip", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_rockchip_match, - .pm = &dw_mci_rockchip_dev_pm_ops, + .pm = pm_ptr(&dw_mci_rockchip_dev_pm_ops), }, }; diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 5463392dc81105..648b4a5641bf24 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -541,6 +541,9 @@ extern void dw_mci_remove(struct dw_mci *host); #ifdef CONFIG_PM extern int dw_mci_runtime_suspend(struct device *device); extern int dw_mci_runtime_resume(struct device *device); +#else +static inline int dw_mci_runtime_suspend(struct device *device) { return -EOPNOTSUPP; } +static inline int dw_mci_runtime_resume(struct device *device) { return -EOPNOTSUPP; } #endif /** diff --git a/drivers/mmc/host/meson-mx-sdhc-clkc.c b/drivers/mmc/host/meson-mx-sdhc-clkc.c index cbd17a596cd25c..6d619bd0a8dc89 100644 --- a/drivers/mmc/host/meson-mx-sdhc-clkc.c +++ b/drivers/mmc/host/meson-mx-sdhc-clkc.c @@ -84,10 +84,8 @@ static int meson_mx_sdhc_gate_clk_hw_register(struct device *dev, return ret; clk_bulk_data[bulk_index].clk = devm_clk_hw_get_clk(dev, hw, name_suffix); - if (IS_ERR(clk_bulk_data[bulk_index].clk)) - return PTR_ERR(clk_bulk_data[bulk_index].clk); - return 0; + return PTR_ERR_OR_ZERO(clk_bulk_data[bulk_index].clk); } int meson_mx_sdhc_register_clkc(struct device *dev, void __iomem *base, diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 35b0ad273b4ff6..42936e248c5536 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -563,10 +563,10 @@ mmc_spi_setup_data_message(struct mmc_spi_host *host, bool multiple, bool write) * the next token (next data block, or 
STOP_TRAN). We can try to * minimize I/O ops by using a single read to collect end-of-busy. */ - if (multiple || write) { + if (write) { t = &host->early_status; memset(t, 0, sizeof(*t)); - t->len = write ? sizeof(scratch->status) : 1; + t->len = sizeof(scratch->status); t->tx_buf = host->ones; t->rx_buf = scratch->status; t->cs_change = 1; diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 8367283647a9b2..e500051bd572f7 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -2516,7 +2516,6 @@ static void mmci_remove(struct amba_device *dev) } } -#ifdef CONFIG_PM static void mmci_save(struct mmci_host *host) { unsigned long flags; @@ -2581,12 +2580,10 @@ static int mmci_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops mmci_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(mmci_runtime_suspend, mmci_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(mmci_runtime_suspend, mmci_runtime_resume, NULL) }; static const struct amba_id mmci_ids[] = { @@ -2675,7 +2672,7 @@ MODULE_DEVICE_TABLE(amba, mmci_ids); static struct amba_driver mmci_driver = { .drv = { .name = DRIVER_NAME, - .pm = &mmci_dev_pm_ops, + .pm = pm_ptr(&mmci_dev_pm_ops), .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, .probe = mmci_probe, diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index d7020e06dd55aa..79074291e9d22e 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -3278,7 +3278,7 @@ static void msdc_restore_reg(struct msdc_host *host) __msdc_enable_sdio_irq(host, 1); } -static int __maybe_unused msdc_runtime_suspend(struct device *dev) +static int msdc_runtime_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3300,7 +3300,7 @@ static int __maybe_unused msdc_runtime_suspend(struct device *dev) return 0; } -static int __maybe_unused msdc_runtime_resume(struct device *dev) +static int msdc_runtime_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3323,7 +3323,7 @@ static int __maybe_unused msdc_runtime_resume(struct device *dev) return 0; } -static int __maybe_unused msdc_suspend(struct device *dev) +static int msdc_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3348,7 +3348,7 @@ static int __maybe_unused msdc_suspend(struct device *dev) return pm_runtime_force_suspend(dev); } -static int __maybe_unused msdc_resume(struct device *dev) +static int msdc_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct msdc_host *host = mmc_priv(mmc); @@ -3360,8 +3360,8 @@ static int __maybe_unused msdc_resume(struct device *dev) } static const struct dev_pm_ops msdc_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(msdc_suspend, msdc_resume) - SET_RUNTIME_PM_OPS(msdc_runtime_suspend, msdc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(msdc_suspend, msdc_resume) + RUNTIME_PM_OPS(msdc_runtime_suspend, msdc_runtime_resume, NULL) }; static struct platform_driver mt_msdc_driver = { @@ -3371,7 +3371,7 @@ static struct platform_driver mt_msdc_driver = { .name = "mtk-msdc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = msdc_of_ids, - .pm = &msdc_dev_pm_ops, + .pm = pm_ptr(&msdc_dev_pm_ops), }, }; diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 
a9e6277789ba69..79df2fa89a3fdd 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -292,7 +292,7 @@ static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data, host->pio_ptr = NULL; host->pio_size = 0; } else { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags, + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, mmc_get_dma_dir(data)); } diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index a6e44e4061061e..7c7c52d9e8e72b 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -680,7 +680,6 @@ static void mxs_mmc_remove(struct platform_device *pdev) clk_disable_unprepare(ssp->clk); } -#ifdef CONFIG_PM_SLEEP static int mxs_mmc_suspend(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); @@ -699,9 +698,8 @@ static int mxs_mmc_resume(struct device *dev) return clk_prepare_enable(ssp->clk); } -#endif -static SIMPLE_DEV_PM_OPS(mxs_mmc_pm_ops, mxs_mmc_suspend, mxs_mmc_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(mxs_mmc_pm_ops, mxs_mmc_suspend, mxs_mmc_resume); static struct platform_driver mxs_mmc_driver = { .probe = mxs_mmc_probe, @@ -709,7 +707,7 @@ static struct platform_driver mxs_mmc_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &mxs_mmc_pm_ops, + .pm = pm_sleep_ptr(&mxs_mmc_pm_ops), .of_match_table = mxs_mmc_dt_ids, }, }; diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index adc0d0b6ae377e..09e4354d1f1db8 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -620,8 +620,6 @@ static void omap_hsmmc_set_bus_mode(struct omap_hsmmc_host *host) OMAP_HSMMC_WRITE(host->base, CON, con & ~OD); } -#ifdef CONFIG_PM - /* * Restore the MMC host context, if it was lost as result of a * power state change. @@ -689,6 +687,7 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host) return 0; } +#ifdef CONFIG_PM /* * Save the MMC host context (store the number of power state changes so far). 
*/ @@ -1990,7 +1989,6 @@ static void omap_hsmmc_remove(struct platform_device *pdev) clk_disable_unprepare(host->dbclk); } -#ifdef CONFIG_PM_SLEEP static int omap_hsmmc_suspend(struct device *dev) { struct omap_hsmmc_host *host = dev_get_drvdata(dev); @@ -2032,9 +2030,7 @@ static int omap_hsmmc_resume(struct device *dev) pm_runtime_put_autosuspend(host->dev); return 0; } -#endif -#ifdef CONFIG_PM static int omap_hsmmc_runtime_suspend(struct device *dev) { struct omap_hsmmc_host *host; @@ -2102,11 +2098,10 @@ static int omap_hsmmc_runtime_resume(struct device *dev) spin_unlock_irqrestore(&host->irq_lock, flags); return 0; } -#endif static const struct dev_pm_ops omap_hsmmc_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(omap_hsmmc_suspend, omap_hsmmc_resume) - SET_RUNTIME_PM_OPS(omap_hsmmc_runtime_suspend, omap_hsmmc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(omap_hsmmc_suspend, omap_hsmmc_resume) + RUNTIME_PM_OPS(omap_hsmmc_runtime_suspend, omap_hsmmc_runtime_resume, NULL) }; static struct platform_driver omap_hsmmc_driver = { @@ -2115,7 +2110,7 @@ static struct platform_driver omap_hsmmc_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &omap_hsmmc_dev_pm_ops, + .pm = pm_ptr(&omap_hsmmc_dev_pm_ops), .of_match_table = of_match_ptr(omap_mmc_of_match), }, }; diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index fb8ca03f661d7f..f56fa2cd208dd9 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -222,7 +222,11 @@ static void renesas_sdhi_set_clock(struct tmio_mmc_host *host, clk &= ~0xff; } - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clk & CLK_CTL_DIV_MASK); + clock = clk & CLK_CTL_DIV_MASK; + if (clock != CLK_CTL_DIV_MASK) + host->mmc->actual_clock /= (1 << (ffs(clock) + 1)); + + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, clock); if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) usleep_range(10000, 11000); diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 4b389e92399e82..9e3ed0bcddd6c4 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -107,7 +107,8 @@ static const struct renesas_sdhi_of_data of_data_rza2 = { static const struct renesas_sdhi_of_data of_data_rcar_gen3 = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_CLK_ACTUAL | - TMIO_MMC_HAVE_CBSY | TMIO_MMC_MIN_RCAR2, + TMIO_MMC_HAVE_CBSY | TMIO_MMC_MIN_RCAR2 | + TMIO_MMC_64BIT_DATA_PORT, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23 | MMC_CAP_WAIT_WHILE_BUSY, .capabilities2 = MMC_CAP2_NO_WRITE_PROTECT | MMC_CAP2_MERGE_CAPABLE, diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index c5f6b9df066b58..84674659a84d4a 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -48,7 +48,7 @@ struct rtsx_usb_sdmmc { bool ddr_mode; unsigned char power_mode; - + u16 ocp_stat; #ifdef RTSX_USB_USE_LEDS_CLASS struct led_classdev led; char led_name[32]; @@ -789,12 +789,20 @@ static int sdmmc_get_cd(struct mmc_host *mmc) if (err) goto no_card; + /* get OCP status */ + host->ocp_stat = (val >> 4) & 0x03; + if (val & SD_CD) { host->card_exist = true; return 1; } no_card: + /* clear OCP status */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); + host->ocp_stat = 0; + } host->card_exist = false; return 0; } @@ -818,7 +826,11 @@ static void 
sdmmc_request(struct mmc_host *mmc, struct mmc_request *mrq) cmd->error = -ENOMEDIUM; goto finish_detect_card; } - + /* check OCP stat */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + cmd->error = -ENOMEDIUM; + goto finish_detect_card; + } mutex_lock(&ucr->dev_mutex); mutex_lock(&host->host_mutex); @@ -952,6 +964,10 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host) struct rtsx_ucr *ucr = host->ucr; int err; + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + dev_dbg(sdmmc_dev(host), "over current\n"); + return -EIO; + } dev_dbg(sdmmc_dev(host), "%s\n", __func__); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_SELECT, 0x07, SD_MOD_SEL); @@ -977,9 +993,19 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host) usleep_range(800, 1000); + rtsx_usb_init_cmd(ucr); + /* WA OCP issue: after OCP, there were problems with reopen card power */ + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, POWER_MASK, POWER_ON); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, FPDCTL, SSC_POWER_MASK, SSC_POWER_DOWN); + err = rtsx_usb_send_cmd(ucr, MODE_C, 100); + if (err) + return err; + msleep(20); + rtsx_usb_write_register(ucr, FPDCTL, SSC_POWER_MASK, SSC_POWER_ON); + usleep_range(180, 200); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, - POWER_MASK|LDO3318_PWR_MASK, POWER_ON|LDO_ON); + LDO3318_PWR_MASK, LDO_ON); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, SD_OUTPUT_EN, SD_OUTPUT_EN); @@ -1332,6 +1358,7 @@ static void rtsx_usb_init_host(struct rtsx_usb_sdmmc *host) mmc->max_req_size = 524288; host->power_mode = MMC_POWER_OFF; + host->ocp_stat = 0; } static int rtsx_usb_sdmmc_drv_probe(struct platform_device *pdev) @@ -1428,7 +1455,6 @@ static void rtsx_usb_sdmmc_drv_remove(struct platform_device *pdev) ": Realtek USB SD/MMC module has been removed\n"); } -#ifdef CONFIG_PM static int rtsx_usb_sdmmc_runtime_suspend(struct device *dev) { struct rtsx_usb_sdmmc *host = dev_get_drvdata(dev); @@ -1446,11 +1472,9 @@ static int rtsx_usb_sdmmc_runtime_resume(struct device *dev) mmc_detect_change(host->mmc, 0); return 0; } -#endif static const struct dev_pm_ops rtsx_usb_sdmmc_dev_pm_ops = { - SET_RUNTIME_PM_OPS(rtsx_usb_sdmmc_runtime_suspend, - rtsx_usb_sdmmc_runtime_resume, NULL) + RUNTIME_PM_OPS(rtsx_usb_sdmmc_runtime_suspend, rtsx_usb_sdmmc_runtime_resume, NULL) }; static const struct platform_device_id rtsx_usb_sdmmc_ids[] = { @@ -1469,7 +1493,7 @@ static struct platform_driver rtsx_usb_sdmmc_driver = { .driver = { .name = "rtsx_usb_sdmmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &rtsx_usb_sdmmc_dev_pm_ops, + .pm = pm_ptr(&rtsx_usb_sdmmc_dev_pm_ops), }, }; module_platform_driver(rtsx_usb_sdmmc_driver); diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 2d46d4854fa1b6..84c7054607fcb9 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -973,8 +973,7 @@ static void sdhci_acpi_remove(struct platform_device *pdev) c->slot->free_slot(pdev); } -static void __maybe_unused sdhci_acpi_reset_signal_voltage_if_needed( - struct device *dev) +static void sdhci_acpi_reset_signal_voltage_if_needed(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); struct sdhci_host *host = c->host; @@ -989,8 +988,6 @@ static void __maybe_unused sdhci_acpi_reset_signal_voltage_if_needed( } } -#ifdef CONFIG_PM_SLEEP - static int sdhci_acpi_suspend(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); @@ -1017,10 +1014,6 @@ static int sdhci_acpi_resume(struct device *dev) return 
sdhci_resume_host(c->host); } -#endif - -#ifdef CONFIG_PM - static int sdhci_acpi_runtime_suspend(struct device *dev) { struct sdhci_acpi_host *c = dev_get_drvdata(dev); @@ -1045,12 +1038,9 @@ static int sdhci_acpi_runtime_resume(struct device *dev) return 0; } -#endif - static const struct dev_pm_ops sdhci_acpi_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_acpi_suspend, sdhci_acpi_resume) - SET_RUNTIME_PM_OPS(sdhci_acpi_runtime_suspend, - sdhci_acpi_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_acpi_suspend, sdhci_acpi_resume) + RUNTIME_PM_OPS(sdhci_acpi_runtime_suspend, sdhci_acpi_runtime_resume, NULL) }; static struct platform_driver sdhci_acpi_driver = { @@ -1058,7 +1048,7 @@ static struct platform_driver sdhci_acpi_driver = { .name = "sdhci-acpi", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .acpi_match_table = sdhci_acpi_ids, - .pm = &sdhci_acpi_pm_ops, + .pm = pm_ptr(&sdhci_acpi_pm_ops), }, .probe = sdhci_acpi_probe, .remove = sdhci_acpi_remove, diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index efc2f3bdc63158..15705e85417f40 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -496,7 +496,6 @@ static void sdhci_brcmstb_shutdown(struct platform_device *pdev) MODULE_DEVICE_TABLE(of, sdhci_brcm_of_match); -#ifdef CONFIG_PM_SLEEP static int sdhci_brcmstb_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -540,17 +539,14 @@ static int sdhci_brcmstb_resume(struct device *dev) return ret; } -#endif -static const struct dev_pm_ops sdhci_brcmstb_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_brcmstb_suspend, sdhci_brcmstb_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_brcmstb_pmops, sdhci_brcmstb_suspend, sdhci_brcmstb_resume); static struct platform_driver sdhci_brcmstb_driver = { .driver = { .name = "sdhci-brcmstb", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_brcmstb_pmops, + .pm = pm_sleep_ptr(&sdhci_brcmstb_pmops), .of_match_table = of_match_ptr(sdhci_brcm_of_match), }, .probe = sdhci_brcmstb_probe, diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index 2d823e158c5984..435603c8c00b2a 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -36,6 +36,24 @@ #define SDHCI_CDNS_HRS06_MODE_MMC_HS400 0x5 #define SDHCI_CDNS_HRS06_MODE_MMC_HS400ES 0x6 +/* Read block gap */ +#define SDHCI_CDNS_HRS37 0x94 /* interface mode select */ +#define SDHCI_CDNS_HRS37_MODE_DS 0x0 +#define SDHCI_CDNS_HRS37_MODE_HS 0x1 +#define SDHCI_CDNS_HRS37_MODE_UDS_SDR12 0x8 +#define SDHCI_CDNS_HRS37_MODE_UDS_SDR25 0x9 +#define SDHCI_CDNS_HRS37_MODE_UDS_SDR50 0xa +#define SDHCI_CDNS_HRS37_MODE_UDS_SDR104 0xb +#define SDHCI_CDNS_HRS37_MODE_UDS_DDR50 0xc +#define SDHCI_CDNS_HRS37_MODE_MMC_LEGACY 0x20 +#define SDHCI_CDNS_HRS37_MODE_MMC_SDR 0x21 +#define SDHCI_CDNS_HRS37_MODE_MMC_DDR 0x22 +#define SDHCI_CDNS_HRS37_MODE_MMC_HS200 0x23 +#define SDHCI_CDNS_HRS37_MODE_MMC_HS400 0x24 +#define SDHCI_CDNS_HRS37_MODE_MMC_HS400ES 0x25 +#define SDHCI_CDNS_HRS38 0x98 /* Read block gap coefficient */ +#define SDHCI_CDNS_HRS38_BLKGAP_MAX 0xf + /* SRS - Slot Register Set (SDHCI-compatible) */ #define SDHCI_CDNS_SRS_BASE 0x200 @@ -251,6 +269,43 @@ static int sdhci_cdns_set_tune_val(struct sdhci_host *host, unsigned int val) return 0; } +/** + * sdhci_cdns_tune_blkgap() - tune multi-block read gap + * @mmc: MMC host + * + * Tune delay used in multi block read. To do so, + * try sending multi-block read command with incremented gap, unless + * it succeeds. 
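That is, the gap value in HRS38 is incremented until a
+ * multi-block read issued through mmc_read_tuning() completes without error.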
+ * + * Return: error code + */ +static int sdhci_cdns_tune_blkgap(struct mmc_host *mmc) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_cdns_priv *priv = sdhci_pltfm_priv(pltfm_host); + void __iomem *hrs37_reg = priv->hrs_addr + SDHCI_CDNS_HRS37; + void __iomem *hrs38_reg = priv->hrs_addr + SDHCI_CDNS_HRS38; + int ret; + u32 gap; + + /* Currently only needed in HS200 mode */ + if (host->timing != MMC_TIMING_MMC_HS200) + return 0; + + writel(SDHCI_CDNS_HRS37_MODE_MMC_HS200, hrs37_reg); + + for (gap = 0; gap <= SDHCI_CDNS_HRS38_BLKGAP_MAX; gap++) { + writel(gap, hrs38_reg); + ret = mmc_read_tuning(mmc, 512, 32); + if (!ret) + break; + } + + dev_dbg(mmc_dev(mmc), "read block gap tune %s, gap %d\n", ret ? "failed" : "OK", gap); + return ret; +} + /* * In SD mode, software must not use the hardware tuning and instead perform * an almost identical procedure to eMMC. @@ -261,6 +316,7 @@ static int sdhci_cdns_execute_tuning(struct sdhci_host *host, u32 opcode) int max_streak = 0; int end_of_streak = 0; int i; + int ret; /* * Do not execute tuning for UHS_SDR50 or UHS_DDR50. @@ -288,7 +344,11 @@ static int sdhci_cdns_execute_tuning(struct sdhci_host *host, u32 opcode) return -EIO; } - return sdhci_cdns_set_tune_val(host, end_of_streak - max_streak / 2); + ret = sdhci_cdns_set_tune_val(host, end_of_streak - max_streak / 2); + if (ret) + return ret; + + return sdhci_cdns_tune_blkgap(host->mmc); } static void sdhci_cdns_set_uhs_signaling(struct sdhci_host *host, @@ -551,7 +611,6 @@ static int sdhci_cdns_probe(struct platform_device *pdev) return sdhci_add_host(host); } -#ifdef CONFIG_PM_SLEEP static int sdhci_cdns_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -578,11 +637,8 @@ static int sdhci_cdns_resume(struct device *dev) return ret; } -#endif -static const struct dev_pm_ops sdhci_cdns_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_pltfm_suspend, sdhci_cdns_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_cdns_pm_ops, sdhci_pltfm_suspend, sdhci_cdns_resume); static const struct of_device_id sdhci_cdns_match[] = { { @@ -606,7 +662,7 @@ static struct platform_driver sdhci_cdns_driver = { .driver = { .name = "sdhci-cdns", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_cdns_pm_ops, + .pm = pm_sleep_ptr(&sdhci_cdns_pm_ops), .of_match_table = sdhci_cdns_match, }, .probe = sdhci_cdns_probe, diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index a040c0896a7b30..a7a5df673b0f6d 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1650,7 +1650,6 @@ static void sdhci_esdhc_imx_hwinit(struct sdhci_host *host) } } -#ifdef CONFIG_PM_SLEEP static void sdhc_esdhc_tuning_save(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1707,7 +1706,6 @@ static void sdhc_esdhc_tuning_restore(struct sdhci_host *host) host->ioaddr + ESDHC_TUNE_CTRL_STATUS); } } -#endif static void esdhc_cqe_enable(struct mmc_host *mmc) { @@ -2016,7 +2014,6 @@ static void sdhci_esdhc_imx_remove(struct platform_device *pdev) cpu_latency_qos_remove_request(&imx_data->pm_qos_req); } -#ifdef CONFIG_PM_SLEEP static int sdhci_esdhc_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -2112,9 +2109,7 @@ static int sdhci_esdhc_resume(struct device *dev) return ret; } -#endif -#ifdef CONFIG_PM static int sdhci_esdhc_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ 
-2188,12 +2183,10 @@ static int sdhci_esdhc_runtime_resume(struct device *dev) cpu_latency_qos_remove_request(&imx_data->pm_qos_req); return err; } -#endif static const struct dev_pm_ops sdhci_esdhc_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_esdhc_suspend, sdhci_esdhc_resume) - SET_RUNTIME_PM_OPS(sdhci_esdhc_runtime_suspend, - sdhci_esdhc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_esdhc_suspend, sdhci_esdhc_resume) + RUNTIME_PM_OPS(sdhci_esdhc_runtime_suspend, sdhci_esdhc_runtime_resume, NULL) }; static struct platform_driver sdhci_esdhc_imx_driver = { @@ -2201,7 +2194,7 @@ static struct platform_driver sdhci_esdhc_imx_driver = { .name = "sdhci-esdhc-imx", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = imx_esdhc_dt_ids, - .pm = &sdhci_esdhc_pmops, + .pm = pm_ptr(&sdhci_esdhc_pmops), }, .probe = sdhci_esdhc_imx_probe, .remove = sdhci_esdhc_imx_remove, diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 9d8e20dc8ca11a..4e5edbf2fc9b6f 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -81,6 +81,7 @@ #define CORE_IO_PAD_PWR_SWITCH_EN BIT(15) #define CORE_IO_PAD_PWR_SWITCH BIT(16) #define CORE_HC_SELECT_IN_EN BIT(18) +#define CORE_HC_SELECT_IN_SDR50 (4 << 19) #define CORE_HC_SELECT_IN_HS400 (6 << 19) #define CORE_HC_SELECT_IN_MASK (7 << 19) @@ -1133,6 +1134,10 @@ static bool sdhci_msm_is_tuning_needed(struct sdhci_host *host) { struct mmc_ios *ios = &host->mmc->ios; + if (ios->timing == MMC_TIMING_UHS_SDR50 && + host->flags & SDHCI_SDR50_NEEDS_TUNING) + return true; + /* * Tuning is required for SDR104, HS200 and HS400 cards and * if clock frequency is greater than 100MHz in these modes. @@ -1201,6 +1206,8 @@ static int sdhci_msm_execute_tuning(struct mmc_host *mmc, u32 opcode) struct mmc_ios ios = host->mmc->ios; struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); + const struct sdhci_msm_offset *msm_offset = msm_host->offset; + u32 config; if (!sdhci_msm_is_tuning_needed(host)) { msm_host->use_cdr = false; @@ -1217,6 +1224,14 @@ static int sdhci_msm_execute_tuning(struct mmc_host *mmc, u32 opcode) */ msm_host->tuning_done = 0; + if (ios.timing == MMC_TIMING_UHS_SDR50 && + host->flags & SDHCI_SDR50_NEEDS_TUNING) { + config = readl_relaxed(host->ioaddr + msm_offset->core_vendor_spec); + config &= ~CORE_HC_SELECT_IN_MASK; + config |= CORE_HC_SELECT_IN_EN | CORE_HC_SELECT_IN_SDR50; + writel_relaxed(config, host->ioaddr + msm_offset->core_vendor_spec); + } + /* * For HS400 tuning in HS200 timing requires: * - select MCLK/2 in VENDOR_SPEC @@ -1943,7 +1958,7 @@ static void sdhci_msm_ice_enable(struct sdhci_msm_host *msm_host) qcom_ice_enable(msm_host->ice); } -static __maybe_unused int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) +static int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) { if (msm_host->mmc->caps2 & MMC_CAP2_CRYPTO) return qcom_ice_resume(msm_host->ice); @@ -1951,7 +1966,7 @@ static __maybe_unused int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) return 0; } -static __maybe_unused int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) +static int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) { if (msm_host->mmc->caps2 & MMC_CAP2_CRYPTO) return qcom_ice_suspend(msm_host->ice); @@ -2011,13 +2026,13 @@ static inline void sdhci_msm_ice_enable(struct sdhci_msm_host *msm_host) { } -static inline __maybe_unused int +static inline int sdhci_msm_ice_resume(struct sdhci_msm_host *msm_host) { return 0; } -static 
inline __maybe_unused int +static inline int sdhci_msm_ice_suspend(struct sdhci_msm_host *msm_host) { return 0; @@ -2801,7 +2816,7 @@ static void sdhci_msm_remove(struct platform_device *pdev) clk_disable_unprepare(msm_host->bus_clk); } -static __maybe_unused int sdhci_msm_runtime_suspend(struct device *dev) +static int sdhci_msm_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -2820,7 +2835,7 @@ static __maybe_unused int sdhci_msm_runtime_suspend(struct device *dev) return sdhci_msm_ice_suspend(msm_host); } -static __maybe_unused int sdhci_msm_runtime_resume(struct device *dev) +static int sdhci_msm_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -2856,11 +2871,8 @@ static __maybe_unused int sdhci_msm_runtime_resume(struct device *dev) } static const struct dev_pm_ops sdhci_msm_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_msm_runtime_suspend, - sdhci_msm_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_msm_runtime_suspend, sdhci_msm_runtime_resume, NULL) }; static struct platform_driver sdhci_msm_driver = { @@ -2869,7 +2881,7 @@ static struct platform_driver sdhci_msm_driver = { .driver = { .name = "sdhci_msm", .of_match_table = sdhci_msm_dt_match, - .pm = &sdhci_msm_pm_ops, + .pm = pm_ptr(&sdhci_msm_pm_ops), .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index 60dbc815e5019f..c6f09b53325d2d 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -605,7 +605,6 @@ static const struct sdhci_pltfm_data sdhci_arasan_cqe_pdata = { SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN, }; -#ifdef CONFIG_PM_SLEEP /** * sdhci_arasan_suspend - Suspend method for the driver * @dev: Address of the device structure @@ -699,10 +698,9 @@ static int sdhci_arasan_resume(struct device *dev) return 0; } -#endif /* ! 
CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(sdhci_arasan_dev_pm_ops, sdhci_arasan_suspend, - sdhci_arasan_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_arasan_dev_pm_ops, sdhci_arasan_suspend, + sdhci_arasan_resume); /** * sdhci_arasan_sdcardclk_recalc_rate - Return the card clock rate @@ -2080,7 +2078,7 @@ static struct platform_driver sdhci_arasan_driver = { .name = "sdhci-arasan", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_arasan_of_match, - .pm = &sdhci_arasan_dev_pm_ops, + .pm = pm_sleep_ptr(&sdhci_arasan_dev_pm_ops), }, .probe = sdhci_arasan_probe, .remove = sdhci_arasan_remove, diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 1ba2effaf376af..7c4ac65f247d39 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -229,7 +229,6 @@ static int sdhci_at91_set_clks_presets(struct device *dev) return 0; } -#ifdef CONFIG_PM_SLEEP static int sdhci_at91_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -243,9 +242,7 @@ static int sdhci_at91_suspend(struct device *dev) return ret; } -#endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_PM static int sdhci_at91_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -302,13 +299,10 @@ static int sdhci_at91_runtime_resume(struct device *dev) sdhci_runtime_resume_host(host, 0); return 0; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops sdhci_at91_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_at91_suspend, pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_at91_runtime_suspend, - sdhci_at91_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_at91_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_at91_runtime_suspend, sdhci_at91_runtime_resume, NULL) }; static int sdhci_at91_probe(struct platform_device *pdev) @@ -460,7 +454,7 @@ static struct platform_driver sdhci_at91_driver = { .name = "sdhci-at91", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_at91_dt_match, - .pm = &sdhci_at91_dev_pm_ops, + .pm = pm_ptr(&sdhci_at91_dev_pm_ops), }, .probe = sdhci_at91_probe, .remove = sdhci_at91_remove, diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index ee6b1096f70921..eebd4538995663 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -1499,7 +1499,6 @@ static void dwcmshc_remove(struct platform_device *pdev) clk_bulk_disable_unprepare(priv->num_other_clks, priv->other_clks); } -#ifdef CONFIG_PM_SLEEP static int dwcmshc_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -1570,9 +1569,6 @@ static int dwcmshc_resume(struct device *dev) clk_disable_unprepare(pltfm_host->clk); return ret; } -#endif - -#ifdef CONFIG_PM static void dwcmshc_enable_card_clk(struct sdhci_host *host) { @@ -1603,12 +1599,9 @@ static int dwcmshc_runtime_resume(struct device *dev) return 0; } -#endif - static const struct dev_pm_ops dwcmshc_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(dwcmshc_suspend, dwcmshc_resume) - SET_RUNTIME_PM_OPS(dwcmshc_runtime_suspend, - dwcmshc_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(dwcmshc_suspend, dwcmshc_resume) + RUNTIME_PM_OPS(dwcmshc_runtime_suspend, dwcmshc_runtime_resume, NULL) }; static struct platform_driver sdhci_dwcmshc_driver = { @@ -1617,7 +1610,7 @@ static struct platform_driver sdhci_dwcmshc_driver = { .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_dwcmshc_dt_ids, .acpi_match_table = ACPI_PTR(sdhci_dwcmshc_acpi_ids), - .pm = &dwcmshc_pmops, 
+ .pm = pm_ptr(&dwcmshc_pmops), }, .probe = dwcmshc_probe, .remove = dwcmshc_remove, diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index c6ee0099ead096..8345e2c5a03413 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1234,7 +1234,6 @@ static u32 esdhc_irq(struct sdhci_host *host, u32 intmask) return intmask; } -#ifdef CONFIG_PM_SLEEP static u32 esdhc_proctl; static int esdhc_of_suspend(struct device *dev) { @@ -1260,11 +1259,8 @@ static int esdhc_of_resume(struct device *dev) } return ret; } -#endif -static SIMPLE_DEV_PM_OPS(esdhc_of_dev_pm_ops, - esdhc_of_suspend, - esdhc_of_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(esdhc_of_dev_pm_ops, esdhc_of_suspend, esdhc_of_resume); static const struct sdhci_ops sdhci_esdhc_be_ops = { .read_l = esdhc_be_readl, @@ -1511,7 +1507,7 @@ static struct platform_driver sdhci_esdhc_driver = { .name = "sdhci-esdhc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_esdhc_of_match, - .pm = &esdhc_of_dev_pm_ops, + .pm = pm_sleep_ptr(&esdhc_of_dev_pm_ops), }, .probe = sdhci_esdhc_probe, .remove = sdhci_pltfm_remove, diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index cdb09605e009aa..b5d7c1a80a92f2 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1400,8 +1400,7 @@ static void sdhci_omap_remove(struct platform_device *pdev) pm_runtime_force_suspend(dev); } -#ifdef CONFIG_PM -static void __maybe_unused sdhci_omap_context_save(struct sdhci_omap_host *omap_host) +static void sdhci_omap_context_save(struct sdhci_omap_host *omap_host) { omap_host->con = sdhci_omap_readl(omap_host, SDHCI_OMAP_CON); omap_host->hctl = sdhci_omap_readl(omap_host, SDHCI_OMAP_HCTL); @@ -1412,7 +1411,7 @@ static void __maybe_unused sdhci_omap_context_save(struct sdhci_omap_host *omap_ } /* Order matters here, HCTL must be restored in two phases */ -static void __maybe_unused sdhci_omap_context_restore(struct sdhci_omap_host *omap_host) +static void sdhci_omap_context_restore(struct sdhci_omap_host *omap_host) { sdhci_omap_writel(omap_host, SDHCI_OMAP_HCTL, omap_host->hctl); sdhci_omap_writel(omap_host, SDHCI_OMAP_CAPA, omap_host->capa); @@ -1424,7 +1423,7 @@ static void __maybe_unused sdhci_omap_context_restore(struct sdhci_omap_host *om sdhci_omap_writel(omap_host, SDHCI_OMAP_ISE, omap_host->ise); } -static int __maybe_unused sdhci_omap_runtime_suspend(struct device *dev) +static int sdhci_omap_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1443,7 +1442,7 @@ static int __maybe_unused sdhci_omap_runtime_suspend(struct device *dev) return 0; } -static int __maybe_unused sdhci_omap_runtime_resume(struct device *dev) +static int sdhci_omap_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1458,13 +1457,10 @@ static int __maybe_unused sdhci_omap_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sdhci_omap_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_omap_runtime_suspend, - sdhci_omap_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_omap_runtime_suspend, sdhci_omap_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) }; static struct platform_driver sdhci_omap_driver = { @@ -1473,7 +1469,7 @@ static struct 
platform_driver sdhci_omap_driver = { .driver = { .name = "sdhci-omap", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_omap_dev_pm_ops, + .pm = pm_ptr(&sdhci_omap_dev_pm_ops), .of_match_table = omap_sdhci_match, }, }; diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 826958992dfe26..47a0a738862b58 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -679,8 +679,19 @@ static int intel_start_signal_voltage_switch(struct mmc_host *mmc, return 0; } +static void sdhci_intel_set_clock(struct sdhci_host *host, unsigned int clock) +{ + u16 clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); + + /* Stop card clock separately to avoid glitches on clock line */ + if (clk & SDHCI_CLOCK_CARD_EN) + sdhci_writew(host, clk & ~SDHCI_CLOCK_CARD_EN, SDHCI_CLOCK_CONTROL); + + sdhci_set_clock(host, clock); +} + static const struct sdhci_ops sdhci_intel_byt_ops = { - .set_clock = sdhci_set_clock, + .set_clock = sdhci_intel_set_clock, .set_power = sdhci_intel_set_power, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, @@ -690,7 +701,7 @@ static const struct sdhci_ops sdhci_intel_byt_ops = { }; static const struct sdhci_ops sdhci_intel_glk_ops = { - .set_clock = sdhci_set_clock, + .set_clock = sdhci_intel_set_clock, .set_power = sdhci_intel_set_power, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 3a1de477e9af8d..b0f91cc9e40e43 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -283,6 +283,8 @@ #define PCIE_GLI_9767_UHS2_CTL2_ZC_VALUE 0xb #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL BIT(6) #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL_VALUE 0x1 +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN BIT(13) +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE BIT(14) #define GLI_MAX_TUNING_LOOP 40 @@ -1179,6 +1181,65 @@ static void gl9767_set_low_power_negotiation(struct pci_dev *pdev, bool enable) gl9767_vhs_read(pdev); } +static void sdhci_gl9767_uhs2_phy_reset(struct sdhci_host *host, bool assert) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct pci_dev *pdev = slot->chip->pdev; + u32 value, set, clr; + + if (assert) { + /* Assert reset, set RESETN and clean RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + } else { + /* De-assert reset, clean RESETN and set RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + } + + gl9767_vhs_write(pdev); + pci_read_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, &value); + value |= set; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + value &= ~clr; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + gl9767_vhs_read(pdev); +} + +static void __gl9767_uhs2_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) +{ + u8 pwr = 0; + + if (mode != MMC_POWER_OFF) { + pwr = sdhci_get_vdd_value(vdd); + if (!pwr) + WARN(1, "%s: Invalid vdd %#x\n", + mmc_hostname(host->mmc), vdd); + pwr |= SDHCI_VDD2_POWER_180; + } + + if (host->pwr == pwr) + return; + + host->pwr = pwr; + + if (pwr == 0) { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + } else { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + + pwr |= SDHCI_POWER_ON; + sdhci_writeb(host, pwr & 0xf, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + + /* Assert reset */ + sdhci_gl9767_uhs2_phy_reset(host, true); 
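+ /* Power up VDD2 while the UHS-II PHY is held in reset; the
+ * reset is de-asserted later from sdhci_gl9767_set_clock().
+ */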
+ pwr |= SDHCI_VDD2_POWER_ON; + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + } +} + static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) { struct sdhci_pci_slot *slot = sdhci_priv(host); @@ -1205,6 +1266,11 @@ static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) } sdhci_enable_clk(host, clk); + + if (mmc_card_uhs2(host->mmc)) + /* De-assert reset */ + sdhci_gl9767_uhs2_phy_reset(host, false); + gl9767_set_low_power_negotiation(pdev, true); } @@ -1476,7 +1542,7 @@ static void sdhci_gl9767_set_power(struct sdhci_host *host, unsigned char mode, gl9767_vhs_read(pdev); sdhci_gli_overcurrent_event_enable(host, false); - sdhci_uhs2_set_power(host, mode, vdd); + __gl9767_uhs2_set_power(host, mode, vdd); sdhci_gli_overcurrent_event_enable(host, true); } else { gl9767_vhs_write(pdev); diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index 1371960e34ebbb..d082c4e21aa9ed 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -20,9 +20,11 @@ #include #include #include +#include #include #include #include +#include #include "sdhci.h" #include "sdhci-pltfm.h" @@ -51,6 +53,9 @@ struct sdhci_pxa { struct clk *clk_io; u8 power_mode; void __iomem *sdio3_conf_reg; + struct pinctrl *pinctrl; + struct pinctrl_state *pins_default; + struct pinctrl_state *pins_uhs; }; /* @@ -313,8 +318,20 @@ static void pxav3_set_power(struct sdhci_host *host, unsigned char mode, mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); } +static void pxav3_set_clock(struct sdhci_host *host, unsigned int clock) +{ + struct sdhci_pltfm_host *phost = sdhci_priv(host); + struct sdhci_pxa *pxa = sdhci_pltfm_priv(phost); + struct pinctrl_state *pins = clock < 100 * HZ_PER_MHZ ? 
pxa->pins_default : pxa->pins_uhs; + + if (pins) + pinctrl_select_state(pxa->pinctrl, pins); + + sdhci_set_clock(host, clock); +} + static const struct sdhci_ops pxav3_sdhci_ops = { - .set_clock = sdhci_set_clock, + .set_clock = pxav3_set_clock, .set_power = pxav3_set_power, .platform_send_init_74_clocks = pxav3_gen_init_74_clocks, .get_max_clock = sdhci_pltfm_clk_get_max_clock, @@ -366,6 +383,19 @@ static inline struct sdhci_pxa_platdata *pxav3_get_mmc_pdata(struct device *dev) } #endif +static struct pinctrl_state *pxav3_lookup_pinstate(struct device *dev, struct pinctrl *pinctrl, + const char *name) +{ + struct pinctrl_state *pins = pinctrl_lookup_state(pinctrl, name); + + if (IS_ERR(pins)) { + dev_dbg(dev, "could not get pinstate '%s': %ld\n", name, PTR_ERR(pins)); + return NULL; + } + + return pins; +} + static int sdhci_pxav3_probe(struct platform_device *pdev) { struct sdhci_pltfm_host *pltfm_host; @@ -440,6 +470,15 @@ static int sdhci_pxav3_probe(struct platform_device *pdev) host->mmc->pm_caps |= pdata->pm_caps; } + pxa->pinctrl = devm_pinctrl_get(dev); + if (!IS_ERR(pxa->pinctrl)) { + pxa->pins_default = pxav3_lookup_pinstate(dev, pxa->pinctrl, "default"); + if (pxa->pins_default) + pxa->pins_uhs = pxav3_lookup_pinstate(dev, pxa->pinctrl, "state_uhs"); + } else { + dev_dbg(dev, "could not get pinctrl handle: %ld\n", PTR_ERR(pxa->pinctrl)); + } + pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_set_autosuspend_delay(&pdev->dev, PXAV3_RPM_DELAY_MS); @@ -484,7 +523,6 @@ static void sdhci_pxav3_remove(struct platform_device *pdev) clk_disable_unprepare(pxa->clk_core); } -#ifdef CONFIG_PM_SLEEP static int sdhci_pxav3_suspend(struct device *dev) { int ret; @@ -510,9 +548,7 @@ static int sdhci_pxav3_resume(struct device *dev) return ret; } -#endif -#ifdef CONFIG_PM static int sdhci_pxav3_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -544,12 +580,10 @@ static int sdhci_pxav3_runtime_resume(struct device *dev) sdhci_runtime_resume_host(host, 0); return 0; } -#endif static const struct dev_pm_ops sdhci_pxav3_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_pxav3_suspend, sdhci_pxav3_resume) - SET_RUNTIME_PM_OPS(sdhci_pxav3_runtime_suspend, - sdhci_pxav3_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_pxav3_suspend, sdhci_pxav3_resume) + RUNTIME_PM_OPS(sdhci_pxav3_runtime_suspend, sdhci_pxav3_runtime_resume, NULL) }; static struct platform_driver sdhci_pxav3_driver = { @@ -557,7 +591,7 @@ static struct platform_driver sdhci_pxav3_driver = { .name = "sdhci-pxav3", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav3_of_match), - .pm = &sdhci_pxav3_pmops, + .pm = pm_ptr(&sdhci_pxav3_pmops), }, .probe = sdhci_pxav3_probe, .remove = sdhci_pxav3_remove, diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 40857fc2e21b4a..6bf66aaa86a6df 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -681,7 +681,6 @@ static void sdhci_s3c_remove(struct platform_device *pdev) clk_disable_unprepare(sc->clk_io); } -#ifdef CONFIG_PM_SLEEP static int sdhci_s3c_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -698,9 +697,7 @@ static int sdhci_s3c_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -#ifdef CONFIG_PM static int sdhci_s3c_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -730,12 +727,10 @@ static int sdhci_s3c_runtime_resume(struct device *dev) 
sdhci_runtime_resume_host(host, 0); return 0; } -#endif static const struct dev_pm_ops sdhci_s3c_pmops = { - SET_SYSTEM_SLEEP_PM_OPS(sdhci_s3c_suspend, sdhci_s3c_resume) - SET_RUNTIME_PM_OPS(sdhci_s3c_runtime_suspend, sdhci_s3c_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_s3c_suspend, sdhci_s3c_resume) + RUNTIME_PM_OPS(sdhci_s3c_runtime_suspend, sdhci_s3c_runtime_resume, NULL) }; static const struct platform_device_id sdhci_s3c_driver_ids[] = { @@ -770,7 +765,7 @@ static struct platform_driver sdhci_s3c_driver = { .name = "s3c-sdhci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_s3c_dt_match), - .pm = &sdhci_s3c_pmops, + .pm = pm_ptr(&sdhci_s3c_pmops), }, }; diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index fa0f8aeb7ee0e8..72d21dc0cb698a 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -130,7 +130,6 @@ static void sdhci_remove(struct platform_device *pdev) clk_disable_unprepare(sdhci->clk); } -#ifdef CONFIG_PM_SLEEP static int sdhci_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -161,9 +160,8 @@ static int sdhci_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -static SIMPLE_DEV_PM_OPS(sdhci_pm_ops, sdhci_suspend, sdhci_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_pm_ops, sdhci_suspend, sdhci_resume); static const struct of_device_id sdhci_spear_id_table[] = { { .compatible = "st,spear300-sdhci" }, @@ -175,7 +173,7 @@ static struct platform_driver sdhci_driver = { .driver = { .name = "sdhci", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_pm_ops, + .pm = pm_sleep_ptr(&sdhci_pm_ops), .of_match_table = sdhci_spear_id_table, }, .probe = sdhci_probe, diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index fe2fe52b23b27a..3584a2b314a9f9 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -903,7 +903,6 @@ static const struct of_device_id sdhci_sprd_of_match[] = { }; MODULE_DEVICE_TABLE(of, sdhci_sprd_of_match); -#ifdef CONFIG_PM static int sdhci_sprd_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -950,13 +949,10 @@ static int sdhci_sprd_runtime_resume(struct device *dev) return ret; } -#endif static const struct dev_pm_ops sdhci_sprd_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sdhci_sprd_runtime_suspend, - sdhci_sprd_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_sprd_runtime_suspend, sdhci_sprd_runtime_resume, NULL) }; static struct platform_driver sdhci_sprd_driver = { @@ -966,7 +962,7 @@ static struct platform_driver sdhci_sprd_driver = { .name = "sdhci_sprd_r11", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_sprd_of_match, - .pm = &sdhci_sprd_pm_ops, + .pm = pm_ptr(&sdhci_sprd_pm_ops), }, }; module_platform_driver(sdhci_sprd_driver); diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c index 9157342ff7a4bb..bf668580513778 100644 --- a/drivers/mmc/host/sdhci-st.c +++ b/drivers/mmc/host/sdhci-st.c @@ -445,7 +445,6 @@ static void sdhci_st_remove(struct platform_device *pdev) reset_control_assert(rstc); } -#ifdef CONFIG_PM_SLEEP static int sdhci_st_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -492,9 +491,8 @@ static int sdhci_st_resume(struct device *dev) return sdhci_resume_host(host); } -#endif -static 
SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume); static const struct of_device_id st_sdhci_match[] = { { .compatible = "st,sdhci" }, @@ -509,7 +507,7 @@ static struct platform_driver sdhci_st_driver = { .driver = { .name = "sdhci-st", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_st_pmops, + .pm = pm_sleep_ptr(&sdhci_st_pmops), .of_match_table = st_sdhci_match, }, }; diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index c811297185d8f8..820ce4dae58bac 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1831,7 +1831,7 @@ static void sdhci_tegra_remove(struct platform_device *pdev) clk_disable_unprepare(tegra_host->tmclk); } -static int __maybe_unused sdhci_tegra_runtime_suspend(struct device *dev) +static int sdhci_tegra_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1841,7 +1841,7 @@ static int __maybe_unused sdhci_tegra_runtime_suspend(struct device *dev) return 0; } -static int __maybe_unused sdhci_tegra_runtime_resume(struct device *dev) +static int sdhci_tegra_runtime_resume(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1849,7 +1849,6 @@ static int __maybe_unused sdhci_tegra_runtime_resume(struct device *dev) return clk_prepare_enable(pltfm_host->clk); } -#ifdef CONFIG_PM_SLEEP static int sdhci_tegra_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -1910,12 +1909,10 @@ static int sdhci_tegra_resume(struct device *dev) pm_runtime_force_suspend(dev); return ret; } -#endif static const struct dev_pm_ops sdhci_tegra_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_tegra_runtime_suspend, sdhci_tegra_runtime_resume, - NULL) - SET_SYSTEM_SLEEP_PM_OPS(sdhci_tegra_suspend, sdhci_tegra_resume) + RUNTIME_PM_OPS(sdhci_tegra_runtime_suspend, sdhci_tegra_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(sdhci_tegra_suspend, sdhci_tegra_resume) }; static struct platform_driver sdhci_tegra_driver = { @@ -1923,7 +1920,7 @@ static struct platform_driver sdhci_tegra_driver = { .name = "sdhci-tegra", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_tegra_dt_match, - .pm = &sdhci_tegra_dev_pm_ops, + .pm = pm_ptr(&sdhci_tegra_dev_pm_ops), }, .probe = sdhci_tegra_probe, .remove = sdhci_tegra_remove, diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c index 0efeb9d0c3765a..c459a08d01da52 100644 --- a/drivers/mmc/host/sdhci-uhs2.c +++ b/drivers/mmc/host/sdhci-uhs2.c @@ -295,7 +295,8 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd); - sdhci_set_clock(host, host->clock); + host->ops->set_clock(host, ios->clock); + host->clock = ios->clock; } static int sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index b12bee8342bdd5..046e8100dd0899 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -622,7 +622,6 @@ static void xenon_remove(struct platform_device *pdev) clk_disable_unprepare(pltfm_host->clk); } -#ifdef CONFIG_PM_SLEEP static int xenon_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -635,9 +634,7 @@ static int xenon_suspend(struct device *dev) 
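/*
 * Drivers with only system-sleep callbacks get the analogous treatment:
 * SIMPLE_DEV_PM_OPS() becomes DEFINE_SIMPLE_DEV_PM_OPS() and the ops are
 * referenced through pm_sleep_ptr(). Sketch of the pattern ("bar" is a
 * placeholder name):
 *
 *	static DEFINE_SIMPLE_DEV_PM_OPS(bar_pm_ops, bar_suspend, bar_resume);
 *
 *	static struct platform_driver bar_driver = {
 *		.driver.pm = pm_sleep_ptr(&bar_pm_ops),
 *	};
 *
 * pm_sleep_ptr() is NULL unless CONFIG_PM_SLEEP is set, letting the
 * compiler drop the unused callbacks without #ifdef CONFIG_PM_SLEEP
 * guards around their definitions.
 */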
priv->restore_needed = true; return ret; } -#endif -#ifdef CONFIG_PM static int xenon_runtime_suspend(struct device *dev) { struct sdhci_host *host = dev_get_drvdata(dev); @@ -685,14 +682,10 @@ static int xenon_runtime_resume(struct device *dev) clk_disable_unprepare(pltfm_host->clk); return ret; } -#endif /* CONFIG_PM */ static const struct dev_pm_ops sdhci_xenon_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(xenon_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(xenon_runtime_suspend, - xenon_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(xenon_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(xenon_runtime_suspend, xenon_runtime_resume, NULL) }; static const struct of_device_id sdhci_xenon_dt_ids[] = { @@ -721,7 +714,7 @@ static struct platform_driver sdhci_xenon_driver = { .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_xenon_dt_ids, .acpi_match_table = ACPI_PTR(sdhci_xenon_acpi_ids), - .pm = &sdhci_xenon_dev_pm_ops, + .pm = pm_ptr(&sdhci_xenon_dev_pm_ops), }, .probe = xenon_probe, .remove = xenon_remove, diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3a17821efa5ca9..ac7e11f37af71f 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2367,23 +2367,6 @@ void sdhci_set_ios_common(struct mmc_host *mmc, struct mmc_ios *ios) (ios->power_mode == MMC_POWER_UP) && !(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN)) sdhci_enable_preset_value(host, false); - - if (!ios->clock || ios->clock != host->clock) { - host->ops->set_clock(host, ios->clock); - host->clock = ios->clock; - - if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && - host->clock) { - host->timeout_clk = mmc->actual_clock ? - mmc->actual_clock / 1000 : - host->clock / 1000; - mmc->max_busy_timeout = - host->ops->get_max_timeout_count ? - host->ops->get_max_timeout_count(host) : - 1 << 27; - mmc->max_busy_timeout /= host->timeout_clk; - } - } } EXPORT_SYMBOL_GPL(sdhci_set_ios_common); @@ -2410,6 +2393,23 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) sdhci_set_ios_common(mmc, ios); + if (!ios->clock || ios->clock != host->clock) { + host->ops->set_clock(host, ios->clock); + host->clock = ios->clock; + + if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && + host->clock) { + host->timeout_clk = mmc->actual_clock ? + mmc->actual_clock / 1000 : + host->clock / 1000; + mmc->max_busy_timeout = + host->ops->get_max_timeout_count ? 
+ host->ops->get_max_timeout_count(host) : + 1 << 27; + mmc->max_busy_timeout /= host->timeout_clk; + } + } + if (host->ops->set_power) host->ops->set_power(host, ios->power_mode, ios->vdd); else diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 58fcbeaf281e02..b6a571d866fa53 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -880,6 +880,13 @@ int sdhci_suspend_host(struct sdhci_host *host); int sdhci_resume_host(struct sdhci_host *host); void sdhci_runtime_suspend_host(struct sdhci_host *host); void sdhci_runtime_resume_host(struct sdhci_host *host, int soft_reset); +#else +static inline bool sdhci_enable_irq_wakeups(struct sdhci_host *host) { return false; } +static inline void sdhci_disable_irq_wakeups(struct sdhci_host *host) {} +static inline int sdhci_suspend_host(struct sdhci_host *host) { return -EOPNOTSUPP; } +static inline int sdhci_resume_host(struct sdhci_host *host) { return -EOPNOTSUPP; } +static inline void sdhci_runtime_suspend_host(struct sdhci_host *host) {} +static inline void sdhci_runtime_resume_host(struct sdhci_host *host, int soft_reset) {} #endif void sdhci_cqe_enable(struct mmc_host *mmc); diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 17e62c61b6e688..d235b0aecfdb12 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -95,7 +95,6 @@ static const struct regmap_config sdhci_am654_regmap_config = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, }; struct timing_data { @@ -1036,7 +1035,6 @@ static void sdhci_am654_remove(struct platform_device *pdev) pm_runtime_put_noidle(dev); } -#ifdef CONFIG_PM static int sdhci_am654_restore(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -1124,20 +1122,17 @@ static int sdhci_am654_runtime_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sdhci_am654_dev_pm_ops = { - SET_RUNTIME_PM_OPS(sdhci_am654_runtime_suspend, - sdhci_am654_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) + RUNTIME_PM_OPS(sdhci_am654_runtime_suspend, sdhci_am654_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) }; static struct platform_driver sdhci_am654_driver = { .driver = { .name = "sdhci-am654", .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = &sdhci_am654_dev_pm_ops, + .pm = pm_ptr(&sdhci_am654_dev_pm_ops), .of_match_table = sdhci_am654_of_match, }, .probe = sdhci_am654_probe, diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 19f84584ecfa0e..bf899c8e38f517 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1568,7 +1568,6 @@ static void sh_mmcif_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); } -#ifdef CONFIG_PM_SLEEP static int sh_mmcif_suspend(struct device *dev) { struct sh_mmcif_host *host = dev_get_drvdata(dev); @@ -1580,15 +1579,7 @@ static int sh_mmcif_suspend(struct device *dev) return 0; } -static int sh_mmcif_resume(struct device *dev) -{ - return 0; -} -#endif - -static const struct dev_pm_ops sh_mmcif_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(sh_mmcif_suspend, sh_mmcif_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(sh_mmcif_dev_pm_ops, sh_mmcif_suspend, NULL); static struct platform_driver sh_mmcif_driver = { .probe = sh_mmcif_probe, @@ -1596,7 +1587,7 @@ static struct platform_driver sh_mmcif_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = 
&sh_mmcif_dev_pm_ops, + .pm = pm_sleep_ptr(&sh_mmcif_dev_pm_ops), .of_match_table = sh_mmcif_of_match, }, }; diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index ee4a65b0a22dce..8dbcff53a63133 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1495,7 +1495,6 @@ static void sunxi_mmc_remove(struct platform_device *pdev) dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma); } -#ifdef CONFIG_PM static int sunxi_mmc_runtime_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); @@ -1530,14 +1529,10 @@ static int sunxi_mmc_runtime_suspend(struct device *dev) return 0; } -#endif static const struct dev_pm_ops sunxi_mmc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, - pm_runtime_force_resume) - SET_RUNTIME_PM_OPS(sunxi_mmc_runtime_suspend, - sunxi_mmc_runtime_resume, - NULL) + SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + RUNTIME_PM_OPS(sunxi_mmc_runtime_suspend, sunxi_mmc_runtime_resume, NULL) }; static struct platform_driver sunxi_mmc_driver = { @@ -1545,7 +1540,7 @@ static struct platform_driver sunxi_mmc_driver = { .name = "sunxi-mmc", .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sunxi_mmc_of_match, - .pm = &sunxi_mmc_pm_ops, + .pm = pm_ptr(&sunxi_mmc_pm_ops), }, .probe = sunxi_mmc_probe, .remove = sunxi_mmc_remove, diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index d730b7633ae1aa..c8cdb1c0722e7b 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -242,6 +243,20 @@ static inline void sd_ctrl_read32_rep(struct tmio_mmc_host *host, int addr, ioread32_rep(host->ctl + (addr << host->bus_shift), buf, count); } +#ifdef CONFIG_64BIT +static inline void sd_ctrl_read64_rep(struct tmio_mmc_host *host, int addr, + u64 *buf, int count) +{ + readsq(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write64_rep(struct tmio_mmc_host *host, int addr, + const u64 *buf, int count) +{ + writesq(host->ctl + (addr << host->bus_shift), buf, count); +} +#endif + static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) { diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 21c2f9095baca2..775e0d9353d571 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -349,6 +349,39 @@ static void tmio_mmc_transfer_data(struct tmio_mmc_host *host, /* * Transfer the data */ +#ifdef CONFIG_64BIT + if (host->pdata->flags & TMIO_MMC_64BIT_DATA_PORT) { + u64 *buf64 = (u64 *)buf; + u64 data = 0; + + if (count >= 8) { + if (is_read) + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + else + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + } + + /* if count was multiple of 8 */ + if (!(count & 0x7)) + return; + + buf64 += count >> 3; + count %= 8; + + if (is_read) { + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, &data, 1); + memcpy(buf64, &data, count); + } else { + memcpy(&data, buf64, count); + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, &data, 1); + } + + return; + } +#endif + if (host->pdata->flags & TMIO_MMC_32BIT_DATA_PORT) { u32 data = 0; u32 *buf32 = (u32 *)buf; diff --git a/drivers/mmc/host/toshsd.c b/drivers/mmc/host/toshsd.c index e5f7f8abafc055..aa5d2511a62b8f 100644 --- a/drivers/mmc/host/toshsd.c +++ b/drivers/mmc/host/toshsd.c @@ -567,7 +567,6 @@ static void toshsd_powerdown(struct toshsd_host 
*host) pci_write_config_byte(host->pdev, SD_PCICFG_CLKSTOP, 0); } -#ifdef CONFIG_PM_SLEEP static int toshsd_pm_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -599,7 +598,6 @@ static int toshsd_pm_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ static int toshsd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -688,16 +686,14 @@ static void toshsd_remove(struct pci_dev *pdev) pci_disable_device(pdev); } -static const struct dev_pm_ops toshsd_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(toshsd_pm_suspend, toshsd_pm_resume) -}; +static DEFINE_SIMPLE_DEV_PM_OPS(toshsd_pm_ops, toshsd_pm_suspend, toshsd_pm_resume); static struct pci_driver toshsd_driver = { .name = DRIVER_NAME, .id_table = pci_ids, .probe = toshsd_probe, .remove = toshsd_remove, - .driver.pm = &toshsd_pm_ops, + .driver.pm = pm_sleep_ptr(&toshsd_pm_ops), }; module_pci_driver(toshsd_driver); diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index 3bd49f64899d62..c628b3bbfd7aad 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -1218,7 +1218,7 @@ static void via_sd_remove(struct pci_dev *pcidev) pci_name(pcidev), (int)pcidev->vendor, (int)pcidev->device); } -static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) +static void via_init_sdc_pm(struct via_crdr_mmc_host *host) { struct sdhcreg *pm_sdhcreg; void __iomem *addrbase; @@ -1252,7 +1252,7 @@ static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) via_print_sdchc(host); } -static int __maybe_unused via_sd_suspend(struct device *dev) +static int via_sd_suspend(struct device *dev) { struct via_crdr_mmc_host *host; unsigned long flags; @@ -1269,7 +1269,7 @@ static int __maybe_unused via_sd_suspend(struct device *dev) return 0; } -static int __maybe_unused via_sd_resume(struct device *dev) +static int via_sd_resume(struct device *dev) { struct via_crdr_mmc_host *sdhost; u8 gatt; @@ -1295,14 +1295,14 @@ static int __maybe_unused via_sd_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(via_sd_pm_ops, via_sd_suspend, via_sd_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(via_sd_pm_ops, via_sd_suspend, via_sd_resume); static struct pci_driver via_sd_driver = { .name = DRV_NAME, .id_table = via_ids, .probe = via_sd_probe, .remove = via_sd_remove, - .driver.pm = &via_sd_pm_ops, + .driver.pm = pm_sleep_ptr(&via_sd_pm_ops), }; module_pci_driver(via_sd_driver); diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 0d2929cfe39756..1b1d691e19fcc7 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -911,7 +911,6 @@ static void wmt_mci_remove(struct platform_device *pdev) dev_info(&pdev->dev, "WMT MCI device removed\n"); } -#ifdef CONFIG_PM static int wmt_mci_suspend(struct device *dev) { u32 reg_tmp; @@ -963,18 +962,7 @@ static int wmt_mci_resume(struct device *dev) return 0; } -static const struct dev_pm_ops wmt_mci_pm = { - .suspend = wmt_mci_suspend, - .resume = wmt_mci_resume, -}; - -#define wmt_mci_pm_ops (&wmt_mci_pm) - -#else /* !CONFIG_PM */ - -#define wmt_mci_pm_ops NULL - -#endif +static DEFINE_SIMPLE_DEV_PM_OPS(wmt_mci_pm_ops, wmt_mci_suspend, wmt_mci_resume); static struct platform_driver wmt_mci_driver = { .probe = wmt_mci_probe, @@ -982,7 +970,7 @@ static struct platform_driver wmt_mci_driver = { .driver = { .name = DRIVER_NAME, .probe_type = PROBE_PREFER_ASYNCHRONOUS, - .pm = wmt_mci_pm_ops, + .pm = pm_sleep_ptr(&wmt_mci_pm_ops), .of_match_table = wmt_mci_dt_ids, }, }; diff 
--git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index 46cebde79f34b0..e518dfeee65426 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -185,8 +185,8 @@ config MTD_POWERNV_FLASH config MTD_INTEL_DG tristate "Intel Discrete Graphics non-volatile memory driver" - depends on AUXILIARY_BUS - depends on MTD + depends on AUXILIARY_BUS && MTD + depends on DRM_I915!=n || DRM_XE!=n || COMPILE_TEST help This provides an MTD device to access Intel Discrete Graphics non-volatile memory. diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c index 84ab4a83cbd686..db94d14a3807f5 100644 --- a/drivers/mtd/nand/raw/atmel/nand-controller.c +++ b/drivers/mtd/nand/raw/atmel/nand-controller.c @@ -1377,14 +1377,24 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand, if (ret) return ret; + /* + * Read setup timing depends on the operation done on the NAND: + * + * NRD_SETUP = max(tAR, tCLR) + */ + timeps = max(conf->timings.sdr.tAR_min, conf->timings.sdr.tCLR_min); + ncycles = DIV_ROUND_UP(timeps, mckperiodps); + totalcycles += ncycles; + ret = atmel_smc_cs_conf_set_setup(smcconf, ATMEL_SMC_NRD_SHIFT, ncycles); + if (ret) + return ret; + /* * The read cycle timing is directly matching tRC, but is also * dependent on the setup and hold timings we calculated earlier, * which gives: * - * NRD_CYCLE = max(tRC, NRD_PULSE + NRD_HOLD) - * - * NRD_SETUP is always 0. + * NRD_CYCLE = max(tRC, NRD_SETUP + NRD_PULSE + NRD_HOLD) */ ncycles = DIV_ROUND_UP(conf->timings.sdr.tRC_min, mckperiodps); ncycles = max(totalcycles, ncycles); diff --git a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c index c23b537948d5e6..1a285cd8fad62a 100644 --- a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c +++ b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c @@ -935,10 +935,10 @@ static void ma35_chips_cleanup(struct ma35_nand_info *nand) static int ma35_nand_chips_init(struct device *dev, struct ma35_nand_info *nand) { - struct device_node *np = dev->of_node, *nand_np; + struct device_node *np = dev->of_node; int ret; - for_each_child_of_node(np, nand_np) { + for_each_child_of_node_scoped(np, nand_np) { ret = ma35_nand_chip_init(dev, nand, nand_np); if (ret) { ma35_chips_cleanup(nand); diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index a960403081f110..d957327fb4fa04 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -272,6 +272,7 @@ struct stm32_fmc2_nfc { struct sg_table dma_data_sg; struct sg_table dma_ecc_sg; u8 *ecc_buf; + dma_addr_t dma_ecc_addr; int dma_ecc_len; u32 tx_dma_max_burst; u32 rx_dma_max_burst; @@ -902,17 +903,10 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) { /* Configure DMA ECC status */ - p = nfc->ecc_buf; for_each_sg(nfc->dma_ecc_sg.sgl, sg, eccsteps, s) { - sg_set_buf(sg, p, nfc->dma_ecc_len); - p += nfc->dma_ecc_len; - } - - ret = dma_map_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - if (!ret) { - ret = -EIO; - goto err_unmap_data; + sg_dma_address(sg) = nfc->dma_ecc_addr + + s * nfc->dma_ecc_len; + sg_dma_len(sg) = nfc->dma_ecc_len; } desc_ecc = dmaengine_prep_slave_sg(nfc->dma_ecc_ch, @@ -921,7 +915,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, DMA_PREP_INTERRUPT); if (!desc_ecc) { ret = -ENOMEM; - goto err_unmap_ecc; + goto err_unmap_data; } 
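/*
 * The ECC status buffer is now allocated once with dmam_alloc_coherent(),
 * so the loop above can point each scatterlist entry directly at the
 * known device address (nfc->dma_ecc_addr) instead of remapping the
 * buffer with dma_map_sg() on every transfer; that is also why the
 * err_unmap_ecc unwind label disappears. The allocation side, as changed
 * later in this patch:
 *
 *	nfc->ecc_buf = dmam_alloc_coherent(nfc->dev, FMC2_MAX_ECC_BUF_LEN,
 *					   &nfc->dma_ecc_addr, GFP_KERNEL);
 *	if (!nfc->ecc_buf)
 *		return -ENOMEM;
 *
 * Being managed (dmam_*), the coherent buffer is freed automatically when
 * the device detaches.
 */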
reinit_completion(&nfc->dma_ecc_complete); @@ -929,7 +923,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, desc_ecc->callback_param = &nfc->dma_ecc_complete; ret = dma_submit_error(dmaengine_submit(desc_ecc)); if (ret) - goto err_unmap_ecc; + goto err_unmap_data; dma_async_issue_pending(nfc->dma_ecc_ch); } @@ -949,7 +943,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) dmaengine_terminate_all(nfc->dma_ecc_ch); ret = -ETIMEDOUT; - goto err_unmap_ecc; + goto err_unmap_data; } /* Wait DMA data transfer completion */ @@ -969,11 +963,6 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, } } -err_unmap_ecc: - if (!write_data && !raw) - dma_unmap_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - err_unmap_data: dma_unmap_sg(nfc->dev, nfc->dma_data_sg.sgl, eccsteps, dma_data_dir); @@ -996,9 +985,21 @@ static int stm32_fmc2_nfc_seq_write(struct nand_chip *chip, const u8 *buf, /* Write oob */ if (oob_required) { - ret = nand_change_write_column_op(chip, mtd->writesize, - chip->oob_poi, mtd->oobsize, - false); + unsigned int offset_in_page = mtd->writesize; + const void *buf = chip->oob_poi; + unsigned int len = mtd->oobsize; + + if (!raw) { + struct mtd_oob_region oob_free; + + mtd_ooblayout_free(mtd, 0, &oob_free); + offset_in_page += oob_free.offset; + buf += oob_free.offset; + len = oob_free.length; + } + + ret = nand_change_write_column_op(chip, offset_in_page, + buf, len, false); if (ret) return ret; } @@ -1610,7 +1611,8 @@ static int stm32_fmc2_nfc_dma_setup(struct stm32_fmc2_nfc *nfc) return ret; /* Allocate a buffer to store ECC status registers */ - nfc->ecc_buf = devm_kzalloc(nfc->dev, FMC2_MAX_ECC_BUF_LEN, GFP_KERNEL); + nfc->ecc_buf = dmam_alloc_coherent(nfc->dev, FMC2_MAX_ECC_BUF_LEN, + &nfc->dma_ecc_addr, GFP_KERNEL); if (!nfc->ecc_buf) return -ENOMEM; diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 87053389a1fc7a..4870b2d5edb2a1 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -176,6 +176,36 @@ static const struct mtd_ooblayout_ops w25n02kv_ooblayout = { .free = w25n02kv_ooblayout_free, }; +static int w25n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section) + 12; + region->length = 4; + + return 0; +} + +static int w25n01jw_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section); + region->length = 12; + + /* Extract BBM */ + if (!section) { + region->offset += 2; + region->length -= 2; + } + + return 0; +} + static int w35n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *region) { @@ -206,6 +236,11 @@ static int w35n01jw_ooblayout_free(struct mtd_info *mtd, int section, return 0; } +static const struct mtd_ooblayout_ops w25n01jw_ooblayout = { + .ecc = w25n01jw_ooblayout_ecc, + .free = w25n01jw_ooblayout_free, +}; + static const struct mtd_ooblayout_ops w35n01jw_ooblayout = { .ecc = w35n01jw_ooblayout_ecc, .free = w35n01jw_ooblayout_free, @@ -394,7 +429,7 @@ static const struct spinand_info winbond_spinand_table[] = { &write_cache_variants, &update_cache_variants), 0, - SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL), + SPINAND_ECCINFO(&w25n01jw_ooblayout, NULL), SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)), SPINAND_INFO("W25N01KV", /* 3.3V */ 
SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xae, 0x21), diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index b29628d46be9b3..ac12eaf11755dd 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,24 +76,11 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 - depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE - select CRYPTO select CRYPTO_LIB_CURVE25519 select CRYPTO_LIB_CHACHA20POLY1305 - select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT - select CRYPTO_POLY1305_X86_64 if X86 && 64BIT - select CRYPTO_BLAKE2S_X86 if X86 && 64BIT - select CRYPTO_CURVE25519_X86 if X86 && 64BIT - select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON) - select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON - select CRYPTO_POLY1305_ARM if ARM - select CRYPTO_BLAKE2S_ARM if ARM - select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON - select CRYPTO_CHACHA_MIPS if CPU_MIPS32_R2 - select CRYPTO_POLY1305_MIPS if MIPS - select CRYPTO_CHACHA_S390 if S390 + select CRYPTO_LIB_UTILS help WireGuard is a secure, fast, and easy to use replacement for IPSec that uses modern cryptography and clever networking tricks. It's diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c8871092..57be04f6cb11a8 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2132,6 +2132,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + bond_has_slaves(bond) && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. 
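/*
 * The bond_has_slaves() test added above narrows the fail_over_mac=follow
 * special case: a newly enslaved device only gets a random MAC when its
 * address collides with the bond's address and the bond already has at
 * least one slave. The first slave normally shares the bond's MAC (the
 * bond inherits it), so randomizing that one would be wrong.
 */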
@@ -3355,7 +3356,6 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave) /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; - fl6.flowi6_oif = bond->dev->ifindex; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 64e664f5adcc4a..87c134bcd48db5 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -861,7 +861,6 @@ static int rcar_can_resume(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct rcar_can_priv *priv = netdev_priv(ndev); - u16 ctlr; int err; if (!netif_running(ndev)) @@ -873,12 +872,7 @@ static int rcar_can_resume(struct device *dev) return err; } - ctlr = readw(&priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_SLPM; - writew(ctlr, &priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_CANM; - writew(ctlr, &priv->regs->ctlr); - priv->can.state = CAN_STATE_ERROR_ACTIVE; + rcar_can_start(ndev); netif_device_attach(ndev); netif_start_queue(ndev); diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index b3c8c592fb0e04..7e8b1d2f1af651 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -823,9 +823,6 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) /* Reset Global error flags */ rcar_canfd_write(gpriv->base, RCANFD_GERFL, 0x0); - /* Set the controller into appropriate mode */ - rcar_canfd_set_mode(gpriv); - /* Transition all Channels to reset mode */ for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) { rcar_canfd_clear_bit(gpriv->base, @@ -844,6 +841,10 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) return err; } } + + /* Set the controller into appropriate mode */ + rcar_canfd_set_mode(gpriv); + return 0; } diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 09ae218315d73d..963ea8510dd9bf 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -545,8 +545,6 @@ static int hi3110_stop(struct net_device *net) priv->force_quit = 1; free_irq(spi->irq, priv); - destroy_workqueue(priv->wq); - priv->wq = NULL; mutex_lock(&priv->hi3110_lock); @@ -770,34 +768,23 @@ static int hi3110_open(struct net_device *net) goto out_close; } - priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, - 0); - if (!priv->wq) { - ret = -ENOMEM; - goto out_free_irq; - } - INIT_WORK(&priv->tx_work, hi3110_tx_work_handler); - INIT_WORK(&priv->restart_work, hi3110_restart_work_handler); - ret = hi3110_hw_reset(spi); if (ret) - goto out_free_wq; + goto out_free_irq; ret = hi3110_setup(net); if (ret) - goto out_free_wq; + goto out_free_irq; ret = hi3110_set_normal_mode(spi); if (ret) - goto out_free_wq; + goto out_free_irq; netif_wake_queue(net); mutex_unlock(&priv->hi3110_lock); return 0; - out_free_wq: - destroy_workqueue(priv->wq); out_free_irq: free_irq(spi->irq, priv); hi3110_hw_sleep(spi); @@ -812,6 +799,7 @@ static const struct net_device_ops hi3110_netdev_ops = { .ndo_open = hi3110_open, .ndo_stop = hi3110_stop, .ndo_start_xmit = hi3110_hard_start_xmit, + .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops hi3110_ethtool_ops = { @@ -908,6 +896,15 @@ static int hi3110_can_probe(struct spi_device *spi) if (ret) goto out_clk; + priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, + 0); + if (!priv->wq) { + ret = -ENOMEM; + goto out_clk; + 
} + INIT_WORK(&priv->tx_work, hi3110_tx_work_handler); + INIT_WORK(&priv->restart_work, hi3110_restart_work_handler); + priv->spi = spi; mutex_init(&priv->hi3110_lock); @@ -943,6 +940,8 @@ static int hi3110_can_probe(struct spi_device *spi) return 0; error_probe: + destroy_workqueue(priv->wq); + priv->wq = NULL; hi3110_power_enable(priv->power, 0); out_clk: @@ -963,6 +962,9 @@ static void hi3110_can_remove(struct spi_device *spi) hi3110_power_enable(priv->power, 0); + destroy_workqueue(priv->wq); + priv->wq = NULL; + clk_disable_unprepare(priv->clk); free_candev(net); diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 6fcb301ef611d0..53bfd873de9bde 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -768,6 +768,7 @@ static const struct net_device_ops sun4ican_netdev_ops = { .ndo_open = sun4ican_open, .ndo_stop = sun4ican_close, .ndo_start_xmit = sun4ican_start_xmit, + .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops sun4ican_ethtool_ops = { diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig index a7547a83120e84..cf65a90816b9e4 100644 --- a/drivers/net/can/usb/Kconfig +++ b/drivers/net/can/usb/Kconfig @@ -134,6 +134,17 @@ config CAN_MCBA_USB This driver supports the CAN BUS Analyzer interface from Microchip (http://www.microchip.com/development-tools/). +config CAN_NCT6694 + tristate "Nuvoton NCT6694 Socket CANfd support" + depends on MFD_NCT6694 + select CAN_RX_OFFLOAD + help + If you say yes to this option, support will be included for Nuvoton + NCT6694, a USB device to socket CANfd controller. + + This driver can also be built as a module. If so, the module will + be called nct6694_canfd. + config CAN_PEAK_USB tristate "PEAK PCAN-USB/USB Pro interfaces for CAN 2.0b/CAN-FD" help diff --git a/drivers/net/can/usb/Makefile b/drivers/net/can/usb/Makefile index 8b11088e9a5956..fcafb1ac262eaf 100644 --- a/drivers/net/can/usb/Makefile +++ b/drivers/net/can/usb/Makefile @@ -11,5 +11,6 @@ obj-$(CONFIG_CAN_F81604) += f81604.o obj-$(CONFIG_CAN_GS_USB) += gs_usb.o obj-$(CONFIG_CAN_KVASER_USB) += kvaser_usb/ obj-$(CONFIG_CAN_MCBA_USB) += mcba_usb.o +obj-$(CONFIG_CAN_NCT6694) += nct6694_canfd.o obj-$(CONFIG_CAN_PEAK_USB) += peak_usb/ obj-$(CONFIG_CAN_UCAN) += ucan.o diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index db1acf6d504cf3..adc91873c083f9 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -7,7 +7,7 @@ * * Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved. * Copyright (c) 2020 ETAS K.K.. All rights reserved. 
- * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2025 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
 */

#include <linux/unaligned.h>

@@ -1977,6 +1977,7 @@ static const struct net_device_ops es58x_netdev_ops = {
	.ndo_stop = es58x_stop,
	.ndo_start_xmit = es58x_start_xmit,
	.ndo_eth_ioctl = can_eth_ioctl_hwts,
+	.ndo_change_mtu = can_change_mtu,
};

static const struct ethtool_ops es58x_ethtool_ops = {
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index 41c0a1c399bf36..1f9b915094e64d 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -761,6 +761,7 @@ static const struct net_device_ops mcba_netdev_ops = {
	.ndo_open = mcba_usb_open,
	.ndo_stop = mcba_usb_close,
	.ndo_start_xmit = mcba_usb_start_xmit,
+	.ndo_change_mtu = can_change_mtu,
};

static const struct ethtool_ops mcba_ethtool_ops = {
diff --git a/drivers/net/can/usb/nct6694_canfd.c b/drivers/net/can/usb/nct6694_canfd.c
new file mode 100644
index 00000000000000..8deff16491a1a3
--- /dev/null
+++ b/drivers/net/can/usb/nct6694_canfd.c
@@ -0,0 +1,832 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Nuvoton NCT6694 Socket CANfd driver based on USB interface.
+ *
+ * Copyright (C) 2025 Nuvoton Technology Corp.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/can/dev.h>
+#include <linux/can/rx-offload.h>
+#include <linux/ethtool.h>
+#include <linux/idr.h>
+#include <linux/irqdomain.h>
+#include <linux/mfd/nct6694.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#define DEVICE_NAME "nct6694-canfd"
+
+/* USB command module type for NCT6694 CANfd controller.
+ * This defines the module type used for communication with the NCT6694
+ * CANfd controller over the USB interface.
+ */
+#define NCT6694_CANFD_MOD			0x05
+
+/* Command 00h - CAN Setting and Initialization */
+#define NCT6694_CANFD_SETTING			0x00
+#define NCT6694_CANFD_SETTING_ACTIVE_CTRL1	BIT(0)
+#define NCT6694_CANFD_SETTING_ACTIVE_CTRL2	BIT(1)
+#define NCT6694_CANFD_SETTING_ACTIVE_NBTP_DBTP	BIT(2)
+#define NCT6694_CANFD_SETTING_CTRL1_MON		BIT(0)
+#define NCT6694_CANFD_SETTING_CTRL1_NISO	BIT(1)
+#define NCT6694_CANFD_SETTING_CTRL1_LBCK	BIT(2)
+#define NCT6694_CANFD_SETTING_NBTP_NTSEG2	GENMASK(6, 0)
+#define NCT6694_CANFD_SETTING_NBTP_NTSEG1	GENMASK(15, 8)
+#define NCT6694_CANFD_SETTING_NBTP_NBRP		GENMASK(24, 16)
+#define NCT6694_CANFD_SETTING_NBTP_NSJW		GENMASK(31, 25)
+#define NCT6694_CANFD_SETTING_DBTP_DSJW		GENMASK(3, 0)
+#define NCT6694_CANFD_SETTING_DBTP_DTSEG2	GENMASK(7, 4)
+#define NCT6694_CANFD_SETTING_DBTP_DTSEG1	GENMASK(12, 8)
+#define NCT6694_CANFD_SETTING_DBTP_DBRP		GENMASK(20, 16)
+#define NCT6694_CANFD_SETTING_DBTP_TDC		BIT(23)
+
+/* Command 01h - CAN Information */
+#define NCT6694_CANFD_INFORMATION		0x01
+#define NCT6694_CANFD_INFORMATION_SEL		0x00
+
+/* Command 02h - CAN Event */
+#define NCT6694_CANFD_EVENT			0x02
+#define NCT6694_CANFD_EVENT_SEL(idx, mask)	\
+	((idx ? 0x80 : 0x00) | ((mask) & 0x7F))
+
+#define NCT6694_CANFD_EVENT_MASK		GENMASK(5, 0)
+#define NCT6694_CANFD_EVT_TX_FIFO_EMPTY		BIT(7)	/* Read-clear */
+#define NCT6694_CANFD_EVT_RX_DATA_LOST		BIT(5)	/* Read-clear */
+#define NCT6694_CANFD_EVT_RX_DATA_IN		BIT(7)	/* Read-clear */
+
+/* Command 10h - CAN Deliver */
+#define NCT6694_CANFD_DELIVER			0x10
+#define NCT6694_CANFD_DELIVER_SEL(buf_cnt)	\
+	((buf_cnt) & 0xFF)
+
+/* Command 11h - CAN Receive */
+#define NCT6694_CANFD_RECEIVE			0x11
+#define NCT6694_CANFD_RECEIVE_SEL(idx, buf_cnt)	\
+	((idx ?
0x80 : 0x00) | ((buf_cnt) & 0x7F)) + +#define NCT6694_CANFD_FRAME_TAG(idx) (0xC0 | (idx)) +#define NCT6694_CANFD_FRAME_FLAG_EFF BIT(0) +#define NCT6694_CANFD_FRAME_FLAG_RTR BIT(1) +#define NCT6694_CANFD_FRAME_FLAG_FD BIT(2) +#define NCT6694_CANFD_FRAME_FLAG_BRS BIT(3) +#define NCT6694_CANFD_FRAME_FLAG_ERR BIT(4) + +#define NCT6694_NAPI_WEIGHT 32 + +enum nct6694_event_err { + NCT6694_CANFD_EVT_ERR_NO_ERROR = 0, + NCT6694_CANFD_EVT_ERR_CRC_ERROR, + NCT6694_CANFD_EVT_ERR_STUFF_ERROR, + NCT6694_CANFD_EVT_ERR_ACK_ERROR, + NCT6694_CANFD_EVT_ERR_FORM_ERROR, + NCT6694_CANFD_EVT_ERR_BIT_ERROR, + NCT6694_CANFD_EVT_ERR_TIMEOUT_ERROR, + NCT6694_CANFD_EVT_ERR_UNKNOWN_ERROR, +}; + +enum nct6694_event_status { + NCT6694_CANFD_EVT_STS_ERROR_ACTIVE = 0, + NCT6694_CANFD_EVT_STS_ERROR_PASSIVE, + NCT6694_CANFD_EVT_STS_BUS_OFF, + NCT6694_CANFD_EVT_STS_WARNING, +}; + +struct __packed nct6694_canfd_setting { + __le32 nbr; + __le32 dbr; + u8 active; + u8 reserved[3]; + __le16 ctrl1; + __le16 ctrl2; + __le32 nbtp; + __le32 dbtp; +}; + +struct __packed nct6694_canfd_information { + u8 tx_fifo_cnt; + u8 rx_fifo_cnt; + u8 reserved[2]; + __le32 can_clk; +}; + +struct __packed nct6694_canfd_event { + u8 err; + u8 status; + u8 tx_evt; + u8 rx_evt; + u8 rec; + u8 tec; + u8 reserved[2]; +}; + +struct __packed nct6694_canfd_frame { + u8 tag; + u8 flag; + u8 reserved; + u8 length; + __le32 id; + u8 data[CANFD_MAX_DLEN]; +}; + +struct nct6694_canfd_priv { + struct can_priv can; /* must be the first member */ + struct can_rx_offload offload; + struct net_device *ndev; + struct nct6694 *nct6694; + struct workqueue_struct *wq; + struct work_struct tx_work; + struct nct6694_canfd_frame tx; + struct nct6694_canfd_frame rx; + struct nct6694_canfd_event event[2]; + struct can_berr_counter bec; +}; + +static inline struct nct6694_canfd_priv *rx_offload_to_priv(struct can_rx_offload *offload) +{ + return container_of(offload, struct nct6694_canfd_priv, offload); +} + +static const struct can_bittiming_const nct6694_canfd_bittiming_nominal_const = { + .name = DEVICE_NAME, + .tseg1_min = 1, + .tseg1_max = 256, + .tseg2_min = 1, + .tseg2_max = 128, + .sjw_max = 128, + .brp_min = 1, + .brp_max = 512, + .brp_inc = 1, +}; + +static const struct can_bittiming_const nct6694_canfd_bittiming_data_const = { + .name = DEVICE_NAME, + .tseg1_min = 1, + .tseg1_max = 32, + .tseg2_min = 1, + .tseg2_max = 16, + .sjw_max = 16, + .brp_min = 1, + .brp_max = 32, + .brp_inc = 1, +}; + +static void nct6694_canfd_rx_offload(struct can_rx_offload *offload, + struct sk_buff *skb) +{ + struct nct6694_canfd_priv *priv = rx_offload_to_priv(offload); + int ret; + + ret = can_rx_offload_queue_tail(offload, skb); + if (ret) + priv->ndev->stats.rx_fifo_errors++; +} + +static void nct6694_canfd_handle_lost_msg(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + + netdev_dbg(ndev, "RX FIFO overflow, message(s) lost.\n"); + + stats->rx_errors++; + stats->rx_over_errors++; + + skb = alloc_can_err_skb(ndev, &cf); + if (!skb) + return; + + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; + + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_rx(struct net_device *ndev, u8 rx_evt) +{ + struct net_device_stats *stats = &ndev->stats; + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_frame *frame = &priv->rx; + const struct nct6694_cmd_header cmd_hd = { + .mod = 
NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_RECEIVE, + .sel = NCT6694_CANFD_RECEIVE_SEL(ndev->dev_port, 1), + .len = cpu_to_le16(sizeof(*frame)) + }; + struct sk_buff *skb; + int ret; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, frame); + if (ret) + return; + + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_FD) { + struct canfd_frame *cfd; + + skb = alloc_canfd_skb(priv->ndev, &cfd); + if (!skb) { + stats->rx_dropped++; + return; + } + + cfd->can_id = le32_to_cpu(frame->id); + cfd->len = canfd_sanitize_len(frame->length); + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_EFF) + cfd->can_id |= CAN_EFF_FLAG; + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_BRS) + cfd->flags |= CANFD_BRS; + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_ERR) + cfd->flags |= CANFD_ESI; + + memcpy(cfd->data, frame->data, cfd->len); + } else { + struct can_frame *cf; + + skb = alloc_can_skb(priv->ndev, &cf); + if (!skb) { + stats->rx_dropped++; + return; + } + + cf->can_id = le32_to_cpu(frame->id); + cf->len = can_cc_dlc2len(frame->length); + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_EFF) + cf->can_id |= CAN_EFF_FLAG; + + if (frame->flag & NCT6694_CANFD_FRAME_FLAG_RTR) + cf->can_id |= CAN_RTR_FLAG; + else + memcpy(cf->data, frame->data, cf->len); + } + + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static int nct6694_canfd_get_berr_counter(const struct net_device *ndev, + struct can_berr_counter *bec) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + *bec = priv->bec; + + return 0; +} + +static void nct6694_canfd_handle_state_change(struct net_device *ndev, u8 status) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + enum can_state new_state, rx_state, tx_state; + struct can_berr_counter bec; + struct can_frame *cf; + struct sk_buff *skb; + + nct6694_canfd_get_berr_counter(ndev, &bec); + can_state_get_by_berr_counter(ndev, &bec, &tx_state, &rx_state); + + new_state = max(tx_state, rx_state); + + /* state hasn't changed */ + if (new_state == priv->can.state) + return; + + skb = alloc_can_err_skb(ndev, &cf); + + can_change_state(ndev, cf, tx_state, rx_state); + + if (new_state == CAN_STATE_BUS_OFF) { + can_bus_off(ndev); + } else if (cf) { + cf->can_id |= CAN_ERR_CNT; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + } + + if (skb) + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_bus_err(struct net_device *ndev, u8 bus_err) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct can_frame *cf; + struct sk_buff *skb; + + priv->can.can_stats.bus_error++; + + skb = alloc_can_err_skb(ndev, &cf); + if (cf) + cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; + + switch (bus_err) { + case NCT6694_CANFD_EVT_ERR_CRC_ERROR: + netdev_dbg(ndev, "CRC error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[3] |= CAN_ERR_PROT_LOC_CRC_SEQ; + break; + + case NCT6694_CANFD_EVT_ERR_STUFF_ERROR: + netdev_dbg(ndev, "Stuff error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_STUFF; + break; + + case NCT6694_CANFD_EVT_ERR_ACK_ERROR: + netdev_dbg(ndev, "Ack error\n"); + ndev->stats.tx_errors++; + if (cf) { + cf->can_id |= CAN_ERR_ACK; + cf->data[2] |= CAN_ERR_PROT_TX; + } + break; + + case NCT6694_CANFD_EVT_ERR_FORM_ERROR: + netdev_dbg(ndev, "Form error\n"); + ndev->stats.rx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_FORM; + break; + + case NCT6694_CANFD_EVT_ERR_BIT_ERROR: + netdev_dbg(ndev, "Bit error\n"); + ndev->stats.tx_errors++; + if (cf) + cf->data[2] |= CAN_ERR_PROT_TX | CAN_ERR_PROT_BIT; + break; + + default: + break; + } + + 
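+	/* Populate the standard CAN error frame layout: data[2] carries the
+	 * CAN_ERR_PROT_* protocol violation flags and data[3] the
+	 * CAN_ERR_PROT_LOC_* location, as set in the switch above. The error
+	 * frame then takes the same rx_offload path as regular frames so it
+	 * reaches userspace in order with the surrounding traffic.
+	 */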
if (skb) + nct6694_canfd_rx_offload(&priv->offload, skb); +} + +static void nct6694_canfd_handle_tx(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + + stats->tx_bytes += can_rx_offload_get_echo_skb_queue_tail(&priv->offload, + 0, NULL); + stats->tx_packets++; + netif_wake_queue(ndev); +} + +static irqreturn_t nct6694_canfd_irq(int irq, void *data) +{ + struct net_device *ndev = data; + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_event *event = &priv->event[ndev->dev_port]; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_EVENT, + .sel = NCT6694_CANFD_EVENT_SEL(ndev->dev_port, NCT6694_CANFD_EVENT_MASK), + .len = cpu_to_le16(sizeof(priv->event)) + }; + irqreturn_t handled = IRQ_NONE; + int ret; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, priv->event); + if (ret < 0) + return handled; + + if (event->rx_evt & NCT6694_CANFD_EVT_RX_DATA_IN) { + nct6694_canfd_handle_rx(ndev, event->rx_evt); + handled = IRQ_HANDLED; + } + + if (event->rx_evt & NCT6694_CANFD_EVT_RX_DATA_LOST) { + nct6694_canfd_handle_lost_msg(ndev); + handled = IRQ_HANDLED; + } + + if (event->status) { + nct6694_canfd_handle_state_change(ndev, event->status); + handled = IRQ_HANDLED; + } + + if (event->err != NCT6694_CANFD_EVT_ERR_NO_ERROR) { + if (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + nct6694_canfd_handle_bus_err(ndev, event->err); + handled = IRQ_HANDLED; + } + + if (event->tx_evt & NCT6694_CANFD_EVT_TX_FIFO_EMPTY) { + nct6694_canfd_handle_tx(ndev); + handled = IRQ_HANDLED; + } + + if (handled) + can_rx_offload_threaded_irq_finish(&priv->offload); + + priv->bec.rxerr = event->rec; + priv->bec.txerr = event->tec; + + return handled; +} + +static void nct6694_canfd_tx_work(struct work_struct *work) +{ + struct nct6694_canfd_priv *priv = container_of(work, + struct nct6694_canfd_priv, + tx_work); + struct nct6694_canfd_frame *frame = &priv->tx; + struct net_device *ndev = priv->ndev; + struct net_device_stats *stats = &ndev->stats; + struct sk_buff *skb = priv->can.echo_skb[0]; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_DELIVER, + .sel = NCT6694_CANFD_DELIVER_SEL(1), + .len = cpu_to_le16(sizeof(*frame)) + }; + u32 txid; + int err; + + memset(frame, 0, sizeof(*frame)); + + frame->tag = NCT6694_CANFD_FRAME_TAG(ndev->dev_port); + + if (can_is_canfd_skb(skb)) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + if (cfd->flags & CANFD_BRS) + frame->flag |= NCT6694_CANFD_FRAME_FLAG_BRS; + + if (cfd->can_id & CAN_EFF_FLAG) { + txid = cfd->can_id & CAN_EFF_MASK; + frame->flag |= NCT6694_CANFD_FRAME_FLAG_EFF; + } else { + txid = cfd->can_id & CAN_SFF_MASK; + } + frame->flag |= NCT6694_CANFD_FRAME_FLAG_FD; + frame->id = cpu_to_le32(txid); + frame->length = canfd_sanitize_len(cfd->len); + + memcpy(frame->data, cfd->data, frame->length); + } else { + struct can_frame *cf = (struct can_frame *)skb->data; + + if (cf->can_id & CAN_EFF_FLAG) { + txid = cf->can_id & CAN_EFF_MASK; + frame->flag |= NCT6694_CANFD_FRAME_FLAG_EFF; + } else { + txid = cf->can_id & CAN_SFF_MASK; + } + + if (cf->can_id & CAN_RTR_FLAG) + frame->flag |= NCT6694_CANFD_FRAME_FLAG_RTR; + else + memcpy(frame->data, cf->data, cf->len); + + frame->id = cpu_to_le32(txid); + frame->length = cf->len; + } + + err = nct6694_write_msg(priv->nct6694, &cmd_hd, frame); + if (err) { + can_free_echo_skb(ndev, 0, NULL); + 
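+		/* The skb was stashed by can_put_echo_skb() in ndo_start_xmit;
+		 * drop it here so the single echo slot is not leaked when the
+		 * USB transfer fails. On success it is released by the TX-done
+		 * event through can_rx_offload_get_echo_skb_queue_tail(),
+		 * which also accounts the transmitted bytes.
+		 */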
stats->tx_dropped++; + stats->tx_errors++; + netif_wake_queue(ndev); + } +} + +static netdev_tx_t nct6694_canfd_start_xmit(struct sk_buff *skb, + struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + if (can_dev_dropped_skb(ndev, skb)) + return NETDEV_TX_OK; + + netif_stop_queue(ndev); + can_put_echo_skb(skb, ndev, 0, 0); + queue_work(priv->wq, &priv->tx_work); + + return NETDEV_TX_OK; +} + +static int nct6694_canfd_start(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + const struct can_bittiming *n_bt = &priv->can.bittiming; + const struct can_bittiming *d_bt = &priv->can.fd.data_bittiming; + struct nct6694_canfd_setting *setting __free(kfree) = NULL; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_SETTING, + .sel = ndev->dev_port, + .len = cpu_to_le16(sizeof(*setting)) + }; + u32 en_tdc; + int ret; + + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) + return -ENOMEM; + + if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_MON); + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD_NON_ISO) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_NISO); + + if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) + setting->ctrl1 |= cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_LBCK); + + /* Disable clock divider */ + setting->ctrl2 = 0; + + setting->nbtp = cpu_to_le32(FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NSJW, + n_bt->sjw - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NBRP, + n_bt->brp - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NTSEG2, + n_bt->phase_seg2 - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_NBTP_NTSEG1, + n_bt->prop_seg + n_bt->phase_seg1 - 1)); + + if (d_bt->brp <= 2) + en_tdc = NCT6694_CANFD_SETTING_DBTP_TDC; + else + en_tdc = 0; + + setting->dbtp = cpu_to_le32(FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DSJW, + d_bt->sjw - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DBRP, + d_bt->brp - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DTSEG2, + d_bt->phase_seg2 - 1) | + FIELD_PREP(NCT6694_CANFD_SETTING_DBTP_DTSEG1, + d_bt->prop_seg + d_bt->phase_seg1 - 1) | + en_tdc); + + setting->active = NCT6694_CANFD_SETTING_ACTIVE_CTRL1 | + NCT6694_CANFD_SETTING_ACTIVE_CTRL2 | + NCT6694_CANFD_SETTING_ACTIVE_NBTP_DBTP; + + ret = nct6694_write_msg(priv->nct6694, &cmd_hd, setting); + if (ret) + return ret; + + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + return 0; +} + +static void nct6694_canfd_stop(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + struct nct6694_canfd_setting *setting __free(kfree) = NULL; + const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_SETTING, + .sel = ndev->dev_port, + .len = cpu_to_le16(sizeof(*setting)) + }; + + /* The NCT6694 cannot be stopped. To ensure safe operation and avoid + * interference, the control mode is set to Listen-Only mode. This + * mode allows the device to monitor bus activity without actively + * participating in communication. 
+ */ + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) + return; + + nct6694_read_msg(priv->nct6694, &cmd_hd, setting); + setting->ctrl1 = cpu_to_le16(NCT6694_CANFD_SETTING_CTRL1_MON); + setting->active = NCT6694_CANFD_SETTING_ACTIVE_CTRL1; + nct6694_write_msg(priv->nct6694, &cmd_hd, setting); + + priv->can.state = CAN_STATE_STOPPED; +} + +static int nct6694_canfd_close(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + + netif_stop_queue(ndev); + nct6694_canfd_stop(ndev); + destroy_workqueue(priv->wq); + free_irq(ndev->irq, ndev); + can_rx_offload_disable(&priv->offload); + close_candev(ndev); + return 0; +} + +static int nct6694_canfd_set_mode(struct net_device *ndev, enum can_mode mode) +{ + int ret; + + switch (mode) { + case CAN_MODE_START: + ret = nct6694_canfd_start(ndev); + if (ret) + return ret; + + netif_wake_queue(ndev); + break; + + default: + return -EOPNOTSUPP; + } + + return ret; +} + +static int nct6694_canfd_open(struct net_device *ndev) +{ + struct nct6694_canfd_priv *priv = netdev_priv(ndev); + int ret; + + ret = open_candev(ndev); + if (ret) + return ret; + + can_rx_offload_enable(&priv->offload); + + ret = request_threaded_irq(ndev->irq, NULL, + nct6694_canfd_irq, IRQF_ONESHOT, + "nct6694_canfd", ndev); + if (ret) { + netdev_err(ndev, "Failed to request IRQ\n"); + goto can_rx_offload_disable; + } + + priv->wq = alloc_ordered_workqueue("%s-nct6694_wq", + WQ_FREEZABLE | WQ_MEM_RECLAIM, + ndev->name); + if (!priv->wq) { + ret = -ENOMEM; + goto free_irq; + } + + ret = nct6694_canfd_start(ndev); + if (ret) + goto destroy_wq; + + netif_start_queue(ndev); + + return 0; + +destroy_wq: + destroy_workqueue(priv->wq); +free_irq: + free_irq(ndev->irq, ndev); +can_rx_offload_disable: + can_rx_offload_disable(&priv->offload); + close_candev(ndev); + return ret; +} + +static const struct net_device_ops nct6694_canfd_netdev_ops = { + .ndo_open = nct6694_canfd_open, + .ndo_stop = nct6694_canfd_close, + .ndo_start_xmit = nct6694_canfd_start_xmit, + .ndo_change_mtu = can_change_mtu, +}; + +static const struct ethtool_ops nct6694_canfd_ethtool_ops = { + .get_ts_info = ethtool_op_get_ts_info, +}; + +static int nct6694_canfd_get_clock(struct nct6694_canfd_priv *priv) +{ + struct nct6694_canfd_information *info __free(kfree) = NULL; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_CANFD_MOD, + .cmd = NCT6694_CANFD_INFORMATION, + .sel = NCT6694_CANFD_INFORMATION_SEL, + .len = cpu_to_le16(sizeof(*info)) + }; + int ret; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + ret = nct6694_read_msg(priv->nct6694, &cmd_hd, info); + if (ret) + return ret; + + return le32_to_cpu(info->can_clk); +} + +static int nct6694_canfd_probe(struct platform_device *pdev) +{ + struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent); + struct nct6694_canfd_priv *priv; + struct net_device *ndev; + int port, irq, ret, can_clk; + + port = ida_alloc(&nct6694->canfd_ida, GFP_KERNEL); + if (port < 0) + return port; + + irq = irq_create_mapping(nct6694->domain, + NCT6694_IRQ_CAN0 + port); + if (!irq) { + ret = -EINVAL; + goto free_ida; + } + + ndev = alloc_candev(sizeof(struct nct6694_canfd_priv), 1); + if (!ndev) { + ret = -ENOMEM; + goto dispose_irq; + } + + ndev->irq = irq; + ndev->flags |= IFF_ECHO; + ndev->dev_port = port; + ndev->netdev_ops = &nct6694_canfd_netdev_ops; + ndev->ethtool_ops = &nct6694_canfd_ethtool_ops; + + priv = netdev_priv(ndev); + priv->nct6694 = nct6694; + priv->ndev = ndev; + + can_clk = 
nct6694_canfd_get_clock(priv); + if (can_clk < 0) { + ret = dev_err_probe(&pdev->dev, can_clk, + "Failed to get clock\n"); + goto free_candev; + } + + INIT_WORK(&priv->tx_work, nct6694_canfd_tx_work); + + priv->can.clock.freq = can_clk; + priv->can.bittiming_const = &nct6694_canfd_bittiming_nominal_const; + priv->can.fd.data_bittiming_const = &nct6694_canfd_bittiming_data_const; + priv->can.do_set_mode = nct6694_canfd_set_mode; + priv->can.do_get_berr_counter = nct6694_canfd_get_berr_counter; + priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | + CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_BERR_REPORTING | + CAN_CTRLMODE_FD_NON_ISO; + + ret = can_set_static_ctrlmode(ndev, CAN_CTRLMODE_FD); + if (ret) + goto free_candev; + + ret = can_rx_offload_add_manual(ndev, &priv->offload, + NCT6694_NAPI_WEIGHT); + if (ret) { + dev_err_probe(&pdev->dev, ret, "Failed to add rx_offload\n"); + goto free_candev; + } + + platform_set_drvdata(pdev, priv); + SET_NETDEV_DEV(priv->ndev, &pdev->dev); + + ret = register_candev(priv->ndev); + if (ret) + goto rx_offload_del; + + return 0; + +rx_offload_del: + can_rx_offload_del(&priv->offload); +free_candev: + free_candev(ndev); +dispose_irq: + irq_dispose_mapping(irq); +free_ida: + ida_free(&nct6694->canfd_ida, port); + return ret; +} + +static void nct6694_canfd_remove(struct platform_device *pdev) +{ + struct nct6694_canfd_priv *priv = platform_get_drvdata(pdev); + struct nct6694 *nct6694 = priv->nct6694; + struct net_device *ndev = priv->ndev; + int port = ndev->dev_port; + int irq = ndev->irq; + + unregister_candev(ndev); + can_rx_offload_del(&priv->offload); + free_candev(ndev); + irq_dispose_mapping(irq); + ida_free(&nct6694->canfd_ida, port); +} + +static struct platform_driver nct6694_canfd_driver = { + .driver = { + .name = DEVICE_NAME, + }, + .probe = nct6694_canfd_probe, + .remove = nct6694_canfd_remove, +}; + +module_platform_driver(nct6694_canfd_driver); + +MODULE_DESCRIPTION("USB-CAN FD driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 117637b9b995b9..dd5caa1c302b99 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -111,7 +111,7 @@ void peak_usb_update_ts_now(struct peak_time_ref *time_ref, u32 ts_now) u32 delta_ts = time_ref->ts_dev_2 - time_ref->ts_dev_1; if (time_ref->ts_dev_2 < time_ref->ts_dev_1) - delta_ts &= (1 << time_ref->adapter->ts_used_bits) - 1; + delta_ts &= (1ULL << time_ref->adapter->ts_used_bits) - 1; time_ref->ts_total += delta_ts; } diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 81baec8eb1e5da..a25a3ca62c12e3 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -690,14 +690,6 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, dlc |= XCAN_DLCR_EDL_MASK; } - if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && - (priv->devtype.flags & XCAN_FLAG_TXFEMP)) - can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); - else - can_put_echo_skb(skb, ndev, 0, 0); - - priv->tx_head++; - priv->write_reg(priv, XCAN_FRAME_ID_OFFSET(frame_offset), id); /* If the CAN frame is RTR frame this write triggers transmission * (not on CAN FD) @@ -730,6 +722,14 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, data[1]); } } + + if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && + (priv->devtype.flags & XCAN_FLAG_TXFEMP)) + 
can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); + else + can_put_echo_skb(skb, ndev, 0, 0); + + priv->tx_head++; } /** diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 829b1f087e9e0e..2f846381d5a762 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1273,9 +1273,15 @@ static int b53_setup(struct dsa_switch *ds) */ ds->untag_vlan_aware_bridge_pvid = true; - /* Ageing time is set in seconds */ - ds->ageing_time_min = 1 * 1000; - ds->ageing_time_max = AGE_TIME_MAX * 1000; + if (dev->chip_id == BCM53101_DEVICE_ID) { + /* BCM53101 uses 0.5 second increments */ + ds->ageing_time_min = 1 * 500; + ds->ageing_time_max = AGE_TIME_MAX * 500; + } else { + /* Everything else uses 1 second increments */ + ds->ageing_time_min = 1 * 1000; + ds->ageing_time_max = AGE_TIME_MAX * 1000; + } ret = b53_reset_switch(dev); if (ret) { @@ -2559,7 +2565,10 @@ int b53_set_ageing_time(struct dsa_switch *ds, unsigned int msecs) else reg = B53_AGING_TIME_CONTROL; - atc = DIV_ROUND_CLOSEST(msecs, 1000); + if (dev->chip_id == BCM53101_DEVICE_ID) + atc = DIV_ROUND_CLOSEST(msecs, 500); + else + atc = DIV_ROUND_CLOSEST(msecs, 1000); if (!is5325(dev) && !is5365(dev)) atc |= AGE_CHANGE; diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 6eb3140d404449..84dc6e517acf94 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -685,18 +685,27 @@ static int gswip_add_single_port_br(struct gswip_priv *priv, int port, bool add) return 0; } -static int gswip_port_enable(struct dsa_switch *ds, int port, - struct phy_device *phydev) +static int gswip_port_setup(struct dsa_switch *ds, int port) { struct gswip_priv *priv = ds->priv; int err; if (!dsa_is_cpu_port(ds, port)) { - u32 mdio_phy = 0; - err = gswip_add_single_port_br(priv, port, true); if (err) return err; + } + + return 0; +} + +static int gswip_port_enable(struct dsa_switch *ds, int port, + struct phy_device *phydev) +{ + struct gswip_priv *priv = ds->priv; + + if (!dsa_is_cpu_port(ds, port)) { + u32 mdio_phy = 0; if (phydev) mdio_phy = phydev->mdio.addr & GSWIP_MDIO_PHY_ADDR_MASK; @@ -1359,8 +1368,9 @@ static int gswip_port_fdb(struct dsa_switch *ds, int port, int i; int err; + /* Operation not supported on the CPU port, don't throw errors */ if (!bridge) - return -EINVAL; + return 0; for (i = max_ports; i < ARRAY_SIZE(priv->vlans); i++) { if (priv->vlans[i].bridge == bridge) { @@ -1829,6 +1839,7 @@ static const struct phylink_mac_ops gswip_phylink_mac_ops = { static const struct dsa_switch_ops gswip_xrx200_switch_ops = { .get_tag_protocol = gswip_get_tag_protocol, .setup = gswip_setup, + .port_setup = gswip_port_setup, .port_enable = gswip_port_enable, .port_disable = gswip_port_disable, .port_bridge_join = gswip_port_bridge_join, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index d72fd248f3aa92..2d66bf59cd64d6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -244,7 +244,7 @@ bnxt_tc_parse_pedit(struct bnxt *bp, struct bnxt_tc_actions *actions, offset < offset_of_ip6_daddr + 16) { actions->nat.src_xlate = false; idx = (offset - offset_of_ip6_daddr) / 4; - actions->nat.l3.ipv6.saddr.s6_addr32[idx] = htonl(val); + actions->nat.l3.ipv6.daddr.s6_addr32[idx] = htonl(val); } else { netdev_err(bp->dev, "%s: IPv6_hdr: Invalid pedit field\n", diff --git a/drivers/net/ethernet/broadcom/cnic.c 
b/drivers/net/ethernet/broadcom/cnic.c index a9040c42d2ff97..6e97a5a7daaf9c 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4230,8 +4230,7 @@ static void cnic_cm_stop_bnx2x_hw(struct cnic_dev *dev) cnic_bnx2x_delete_wait(dev, 0); - cancel_delayed_work(&cp->delete_task); - flush_workqueue(cnic_wq); + cancel_delayed_work_sync(&cp->delete_task); if (atomic_read(&cp->iscsi_conn) != 0) netdev_warn(dev->netdev, "%d iSCSI connections not destroyed\n", diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index de8a6ce86ad7e2..12105ffb5dac6d 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -126,7 +126,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, oct->io_qmask.iq |= BIT_ULL(iq_no); /* Set the 32B/64B mode for each input queue */ - oct->io_qmask.iq64B |= ((conf->instr_type == 64) << iq_no); + oct->io_qmask.iq64B |= ((u64)(conf->instr_type == 64) << iq_no); iq->iqcmd_64B = (conf->instr_type == 64); oct->fn_list.setup_iq_regs(oct, iq_no); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 4643a338061820..b1e1ad9e4b48e6 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2736,7 +2736,7 @@ static int dpaa2_switch_setup_dpbp(struct ethsw_core *ethsw) dev_err(dev, "dpsw_ctrl_if_set_pools() failed\n"); goto err_get_attr; } - ethsw->bpid = dpbp_attrs.id; + ethsw->bpid = dpbp_attrs.bpid; return 0; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1383918f8a3fc6..adf1f2bbcbb16f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2363,7 +2363,8 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) */ phy_dev = of_phy_find_device(fep->phy_node); phy_reset_after_clk_enable(phy_dev); - put_device(&phy_dev->mdio.dev); + if (phy_dev) + put_device(&phy_dev->mdio.dev); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 49aa4497efce7d..801a57a925dadc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1278,7 +1278,8 @@ struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi); -int i40e_count_filters(struct i40e_vsi *vsi); +int i40e_count_all_filters(struct i40e_vsi *vsi); +int i40e_count_active_filters(struct i40e_vsi *vsi); struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr); void i40e_vlan_stripping_enable(struct i40e_vsi *vsi); static inline bool i40e_is_sw_dcb(struct i40e_pf *pf) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 76d872b91a383d..cc02a85ad42bae 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1561,6 +1561,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config); struct i40e_aq_set_mac_config { __le16 max_frame_size; u8 params; +#define I40E_AQ_SET_MAC_CONFIG_CRC_EN BIT(2) u8 tx_timer_priority; /* bitmap */ __le16 tx_timer_value; __le16 fc_refresh_threshold; diff --git 
a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 270e7e8cf9cfd1..59f5c1e810eb05 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1189,6 +1189,40 @@ int i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures, return status; } +/** + * i40e_aq_set_mac_config - Configure MAC settings + * @hw: pointer to the hw struct + * @max_frame_size: Maximum Frame Size to be supported by the port + * @cmd_details: pointer to command details structure or NULL + * + * Set MAC configuration (0x0603). Note that max_frame_size must be greater + * than zero. + * + * Return: 0 on success, or a negative error code on failure. + */ +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_set_mac_config *cmd; + struct libie_aq_desc desc; + + cmd = libie_aq_raw(&desc); + + if (max_frame_size == 0) + return -EINVAL; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_mac_config); + + cmd->max_frame_size = cpu_to_le16(max_frame_size); + cmd->params = I40E_AQ_SET_MAC_CONFIG_CRC_EN; + +#define I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD 0x7FFF + cmd->fc_refresh_threshold = + cpu_to_le16(I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD); + + return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); +} + /** * i40e_aq_clear_pxe_mode * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b83f823e49177c..529d5501baacad 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1243,12 +1243,30 @@ void i40e_update_stats(struct i40e_vsi *vsi) } /** - * i40e_count_filters - counts VSI mac filters + * i40e_count_all_filters - counts VSI MAC filters * @vsi: the VSI to be searched * - * Returns count of mac filters - **/ -int i40e_count_filters(struct i40e_vsi *vsi) + * Return: count of MAC filters in any state. + */ +int i40e_count_all_filters(struct i40e_vsi *vsi) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + int bkt, cnt = 0; + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + cnt++; + + return cnt; +} + +/** + * i40e_count_active_filters - counts VSI MAC filters + * @vsi: the VSI to be searched + * + * Return: count of active MAC filters. 
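+ *
+ * Editorial sketch, not part of this patch: the two counters are meant
+ * to be paired in a limit check, where "max" and "wanted" stand in for
+ * a caller's hypothetical bounds:
+ *
+ *	if (i40e_count_active_filters(vsi) + wanted > max ||
+ *	    i40e_count_all_filters(vsi) + wanted > 2 * max)
+ *		return -EPERM;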
+ */ +int i40e_count_active_filters(struct i40e_vsi *vsi) { struct i40e_mac_filter *f; struct hlist_node *h; @@ -4156,7 +4174,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) irq_num = pf->msix_entries[base + vector].vector; irq_set_affinity_notifier(irq_num, NULL); irq_update_affinity_hint(irq_num, NULL); - free_irq(irq_num, &vsi->q_vectors[vector]); + free_irq(irq_num, vsi->q_vectors[vector]); } return err; } @@ -16045,13 +16063,17 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_dbg(&pf->pdev->dev, "get supported phy types ret = %pe last_status = %s\n", ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); - /* make sure the MFS hasn't been set lower than the default */ #define MAX_FRAME_SIZE_DEFAULT 0x2600 - val = FIELD_GET(I40E_PRTGL_SAH_MFS_MASK, - rd32(&pf->hw, I40E_PRTGL_SAH)); - if (val < MAX_FRAME_SIZE_DEFAULT) - dev_warn(&pdev->dev, "MFS for port %x (%d) has been set below the default (%d)\n", - pf->hw.port, val, MAX_FRAME_SIZE_DEFAULT); + + err = i40e_aq_set_mac_config(hw, MAX_FRAME_SIZE_DEFAULT, NULL); + if (err) + dev_warn(&pdev->dev, "set mac config ret = %pe last_status = %s\n", + ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); + + /* Make sure the MFS is set to the expected value */ + val = rd32(hw, I40E_PRTGL_SAH); + FIELD_MODIFY(I40E_PRTGL_SAH_MFS_MASK, &val, MAX_FRAME_SIZE_DEFAULT); + wr32(hw, I40E_PRTGL_SAH, val); /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. By doing so we stop a malicious VF from sending out diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index aef5de53ce3bb8..26bb7bffe36101 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -98,6 +98,8 @@ int i40e_aq_set_mac_loopback(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask, struct i40e_asq_cmd_details *cmd_details); +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details); int i40e_aq_clear_pxe_mode(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_link_restart_an(struct i40e_hw *hw, diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c3303913094..b194eae032084d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -948,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, if (!eop_desc) break; - /* prevent any other reads prior to eop_desc */ - smp_rmb(); - i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 9b8efdeafbcf13..081a4526a2f000 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -448,7 +448,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, (qtype << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | (pf_queue_id << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | BIT(I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) | - (itr_idx << I40E_QINT_RQCTL_ITR_INDX_SHIFT); + FIELD_PREP(I40E_QINT_RQCTL_ITR_INDX_MASK, itr_idx); wr32(hw, reg_idx, reg); } @@ -653,6 +653,13 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 
vsi_id, /* only set the required fields */ tx_ctx.base = info->dma_ring_addr / 128; + + /* ring_len has to be multiple of 8 */ + if (!IS_ALIGNED(info->ring_len, 8) || + info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) { + ret = -EINVAL; + goto error_context; + } tx_ctx.qlen = info->ring_len; tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[0]); tx_ctx.rdylist_act = 0; @@ -716,6 +723,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, /* only set the required fields */ rx_ctx.base = info->dma_ring_addr / 128; + + /* ring_len has to be multiple of 32 */ + if (!IS_ALIGNED(info->ring_len, 32) || + info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) { + ret = -EINVAL; + goto error_param; + } rx_ctx.qlen = info->ring_len; if (info->splithdr_enabled) { @@ -1450,6 +1464,7 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr) * functions that may still be running at this point. */ clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + clear_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states); /* In the case of a VFLR, the HW has already reset the VF and we * just need to clean up, so don't hit the VFRTRIG register. @@ -2116,7 +2131,10 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) size_t len = 0; int ret; - if (!i40e_sync_vf_state(vf, I40E_VF_STATE_INIT)) { + i40e_sync_vf_state(vf, I40E_VF_STATE_INIT); + + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) || + test_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states)) { aq_ret = -EINVAL; goto err; } @@ -2219,6 +2237,7 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) vf->default_lan_addr.addr); } set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + set_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states); err: /* send the response back to the VF */ @@ -2381,7 +2400,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) } if (vf->adq_enabled) { - if (idx >= ARRAY_SIZE(vf->ch)) { + if (idx >= vf->num_tc) { aq_ret = -ENODEV; goto error_param; } @@ -2402,7 +2421,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) * to its appropriate VSIs based on TC mapping */ if (vf->adq_enabled) { - if (idx >= ARRAY_SIZE(vf->ch)) { + if (idx >= vf->num_tc) { aq_ret = -ENODEV; goto error_param; } @@ -2452,8 +2471,10 @@ static int i40e_validate_queue_map(struct i40e_vf *vf, u16 vsi_id, u16 vsi_queue_id, queue_id; for_each_set_bit(vsi_queue_id, &queuemap, I40E_MAX_VSI_QP) { - if (vf->adq_enabled) { - vsi_id = vf->ch[vsi_queue_id / I40E_MAX_VF_VSI].vsi_id; + u16 idx = vsi_queue_id / I40E_MAX_VF_VSI; + + if (vf->adq_enabled && idx < vf->num_tc) { + vsi_id = vf->ch[idx].vsi_id; queue_id = (vsi_queue_id % I40E_DEFAULT_QUEUES_PER_VF); } else { queue_id = vsi_queue_id; @@ -2841,24 +2862,6 @@ static int i40e_vc_get_stats_msg(struct i40e_vf *vf, u8 *msg) (u8 *)&stats, sizeof(stats)); } -/** - * i40e_can_vf_change_mac - * @vf: pointer to the VF info - * - * Return true if the VF is allowed to change its MAC filters, false otherwise - */ -static bool i40e_can_vf_change_mac(struct i40e_vf *vf) -{ - /* If the VF MAC address has been set administratively (via the - * ndo_set_vf_mac command), then deny permission to the VF to - * add/delete unicast MAC addresses, unless the VF is trusted - */ - if (vf->pf_set_mac && !vf->trusted) - return false; - - return true; -} - #define I40E_MAX_MACVLAN_PER_HW 3072 #define I40E_MAX_MACVLAN_PER_PF(num_ports) (I40E_MAX_MACVLAN_PER_HW / \ (num_ports)) @@ -2897,8 +2900,10 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, struct 
i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; struct i40e_hw *hw = &pf->hw; - int mac2add_cnt = 0; - int i; + int i, mac_add_max, mac_add_cnt = 0; + bool vf_trusted; + + vf_trusted = test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); for (i = 0; i < al->num_elements; i++) { struct i40e_mac_filter *f; @@ -2918,9 +2923,8 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, * The VF may request to set the MAC address filter already * assigned to it so do not return an error in that case. */ - if (!i40e_can_vf_change_mac(vf) && - !is_multicast_ether_addr(addr) && - !ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (!vf_trusted && !is_multicast_ether_addr(addr) && + vf->pf_set_mac && !ether_addr_equal(addr, vf->default_lan_addr.addr)) { dev_err(&pf->pdev->dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); return -EPERM; @@ -2929,29 +2933,33 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, /*count filters that really will be added*/ f = i40e_find_mac(vsi, addr); if (!f) - ++mac2add_cnt; + ++mac_add_cnt; } /* If this VF is not privileged, then we can't add more than a limited - * number of addresses. Check to make sure that the additions do not - * push us over the limit. - */ - if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MAC_ADDR_PER_VF) { - dev_err(&pf->pdev->dev, - "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); - return -EPERM; - } - /* If this VF is trusted, it can use more resources than untrusted. + * number of addresses. + * + * If this VF is trusted, it can use more resources than untrusted. * However to ensure that every trusted VF has appropriate number of * resources, divide whole pool of resources per port and then across * all VFs. */ - } else { - if ((i40e_count_filters(vsi) + mac2add_cnt) > - I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, - hw->num_ports)) { + if (!vf_trusted) + mac_add_max = I40E_VC_MAX_MAC_ADDR_PER_VF; + else + mac_add_max = I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, hw->num_ports); + + /* VF can replace all its filters in one step, in this case mac_add_max + * will be added as active and another mac_add_max will be in + * a to-be-removed state. Account for that. 
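+ *
+ * Worked example (numbers illustrative): with mac_add_max = 16, a full
+ * replacement may leave 16 new filters active while the 16 old ones are
+ * still pending removal, so up to 2 * mac_add_max entries can coexist
+ * before the check below returns -EPERM.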
+ */ + if ((i40e_count_active_filters(vsi) + mac_add_cnt) > mac_add_max || + (i40e_count_all_filters(vsi) + mac_add_cnt) > 2 * mac_add_max) { + if (!vf_trusted) { + dev_err(&pf->pdev->dev, + "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); + return -EPERM; + } else { + dev_err(&pf->pdev->dev, "Cannot add more MAC addresses, trusted VF exhausted its resources\n"); + return -EPERM; @@ -3587,7 +3595,7 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf, /* action_meta is TC number here to which the filter is applied */ if (!tc_filter->action_meta || - tc_filter->action_meta > vf->num_tc) { + tc_filter->action_meta >= vf->num_tc) { dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", vf->vf_id, tc_filter->action_meta); goto err; @@ -3884,6 +3892,8 @@ static int i40e_vc_del_cloud_filter(struct i40e_vf *vf, u8 *msg) aq_ret); } +#define I40E_MAX_VF_CLOUD_FILTER 0xFF00 + /** * i40e_vc_add_cloud_filter * @vf: pointer to the VF info @@ -3923,6 +3933,14 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) goto err_out; } + + if (vf->num_cloud_filters >= I40E_MAX_VF_CLOUD_FILTER) { + dev_warn(&pf->pdev->dev, + "VF %d: Max number of filters reached, can't apply cloud filter\n", + vf->vf_id); + aq_ret = -ENOSPC; + goto err_out; + } + + cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); + if (!cfilter) { + aq_ret = -ENOMEM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 5cf74f16f433f3..f558b45725c816 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -41,7 +41,8 @@ enum i40e_vf_states { I40E_VF_STATE_MC_PROMISC, I40E_VF_STATE_UC_PROMISC, I40E_VF_STATE_PRE_ENABLE, - I40E_VF_STATE_RESETTING + I40E_VF_STATE_RESETTING, + I40E_VF_STATE_RESOURCES_LOADED, }; /* VF capabilities */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec9401..41e7e29879a309 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -894,10 +894,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); sinfo->xdp_frags_size += size; - /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() - * can pop off frags but driver has to handle it on its own - */ - rx_ring->nr_frags = sinfo->nr_frags; if (page_is_pfmemalloc(rx_buf->page)) xdp_buff_set_frag_pfmemalloc(xdp); @@ -968,20 +964,20 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, /** * ice_get_pgcnts - grab page_count() for gathered fragments * @rx_ring: Rx descriptor ring to store the page counts on + * @ntc: the next to clean element (not included in this frame!)
* * This function is intended to be called right before running XDP * program so that the page recycling mechanism will be able to take * a correct decision regarding underlying pages; this is done in such * way as XDP program can change the refcount of page */ -static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring, unsigned int ntc) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; struct ice_rx_buf *rx_buf; u32 cnt = rx_ring->count; - for (int i = 0; i < nr_frags; i++) { + while (idx != ntc) { rx_buf = &rx_ring->rx_buf[idx]; rx_buf->pgcnt = page_count(rx_buf->page); @@ -1154,62 +1150,51 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) } /** - * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all buffers in frame * @rx_ring: Rx ring with all the auxiliary data * @xdp: XDP buffer carrying linear + frags part - * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage - * @ntc: a current next_to_clean value to be stored at rx_ring + * @ntc: the next to clean element (not included in this frame!) * @verdict: return code from XDP program execution * - * Walk through gathered fragments and satisfy internal page - * recycle mechanism; we take here an action related to verdict - * returned by XDP program; + * Called after XDP program is completed, or on error with verdict set to + * ICE_XDP_CONSUMED. + * + * Walk through buffers from first_desc to the end of the frame, releasing + * buffers and satisfying internal page recycle mechanism. The action depends + * on verdict from XDP program. */ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - u32 *xdp_xmit, u32 ntc, u32 verdict) + u32 ntc, u32 verdict) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; - u32 post_xdp_frags = 1; struct ice_rx_buf *buf; - int i; + u32 xdp_frags = 0; + int i = 0; if (unlikely(xdp_buff_has_frags(xdp))) - post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + xdp_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - for (i = 0; i < post_xdp_frags; i++) { + while (idx != ntc) { buf = &rx_ring->rx_buf[idx]; + if (++idx == cnt) + idx = 0; - if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { + /* An XDP program could release fragments from the end of the + * buffer. For these, we need to keep the pagecnt_bias as-is. + * To do this, only adjust pagecnt_bias for fragments up to + * the total remaining after the XDP program has run. 
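+ *
+ * Worked example (numbers illustrative): for a 4-buffer frame (linear +
+ * 3 frags) where the program popped one tail frag via
+ * bpf_xdp_adjust_tail(), xdp_frags is 2, so on ICE_XDP_CONSUMED only the
+ * first three buffers get pagecnt_bias++; the popped frag's page
+ * reference was already dropped inside the program.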
+ */ + if (verdict != ICE_XDP_CONSUMED) ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - *xdp_xmit |= verdict; - } else if (verdict & ICE_XDP_CONSUMED) { + else if (i++ <= xdp_frags) buf->pagecnt_bias++; - } else if (verdict == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } ice_put_rx_buf(rx_ring, buf); - - if (++idx == cnt) - idx = 0; - } - /* handle buffers that represented frags released by XDP prog; - * for these we keep pagecnt_bias as-is; refcount from struct page - * has been decremented within XDP prog and we do not have to increase - * the biased refcnt - */ - for (; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - ice_put_rx_buf(rx_ring, buf); - if (++idx == cnt) - idx = 0; } xdp->data = NULL; rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; } /** @@ -1317,6 +1302,10 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, size, ntc); + /* Increment ntc before calls to ice_put_rx_mbuf() */ + if (++ntc == cnt) + ntc = 0; + if (!xdp->data) { void *hard_start; @@ -1325,24 +1314,23 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { - ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); + ice_put_rx_mbuf(rx_ring, xdp, ntc, ICE_XDP_CONSUMED); break; } - if (++ntc == cnt) - ntc = 0; /* skip if it is NOP desc */ if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_get_pgcnts(rx_ring); + ice_get_pgcnts(rx_ring, ntc); xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); + xdp_xmit |= xdp_verdict & (ICE_XDP_TX | ICE_XDP_REDIR); continue; construct_skb: @@ -1355,7 +1343,7 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) rx_ring->ring_stats->rx_stats.alloc_buf_failed++; xdp_verdict = ICE_XDP_CONSUMED; } - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); if (!skb) break; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index fef750c5f288f3..2fd8e78178a271 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -358,7 +358,6 @@ struct ice_rx_ring { struct ice_tx_ring *xdp_ring; struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; - u32 nr_frags; u16 max_frame; u16 rx_buf_len; dma_addr_t dma; /* physical address of ring */ diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 92ef33459aec78..7b8f32c5169aa3 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -2081,11 +2081,8 @@ static void igb_diag_test(struct net_device *netdev, } else { dev_info(&adapter->pdev->dev, "online testing starting\n"); - /* PHY is powered down when interface is down */ - if (if_running && igb_link_test(adapter, &data[TEST_LINK])) + if (igb_link_test(adapter, &data[TEST_LINK])) eth_test->flags |= ETH_TEST_FL_FAILED; - else - data[TEST_LINK] = 0; /* Online tests aren't run; pass by default */ data[TEST_REG] = 0; diff --git 
a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a9a7a94ae61e93..453deb6d14b312 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4453,8 +4453,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index, - rx_ring->q_vector->napi.napi_id); + rx_ring->queue_index, 0); if (res < 0) { dev_err(dev, "Failed to register xdp_rxq index %u\n", rx_ring->queue_index); diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 266bfcf2a28f02..a427f05814c1ae 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -345,6 +345,7 @@ struct igc_adapter { /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; + bool leds_available; }; void igc_up(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e79b14d50b2405..728d7ca5338bf2 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7335,8 +7335,14 @@ static int igc_probe(struct pci_dev *pdev, if (IS_ENABLED(CONFIG_IGC_LEDS)) { err = igc_led_setup(adapter); - if (err) - goto err_register; + if (err) { + netdev_warn_once(netdev, + "LED init failed (%d); continuing without LED support\n", + err); + adapter->leds_available = false; + } else { + adapter->leds_available = true; + } } return 0; @@ -7392,7 +7398,7 @@ static void igc_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->watchdog_task); hrtimer_cancel(&adapter->hrtimer); - if (IS_ENABLED(CONFIG_IGC_LEDS)) + if (IS_ENABLED(CONFIG_IGC_LEDS) && adapter->leds_available) igc_led_free(adapter); /* Release control of h/w to f/w. 
If f/w is AMT enabled, this diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80e6a2ef1350e3..6218bdb7f941f6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6973,6 +6973,13 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, break; } + /* Make sure the SWFW semaphore is in a valid state */ + if (hw->mac.ops.init_swfw_sync) + hw->mac.ops.init_swfw_sync(hw); + + if (hw->mac.type == ixgbe_mac_e610) + mutex_init(&hw->aci.lock); + #ifdef IXGBE_FCOE /* FCoE support exists, always init the FCoE lock */ spin_lock_init(&adapter->fcoe.lock); @@ -11643,10 +11650,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; - /* Make sure the SWFW semaphore is in a valid state */ - if (hw->mac.ops.init_swfw_sync) - hw->mac.ops.init_swfw_sync(hw); - if (ixgbe_check_fw_error(adapter)) return ixgbe_recovery_probe(adapter); @@ -11850,8 +11853,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ether_addr_copy(hw->mac.addr, hw->mac.perm_addr); ixgbe_mac_set_default_filter(adapter); - if (hw->mac.type == ixgbe_mac_e610) - mutex_init(&hw->aci.lock); timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); if (ixgbe_removed(hw->hw_addr)) { @@ -12007,9 +12008,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) devl_unlock(adapter->devlink); ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); +err_sw_init: if (hw->mac.type == ixgbe_mac_e610) mutex_destroy(&adapter->hw.aci.lock); -err_sw_init: ixgbe_disable_sriov(adapter); adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP; iounmap(adapter->io_addr); @@ -12060,10 +12061,8 @@ static void ixgbe_remove(struct pci_dev *pdev) set_bit(__IXGBE_REMOVING, &adapter->state); cancel_work_sync(&adapter->service_task); - if (adapter->hw.mac.type == ixgbe_mac_e610) { + if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_disable_link_status_events(adapter); - mutex_destroy(&adapter->hw.aci.lock); - } if (adapter->mii_bus) mdiobus_unregister(adapter->mii_bus); @@ -12123,6 +12122,9 @@ static void ixgbe_remove(struct pci_dev *pdev) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); + if (adapter->hw.mac.type == ixgbe_mac_e610) + mutex_destroy(&adapter->hw.aci.lock); + if (disable_dev) pci_disable_device(pdev); } diff --git a/drivers/net/ethernet/intel/libie/adminq.c b/drivers/net/ethernet/intel/libie/adminq.c index 55356548e3f0a1..7b4ff479e7e57f 100644 --- a/drivers/net/ethernet/intel/libie/adminq.c +++ b/drivers/net/ethernet/intel/libie/adminq.c @@ -6,7 +6,7 @@ static const char * const libie_aq_str_arr[] = { #define LIBIE_AQ_STR(x) \ - [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC" #x + [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC_" #x LIBIE_AQ_STR(OK), LIBIE_AQ_STR(EPERM), LIBIE_AQ_STR(ENOENT), diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 24499bb36c0057..bcea3fc26a8c7d 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1124,11 +1124,24 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static bool octep_is_vf_valid(struct octep_device *oct, int vf) +{ + if (vf >= CFG_GET_ACTIVE_VFS(oct->conf)) { + netdev_err(oct->netdev, "Invalid VF ID %d\n", vf); + return false; + } + + return true; +} + static int 
octep_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi) { struct octep_device *oct = netdev_priv(dev); + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + ivi->vf = vf; ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); ivi->spoofchk = true; @@ -1143,6 +1156,9 @@ static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) struct octep_device *oct = netdev_priv(dev); int err; + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + if (!is_valid_ether_addr(mac)) { dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); return -EADDRNOTAVAIL; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c index ebecdd29f3bd05..0867fab61b1905 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c @@ -196,6 +196,7 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id, vf_id); return; } + ether_addr_copy(oct->vf_info[vf_id].mac_addr, rsp->s_set_mac.mac_addr); rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; } @@ -205,6 +206,8 @@ static void octep_pfvf_dev_remove(struct octep_device *oct, u32 vf_id, { int err; + /* Reset VF-specific information maintained by the PF */ + memset(&oct->vf_info[vf_id], 0, sizeof(struct octep_pfvf_info)); err = octep_ctrl_net_dev_remove(oct, vf_id); if (err) { rsp->s.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 0c46ba8a5adc8f..69324ae093973e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -21,8 +21,7 @@ #include "rvu.h" #include "lmac_common.h" -#define DRV_NAME "Marvell-CGX/RPM" -#define DRV_STRING "Marvell CGX/RPM Driver" +#define DRV_NAME "Marvell-CGX-RPM" #define CGX_RX_STAT_GLOBAL_INDEX 9 diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c index e52cc6b1a26cc8..dedd586ed3108e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c @@ -491,7 +491,7 @@ void otx2_ptp_destroy(struct otx2_nic *pfvf) if (!ptp) return; - cancel_delayed_work(&pfvf->ptp->synctstamp_work); + cancel_delayed_work_sync(&pfvf->ptp->synctstamp_work); ptp_clock_unregister(ptp->ptp_clock); kfree(ptp); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 5f80b23c5335cd..26a08d2cfbb1b6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -1326,7 +1326,6 @@ static int otx2_tc_add_flow(struct otx2_nic *nic, free_leaf: otx2_tc_del_from_flow_list(flow_cfg, new_node); - kfree_rcu(new_node, rcu); if (new_node->is_act_police) { mutex_lock(&nic->mbox.lock); @@ -1346,6 +1345,7 @@ static int otx2_tc_add_flow(struct otx2_nic *nic, mutex_unlock(&nic->mbox.lock); } + kfree_rcu(new_node, rcu); return rc; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 9560fcba643f50..ac65e319148029 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -92,6 +92,7 @@ enum { MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_ACCEL_FS_POL_FT_LEVEL, + MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL, 
MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index ffcd0cdeb77544..23703f28386ad9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr { u32 family; int prio; int pol_level; + int pol_miss_level; int sa_level; int status_level; enum mlx5_flow_namespace_type chains_ns; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 98b6a3a623f995..65dc3529283b69 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -747,6 +747,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec, attr->family = family; attr->prio = MLX5E_NIC_PRIO; attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL; + attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL; attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL; @@ -833,7 +834,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, ft_attr.max_fte = 1; ft_attr.autogroup.max_num_groups = 1; - ft_attr.level = attr->pol_level; + ft_attr.level = attr->pol_miss_level; ft_attr.prio = attr->prio; ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e680673ffb725c..15eded36b872a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -139,8 +139,6 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); - mlx5e_port_manual_buffer_config(priv, 0, priv->netdev->mtu, - NULL, NULL, NULL); } else { netdev_info(priv->netdev, "Link down\n"); netif_carrier_off(priv->netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0db5..cd0242eb008c29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if 
(rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 87536f158d07b4..c6185ddba04b84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1466,6 +1466,7 @@ static void fec_set_block_stats(struct mlx5e_priv *priv, case MLX5E_FEC_RS_528_514: case MLX5E_FEC_RS_544_514: case MLX5E_FEC_LLRS_272_257_1: + case MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD: fec_set_rs_stats(fec_stats, out); return; case MLX5E_FEC_FIRECODE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b497765018381..5f2d6c35f1ad59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index cb165085a4c10c..80245c38dbad39 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -114,9 +114,9 @@ #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) /* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, - * {IPsec RoCE MPV,Alias table},IPsec RoCE policy + * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy */ -#define KERNEL_NIC_PRIO_NUM_LEVELS 10 +#define KERNEL_NIC_PRIO_NUM_LEVELS 11 #define KERNEL_NIC_NUM_PRIOS 1 /* One more level for tc, and one more for promisc */ #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2) @@ -663,7 +663,7 @@ static void del_sw_hw_rule(struct fs_node *node) BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; - mlx5_fc_local_destroy(rule->dest_attr.counter); + mlx5_fc_local_put(rule->dest_attr.counter); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 500826229b0beb..e6a95b310b5554 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -343,6 +343,7 @@ struct mlx5_fc { enum mlx5_fc_type type; struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache; + refcount_t fc_local_refcount; /* last{packets,bytes} are used for calculating deltas since last reading. 
*/ u64 lastpackets; u64 lastbytes; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 492775d3d193a3..83001eda38842a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -562,17 +562,36 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) counter->id = counter_id; fc_bulk->base_id = counter_id - offset; fc_bulk->fs_bulk.bulk_len = bulk_size; + refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0); + mutex_init(&fc_bulk->hws_data.lock); counter->bulk = fc_bulk; + refcount_set(&counter->fc_local_refcount, 1); return counter; } EXPORT_SYMBOL(mlx5_fc_local_create); void mlx5_fc_local_destroy(struct mlx5_fc *counter) { - if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) - return; - kfree(counter->bulk); kfree(counter); } EXPORT_SYMBOL(mlx5_fc_local_destroy); + +void mlx5_fc_local_get(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + refcount_inc(&counter->fc_local_refcount); +} + +void mlx5_fc_local_put(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + if (!refcount_dec_and_test(&counter->fc_local_refcount)) + return; + + mlx5_fc_local_destroy(counter); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b0267..74ea5da58b7ea4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 2d7adf7444ba29..aa9f2b0a77d36f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1170,7 +1170,11 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, force_legacy); i = find_first_bit(&temp, max_size); - if (i < max_size) + + /* mlx5e_link_info has holes. Check speed + * is not zero as indication of one. 
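+ *
+ * Illustrative case (hole positions are device-specific): a set bit
+ * whose table entry was never populated decodes to an all-zero
+ * mlx5_link_info, so without the speed test a bogus zero-speed link
+ * mode could be returned.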
+ */ + if (i < max_size && table[i].speed) return &table[i]; return NULL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 6b36a4a7d895fc..fe56b59e24c59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -1360,7 +1360,7 @@ mlx5hws_action_create_modify_header(struct mlx5hws_context *ctx, struct mlx5hws_action * mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, u32 flags) + u32 flags) { struct mlx5hws_cmd_set_fte_dest *dest_list = NULL; struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; @@ -1397,7 +1397,7 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest_list[i].destination_id = dests[i].dest->dest_obj.obj_id; fte_attr.action_flags |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - fte_attr.ignore_flow_level = ignore_flow_level; + fte_attr.ignore_flow_level = 1; if (dests[i].is_wire_ft) last_dest_idx = i; break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 131e74b2b77435..6a4c4cccd64342 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -572,12 +572,12 @@ static void mlx5_fs_put_dest_action_sampler(struct mlx5_fs_hws_context *fs_ctx, static struct mlx5hws_action * mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx, struct mlx5hws_action_dest_attr *dests, - u32 num_of_dests, bool ignore_flow_level) + u32 num_of_dests) { u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED; return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests, - ignore_flow_level, flags); + flags); } static struct mlx5hws_action * @@ -1014,19 +1014,14 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns, } (*ractions)[num_actions++].action = dest_actions->dest; } else if (num_dest_actions > 1) { - bool ignore_flow_level; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { err = -EOPNOTSUPP; goto free_actions; } - ignore_flow_level = - !!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL); tmp_action = mlx5_fs_create_action_dest_array(ctx, dest_actions, - num_dest_actions, - ignore_flow_level); + num_dest_actions); if (!tmp_action) { err = -EOPNOTSUPP; goto free_actions; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c index f1ecdba74e1f46..839d71bd42164f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c @@ -407,15 +407,21 @@ struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, { struct mlx5_fs_hws_create_action_ctx create_ctx; struct mlx5_fc_bulk *fc_bulk = counter->bulk; + struct mlx5hws_action *hws_action; create_ctx.hws_ctx = ctx; create_ctx.id = fc_bulk->base_id; create_ctx.actions_type = MLX5HWS_ACTION_TYP_CTR; - return mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + mlx5_fc_local_get(counter); + hws_action = mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + if (!hws_action) + mlx5_fc_local_put(counter); + return hws_action; } void 
mlx5_fc_put_hws_action(struct mlx5_fc *counter) { mlx5_fs_put_hws_action(&counter->bulk->hws_data); + mlx5_fc_local_put(counter); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 2498ceff2060fd..1ad7a50d938b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -735,7 +735,6 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags); * @num_dest: The number of dests attributes. * @dests: The destination array. Each contains a destination action and can * have additional actions. - * @ignore_flow_level: Whether to turn on 'ignore_flow_level' for this dest. * @flags: Action creation flags (enum mlx5hws_action_flags). * * Return: pointer to mlx5hws_action on success NULL otherwise. @@ -743,7 +742,7 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags); struct mlx5hws_action * mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest, struct mlx5hws_action_dest_attr *dests, - bool ignore_flow_level, u32 flags); + u32 flags); /** * mlx5hws_action_create_insert_header - Create insert header action. diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 56d5464222d97a..cdbf82affa7bea 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -820,7 +820,7 @@ static void rx_irq(struct net_device *ndev) struct ns83820 *dev = PRIV(ndev); struct rx_info *info = &dev->rx_info; unsigned next_rx; - int rx_rc, len; + int len; u32 cmdsts; __le32 *desc; unsigned long flags; @@ -881,8 +881,10 @@ static void rx_irq(struct net_device *ndev) if (likely(CMDSTS_OK & cmdsts)) { #endif skb_put(skb, len); - if (unlikely(!skb)) + if (unlikely(!skb)) { + ndev->stats.rx_dropped++; goto netdev_mangle_me_harder_failed; + } if (cmdsts & CMDSTS_DEST_MULTI) ndev->stats.multicast++; ndev->stats.rx_packets++; @@ -901,15 +903,12 @@ static void rx_irq(struct net_device *ndev) __vlan_hwaccel_put_tag(skb, htons(ETH_P_IPV6), tag); } #endif - rx_rc = netif_rx(skb); - if (NET_RX_DROP == rx_rc) { -netdev_mangle_me_harder_failed: - ndev->stats.rx_dropped++; - } + netif_rx(skb); } else { dev_kfree_skb_irq(skb); } +netdev_mangle_me_harder_failed: nr++; next_rx = info->next_rx; desc = info->descs + (DESC_SIZE * next_rx); diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 9c3d3dd2f84753..1f0cea3cae92f5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -4462,10 +4462,11 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, goto out; } - /* Add override window info to buffer */ + /* Add override window info to buffer, preventing buffer overflow */ override_window_dwords = - qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * - PROTECTION_OVERRIDE_ELEMENT_DWORDS; + min(qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS, + PROTECTION_OVERRIDE_DEPTH_DWORDS); if (override_window_dwords) { addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW); offset += qed_grc_dump_addr_range(p_hwfn, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index dadce6009791bc..e42d0fdefee128 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ 
b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -654,7 +654,7 @@ static void icssg_prueth_hsr_fdb_add_del(struct prueth_emac *emac, static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -663,11 +663,15 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, true); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); @@ -679,7 +683,7 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -688,11 +692,15 @@ static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, false); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index bcd07a71575241..5cb353a97d6d80 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2078,10 +2078,6 @@ static void wx_setup_mrqc(struct wx *wx) { u32 rss_field = 0; - /* VT, and RSS do not coexist at the same time */ - if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) - return; - /* Disable indicating checksum in descriptor, enables RSS hash */ wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD); diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 01329fe7451a12..0eca96eeed58ab 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4286,6 +4286,7 @@ static int macsec_newlink(struct net_device *dev, if (err < 0) goto del_dev; + netdev_update_features(dev); netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 28acc6392cfc89..392749aae54df4 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -361,7 +361,7 @@ config NXP_TJA11XX_PHY tristate "NXP TJA11xx PHYs support" depends on HWMON help - Currently supports the NXP TJA1100 and TJA1101 PHY. + Currently supports the NXP TJA1100, TJA1101 and TJA1102 PHYs. 
config NCN26000_PHY tristate "Onsemi 10BASE-T1S Ethernet PHY" diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c index eba8b5fb1365f4..d3501f8487d964 100644 --- a/drivers/net/phy/bcm-phy-ptp.c +++ b/drivers/net/phy/bcm-phy-ptp.c @@ -597,10 +597,6 @@ static int bcm_ptp_perout_locked(struct bcm_ptp_private *priv, period = BCM_MAX_PERIOD_8NS; /* write nonzero value */ - /* Reject unsupported flags */ - if (req->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - if (req->flags & PTP_PEROUT_DUTY_CYCLE) pulse = ktime_to_ns(ktime_set(req->on.sec, req->on.nsec)); else @@ -741,6 +737,8 @@ static const struct ptp_clock_info bcm_ptp_clock_info = { .n_pins = 1, .n_per_out = 1, .n_ext_ts = 1, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, + .supported_extts_flags = PTP_STRICT_FLAGS | PTP_RISING_EDGE, }; static void bcm_ptp_txtstamp(struct mii_timestamper *mii_ts, diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 13df28445f0201..c02da57a4da5e3 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1065,23 +1065,19 @@ EXPORT_SYMBOL_GPL(phy_inband_caps); */ int phy_config_inband(struct phy_device *phydev, unsigned int modes) { - int err; + lockdep_assert_held(&phydev->lock); if (!!(modes & LINK_INBAND_DISABLE) + !!(modes & LINK_INBAND_ENABLE) + !!(modes & LINK_INBAND_BYPASS) != 1) return -EINVAL; - mutex_lock(&phydev->lock); if (!phydev->drv) - err = -EIO; + return -EIO; else if (!phydev->drv->config_inband) - err = -EOPNOTSUPP; - else - err = phydev->drv->config_inband(phydev, modes); - mutex_unlock(&phydev->lock); + return -EOPNOTSUPP; - return err; + return phydev->drv->config_inband(phydev, modes); } EXPORT_SYMBOL(phy_config_inband); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7556aa3dd7eeba..c82c1997147bfe 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -287,8 +287,7 @@ static bool phy_uses_state_machine(struct phy_device *phydev) if (phydev->phy_link_change == phy_link_change) return phydev->attached_dev && phydev->adjust_link; - /* phydev->phy_link_change is implicitly phylink_phy_change() */ - return true; + return !!phydev->phy_link_change; } static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) @@ -1864,6 +1863,8 @@ void phy_detach(struct phy_device *phydev) phydev->attached_dev = NULL; phy_link_topo_del_phy(dev, phydev); } + + phydev->phy_link_change = NULL; phydev->phylink = NULL; if (!phydev->is_on_sfp_module) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index c7cb95aa80074a..1988b7d2089a6c 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -67,6 +67,8 @@ struct phylink { struct timer_list link_poll; struct mutex state_mutex; + /* Serialize updates to pl->phydev with phylink_resolve() */ + struct mutex phydev_mutex; struct phylink_link_state phy_state; unsigned int phy_ib_mode; struct work_struct resolve; @@ -1432,6 +1434,7 @@ static void phylink_get_fixed_state(struct phylink *pl, static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; + struct phy_device *phy = pl->phydev; switch (pl->req_link_an_mode) { case MLO_AN_PHY: @@ -1455,7 +1458,11 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) link_state.link = false; phylink_apply_manual_flow(pl, &link_state); + if (phy) + mutex_lock(&phy->lock); phylink_major_config(pl, force_restart, &link_state); + if (phy) + mutex_unlock(&phy->lock); } static const char 
*phylink_pause_to_str(int pause) @@ -1591,8 +1598,13 @@ static void phylink_resolve(struct work_struct *w) struct phylink_link_state link_state; bool mac_config = false; bool retrigger = false; + struct phy_device *phy; bool cur_link_state; + mutex_lock(&pl->phydev_mutex); + phy = pl->phydev; + if (phy) + mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1626,11 +1638,11 @@ static void phylink_resolve(struct work_struct *w) /* If we have a phy, the "up" state is the union of both the * PHY and the MAC */ - if (pl->phydev) + if (phy) link_state.link &= pl->phy_state.link; /* Only update if the PHY link is up */ - if (pl->phydev && pl->phy_state.link) { + if (phy && pl->phy_state.link) { /* If the interface has changed, force a link down * event if the link isn't already down, and re-resolve. */ @@ -1694,6 +1706,9 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + if (phy) + mutex_unlock(&phy->lock); + mutex_unlock(&pl->phydev_mutex); } static void phylink_run_resolve(struct phylink *pl) @@ -1829,6 +1844,7 @@ struct phylink *phylink_create(struct phylink_config *config, if (!pl) return ERR_PTR(-ENOMEM); + mutex_init(&pl->phydev_mutex); mutex_init(&pl->state_mutex); INIT_WORK(&pl->resolve, phylink_resolve); @@ -2089,6 +2105,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, dev_name(&phy->mdio.dev), phy->drv->name, irq_str); kfree(irq_str); + mutex_lock(&pl->phydev_mutex); mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); pl->phydev = phy; @@ -2134,6 +2151,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); + mutex_unlock(&pl->phydev_mutex); phylink_dbg(pl, "phy: %s setting supported %*pb advertising %*pb\n", @@ -2312,6 +2330,7 @@ void phylink_disconnect_phy(struct phylink *pl) ASSERT_RTNL(); + mutex_lock(&pl->phydev_mutex); phy = pl->phydev; if (phy) { mutex_lock(&phy->lock); @@ -2321,8 +2340,11 @@ void phylink_disconnect_phy(struct phylink *pl) pl->mac_tx_clk_stop = false; mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); - flush_work(&pl->resolve); + } + mutex_unlock(&pl->phydev_mutex); + if (phy) { + flush_work(&pl->resolve); phy_disconnect(phy); } } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index cc6c5018066370..47ddcb4b9a788b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1875,6 +1875,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, local_bh_enable(); goto unlock_frags; } + + if (frags && skb != tfile->napi.skb) + tfile->napi.skb = skb; } rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index bd1ec3b2c08416..3a3965b79942d2 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4078,12 +4078,68 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, return ret; } +static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) +{ + struct ath12k *ar = arvif->ar; + struct ieee80211_vif *vif = arvif->ahvif->vif; + struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; + enum wmi_sta_powersave_param param; + struct ieee80211_bss_conf *info; + enum wmi_sta_ps_mode psmode; + int ret; + int timeout; + bool enable_ps; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + if (vif->type != NL80211_IFTYPE_STATION) + return; + + 
enable_ps = arvif->ahvif->ps; + if (enable_ps) { + psmode = WMI_STA_PS_MODE_ENABLED; + param = WMI_STA_PS_PARAM_INACTIVITY_TIME; + + timeout = conf->dynamic_ps_timeout; + if (timeout == 0) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + + /* firmware doesn't like 0 */ + timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; + } + + ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, + timeout); + if (ret) { + ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", + arvif->vdev_id, ret); + return; + } + } else { + psmode = WMI_STA_PS_MODE_DISABLED; + } + + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", + arvif->vdev_id, psmode ? "enable" : "disable"); + + ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); + if (ret) + ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", + psmode, arvif->vdev_id, ret); +} + static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); unsigned long links = ahvif->links_map; + struct ieee80211_vif_cfg *vif_cfg; struct ieee80211_bss_conf *info; struct ath12k_link_vif *arvif; struct ieee80211_sta *sta; @@ -4147,61 +4203,24 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, } } } -} - -static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) -{ - struct ath12k *ar = arvif->ar; - struct ieee80211_vif *vif = arvif->ahvif->vif; - struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; - enum wmi_sta_powersave_param param; - struct ieee80211_bss_conf *info; - enum wmi_sta_ps_mode psmode; - int ret; - int timeout; - bool enable_ps; - lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + if (changed & BSS_CHANGED_PS) { + links = ahvif->links_map; + vif_cfg = &vif->cfg; - if (vif->type != NL80211_IFTYPE_STATION) - return; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (!arvif || !arvif->ar) + continue; - enable_ps = arvif->ahvif->ps; - if (enable_ps) { - psmode = WMI_STA_PS_MODE_ENABLED; - param = WMI_STA_PS_PARAM_INACTIVITY_TIME; + ar = arvif->ar; - timeout = conf->dynamic_ps_timeout; - if (timeout == 0) { - info = ath12k_mac_get_link_bss_conf(arvif); - if (!info) { - ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", - vif->addr, arvif->link_id); - return; + if (ar->ab->hw_params->supports_sta_ps) { + ahvif->ps = vif_cfg->ps; + ath12k_mac_vif_setup_ps(arvif); } - - /* firmware doesn't like 0 */ - timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; } - - ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, - timeout); - if (ret) { - ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", - arvif->vdev_id, ret); - return; - } - } else { - psmode = WMI_STA_PS_MODE_DISABLED; } - - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", - arvif->vdev_id, psmode ? 
"enable" : "disable"); - - ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); - if (ret) - ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", - psmode, arvif->vdev_id, ret); } static bool ath12k_mac_supports_tpc(struct ath12k *ar, struct ath12k_vif *ahvif, @@ -4223,7 +4242,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); - struct ieee80211_vif_cfg *vif_cfg = &vif->cfg; struct cfg80211_chan_def def; u32 param_id, param_value; enum nl80211_band band; @@ -4510,12 +4528,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, } ath12k_mac_fils_discovery(arvif, info); - - if (changed & BSS_CHANGED_PS && - ar->ab->hw_params->supports_sta_ps) { - ahvif->ps = vif_cfg->ps; - ath12k_mac_vif_setup_ps(arvif); - } } static struct ath12k_vif_cache *ath12k_ahvif_get_link_cache(struct ath12k_vif *ahvif, diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 742ffeb48bce77..29dadedefdd27a 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -843,7 +843,7 @@ int ath12k_wmi_mgmt_send(struct ath12k_link_vif *arvif, u32 buf_id, cmd->tx_params_valid = 0; frame_tlv = (struct wmi_tlv *)(skb->data + sizeof(*cmd)); - frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len); + frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len_aligned); memcpy(frame_tlv->value, frame->data, buf_len); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index f9e2095d649050..7e56e4ff764295 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -124,13 +124,13 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x0082, 0x1304, iwl6005_mac_cfg)},/* low 5GHz active */ {IWL_PCI_DEVICE(0x0082, 0x1305, iwl6005_mac_cfg)},/* high 5GHz active */ -/* 6x30 Series */ - {IWL_PCI_DEVICE(0x008A, 0x5305, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5307, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5325, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5327, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5315, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5317, iwl1000_mac_cfg)}, +/* 1030/6x30 Series */ + {IWL_PCI_DEVICE(0x008A, 0x5305, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5307, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5325, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5327, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5315, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5317, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5211, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5215, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5216, iwl6030_mac_cfg)}, @@ -181,12 +181,12 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x08AE, 0x1027, iwl1000_mac_cfg)}, /* 130 Series WiFi */ - {IWL_PCI_DEVICE(0x0896, 0x5005, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5007, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5015, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5017, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5025, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5027, iwl1000_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5005, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5007, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 0x5015, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 
0x5017, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5025, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5027, iwl6030_mac_cfg)}, /* 2x00 Series */ {IWL_PCI_DEVICE(0x0890, 0x4022, iwl2000_mac_cfg)}, diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c index d912e709a92cbf..bb03dad4a3006b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c @@ -2092,7 +2092,7 @@ static void iwl_txq_gen1_update_byte_cnt_tbl(struct iwl_trans *trans, break; } - if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_9000 && + if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_7000 && trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210) len = DIV_ROUND_UP(len, 4); diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 1fffeff2190ca8..4eae89376feb55 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -277,7 +277,9 @@ static void virt_wifi_connect_complete(struct work_struct *work) priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ - cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, + cfg80211_connect_result(priv->upperdev, + priv->is_connected ? fake_router_bssid : NULL, + NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 74aaea61de13c9..d2b690857e5885 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -519,6 +519,7 @@ int of_irq_count(struct device_node *dev) return nr; } +EXPORT_SYMBOL_GPL(of_irq_count); /** * of_irq_to_resource_table - Fill in resource table with node's IRQ info diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 755651f33811d7..a72aa57591c04e 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -1168,12 +1168,6 @@ static void __iomem *mvebu_pcie_map_registers(struct platform_device *pdev, return devm_ioremap_resource(&pdev->dev, &port->regs); } -#define DT_FLAGS_TO_TYPE(flags) (((flags) >> 24) & 0x03) -#define DT_TYPE_IO 0x1 -#define DT_TYPE_MEM32 0x2 -#define DT_CPUADDR_TO_TARGET(cpuaddr) (((cpuaddr) >> 56) & 0xFF) -#define DT_CPUADDR_TO_ATTR(cpuaddr) (((cpuaddr) >> 48) & 0xFF) - static int mvebu_get_tgt_attr(struct device_node *np, int devfn, unsigned long type, unsigned int *tgt, @@ -1189,19 +1183,12 @@ static int mvebu_get_tgt_attr(struct device_node *np, int devfn, return -EINVAL; for_each_of_range(&parser, &range) { - unsigned long rtype; u32 slot = upper_32_bits(range.bus_addr); - if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_IO) - rtype = IORESOURCE_IO; - else if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_MEM32) - rtype = IORESOURCE_MEM; - else - continue; - - if (slot == PCI_SLOT(devfn) && type == rtype) { - *tgt = DT_CPUADDR_TO_TARGET(range.cpu_addr); - *attr = DT_CPUADDR_TO_ATTR(range.cpu_addr); + if (slot == PCI_SLOT(devfn) && + type == (range.flags & IORESOURCE_TYPE_BITS)) { + *tgt = (range.parent_bus_addr >> 56) & 0xFF; + *attr = (range.parent_bus_addr >> 48) & 0xFF; return 0; } } diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf2d..dfb61f152702bd 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -148,20 +148,43 @@ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *d arg->hwirq = desc->msi_index; } -static __always_inline void 
cond_mask_parent(struct irq_data *data) +static void cond_shutdown_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + irq_chip_shutdown_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_mask_parent(data); } -static __always_inline void cond_unmask_parent(struct irq_data *data) +static unsigned int cond_startup_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; - if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + return irq_chip_startup_parent(data); + else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_unmask_parent(data); + + return 0; +} + +static void pci_irq_shutdown_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + + pci_msi_mask(desc, BIT(data->irq - desc->irq)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + unsigned int ret = cond_startup_parent(data); + + pci_msi_unmask(desc, BIT(data->irq - desc->irq)); + return ret; } static void pci_irq_mask_msi(struct irq_data *data) @@ -169,14 +192,12 @@ static void pci_irq_mask_msi(struct irq_data *data) struct msi_desc *desc = irq_data_get_msi_desc(data); pci_msi_mask(desc, BIT(data->irq - desc->irq)); - cond_mask_parent(data); } static void pci_irq_unmask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); - cond_unmask_parent(data); pci_msi_unmask(desc, BIT(data->irq - desc->irq)); } @@ -194,6 +215,8 @@ static void pci_irq_unmask_msi(struct irq_data *data) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", + .irq_startup = pci_irq_startup_msi, + .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, @@ -210,15 +233,27 @@ static const struct msi_domain_template pci_msi_template = { }, }; +static void pci_irq_shutdown_msix(struct irq_data *data) +{ + pci_msix_mask(irq_data_get_msi_desc(data)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msix(struct irq_data *data) +{ + unsigned int ret = cond_startup_parent(data); + + pci_msix_unmask(irq_data_get_msi_desc(data)); + return ret; +} + static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); - cond_mask_parent(data); } static void pci_irq_unmask_msix(struct irq_data *data) { - cond_unmask_parent(data); pci_msix_unmask(irq_data_get_msi_desc(data)); } @@ -234,6 +269,8 @@ EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", + .irq_startup = pci_irq_startup_msix, + .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5eea14c1f7f5f7..c96301026f014e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1475,8 +1475,9 @@ static ssize_t reset_method_store(struct device *dev, return count; } - pm_runtime_get_sync(dev); - struct device *pmdev __free(pm_runtime_put) = dev; + ACQUIRE(pm_runtime_active_try, pm)(dev); + if 
(ACQUIRE_ERR(pm_runtime_active_try, &pm)) + return -ENXIO; if (sysfs_streq(buf, "default")) { pci_init_reset_methods(pdev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cddd..005b92e6585e91 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,6 +5932,7 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) { u16 v; int ret; + unsigned int firstbit; struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) @@ -5949,7 +5950,10 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) rq = mps; } - v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, ffs(rq) - 8); + firstbit = ffs(rq); + if (firstbit < 8) + return -EINVAL; + v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, firstbit - 8); if (bridge->no_inc_mrrs) { int max_mrrs = pcie_get_readrq(dev); diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index a9188dec36fe15..638321fc9800ca 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -178,6 +178,15 @@ config FSL_IMX9_DDR_PMU can give information about memory throughput and other related events. +config FUJITSU_UNCORE_PMU + tristate "Fujitsu Uncore PMU" + depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT) + help + Provides support for the Uncore performance monitor unit (PMU) + in Fujitsu processors. + Adds the Uncore PMU into the perf events subsystem for + monitoring Uncore events. + config QCOM_L2_PMU bool "Qualcomm Technologies L2-cache PMU" depends on ARCH_QCOM && ARM64 && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 192fc8b16204dc..ea52711a87e326 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_ARM_XSCALE_PMU) += arm_xscale_pmu.o obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o obj-$(CONFIG_FSL_IMX9_DDR_PMU) += fsl_imx9_ddr_perf.o +obj-$(CONFIG_FUJITSU_UNCORE_PMU) += fujitsu_uncore_pmu.o obj-$(CONFIG_HISI_PMU) += hisilicon/ obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 1a0d0e1a226334..8af3563fdf60a3 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -565,7 +565,7 @@ module_param_named(pmu_poll_period_us, arm_ccn_pmu_poll_period_us, uint, static ktime_t arm_ccn_pmu_timer_period(void) { - return ns_to_ktime((u64)arm_ccn_pmu_poll_period_us * 1000); + return us_to_ktime((u64)arm_ccn_pmu_poll_period_us); } diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 11fb2234b10fcf..23245352a3fc0a 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -65,7 +65,7 @@ /* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* ...except when they don't :( */ -#define CMN_S3_DTM_OFFSET 0xa000 +#define CMN_S3_R1_DTM_OFFSET 0xa000 #define CMN_S3_PMU_OFFSET 0xd900 /* For most nodes, this is all there is */ @@ -233,6 +233,9 @@ enum cmn_revision { REV_CMN700_R1P0, REV_CMN700_R2P0, REV_CMN700_R3P0, + REV_CMNS3_R0P0 = 0, + REV_CMNS3_R0P1, + REV_CMNS3_R1P0, REV_CI700_R0P0 = 0, REV_CI700_R1P0, REV_CI700_R2P0, @@ -425,8 +428,8 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) { if (cmn->part == PART_CMN_S3) { - if (dn->type == CMN_TYPE_XP) - return CMN_S3_DTM_OFFSET; + if (cmn->rev >= REV_CMNS3_R1P0 && dn->type == CMN_TYPE_XP) + return CMN_S3_R1_DTM_OFFSET; return CMN_S3_PMU_OFFSET; } return CMN_PMU_OFFSET; diff --git 
a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index f6d7bab5d555c0..69c5cc8f56067c 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -978,6 +978,32 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, return -EAGAIN; } +static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; + + if (evtype != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) + return false; + + /* + * A CPU_CYCLES event with threshold counting cannot use PMCCNTR_EL0 + * since it lacks threshold support. + */ + if (armv8pmu_event_get_threshold(&event->attr)) + return false; + + /* + * PMCCNTR_EL0 is not affected by BRBE controls like BRBCR_ELx.FZP. + * So don't use it for branch events. + */ + if (has_branch_stack(event)) + return false; + + return true; +} + static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { @@ -986,8 +1012,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; /* Always prefer to place a cycle counter into the cycle counter. */ - if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && - !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) { + if (armv8pmu_can_use_pmccntr(cpuc, event)) { if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) return ARMV8_PMU_CYCLE_IDX; else if (armv8pmu_event_is_64bit(event) && diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 369e77ad5f13ff..fa50645feddadb 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -86,9 +86,11 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_ERND (1UL << 5) #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) #define SPE_PMU_FEAT_DISCARD (1UL << 7) +#define SPE_PMU_FEAT_EFT (1UL << 8) #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; + u64 pmsevfr_res0; u16 max_record_sz; u16 align; struct perf_output_handle __percpu *handle; @@ -97,7 +99,8 @@ struct arm_spe_pmu { #define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu)) /* Convert a free-running index from perf into an SPE buffer offset */ -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ + ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* Keep track of our dynamic hotplug state */ static enum cpuhp_state arm_spe_pmu_online; @@ -115,6 +118,7 @@ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_MIN_IVAL, + SPE_PMU_CAP_EVENT_FILTER, }; static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { @@ -122,7 +126,7 @@ static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { [SPE_PMU_CAP_ERND] = SPE_PMU_FEAT_ERND, }; -static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) +static u64 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) { if (cap < SPE_PMU_CAP_FEAT_MAX) return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]); @@ -132,6 +136,8 @@ static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) return spe_pmu->counter_sz; case SPE_PMU_CAP_MIN_IVAL: return spe_pmu->min_period; + case SPE_PMU_CAP_EVENT_FILTER: + return ~spe_pmu->pmsevfr_res0; default: WARN(1, "unknown cap %d\n", cap); } @@ -148,7 +154,19 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return sysfs_emit(buf, "%u\n", 
arm_spe_pmu_cap_get(spe_pmu, cap)); + return sysfs_emit(buf, "%llu\n", arm_spe_pmu_cap_get(spe_pmu, cap)); +} + +static ssize_t arm_spe_pmu_cap_show_hex(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev); + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + int cap = (long)ea->var; + + return sysfs_emit(buf, "0x%llx\n", arm_spe_pmu_cap_get(spe_pmu, cap)); } #define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \ @@ -158,12 +176,15 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, #define SPE_CAP_EXT_ATTR_ENTRY(_name, _var) \ SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var) +#define SPE_CAP_EXT_ATTR_ENTRY_HEX(_name, _var) \ + SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show_hex, _var) static struct attribute *arm_spe_pmu_cap_attr[] = { SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST), SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND), SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ), SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL), + SPE_CAP_EXT_ATTR_ENTRY_HEX(event_filter, SPE_PMU_CAP_EVENT_FILTER), NULL, }; @@ -197,6 +218,27 @@ static const struct attribute_group arm_spe_pmu_cap_group = { #define ATTR_CFG_FLD_discard_CFG config /* PMBLIMITR_EL1.FM = DISCARD */ #define ATTR_CFG_FLD_discard_LO 35 #define ATTR_CFG_FLD_discard_HI 35 +#define ATTR_CFG_FLD_branch_filter_mask_CFG config /* PMSFCR_EL1.Bm */ +#define ATTR_CFG_FLD_branch_filter_mask_LO 36 +#define ATTR_CFG_FLD_branch_filter_mask_HI 36 +#define ATTR_CFG_FLD_load_filter_mask_CFG config /* PMSFCR_EL1.LDm */ +#define ATTR_CFG_FLD_load_filter_mask_LO 37 +#define ATTR_CFG_FLD_load_filter_mask_HI 37 +#define ATTR_CFG_FLD_store_filter_mask_CFG config /* PMSFCR_EL1.STm */ +#define ATTR_CFG_FLD_store_filter_mask_LO 38 +#define ATTR_CFG_FLD_store_filter_mask_HI 38 +#define ATTR_CFG_FLD_simd_filter_CFG config /* PMSFCR_EL1.SIMD */ +#define ATTR_CFG_FLD_simd_filter_LO 39 +#define ATTR_CFG_FLD_simd_filter_HI 39 +#define ATTR_CFG_FLD_simd_filter_mask_CFG config /* PMSFCR_EL1.SIMDm */ +#define ATTR_CFG_FLD_simd_filter_mask_LO 40 +#define ATTR_CFG_FLD_simd_filter_mask_HI 40 +#define ATTR_CFG_FLD_float_filter_CFG config /* PMSFCR_EL1.FP */ +#define ATTR_CFG_FLD_float_filter_LO 41 +#define ATTR_CFG_FLD_float_filter_HI 41 +#define ATTR_CFG_FLD_float_filter_mask_CFG config /* PMSFCR_EL1.FPm */ +#define ATTR_CFG_FLD_float_filter_mask_LO 42 +#define ATTR_CFG_FLD_float_filter_mask_HI 42 #define ATTR_CFG_FLD_event_filter_CFG config1 /* PMSEVFR_EL1 */ #define ATTR_CFG_FLD_event_filter_LO 0 @@ -215,8 +257,15 @@ GEN_PMU_FORMAT_ATTR(pa_enable); GEN_PMU_FORMAT_ATTR(pct_enable); GEN_PMU_FORMAT_ATTR(jitter); GEN_PMU_FORMAT_ATTR(branch_filter); +GEN_PMU_FORMAT_ATTR(branch_filter_mask); GEN_PMU_FORMAT_ATTR(load_filter); +GEN_PMU_FORMAT_ATTR(load_filter_mask); GEN_PMU_FORMAT_ATTR(store_filter); +GEN_PMU_FORMAT_ATTR(store_filter_mask); +GEN_PMU_FORMAT_ATTR(simd_filter); +GEN_PMU_FORMAT_ATTR(simd_filter_mask); +GEN_PMU_FORMAT_ATTR(float_filter); +GEN_PMU_FORMAT_ATTR(float_filter_mask); GEN_PMU_FORMAT_ATTR(event_filter); GEN_PMU_FORMAT_ATTR(inv_event_filter); GEN_PMU_FORMAT_ATTR(min_latency); @@ -228,8 +277,15 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { &format_attr_pct_enable.attr, &format_attr_jitter.attr, &format_attr_branch_filter.attr, + &format_attr_branch_filter_mask.attr, &format_attr_load_filter.attr, + &format_attr_load_filter_mask.attr, &format_attr_store_filter.attr, + &format_attr_store_filter_mask.attr, + 
&format_attr_simd_filter.attr, + &format_attr_simd_filter_mask.attr, + &format_attr_float_filter.attr, + &format_attr_float_filter_mask.attr, &format_attr_event_filter.attr, &format_attr_inv_event_filter.attr, &format_attr_min_latency.attr, @@ -250,6 +306,16 @@ static umode_t arm_spe_pmu_format_attr_is_visible(struct kobject *kobj, if (attr == &format_attr_inv_event_filter.attr && !(spe_pmu->features & SPE_PMU_FEAT_INV_FILT_EVT)) return 0; + if ((attr == &format_attr_branch_filter_mask.attr || + attr == &format_attr_load_filter_mask.attr || + attr == &format_attr_store_filter_mask.attr || + attr == &format_attr_simd_filter.attr || + attr == &format_attr_simd_filter_mask.attr || + attr == &format_attr_float_filter.attr || + attr == &format_attr_float_filter_mask.attr) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return 0; + return attr->mode; } @@ -345,8 +411,15 @@ static u64 arm_spe_event_to_pmsfcr(struct perf_event *event) u64 reg = 0; reg |= FIELD_PREP(PMSFCR_EL1_LD, ATTR_CFG_GET_FLD(attr, load_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_LDm, ATTR_CFG_GET_FLD(attr, load_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_ST, ATTR_CFG_GET_FLD(attr, store_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_STm, ATTR_CFG_GET_FLD(attr, store_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_B, ATTR_CFG_GET_FLD(attr, branch_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_Bm, ATTR_CFG_GET_FLD(attr, branch_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMD, ATTR_CFG_GET_FLD(attr, simd_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMDm, ATTR_CFG_GET_FLD(attr, simd_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_FP, ATTR_CFG_GET_FLD(attr, float_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_FPm, ATTR_CFG_GET_FLD(attr, float_filter_mask)); if (reg) reg |= PMSFCR_EL1_FT; @@ -697,20 +770,6 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) return IRQ_HANDLED; } -static u64 arm_spe_pmsevfr_res0(u16 pmsver) -{ - switch (pmsver) { - case ID_AA64DFR0_EL1_PMSVer_IMP: - return PMSEVFR_EL1_RES0_IMP; - case ID_AA64DFR0_EL1_PMSVer_V1P1: - return PMSEVFR_EL1_RES0_V1P1; - case ID_AA64DFR0_EL1_PMSVer_V1P2: - /* Return the highest version we support in default */ - default: - return PMSEVFR_EL1_RES0_V1P2; - } -} - /* Perf callbacks */ static int arm_spe_pmu_event_init(struct perf_event *event) { @@ -726,10 +785,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus)) return -ENOENT; - if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; - if (arm_spe_event_to_pmsnevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsnevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; if (attr->exclude_idle) @@ -762,6 +821,16 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT)) return -EOPNOTSUPP; + if ((FIELD_GET(PMSFCR_EL1_LDm, reg) || + FIELD_GET(PMSFCR_EL1_STm, reg) || + FIELD_GET(PMSFCR_EL1_Bm, reg) || + FIELD_GET(PMSFCR_EL1_SIMD, reg) || + FIELD_GET(PMSFCR_EL1_SIMDm, reg) || + FIELD_GET(PMSFCR_EL1_FP, reg) || + FIELD_GET(PMSFCR_EL1_FPm, reg)) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return -EOPNOTSUPP; + if (ATTR_CFG_GET_FLD(&event->attr, discard) && !(spe_pmu->features & SPE_PMU_FEAT_DISCARD)) return -EOPNOTSUPP; @@ -1053,6 +1122,9 @@ static void __arm_spe_pmu_dev_probe(void *info) if (spe_pmu->pmsver >= ID_AA64DFR0_EL1_PMSVer_V1P2) spe_pmu->features |= SPE_PMU_FEAT_DISCARD; + if 
(FIELD_GET(PMSIDR_EL1_EFT, reg)) + spe_pmu->features |= SPE_PMU_FEAT_EFT; + /* This field has a spaced out encoding, so just use a look-up */ fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); switch (fld) { @@ -1107,6 +1179,10 @@ static void __arm_spe_pmu_dev_probe(void *info) spe_pmu->counter_sz = 16; } + /* Write all 1s and then read back. Unsupported filter bits are RAZ/WI. */ + write_sysreg_s(U64_MAX, SYS_PMSEVFR_EL1); + spe_pmu->pmsevfr_res0 = ~read_sysreg_s(SYS_PMSEVFR_EL1); + dev_info(dev, "probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", spe_pmu->pmsver - 1, cpumask_pr_args(&spe_pmu->supported_cpus), diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 146ff57813fb16..22f73ac894e959 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -39,6 +39,10 @@ #define DWC_PCIE_EVENT_CLEAR GENMASK(1, 0) #define DWC_PCIE_EVENT_PER_CLEAR 0x1 +/* Event Selection Field has two subfields */ +#define DWC_PCIE_CNT_EVENT_SEL_GROUP GENMASK(11, 8) +#define DWC_PCIE_CNT_EVENT_SEL_EVID GENMASK(7, 0) + #define DWC_PCIE_EVENT_CNT_DATA 0xC #define DWC_PCIE_TIME_BASED_ANAL_CTL 0x10 @@ -73,6 +77,10 @@ enum dwc_pcie_event_type { DWC_PCIE_EVENT_TYPE_MAX, }; +#define DWC_PCIE_LANE_GROUP_6 6 +#define DWC_PCIE_LANE_GROUP_7 7 +#define DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP 256 + #define DWC_PCIE_LANE_EVENT_MAX_PERIOD GENMASK_ULL(31, 0) #define DWC_PCIE_MAX_PERIOD GENMASK_ULL(63, 0) @@ -82,8 +90,11 @@ struct dwc_pcie_pmu { u16 ras_des_offset; u32 nr_lanes; + /* Groups #6 and #7 */ + DECLARE_BITMAP(lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + struct perf_event *time_based_event; + struct hlist_node cpuhp_node; - struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; int on_cpu; }; @@ -246,19 +257,26 @@ static const struct attribute_group *dwc_pcie_attr_groups[] = { }; static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu, + struct perf_event *event, bool enable) { struct pci_dev *pdev = pcie_pmu->pdev; u16 ras_des_offset = pcie_pmu->ras_des_offset; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); + u32 ctrl; + + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); if (enable) - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); else - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); } static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu, @@ -276,11 +294,22 @@ static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); struct pci_dev *pdev = pcie_pmu->pdev; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); u16 ras_des_offset = pcie_pmu->ras_des_offset; - u32 val; + u32 val, ctrl; + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, 
&val); + ctrl |= FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); + return val; } @@ -329,26 +358,77 @@ static void dwc_pcie_pmu_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); - u64 delta, prev, now = 0; + u64 delta, prev, now; + + if (type == DWC_PCIE_LANE_EVENT) { + now = dwc_pcie_pmu_read_lane_event_counter(event) & + DWC_PCIE_LANE_EVENT_MAX_PERIOD; + local64_add(now, &event->count); + return; + } do { prev = local64_read(&hwc->prev_count); - - if (type == DWC_PCIE_LANE_EVENT) - now = dwc_pcie_pmu_read_lane_event_counter(event); - else if (type == DWC_PCIE_TIME_BASE_EVENT) - now = dwc_pcie_pmu_read_time_based_counter(event); + now = dwc_pcie_pmu_read_time_based_counter(event); } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); delta = (now - prev) & DWC_PCIE_MAX_PERIOD; - /* 32-bit counter for Lane Event Counting */ - if (type == DWC_PCIE_LANE_EVENT) - delta &= DWC_PCIE_LANE_EVENT_MAX_PERIOD; - local64_add(delta, &event->count); } +static int dwc_pcie_pmu_validate_add_lane_event(struct perf_event *event, + unsigned long val_lane_events[]) +{ + int event_id, event_nr, group; + + event_id = DWC_PCIE_EVENT_ID(event); + event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id); + + if (group != DWC_PCIE_LANE_GROUP_6 && group != DWC_PCIE_LANE_GROUP_7) + return -EINVAL; + + group -= DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + val_lane_events)) + return -EINVAL; + + return 0; +} + +static int dwc_pcie_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + bool time_event = false; + int type; + + type = DWC_PCIE_EVENT_TYPE(leader); + if (type == DWC_PCIE_TIME_BASE_EVENT) + time_event = true; + else + if (dwc_pcie_pmu_validate_add_lane_event(leader, val_lane_events)) + return -ENOSPC; + + for_each_sibling_event(sibling, leader) { + type = DWC_PCIE_EVENT_TYPE(sibling); + if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (time_event) + return -ENOSPC; + + time_event = true; + continue; + } + + if (dwc_pcie_pmu_validate_add_lane_event(sibling, val_lane_events)) + return -ENOSPC; + } + + return 0; +} + static int dwc_pcie_pmu_event_init(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); @@ -367,10 +447,6 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event) if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - if (event->group_leader != event && - !is_software_event(event->group_leader)) - return -EINVAL; - for_each_sibling_event(sibling, event->group_leader) { if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; @@ -385,6 +461,9 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event) return -EINVAL; } + if (dwc_pcie_pmu_validate_group(event)) + return -ENOSPC; + event->cpu = pcie_pmu->on_cpu; return 0; @@ -400,7 +479,7 @@ static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags) local64_set(&hwc->prev_count, 0); if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, true); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, true); else if (type == DWC_PCIE_TIME_BASE_EVENT) 
dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true); } @@ -414,12 +493,13 @@ static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags) if (event->hw.state & PERF_HES_STOPPED) return; + dwc_pcie_pmu_event_update(event); + if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, false); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, false); else if (type == DWC_PCIE_TIME_BASE_EVENT) dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false); - dwc_pcie_pmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } @@ -434,14 +514,17 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) u16 ras_des_offset = pcie_pmu->ras_des_offset; u32 ctrl; - /* one counter for each type and it is in use */ - if (pcie_pmu->event[type]) - return -ENOSPC; - - pcie_pmu->event[type] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (type == DWC_PCIE_LANE_EVENT) { + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events)) + return -ENOSPC; + /* EVENT_COUNTER_DATA_REG needs clear manually */ ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | @@ -450,6 +533,11 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, ctrl); } else if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (pcie_pmu->time_based_event) + return -ENOSPC; + + pcie_pmu->time_based_event = event; + /* * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely * use it with any manually controlled duration. 
And it is @@ -478,7 +566,18 @@ static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags) dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE); perf_event_update_userpage(event); - pcie_pmu->event[type] = NULL; + + if (type == DWC_PCIE_TIME_BASE_EVENT) { + pcie_pmu->time_based_event = NULL; + } else { + int event_id = DWC_PCIE_EVENT_ID(event); + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + clear_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events); + } } static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 267754fdf58146..7050b48c046717 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -104,6 +104,11 @@ static const struct imx_ddr_devtype_data imx93_devtype_data = { .filter_ver = DDR_PERF_AXI_FILTER_V1 }; +static const struct imx_ddr_devtype_data imx94_devtype_data = { + .identifier = "imx94", + .filter_ver = DDR_PERF_AXI_FILTER_V2 +}; + static const struct imx_ddr_devtype_data imx95_devtype_data = { .identifier = "imx95", .filter_ver = DDR_PERF_AXI_FILTER_V2 @@ -122,6 +127,7 @@ static inline bool axi_filter_v2(struct ddr_pmu *pmu) static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data }, { .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data }, + { .compatible = "fsl,imx94-ddr-pmu", .data = &imx94_devtype_data }, { .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data }, { /* sentinel */ } }; diff --git a/drivers/perf/fujitsu_uncore_pmu.c b/drivers/perf/fujitsu_uncore_pmu.c new file mode 100644 index 00000000000000..c3c6f56474adda --- /dev/null +++ b/drivers/perf/fujitsu_uncore_pmu.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Driver for the Uncore PMUs in Fujitsu chips. + * + * See Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst for more details. + * + * Copyright (c) 2025 Fujitsu. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Number of counters on each PMU */ +#define MAC_NUM_COUNTERS 8 +#define PCI_NUM_COUNTERS 8 +/* Mask for the event type field within perf_event_attr.config and EVTYPE reg */ +#define UNCORE_EVTYPE_MASK 0xFF + +/* Perfmon registers */ +#define PM_EVCNTR(__cntr) (0x000 + (__cntr) * 8) +#define PM_CNTCTL(__cntr) (0x100 + (__cntr) * 8) +#define PM_CNTCTL_RESET 0 +#define PM_EVTYPE(__cntr) (0x200 + (__cntr) * 8) +#define PM_EVTYPE_EVSEL(__val) FIELD_GET(UNCORE_EVTYPE_MASK, __val) +#define PM_CR 0x400 +#define PM_CR_RESET BIT(1) +#define PM_CR_ENABLE BIT(0) +#define PM_CNTENSET 0x410 +#define PM_CNTENSET_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR 0x418 +#define PM_CNTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR_RESET 0xFF +#define PM_INTENSET 0x420 +#define PM_INTENSET_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR 0x428 +#define PM_INTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR_RESET 0xFF +#define PM_OVSR 0x440 +#define PM_OVSR_OVSRCLR_RESET 0xFF + +enum fujitsu_uncore_pmu { + FUJITSU_UNCORE_PMU_MAC = 1, + FUJITSU_UNCORE_PMU_PCI = 2, +}; + +struct uncore_pmu { + int num_counters; + struct pmu pmu; + struct hlist_node node; + void __iomem *regs; + struct perf_event **events; + unsigned long *used_mask; + int cpu; + int irq; + struct device *dev; +}; + +#define to_uncore_pmu(p) (container_of(p, struct uncore_pmu, pmu)) + +static int uncore_pmu_cpuhp_state; + +static void fujitsu_uncore_counter_start(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Initialize the hardware counter and reset prev_count*/ + local64_set(&event->hw.prev_count, 0); + writeq_relaxed(0, uncorepmu->regs + PM_EVCNTR(idx)); + + /* Set the event type */ + writeq_relaxed(PM_EVTYPE_EVSEL(event->attr.config), uncorepmu->regs + PM_EVTYPE(idx)); + + /* Enable interrupt generation by this counter */ + writeq_relaxed(PM_INTENSET_IDX(idx), uncorepmu->regs + PM_INTENSET); + + /* Finally, enable the counter */ + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(idx)); + writeq_relaxed(PM_CNTENSET_IDX(idx), uncorepmu->regs + PM_CNTENSET); +} + +static void fujitsu_uncore_counter_stop(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Disable the counter */ + writeq_relaxed(PM_CNTENCLR_IDX(idx), uncorepmu->regs + PM_CNTENCLR); + + /* Disable interrupt generation by this counter */ + writeq_relaxed(PM_INTENCLR_IDX(idx), uncorepmu->regs + PM_INTENCLR); +} + +static void fujitsu_uncore_counter_update(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + u64 prev, new; + + do { + prev = local64_read(&event->hw.prev_count); + new = readq_relaxed(uncorepmu->regs + PM_EVCNTR(idx)); + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + local64_add(new - prev, &event->count); +} + +static inline void fujitsu_uncore_init(struct uncore_pmu *uncorepmu) +{ + int i; + + writeq_relaxed(PM_CR_RESET, uncorepmu->regs + PM_CR); + + writeq_relaxed(PM_CNTENCLR_RESET, uncorepmu->regs + PM_CNTENCLR); + writeq_relaxed(PM_INTENCLR_RESET, uncorepmu->regs + PM_INTENCLR); + writeq_relaxed(PM_OVSR_OVSRCLR_RESET, uncorepmu->regs + PM_OVSR); + + for (i = 0; i < uncorepmu->num_counters; ++i) { + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(i)); + writeq_relaxed(PM_EVTYPE_EVSEL(0), uncorepmu->regs 
+ PM_EVTYPE(i)); + } + writeq_relaxed(PM_CR_ENABLE, uncorepmu->regs + PM_CR); +} + +static irqreturn_t fujitsu_uncore_handle_irq(int irq_num, void *data) +{ + struct uncore_pmu *uncorepmu = data; + /* Read the overflow status register */ + long status = readq_relaxed(uncorepmu->regs + PM_OVSR); + int idx; + + if (status == 0) + return IRQ_NONE; + + /* Clear the bits we read on the overflow status register */ + writeq_relaxed(status, uncorepmu->regs + PM_OVSR); + + for_each_set_bit(idx, &status, uncorepmu->num_counters) { + struct perf_event *event; + + event = uncorepmu->events[idx]; + if (!event) + continue; + + fujitsu_uncore_counter_update(event); + } + + return IRQ_HANDLED; +} + +static void fujitsu_uncore_pmu_enable(struct pmu *pmu) +{ + writeq_relaxed(PM_CR_ENABLE, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static void fujitsu_uncore_pmu_disable(struct pmu *pmu) +{ + writeq_relaxed(0, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static bool fujitsu_uncore_validate_event_group(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct perf_event *leader = event->group_leader; + struct perf_event *sibling; + int counters = 1; + + if (leader == event) + return true; + + if (leader->pmu == event->pmu) + counters++; + + for_each_sibling_event(sibling, leader) { + if (sibling->pmu == event->pmu) + counters++; + } + + /* + * If the group requires more counters than the HW has, it + * cannot ever be scheduled. + */ + return counters <= uncorepmu->num_counters; +} + +static int fujitsu_uncore_event_init(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Is the event for this PMU? */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * Sampling not supported since these events are not + * core-attributable. + */ + if (is_sampling_event(event)) + return -EINVAL; + + /* + * Task mode not available, we run the counters as socket counters, + * not attributable to any CPU and therefore cannot attribute per-task. + */ + if (event->cpu < 0) + return -EINVAL; + + /* Validate the group */ + if (!fujitsu_uncore_validate_event_group(event)) + return -EINVAL; + + hwc->idx = -1; + + event->cpu = uncorepmu->cpu; + + return 0; +} + +static void fujitsu_uncore_event_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = 0; + fujitsu_uncore_counter_start(event); +} + +static void fujitsu_uncore_event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + fujitsu_uncore_counter_stop(event); + if (flags & PERF_EF_UPDATE) + fujitsu_uncore_counter_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int fujitsu_uncore_event_add(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx; + + /* Try to allocate a counter. */ + idx = bitmap_find_free_region(uncorepmu->used_mask, uncorepmu->num_counters, 0); + if (idx < 0) + /* The counters are all in use. */ + return -EAGAIN; + + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + uncorepmu->events[idx] = event; + + if (flags & PERF_EF_START) + fujitsu_uncore_event_start(event, 0); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + + return 0; +} + +static void fujitsu_uncore_event_del(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Stop and clean up */ + fujitsu_uncore_event_stop(event, flags | PERF_EF_UPDATE); + uncorepmu->events[hwc->idx] = NULL; + bitmap_release_region(uncorepmu->used_mask, hwc->idx, 0); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); +} + +static void fujitsu_uncore_event_read(struct perf_event *event) +{ + fujitsu_uncore_counter_update(event); +} + +#define UNCORE_PMU_FORMAT_ATTR(_name, _config) \ + (&((struct dev_ext_attribute[]) { \ + { .attr = __ATTR(_name, 0444, device_show_string, NULL), \ + .var = (void *)_config, } \ + })[0].attr.attr) + +static struct attribute *fujitsu_uncore_pmu_formats[] = { + UNCORE_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_format_group = { + .name = "format", + .attrs = fujitsu_uncore_pmu_formats, +}; + +static ssize_t fujitsu_uncore_pmu_event_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define MAC_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_mac_pmu_events[] = { + MAC_EVENT_ATTR(cycles, 0x00), + MAC_EVENT_ATTR(read-count, 0x10), + MAC_EVENT_ATTR(read-count-request, 0x11), + MAC_EVENT_ATTR(read-count-return, 0x12), + MAC_EVENT_ATTR(read-count-request-pftgt, 0x13), + MAC_EVENT_ATTR(read-count-request-normal, 0x14), + MAC_EVENT_ATTR(read-count-return-pftgt-hit, 0x15), + MAC_EVENT_ATTR(read-count-return-pftgt-miss, 0x16), + MAC_EVENT_ATTR(read-wait, 0x17), + MAC_EVENT_ATTR(write-count, 0x20), + MAC_EVENT_ATTR(write-count-write, 0x21), + MAC_EVENT_ATTR(write-count-pwrite, 0x22), + MAC_EVENT_ATTR(memory-read-count, 0x40), + MAC_EVENT_ATTR(memory-write-count, 0x50), + MAC_EVENT_ATTR(memory-pwrite-count, 0x60), + MAC_EVENT_ATTR(ea-mac, 0x80), + MAC_EVENT_ATTR(ea-memory, 0x90), + MAC_EVENT_ATTR(ea-memory-mac-write, 0x92), + MAC_EVENT_ATTR(ea-ha, 0xa0), + NULL +}; + +#define PCI_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_pci_pmu_events[] = { + PCI_EVENT_ATTR(pci-port0-cycles, 0x00), + PCI_EVENT_ATTR(pci-port0-read-count, 0x10), + PCI_EVENT_ATTR(pci-port0-read-count-bus, 0x14), + PCI_EVENT_ATTR(pci-port0-write-count, 0x20), + PCI_EVENT_ATTR(pci-port0-write-count-bus, 0x24), + PCI_EVENT_ATTR(pci-port1-cycles, 0x40), + PCI_EVENT_ATTR(pci-port1-read-count, 0x50), + PCI_EVENT_ATTR(pci-port1-read-count-bus, 0x54), + PCI_EVENT_ATTR(pci-port1-write-count, 0x60), + PCI_EVENT_ATTR(pci-port1-write-count-bus, 0x64), + PCI_EVENT_ATTR(ea-pci, 0x80), + NULL +}; + +static const struct attribute_group fujitsu_uncore_mac_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_mac_pmu_events, +}; + +static const struct attribute_group fujitsu_uncore_pci_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_pci_pmu_events, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, 
cpumask_of(uncorepmu->cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *fujitsu_uncore_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_cpumask_attr_group = { + .attrs = fujitsu_uncore_pmu_cpumask_attrs, +}; + +static const struct attribute_group *fujitsu_uncore_mac_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_mac_pmu_events_group, + &fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static const struct attribute_group *fujitsu_uncore_pci_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_pci_pmu_events_group, + &fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static void fujitsu_uncore_pmu_migrate(struct uncore_pmu *uncorepmu, unsigned int cpu) +{ + perf_pmu_migrate_context(&uncorepmu->pmu, uncorepmu->cpu, cpu); + irq_set_affinity(uncorepmu->irq, cpumask_of(cpu)); + uncorepmu->cpu = cpu; +} + +static int fujitsu_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + node = dev_to_node(uncorepmu->dev); + if (cpu_to_node(uncorepmu->cpu) != node && cpu_to_node(cpu) == node) + fujitsu_uncore_pmu_migrate(uncorepmu, cpu); + + return 0; +} + +static int fujitsu_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + unsigned int target; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + if (cpu != uncorepmu->cpu) + return 0; + + node = dev_to_node(uncorepmu->dev); + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target < nr_cpu_ids) + fujitsu_uncore_pmu_migrate(uncorepmu, target); + + return 0; +} + +static int fujitsu_uncore_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long device_type = (unsigned long)device_get_match_data(dev); + const struct attribute_group **attr_groups; + struct uncore_pmu *uncorepmu; + struct resource *memrc; + size_t alloc_size; + char *name; + int ret; + int irq; + u64 uid; + + ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid); + if (ret) + return dev_err_probe(dev, ret, "unable to read ACPI uid\n"); + + uncorepmu = devm_kzalloc(dev, sizeof(*uncorepmu), GFP_KERNEL); + if (!uncorepmu) + return -ENOMEM; + uncorepmu->dev = dev; + uncorepmu->cpu = cpumask_local_spread(0, dev_to_node(dev)); + platform_set_drvdata(pdev, uncorepmu); + + switch (device_type) { + case FUJITSU_UNCORE_PMU_MAC: + uncorepmu->num_counters = MAC_NUM_COUNTERS; + attr_groups = fujitsu_uncore_mac_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "mac_iod%llu_mac%llu_ch%llu", + (uid >> 8) & 0xF, (uid >> 4) & 0xF, uid & 0xF); + break; + case FUJITSU_UNCORE_PMU_PCI: + uncorepmu->num_counters = PCI_NUM_COUNTERS; + attr_groups = fujitsu_uncore_pci_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "pci_iod%llu_pci%llu", + (uid >> 4) & 0xF, uid & 0xF); + break; + default: + return dev_err_probe(dev, -EINVAL, "illegal device type: %lu\n", device_type); + } + if (!name) + return -ENOMEM; + + uncorepmu->pmu = (struct pmu) { + .parent = dev, + .task_ctx_nr = perf_invalid_context, + + .attr_groups = attr_groups, + + .pmu_enable = fujitsu_uncore_pmu_enable, + .pmu_disable = fujitsu_uncore_pmu_disable, + .event_init = fujitsu_uncore_event_init, + .add = 
fujitsu_uncore_event_add, + .del = fujitsu_uncore_event_del, + .start = fujitsu_uncore_event_start, + .stop = fujitsu_uncore_event_stop, + .read = fujitsu_uncore_event_read, + + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + }; + + alloc_size = sizeof(uncorepmu->events[0]) * uncorepmu->num_counters; + uncorepmu->events = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->events) + return -ENOMEM; + + alloc_size = sizeof(uncorepmu->used_mask[0]) * BITS_TO_LONGS(uncorepmu->num_counters); + uncorepmu->used_mask = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->used_mask) + return -ENOMEM; + + uncorepmu->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &memrc); + if (IS_ERR(uncorepmu->regs)) + return PTR_ERR(uncorepmu->regs); + + fujitsu_uncore_init(uncorepmu); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, fujitsu_uncore_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + name, uncorepmu); + if (ret) + return dev_err_probe(dev, ret, "Failed to request IRQ:%d\n", irq); + + ret = irq_set_affinity(irq, cpumask_of(uncorepmu->cpu)); + if (ret) + return dev_err_probe(dev, ret, "Failed to set irq affinity:%d\n", irq); + + uncorepmu->irq = irq; + + /* Add this instance to the list used by the offline callback */ + ret = cpuhp_state_add_instance(uncore_pmu_cpuhp_state, &uncorepmu->node); + if (ret) + return dev_err_probe(dev, ret, "Error registering hotplug"); + + ret = perf_pmu_register(&uncorepmu->pmu, name, -1); + if (ret < 0) { + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); + return dev_err_probe(dev, ret, "Failed to register %s PMU\n", name); + } + + dev_dbg(dev, "Registered %s, type: %d\n", name, uncorepmu->pmu.type); + + return 0; +} + +static void fujitsu_uncore_pmu_remove(struct platform_device *pdev) +{ + struct uncore_pmu *uncorepmu = platform_get_drvdata(pdev); + + writeq_relaxed(0, uncorepmu->regs + PM_CR); + + perf_pmu_unregister(&uncorepmu->pmu); + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); +} + +static const struct acpi_device_id fujitsu_uncore_pmu_acpi_match[] = { + { "FUJI200C", FUJITSU_UNCORE_PMU_MAC }, + { "FUJI200D", FUJITSU_UNCORE_PMU_PCI }, + { } +}; +MODULE_DEVICE_TABLE(acpi, fujitsu_uncore_pmu_acpi_match); + +static struct platform_driver fujitsu_uncore_pmu_driver = { + .driver = { + .name = "fujitsu-uncore-pmu", + .acpi_match_table = fujitsu_uncore_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = fujitsu_uncore_pmu_probe, + .remove = fujitsu_uncore_pmu_remove, +}; + +static int __init fujitsu_uncore_pmu_init(void) +{ + int ret; + + /* Install a hook to update the reader CPU in case it goes offline */ + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/fujitsu/uncore:online", + fujitsu_uncore_pmu_online_cpu, + fujitsu_uncore_pmu_offline_cpu); + if (ret < 0) + return ret; + + uncore_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&fujitsu_uncore_pmu_driver); + if (ret) + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); + + return ret; +} + +static void __exit fujitsu_uncore_pmu_exit(void) +{ + platform_driver_unregister(&fujitsu_uncore_pmu_driver); + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); +} + +module_init(fujitsu_uncore_pmu_init); +module_exit(fujitsu_uncore_pmu_exit); + +MODULE_AUTHOR("Koichi Okuno "); +MODULE_DESCRIPTION("Fujitsu Uncore PMU driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile 
index 48dcc8381ea75d..186be3d02238b1 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ - hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o + hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \ + hisi_uncore_noc_pmu.o hisi_uncore_mn_pmu.o obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 412fc3a979639c..bbd81a43047d28 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -39,6 +39,7 @@ /* L3C has 8-counters */ #define L3C_NR_COUNTERS 0x8 +#define L3C_MAX_EXT 2 #define L3C_PERF_CTRL_EN 0x10000 #define L3C_TRACETAG_EN BIT(31) @@ -55,59 +56,152 @@ #define L3C_V1_NR_EVENTS 0x59 #define L3C_V2_NR_EVENTS 0xFF -HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config1, 7, 0); +HISI_PMU_EVENT_ATTR_EXTRACTOR(ext, config, 17, 16); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16); +HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0); -static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +struct hisi_l3c_pmu { + struct hisi_pmu l3c_pmu; + + /* MMIO and IRQ resources for extension events */ + void __iomem *ext_base[L3C_MAX_EXT]; + int ext_irq[L3C_MAX_EXT]; + int ext_num; +}; + +#define to_hisi_l3c_pmu(_l3c_pmu) \ + container_of(_l3c_pmu, struct hisi_l3c_pmu, l3c_pmu) + +/* + * The hardware counter idx used in counter enable/disable, + * interrupt enable/disable and status check, etc. + */ +#define L3C_HW_IDX(_cntr_idx) ((_cntr_idx) % L3C_NR_COUNTERS) + +/* Range of ext counters in used mask. */ +#define L3C_CNTR_EXT_L(_ext) (((_ext) + 1) * L3C_NR_COUNTERS) +#define L3C_CNTR_EXT_H(_ext) (((_ext) + 2) * L3C_NR_COUNTERS) + +struct hisi_l3c_pmu_ext { + bool support_ext; +}; + +static bool support_ext(struct hisi_l3c_pmu *pmu) +{ + struct hisi_l3c_pmu_ext *l3c_pmu_ext = pmu->l3c_pmu.dev_info->private; + + return l3c_pmu_ext->support_ext; +} + +static int hisi_l3c_pmu_get_event_idx(struct perf_event *event) { struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + int ext = hisi_get_ext(event); + int idx; + + /* + * For an L3C PMU that supports extension events, we can monitor a + * maximum of 2 * num_counters to 3 * num_counters events, depending on + * the number of ext regions supported by the hardware. Thus use bits + * [0, num_counters - 1] for normal events and bits + * [ext * num_counters, (ext + 1) * num_counters - 1] for extension + * events. The idx allocation remains unchanged for normal events, and + * the idx also tells us whether an event is an extension event or not. + * + * Since normal events and extension events live in different address + * spaces, save the appropriate base address in event->hw.event_base.
+ */ + if (ext && !support_ext(hisi_l3c_pmu)) + return -EOPNOTSUPP; + + if (ext) + event->hw.event_base = (unsigned long)hisi_l3c_pmu->ext_base[ext - 1]; + else + event->hw.event_base = (unsigned long)l3c_pmu->base; + + ext -= 1; + idx = find_next_zero_bit(used_mask, L3C_CNTR_EXT_H(ext), L3C_CNTR_EXT_L(ext)); + + if (idx >= L3C_CNTR_EXT_H(ext)) + return -EAGAIN; + + set_bit(idx, used_mask); + + return idx; +} + +static u32 hisi_l3c_pmu_event_readl(struct hw_perf_event *hwc, u32 reg) +{ + return readl((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writel(struct hw_perf_event *hwc, u32 reg, u32 val) +{ + writel(val, (void __iomem *)hwc->event_base + reg); +} + +static u64 hisi_l3c_pmu_event_readq(struct hw_perf_event *hwc, u32 reg) +{ + return readq((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writeq(struct hw_perf_event *hwc, u32 reg, u64 val) +{ + writeq(val, (void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Set request-type for tracetag */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= tt_req << L3C_TRACETAG_REQ_SHIFT; val |= L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Enable request-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_clear_req_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Clear request-type */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val &= ~(tt_req << L3C_TRACETAG_REQ_SHIFT); val &= ~L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Disable request-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u32 reg, reg_idx, shift, val; - int idx = hwc->idx; + int idx = L3C_HW_IDX(hwc->idx); /* * Select the appropriate datasource register(L3C_DATSRC_TYPE0/1). 
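+ * (With idx first folded by L3C_HW_IDX(), ext counters reuse the same + * per-region layout: four 8-bit datasource fields per 32-bit register, + * as the idx % 4 arithmetic below shows.)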
@@ -120,15 +214,15 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) reg_idx = idx % 4; shift = 8 * reg_idx; - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_DATSRC_MASK << shift); val |= ds_cfg << shift; - writel(val, l3c_pmu->base + reg); + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_config_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -138,15 +232,15 @@ static void hisi_l3c_pmu_config_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val |= L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_clear_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -156,57 +250,63 @@ static void hisi_l3c_pmu_clear_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val &= ~L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_config_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Config and enable core information */ - writel(core, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, core); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Enable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Clear core information */ - writel(L3C_COER_NONE, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, L3C_COER_NONE); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Disable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val &= ~L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } +static bool hisi_l3c_pmu_have_filter(struct perf_event *event) +{ + return hisi_get_tt_req(event) || hisi_get_tt_core(event) || + hisi_get_datasrc_cfg(event) || hisi_get_datasrc_skt(event); +} + static void hisi_l3c_pmu_enable_filter(struct perf_event *event) { 
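+	/* + * tt_core has moved from config1 [7:0] to config2 [15:0], so test + * each filter field via hisi_l3c_pmu_have_filter() instead of + * checking config1 != 0. + */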
- if (event->attr.config1 != 0x0) { + if (hisi_l3c_pmu_have_filter(event)) { hisi_l3c_pmu_config_req_tracetag(event); hisi_l3c_pmu_config_core_tracetag(event); hisi_l3c_pmu_config_ds(event); @@ -215,38 +315,53 @@ static void hisi_l3c_pmu_enable_filter(struct perf_event *event) static void hisi_l3c_pmu_disable_filter(struct perf_event *event) { - if (event->attr.config1 != 0x0) { + if (hisi_l3c_pmu_have_filter(event)) { hisi_l3c_pmu_clear_ds(event); hisi_l3c_pmu_clear_core_tracetag(event); hisi_l3c_pmu_clear_req_tracetag(event); } } +static int hisi_l3c_pmu_check_filter(struct perf_event *event) +{ + struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ext = hisi_get_ext(event); + + if (ext < 0 || ext > hisi_l3c_pmu->ext_num) + return -EINVAL; + + return 0; +} + /* * Select the counter register offset using the counter index */ static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) { - return (L3C_CNTR0_LOWER + (cntr_idx * 8)); + return L3C_CNTR0_LOWER + L3C_HW_IDX(cntr_idx) * 8; } static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc) { - return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + return hisi_l3c_pmu_event_readq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx)); } static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc, u64 val) { - writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + hisi_l3c_pmu_event_writeq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx), val); } static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, u32 type) { + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; u32 reg, reg_idx, shift, val; + idx = L3C_HW_IDX(idx); + /* * Select the appropriate event select register(L3C_EVENT_TYPE0/1). * There are 2 event select registers for the 8 hardware counters. @@ -259,36 +374,72 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, shift = 8 * reg_idx; /* Write event code to L3C_EVENT_TYPEx Register */ - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_EVTYPE_NONE << shift); - val |= (type << shift); - writel(val, l3c_pmu->base + reg); + val |= type << shift; + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Set perf_enable bit in L3C_PERF_CTRL register to start counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, enable it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* Then enable any used counters in the ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find a used counter in this ext range; skip the range if there is none.
*/ + used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i)); + if (used_cntr >= L3C_CNTR_EXT_H(i)) + continue; + + val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + } } static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Clear perf_enable bit in L3C_PERF_CTRL register to stop counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, stop it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); - val &= ~(L3C_PERF_CTRL_EN); - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val &= ~L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* Then stop any used counters in the ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find a used counter in this ext range; skip the range if there is none. */ + used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i)); + if (used_cntr >= L3C_CNTR_EXT_H(i)) + continue; + + val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + val &= ~L3C_PERF_CTRL_EN; + writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + } } static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, @@ -297,9 +448,9 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Enable counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); - val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); + val |= 1 << L3C_HW_IDX(hwc->idx); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, @@ -308,9 +459,9 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Clear counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); - val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); + val &= ~(1 << L3C_HW_IDX(hwc->idx)); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, @@ -318,10 +469,10 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 0 to enable interrupt */ - val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + val &= ~(1 << L3C_HW_IDX(hwc->idx)); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, @@ -329,28 +480,37 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 1 to mask interrupt */ - val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + val |= 1 << L3C_HW_IDX(hwc->idx); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) { - return readl(l3c_pmu->base + L3C_INT_STATUS); + struct hisi_l3c_pmu *hisi_l3c_pmu
= to_hisi_l3c_pmu(l3c_pmu); + u32 ext_int, status, status_ext = 0; + int i; + + status = readl(l3c_pmu->base + L3C_INT_STATUS); + + if (!support_ext(hisi_l3c_pmu)) + return status; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + ext_int = readl(hisi_l3c_pmu->ext_base[i] + L3C_INT_STATUS); + status_ext |= ext_int << (L3C_NR_COUNTERS * i); + } + + return status | (status_ext << L3C_NR_COUNTERS); } static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx) { - writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR); -} + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; -static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { - { "HISI0213", }, - { "HISI0214", }, - {} -}; -MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << L3C_HW_IDX(idx)); +} static int hisi_l3c_pmu_init_data(struct platform_device *pdev, struct hisi_pmu *l3c_pmu) @@ -371,6 +531,10 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev, return -EINVAL; } + l3c_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!l3c_pmu->dev_info) + return -ENODEV; + l3c_pmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(l3c_pmu->base)) { dev_err(&pdev->dev, "ioremap failed for l3c_pmu resource\n"); @@ -382,6 +546,50 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev, return 0; } +static int hisi_l3c_pmu_init_ext(struct hisi_pmu *l3c_pmu, struct platform_device *pdev) +{ + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, irq, ext_num, i; + char *irqname; + + /* A HiSilicon L3C PMU that supports ext must have more than one IRQ resource. */ + ext_num = platform_irq_count(pdev); + if (ext_num < L3C_MAX_EXT) + return -ENODEV; + + /* + * The number of supported ext regions equals the number of IRQs minus + * one, since one of the IRQs belongs to the normal part of the PMU.
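+ * For example, a device exposing three IRQ resources yields + * ext_num = 2, which matches L3C_MAX_EXT.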
+ */ + hisi_l3c_pmu->ext_num = ext_num - 1; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + hisi_l3c_pmu->ext_base[i] = devm_platform_ioremap_resource(pdev, i + 1); + if (IS_ERR(hisi_l3c_pmu->ext_base[i])) + return PTR_ERR(hisi_l3c_pmu->ext_base[i]); + + irq = platform_get_irq(pdev, i + 1); + if (irq < 0) + return irq; + + irqname = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s ext%d", + dev_name(&pdev->dev), i + 1); + if (!irqname) + return -ENOMEM; + + ret = devm_request_irq(&pdev->dev, irq, hisi_uncore_pmu_isr, + IRQF_NOBALANCING | IRQF_NO_THREAD, + irqname, l3c_pmu); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Fail to request EXT IRQ: %d.\n", irq); + + hisi_l3c_pmu->ext_irq[i] = irq; + } + + return 0; +} + static struct attribute *hisi_l3c_pmu_v1_format_attr[] = { HISI_PMU_FORMAT_ATTR(event, "config:0-7"), NULL, @@ -394,7 +602,7 @@ static const struct attribute_group hisi_l3c_pmu_v1_format_group = { static struct attribute *hisi_l3c_pmu_v2_format_attr[] = { HISI_PMU_FORMAT_ATTR(event, "config:0-7"), - HISI_PMU_FORMAT_ATTR(tt_core, "config1:0-7"), + HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"), HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"), HISI_PMU_FORMAT_ATTR(datasrc_cfg, "config1:11-15"), HISI_PMU_FORMAT_ATTR(datasrc_skt, "config1:16"), @@ -406,6 +614,19 @@ static const struct attribute_group hisi_l3c_pmu_v2_format_group = { .attrs = hisi_l3c_pmu_v2_format_attr, }; +static struct attribute *hisi_l3c_pmu_v3_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + HISI_PMU_FORMAT_ATTR(ext, "config:16-17"), + HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"), + HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"), + NULL +}; + +static const struct attribute_group hisi_l3c_pmu_v3_format_group = { + .name = "format", + .attrs = hisi_l3c_pmu_v3_format_attr, +}; + static struct attribute *hisi_l3c_pmu_v1_events_attr[] = { HISI_PMU_EVENT_ATTR(rd_cpipe, 0x00), HISI_PMU_EVENT_ATTR(wr_cpipe, 0x01), @@ -441,6 +662,26 @@ static const struct attribute_group hisi_l3c_pmu_v2_events_group = { .attrs = hisi_l3c_pmu_v2_events_attr, }; +static struct attribute *hisi_l3c_pmu_v3_events_attr[] = { + HISI_PMU_EVENT_ATTR(rd_spipe, 0x18), + HISI_PMU_EVENT_ATTR(rd_hit_spipe, 0x19), + HISI_PMU_EVENT_ATTR(wr_spipe, 0x1a), + HISI_PMU_EVENT_ATTR(wr_hit_spipe, 0x1b), + HISI_PMU_EVENT_ATTR(io_rd_spipe, 0x1c), + HISI_PMU_EVENT_ATTR(io_rd_hit_spipe, 0x1d), + HISI_PMU_EVENT_ATTR(io_wr_spipe, 0x1e), + HISI_PMU_EVENT_ATTR(io_wr_hit_spipe, 0x1f), + HISI_PMU_EVENT_ATTR(cycles, 0x7f), + HISI_PMU_EVENT_ATTR(l3c_ref, 0xbc), + HISI_PMU_EVENT_ATTR(l3c2ring, 0xbd), + NULL +}; + +static const struct attribute_group hisi_l3c_pmu_v3_events_group = { + .name = "events", + .attrs = hisi_l3c_pmu_v3_events_attr, +}; + static const struct attribute_group *hisi_l3c_pmu_v1_attr_groups[] = { &hisi_l3c_pmu_v1_format_group, &hisi_l3c_pmu_v1_events_group, @@ -457,9 +698,46 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = { NULL }; +static const struct attribute_group *hisi_l3c_pmu_v3_attr_groups[] = { + &hisi_l3c_pmu_v3_format_group, + &hisi_l3c_pmu_v3_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static struct hisi_l3c_pmu_ext hisi_l3c_pmu_support_ext = { + .support_ext = true, +}; + +static struct hisi_l3c_pmu_ext hisi_l3c_pmu_not_support_ext = { + .support_ext = false, +}; + +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = { + .attr_groups = hisi_l3c_pmu_v1_attr_groups, + .counter_bits = 48, + .check_event = L3C_V1_NR_EVENTS, + .private = 
&hisi_l3c_pmu_not_support_ext, +}; + +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = { + .attr_groups = hisi_l3c_pmu_v2_attr_groups, + .counter_bits = 64, + .check_event = L3C_V2_NR_EVENTS, + .private = &hisi_l3c_pmu_not_support_ext, +}; + +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v3 = { + .attr_groups = hisi_l3c_pmu_v3_attr_groups, + .counter_bits = 64, + .check_event = L3C_V2_NR_EVENTS, + .private = &hisi_l3c_pmu_support_ext, +}; + static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .write_evtype = hisi_l3c_pmu_write_evtype, - .get_event_idx = hisi_uncore_pmu_get_event_idx, + .get_event_idx = hisi_l3c_pmu_get_event_idx, .start_counters = hisi_l3c_pmu_start_counters, .stop_counters = hisi_l3c_pmu_stop_counters, .enable_counter = hisi_l3c_pmu_enable_counter, @@ -472,11 +750,14 @@ static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .clear_int_status = hisi_l3c_pmu_clear_int_status, .enable_filter = hisi_l3c_pmu_enable_filter, .disable_filter = hisi_l3c_pmu_disable_filter, + .check_filter = hisi_l3c_pmu_check_filter, }; static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + struct hisi_l3c_pmu_ext *l3c_pmu_dev_ext; int ret; ret = hisi_l3c_pmu_init_data(pdev, l3c_pmu); @@ -487,42 +768,55 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, if (ret) return ret; - if (l3c_pmu->identifier >= HISI_PMU_V2) { - l3c_pmu->counter_bits = 64; - l3c_pmu->check_event = L3C_V2_NR_EVENTS; - l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v2_attr_groups; - } else { - l3c_pmu->counter_bits = 48; - l3c_pmu->check_event = L3C_V1_NR_EVENTS; - l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v1_attr_groups; - } - + l3c_pmu->pmu_events.attr_groups = l3c_pmu->dev_info->attr_groups; + l3c_pmu->counter_bits = l3c_pmu->dev_info->counter_bits; + l3c_pmu->check_event = l3c_pmu->dev_info->check_event; l3c_pmu->num_counters = L3C_NR_COUNTERS; l3c_pmu->ops = &hisi_uncore_l3c_ops; l3c_pmu->dev = &pdev->dev; l3c_pmu->on_cpu = -1; + l3c_pmu_dev_ext = l3c_pmu->dev_info->private; + if (l3c_pmu_dev_ext->support_ext) { + ret = hisi_l3c_pmu_init_ext(l3c_pmu, pdev); + if (ret) + return ret; + /* + * The extension events have their own counters, with the + * same count as the normal event counters. So at most + * num_counters * ext_num additional events can be monitored.
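+ * For example, with two ext regions this gives 8 + 2 * 8 = 24 + * counters in total, matching the new HISI_MAX_COUNTERS value + * of 0x18.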
+ */ + l3c_pmu->num_counters += hisi_l3c_pmu->ext_num * L3C_NR_COUNTERS; + } + return 0; } static int hisi_l3c_pmu_probe(struct platform_device *pdev) { + struct hisi_l3c_pmu *hisi_l3c_pmu; struct hisi_pmu *l3c_pmu; char *name; int ret; - l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*l3c_pmu), GFP_KERNEL); - if (!l3c_pmu) + hisi_l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*hisi_l3c_pmu), GFP_KERNEL); + if (!hisi_l3c_pmu) return -ENOMEM; + l3c_pmu = &hisi_l3c_pmu->l3c_pmu; platform_set_drvdata(pdev, l3c_pmu); ret = hisi_l3c_pmu_dev_probe(pdev, l3c_pmu); if (ret) return ret; - name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d", - l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id); + if (l3c_pmu->topo.sub_id >= 0) + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d_%d", + l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id, + l3c_pmu->topo.sub_id); + else + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d", + l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id); if (!name) return -ENOMEM; @@ -554,6 +848,14 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev) &l3c_pmu->node); } +static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { + { "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 }, + { "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 }, + { "HISI0215", (kernel_ulong_t)&hisi_l3c_pmu_v3 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); + static struct platform_driver hisi_l3c_pmu_driver = { .driver = { .name = "hisi_l3c_pmu", @@ -564,14 +866,60 @@ static struct platform_driver hisi_l3c_pmu_driver = { .remove = hisi_l3c_pmu_remove, }; +static int hisi_l3c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, i; + + ret = hisi_uncore_pmu_online_cpu(cpu, node); + if (ret) + return ret; + + /* Skip ext IRQ migration for an L3C PMU that doesn't support ext events. */ + if (!support_ext(hisi_l3c_pmu)) + return 0; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) + WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i], + cpumask_of(l3c_pmu->on_cpu))); + + return 0; +} + +static int hisi_l3c_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ret, i; + + ret = hisi_uncore_pmu_offline_cpu(cpu, node); + if (ret) + return ret; + + /* If no available CPU was found, skip IRQ migration. */ + if (l3c_pmu->on_cpu < 0) + return 0; + + /* Skip ext IRQ migration for an L3C PMU that doesn't support ext events.
*/ + if (!support_ext(hisi_l3c_pmu)) + return 0; + + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) + WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i], + cpumask_of(l3c_pmu->on_cpu))); + + return 0; +} + static int __init hisi_l3c_pmu_module_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, "AP_PERF_ARM_HISI_L3_ONLINE", - hisi_uncore_pmu_online_cpu, - hisi_uncore_pmu_offline_cpu); + hisi_l3c_pmu_online_cpu, + hisi_l3c_pmu_offline_cpu); if (ret) { pr_err("L3C PMU: Error setup hotplug, ret = %d\n", ret); return ret; diff --git a/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c new file mode 100644 index 00000000000000..4df4eebe243e66 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon SoC MN uncore Hardware event counters support + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* Dynamic CPU hotplug state used by MN PMU */ +static enum cpuhp_state hisi_mn_pmu_online; + +/* MN register definition */ +#define HISI_MN_DYNAMIC_CTRL_REG 0x400 +#define HISI_MN_DYNAMIC_CTRL_EN BIT(0) +#define HISI_MN_PERF_CTRL_REG 0x408 +#define HISI_MN_PERF_CTRL_EN BIT(6) +#define HISI_MN_INT_MASK_REG 0x800 +#define HISI_MN_INT_STATUS_REG 0x808 +#define HISI_MN_INT_CLEAR_REG 0x80C +#define HISI_MN_EVENT_CTRL_REG 0x1C00 +#define HISI_MN_VERSION_REG 0x1C04 +#define HISI_MN_EVTYPE0_REG 0x1d00 +#define HISI_MN_EVTYPE_MASK GENMASK(7, 0) +#define HISI_MN_CNTR0_REG 0x1e00 +#define HISI_MN_EVTYPE_REGn(evtype0, n) ((evtype0) + (n) * 4) +#define HISI_MN_CNTR_REGn(cntr0, n) ((cntr0) + (n) * 8) + +#define HISI_MN_NR_COUNTERS 4 +#define HISI_MN_TIMEOUT_US 500U + +struct hisi_mn_pmu_regs { + u32 version; + u32 dyn_ctrl; + u32 perf_ctrl; + u32 int_mask; + u32 int_clear; + u32 int_status; + u32 event_ctrl; + u32 event_type0; + u32 event_cntr0; +}; + +/* + * Each event request takes a certain amount of time to complete. When + * counting a latency-related event, we must wait for all requests to + * complete; otherwise, the counter value comes out slightly too large. + */ +static void hisi_mn_pmu_counter_flush(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + int ret; + u32 val; + + val = readl(mn_pmu->base + reg_info->dyn_ctrl); + val |= HISI_MN_DYNAMIC_CTRL_EN; + writel(val, mn_pmu->base + reg_info->dyn_ctrl); + + ret = readl_poll_timeout_atomic(mn_pmu->base + reg_info->dyn_ctrl, + val, !(val & HISI_MN_DYNAMIC_CTRL_EN), + 1, HISI_MN_TIMEOUT_US); + if (ret) + dev_warn(mn_pmu->dev, "Counter flush timeout\n"); +} + +static u64 hisi_mn_pmu_read_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readq(mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writeq(val, mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_evtype(struct hisi_pmu *mn_pmu, int idx, u32 type) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + /* + * Select the appropriate event select register.
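+ * The shift is HISI_PMU_EVTYPE_SHIFT(idx) = (idx % 4) * 8, so e.g. + * idx 2 lands in bits [23:16] of the first event select register.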
+ * There are 2 32-bit event select registers for the + * 8 hardware counters, each event code is 8-bit wide. + */ + val = readl(mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); + val &= ~(HISI_MN_EVTYPE_MASK << HISI_PMU_EVTYPE_SHIFT(idx)); + val |= (type << HISI_PMU_EVTYPE_SHIFT(idx)); + writel(val, mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); +} + +static void hisi_mn_pmu_start_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val |= HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); +} + +static void hisi_mn_pmu_stop_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val &= ~HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); + + hisi_mn_pmu_counter_flush(mn_pmu); +} + +static void hisi_mn_pmu_enable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_disable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_enable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static void hisi_mn_pmu_disable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static u32 hisi_mn_pmu_get_int_status(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readl(mn_pmu->base + reg_info->int_status); +} + +static void hisi_mn_pmu_clear_int_status(struct hisi_pmu *mn_pmu, int idx) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writel(BIT(idx), mn_pmu->base + reg_info->int_clear); +} + +static struct attribute *hisi_mn_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_format_group = { + .name = "format", + .attrs = hisi_mn_pmu_format_attr, +}; + +static struct attribute *hisi_mn_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(req_eobarrier_num, 0x00), + HISI_PMU_EVENT_ATTR(req_ecbarrier_num, 0x01), + HISI_PMU_EVENT_ATTR(req_dvmop_num, 0x02), + HISI_PMU_EVENT_ATTR(req_dvmsync_num, 0x03), + HISI_PMU_EVENT_ATTR(req_retry_num, 0x04), + HISI_PMU_EVENT_ATTR(req_writenosnp_num, 0x05), + HISI_PMU_EVENT_ATTR(req_readnosnp_num, 0x06), + HISI_PMU_EVENT_ATTR(snp_dvm_num, 0x07), + HISI_PMU_EVENT_ATTR(snp_dvmsync_num, 0x08), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_num, 0x09), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_num, 0x0A), + HISI_PMU_EVENT_ATTR(mn_req_dvm_num, 0x0B), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_num, 0x0C), + 
HISI_PMU_EVENT_ATTR(pa_req_dvm_num, 0x0D), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_num, 0x0E), + HISI_PMU_EVENT_ATTR(snp_dvm_latency, 0x80), + HISI_PMU_EVENT_ATTR(snp_dvmsync_latency, 0x81), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_latency, 0x82), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_latency, 0x83), + HISI_PMU_EVENT_ATTR(mn_req_dvm_latency, 0x84), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_latency, 0x85), + HISI_PMU_EVENT_ATTR(pa_req_dvm_latency, 0x86), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_latency, 0x87), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_events_group = { + .name = "events", + .attrs = hisi_mn_pmu_events_attr, +}; + +static const struct attribute_group *hisi_mn_pmu_attr_groups[] = { + &hisi_mn_pmu_format_group, + &hisi_mn_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static const struct hisi_uncore_ops hisi_uncore_mn_ops = { + .write_evtype = hisi_mn_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_mn_pmu_start_counters, + .stop_counters = hisi_mn_pmu_stop_counters, + .enable_counter = hisi_mn_pmu_enable_counter, + .disable_counter = hisi_mn_pmu_disable_counter, + .enable_counter_int = hisi_mn_pmu_enable_counter_int, + .disable_counter_int = hisi_mn_pmu_disable_counter_int, + .write_counter = hisi_mn_pmu_write_counter, + .read_counter = hisi_mn_pmu_read_counter, + .get_int_status = hisi_mn_pmu_get_int_status, + .clear_int_status = hisi_mn_pmu_clear_int_status, +}; + +static int hisi_mn_pmu_dev_init(struct platform_device *pdev, + struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info; + int ret; + + hisi_uncore_pmu_init_topology(mn_pmu, &pdev->dev); + + if (mn_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN scl id\n"); + + if (mn_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN index id\n"); + + mn_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(mn_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(mn_pmu->base), + "Failed to ioremap resource\n"); + + ret = hisi_uncore_pmu_init_irq(mn_pmu, pdev); + if (ret) + return ret; + + mn_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!mn_pmu->dev_info) + return -ENODEV; + + mn_pmu->pmu_events.attr_groups = mn_pmu->dev_info->attr_groups; + mn_pmu->counter_bits = mn_pmu->dev_info->counter_bits; + mn_pmu->check_event = mn_pmu->dev_info->check_event; + mn_pmu->num_counters = HISI_MN_NR_COUNTERS; + mn_pmu->ops = &hisi_uncore_mn_ops; + mn_pmu->dev = &pdev->dev; + mn_pmu->on_cpu = -1; + + reg_info = mn_pmu->dev_info->private; + mn_pmu->identifier = readl(mn_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_mn_pmu_remove_cpuhp(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_mn_pmu_online, hotplug_node); +} + +static void hisi_mn_pmu_unregister(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_mn_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *mn_pmu; + char *name; + int ret; + + mn_pmu = devm_kzalloc(&pdev->dev, sizeof(*mn_pmu), GFP_KERNEL); + if (!mn_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, mn_pmu); + + ret = hisi_mn_pmu_dev_init(pdev, mn_pmu); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_scl%d_mn%d", + mn_pmu->topo.scl_id, mn_pmu->topo.index_id); + if (!name) + return -ENOMEM; + + ret = cpuhp_state_add_instance(hisi_mn_pmu_online, &mn_pmu->node); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to register cpu 
hotplug\n"); + + ret = devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_remove_cpuhp, &mn_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(mn_pmu, THIS_MODULE); + + ret = perf_pmu_register(&mn_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(mn_pmu->dev, ret, "Failed to register MN PMU\n"); + + return devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_unregister, &mn_pmu->pmu); +} + +static struct hisi_mn_pmu_regs hisi_mn_v1_pmu_regs = { + .version = HISI_MN_VERSION_REG, + .dyn_ctrl = HISI_MN_DYNAMIC_CTRL_REG, + .perf_ctrl = HISI_MN_PERF_CTRL_REG, + .int_mask = HISI_MN_INT_MASK_REG, + .int_clear = HISI_MN_INT_CLEAR_REG, + .int_status = HISI_MN_INT_STATUS_REG, + .event_ctrl = HISI_MN_EVENT_CTRL_REG, + .event_type0 = HISI_MN_EVTYPE0_REG, + .event_cntr0 = HISI_MN_CNTR0_REG, +}; + +static const struct hisi_pmu_dev_info hisi_mn_v1 = { + .attr_groups = hisi_mn_pmu_attr_groups, + .counter_bits = 48, + .check_event = HISI_MN_EVTYPE_MASK, + .private = &hisi_mn_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_mn_pmu_acpi_match[] = { + { "HISI0222", (kernel_ulong_t) &hisi_mn_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_mn_pmu_acpi_match); + +static struct platform_driver hisi_mn_pmu_driver = { + .driver = { + .name = "hisi_mn_pmu", + .acpi_match_table = hisi_mn_pmu_acpi_match, + /* + * We have not worked out a safe bind/unbind process, + * Forcefully unbinding during sampling will lead to a + * kernel panic, so this is not supported yet. + */ + .suppress_bind_attrs = true, + }, + .probe = hisi_mn_pmu_probe, +}; + +static int __init hisi_mn_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/mn:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_mn_pmu: Failed to setup MN PMU hotplug: %d\n", ret); + return ret; + } + hisi_mn_pmu_online = ret; + + ret = platform_driver_register(&hisi_mn_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_mn_pmu_online); + + return ret; +} +module_init(hisi_mn_pmu_module_init); + +static void __exit hisi_mn_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_mn_pmu_driver); + cpuhp_remove_multi_state(hisi_mn_pmu_online); +} +module_exit(hisi_mn_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC MN uncore PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Junhao He "); diff --git a/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c new file mode 100644 index 00000000000000..de3b9cc7aadad0 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for HiSilicon Uncore NoC (Network on Chip) PMU device + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + * Author: Yicong Yang + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +#define NOC_PMU_VERSION 0x1e00 +#define NOC_PMU_GLOBAL_CTRL 0x1e04 +#define NOC_PMU_GLOBAL_CTRL_PMU_EN BIT(0) +#define NOC_PMU_GLOBAL_CTRL_TT_EN BIT(1) +#define NOC_PMU_CNT_INFO 0x1e08 +#define NOC_PMU_CNT_INFO_OVERFLOW(n) BIT(n) +#define NOC_PMU_EVENT_CTRL0 0x1e20 +#define NOC_PMU_EVENT_CTRL_TYPE GENMASK(4, 0) +/* + * Note channel of 0x0 will reset the counter value, so don't do it before + * we read out the counter. 
+ */ +#define NOC_PMU_EVENT_CTRL_CHANNEL GENMASK(10, 8) +#define NOC_PMU_EVENT_CTRL_EN BIT(11) +#define NOC_PMU_EVENT_COUNTER0 0x1e80 + +#define NOC_PMU_NR_COUNTERS 4 +#define NOC_PMU_CH_DEFAULT 0x7 + +#define NOC_PMU_EVENT_CTRLn(ctrl0, n) ((ctrl0) + 4 * (n)) +#define NOC_PMU_EVENT_CNTRn(cntr0, n) ((cntr0) + 8 * (n)) + +HISI_PMU_EVENT_ATTR_EXTRACTOR(ch, config1, 2, 0); +HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_en, config1, 3, 3); + +/* Dynamic CPU hotplug state used by this PMU driver */ +static enum cpuhp_state hisi_noc_pmu_cpuhp_state; + +struct hisi_noc_pmu_regs { + u32 version; + u32 pmu_ctrl; + u32 event_ctrl0; + u32 event_cntr0; + u32 overflow_status; +}; + +/* + * Tracetag filtering is not per event; all events must keep a + * consistent setting. Return true if the newcomer doesn't match the + * tracetag filtering configuration of the currently scheduled events. + */ +static bool hisi_noc_pmu_check_global_filter(struct perf_event *curr, + struct perf_event *new) +{ + return hisi_get_tt_en(curr) == hisi_get_tt_en(new); +} + +static void hisi_noc_pmu_write_evtype(struct hisi_pmu *noc_pmu, int idx, u32 type) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); + reg &= ~NOC_PMU_EVENT_CTRL_TYPE; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_TYPE, type); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); +} + +static int hisi_noc_pmu_get_event_idx(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_pmu_hwevents *pmu_events = &noc_pmu->pmu_events; + int cur_idx; + + cur_idx = find_first_bit(pmu_events->used_mask, noc_pmu->num_counters); + if (cur_idx != noc_pmu->num_counters && + !hisi_noc_pmu_check_global_filter(pmu_events->hw_events[cur_idx], event)) + return -EAGAIN; + + return hisi_uncore_pmu_get_event_idx(event); +} + +static u64 hisi_noc_pmu_read_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readq(noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_write_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + writeq(val, noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg |= NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_disable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + /* Interrupts are not supported, so this is a stub.
*/ +} + +static void hisi_noc_pmu_disable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ +} + +static void hisi_noc_pmu_start_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static void hisi_noc_pmu_stop_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static u32 hisi_noc_pmu_get_int_status(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readl(noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_clear_int_status(struct hisi_pmu *noc_pmu, int idx) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->overflow_status); + reg &= ~NOC_PMU_CNT_INFO_OVERFLOW(idx); + writel(reg, noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_enable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + struct hw_perf_event *hwc = &event->hw; + u32 tt_en = hisi_get_tt_en(event); + u32 ch = hisi_get_ch(event); + u32 reg; + + if (!ch) + ch = NOC_PMU_CH_DEFAULT; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_CHANNEL; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_CHANNEL, ch); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + + /* + * Since tracetag filter applies to all the counters, don't touch it + * if user doesn't specify it explicitly. + */ + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static void hisi_noc_pmu_disable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 tt_en = hisi_get_tt_en(event); + u32 reg; + + /* + * If we're not the last counter, don't touch the global tracetag + * configuration. 
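+ * For example, with three active events sharing tt_en, only the + * teardown of the last remaining event clears + * NOC_PMU_GLOBAL_CTRL_TT_EN.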
+ */ + if (bitmap_weight(noc_pmu->pmu_events.used_mask, noc_pmu->num_counters) > 1) + return; + + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static const struct hisi_uncore_ops hisi_uncore_noc_ops = { + .write_evtype = hisi_noc_pmu_write_evtype, + .get_event_idx = hisi_noc_pmu_get_event_idx, + .read_counter = hisi_noc_pmu_read_counter, + .write_counter = hisi_noc_pmu_write_counter, + .enable_counter = hisi_noc_pmu_enable_counter, + .disable_counter = hisi_noc_pmu_disable_counter, + .enable_counter_int = hisi_noc_pmu_enable_counter_int, + .disable_counter_int = hisi_noc_pmu_disable_counter_int, + .start_counters = hisi_noc_pmu_start_counters, + .stop_counters = hisi_noc_pmu_stop_counters, + .get_int_status = hisi_noc_pmu_get_int_status, + .clear_int_status = hisi_noc_pmu_clear_int_status, + .enable_filter = hisi_noc_pmu_enable_filter, + .disable_filter = hisi_noc_pmu_disable_filter, +}; + +static struct attribute *hisi_noc_pmu_format_attrs[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + HISI_PMU_FORMAT_ATTR(ch, "config1:0-2"), + HISI_PMU_FORMAT_ATTR(tt_en, "config1:3"), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_format_group = { + .name = "format", + .attrs = hisi_noc_pmu_format_attrs, +}; + +static struct attribute *hisi_noc_pmu_events_attrs[] = { + HISI_PMU_EVENT_ATTR(cycles, 0x0e), + /* Flux on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_flow_sum, 0x1a), + HISI_PMU_EVENT_ATTR(egress_flow_sum, 0x17), + /* Buffer full duration on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_buf_full, 0x19), + HISI_PMU_EVENT_ATTR(egress_buf_full, 0x12), + /* Failure packets count on/off the ring */ + HISI_PMU_EVENT_ATTR(cw_ingress_fail, 0x01), + HISI_PMU_EVENT_ATTR(cc_ingress_fail, 0x09), + HISI_PMU_EVENT_ATTR(cw_egress_fail, 0x03), + HISI_PMU_EVENT_ATTR(cc_egress_fail, 0x0b), + /* Flux of the ring */ + HISI_PMU_EVENT_ATTR(cw_main_flow_sum, 0x05), + HISI_PMU_EVENT_ATTR(cc_main_flow_sum, 0x0d), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_events_group = { + .name = "events", + .attrs = hisi_noc_pmu_events_attrs, +}; + +static const struct attribute_group *hisi_noc_pmu_attr_groups[] = { + &hisi_noc_pmu_format_group, + &hisi_noc_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static int hisi_noc_pmu_dev_init(struct platform_device *pdev, struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info; + + hisi_uncore_pmu_init_topology(noc_pmu, &pdev->dev); + + if (noc_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get scl-id\n"); + + if (noc_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get idx-id\n"); + + if (noc_pmu->topo.sub_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get sub-id\n"); + + noc_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(noc_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(noc_pmu->base), + "fail to remap io memory\n"); + + noc_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!noc_pmu->dev_info) + return -ENODEV; + + noc_pmu->pmu_events.attr_groups = noc_pmu->dev_info->attr_groups; + noc_pmu->counter_bits = noc_pmu->dev_info->counter_bits; + noc_pmu->check_event = noc_pmu->dev_info->check_event; + noc_pmu->num_counters = NOC_PMU_NR_COUNTERS; + noc_pmu->ops = &hisi_uncore_noc_ops; + noc_pmu->dev = &pdev->dev; + noc_pmu->on_cpu = -1; + + reg_info = 
noc_pmu->dev_info->private; + noc_pmu->identifier = readl(noc_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_noc_pmu_remove_cpuhp_instance(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_noc_pmu_cpuhp_state, hotplug_node); +} + +static void hisi_noc_pmu_unregister_pmu(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_noc_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct hisi_pmu *noc_pmu; + char *name; + int ret; + + noc_pmu = devm_kzalloc(dev, sizeof(*noc_pmu), GFP_KERNEL); + if (!noc_pmu) + return -ENOMEM; + + /* + * HiSilicon Uncore PMU framework needs to get common hisi_pmu device + * from device's drvdata. + */ + platform_set_drvdata(pdev, noc_pmu); + + ret = hisi_noc_pmu_dev_init(pdev, noc_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(hisi_noc_pmu_cpuhp_state, &noc_pmu->node); + if (ret) + return dev_err_probe(dev, ret, "Fail to register cpuhp instance\n"); + + ret = devm_add_action_or_reset(dev, hisi_noc_pmu_remove_cpuhp_instance, + &noc_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(noc_pmu, THIS_MODULE); + + name = devm_kasprintf(dev, GFP_KERNEL, "hisi_scl%d_noc%d_%d", + noc_pmu->topo.scl_id, noc_pmu->topo.index_id, + noc_pmu->topo.sub_id); + if (!name) + return -ENOMEM; + + ret = perf_pmu_register(&noc_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(dev, ret, "Fail to register PMU\n"); + + return devm_add_action_or_reset(dev, hisi_noc_pmu_unregister_pmu, + &noc_pmu->pmu); +} + +static struct hisi_noc_pmu_regs hisi_noc_v1_pmu_regs = { + .version = NOC_PMU_VERSION, + .pmu_ctrl = NOC_PMU_GLOBAL_CTRL, + .event_ctrl0 = NOC_PMU_EVENT_CTRL0, + .event_cntr0 = NOC_PMU_EVENT_COUNTER0, + .overflow_status = NOC_PMU_CNT_INFO, +}; + +static const struct hisi_pmu_dev_info hisi_noc_v1 = { + .attr_groups = hisi_noc_pmu_attr_groups, + .counter_bits = 64, + .check_event = NOC_PMU_EVENT_CTRL_TYPE, + .private = &hisi_noc_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_noc_pmu_ids[] = { + { "HISI04E0", (kernel_ulong_t) &hisi_noc_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_noc_pmu_ids); + +static struct platform_driver hisi_noc_pmu_driver = { + .driver = { + .name = "hisi_noc_pmu", + .acpi_match_table = hisi_noc_pmu_ids, + .suppress_bind_attrs = true, + }, + .probe = hisi_noc_pmu_probe, +}; + +static int __init hisi_noc_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/noc:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_noc_pmu: Fail to setup cpuhp callbacks, ret = %d\n", ret); + return ret; + } + hisi_noc_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&hisi_noc_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); + + return ret; +} +module_init(hisi_noc_pmu_module_init); + +static void __exit hisi_noc_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_noc_pmu_driver); + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); +} +module_exit(hisi_noc_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC Uncore NoC PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yicong Yang "); diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index a449651f79c9f6..de71dcf116538b 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -149,7 +149,7 @@ static void hisi_uncore_pmu_clear_event_idx(struct 
hisi_pmu *hisi_pmu, int idx) clear_bit(idx, hisi_pmu->pmu_events.used_mask); } -static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) +irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) { struct hisi_pmu *hisi_pmu = data; struct perf_event *event; @@ -178,6 +178,7 @@ static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) return IRQ_HANDLED; } +EXPORT_SYMBOL_NS_GPL(hisi_uncore_pmu_isr, "HISI_PMU"); int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu, struct platform_device *pdev) @@ -234,7 +235,7 @@ int hisi_uncore_pmu_event_init(struct perf_event *event) return -EINVAL; hisi_pmu = to_hisi_pmu(event->pmu); - if (event->attr.config > hisi_pmu->check_event) + if ((event->attr.config & HISI_EVENTID_MASK) > hisi_pmu->check_event) return -EINVAL; if (hisi_pmu->on_cpu == -1) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index 777675838b8081..3ffe6acda65391 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -24,7 +24,7 @@ #define pr_fmt(fmt) "hisi_pmu: " fmt #define HISI_PMU_V2 0x30 -#define HISI_MAX_COUNTERS 0x10 +#define HISI_MAX_COUNTERS 0x18 #define to_hisi_pmu(p) (container_of(p, struct hisi_pmu, pmu)) #define HISI_PMU_ATTR(_name, _func, _config) \ @@ -43,7 +43,8 @@ return FIELD_GET(GENMASK_ULL(hi, lo), event->attr.config); \ } -#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff) +#define HISI_EVENTID_MASK GENMASK(7, 0) +#define HISI_GET_EVENTID(ev) ((ev)->hw.config_base & HISI_EVENTID_MASK) #define HISI_PMU_EVTYPE_BITS 8 #define HISI_PMU_EVTYPE_SHIFT(idx) ((idx) % 4 * HISI_PMU_EVTYPE_BITS) @@ -164,6 +165,7 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node); ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev, struct device_attribute *attr, char *page); +irqreturn_t hisi_uncore_pmu_isr(int irq, void *data); int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu, struct platform_device *pdev); void hisi_uncore_pmu_init_topology(struct hisi_pmu *hisi_pmu, struct device *dev); diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 698de8ddf895ba..3fc16bbab0250b 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -339,7 +339,7 @@ static bool pmu_sbi_ctr_is_fw(int cidx) if (!info) return false; - return (info->type == SBI_PMU_CTR_TYPE_FW) ? 
true : false; + return info->type == SBI_PMU_CTR_TYPE_FW; } /* @@ -877,8 +877,10 @@ static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt, for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) { ctr_start_mask = cpu_hw_evt->used_hw_ctrs[i] & ~ctr_ovf_mask; /* Start all the counters that did not overflow in a single shot */ - sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, ctr_start_mask, - 0, 0, 0, 0); + if (ctr_start_mask) { + sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, + ctr_start_mask, 0, 0, 0, 0); + } } /* Reinitialize and start all the counter that overflowed */ diff --git a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c index e0f2acc8109c10..8fcbc312fd616a 100644 --- a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c +++ b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c @@ -127,13 +127,13 @@ static int eusb2_repeater_init(struct phy *phy) rptr->cfg->init_tbl[i].value); /* Override registers from devicetree values */ - if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) regmap_write(regmap, base + EUSB2_TUNE_USB2_PREEM, val); if (!of_property_read_u8(np, "qcom,tune-usb2-disc-thres", &val)) regmap_write(regmap, base + EUSB2_TUNE_HSDISC, val); - if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) regmap_write(regmap, base + EUSB2_TUNE_IUSB2, val); /* Wait for status OK */ diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c index 95830dcfdec9b1..0fa63b734b67b8 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c @@ -3067,6 +3067,14 @@ struct qmp_pcie { struct clk_fixed_rate aux_clk_fixed; }; +static bool qphy_checkbits(const void __iomem *base, u32 offset, u32 val) +{ + u32 reg; + + reg = readl(base + offset); + return (reg & val) == val; +} + static inline void qphy_setbits(void __iomem *base, u32 offset, u32 val) { u32 reg; @@ -4339,16 +4347,21 @@ static int qmp_pcie_init(struct phy *phy) struct qmp_pcie *qmp = phy_get_drvdata(phy); const struct qmp_phy_cfg *cfg = qmp->cfg; void __iomem *pcs = qmp->pcs; - bool phy_initialized = !!(readl(pcs + cfg->regs[QPHY_START_CTRL])); int ret; - qmp->skip_init = qmp->nocsr_reset && phy_initialized; /* - * We need to check the existence of init sequences in two cases: - * 1. The PHY doesn't support no_csr reset. - * 2. The PHY supports no_csr reset but isn't initialized by bootloader. - * As we can't skip init in these two cases. + * We can skip PHY initialization if all of the following conditions + * are met: + * 1. The PHY supports the nocsr_reset that preserves the PHY config. + * 2. The PHY was started (and not powered down again) by the + * bootloader, with all of the expected bits set correctly. + * In this case, we can continue without having the init sequence + * defined in the driver. 
*/ + qmp->skip_init = qmp->nocsr_reset && + qphy_checkbits(pcs, cfg->regs[QPHY_START_CTRL], SERDES_START | PCS_START) && + qphy_checkbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], cfg->pwrdn_ctrl); + if (!qmp->skip_init && !cfg->tbls.serdes_num) { dev_err(qmp->dev, "Init sequence not available\n"); return -ENODATA; diff --git a/drivers/phy/tegra/xusb-tegra210.c b/drivers/phy/tegra/xusb-tegra210.c index ebc8a7e21a3181..3409924498e9cf 100644 --- a/drivers/phy/tegra/xusb-tegra210.c +++ b/drivers/phy/tegra/xusb-tegra210.c @@ -3164,18 +3164,22 @@ tegra210_xusb_padctl_probe(struct device *dev, } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { dev_warn(dev, "PMC device is not available\n"); goto out; } - if (!platform_get_drvdata(pdev)) + if (!platform_get_drvdata(pdev)) { + put_device(&pdev->dev); return ERR_PTR(-EPROBE_DEFER); + } padctl->regmap = dev_get_regmap(&pdev->dev, "usb_sleepwalk"); if (!padctl->regmap) dev_info(dev, "failed to find PMC regmap\n"); + put_device(&pdev->dev); out: return &padctl->base; } diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index ff5d5e29629fab..50adabb867cb12 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -34,6 +34,7 @@ enum { PHY_GMII_SEL_PORT_MODE = 0, PHY_GMII_SEL_RGMII_ID_MODE, PHY_GMII_SEL_RMII_IO_CLK_EN, + PHY_GMII_SEL_FIXED_TX_DELAY, PHY_GMII_SEL_LAST, }; @@ -127,6 +128,11 @@ static int phy_gmii_sel_mode(struct phy *phy, enum phy_mode mode, int submode) goto unsupported; } + /* With a fixed delay, some modes are not supported at all. */ + if (soc_data->features & BIT(PHY_GMII_SEL_FIXED_TX_DELAY) && + rgmii_id != 0) + return -EINVAL; + if_phy->phy_if_mode = submode; dev_dbg(dev, "%s id:%u mode:%u rgmii_id:%d rmii_clk_ext:%d\n", @@ -210,25 +216,46 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_soc_dm814 = { static const struct reg_field phy_gmii_sel_fields_am654[][PHY_GMII_SEL_LAST] = { - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), }, + { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x0, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x4, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x8, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0xC, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x10, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x14, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x18, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x1C, 4, 4), + }, }; static const struct phy_gmii_sel_soc_data phy_gmii_sel_soc_am654 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, }; static const struct 
phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), @@ -239,6 +266,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII), .num_ports = 8, @@ -248,6 +277,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j784s4 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), diff --git a/drivers/phy/ti/phy-omap-usb2.c b/drivers/phy/ti/phy-omap-usb2.c index c1a0ef979142ce..c444bb2530ca29 100644 --- a/drivers/phy/ti/phy-omap-usb2.c +++ b/drivers/phy/ti/phy-omap-usb2.c @@ -363,6 +363,13 @@ static void omap_usb2_init_errata(struct omap_usb *phy) phy->flags |= OMAP_USB2_DISABLE_CHRG_DET; } +static void omap_usb2_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int omap_usb2_probe(struct platform_device *pdev) { struct omap_usb *phy; @@ -373,6 +380,7 @@ static int omap_usb2_probe(struct platform_device *pdev) struct device_node *control_node; struct platform_device *control_pdev; const struct usb_phy_data *phy_data; + int ret; phy_data = device_get_match_data(&pdev->dev); if (!phy_data) @@ -423,6 +431,11 @@ static int omap_usb2_probe(struct platform_device *pdev) return -EINVAL; } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(&pdev->dev, omap_usb2_put_device, + phy->control_dev); + if (ret) + return ret; } else { if (of_property_read_u32_index(node, "syscon-phy-power", 1, diff --git a/drivers/phy/ti/phy-ti-pipe3.c b/drivers/phy/ti/phy-ti-pipe3.c index da2cbacb982c6b..ae764d6524c99a 100644 --- a/drivers/phy/ti/phy-ti-pipe3.c +++ b/drivers/phy/ti/phy-ti-pipe3.c @@ -667,12 +667,20 @@ static int ti_pipe3_get_clk(struct ti_pipe3 *phy) return 0; } +static void ti_pipe3_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) { struct device *dev = phy->dev; struct device_node *node = dev->of_node; struct device_node *control_node; struct platform_device *control_pdev; + int ret; phy->phy_power_syscon = syscon_regmap_lookup_by_phandle(node, "syscon-phy-power"); @@ -704,6 +712,11 @@ static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(dev, ti_pipe3_put_device, + phy->control_dev); + if (ret) + return ret; } if (phy->mode == PIPE3_MODE_PCIE) { diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index be1ca8e85754bc..4f8507ebbdacdd 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -211,6 +211,8 @@ config PINCTRL_EIC7700 depends on ARCH_ESWIN || COMPILE_TEST select PINMUX select GENERIC_PINCONF + select REGULATOR + select REGULATOR_FIXED_VOLTAGE help This driver support for the pin controller in ESWIN's EIC7700 SoC, 
which supports pin multiplexing, pin configuration,and rgmii voltage @@ -358,6 +360,17 @@ config PINCTRL_LPC18XX help Pinctrl driver for NXP LPC18xx/43xx System Control Unit (SCU). +config PINCTRL_MAX7360 + tristate "MAX7360 Pincontrol support" + depends on MFD_MAX7360 + select PINMUX + select GENERIC_PINCONF + help + Say Y here to enable pin control support for Maxim MAX7360 keypad + controller. + This keypad controller has 8 GPIO pins that may work as GPIO, or PWM, + or rotary encoder alternate modes. + config PINCTRL_MAX77620 tristate "MAX77620/MAX20024 Pincontrol support" depends on MFD_MAX77620 && OF @@ -551,7 +564,7 @@ config PINCTRL_STMFX interrupt-controller. config PINCTRL_SX150X - bool "Semtech SX150x I2C GPIO expander pinctrl driver" + tristate "Semtech SX150x I2C GPIO expander pinctrl driver" depends on I2C=y select PINMUX select PINCONF @@ -601,6 +614,25 @@ config PINCTRL_TH1520 This driver is needed for RISC-V development boards like the BeagleV Ahead and the LicheePi 4A. +config PINCTRL_UPBOARD + tristate "AAeon UP board FPGA pin controller" + depends on MFD_UPBOARD_FPGA + select PINMUX + select GENERIC_PINCTRL_GROUPS + select GENERIC_PINMUX_FUNCTIONS + select GPIOLIB + select GPIO_AGGREGATOR + help + Pin controller for the FPGA GPIO lines on UP boards. Due to the + hardware layout, the driver controls the FPGA pins in tandem with + their corresponding Intel SoC GPIOs. + + Currently supported: + - UP Squared + + To compile this driver as a module, choose M here: the module + will be called pinctrl-upboard. + config PINCTRL_ZYNQ bool "Pinctrl driver for Xilinx Zynq" depends on ARCH_ZYNQ || COMPILE_TEST diff --git a/drivers/pinctrl/Makefile b/drivers/pinctrl/Makefile index 909ab89a56d293..e0cfb9b7c99bae 100644 --- a/drivers/pinctrl/Makefile +++ b/drivers/pinctrl/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_PINCTRL_FALCON) += pinctrl-falcon.o obj-$(CONFIG_PINCTRL_LOONGSON2) += pinctrl-loongson2.o obj-$(CONFIG_PINCTRL_XWAY) += pinctrl-xway.o obj-$(CONFIG_PINCTRL_LPC18XX) += pinctrl-lpc18xx.o +obj-$(CONFIG_PINCTRL_MAX7360) += pinctrl-max7360.o obj-$(CONFIG_PINCTRL_MAX77620) += pinctrl-max77620.o obj-$(CONFIG_PINCTRL_MCP23S08_I2C) += pinctrl-mcp23s08_i2c.o obj-$(CONFIG_PINCTRL_MCP23S08_SPI) += pinctrl-mcp23s08_spi.o @@ -59,6 +60,7 @@ obj-$(CONFIG_PINCTRL_SX150X) += pinctrl-sx150x.o obj-$(CONFIG_PINCTRL_TB10X) += pinctrl-tb10x.o obj-$(CONFIG_PINCTRL_TPS6594) += pinctrl-tps6594.o obj-$(CONFIG_PINCTRL_TH1520) += pinctrl-th1520.o +obj-$(CONFIG_PINCTRL_UPBOARD) += pinctrl-upboard.o obj-$(CONFIG_PINCTRL_ZYNQMP) += pinctrl-zynqmp.o obj-$(CONFIG_PINCTRL_ZYNQ) += pinctrl-zynq.o diff --git a/drivers/pinctrl/bcm/Kconfig b/drivers/pinctrl/bcm/Kconfig index 35b51ce4298e25..096d0778427e27 100644 --- a/drivers/pinctrl/bcm/Kconfig +++ b/drivers/pinctrl/bcm/Kconfig @@ -106,6 +106,18 @@ config PINCTRL_BCM63268 help Say Y here to enable the Broadcom BCM63268 GPIO driver. +config PINCTRL_BRCMSTB + tristate "Broadcom STB product line pin controller driver" + depends on OF && (ARCH_BCM2835 || ARCH_BRCMSTB || COMPILE_TEST) + select PINMUX + select PINCONF + select GENERIC_PINCONF + help + Enable pin muxing and configuration functionality + for Broadcom STB product line chipsets. 
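+ + To compile this driver as a module, choose M here: the module + will be called pinctrl-brcmstb.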
+ +source "drivers/pinctrl/bcm/Kconfig.stb" + config PINCTRL_IPROC_GPIO bool "Broadcom iProc GPIO (with PINCONF) driver" depends on OF_GPIO && (ARCH_BCM_IPROC || COMPILE_TEST) diff --git a/drivers/pinctrl/bcm/Kconfig.stb b/drivers/pinctrl/bcm/Kconfig.stb new file mode 100644 index 00000000000000..c1ba361e1d78b0 --- /dev/null +++ b/drivers/pinctrl/bcm/Kconfig.stb @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +if PINCTRL_BRCMSTB + +config PINCTRL_BCM2712 + tristate "BCM2712 SoC pin controller driver" + help + Driver for BCM2712 integrated pin controller, + commonly found on Raspberry Pi 5. + +endif diff --git a/drivers/pinctrl/bcm/Makefile b/drivers/pinctrl/bcm/Makefile index 82b868ec14716d..482d769b1a81da 100644 --- a/drivers/pinctrl/bcm/Makefile +++ b/drivers/pinctrl/bcm/Makefile @@ -11,6 +11,8 @@ obj-$(CONFIG_PINCTRL_BCM6358) += pinctrl-bcm6358.o obj-$(CONFIG_PINCTRL_BCM6362) += pinctrl-bcm6362.o obj-$(CONFIG_PINCTRL_BCM6368) += pinctrl-bcm6368.o obj-$(CONFIG_PINCTRL_BCM63268) += pinctrl-bcm63268.o +obj-$(CONFIG_PINCTRL_BRCMSTB) += pinctrl-brcmstb.o +obj-$(CONFIG_PINCTRL_BCM2712) += pinctrl-brcmstb-bcm2712.o obj-$(CONFIG_PINCTRL_IPROC_GPIO) += pinctrl-iproc-gpio.o obj-$(CONFIG_PINCTRL_CYGNUS_MUX) += pinctrl-cygnus-mux.o obj-$(CONFIG_PINCTRL_NS) += pinctrl-ns.o diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 7dbf079739bcc1..c165674c5b4db0 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1023,7 +1023,7 @@ static int bcm2835_pinconf_get(struct pinctrl_dev *pctldev, /* No way to read back bias config in HW */ switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (fsel != BCM2835_FSEL_GPIO_OUT) return -EINVAL; @@ -1091,7 +1091,7 @@ static int bcm2835_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; @@ -1202,7 +1202,7 @@ static int bcm2711_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; diff --git a/drivers/pinctrl/bcm/pinctrl-bcm6358.c b/drivers/pinctrl/bcm/pinctrl-bcm6358.c index 891de49d76e744..4c8cd65fc31e8f 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm6358.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm6358.c @@ -343,10 +343,8 @@ static int bcm6358_pinctrl_probe(struct platform_device *pdev) pc = platform_get_drvdata(pdev); priv->overlays = devm_regmap_field_alloc(dev, pc->regs, overlays); - if (IS_ERR(priv->overlays)) - return PTR_ERR(priv->overlays); - return 0; + return PTR_ERR_OR_ZERO(priv->overlays); } static const struct of_device_id bcm6358_pinctrl_match[] = { diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c b/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c new file mode 100644 index 00000000000000..752b78e2c0d8ce --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb-bcm2712.c @@ -0,0 +1,747 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Broadcom brcmstb GPIO units (pinctrl only) + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. 
+ * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#include <linux/module.h> +#include <linux/platform_device.h> +#include "pinctrl-brcmstb.h" + +#define BRCMSTB_FSEL_COUNT 8 +#define BRCMSTB_FSEL_MASK 0xf + +#define BRCMSTB_PIN(i, f1, f2, f3, f4, f5, f6, f7, f8) \ + [i] = { \ + .funcs = (u8[]) { \ + func_##f1, \ + func_##f2, \ + func_##f3, \ + func_##f4, \ + func_##f5, \ + func_##f6, \ + func_##f7, \ + func_##f8, \ + }, \ + .n_funcs = BRCMSTB_FSEL_COUNT, \ + .func_mask = BRCMSTB_FSEL_MASK, \ + } + +enum bcm2712_funcs { + func_gpio, + func_alt1, + func_alt2, + func_alt3, + func_alt4, + func_alt5, + func_alt6, + func_alt7, + func_alt8, + func_aon_cpu_standbyb, + func_aon_fp_4sec_resetb, + func_aon_gpclk, + func_aon_pwm, + func_arm_jtag, + func_aud_fs_clk0, + func_avs_pmu_bsc, + func_bsc_m0, + func_bsc_m1, + func_bsc_m2, + func_bsc_m3, + func_clk_observe, + func_ctl_hdmi_5v, + func_enet0, + func_enet0_mii, + func_enet0_rgmii, + func_ext_sc_clk, + func_fl0, + func_fl1, + func_gpclk0, + func_gpclk1, + func_gpclk2, + func_hdmi_tx0_auto_i2c, + func_hdmi_tx0_bsc, + func_hdmi_tx1_auto_i2c, + func_hdmi_tx1_bsc, + func_i2s_in, + func_i2s_out, + func_ir_in, + func_mtsif, + func_mtsif_alt, + func_mtsif_alt1, + func_pdm, + func_pkt, + func_pm_led_out, + func_sc0, + func_sd0, + func_sd2, + func_sd_card_a, + func_sd_card_b, + func_sd_card_c, + func_sd_card_d, + func_sd_card_e, + func_sd_card_f, + func_sd_card_g, + func_spdif_out, + func_spi_m, + func_spi_s, + func_sr_edm_sense, + func_te0, + func_te1, + func_tsio, + func_uart0, + func_uart1, + func_uart2, + func_usb_pwr, + func_usb_vbus, + func_uui, + func_vc_i2c0, + func_vc_i2c3, + func_vc_i2c4, + func_vc_i2c5, + func_vc_i2csl, + func_vc_pcm, + func_vc_pwm0, + func_vc_pwm1, + func_vc_spi0, + func_vc_spi3, + func_vc_spi4, + func_vc_spi5, + func_vc_uart0, + func_vc_uart2, + func_vc_uart3, + func_vc_uart4, + func__, + func_count = func__ +}; + +static const struct pin_regs bcm2712_c0_gpio_pin_regs[] = { + GPIO_REGS(0, 0, 0, 7, 7), + GPIO_REGS(1, 0, 1, 7, 8), + GPIO_REGS(2, 0, 2, 7, 9), + GPIO_REGS(3, 0, 3, 7, 10), + GPIO_REGS(4, 0, 4, 7, 11), + GPIO_REGS(5, 0, 5, 7, 12), + GPIO_REGS(6, 0, 6, 7, 13), + GPIO_REGS(7, 0, 7, 7, 14), + GPIO_REGS(8, 1, 0, 8, 0), + GPIO_REGS(9, 1, 1, 8, 1), + GPIO_REGS(10, 1, 2, 8, 2), + GPIO_REGS(11, 1, 3, 8, 3), + GPIO_REGS(12, 1, 4, 8, 4), + GPIO_REGS(13, 1, 5, 8, 5), + GPIO_REGS(14, 1, 6, 8, 6), + GPIO_REGS(15, 1, 7, 8, 7), + GPIO_REGS(16, 2, 0, 8, 8), + GPIO_REGS(17, 2, 1, 8, 9), + GPIO_REGS(18, 2, 2, 8, 10), + GPIO_REGS(19, 2, 3, 8, 11), + GPIO_REGS(20, 2, 4, 8, 12), + GPIO_REGS(21, 2, 5, 8, 13), + GPIO_REGS(22, 2, 6, 8, 14), + GPIO_REGS(23, 2, 7, 9, 0), + GPIO_REGS(24, 3, 0, 9, 1), + GPIO_REGS(25, 3, 1, 9, 2), + GPIO_REGS(26, 3, 2, 9, 3), + GPIO_REGS(27, 3, 3, 9, 4), + GPIO_REGS(28, 3, 4, 9, 5), + GPIO_REGS(29, 3, 5, 9, 6), + GPIO_REGS(30, 3, 6, 9, 7), + GPIO_REGS(31, 3, 7, 9, 8), + GPIO_REGS(32, 4, 0, 9, 9), + GPIO_REGS(33, 4, 1, 9, 10), + GPIO_REGS(34, 4, 2, 9, 11), + GPIO_REGS(35, 4, 3, 9, 12), + GPIO_REGS(36, 4, 4, 9, 13), + GPIO_REGS(37, 4, 5, 9, 14), + GPIO_REGS(38, 4, 6, 10, 0), + GPIO_REGS(39, 4, 7, 10, 1), + GPIO_REGS(40, 5, 0, 10, 2), + GPIO_REGS(41, 5, 1, 10, 3), + GPIO_REGS(42, 5, 2, 10, 4), + GPIO_REGS(43, 5, 3, 10, 5), + GPIO_REGS(44, 5, 4, 10, 6), + GPIO_REGS(45, 5, 5, 10, 7), + GPIO_REGS(46, 5, 6, 10, 8),
+ GPIO_REGS(47, 5, 7, 10, 9), + GPIO_REGS(48, 6, 0, 10, 10), + GPIO_REGS(49, 6, 1, 10, 11), + GPIO_REGS(50, 6, 2, 10, 12), + GPIO_REGS(51, 6, 3, 10, 13), + GPIO_REGS(52, 6, 4, 10, 14), + GPIO_REGS(53, 6, 5, 11, 0), + EMMC_REGS(54, 11, 1), /* EMMC_CMD */ + EMMC_REGS(55, 11, 2), /* EMMC_DS */ + EMMC_REGS(56, 11, 3), /* EMMC_CLK */ + EMMC_REGS(57, 11, 4), /* EMMC_DAT0 */ + EMMC_REGS(58, 11, 5), /* EMMC_DAT1 */ + EMMC_REGS(59, 11, 6), /* EMMC_DAT2 */ + EMMC_REGS(60, 11, 7), /* EMMC_DAT3 */ + EMMC_REGS(61, 11, 8), /* EMMC_DAT4 */ + EMMC_REGS(62, 11, 9), /* EMMC_DAT5 */ + EMMC_REGS(63, 11, 10), /* EMMC_DAT6 */ + EMMC_REGS(64, 11, 11), /* EMMC_DAT7 */ +}; + +static struct pin_regs bcm2712_c0_aon_gpio_pin_regs[] = { + AON_GPIO_REGS(0, 3, 0, 6, 10), + AON_GPIO_REGS(1, 3, 1, 6, 11), + AON_GPIO_REGS(2, 3, 2, 6, 12), + AON_GPIO_REGS(3, 3, 3, 6, 13), + AON_GPIO_REGS(4, 3, 4, 6, 14), + AON_GPIO_REGS(5, 3, 5, 7, 0), + AON_GPIO_REGS(6, 3, 6, 7, 1), + AON_GPIO_REGS(7, 3, 7, 7, 2), + AON_GPIO_REGS(8, 4, 0, 7, 3), + AON_GPIO_REGS(9, 4, 1, 7, 4), + AON_GPIO_REGS(10, 4, 2, 7, 5), + AON_GPIO_REGS(11, 4, 3, 7, 6), + AON_GPIO_REGS(12, 4, 4, 7, 7), + AON_GPIO_REGS(13, 4, 5, 7, 8), + AON_GPIO_REGS(14, 4, 6, 7, 9), + AON_GPIO_REGS(15, 4, 7, 7, 10), + AON_GPIO_REGS(16, 5, 0, 7, 11), + AON_SGPIO_REGS(0, 0, 0), + AON_SGPIO_REGS(1, 0, 1), + AON_SGPIO_REGS(2, 0, 2), + AON_SGPIO_REGS(3, 0, 3), + AON_SGPIO_REGS(4, 1, 0), + AON_SGPIO_REGS(5, 2, 0), +}; + +static const struct pinctrl_pin_desc bcm2712_c0_gpio_pins[] = { + GPIO_PIN(0), + GPIO_PIN(1), + GPIO_PIN(2), + GPIO_PIN(3), + GPIO_PIN(4), + GPIO_PIN(5), + GPIO_PIN(6), + GPIO_PIN(7), + GPIO_PIN(8), + GPIO_PIN(9), + GPIO_PIN(10), + GPIO_PIN(11), + GPIO_PIN(12), + GPIO_PIN(13), + GPIO_PIN(14), + GPIO_PIN(15), + GPIO_PIN(16), + GPIO_PIN(17), + GPIO_PIN(18), + GPIO_PIN(19), + GPIO_PIN(20), + GPIO_PIN(21), + GPIO_PIN(22), + GPIO_PIN(23), + GPIO_PIN(24), + GPIO_PIN(25), + GPIO_PIN(26), + GPIO_PIN(27), + GPIO_PIN(28), + GPIO_PIN(29), + GPIO_PIN(30), + GPIO_PIN(31), + GPIO_PIN(32), + GPIO_PIN(33), + GPIO_PIN(34), + GPIO_PIN(35), + GPIO_PIN(36), + GPIO_PIN(37), + GPIO_PIN(38), + GPIO_PIN(39), + GPIO_PIN(40), + GPIO_PIN(41), + GPIO_PIN(42), + GPIO_PIN(43), + GPIO_PIN(44), + GPIO_PIN(45), + GPIO_PIN(46), + GPIO_PIN(47), + GPIO_PIN(48), + GPIO_PIN(49), + GPIO_PIN(50), + GPIO_PIN(51), + GPIO_PIN(52), + GPIO_PIN(53), + PINCTRL_PIN(54, "emmc_cmd"), + PINCTRL_PIN(55, "emmc_ds"), + PINCTRL_PIN(56, "emmc_clk"), + PINCTRL_PIN(57, "emmc_dat0"), + PINCTRL_PIN(58, "emmc_dat1"), + PINCTRL_PIN(59, "emmc_dat2"), + PINCTRL_PIN(60, "emmc_dat3"), + PINCTRL_PIN(61, "emmc_dat4"), + PINCTRL_PIN(62, "emmc_dat5"), + PINCTRL_PIN(63, "emmc_dat6"), + PINCTRL_PIN(64, "emmc_dat7"), +}; + +static struct pinctrl_pin_desc bcm2712_c0_aon_gpio_pins[] = { + AON_GPIO_PIN(0), AON_GPIO_PIN(1), AON_GPIO_PIN(2), AON_GPIO_PIN(3), + AON_GPIO_PIN(4), AON_GPIO_PIN(5), AON_GPIO_PIN(6), AON_GPIO_PIN(7), + AON_GPIO_PIN(8), AON_GPIO_PIN(9), AON_GPIO_PIN(10), AON_GPIO_PIN(11), + AON_GPIO_PIN(12), AON_GPIO_PIN(13), AON_GPIO_PIN(14), AON_GPIO_PIN(15), + AON_GPIO_PIN(16), AON_SGPIO_PIN(0), AON_SGPIO_PIN(1), AON_SGPIO_PIN(2), + AON_SGPIO_PIN(3), AON_SGPIO_PIN(4), AON_SGPIO_PIN(5), +}; + +static const struct pin_regs bcm2712_d0_gpio_pin_regs[] = { + GPIO_REGS(1, 0, 0, 4, 5), + GPIO_REGS(2, 0, 1, 4, 6), + GPIO_REGS(3, 0, 2, 4, 7), + GPIO_REGS(4, 0, 3, 4, 8), + GPIO_REGS(10, 0, 4, 4, 9), + GPIO_REGS(11, 0, 5, 4, 10), + GPIO_REGS(12, 0, 6, 4, 11), + GPIO_REGS(13, 0, 7, 4, 12), + GPIO_REGS(14, 1, 0, 4, 13), + GPIO_REGS(15, 1, 1, 4, 14), 
+ GPIO_REGS(18, 1, 2, 5, 0), + GPIO_REGS(19, 1, 3, 5, 1), + GPIO_REGS(20, 1, 4, 5, 2), + GPIO_REGS(21, 1, 5, 5, 3), + GPIO_REGS(22, 1, 6, 5, 4), + GPIO_REGS(23, 1, 7, 5, 5), + GPIO_REGS(24, 2, 0, 5, 6), + GPIO_REGS(25, 2, 1, 5, 7), + GPIO_REGS(26, 2, 2, 5, 8), + GPIO_REGS(27, 2, 3, 5, 9), + GPIO_REGS(28, 2, 4, 5, 10), + GPIO_REGS(29, 2, 5, 5, 11), + GPIO_REGS(30, 2, 6, 5, 12), + GPIO_REGS(31, 2, 7, 5, 13), + GPIO_REGS(32, 3, 0, 5, 14), + GPIO_REGS(33, 3, 1, 6, 0), + GPIO_REGS(34, 3, 2, 6, 1), + GPIO_REGS(35, 3, 3, 6, 2), + EMMC_REGS(36, 6, 3), /* EMMC_CMD */ + EMMC_REGS(37, 6, 4), /* EMMC_DS */ + EMMC_REGS(38, 6, 5), /* EMMC_CLK */ + EMMC_REGS(39, 6, 6), /* EMMC_DAT0 */ + EMMC_REGS(40, 6, 7), /* EMMC_DAT1 */ + EMMC_REGS(41, 6, 8), /* EMMC_DAT2 */ + EMMC_REGS(42, 6, 9), /* EMMC_DAT3 */ + EMMC_REGS(43, 6, 10), /* EMMC_DAT4 */ + EMMC_REGS(44, 6, 11), /* EMMC_DAT5 */ + EMMC_REGS(45, 6, 12), /* EMMC_DAT6 */ + EMMC_REGS(46, 6, 13), /* EMMC_DAT7 */ +}; + +static struct pin_regs bcm2712_d0_aon_gpio_pin_regs[] = { + AON_GPIO_REGS(0, 3, 0, 5, 9), + AON_GPIO_REGS(1, 3, 1, 5, 10), + AON_GPIO_REGS(2, 3, 2, 5, 11), + AON_GPIO_REGS(3, 3, 3, 5, 12), + AON_GPIO_REGS(4, 3, 4, 5, 13), + AON_GPIO_REGS(5, 3, 5, 5, 14), + AON_GPIO_REGS(6, 3, 6, 6, 0), + AON_GPIO_REGS(8, 3, 7, 6, 1), + AON_GPIO_REGS(9, 4, 0, 6, 2), + AON_GPIO_REGS(12, 4, 1, 6, 3), + AON_GPIO_REGS(13, 4, 2, 6, 4), + AON_GPIO_REGS(14, 4, 3, 6, 5), + AON_SGPIO_REGS(0, 0, 0), + AON_SGPIO_REGS(1, 0, 1), + AON_SGPIO_REGS(2, 0, 2), + AON_SGPIO_REGS(3, 0, 3), + AON_SGPIO_REGS(4, 1, 0), + AON_SGPIO_REGS(5, 2, 0), +}; + +static const struct pinctrl_pin_desc bcm2712_d0_gpio_pins[] = { + GPIO_PIN(1), + GPIO_PIN(2), + GPIO_PIN(3), + GPIO_PIN(4), + GPIO_PIN(10), + GPIO_PIN(11), + GPIO_PIN(12), + GPIO_PIN(13), + GPIO_PIN(14), + GPIO_PIN(15), + GPIO_PIN(18), + GPIO_PIN(19), + GPIO_PIN(20), + GPIO_PIN(21), + GPIO_PIN(22), + GPIO_PIN(23), + GPIO_PIN(24), + GPIO_PIN(25), + GPIO_PIN(26), + GPIO_PIN(27), + GPIO_PIN(28), + GPIO_PIN(29), + GPIO_PIN(30), + GPIO_PIN(31), + GPIO_PIN(32), + GPIO_PIN(33), + GPIO_PIN(34), + GPIO_PIN(35), + PINCTRL_PIN(36, "emmc_cmd"), + PINCTRL_PIN(37, "emmc_ds"), + PINCTRL_PIN(38, "emmc_clk"), + PINCTRL_PIN(39, "emmc_dat0"), + PINCTRL_PIN(40, "emmc_dat1"), + PINCTRL_PIN(41, "emmc_dat2"), + PINCTRL_PIN(42, "emmc_dat3"), + PINCTRL_PIN(43, "emmc_dat4"), + PINCTRL_PIN(44, "emmc_dat5"), + PINCTRL_PIN(45, "emmc_dat6"), + PINCTRL_PIN(46, "emmc_dat7"), +}; + +static struct pinctrl_pin_desc bcm2712_d0_aon_gpio_pins[] = { + AON_GPIO_PIN(0), AON_GPIO_PIN(1), AON_GPIO_PIN(2), AON_GPIO_PIN(3), + AON_GPIO_PIN(4), AON_GPIO_PIN(5), AON_GPIO_PIN(6), AON_GPIO_PIN(8), + AON_GPIO_PIN(9), AON_GPIO_PIN(12), AON_GPIO_PIN(13), AON_GPIO_PIN(14), + AON_SGPIO_PIN(0), AON_SGPIO_PIN(1), AON_SGPIO_PIN(2), + AON_SGPIO_PIN(3), AON_SGPIO_PIN(4), AON_SGPIO_PIN(5), +}; + +static const char * const bcm2712_func_names[] = { + BRCMSTB_FUNC(gpio), + BRCMSTB_FUNC(alt1), + BRCMSTB_FUNC(alt2), + BRCMSTB_FUNC(alt3), + BRCMSTB_FUNC(alt4), + BRCMSTB_FUNC(alt5), + BRCMSTB_FUNC(alt6), + BRCMSTB_FUNC(alt7), + BRCMSTB_FUNC(alt8), + BRCMSTB_FUNC(aon_cpu_standbyb), + BRCMSTB_FUNC(aon_fp_4sec_resetb), + BRCMSTB_FUNC(aon_gpclk), + BRCMSTB_FUNC(aon_pwm), + BRCMSTB_FUNC(arm_jtag), + BRCMSTB_FUNC(aud_fs_clk0), + BRCMSTB_FUNC(avs_pmu_bsc), + BRCMSTB_FUNC(bsc_m0), + BRCMSTB_FUNC(bsc_m1), + BRCMSTB_FUNC(bsc_m2), + BRCMSTB_FUNC(bsc_m3), + BRCMSTB_FUNC(clk_observe), + BRCMSTB_FUNC(ctl_hdmi_5v), + BRCMSTB_FUNC(enet0), + BRCMSTB_FUNC(enet0_mii), + BRCMSTB_FUNC(enet0_rgmii), + BRCMSTB_FUNC(ext_sc_clk), 
+ BRCMSTB_FUNC(fl0), + BRCMSTB_FUNC(fl1), + BRCMSTB_FUNC(gpclk0), + BRCMSTB_FUNC(gpclk1), + BRCMSTB_FUNC(gpclk2), + BRCMSTB_FUNC(hdmi_tx0_auto_i2c), + BRCMSTB_FUNC(hdmi_tx0_bsc), + BRCMSTB_FUNC(hdmi_tx1_auto_i2c), + BRCMSTB_FUNC(hdmi_tx1_bsc), + BRCMSTB_FUNC(i2s_in), + BRCMSTB_FUNC(i2s_out), + BRCMSTB_FUNC(ir_in), + BRCMSTB_FUNC(mtsif), + BRCMSTB_FUNC(mtsif_alt), + BRCMSTB_FUNC(mtsif_alt1), + BRCMSTB_FUNC(pdm), + BRCMSTB_FUNC(pkt), + BRCMSTB_FUNC(pm_led_out), + BRCMSTB_FUNC(sc0), + BRCMSTB_FUNC(sd0), + BRCMSTB_FUNC(sd2), + BRCMSTB_FUNC(sd_card_a), + BRCMSTB_FUNC(sd_card_b), + BRCMSTB_FUNC(sd_card_c), + BRCMSTB_FUNC(sd_card_d), + BRCMSTB_FUNC(sd_card_e), + BRCMSTB_FUNC(sd_card_f), + BRCMSTB_FUNC(sd_card_g), + BRCMSTB_FUNC(spdif_out), + BRCMSTB_FUNC(spi_m), + BRCMSTB_FUNC(spi_s), + BRCMSTB_FUNC(sr_edm_sense), + BRCMSTB_FUNC(te0), + BRCMSTB_FUNC(te1), + BRCMSTB_FUNC(tsio), + BRCMSTB_FUNC(uart0), + BRCMSTB_FUNC(uart1), + BRCMSTB_FUNC(uart2), + BRCMSTB_FUNC(usb_pwr), + BRCMSTB_FUNC(usb_vbus), + BRCMSTB_FUNC(uui), + BRCMSTB_FUNC(vc_i2c0), + BRCMSTB_FUNC(vc_i2c3), + BRCMSTB_FUNC(vc_i2c4), + BRCMSTB_FUNC(vc_i2c5), + BRCMSTB_FUNC(vc_i2csl), + BRCMSTB_FUNC(vc_pcm), + BRCMSTB_FUNC(vc_pwm0), + BRCMSTB_FUNC(vc_pwm1), + BRCMSTB_FUNC(vc_spi0), + BRCMSTB_FUNC(vc_spi3), + BRCMSTB_FUNC(vc_spi4), + BRCMSTB_FUNC(vc_spi5), + BRCMSTB_FUNC(vc_uart0), + BRCMSTB_FUNC(vc_uart2), + BRCMSTB_FUNC(vc_uart3), + BRCMSTB_FUNC(vc_uart4), +}; + +static const struct brcmstb_pin_funcs bcm2712_c0_aon_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, ir_in, vc_spi0, vc_uart3, vc_i2c3, te0, vc_i2c0, _, _), + BRCMSTB_PIN(1, vc_pwm0, vc_spi0, vc_uart3, vc_i2c3, te1, aon_pwm, vc_i2c0, vc_pwm1), + BRCMSTB_PIN(2, vc_pwm0, vc_spi0, vc_uart3, ctl_hdmi_5v, fl0, aon_pwm, ir_in, vc_pwm1), + BRCMSTB_PIN(3, ir_in, vc_spi0, vc_uart3, aon_fp_4sec_resetb, fl1, sd_card_g, aon_gpclk, _), + BRCMSTB_PIN(4, gpclk0, vc_spi0, vc_i2csl, aon_gpclk, pm_led_out, aon_pwm, sd_card_g, vc_pwm0), + BRCMSTB_PIN(5, gpclk1, ir_in, vc_i2csl, clk_observe, aon_pwm, sd_card_g, vc_pwm0, _), + BRCMSTB_PIN(6, uart1, vc_uart4, gpclk2, ctl_hdmi_5v, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(7, uart1, vc_uart4, gpclk0, aon_pwm, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(8, uart1, vc_uart4, vc_i2csl, ctl_hdmi_5v, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(9, uart1, vc_uart4, vc_i2csl, aon_pwm, vc_uart0, vc_spi3, _, _), + BRCMSTB_PIN(10, tsio, ctl_hdmi_5v, sc0, spdif_out, vc_spi5, usb_pwr, aon_gpclk, sd_card_f), + BRCMSTB_PIN(11, tsio, uart0, sc0, aud_fs_clk0, vc_spi5, usb_vbus, vc_uart2, sd_card_f), + BRCMSTB_PIN(12, tsio, uart0, vc_uart0, tsio, vc_spi5, usb_pwr, vc_uart2, sd_card_f), + BRCMSTB_PIN(13, bsc_m1, uart0, vc_uart0, uui, vc_spi5, arm_jtag, vc_uart2, vc_i2c3), + BRCMSTB_PIN(14, bsc_m1, uart0, vc_uart0, uui, vc_spi5, arm_jtag, vc_uart2, vc_i2c3), + BRCMSTB_PIN(15, ir_in, aon_fp_4sec_resetb, vc_uart0, pm_led_out, ctl_hdmi_5v, aon_pwm, aon_gpclk, _), + BRCMSTB_PIN(16, aon_cpu_standbyb, gpclk0, pm_led_out, ctl_hdmi_5v, vc_pwm0, usb_pwr, aud_fs_clk0, _), +}; + +static const struct brcmstb_pin_funcs bcm2712_c0_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, bsc_m3, vc_i2c0, gpclk0, enet0, vc_pwm1, vc_spi0, ir_in, _), + BRCMSTB_PIN(1, bsc_m3, vc_i2c0, gpclk1, enet0, vc_pwm1, sr_edm_sense, vc_spi0, vc_uart3), + BRCMSTB_PIN(2, pdm, i2s_in, gpclk2, vc_spi4, pkt, vc_spi0, vc_uart3, _), + BRCMSTB_PIN(3, pdm, i2s_in, vc_spi4, pkt, vc_spi0, vc_uart3, _, _), + BRCMSTB_PIN(4, pdm, i2s_in, arm_jtag, vc_spi4, pkt, vc_spi0, vc_uart3, _), + BRCMSTB_PIN(5, pdm, vc_i2c3, arm_jtag, sd_card_e, vc_spi4, pkt, vc_pcm, 
vc_i2c5), + BRCMSTB_PIN(6, pdm, vc_i2c3, arm_jtag, sd_card_e, vc_spi4, pkt, vc_pcm, vc_i2c5), + BRCMSTB_PIN(7, i2s_out, spdif_out, arm_jtag, sd_card_e, vc_i2c3, enet0_rgmii, vc_pcm, vc_spi4), + BRCMSTB_PIN(8, i2s_out, aud_fs_clk0, arm_jtag, sd_card_e, vc_i2c3, enet0_mii, vc_pcm, vc_spi4), + BRCMSTB_PIN(9, i2s_out, aud_fs_clk0, arm_jtag, sd_card_e, enet0_mii, sd_card_c, vc_spi4, _), + BRCMSTB_PIN(10, bsc_m3, mtsif_alt1, i2s_in, i2s_out, vc_spi5, enet0_mii, sd_card_c, vc_spi4), + BRCMSTB_PIN(11, bsc_m3, mtsif_alt1, i2s_in, i2s_out, vc_spi5, enet0_mii, sd_card_c, vc_spi4), + BRCMSTB_PIN(12, spi_s, mtsif_alt1, i2s_in, i2s_out, vc_spi5, vc_i2csl, sd0, sd_card_d), + BRCMSTB_PIN(13, spi_s, mtsif_alt1, i2s_out, usb_vbus, vc_spi5, vc_i2csl, sd0, sd_card_d), + BRCMSTB_PIN(14, spi_s, vc_i2csl, enet0_rgmii, arm_jtag, vc_spi5, vc_pwm0, vc_i2c4, sd_card_d), + BRCMSTB_PIN(15, spi_s, vc_i2csl, vc_spi3, arm_jtag, vc_pwm0, vc_i2c4, gpclk0, _), + BRCMSTB_PIN(16, sd_card_b, i2s_out, vc_spi3, i2s_in, sd0, enet0_rgmii, gpclk1, _), + BRCMSTB_PIN(17, sd_card_b, i2s_out, vc_spi3, i2s_in, ext_sc_clk, sd0, enet0_rgmii, gpclk2), + BRCMSTB_PIN(18, sd_card_b, i2s_out, vc_spi3, i2s_in, sd0, enet0_rgmii, vc_pwm1, _), + BRCMSTB_PIN(19, sd_card_b, usb_pwr, vc_spi3, pkt, spdif_out, sd0, ir_in, vc_pwm1), + BRCMSTB_PIN(20, sd_card_b, uui, vc_uart0, arm_jtag, uart2, usb_pwr, vc_pcm, vc_uart4), + BRCMSTB_PIN(21, usb_pwr, uui, vc_uart0, arm_jtag, uart2, sd_card_b, vc_pcm, vc_uart4), + BRCMSTB_PIN(22, usb_pwr, enet0, vc_uart0, mtsif, uart2, usb_vbus, vc_pcm, vc_i2c5), + BRCMSTB_PIN(23, usb_vbus, enet0, vc_uart0, mtsif, uart2, i2s_out, vc_pcm, vc_i2c5), + BRCMSTB_PIN(24, mtsif, pkt, uart0, enet0_rgmii, enet0_rgmii, vc_i2c4, vc_uart3, _), + BRCMSTB_PIN(25, mtsif, pkt, sc0, uart0, enet0_rgmii, enet0_rgmii, vc_i2c4, vc_uart3), + BRCMSTB_PIN(26, mtsif, pkt, sc0, uart0, enet0_rgmii, vc_uart4, vc_spi5, _), + BRCMSTB_PIN(27, mtsif, pkt, sc0, uart0, enet0_rgmii, vc_uart4, vc_spi5, _), + BRCMSTB_PIN(28, mtsif, pkt, sc0, enet0_rgmii, vc_uart4, vc_spi5, _, _), + BRCMSTB_PIN(29, mtsif, pkt, sc0, enet0_rgmii, vc_uart4, vc_spi5, _, _), + BRCMSTB_PIN(30, mtsif, pkt, sc0, sd2, enet0_rgmii, gpclk0, vc_pwm0, _), + BRCMSTB_PIN(31, mtsif, pkt, sc0, sd2, enet0_rgmii, vc_spi3, vc_pwm0, _), + BRCMSTB_PIN(32, mtsif, pkt, sc0, sd2, enet0_rgmii, vc_spi3, vc_uart3, _), + BRCMSTB_PIN(33, mtsif, pkt, sd2, enet0_rgmii, vc_spi3, vc_uart3, _, _), + BRCMSTB_PIN(34, mtsif, pkt, ext_sc_clk, sd2, enet0_rgmii, vc_spi3, vc_i2c5, _), + BRCMSTB_PIN(35, mtsif, pkt, sd2, enet0_rgmii, vc_spi3, vc_i2c5, _, _), + BRCMSTB_PIN(36, sd0, mtsif, sc0, i2s_in, vc_uart3, vc_uart2, _, _), + BRCMSTB_PIN(37, sd0, mtsif, sc0, vc_spi0, i2s_in, vc_uart3, vc_uart2, _), + BRCMSTB_PIN(38, sd0, mtsif_alt, sc0, vc_spi0, i2s_in, vc_uart3, vc_uart2, _), + BRCMSTB_PIN(39, sd0, mtsif_alt, sc0, vc_spi0, vc_uart3, vc_uart2, _, _), + BRCMSTB_PIN(40, sd0, mtsif_alt, sc0, vc_spi0, bsc_m3, _, _, _), + BRCMSTB_PIN(41, sd0, mtsif_alt, sc0, vc_spi0, bsc_m3, _, _, _), + BRCMSTB_PIN(42, vc_spi0, mtsif_alt, vc_i2c0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(43, vc_spi0, mtsif_alt, vc_i2c0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(44, vc_spi0, mtsif_alt, enet0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(45, vc_spi0, mtsif_alt, enet0, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m), + BRCMSTB_PIN(46, vc_spi0, mtsif_alt, sd_card_a, mtsif_alt1, arm_jtag, pdm, spi_m, _), + BRCMSTB_PIN(47, enet0, mtsif_alt, i2s_out, mtsif_alt1, arm_jtag, _, _, _), + BRCMSTB_PIN(48, 
sc0, usb_pwr, spdif_out, mtsif, _, _, _, _), + BRCMSTB_PIN(49, sc0, usb_pwr, aud_fs_clk0, mtsif, _, _, _, _), + BRCMSTB_PIN(50, sc0, usb_vbus, sc0, _, _, _, _, _), + BRCMSTB_PIN(51, sc0, enet0, sc0, sr_edm_sense, _, _, _, _), + BRCMSTB_PIN(52, sc0, enet0, vc_pwm1, _, _, _, _, _), + BRCMSTB_PIN(53, sc0, enet0_rgmii, ext_sc_clk, _, _, _, _, _), +}; + +static const struct brcmstb_pin_funcs bcm2712_d0_aon_gpio_pin_funcs[] = { + BRCMSTB_PIN(0, ir_in, vc_spi0, vc_uart0, vc_i2c3, uart0, vc_i2c0, _, _), + BRCMSTB_PIN(1, vc_pwm0, vc_spi0, vc_uart0, vc_i2c3, uart0, aon_pwm, vc_i2c0, vc_pwm1), + BRCMSTB_PIN(2, vc_pwm0, vc_spi0, vc_uart0, ctl_hdmi_5v, uart0, aon_pwm, ir_in, vc_pwm1), + BRCMSTB_PIN(3, ir_in, vc_spi0, vc_uart0, uart0, sd_card_g, aon_gpclk, _, _), + BRCMSTB_PIN(4, gpclk0, vc_spi0, pm_led_out, aon_pwm, sd_card_g, vc_pwm0, _, _), + BRCMSTB_PIN(5, gpclk1, ir_in, aon_pwm, sd_card_g, vc_pwm0, _, _, _), + BRCMSTB_PIN(6, uart1, vc_uart2, ctl_hdmi_5v, gpclk2, vc_spi3, _, _, _), + BRCMSTB_PIN(7, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(8, uart1, vc_uart2, ctl_hdmi_5v, vc_spi0, vc_spi3, _, _, _), + BRCMSTB_PIN(9, uart1, vc_uart2, vc_uart0, aon_pwm, vc_spi0, vc_uart2, vc_spi3, _), + BRCMSTB_PIN(10, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(11, _, _, _, _, _, _, _, _), /* non-existent on D0 silicon */ + BRCMSTB_PIN(12, uart1, vc_uart2, vc_uart0, vc_spi0, usb_pwr, vc_uart2, vc_spi3, _), + BRCMSTB_PIN(13, bsc_m1, vc_uart0, uui, vc_spi0, arm_jtag, vc_uart2, vc_i2c3, _), + BRCMSTB_PIN(14, bsc_m1, aon_gpclk, vc_uart0, uui, vc_spi0, arm_jtag, vc_uart2, vc_i2c3), +}; + +static const struct brcmstb_pin_funcs bcm2712_d0_gpio_pin_funcs[] = { + BRCMSTB_PIN(1, vc_i2c0, usb_pwr, gpclk0, sd_card_e, vc_spi3, sr_edm_sense, vc_spi0, vc_uart0), + BRCMSTB_PIN(2, vc_i2c0, usb_pwr, gpclk1, sd_card_e, vc_spi3, clk_observe, vc_spi0, vc_uart0), + BRCMSTB_PIN(3, vc_i2c3, usb_vbus, gpclk2, sd_card_e, vc_spi3, vc_spi0, vc_uart0, _), + BRCMSTB_PIN(4, vc_i2c3, vc_pwm1, vc_spi3, sd_card_e, vc_spi3, vc_spi0, vc_uart0, _), + BRCMSTB_PIN(10, bsc_m3, vc_pwm1, vc_spi3, sd_card_e, vc_spi3, gpclk0, _, _), + BRCMSTB_PIN(11, bsc_m3, vc_spi3, clk_observe, sd_card_c, gpclk1, _, _, _), + BRCMSTB_PIN(12, spi_s, vc_spi3, sd_card_c, sd_card_d, _, _, _, _), + BRCMSTB_PIN(13, spi_s, vc_spi3, sd_card_c, sd_card_d, _, _, _, _), + BRCMSTB_PIN(14, spi_s, uui, arm_jtag, vc_pwm0, vc_i2c0, sd_card_d, _, _), + BRCMSTB_PIN(15, spi_s, uui, arm_jtag, vc_pwm0, vc_i2c0, gpclk0, _, _), + BRCMSTB_PIN(18, sd_card_f, vc_pwm1, _, _, _, _, _, _), + BRCMSTB_PIN(19, sd_card_f, usb_pwr, vc_pwm1, _, _, _, _, _), + BRCMSTB_PIN(20, vc_i2c3, uui, vc_uart0, arm_jtag, vc_uart2, _, _, _), + BRCMSTB_PIN(21, vc_i2c3, uui, vc_uart0, arm_jtag, vc_uart2, _, _, _), + BRCMSTB_PIN(22, sd_card_f, vc_uart0, vc_i2c3, _, _, _, _, _), + BRCMSTB_PIN(23, vc_uart0, vc_i2c3, _, _, _, _, _, _), + BRCMSTB_PIN(24, sd_card_b, vc_spi0, arm_jtag, uart0, usb_pwr, vc_uart2, vc_uart0, _), + BRCMSTB_PIN(25, sd_card_b, vc_spi0, arm_jtag, uart0, usb_pwr, vc_uart2, vc_uart0, _), + BRCMSTB_PIN(26, sd_card_b, vc_spi0, arm_jtag, uart0, usb_vbus, vc_uart2, vc_spi0, _), + BRCMSTB_PIN(27, sd_card_b, vc_spi0, arm_jtag, uart0, vc_uart2, vc_spi0, _, _), + BRCMSTB_PIN(28, sd_card_b, vc_spi0, arm_jtag, vc_i2c0, vc_spi0, _, _, _), + BRCMSTB_PIN(29, arm_jtag, vc_i2c0, vc_spi0, _, _, _, _, _), + BRCMSTB_PIN(30, sd2, gpclk0, vc_pwm0, _, _, _, _, _), + BRCMSTB_PIN(31, sd2, vc_spi3, vc_pwm0, _, _, _, _, _), + BRCMSTB_PIN(32, sd2, vc_spi3, vc_uart3, _, _, _, _, 
_), + BRCMSTB_PIN(33, sd2, vc_spi3, vc_uart3, _, _, _, _, _), + BRCMSTB_PIN(34, sd2, vc_spi3, vc_i2c5, _, _, _, _, _), + BRCMSTB_PIN(35, sd2, vc_spi3, vc_i2c5, _, _, _, _, _), +}; + +static const struct pinctrl_desc bcm2712_c0_pinctrl_desc = { + .name = "pinctrl-bcm2712", + .pins = bcm2712_c0_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_c0_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_c0_aon_pinctrl_desc = { + .name = "aon-pinctrl-bcm2712", + .pins = bcm2712_c0_aon_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_c0_aon_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_d0_pinctrl_desc = { + .name = "pinctrl-bcm2712", + .pins = bcm2712_d0_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_d0_gpio_pins), +}; + +static const struct pinctrl_desc bcm2712_d0_aon_pinctrl_desc = { + .name = "aon-pinctrl-bcm2712", + .pins = bcm2712_d0_aon_gpio_pins, + .npins = ARRAY_SIZE(bcm2712_d0_aon_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_c0_pinctrl_gpio_range = { + .name = "pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_c0_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_c0_aon_pinctrl_gpio_range = { + .name = "aon-pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_c0_aon_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_d0_pinctrl_gpio_range = { + .name = "pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_d0_gpio_pins), +}; + +static const struct pinctrl_gpio_range bcm2712_d0_aon_pinctrl_gpio_range = { + .name = "aon-pinctrl-bcm2712", + .npins = ARRAY_SIZE(bcm2712_d0_aon_gpio_pins), +}; + +static const struct brcmstb_pdata bcm2712_c0_pdata = { + .pctl_desc = &bcm2712_c0_pinctrl_desc, + .gpio_range = &bcm2712_c0_pinctrl_gpio_range, + .pin_regs = bcm2712_c0_gpio_pin_regs, + .pin_funcs = bcm2712_c0_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_c0_aon_pdata = { + .pctl_desc = &bcm2712_c0_aon_pinctrl_desc, + .gpio_range = &bcm2712_c0_aon_pinctrl_gpio_range, + .pin_regs = bcm2712_c0_aon_gpio_pin_regs, + .pin_funcs = bcm2712_c0_aon_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_d0_pdata = { + .pctl_desc = &bcm2712_d0_pinctrl_desc, + .gpio_range = &bcm2712_d0_pinctrl_gpio_range, + .pin_regs = bcm2712_d0_gpio_pin_regs, + .pin_funcs = bcm2712_d0_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static const struct brcmstb_pdata bcm2712_d0_aon_pdata = { + .pctl_desc = &bcm2712_d0_aon_pinctrl_desc, + .gpio_range = &bcm2712_d0_aon_pinctrl_gpio_range, + .pin_regs = bcm2712_d0_aon_gpio_pin_regs, + .pin_funcs = bcm2712_d0_aon_gpio_pin_funcs, + .func_count = func_count, + .func_gpio = func_gpio, + .func_names = bcm2712_func_names, +}; + +static int bcm2712_pinctrl_probe(struct platform_device *pdev) +{ + return brcmstb_pinctrl_probe(pdev); +} + +static const struct of_device_id bcm2712_pinctrl_match[] = { + { + .compatible = "brcm,bcm2712c0-pinctrl", + .data = &bcm2712_c0_pdata + }, + { + .compatible = "brcm,bcm2712c0-aon-pinctrl", + .data = &bcm2712_c0_aon_pdata + }, + + { + .compatible = "brcm,bcm2712d0-pinctrl", + .data = &bcm2712_d0_pdata + }, + { + .compatible = "brcm,bcm2712d0-aon-pinctrl", + .data = &bcm2712_d0_aon_pdata + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, bcm2712_pinctrl_match); + +static struct platform_driver bcm2712_pinctrl_driver = { + .probe = 
bcm2712_pinctrl_probe, + .driver = { + .name = "pinctrl-bcm2712", + .of_match_table = bcm2712_pinctrl_match, + .suppress_bind_attrs = true, + }, +}; +module_platform_driver(bcm2712_pinctrl_driver); + +MODULE_AUTHOR("Phil Elwell"); +MODULE_AUTHOR("Jonathan Bell"); +MODULE_AUTHOR("Ivan T. Ivanov"); +MODULE_AUTHOR("Andrea della Porta"); +MODULE_DESCRIPTION("Broadcom BCM2712 pinctrl driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb.c b/drivers/pinctrl/bcm/pinctrl-brcmstb.c new file mode 100644 index 00000000000000..f46b27155c3c40 --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for Broadcom brcmstb GPIO units (pinctrl only) + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. + * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#include <linux/cleanup.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_device.h> +#include <linux/platform_device.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/pinctrl/pinconf-generic.h> +#include <linux/pinctrl/pinconf.h> +#include <linux/pinctrl/pinmux.h> + +#include "pinctrl-brcmstb.h" + +#define BRCMSTB_PULL_NONE 0 +#define BRCMSTB_PULL_DOWN 1 +#define BRCMSTB_PULL_UP 2 +#define BRCMSTB_PULL_MASK 0x3 + +#define BIT_TO_REG(b) (((b) >> 5) << 2) +#define BIT_TO_SHIFT(b) ((b) & 0x1f) + +struct brcmstb_pinctrl { + struct device *dev; + void __iomem *base; + struct pinctrl_dev *pctl_dev; + struct pinctrl_desc pctl_desc; + const struct pin_regs *pin_regs; + const struct brcmstb_pin_funcs *pin_funcs; + const char * const *func_names; + unsigned int func_count; + unsigned int func_gpio; + const char *const *gpio_groups; + struct pinctrl_gpio_range gpio_range; + /* Protect FSEL registers */ + spinlock_t fsel_lock; +}; + +static unsigned int brcmstb_pinctrl_fsel_get(struct brcmstb_pinctrl *pc, + unsigned int pin) +{ + u32 bit = pc->pin_regs[pin].mux_bit; + unsigned int func; + int fsel; + u32 val; + + if (!bit) + return pc->func_gpio; + + bit &= ~MUX_BIT_VALID; + + val = readl(pc->base + BIT_TO_REG(bit)); + fsel = (val >> BIT_TO_SHIFT(bit)) & pc->pin_funcs[pin].func_mask; + func = pc->pin_funcs[pin].funcs[fsel]; + + if (func >= pc->func_count) + func = fsel; + + dev_dbg(pc->dev, "get %04x: %08x (%u => %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[func]); + + return func; +} + +static int brcmstb_pinctrl_fsel_set(struct brcmstb_pinctrl *pc, + unsigned int pin, unsigned int func) +{ + u32 bit = pc->pin_regs[pin].mux_bit, val, fsel_mask; + const u8 *pin_funcs; + int fsel; + int cur; + int i; + + if (!bit || func >= pc->func_count) + return -EINVAL; + + bit &= ~MUX_BIT_VALID; + + fsel = pc->pin_funcs[pin].n_funcs + 1; + fsel_mask = pc->pin_funcs[pin].func_mask; + + if (func >= fsel) { + /* Convert to an fsel number */ + pin_funcs = pc->pin_funcs[pin].funcs; + for (i = 1; i < fsel; i++) { + if (pin_funcs[i - 1] == func) { + fsel = i; + break; + } + } + } else { + fsel = func; + } + + if (fsel >= pc->pin_funcs[pin].n_funcs + 1) + return -EINVAL; + + guard(spinlock_irqsave)(&pc->fsel_lock); + + val = readl(pc->base + BIT_TO_REG(bit)); + cur = (val >> BIT_TO_SHIFT(bit)) & fsel_mask; + + dev_dbg(pc->dev, "read %04x: %08x (%u => %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[cur]); + + if (cur != fsel) { + val &= ~(fsel_mask << BIT_TO_SHIFT(bit)); + val |= fsel << BIT_TO_SHIFT(bit); +
dev_dbg(pc->dev, "write %04x: %08x (%u <= %s)\n", + BIT_TO_REG(bit), val, pin, + pc->func_names[fsel]); + writel(val, pc->base + BIT_TO_REG(bit)); + } + + return 0; +} + +static int brcmstb_pctl_get_groups_count(struct pinctrl_dev *pctldev) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->pctl_desc.npins; +} + +static const char *brcmstb_pctl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->gpio_groups[selector]; +} + +static int brcmstb_pctl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int selector, + const unsigned int **pins, + unsigned int *num_pins) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + *pins = &pc->pctl_desc.pins[selector].number; + *num_pins = 1; + + return 0; +} + +static void brcmstb_pctl_pin_dbg_show(struct pinctrl_dev *pctldev, + struct seq_file *s, unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + unsigned int fsel = brcmstb_pinctrl_fsel_get(pc, offset); + const char *fname = pc->func_names[fsel]; + + seq_printf(s, "function %s", fname); +} + +static void brcmstb_pctl_dt_free_map(struct pinctrl_dev *pctldev, + struct pinctrl_map *maps, + unsigned int num_maps) +{ + int i; + + for (i = 0; i < num_maps; i++) + if (maps[i].type == PIN_MAP_TYPE_CONFIGS_PIN) + kfree(maps[i].data.configs.configs); + + kfree(maps); +} + +static const struct pinctrl_ops brcmstb_pctl_ops = { + .get_groups_count = brcmstb_pctl_get_groups_count, + .get_group_name = brcmstb_pctl_get_group_name, + .get_group_pins = brcmstb_pctl_get_group_pins, + .pin_dbg_show = brcmstb_pctl_pin_dbg_show, + .dt_node_to_map = pinconf_generic_dt_node_to_map_all, + .dt_free_map = brcmstb_pctl_dt_free_map, +}; + +static int brcmstb_pmx_free(struct pinctrl_dev *pctldev, unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + /* disable by setting to GPIO */ + return brcmstb_pinctrl_fsel_set(pc, offset, pc->func_gpio); +} + +static int brcmstb_pmx_get_functions_count(struct pinctrl_dev *pctldev) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->func_count; +} + +static const char *brcmstb_pmx_get_function_name(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return (selector < pc->func_count) ? 
pc->func_names[selector] : NULL; +} + +static int brcmstb_pmx_get_function_groups(struct pinctrl_dev *pctldev, + unsigned int selector, + const char *const **groups, + unsigned *const num_groups) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + *groups = pc->gpio_groups; + *num_groups = pc->pctl_desc.npins; + + return 0; +} + +static int brcmstb_pmx_set(struct pinctrl_dev *pctldev, + unsigned int func_selector, + unsigned int group_selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + const struct pinctrl_desc *pctldesc = &pc->pctl_desc; + const struct pinctrl_pin_desc *pindesc; + + if (group_selector >= pctldesc->npins) + return -EINVAL; + + pindesc = &pctldesc->pins[group_selector]; + return brcmstb_pinctrl_fsel_set(pc, pindesc->number, func_selector); +} + +static int brcmstb_pmx_gpio_request_enable(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int pin) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return brcmstb_pinctrl_fsel_set(pc, pin, pc->func_gpio); +} + +static void brcmstb_pmx_gpio_disable_free(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int offset) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + /* disable by setting to GPIO */ + (void)brcmstb_pinctrl_fsel_set(pc, offset, pc->func_gpio); +} + +static bool brcmstb_pmx_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + + return pc->func_gpio == selector; +} + +static const struct pinmux_ops brcmstb_pmx_ops = { + .free = brcmstb_pmx_free, + .get_functions_count = brcmstb_pmx_get_functions_count, + .get_function_name = brcmstb_pmx_get_function_name, + .get_function_groups = brcmstb_pmx_get_function_groups, + .set_mux = brcmstb_pmx_set, + .gpio_request_enable = brcmstb_pmx_gpio_request_enable, + .gpio_disable_free = brcmstb_pmx_gpio_disable_free, + .function_is_gpio = brcmstb_pmx_function_is_gpio, + .strict = true, +}; + +static unsigned int brcmstb_pull_config_get(struct brcmstb_pinctrl *pc, + unsigned int pin) +{ + u32 bit = pc->pin_regs[pin].pad_bit, val; + + if (bit == PAD_BIT_INVALID) + return BRCMSTB_PULL_NONE; + + val = readl(pc->base + BIT_TO_REG(bit)); + return (val >> BIT_TO_SHIFT(bit)) & BRCMSTB_PULL_MASK; +} + +static int brcmstb_pull_config_set(struct brcmstb_pinctrl *pc, + unsigned int pin, unsigned int arg) +{ + u32 bit = pc->pin_regs[pin].pad_bit, val; + + if (bit == PAD_BIT_INVALID) { + dev_warn(pc->dev, "Can't set pulls for %s\n", + pc->gpio_groups[pin]); + return -EINVAL; + } + + guard(spinlock_irqsave)(&pc->fsel_lock); + + val = readl(pc->base + BIT_TO_REG(bit)); + val &= ~(BRCMSTB_PULL_MASK << BIT_TO_SHIFT(bit)); + val |= (arg << BIT_TO_SHIFT(bit)); + writel(val, pc->base + BIT_TO_REG(bit)); + + return 0; +} + +static int brcmstb_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, + unsigned long *config) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + enum pin_config_param param = pinconf_to_config_param(*config); + u32 arg; + + switch (param) { + case PIN_CONFIG_BIAS_DISABLE: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_NONE); + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_DOWN); + break; + case PIN_CONFIG_BIAS_PULL_UP: + arg = (brcmstb_pull_config_get(pc, pin) == BRCMSTB_PULL_UP); + break; + default: + return -ENOTSUPP; + } + + *config = pinconf_to_config_packed(param, 
arg); + + return 0; +} + +static int brcmstb_pinconf_set(struct pinctrl_dev *pctldev, + unsigned int pin, unsigned long *configs, + unsigned int num_configs) +{ + struct brcmstb_pinctrl *pc = pinctrl_dev_get_drvdata(pctldev); + int ret = 0; + u32 param; + int i; + + for (i = 0; i < num_configs; i++) { + param = pinconf_to_config_param(configs[i]); + + switch (param) { + case PIN_CONFIG_BIAS_DISABLE: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_NONE); + break; + case PIN_CONFIG_BIAS_PULL_DOWN: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_DOWN); + break; + case PIN_CONFIG_BIAS_PULL_UP: + ret = brcmstb_pull_config_set(pc, pin, BRCMSTB_PULL_UP); + break; + default: + return -ENOTSUPP; + } + } + + return ret; +} + +static const struct pinconf_ops brcmstb_pinconf_ops = { + .is_generic = true, + .pin_config_get = brcmstb_pinconf_get, + .pin_config_set = brcmstb_pinconf_set, +}; + +int brcmstb_pinctrl_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + const struct brcmstb_pdata *pdata; + struct brcmstb_pinctrl *pc; + const char **names; + int num_pins, i; + + pdata = of_device_get_match_data(dev); + + pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL); + if (!pc) + return -ENOMEM; + + platform_set_drvdata(pdev, pc); + pc->dev = dev; + spin_lock_init(&pc->fsel_lock); + + pc->base = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(pc->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->base), + "Could not get IO memory\n"); + + pc->pctl_desc = *pdata->pctl_desc; + pc->pctl_desc.pctlops = &brcmstb_pctl_ops; + pc->pctl_desc.pmxops = &brcmstb_pmx_ops; + pc->pctl_desc.confops = &brcmstb_pinconf_ops; + pc->pctl_desc.owner = THIS_MODULE; + num_pins = pc->pctl_desc.npins; + names = devm_kmalloc_array(dev, num_pins, sizeof(const char *), + GFP_KERNEL); + if (!names) + return -ENOMEM; + + for (i = 0; i < num_pins; i++) + names[i] = pc->pctl_desc.pins[i].name; + + pc->gpio_groups = names; + pc->pin_regs = pdata->pin_regs; + pc->pin_funcs = pdata->pin_funcs; + pc->func_count = pdata->func_count; + pc->func_names = pdata->func_names; + + pc->pctl_dev = devm_pinctrl_register(dev, &pc->pctl_desc, pc); + if (IS_ERR(pc->pctl_dev)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->pctl_dev), + "Failed to register pinctrl device\n"); + + pc->gpio_range = *pdata->gpio_range; + pinctrl_add_gpio_range(pc->pctl_dev, &pc->gpio_range); + + return 0; +} +EXPORT_SYMBOL(brcmstb_pinctrl_probe); + +MODULE_AUTHOR("Phil Elwell"); +MODULE_AUTHOR("Jonathan Bell"); +MODULE_AUTHOR("Ivan T. Ivanov"); +MODULE_AUTHOR("Andrea della Porta"); +MODULE_DESCRIPTION("Broadcom brcmstb pinctrl driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/bcm/pinctrl-brcmstb.h b/drivers/pinctrl/bcm/pinctrl-brcmstb.h new file mode 100644 index 00000000000000..c3459103e05639 --- /dev/null +++ b/drivers/pinctrl/bcm/pinctrl-brcmstb.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Header for Broadcom brcmstb GPIO based drivers + * + * Copyright (C) 2024-2025 Ivan T. Ivanov, Andrea della Porta + * Copyright (C) 2021-3 Raspberry Pi Ltd. 
+ * Copyright (C) 2012 Chris Boot, Simon Arlott, Stephen Warren + * + * Based heavily on the BCM2835 GPIO & pinctrl driver, which was inspired by: + * pinctrl-nomadik.c, please see original file for copyright information + * pinctrl-tegra.c, please see original file for copyright information + */ + +#ifndef __PINCTRL_BRCMSTB_H__ +#define __PINCTRL_BRCMSTB_H__ + +#include <linux/pinctrl/pinctrl.h> +#include <linux/platform_device.h> + +#define BRCMSTB_FUNC(f) \ + [func_##f] = #f + +#define MUX_BIT_VALID 0x8000 +#define PAD_BIT_INVALID 0xffff + +#define MUX_BIT(muxreg, muxshift) \ + (MUX_BIT_VALID + ((muxreg) << 5) + ((muxshift) << 2)) +#define PAD_BIT(padreg, padshift) \ + (((padreg) << 5) + ((padshift) << 1)) + +#define GPIO_REGS(n, muxreg, muxshift, padreg, padshift) \ + [n] = { MUX_BIT(muxreg, muxshift), PAD_BIT(padreg, padshift) } + +#define EMMC_REGS(n, padreg, padshift) \ + [n] = { 0, PAD_BIT(padreg, padshift) } + +#define AON_GPIO_REGS(n, muxreg, muxshift, padreg, padshift) \ + GPIO_REGS(n, muxreg, muxshift, padreg, padshift) + +#define AON_SGPIO_REGS(n, muxreg, muxshift) \ + [(n) + 32] = { MUX_BIT(muxreg, muxshift), PAD_BIT_INVALID } + +#define GPIO_PIN(n) PINCTRL_PIN(n, "gpio" #n) +/* + * AON pins are in the Always-On power domain. SGPIOs are also 'Safe', + * being 5V tolerant (necessary for the HDMI I2C pins), and can be driven + * while the power is off. + */ +#define AON_GPIO_PIN(n) PINCTRL_PIN(n, "aon_gpio" #n) +#define AON_SGPIO_PIN(n) PINCTRL_PIN((n) + 32, "aon_sgpio" #n) + +struct pin_regs { + u16 mux_bit; + u16 pad_bit; +}; + +/** + * struct brcmstb_pin_funcs - pins provide their primary/alternate + * functions in this struct + * @func_mask: mask representing valid bits of the function selector + * in the registers + * @funcs: array of function identifiers + * @n_funcs: number of identifiers of the @funcs array above + */ +struct brcmstb_pin_funcs { + const u32 func_mask; + const u8 *funcs; + const unsigned int n_funcs; +}; + +/** + * struct brcmstb_pdata - specific data for a pinctrl chip implementation + * @pctl_desc: pin controller descriptor for this implementation + * @gpio_range: range of GPIOs served by this controller + * @pin_regs: array of register descriptors for each pin + * @pin_funcs: array of all possible assignable functions for each pin + * @func_count: total number of functions + * @func_gpio: which function number is GPIO (usually 0) + * @func_names: an array listing all function names + */ +struct brcmstb_pdata { + const struct pinctrl_desc *pctl_desc; + const struct pinctrl_gpio_range *gpio_range; + const struct pin_regs *pin_regs; + const struct brcmstb_pin_funcs *pin_funcs; + const unsigned int func_count; + const unsigned int func_gpio; + const char * const *func_names; +}; + +int brcmstb_pinctrl_probe(struct platform_device *pdev); + +#endif diff --git a/drivers/pinctrl/cirrus/pinctrl-madera-core.c b/drivers/pinctrl/cirrus/pinctrl-madera-core.c index d19ef13224cca7..1d9481b1709178 100644 --- a/drivers/pinctrl/cirrus/pinctrl-madera-core.c +++ b/drivers/pinctrl/cirrus/pinctrl-madera-core.c @@ -804,7 +804,7 @@ static int madera_pin_conf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (conf[0] & MADERA_GP1_IP_CFG_MASK) result = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if ((conf[1] & MADERA_GP1_DIR_MASK) && (conf[0] & MADERA_GP1_LVL_MASK)) result = 1; @@ -902,7 +902,7 @@ static int madera_pin_conf_set(struct pinctrl_dev *pctldev, unsigned int pin, mask[1] |= MADERA_GP1_DIR_MASK; conf[1] |= MADERA_GP1_DIR; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val =
pinconf_to_config_argument(*configs); mask[0] |= MADERA_GP1_LVL_MASK; if (val) diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index 18de3132854045..731c58ad43eea9 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -245,7 +245,7 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, { struct imx_pinctrl *ipctl = pinctrl_dev_get_drvdata(pctldev); const struct imx_pinctrl_soc_info *info = ipctl->info; - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; struct imx_pin *pin; unsigned int npins; @@ -266,7 +266,7 @@ static int imx_pmx_set(struct pinctrl_dev *pctldev, unsigned selector, npins = grp->grp.npins; dev_dbg(ipctl->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); for (i = 0; i < npins; i++) { /* @@ -580,33 +580,38 @@ static int imx_pinctrl_parse_functions(struct device_node *np, u32 index) { struct pinctrl_dev *pctl = ipctl->pctl; - struct function_desc *func; + struct pinfunction *func; struct group_desc *grp; const char **group_names; + int ret; u32 i; dev_dbg(pctl->dev, "parse function(%d): %pOFn\n", index, np); - func = pinmux_generic_get_function(pctl, index); + func = devm_kzalloc(ipctl->dev, sizeof(*func), GFP_KERNEL); if (!func) - return -EINVAL; + return -ENOMEM; /* Initialise function */ - func->func.name = np->name; - func->func.ngroups = of_get_child_count(np); - if (func->func.ngroups == 0) { + func->name = np->name; + func->ngroups = of_get_child_count(np); + if (func->ngroups == 0) { dev_info(ipctl->dev, "no groups defined in %pOF\n", np); return -EINVAL; } - group_names = devm_kcalloc(ipctl->dev, func->func.ngroups, - sizeof(*func->func.groups), GFP_KERNEL); + group_names = devm_kcalloc(ipctl->dev, func->ngroups, + sizeof(*func->groups), GFP_KERNEL); if (!group_names) return -ENOMEM; i = 0; for_each_child_of_node_scoped(np, child) group_names[i++] = child->name; - func->func.groups = group_names; + func->groups = group_names; + + ret = pinmux_generic_add_pinfunction(pctl, func, NULL); + if (ret < 0) + return ret; i = 0; for_each_child_of_node_scoped(np, child) { @@ -615,6 +620,10 @@ static int imx_pinctrl_parse_functions(struct device_node *np, return -ENOMEM; mutex_lock(&ipctl->mutex); + /* + * FIXME: This should use pinctrl_generic_add_group() and not + * access the private radix tree directly. 
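+	 * A possible shape for that conversion (a sketch only, assuming + * the generic helper keeps the same group data): + * + * ret = pinctrl_generic_add_group(pctl, grp->grp.name, + * grp->grp.pins, grp->grp.npins, + * grp->data);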
+ */ radix_tree_insert(&pctl->pin_group_tree, ipctl->group_index++, grp); mutex_unlock(&ipctl->mutex); @@ -669,20 +678,6 @@ static int imx_pinctrl_probe_dt(struct platform_device *pdev, } } - for (i = 0; i < nfuncs; i++) { - struct function_desc *function; - - function = devm_kzalloc(&pdev->dev, sizeof(*function), - GFP_KERNEL); - if (!function) - return -ENOMEM; - - mutex_lock(&ipctl->mutex); - radix_tree_insert(&pctl->pin_function_tree, i, function); - mutex_unlock(&ipctl->mutex); - } - pctl->num_functions = nfuncs; - ipctl->group_index = 0; if (flat_funcs) { pctl->num_groups = of_get_child_count(np); diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 1b2f132d76f0af..f1cf2578fe423e 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -35,13 +35,8 @@ #define PINCTRL_FUNC_DESC(id) \ { \ - .desc = { \ - .func = { \ - .name = #id, \ - .groups = id##_groups, \ - .ngroups = ARRAY_SIZE(id##_groups), \ - } \ - }, \ + .desc = PINCTRL_PINFUNCTION(#id, id##_groups, \ + ARRAY_SIZE(id##_groups)), \ .groups = id##_func_group, \ .group_size = ARRAY_SIZE(id##_func_group), \ } @@ -108,6 +103,9 @@ #define JTAG_UDI_EN_MASK BIT(4) #define JTAG_DFD_EN_MASK BIT(3) +#define REG_FORCE_GPIO_EN 0x0228 +#define FORCE_GPIO_EN(n) BIT(n) + /* LED MAP */ #define REG_LAN_LED0_MAPPING 0x027c #define REG_LAN_LED1_MAPPING 0x0280 @@ -334,7 +332,7 @@ struct airoha_pinctrl_func_group { }; struct airoha_pinctrl_func { - const struct function_desc desc; + const struct pinfunction desc; const struct airoha_pinctrl_func_group *groups; u8 group_size; }; @@ -718,17 +716,17 @@ static const struct airoha_pinctrl_func_group mdio_func_group[] = { { .name = "mdio", .regmap[0] = { - AIROHA_FUNC_MUX, - REG_GPIO_PON_MODE, - GPIO_SGMII_MDIO_MODE_MASK, - GPIO_SGMII_MDIO_MODE_MASK - }, - .regmap[1] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, GPIO_MDC_IO_MASTER_MODE_MODE, GPIO_MDC_IO_MASTER_MODE_MODE }, + .regmap[1] = { + AIROHA_FUNC_MUX, + REG_FORCE_GPIO_EN, + FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2), + FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2) + }, .regmap_size = 2, }, }; @@ -1752,8 +1750,8 @@ static const struct airoha_pinctrl_func_group phy1_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1816,8 +1814,8 @@ static const struct airoha_pinctrl_func_group phy2_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1880,8 +1878,8 @@ static const struct airoha_pinctrl_func_group phy3_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -1944,8 +1942,8 @@ static const struct airoha_pinctrl_func_group phy4_led1_func_group[] = { .regmap[0] = { AIROHA_FUNC_MUX, REG_GPIO_2ND_I2C_MODE, - GPIO_LAN3_LED0_MODE_MASK, - GPIO_LAN3_LED0_MODE_MASK + GPIO_LAN3_LED1_MODE_MASK, + GPIO_LAN3_LED1_MODE_MASK }, .regmap[1] = { AIROHA_FUNC_MUX, @@ -2448,7 +2446,7 @@ static int airoha_pinmux_set_mux(struct pinctrl_dev *pctrl_dev, { struct airoha_pinctrl *pinctrl = pinctrl_dev_get_drvdata(pctrl_dev); const struct airoha_pinctrl_func *func; - struct function_desc 
*desc; + const struct function_desc *desc; struct group_desc *grp; int i; @@ -2461,7 +2459,7 @@ static int airoha_pinmux_set_mux(struct pinctrl_dev *pctrl_dev, return -EINVAL; dev_dbg(pctrl_dev->dev, "enable function %s group %s\n", - desc->func.name, grp->grp.name); + desc->func->name, grp->grp.name); func = desc->data; for (i = 0; i < func->group_size; i++) { @@ -2770,7 +2768,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, break; case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: { + case PIN_CONFIG_LEVEL: { bool input = param == PIN_CONFIG_INPUT_ENABLE; int err; @@ -2779,7 +2777,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, if (err) return err; - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { err = airoha_pinconf_set_pin_value(pctrl_dev, pin, !!arg); if (err) @@ -2908,11 +2906,11 @@ static int airoha_pinctrl_probe(struct platform_device *pdev) func = &airoha_pinctrl_funcs[i]; err = pinmux_generic_add_pinfunction(pinctrl->ctrl, - &func->desc.func, + &func->desc, (void *)func); if (err < 0) { dev_err(dev, "Failed to register function %s\n", - func->desc.func.name); + func->desc.name); return err; } } diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 6e4f6c07a50932..70f608347a5f68 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -43,7 +43,7 @@ static int mtk_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct mtk_pinctrl *hw = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; int i, err; @@ -56,7 +56,7 @@ static int mtk_pinmux_set_mux(struct pinctrl_dev *pctldev, return -EINVAL; dev_dbg(pctldev->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); for (i = 0; i < grp->grp.npins; i++) { const struct mtk_pin_desc *desc; @@ -332,7 +332,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DIR, MTK_OUTPUT); if (err) @@ -622,11 +622,9 @@ static int mtk_build_functions(struct mtk_pinctrl *hw) int i, err; for (i = 0; i < hw->soc->nfuncs ; i++) { - const struct function_desc *function = hw->soc->funcs + i; - const struct pinfunction *func = &function->func; + const struct pinfunction *func = hw->soc->funcs + i; - err = pinmux_generic_add_pinfunction(hw->pctrl, func, - function->data); + err = pinmux_generic_add_pinfunction(hw->pctrl, func, NULL); if (err < 0) { dev_err(hw->dev, "Failed to register function %s\n", func->name); diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.h b/drivers/pinctrl/mediatek/pinctrl-moore.h index 229d19561e229c..fe1f087cacd044 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.h +++ b/drivers/pinctrl/mediatek/pinctrl-moore.h @@ -43,11 +43,8 @@ .data = id##_funcs, \ } -#define PINCTRL_PIN_FUNCTION(_name_, id) \ - { \ - .func = PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)), \ - .data = NULL, \ - } +#define PINCTRL_PIN_FUNCTION(_name_, id) \ + PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)) int mtk_moore_pinctrl_probe(struct platform_device *pdev, const struct mtk_pin_soc *soc); diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7622.c b/drivers/pinctrl/mediatek/pinctrl-mt7622.c index 2dc1019910662a..d5777889448aab 100644 
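Taken together, the airoha and moore changes above reduce a pin function to a bare struct pinfunction registered through pinmux_generic_add_pinfunction(), with no per-entry data pointer. A minimal sketch of the resulting pattern (the demo_* names are hypothetical, not from this series):

/* Relies on <linux/pinctrl/pinctrl.h> plus the pinctrl core's "pinmux.h". */
static const char * const demo_uart_groups[] = { "uart0", "uart1" };

static const struct pinfunction demo_functions[] = {
	PINCTRL_PINFUNCTION("uart", demo_uart_groups,
			    ARRAY_SIZE(demo_uart_groups)),
};

static int demo_build_functions(struct pinctrl_dev *pctl)
{
	unsigned int i;
	int err;

	for (i = 0; i < ARRAY_SIZE(demo_functions); i++) {
		/* The core stores the entry in its own radix tree. */
		err = pinmux_generic_add_pinfunction(pctl,
						     &demo_functions[i], NULL);
		if (err < 0)
			return err;
	}

	return 0;
}

This is also why the imx probe path above can drop its manual radix_tree_insert() loop into pctl->pin_function_tree.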
--- a/drivers/pinctrl/mediatek/pinctrl-mt7622.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7622.c @@ -822,7 +822,7 @@ static const char *mt7622_uart_groups[] = { "uart0_0_tx_rx", "uart4_2_rts_cts",}; static const char *mt7622_wdt_groups[] = { "watchdog", }; -static const struct function_desc mt7622_functions[] = { +static const struct pinfunction mt7622_functions[] = { PINCTRL_PIN_FUNCTION("antsel", mt7622_antsel), PINCTRL_PIN_FUNCTION("emmc", mt7622_emmc), PINCTRL_PIN_FUNCTION("eth", mt7622_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7623.c b/drivers/pinctrl/mediatek/pinctrl-mt7623.c index 3e59eada282527..69c06c2c0e21e4 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7623.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7623.c @@ -1340,7 +1340,7 @@ static const char *mt7623_uart_groups[] = { "uart0_0_txd_rxd", "uart3_rts_cts", }; static const char *mt7623_wdt_groups[] = { "watchdog_0", "watchdog_1", }; -static const struct function_desc mt7623_functions[] = { +static const struct pinfunction mt7623_functions[] = { PINCTRL_PIN_FUNCTION("audck", mt7623_aud_clk), PINCTRL_PIN_FUNCTION("disp", mt7623_disp_pwm), PINCTRL_PIN_FUNCTION("eth", mt7623_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7629.c b/drivers/pinctrl/mediatek/pinctrl-mt7629.c index 98142e8c98011d..cc0694881ac9dc 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7629.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7629.c @@ -384,7 +384,7 @@ static const char *mt7629_wdt_groups[] = { "watchdog", }; static const char *mt7629_wifi_groups[] = { "wf0_5g", "wf0_2g", }; static const char *mt7629_flash_groups[] = { "snfi", "spi_nor" }; -static const struct function_desc mt7629_functions[] = { +static const struct pinfunction mt7629_functions[] = { PINCTRL_PIN_FUNCTION("eth", mt7629_ethernet), PINCTRL_PIN_FUNCTION("i2c", mt7629_i2c), PINCTRL_PIN_FUNCTION("led", mt7629_led), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7981.c b/drivers/pinctrl/mediatek/pinctrl-mt7981.c index 83092be5b614cc..6216c2e057f649 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7981.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7981.c @@ -977,7 +977,7 @@ static const char *mt7981_ethernet_groups[] = { "smi_mdc_mdio", "gbe_ext_mdc_mdi "wf0_mode1", "wf0_mode3", "mt7531_int", }; static const char *mt7981_ant_groups[] = { "ant_sel", }; -static const struct function_desc mt7981_functions[] = { +static const struct pinfunction mt7981_functions[] = { PINCTRL_PIN_FUNCTION("wa_aice", mt7981_wa_aice), PINCTRL_PIN_FUNCTION("dfd", mt7981_dfd), PINCTRL_PIN_FUNCTION("jtag", mt7981_jtag), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7986.c b/drivers/pinctrl/mediatek/pinctrl-mt7986.c index 5816b5fdb7ca91..2a762ade9c3550 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7986.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7986.c @@ -878,7 +878,7 @@ static const char *mt7986_uart_groups[] = { static const char *mt7986_wdt_groups[] = { "watchdog", }; static const char *mt7986_wf_groups[] = { "wf_2g", "wf_5g", "wf_dbdc", }; -static const struct function_desc mt7986_functions[] = { +static const struct pinfunction mt7986_functions[] = { PINCTRL_PIN_FUNCTION("audio", mt7986_audio), PINCTRL_PIN_FUNCTION("emmc", mt7986_emmc), PINCTRL_PIN_FUNCTION("eth", mt7986_ethernet), diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7988.c b/drivers/pinctrl/mediatek/pinctrl-mt7988.c index 68b4097792b883..9569e8c0cec15f 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7988.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7988.c @@ -1464,33 +1464,23 @@ static const char * const mt7988_usb_groups[] = { 
"drv_vbus_p1", }; -static const struct function_desc mt7988_functions[] = { - { { "audio", mt7988_audio_groups, ARRAY_SIZE(mt7988_audio_groups) }, - NULL }, - { { "jtag", mt7988_jtag_groups, ARRAY_SIZE(mt7988_jtag_groups) }, - NULL }, - { { "int_usxgmii", mt7988_int_usxgmii_groups, - ARRAY_SIZE(mt7988_int_usxgmii_groups) }, - NULL }, - { { "pwm", mt7988_pwm_groups, ARRAY_SIZE(mt7988_pwm_groups) }, NULL }, - { { "dfd", mt7988_dfd_groups, ARRAY_SIZE(mt7988_dfd_groups) }, NULL }, - { { "i2c", mt7988_i2c_groups, ARRAY_SIZE(mt7988_i2c_groups) }, NULL }, - { { "eth", mt7988_ethernet_groups, ARRAY_SIZE(mt7988_ethernet_groups) }, - NULL }, - { { "pcie", mt7988_pcie_groups, ARRAY_SIZE(mt7988_pcie_groups) }, - NULL }, - { { "pmic", mt7988_pmic_groups, ARRAY_SIZE(mt7988_pmic_groups) }, - NULL }, - { { "watchdog", mt7988_wdt_groups, ARRAY_SIZE(mt7988_wdt_groups) }, - NULL }, - { { "spi", mt7988_spi_groups, ARRAY_SIZE(mt7988_spi_groups) }, NULL }, - { { "flash", mt7988_flash_groups, ARRAY_SIZE(mt7988_flash_groups) }, - NULL }, - { { "uart", mt7988_uart_groups, ARRAY_SIZE(mt7988_uart_groups) }, - NULL }, - { { "udi", mt7988_udi_groups, ARRAY_SIZE(mt7988_udi_groups) }, NULL }, - { { "usb", mt7988_usb_groups, ARRAY_SIZE(mt7988_usb_groups) }, NULL }, - { { "led", mt7988_led_groups, ARRAY_SIZE(mt7988_led_groups) }, NULL }, +static const struct pinfunction mt7988_functions[] = { + PINCTRL_PIN_FUNCTION("audio", mt7988_audio), + PINCTRL_PIN_FUNCTION("jtag", mt7988_jtag), + PINCTRL_PIN_FUNCTION("int_usxgmii", mt7988_int_usxgmii), + PINCTRL_PIN_FUNCTION("pwm", mt7988_pwm), + PINCTRL_PIN_FUNCTION("dfd", mt7988_dfd), + PINCTRL_PIN_FUNCTION("i2c", mt7988_i2c), + PINCTRL_PIN_FUNCTION("eth", mt7988_ethernet), + PINCTRL_PIN_FUNCTION("pcie", mt7988_pcie), + PINCTRL_PIN_FUNCTION("pmic", mt7988_pmic), + PINCTRL_PIN_FUNCTION("watchdog", mt7988_wdt), + PINCTRL_PIN_FUNCTION("spi", mt7988_spi), + PINCTRL_PIN_FUNCTION("flash", mt7988_flash), + PINCTRL_PIN_FUNCTION("uart", mt7988_uart), + PINCTRL_PIN_FUNCTION("udi", mt7988_udi), + PINCTRL_PIN_FUNCTION("usb", mt7988_usb), + PINCTRL_PIN_FUNCTION("led", mt7988_led), }; static const struct mtk_eint_hw mt7988_eint_hw = { diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h index 36d2898037dd04..fa7c0ed4934648 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h @@ -238,7 +238,7 @@ struct mtk_pin_soc { unsigned int npins; const struct group_desc *grps; unsigned int ngrps; - const struct function_desc *funcs; + const struct pinfunction *funcs; unsigned int nfuncs; const struct mtk_eint_regs *eint_regs; const struct mtk_eint_hw *eint_hw; diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c index d10306024111c8..d6a46fe0cda891 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c @@ -384,7 +384,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev, mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true); ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: mtk_gpio_set(pctl->chip, pin, arg); ret = mtk_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c index 3e714554789d0e..6bf37d8085fae5 100644 --- a/drivers/pinctrl/mediatek/pinctrl-paris.c +++ b/drivers/pinctrl/mediatek/pinctrl-paris.c @@ 
-169,7 +169,7 @@ static int mtk_pinconf_get(struct pinctrl_dev *pctldev, if (!ret) err = -EINVAL; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_DIR, &ret); if (err) break; @@ -292,7 +292,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, /* regard all non-zero value as enable */ err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_SR, !!arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DO, arg); if (err) diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c index 6132710aff6881..d9e3a8d5932a82 100644 --- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c +++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c @@ -422,7 +422,7 @@ static int aml_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_get_output(info, pin); if (ret <= 0) return -EINVAL; @@ -568,7 +568,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -592,7 +592,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = aml_pinconf_set_output(info, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_set_output_drive(info, pin, arg); break; default: diff --git a/drivers/pinctrl/meson/pinctrl-meson-g12a.c b/drivers/pinctrl/meson/pinctrl-meson-g12a.c index 8b9130c6e170b9..117e72b4ffcb7b 100644 --- a/drivers/pinctrl/meson/pinctrl-meson-g12a.c +++ b/drivers/pinctrl/meson/pinctrl-meson-g12a.c @@ -442,6 +442,8 @@ static const unsigned int tdm_c_dout1_z_pins[] = { GPIOZ_3 }; static const unsigned int tdm_c_dout2_z_pins[] = { GPIOZ_4 }; static const unsigned int tdm_c_dout3_z_pins[] = { GPIOZ_5 }; +static const unsigned int pcie_clkreqn_pins[] = { GPIOC_7 }; + static const struct meson_pmx_group meson_g12a_periphs_groups[] = { GPIO_GROUP(GPIOZ_0), GPIO_GROUP(GPIOZ_1), @@ -721,6 +723,7 @@ static const struct meson_pmx_group meson_g12a_periphs_groups[] = { GROUP(pdm_din2_c, 4), GROUP(pdm_din3_c, 4), GROUP(pdm_dclk_c, 4), + GROUP(pcie_clkreqn, 1), /* bank GPIOH */ GROUP(spi1_mosi, 3), @@ -1183,6 +1186,10 @@ static const char * const tdm_c_groups[] = { "tdm_c_dout2_z", "tdm_c_dout3_z", }; +static const char * const pcie_clkreqn_groups[] = { + "pcie_clkreqn" +}; + static const char * const gpio_aobus_groups[] = { "GPIOAO_0", "GPIOAO_1", "GPIOAO_2", "GPIOAO_3", "GPIOAO_4", "GPIOAO_5", "GPIOAO_6", "GPIOAO_7", "GPIOAO_8", "GPIOAO_9", @@ -1309,6 +1316,7 @@ static const struct meson_pmx_func meson_g12a_periphs_functions[] = { FUNCTION(tdm_a), FUNCTION(tdm_b), FUNCTION(tdm_c), + FUNCTION(pcie_clkreqn), }; static const struct meson_pmx_func meson_g12a_aobus_functions[] = { diff --git a/drivers/pinctrl/meson/pinctrl-meson-gxl.c b/drivers/pinctrl/meson/pinctrl-meson-gxl.c index 9171de657f9780..a75762e4d26418 100644 --- a/drivers/pinctrl/meson/pinctrl-meson-gxl.c +++ b/drivers/pinctrl/meson/pinctrl-meson-gxl.c @@ -187,6 +187,9 @@ static const unsigned int i2c_sda_c_pins[] = { GPIODV_28 }; static const unsigned int i2c_sck_c_dv19_pins[] = { GPIODV_19 }; static const unsigned int i2c_sda_c_dv18_pins[] = { GPIODV_18 }; +static const unsigned int i2c_sck_d_pins[] = { 
GPIOX_11 }; +static const unsigned int i2c_sda_d_pins[] = { GPIOX_10 }; + static const unsigned int eth_mdio_pins[] = { GPIOZ_0 }; static const unsigned int eth_mdc_pins[] = { GPIOZ_1 }; static const unsigned int eth_clk_rx_clk_pins[] = { GPIOZ_2 }; @@ -411,6 +414,8 @@ static const struct meson_pmx_group meson_gxl_periphs_groups[] = { GPIO_GROUP(GPIO_TEST_N), /* Bank X */ + GROUP(i2c_sda_d, 5, 5), + GROUP(i2c_sck_d, 5, 4), GROUP(sdio_d0, 5, 31), GROUP(sdio_d1, 5, 30), GROUP(sdio_d2, 5, 29), @@ -651,6 +656,10 @@ static const char * const i2c_c_groups[] = { "i2c_sck_c", "i2c_sda_c", "i2c_sda_c_dv18", "i2c_sck_c_dv19", }; +static const char * const i2c_d_groups[] = { + "i2c_sck_d", "i2c_sda_d", +}; + static const char * const eth_groups[] = { "eth_mdio", "eth_mdc", "eth_clk_rx_clk", "eth_rx_dv", "eth_rxd0", "eth_rxd1", "eth_rxd2", "eth_rxd3", @@ -777,6 +786,7 @@ static const struct meson_pmx_func meson_gxl_periphs_functions[] = { FUNCTION(i2c_a), FUNCTION(i2c_b), FUNCTION(i2c_c), + FUNCTION(i2c_d), FUNCTION(eth), FUNCTION(pwm_a), FUNCTION(pwm_b), diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 277e9c40490d71..18295b15ecd9dd 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -360,7 +360,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -384,7 +384,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = meson_pinconf_set_output(pc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_set_output_drive(pc, pin, arg); break; default: @@ -502,7 +502,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_get_output(pc, pin); if (ret <= 0) return -EINVAL; diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c index 881df5e08f6176..81dfbd5e7f0711 100644 --- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c +++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c @@ -420,7 +420,8 @@ static int armada_37xx_gpio_direction_output(struct gpio_chip *chip, struct armada_37xx_pinctrl *info = gpiochip_get_data(chip); unsigned int en_offset = offset; unsigned int reg = OUTPUT_VAL; - unsigned int mask, val, ret; + unsigned int mask, val; + int ret; armada_37xx_update_reg(®, &offset); mask = BIT(offset); @@ -634,8 +635,9 @@ static int armada_37xx_edge_both_irq_swap_pol(struct armada_37xx_pinctrl *info, { u32 reg_idx = pin_idx / GPIO_PER_REG; u32 bit_num = pin_idx % GPIO_PER_REG; - u32 p, l, ret; unsigned long flags; + u32 p, l; + int ret; regmap_read(info->regmap, INPUT_VAL + 4*reg_idx, &l); diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 7b5f94d8cb23cd..fc7ebeda8440eb 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -860,8 +860,8 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, dev_dbg(chip->parent, "pin %d [%#lx]: %s %s\n", pin, configs[i], - (param == PIN_CONFIG_OUTPUT) ? "output " : "input", - (param == PIN_CONFIG_OUTPUT) ? + (param == PIN_CONFIG_LEVEL) ? "output " : "input", + (param == PIN_CONFIG_LEVEL) ? 
str_high_low(argument) : (argument ? "pull up" : "pull down")); @@ -907,7 +907,7 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, ret = abx500_gpio_direction_input(chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = abx500_gpio_direction_output(chip, offset, argument); break; diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 8940e04fcf4cc4..db0311b1413227 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -584,7 +584,7 @@ static void nmk_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, seq_printf(s, "invalid pin offset"); return; } - nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base, offset); + nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base); } static int nmk_dt_add_map_mux(struct pinctrl_map **map, unsigned int *reserved_maps, diff --git a/drivers/pinctrl/nuvoton/pinctrl-ma35.c b/drivers/pinctrl/nuvoton/pinctrl-ma35.c index 54652bfbe6ac48..cdad01d68a37e3 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-ma35.c +++ b/drivers/pinctrl/nuvoton/pinctrl-ma35.c @@ -1038,7 +1038,8 @@ static int ma35_pinctrl_parse_functions(struct fwnode_handle *fwnode, struct ma3 struct group_desc *grp; static u32 grp_index; const char **groups; - u32 ret, i = 0; + u32 i = 0; + int ret; dev_dbg(npctl->dev, "parse function(%d): %s\n", index, np->name); diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index b8872d8f5930ad..13ed87d5d30cf3 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -77,7 +78,7 @@ /* Structure for register banks */ struct npcm7xx_gpio { void __iomem *base; - struct gpio_chip gc; + struct gpio_generic_chip chip; int irqbase; int irq; u32 pinctrl_id; @@ -99,32 +100,26 @@ struct npcm7xx_pinctrl { }; /* GPIO handling in the pinctrl driver */ -static void npcm_gpio_set(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_set(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; unsigned long val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(chip); val = ioread32(reg) | pinmask; iowrite32(val, reg); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } -static void npcm_gpio_clr(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_clr(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; unsigned long val; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + guard(gpio_generic_lock_irqsave)(chip); val = ioread32(reg) & ~pinmask; iowrite32(val, reg); - - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) @@ -132,9 +127,9 @@ static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) struct npcm7xx_gpio *bank = gpiochip_get_data(chip); seq_printf(s, "-- module %d [gpio%d - %d]\n", - bank->gc.base / bank->gc.ngpio, - bank->gc.base, - bank->gc.base + bank->gc.ngpio); + bank->chip.gc.base / bank->chip.gc.ngpio, + bank->chip.gc.base, + bank->chip.gc.base + bank->chip.gc.ngpio); seq_printf(s, "DIN :%.8x DOUT:%.8x IE :%.8x OE :%.8x\n", ioread32(bank->base + NPCM7XX_GP_N_DIN), ioread32(bank->base + NPCM7XX_GP_N_DOUT), @@ -220,7 +215,7 @@ static void npcmgpio_irq_handler(struct irq_desc *desc) chained_irq_enter(chip, 
desc); sts = ioread32(bank->base + NPCM7XX_GP_N_EVST); en = ioread32(bank->base + NPCM7XX_GP_N_EVEN); - dev_dbg(bank->gc.parent, "==> got irq sts %.8lx %.8lx\n", sts, + dev_dbg(bank->chip.gc.parent, "==> got irq sts %.8lx %.8lx\n", sts, en); sts &= en; @@ -235,42 +230,42 @@ static int npcmgpio_set_irq_type(struct irq_data *d, unsigned int type) struct npcm7xx_gpio *bank = gpiochip_get_data(gc); unsigned int gpio = BIT(irqd_to_hwirq(d)); - dev_dbg(bank->gc.parent, "setirqtype: %u.%u = %u\n", gpio, + dev_dbg(bank->chip.gc.parent, "setirqtype: %u.%u = %u\n", gpio, d->irq, type); switch (type) { case IRQ_TYPE_EDGE_RISING: - dev_dbg(bank->gc.parent, "edge.rising\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "edge.rising\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_FALLING: - dev_dbg(bank->gc.parent, "edge.falling\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "edge.falling\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_BOTH: - dev_dbg(bank->gc.parent, "edge.both\n"); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_EVBE, gpio); + dev_dbg(bank->chip.gc.parent, "edge.both\n"); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_EVBE, gpio); break; case IRQ_TYPE_LEVEL_LOW: - dev_dbg(bank->gc.parent, "level.low\n"); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "level.low\n"); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; case IRQ_TYPE_LEVEL_HIGH: - dev_dbg(bank->gc.parent, "level.high\n"); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_POL, gpio); + dev_dbg(bank->chip.gc.parent, "level.high\n"); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_POL, gpio); break; default: - dev_dbg(bank->gc.parent, "invalid irq type\n"); + dev_dbg(bank->chip.gc.parent, "invalid irq type\n"); return -EINVAL; } if (type & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW)) { - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_EVTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_level_irq); } else if (type & (IRQ_TYPE_EDGE_BOTH | IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) { - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_EVTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_edge_irq); } @@ -283,7 +278,7 @@ static void npcmgpio_irq_ack(struct irq_data *d) struct npcm7xx_gpio *bank = gpiochip_get_data(gc); unsigned int gpio = irqd_to_hwirq(d); - dev_dbg(bank->gc.parent, "irq_ack: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_ack: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + NPCM7XX_GP_N_EVST); } @@ -295,7 +290,7 @@ static void npcmgpio_irq_mask(struct irq_data *d) unsigned int gpio = irqd_to_hwirq(d); /* Clear events */ - dev_dbg(bank->gc.parent, "irq_mask: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_mask: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + NPCM7XX_GP_N_EVENC); gpiochip_disable_irq(gc, gpio); } @@ -309,7 +304,7 @@ static void npcmgpio_irq_unmask(struct 
irq_data *d) /* Enable events */ gpiochip_enable_irq(gc, gpio); - dev_dbg(bank->gc.parent, "irq_unmask: %u.%u\n", gpio, d->irq); + dev_dbg(bank->chip.gc.parent, "irq_unmask: %u.%u\n", gpio, d->irq); iowrite32(BIT(gpio), bank->base + NPCM7XX_GP_N_EVENS); } @@ -1423,7 +1418,7 @@ static int npcm7xx_get_slew_rate(struct npcm7xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin) { u32 val; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = BIT(gpio); if (pincfg[pin].flag & SLEW) @@ -1443,16 +1438,16 @@ static int npcm7xx_set_slew_rate(struct npcm7xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin, int arg) { - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); if (pincfg[pin].flag & SLEW) { switch (arg) { case 0: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_OSRC, + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_OSRC, gpio); return 0; case 1: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_OSRC, + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_OSRC, gpio); return 0; default: @@ -1485,7 +1480,7 @@ static int npcm7xx_get_drive_strength(struct pinctrl_dev *pctldev, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = BIT(gpio); u32 ds = 0; int flg, val; @@ -1496,7 +1491,7 @@ static int npcm7xx_get_drive_strength(struct pinctrl_dev *pctldev, val = ioread32(bank->base + NPCM7XX_GP_N_ODSC) & pinmask; ds = val ? DSHI(flg) : DSLO(flg); - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "pin %d strength %d = %d\n", pin, val, ds); return ds; } @@ -1511,20 +1506,20 @@ static int npcm7xx_set_drive_strength(struct npcm7xx_pinctrl *npcm, int v; struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); v = (pincfg[pin].flag & DRIVE_STRENGTH_MASK); if (!nval || !v) return -ENOTSUPP; if (DSLO(v) == nval) { - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "setting pin %d to low strength [%d]\n", pin, nval); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_ODSC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_ODSC, gpio); return 0; } else if (DSHI(v) == nval) { - dev_dbg(bank->gc.parent, + dev_dbg(bank->chip.gc.parent, "setting pin %d to high strength [%d]\n", pin, nval); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_ODSC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_ODSC, gpio); return 0; } @@ -1657,9 +1652,9 @@ static int npcm_gpio_set_direction(struct pinctrl_dev *pctldev, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[offset / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(offset % bank->gc.ngpio); + int gpio = BIT(offset % bank->chip.gc.ngpio); - dev_dbg(bank->gc.parent, "GPIO Set Direction: %d = %d\n", offset, + dev_dbg(bank->chip.gc.parent, "GPIO Set Direction: %d = %d\n", offset, input); if (input) iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); @@ -1687,7 +1682,7 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, struct npcm7xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = (pin % bank->gc.ngpio); + int gpio = (pin % bank->chip.gc.ngpio); unsigned long pinmask = 
BIT(gpio); u32 ie, oe, pu, pd; int rc = 0; @@ -1705,13 +1700,13 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = (!pu && pd); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM7XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM7XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -1750,38 +1745,38 @@ static int npcm7xx_config_set_one(struct npcm7xx_pinctrl *npcm, u16 arg = pinconf_to_config_argument(config); struct npcm7xx_gpio *bank = &npcm->gpio_bank[pin / NPCM7XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); - dev_dbg(bank->gc.parent, "param=%d %d[GPIO]\n", param, pin); + dev_dbg(bank->chip.gc.parent, "param=%d %d[GPIO]\n", param, pin); switch (param) { case PIN_CONFIG_BIAS_DISABLE: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_DOWN: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_UP: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_PD, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_PD, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_PU, gpio); break; case PIN_CONFIG_INPUT_ENABLE: iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); - bank->direction_input(&bank->gc, pin % bank->gc.ngpio); + bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: - bank->direction_output(&bank->gc, pin % bank->gc.ngpio, arg); + case PIN_CONFIG_LEVEL: + bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM7XX_GP_N_OES); break; case PIN_CONFIG_DRIVE_PUSH_PULL: - npcm_gpio_clr(&bank->gc, bank->base + NPCM7XX_GP_N_OTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM7XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_OTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_INPUT_DEBOUNCE: - npcm_gpio_set(&bank->gc, bank->base + NPCM7XX_GP_N_DBNC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM7XX_GP_N_DBNC, gpio); break; case PIN_CONFIG_SLEW_RATE: return npcm7xx_set_slew_rate(bank, npcm->gcr_regmap, pin, arg); @@ -1829,6 +1824,7 @@ static const struct pinctrl_desc npcm7xx_pinctrl_desc = { static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) { + struct gpio_generic_chip_config config; int ret = -ENXIO; struct device *dev = pctrl->dev; struct fwnode_reference_args args; @@ -1840,15 +1836,18 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return -EINVAL; - ret = bgpio_init(&pctrl->gpio_bank[id].gc, dev, 4, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, - 
NULL, - NULL, - pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, + .set = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, + .dirin = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); if (ret) { - dev_err(dev, "bgpio_init() failed\n"); + dev_err(dev, "failed to initialize the generic GPIO chip\n"); return ret; } @@ -1866,23 +1865,23 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) pctrl->gpio_bank[id].irq = ret; pctrl->gpio_bank[id].irqbase = id * NPCM7XX_GPIO_PER_BANK; pctrl->gpio_bank[id].pinctrl_id = args.args[0]; - pctrl->gpio_bank[id].gc.base = args.args[1]; - pctrl->gpio_bank[id].gc.ngpio = args.args[2]; - pctrl->gpio_bank[id].gc.owner = THIS_MODULE; - pctrl->gpio_bank[id].gc.parent = dev; - pctrl->gpio_bank[id].gc.fwnode = child; - pctrl->gpio_bank[id].gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); - if (pctrl->gpio_bank[id].gc.label == NULL) + pctrl->gpio_bank[id].chip.gc.base = args.args[1]; + pctrl->gpio_bank[id].chip.gc.ngpio = args.args[2]; + pctrl->gpio_bank[id].chip.gc.owner = THIS_MODULE; + pctrl->gpio_bank[id].chip.gc.parent = dev; + pctrl->gpio_bank[id].chip.gc.fwnode = child; + pctrl->gpio_bank[id].chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); + if (pctrl->gpio_bank[id].chip.gc.label == NULL) return -ENOMEM; - pctrl->gpio_bank[id].gc.dbg_show = npcmgpio_dbg_show; - pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].gc.direction_input; - pctrl->gpio_bank[id].gc.direction_input = npcmgpio_direction_input; - pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].gc.direction_output; - pctrl->gpio_bank[id].gc.direction_output = npcmgpio_direction_output; - pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].gc.request; - pctrl->gpio_bank[id].gc.request = npcmgpio_gpio_request; - pctrl->gpio_bank[id].gc.free = pinctrl_gpio_free; + pctrl->gpio_bank[id].chip.gc.dbg_show = npcmgpio_dbg_show; + pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].chip.gc.direction_input; + pctrl->gpio_bank[id].chip.gc.direction_input = npcmgpio_direction_input; + pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].chip.gc.direction_output; + pctrl->gpio_bank[id].chip.gc.direction_output = npcmgpio_direction_output; + pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].chip.gc.request; + pctrl->gpio_bank[id].chip.gc.request = npcmgpio_gpio_request; + pctrl->gpio_bank[id].chip.gc.free = pinctrl_gpio_free; id++; } @@ -1897,7 +1896,7 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) for (id = 0 ; id < pctrl->bank_num ; id++) { struct gpio_irq_chip *girq; - girq = &pctrl->gpio_bank[id].gc.irq; + girq = &pctrl->gpio_bank[id].chip.gc.irq; gpio_irq_chip_set_chip(girq, &npcmgpio_irqchip); girq->parent_handler = npcmgpio_irq_handler; girq->num_parents = 1; @@ -1912,21 +1911,21 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; ret = devm_gpiochip_add_data(pctrl->dev, - &pctrl->gpio_bank[id].gc, + &pctrl->gpio_bank[id].chip.gc, &pctrl->gpio_bank[id]); if (ret) { dev_err(pctrl->dev, "Failed to add GPIO chip %u\n", id); goto err_register; } - ret = gpiochip_add_pin_range(&pctrl->gpio_bank[id].gc, + ret = gpiochip_add_pin_range(&pctrl->gpio_bank[id].chip.gc, 
dev_name(pctrl->dev), pctrl->gpio_bank[id].pinctrl_id, - pctrl->gpio_bank[id].gc.base, - pctrl->gpio_bank[id].gc.ngpio); + pctrl->gpio_bank[id].chip.gc.base, + pctrl->gpio_bank[id].chip.gc.ngpio); if (ret < 0) { dev_err(pctrl->dev, "Failed to add GPIO bank %u\n", id); - gpiochip_remove(&pctrl->gpio_bank[id].gc); + gpiochip_remove(&pctrl->gpio_bank[id].chip.gc); goto err_register; } } @@ -1935,7 +1934,7 @@ static int npcm7xx_gpio_register(struct npcm7xx_pinctrl *pctrl) err_register: for (; id > 0; id--) - gpiochip_remove(&pctrl->gpio_bank[id - 1].gc); + gpiochip_remove(&pctrl->gpio_bank[id - 1].chip.gc); return ret; } diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 3c3b9d8d3681c6..0aae1a25345940 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ struct debounce_time { }; struct npcm8xx_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; void __iomem *base; struct debounce_time debounce; int irqbase; @@ -115,24 +116,20 @@ struct npcm8xx_pinctrl { }; /* GPIO handling in the pinctrl driver */ -static void npcm_gpio_set(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_set(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; + guard(gpio_generic_lock_irqsave)(chip); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); iowrite32(ioread32(reg) | pinmask, reg); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } -static void npcm_gpio_clr(struct gpio_chip *gc, void __iomem *reg, +static void npcm_gpio_clr(struct gpio_generic_chip *chip, void __iomem *reg, unsigned int pinmask) { - unsigned long flags; + guard(gpio_generic_lock_irqsave)(chip); - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); iowrite32(ioread32(reg) & ~pinmask, reg); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); } static void npcmgpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) @@ -233,32 +230,32 @@ static int npcmgpio_set_irq_type(struct irq_data *d, unsigned int type) switch (type) { case IRQ_TYPE_EDGE_RISING: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_FALLING: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_EDGE_BOTH: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_EVBE, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_EVBE, gpio); break; case IRQ_TYPE_LEVEL_LOW: - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; case IRQ_TYPE_LEVEL_HIGH: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_POL, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_POL, gpio); break; default: return -EINVAL; } if (type & IRQ_TYPE_LEVEL_MASK) { - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_EVTYP, gpio); + npcm_gpio_clr(&bank->chip, 
bank->base + NPCM8XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_level_irq); } else if (type & IRQ_TYPE_EDGE_BOTH) { - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_EVTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_EVTYP, gpio); irq_set_handler_locked(d, handle_edge_irq); } @@ -1842,7 +1839,7 @@ static void npcm8xx_setfunc(struct regmap *gcr_regmap, const unsigned int *pin, static int npcm8xx_get_slew_rate(struct npcm8xx_gpio *bank, struct regmap *gcr_regmap, unsigned int pin) { - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); u32 val; @@ -1862,15 +1859,15 @@ static int npcm8xx_set_slew_rate(struct npcm8xx_gpio *bank, int arg) { void __iomem *OSRC_Offset = bank->base + NPCM8XX_GP_N_OSRC; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); if (pincfg[pin].flag & SLEW) { switch (arg) { case 0: - npcm_gpio_clr(&bank->gc, OSRC_Offset, gpio); + npcm_gpio_clr(&bank->chip, OSRC_Offset, gpio); return 0; case 1: - npcm_gpio_set(&bank->gc, OSRC_Offset, gpio); + npcm_gpio_set(&bank->chip, OSRC_Offset, gpio); return 0; default: return -EINVAL; @@ -1902,7 +1899,7 @@ static int npcm8xx_get_drive_strength(struct pinctrl_dev *pctldev, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); int flg, val; u32 ds = 0; @@ -1913,7 +1910,7 @@ static int npcm8xx_get_drive_strength(struct pinctrl_dev *pctldev, val = ioread32(bank->base + NPCM8XX_GP_N_ODSC) & pinmask; ds = val ? DSHI(flg) : DSLO(flg); - dev_dbg(bank->gc.parent, "pin %d strength %d = %d\n", pin, val, ds); + dev_dbg(bank->chip.gc.parent, "pin %d strength %d = %d\n", pin, val, ds); return ds; } @@ -1923,15 +1920,15 @@ static int npcm8xx_set_drive_strength(struct npcm8xx_pinctrl *npcm, { struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); int v; v = pincfg[pin].flag & DRIVE_STRENGTH_MASK; if (DSLO(v) == nval) - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_ODSC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_ODSC, gpio); else if (DSHI(v) == nval) - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_ODSC, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_ODSC, gpio); else return -ENOTSUPP; @@ -2054,7 +2051,7 @@ static int npcm_gpio_set_direction(struct pinctrl_dev *pctldev, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[offset / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(offset % bank->gc.ngpio); + int gpio = BIT(offset % bank->chip.gc.ngpio); if (input) iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); @@ -2085,7 +2082,7 @@ static int debounce_timing_setting(struct npcm8xx_gpio *bank, u32 gpio, if (bank->debounce.set_val[i]) { if (bank->debounce.nanosec_val[i] == nanosecs) { debounce_select = i << gpio_debounce; - npcm_gpio_set(&bank->gc, DBNCS_offset, + npcm_gpio_set(&bank->chip, DBNCS_offset, debounce_select); break; } @@ -2093,7 +2090,7 @@ static int debounce_timing_setting(struct npcm8xx_gpio *bank, u32 gpio, bank->debounce.set_val[i] = true; bank->debounce.nanosec_val[i] = nanosecs; debounce_select = i << gpio_debounce; - npcm_gpio_set(&bank->gc, DBNCS_offset, debounce_select); + npcm_gpio_set(&bank->chip, DBNCS_offset, debounce_select); 
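The npcm_gpio_set()/npcm_gpio_clr() helpers rewritten above rely on the scope-based guard() from <linux/cleanup.h> instead of explicit raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs. A condensed sketch of that read-modify-write pattern (demo_gpio_rmw() is a hypothetical name):

#include <linux/cleanup.h>
#include <linux/gpio/generic.h>
#include <linux/io.h>

static void demo_gpio_rmw(struct gpio_generic_chip *chip, void __iomem *reg,
			  u32 set, u32 clr)
{
	u32 val;

	/* The generic chip's raw spinlock is dropped again at scope exit. */
	guard(gpio_generic_lock_irqsave)(chip);

	val = ioread32(reg);
	val &= ~clr;
	val |= set;
	iowrite32(val, reg);
}

Because the unlock is tied to scope exit, the helper bodies shrink to the register update itself.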
switch (nanosecs) { case 1 ... 1040: iowrite32(0, bank->base + NPCM8XX_GP_N_DBNCP0 + (i * 4)); @@ -2145,21 +2142,21 @@ static int npcm_set_debounce(struct npcm8xx_pinctrl *npcm, unsigned int pin, { struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); int ret; if (nanosecs) { - ret = debounce_timing_setting(bank, pin % bank->gc.ngpio, + ret = debounce_timing_setting(bank, pin % bank->chip.gc.ngpio, nanosecs); if (ret) dev_err(npcm->dev, "Pin %d, All four debounce timing values are used, please use one of exist debounce values\n", pin); else - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_DBNC, + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_DBNC, gpio); return ret; } - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_DBNC, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_DBNC, gpio); return 0; } @@ -2172,7 +2169,7 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, struct npcm8xx_pinctrl *npcm = pinctrl_dev_get_drvdata(pctldev); struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; - int gpio = pin % bank->gc.ngpio; + int gpio = pin % bank->chip.gc.ngpio; unsigned long pinmask = BIT(gpio); u32 ie, oe, pu, pd; int rc = 0; @@ -2190,13 +2187,13 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = !pu && pd; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM8XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM8XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -2235,34 +2232,34 @@ static int npcm8xx_config_set_one(struct npcm8xx_pinctrl *npcm, struct npcm8xx_gpio *bank = &npcm->gpio_bank[pin / NPCM8XX_GPIO_PER_BANK]; u32 arg = pinconf_to_config_argument(config); - int gpio = BIT(pin % bank->gc.ngpio); + int gpio = BIT(pin % bank->chip.gc.ngpio); switch (param) { case PIN_CONFIG_BIAS_DISABLE: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_DOWN: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); break; case PIN_CONFIG_BIAS_PULL_UP: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_PD, gpio); - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_PU, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_PD, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_PU, gpio); break; case PIN_CONFIG_INPUT_ENABLE: iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); - bank->direction_input(&bank->gc, pin % bank->gc.ngpio); + bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: - bank->direction_output(&bank->gc, pin % bank->gc.ngpio, arg); + case PIN_CONFIG_LEVEL: + bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM8XX_GP_N_OES); break; case 
PIN_CONFIG_DRIVE_PUSH_PULL: - npcm_gpio_clr(&bank->gc, bank->base + NPCM8XX_GP_N_OTYP, gpio); + npcm_gpio_clr(&bank->chip, bank->base + NPCM8XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - npcm_gpio_set(&bank->gc, bank->base + NPCM8XX_GP_N_OTYP, gpio); + npcm_gpio_set(&bank->chip, bank->base + NPCM8XX_GP_N_OTYP, gpio); break; case PIN_CONFIG_INPUT_DEBOUNCE: return npcm_set_debounce(npcm, pin, arg * 1000); @@ -2313,13 +2310,14 @@ static int npcmgpio_add_pin_ranges(struct gpio_chip *chip) { struct npcm8xx_gpio *bank = gpiochip_get_data(chip); - return gpiochip_add_pin_range(&bank->gc, dev_name(chip->parent), - bank->pinctrl_id, bank->gc.base, - bank->gc.ngpio); + return gpiochip_add_pin_range(&bank->chip.gc, dev_name(chip->parent), + bank->pinctrl_id, bank->chip.gc.base, + bank->chip.gc.ngpio); } static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) { + struct gpio_generic_chip_config config; struct fwnode_reference_args args; struct device *dev = pctrl->dev; struct fwnode_handle *child; @@ -2331,15 +2329,19 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) if (!pctrl->gpio_bank[id].base) return dev_err_probe(dev, -ENXIO, "fwnode_iomap id %d failed\n", id); - ret = bgpio_init(&pctrl->gpio_bank[id].gc, dev, 4, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, - NULL, - NULL, - pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, - BGPIOF_READ_OUTPUT_REG_SET); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, + .set = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, + .dirin = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, + }; + + ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); if (ret) - return dev_err_probe(dev, ret, "bgpio_init() failed\n"); + return dev_err_probe(dev, ret, + "failed to initialize the generic GPIO chip\n"); ret = fwnode_property_get_reference_args(child, "gpio-ranges", NULL, 3, 0, &args); if (ret < 0) @@ -2353,26 +2355,26 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) pctrl->gpio_bank[id].irq_chip = npcmgpio_irqchip; pctrl->gpio_bank[id].irqbase = id * NPCM8XX_GPIO_PER_BANK; pctrl->gpio_bank[id].pinctrl_id = args.args[0]; - pctrl->gpio_bank[id].gc.base = -1; - pctrl->gpio_bank[id].gc.ngpio = args.args[2]; - pctrl->gpio_bank[id].gc.owner = THIS_MODULE; - pctrl->gpio_bank[id].gc.parent = dev; - pctrl->gpio_bank[id].gc.fwnode = child; - pctrl->gpio_bank[id].gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); - if (pctrl->gpio_bank[id].gc.label == NULL) + pctrl->gpio_bank[id].chip.gc.base = -1; + pctrl->gpio_bank[id].chip.gc.ngpio = args.args[2]; + pctrl->gpio_bank[id].chip.gc.owner = THIS_MODULE; + pctrl->gpio_bank[id].chip.gc.parent = dev; + pctrl->gpio_bank[id].chip.gc.fwnode = child; + pctrl->gpio_bank[id].chip.gc.label = devm_kasprintf(dev, GFP_KERNEL, "%pfw", child); + if (pctrl->gpio_bank[id].chip.gc.label == NULL) return -ENOMEM; - pctrl->gpio_bank[id].gc.dbg_show = npcmgpio_dbg_show; - pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].gc.direction_input; - pctrl->gpio_bank[id].gc.direction_input = npcmgpio_direction_input; - pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].gc.direction_output; - pctrl->gpio_bank[id].gc.direction_output = npcmgpio_direction_output; - pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].gc.request; - pctrl->gpio_bank[id].gc.request = npcmgpio_gpio_request; - pctrl->gpio_bank[id].gc.free = 
pinctrl_gpio_free; + pctrl->gpio_bank[id].chip.gc.dbg_show = npcmgpio_dbg_show; + pctrl->gpio_bank[id].direction_input = pctrl->gpio_bank[id].chip.gc.direction_input; + pctrl->gpio_bank[id].chip.gc.direction_input = npcmgpio_direction_input; + pctrl->gpio_bank[id].direction_output = pctrl->gpio_bank[id].chip.gc.direction_output; + pctrl->gpio_bank[id].chip.gc.direction_output = npcmgpio_direction_output; + pctrl->gpio_bank[id].request = pctrl->gpio_bank[id].chip.gc.request; + pctrl->gpio_bank[id].chip.gc.request = npcmgpio_gpio_request; + pctrl->gpio_bank[id].chip.gc.free = pinctrl_gpio_free; for (i = 0 ; i < NPCM8XX_DEBOUNCE_MAX ; i++) pctrl->gpio_bank[id].debounce.set_val[i] = false; - pctrl->gpio_bank[id].gc.add_pin_ranges = npcmgpio_add_pin_ranges; + pctrl->gpio_bank[id].chip.gc.add_pin_ranges = npcmgpio_add_pin_ranges; id++; } @@ -2387,7 +2389,7 @@ static int npcm8xx_gpio_register(struct npcm8xx_pinctrl *pctrl) for (id = 0 ; id < pctrl->bank_num ; id++) { struct gpio_irq_chip *girq; - girq = &pctrl->gpio_bank[id].gc.irq; + girq = &pctrl->gpio_bank[id].chip.gc.irq; girq->chip = &pctrl->gpio_bank[id].irq_chip; girq->parent_handler = npcmgpio_irq_handler; girq->num_parents = 1; @@ -2401,7 +2403,7 @@ static int npcm8xx_gpio_register(struct npcm8xx_pinctrl *pctrl) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_level_irq; ret = devm_gpiochip_add_data(pctrl->dev, - &pctrl->gpio_bank[id].gc, + &pctrl->gpio_bank[id].chip.gc, &pctrl->gpio_bank[id]); if (ret) return dev_err_probe(pctrl->dev, ret, "Failed to add GPIO chip %u\n", id); diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 8d8314ba0e4cb5..d624a4d302a889 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -11,6 +11,7 @@ #include #include +#include <linux/gpio/generic.h> #include #include #include @@ -47,7 +48,7 @@ struct wpcm450_pinctrl; struct wpcm450_bank; struct wpcm450_gpio { - struct gpio_chip gc; + struct gpio_generic_chip chip; struct wpcm450_pinctrl *pctrl; const struct wpcm450_bank *bank; }; @@ -184,11 +185,12 @@ static void wpcm450_gpio_irq_unmask(struct irq_data *d) { } /* - * This is an implementation of the gpio_chip->get() function, for use in - * wpcm450_gpio_fix_evpol. Unfortunately, we can't use the bgpio-provided - * implementation there, because it would require taking gpio_chip->bgpio_lock, - * which is a spin lock, but wpcm450_gpio_fix_evpol must work in contexts where - * a raw spin lock is held. + * FIXME: This is an implementation of the gpio_chip->get() function, for use + * in wpcm450_gpio_fix_evpol(). It was implemented back when gpio-mmio used a + * regular spinlock internally, while wpcm450_gpio_fix_evpol() needed to work + * in contexts with a raw spinlock held. Since then, the gpio generic chip has + * been switched to using a raw spinlock, so this should be converted to using + * the locking interfaces provided in linux/gpio/generic.h.
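+ * One plausible shape for that conversion, reusing the guard class the
+ * nuvoton helpers above already use (an editorial sketch, not this
+ * series' actual fix; dat and bit stand in for the driver's own
+ * register and bit lookup):
+ *
+ *	guard(gpio_generic_lock_irqsave)(&gpio->chip);
+ *	return !!(ioread32(dat) & BIT(bit));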
*/ static int wpcm450_gpio_get(struct wpcm450_gpio *gpio, int offset) { @@ -329,7 +331,7 @@ static void wpcm450_gpio_irqhandler(struct irq_desc *desc) for_each_set_bit(bit, &pending, 32) { int offset = wpcm450_irq_bitnum_to_gpio(gpio, bit); - generic_handle_domain_irq(gpio->gc.irq.domain, offset); + generic_handle_domain_irq(gpio->chip.gc.irq.domain, offset); } chained_irq_exit(chip, desc); } @@ -1012,7 +1014,7 @@ static int wpcm450_gpio_add_pin_ranges(struct gpio_chip *chip) struct wpcm450_gpio *gpio = gpiochip_get_data(chip); const struct wpcm450_bank *bank = gpio->bank; - return gpiochip_add_pin_range(&gpio->gc, dev_name(gpio->pctrl->dev), + return gpiochip_add_pin_range(&gpio->chip.gc, dev_name(gpio->pctrl->dev), 0, bank->base, bank->length); } @@ -1029,6 +1031,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, "Resource fail for GPIO controller\n"); for_each_gpiochip_node(dev, child) { + struct gpio_generic_chip_config config; void __iomem *dat = NULL; void __iomem *set = NULL; void __iomem *dirout = NULL; @@ -1058,19 +1061,28 @@ static int wpcm450_gpio_register(struct platform_device *pdev, set = pctrl->gpio_base + bank->dataout; dirout = pctrl->gpio_base + bank->cfg0; } else { - flags = BGPIOF_NO_OUTPUT; + flags = GPIO_GENERIC_NO_OUTPUT; } - ret = bgpio_init(&gpio->gc, dev, 4, - dat, set, NULL, dirout, NULL, flags); + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = dat, + .set = set, + .dirout = dirout, + .flags = flags, + }; + + ret = gpio_generic_chip_init(&gpio->chip, &config); if (ret < 0) return dev_err_probe(dev, ret, "GPIO initialization failed\n"); - gpio->gc.ngpio = bank->length; - gpio->gc.set_config = wpcm450_gpio_set_config; - gpio->gc.fwnode = child; - gpio->gc.add_pin_ranges = wpcm450_gpio_add_pin_ranges; + gpio->chip.gc.ngpio = bank->length; + gpio->chip.gc.set_config = wpcm450_gpio_set_config; + gpio->chip.gc.fwnode = child; + gpio->chip.gc.add_pin_ranges = wpcm450_gpio_add_pin_ranges; - girq = &gpio->gc.irq; + girq = &gpio->chip.gc.irq; gpio_irq_chip_set_chip(girq, &wpcm450_gpio_irqchip); girq->parent_handler = wpcm450_gpio_irqhandler; girq->parents = devm_kcalloc(dev, WPCM450_NUM_GPIO_IRQS, @@ -1094,7 +1106,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, girq->num_parents++; } - ret = devm_gpiochip_add_data(dev, &gpio->gc, gpio); + ret = devm_gpiochip_add_data(dev, &gpio->chip.gc, gpio); if (ret) return dev_err_probe(dev, ret, "Failed to add GPIO chip\n"); } diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index d67838afb08574..5de6ff62c69bdb 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -48,7 +48,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL, false), PCONFDUMP(PIN_CONFIG_MODE_LOW_POWER, "pin low power", "mode", true), PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false), - PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), + PCONFDUMP(PIN_CONFIG_LEVEL, "pin output", "level", true), PCONFDUMP(PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, "output impedance", "ohms", true), PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), @@ -183,9 +183,9 @@ static const struct pinconf_generic_params dt_params[] = { { "low-power-enable", PIN_CONFIG_MODE_LOW_POWER, 1 }, { "output-disable", PIN_CONFIG_OUTPUT_ENABLE, 0 }, { "output-enable", 
PIN_CONFIG_OUTPUT_ENABLE, 1 }, - { "output-high", PIN_CONFIG_OUTPUT, 1, }, + { "output-high", PIN_CONFIG_LEVEL, 1, }, { "output-impedance-ohms", PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, 0 }, - { "output-low", PIN_CONFIG_OUTPUT, 0, }, + { "output-low", PIN_CONFIG_LEVEL, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 09a5425d54ba38..2dac5c71eb008f 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -383,14 +383,15 @@ static void amd_gpio_irq_enable(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); - gpiochip_enable_irq(gc, d->hwirq); + gpiochip_enable_irq(gc, hwirq); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg |= BIT(INTERRUPT_ENABLE_OFF); pin_reg |= BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -400,15 +401,16 @@ static void amd_gpio_irq_disable(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg &= ~BIT(INTERRUPT_ENABLE_OFF); pin_reg &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); - gpiochip_disable_irq(gc, d->hwirq); + gpiochip_disable_irq(gc, hwirq); } static void amd_gpio_irq_mask(struct irq_data *d) @@ -417,11 +419,12 @@ static void amd_gpio_irq_mask(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -431,11 +434,12 @@ static void amd_gpio_irq_unmask(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); pin_reg |= BIT(INTERRUPT_MASK_OFF); - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -446,17 +450,21 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3); + irq_hw_number_t hwirq = irqd_to_hwirq(d); int err; + pm_pr_dbg("Setting wake for GPIO %lu to %s\n", + 
hwirq, str_enable_disable(on)); + raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); if (on) pin_reg |= wake_mask; else pin_reg &= ~wake_mask; - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); if (on) @@ -492,9 +500,10 @@ static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + irq_hw_number_t hwirq = irqd_to_hwirq(d); raw_spin_lock_irqsave(&gpio_dev->lock, flags); - pin_reg = readl(gpio_dev->base + (d->hwirq)*4); + pin_reg = readl(gpio_dev->base + hwirq * 4); switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_RISING: @@ -560,10 +569,10 @@ static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) pin_reg_irq_en = pin_reg; pin_reg_irq_en |= mask; pin_reg_irq_en &= ~BIT(INTERRUPT_MASK_OFF); - writel(pin_reg_irq_en, gpio_dev->base + (d->hwirq)*4); - while ((readl(gpio_dev->base + (d->hwirq)*4) & mask) != mask) + writel(pin_reg_irq_en, gpio_dev->base + hwirq * 4); + while ((readl(gpio_dev->base + hwirq * 4) & mask) != mask) continue; - writel(pin_reg, gpio_dev->base + (d->hwirq)*4); + writel(pin_reg, gpio_dev->base + hwirq * 4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); return ret; diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 35ea3414cb96d7..ec5351fc282e20 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -862,7 +862,7 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, conf |= ATMEL_PIO_IFSCEN_MASK; } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: conf |= ATMEL_PIO_DIR_MASK; bank = ATMEL_PIO_BANK(pin_id); pin = ATMEL_PIO_LINE(pin_id); diff --git a/drivers/pinctrl/pinctrl-aw9523.c b/drivers/pinctrl/pinctrl-aw9523.c index 890b83fddea3c5..479553a7921614 100644 --- a/drivers/pinctrl/pinctrl-aw9523.c +++ b/drivers/pinctrl/pinctrl-aw9523.c @@ -215,7 +215,7 @@ static int aw9523_pcfg_param_to_reg(enum pin_config_param pcp, int pin, u8 *r) case PIN_CONFIG_OUTPUT_ENABLE: reg = AW9523_REG_CONF_STATE(pin); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = AW9523_REG_OUT_STATE(pin); break; default: @@ -249,7 +249,7 @@ static int aw9523_pconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val &= BIT(regbit); break; case PIN_CONFIG_BIAS_PULL_DOWN: @@ -301,7 +301,7 @@ static int aw9523_pconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto end; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* First, enable pin output */ rc = regmap_update_bits(awi->regmap, AW9523_REG_CONF_STATE(pin), diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index cf7f80497fdeaf..a4b04bf6d081f6 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -808,7 +808,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, case PIN_CONFIG_MODE_PWM: reg = CY8C95X0_SELPWM; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = CY8C95X0_OUTPUT; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-eic7700.c b/drivers/pinctrl/pinctrl-eic7700.c index 
4874b55323439a..ffcd0ec5c2dc6c 100644 --- a/drivers/pinctrl/pinctrl-eic7700.c +++ b/drivers/pinctrl/pinctrl-eic7700.c @@ -634,7 +634,7 @@ static int eic7700_pinctrl_probe(struct platform_device *pdev) return PTR_ERR(pc->base); regulator = devm_regulator_get(dev, "vrgmii"); - if (IS_ERR_OR_NULL(regulator)) { + if (IS_ERR(regulator)) { return dev_err_probe(dev, PTR_ERR(regulator), "failed to get vrgmii regulator\n"); } diff --git a/drivers/pinctrl/pinctrl-equilibrium.c b/drivers/pinctrl/pinctrl-equilibrium.c index fce804d42e7d7f..2d04829b29c997 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.c +++ b/drivers/pinctrl/pinctrl-equilibrium.c @@ -2,6 +2,7 @@ /* Copyright (C) 2019 Intel Corporation */ #include +#include #include #include #include @@ -179,7 +180,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) struct gpio_irq_chip *girq; struct gpio_chip *gc; - gc = &gctrl->chip; + gc = &gctrl->chip.gc; gc->label = gctrl->name; gc->fwnode = gctrl->fwnode; gc->request = gpiochip_generic_request; @@ -191,7 +192,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) return 0; } - girq = &gctrl->chip.irq; + girq = &gctrl->chip.gc.irq; gpio_irq_chip_set_chip(girq, &eqbr_irq_chip); girq->parent_handler = eqbr_irq_handler; girq->num_parents = 1; @@ -208,6 +209,7 @@ static int gpiochip_setup(struct device *dev, struct eqbr_gpio_ctrl *gctrl) static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) { + struct gpio_generic_chip_config config; struct device *dev = drvdata->dev; struct eqbr_gpio_ctrl *gctrl; struct device_node *np; @@ -239,12 +241,16 @@ static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) } raw_spin_lock_init(&gctrl->lock); - ret = bgpio_init(&gctrl->chip, dev, gctrl->bank->nr_pins / 8, - gctrl->membase + GPIO_IN, - gctrl->membase + GPIO_OUTSET, - gctrl->membase + GPIO_OUTCLR, - gctrl->membase + GPIO_DIR, - NULL, 0); + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = gctrl->bank->nr_pins / 8, + .dat = gctrl->membase + GPIO_IN, + .set = gctrl->membase + GPIO_OUTSET, + .clr = gctrl->membase + GPIO_OUTCLR, + .dirout = gctrl->membase + GPIO_DIR, + }; + + ret = gpio_generic_chip_init(&gctrl->chip, &config); if (ret) { dev_err(dev, "unable to init generic GPIO\n"); return ret; @@ -254,7 +260,7 @@ static int gpiolib_reg(struct eqbr_pinctrl_drv_data *drvdata) if (ret) return ret; - ret = devm_gpiochip_add_data(dev, &gctrl->chip, gctrl); + ret = devm_gpiochip_add_data(dev, &gctrl->chip.gc, gctrl); if (ret) return ret; } @@ -319,7 +325,7 @@ static int eqbr_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct eqbr_pinctrl_drv_data *pctl = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int *pinmux; int i; @@ -439,7 +445,7 @@ static int eqbr_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } raw_spin_unlock_irqrestore(&pctl->lock, flags); *config = pinconf_to_config_packed(param, val); -; + return 0; } @@ -499,7 +505,7 @@ static int eqbr_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, bank->pin_base, pin); return -ENODEV; } - gc = &gctrl->chip; + gc = &gctrl->chip.gc; gc->direction_output(gc, offset, 0); continue; default: diff --git a/drivers/pinctrl/pinctrl-equilibrium.h b/drivers/pinctrl/pinctrl-equilibrium.h index b4d149bde39d8d..b56124d7fe9132 100644 --- a/drivers/pinctrl/pinctrl-equilibrium.h +++ b/drivers/pinctrl/pinctrl-equilibrium.h @@ -96,7 +96,7 @@ struct 
fwnode_handle; * @lock: spin lock to protect gpio register write. */ struct eqbr_gpio_ctrl { - struct gpio_chip chip; + struct gpio_generic_chip chip; struct fwnode_handle *fwnode; struct eqbr_pin_bank *bank; void __iomem *membase; diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index 2900513467fa4e..c7f14546de0528 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -96,11 +96,8 @@ .data = (void *)func, \ } -#define INGENIC_PIN_FUNCTION(_name_, id) \ - { \ - .func = PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)), \ - .data = NULL, \ - } +#define INGENIC_PIN_FUNCTION(_name_, id) \ + PINCTRL_PINFUNCTION(_name_, id##_groups, ARRAY_SIZE(id##_groups)) enum jz_version { ID_JZ4730, @@ -128,7 +125,7 @@ struct ingenic_chip_info { const struct group_desc *groups; unsigned int num_groups; - const struct function_desc *functions; + const struct pinfunction *functions; unsigned int num_functions; const u32 *pull_ups, *pull_downs; @@ -263,7 +260,7 @@ static const char *jz4730_pwm1_groups[] = { "pwm1", }; static const char *jz4730_mii_groups[] = { "mii", }; static const char *jz4730_i2s_groups[] = { "i2s-data", "i2s-master", "i2s-slave", }; -static const struct function_desc jz4730_functions[] = { +static const struct pinfunction jz4730_functions[] = { INGENIC_PIN_FUNCTION("mmc", jz4730_mmc), INGENIC_PIN_FUNCTION("uart0", jz4730_uart0), INGENIC_PIN_FUNCTION("uart1", jz4730_uart1), @@ -370,7 +367,7 @@ static const char *jz4740_pwm5_groups[] = { "pwm5", }; static const char *jz4740_pwm6_groups[] = { "pwm6", }; static const char *jz4740_pwm7_groups[] = { "pwm7", }; -static const struct function_desc jz4740_functions[] = { +static const struct pinfunction jz4740_functions[] = { INGENIC_PIN_FUNCTION("mmc", jz4740_mmc), INGENIC_PIN_FUNCTION("uart0", jz4740_uart0), INGENIC_PIN_FUNCTION("uart1", jz4740_uart1), @@ -474,7 +471,7 @@ static const char *jz4725b_pwm3_groups[] = { "pwm3", }; static const char *jz4725b_pwm4_groups[] = { "pwm4", }; static const char *jz4725b_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4725b_functions[] = { +static const struct pinfunction jz4725b_functions[] = { INGENIC_PIN_FUNCTION("mmc0", jz4725b_mmc0), INGENIC_PIN_FUNCTION("mmc1", jz4725b_mmc1), INGENIC_PIN_FUNCTION("uart", jz4725b_uart), @@ -606,7 +603,7 @@ static const char *jz4750_pwm3_groups[] = { "pwm3", }; static const char *jz4750_pwm4_groups[] = { "pwm4", }; static const char *jz4750_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4750_functions[] = { +static const struct pinfunction jz4750_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4750_uart0), INGENIC_PIN_FUNCTION("uart1", jz4750_uart1), INGENIC_PIN_FUNCTION("uart2", jz4750_uart2), @@ -771,7 +768,7 @@ static const char *jz4755_pwm3_groups[] = { "pwm3", }; static const char *jz4755_pwm4_groups[] = { "pwm4", }; static const char *jz4755_pwm5_groups[] = { "pwm5", }; -static const struct function_desc jz4755_functions[] = { +static const struct pinfunction jz4755_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4755_uart0), INGENIC_PIN_FUNCTION("uart1", jz4755_uart1), INGENIC_PIN_FUNCTION("uart2", jz4755_uart2), @@ -1106,7 +1103,7 @@ static const char *jz4760_pwm6_groups[] = { "pwm6", }; static const char *jz4760_pwm7_groups[] = { "pwm7", }; static const char *jz4760_otg_groups[] = { "otg-vbus", }; -static const struct function_desc jz4760_functions[] = { +static const struct pinfunction jz4760_functions[] = { INGENIC_PIN_FUNCTION("uart0", 
jz4760_uart0), INGENIC_PIN_FUNCTION("uart1", jz4760_uart1), INGENIC_PIN_FUNCTION("uart2", jz4760_uart2), @@ -1444,7 +1441,7 @@ static const char *jz4770_pwm6_groups[] = { "pwm6", }; static const char *jz4770_pwm7_groups[] = { "pwm7", }; static const char *jz4770_mac_groups[] = { "mac-rmii", "mac-mii", }; -static const struct function_desc jz4770_functions[] = { +static const struct pinfunction jz4770_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4770_uart0), INGENIC_PIN_FUNCTION("uart1", jz4770_uart1), INGENIC_PIN_FUNCTION("uart2", jz4770_uart2), @@ -1723,7 +1720,7 @@ static const char *jz4775_mac_groups[] = { }; static const char *jz4775_otg_groups[] = { "otg-vbus", }; -static const struct function_desc jz4775_functions[] = { +static const struct pinfunction jz4775_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4775_uart0), INGENIC_PIN_FUNCTION("uart1", jz4775_uart1), INGENIC_PIN_FUNCTION("uart2", jz4775_uart2), @@ -1976,7 +1973,7 @@ static const char *jz4780_dmic_groups[] = { "dmic", }; static const char *jz4780_cim_groups[] = { "cim-data", }; static const char *jz4780_hdmi_ddc_groups[] = { "hdmi-ddc", }; -static const struct function_desc jz4780_functions[] = { +static const struct pinfunction jz4780_functions[] = { INGENIC_PIN_FUNCTION("uart0", jz4770_uart0), INGENIC_PIN_FUNCTION("uart1", jz4770_uart1), INGENIC_PIN_FUNCTION("uart2", jz4780_uart2), @@ -2211,7 +2208,7 @@ static const char *x1000_pwm3_groups[] = { "pwm3", }; static const char *x1000_pwm4_groups[] = { "pwm4", }; static const char *x1000_mac_groups[] = { "mac", }; -static const struct function_desc x1000_functions[] = { +static const struct pinfunction x1000_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1000_uart0), INGENIC_PIN_FUNCTION("uart1", x1000_uart1), INGENIC_PIN_FUNCTION("uart2", x1000_uart2), @@ -2341,7 +2338,7 @@ static const char *x1500_pwm2_groups[] = { "pwm2", }; static const char *x1500_pwm3_groups[] = { "pwm3", }; static const char *x1500_pwm4_groups[] = { "pwm4", }; -static const struct function_desc x1500_functions[] = { +static const struct pinfunction x1500_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1500_uart0), INGENIC_PIN_FUNCTION("uart1", x1500_uart1), INGENIC_PIN_FUNCTION("uart2", x1500_uart2), @@ -2562,7 +2559,7 @@ static const char * const x1600_pwm7_groups[] = { "pwm7-b10", "pwm7-b21", }; static const char * const x1600_mac_groups[] = { "mac", }; -static const struct function_desc x1600_functions[] = { +static const struct pinfunction x1600_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1600_uart0), INGENIC_PIN_FUNCTION("uart1", x1600_uart1), INGENIC_PIN_FUNCTION("uart2", x1600_uart2), @@ -2779,7 +2776,7 @@ static const char *x1830_pwm6_groups[] = { "pwm6-c-17", "pwm6-c-27", }; static const char *x1830_pwm7_groups[] = { "pwm7-c-18", "pwm7-c-28", }; static const char *x1830_mac_groups[] = { "mac", }; -static const struct function_desc x1830_functions[] = { +static const struct pinfunction x1830_functions[] = { INGENIC_PIN_FUNCTION("uart0", x1830_uart0), INGENIC_PIN_FUNCTION("uart1", x1830_uart1), INGENIC_PIN_FUNCTION("sfc", x1830_sfc), @@ -3225,7 +3222,7 @@ static const char *x2000_mac0_groups[] = { "mac0-rmii", "mac0-rgmii", }; static const char *x2000_mac1_groups[] = { "mac1-rmii", "mac1-rgmii", }; static const char *x2000_otg_groups[] = { "otg-vbus", }; -static const struct function_desc x2000_functions[] = { +static const struct pinfunction x2000_functions[] = { INGENIC_PIN_FUNCTION("uart0", x2000_uart0), INGENIC_PIN_FUNCTION("uart1", x2000_uart1), INGENIC_PIN_FUNCTION("uart2", 
x2000_uart2), @@ -3449,7 +3446,7 @@ static const struct group_desc x2100_groups[] = { static const char *x2100_mac_groups[] = { "mac", }; -static const struct function_desc x2100_functions[] = { +static const struct pinfunction x2100_functions[] = { INGENIC_PIN_FUNCTION("uart0", x2000_uart0), INGENIC_PIN_FUNCTION("uart1", x2000_uart1), INGENIC_PIN_FUNCTION("uart2", x2000_uart2), @@ -4003,7 +4000,7 @@ static int ingenic_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct ingenic_pinctrl *jzpc = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int i; uintptr_t mode; @@ -4018,7 +4015,7 @@ static int ingenic_pinmux_set_mux(struct pinctrl_dev *pctldev, return -EINVAL; dev_dbg(pctldev->dev, "enable function %s group %s\n", - func->func.name, grp->grp.name); + func->func->name, grp->grp.name); mode = (uintptr_t)grp->data; if (mode <= 3) { @@ -4267,7 +4264,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_INPUT_SCHMITT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_SLEW_RATE: continue; default: @@ -4308,7 +4305,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, ingenic_set_schmitt_trigger(jzpc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = pinctrl_gpio_direction_output(jzpc->gc, pin - jzpc->gc->base); if (ret) @@ -4571,11 +4568,9 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev) } for (i = 0; i < chip_info->num_functions; i++) { - const struct function_desc *function = &chip_info->functions[i]; - const struct pinfunction *func = &function->func; + const struct pinfunction *func = &chip_info->functions[i]; - err = pinmux_generic_add_pinfunction(jzpc->pctl, func, - function->data); + err = pinmux_generic_add_pinfunction(jzpc->pctl, func, NULL); if (err < 0) { dev_err(dev, "Failed to register function %s\n", func->name); return err; diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 66c04120c29dec..ddd6d6bfd51399 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -551,7 +551,7 @@ static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, else val &= ~K210_PC_ST; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: k210_pinmux_set_pin_function(pctldev, pin, K210_PCF_CONSTANT); val = readl(&pdata->fpioa->pins[pin]); val |= K210_PC_MODE_OUT; diff --git a/drivers/pinctrl/pinctrl-keembay.c b/drivers/pinctrl/pinctrl-keembay.c index 60cf017498b32a..3241d3ae621917 100644 --- a/drivers/pinctrl/pinctrl-keembay.c +++ b/drivers/pinctrl/pinctrl-keembay.c @@ -135,6 +135,11 @@ struct keembay_pin_soc { const struct pinctrl_pin_desc *pins; }; +struct keembay_pinfunction { + struct pinfunction func; + u8 mux_mode; +}; + static const struct pinctrl_pin_desc keembay_pins[] = { KEEMBAY_PIN_DESC(0, "GPIO0", KEEMBAY_MUX(0x0, "I2S0_M0"), @@ -930,7 +935,7 @@ static int keembay_set_mux(struct pinctrl_dev *pctldev, unsigned int fun_sel, unsigned int grp_sel) { struct keembay_pinctrl *kpc = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; unsigned int val; u8 pin_mode; @@ -1556,13 +1561,13 @@ static int keembay_pinctrl_reg(struct keembay_pinctrl *kpc, struct device *dev) } static int keembay_add_functions(struct keembay_pinctrl *kpc, - 
struct function_desc *functions) + struct keembay_pinfunction *functions) { unsigned int i; /* Assign the groups for each function */ for (i = 0; i < kpc->nfuncs; i++) { - struct function_desc *func = &functions[i]; + struct keembay_pinfunction *func = &functions[i]; const char **group_names; unsigned int grp_idx = 0; int j; @@ -1588,14 +1593,14 @@ static int keembay_add_functions(struct keembay_pinctrl *kpc, /* Add all functions */ for (i = 0; i < kpc->nfuncs; i++) pinmux_generic_add_pinfunction(kpc->pctrl, &functions[i].func, - functions[i].data); + &functions[i].mux_mode); return 0; } static int keembay_build_functions(struct keembay_pinctrl *kpc) { - struct function_desc *keembay_funcs, *new_funcs; + struct keembay_pinfunction *keembay_funcs, *new_funcs; int i; /* @@ -1603,7 +1608,8 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) * being part of 8 (hw maximum) globally unique muxes. */ kpc->nfuncs = 0; - keembay_funcs = kcalloc(kpc->npins * 8, sizeof(*keembay_funcs), GFP_KERNEL); + keembay_funcs = devm_kcalloc(kpc->dev, kpc->npins * 8, + sizeof(*keembay_funcs), GFP_KERNEL); if (!keembay_funcs) return -ENOMEM; @@ -1613,7 +1619,7 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) struct keembay_mux_desc *mux; for (mux = pdesc->drv_data; mux->name; mux++) { - struct function_desc *fdesc; + struct keembay_pinfunction *fdesc; /* Check if we already have function for this mux */ for (fdesc = keembay_funcs; fdesc->func.name; fdesc++) { @@ -1627,18 +1633,18 @@ static int keembay_build_functions(struct keembay_pinctrl *kpc) if (!fdesc->func.name) { fdesc->func.name = mux->name; fdesc->func.ngroups = 1; - fdesc->data = &mux->mode; + fdesc->mux_mode = mux->mode; kpc->nfuncs++; } } } /* Reallocate memory based on actual number of functions */ - new_funcs = krealloc(keembay_funcs, kpc->nfuncs * sizeof(*new_funcs), GFP_KERNEL); - if (!new_funcs) { - kfree(keembay_funcs); + new_funcs = devm_krealloc_array(kpc->dev, keembay_funcs, + kpc->nfuncs, sizeof(*new_funcs), + GFP_KERNEL); + if (!new_funcs) return -ENOMEM; - } return keembay_add_functions(kpc, new_funcs); } diff --git a/drivers/pinctrl/pinctrl-max7360.c b/drivers/pinctrl/pinctrl-max7360.c new file mode 100644 index 00000000000000..abfaff468bad1f --- /dev/null +++ b/drivers/pinctrl/pinctrl-max7360.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "core.h" +#include "pinmux.h" + +struct max7360_pinctrl { + struct pinctrl_dev *pctldev; + struct pinctrl_desc pinctrl_desc; +}; + +static const struct pinctrl_pin_desc max7360_pins[] = { + PINCTRL_PIN(0, "PORT0"), + PINCTRL_PIN(1, "PORT1"), + PINCTRL_PIN(2, "PORT2"), + PINCTRL_PIN(3, "PORT3"), + PINCTRL_PIN(4, "PORT4"), + PINCTRL_PIN(5, "PORT5"), + PINCTRL_PIN(6, "PORT6"), + PINCTRL_PIN(7, "PORT7"), +}; + +static const unsigned int port0_pins[] = {0}; +static const unsigned int port1_pins[] = {1}; +static const unsigned int port2_pins[] = {2}; +static const unsigned int port3_pins[] = {3}; +static const unsigned int port4_pins[] = {4}; +static const unsigned int port5_pins[] = {5}; +static const unsigned int port6_pins[] = {6}; +static const unsigned int port7_pins[] = {7}; +static const unsigned int rotary_pins[] = {6, 7}; + +static const struct pingroup max7360_groups[] = { + PINCTRL_PINGROUP("PORT0", port0_pins, 
ARRAY_SIZE(port0_pins)), + PINCTRL_PINGROUP("PORT1", port1_pins, ARRAY_SIZE(port1_pins)), + PINCTRL_PINGROUP("PORT2", port2_pins, ARRAY_SIZE(port2_pins)), + PINCTRL_PINGROUP("PORT3", port3_pins, ARRAY_SIZE(port3_pins)), + PINCTRL_PINGROUP("PORT4", port4_pins, ARRAY_SIZE(port4_pins)), + PINCTRL_PINGROUP("PORT5", port5_pins, ARRAY_SIZE(port5_pins)), + PINCTRL_PINGROUP("PORT6", port6_pins, ARRAY_SIZE(port6_pins)), + PINCTRL_PINGROUP("PORT7", port7_pins, ARRAY_SIZE(port7_pins)), + PINCTRL_PINGROUP("ROTARY", rotary_pins, ARRAY_SIZE(rotary_pins)), +}; + +static int max7360_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) +{ + return ARRAY_SIZE(max7360_groups); +} + +static const char *max7360_pinctrl_get_group_name(struct pinctrl_dev *pctldev, + unsigned int group) +{ + return max7360_groups[group].name; +} + +static int max7360_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, + unsigned int group, + const unsigned int **pins, + unsigned int *num_pins) +{ + *pins = max7360_groups[group].pins; + *num_pins = max7360_groups[group].npins; + return 0; +} + +static const struct pinctrl_ops max7360_pinctrl_ops = { + .get_groups_count = max7360_pinctrl_get_groups_count, + .get_group_name = max7360_pinctrl_get_group_name, + .get_group_pins = max7360_pinctrl_get_group_pins, +#ifdef CONFIG_OF + .dt_node_to_map = pinconf_generic_dt_node_to_map_pin, + .dt_free_map = pinconf_generic_dt_free_map, +#endif +}; + +static const char * const simple_groups[] = { + "PORT0", "PORT1", "PORT2", "PORT3", + "PORT4", "PORT5", "PORT6", "PORT7", +}; + +static const char * const rotary_groups[] = { "ROTARY" }; + +#define MAX7360_PINCTRL_FN_GPIO 0 +#define MAX7360_PINCTRL_FN_PWM 1 +#define MAX7360_PINCTRL_FN_ROTARY 2 +static const struct pinfunction max7360_functions[] = { + [MAX7360_PINCTRL_FN_GPIO] = PINCTRL_PINFUNCTION("gpio", simple_groups, + ARRAY_SIZE(simple_groups)), + [MAX7360_PINCTRL_FN_PWM] = PINCTRL_PINFUNCTION("pwm", simple_groups, + ARRAY_SIZE(simple_groups)), + [MAX7360_PINCTRL_FN_ROTARY] = PINCTRL_PINFUNCTION("rotary", rotary_groups, + ARRAY_SIZE(rotary_groups)), +}; + +static int max7360_get_functions_count(struct pinctrl_dev *pctldev) +{ + return ARRAY_SIZE(max7360_functions); +} + +static const char *max7360_get_function_name(struct pinctrl_dev *pctldev, unsigned int selector) +{ + return max7360_functions[selector].name; +} + +static int max7360_get_function_groups(struct pinctrl_dev *pctldev, unsigned int selector, + const char * const **groups, + unsigned int * const num_groups) +{ + *groups = max7360_functions[selector].groups; + *num_groups = max7360_functions[selector].ngroups; + + return 0; +} + +static int max7360_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, + unsigned int group) +{ + struct regmap *regmap = dev_get_regmap(pctldev->dev->parent, NULL); + int val; + + /* + * GPIO and PWM functions are the same: we only need to handle the + * rotary encoder function, on pins 6 and 7. 
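+ * Selecting the "rotary" function sets MAX7360_GPIO_CFG_RTR_EN in the
+ * GPIOCFG register and selecting "gpio" or "pwm" on those pins clears
+ * it; for every other group there is nothing to program, so the
+ * callback simply succeeds.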
+ */ + if (max7360_groups[group].pins[0] >= 6) { + if (selector == MAX7360_PINCTRL_FN_ROTARY) + val = MAX7360_GPIO_CFG_RTR_EN; + else + val = 0; + + return regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_RTR_EN, val); + } + + return 0; +} + +static const struct pinmux_ops max7360_pmxops = { + .get_functions_count = max7360_get_functions_count, + .get_function_name = max7360_get_function_name, + .get_function_groups = max7360_get_function_groups, + .set_mux = max7360_set_mux, + .strict = true, +}; + +static int max7360_pinctrl_probe(struct platform_device *pdev) +{ + struct regmap *regmap; + struct pinctrl_desc *pd; + struct max7360_pinctrl *chip; + struct device *dev = &pdev->dev; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + pd = &chip->pinctrl_desc; + + pd->pctlops = &max7360_pinctrl_ops; + pd->pmxops = &max7360_pmxops; + pd->name = dev_name(dev); + pd->pins = max7360_pins; + pd->npins = MAX7360_MAX_GPIO; + pd->owner = THIS_MODULE; + + /* + * This MFD sub-device does not have any associated device tree node: + * properties are stored in the device node of the parent (MFD) device + * and this same node is used in phandles of client devices. + * Reuse this device tree node here, as otherwise the pinctrl subsystem + * would be confused by this topology. + */ + device_set_of_node_from_dev(dev, dev->parent); + + chip->pctldev = devm_pinctrl_register(dev, pd, chip); + if (IS_ERR(chip->pctldev)) + return dev_err_probe(dev, PTR_ERR(chip->pctldev), "can't register controller\n"); + + return 0; +} + +static struct platform_driver max7360_pinctrl_driver = { + .driver = { + .name = "max7360-pinctrl", + }, + .probe = max7360_pinctrl_probe, +}; +module_platform_driver(max7360_pinctrl_driver); + +MODULE_DESCRIPTION("MAX7360 pinctrl driver"); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 6191e5c1381531..b6363f3cdce94e 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -371,7 +371,7 @@ static int sgpio_pinconf_get(struct pinctrl_dev *pctldev, val = !bank->is_input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; val = sgpio_output_get(priv, &addr); @@ -402,7 +402,7 @@ static int sgpio_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[cfg]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; err = sgpio_output_set(priv, &addr, arg); @@ -824,7 +824,7 @@ static int microchip_sgpio_register_bank(struct device *dev, pctl_desc->confops = &sgpio_confops; pctl_desc->owner = THIS_MODULE; - pins = devm_kzalloc(dev, sizeof(*pins)*ngpios, GFP_KERNEL); + pins = devm_kcalloc(dev, ngpios, sizeof(*pins), GFP_KERNEL); if (!pins) return -ENOMEM; diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index b82bf83fed25b6..70da3f37567a5c 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1656,7 +1656,7 @@ static int ocelot_pinconf_get(struct pinctrl_dev *pctldev, return err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = regmap_read(info->map, REG(OCELOT_GPIO_OUT, info, pin), &val); if (err) @@ -1735,7 +1735,7 @@ static int 
ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: p = pin % 32; if (arg) regmap_write(info->map, diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c index 37c2bf752154d4..e8b481e87c7792 100644 --- a/drivers/pinctrl/pinctrl-pic32.c +++ b/drivers/pinctrl/pinctrl-pic32.c @@ -1905,7 +1905,7 @@ static int pic32_pinconf_get(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: arg = !!(readl(bank->reg_base + TRIS_REG) & mask); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = !(readl(bank->reg_base + TRIS_REG) & mask); break; default: @@ -1960,7 +1960,7 @@ static int pic32_pinconf_set(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: pic32_gpio_direction_input(&bank->gpio_chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pic32_gpio_direction_output(&bank->gpio_chip, offset, arg); break; diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c index 3acf770316c1aa..22f576337faa9c 100644 --- a/drivers/pinctrl/pinctrl-rk805.c +++ b/drivers/pinctrl/pinctrl-rk805.c @@ -541,7 +541,7 @@ static int rk805_pinconf_get(struct pinctrl_dev *pctldev, u32 arg = 0; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: arg = rk805_gpio_get(&pci->gpio_chip, pin); break; @@ -568,7 +568,7 @@ static int rk805_pinconf_set(struct pinctrl_dev *pctldev, arg = pinconf_to_config_argument(configs[i]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rk805_gpio_set(&pci->gpio_chip, pin, arg); rk805_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 930c454e0cec7d..7a68a6237649c1 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3272,7 +3272,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, param = pinconf_to_config_param(configs[i]); arg = pinconf_to_config_argument(configs[i]); - if (param == PIN_CONFIG_OUTPUT || param == PIN_CONFIG_INPUT_ENABLE) { + if (param == PIN_CONFIG_LEVEL || param == PIN_CONFIG_INPUT_ENABLE) { /* * Check for gpio driver not being probed yet. 
* The lock makes sure that either gpio-probe has completed @@ -3313,7 +3313,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc) return rc; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_set_mux(bank, pin - bank->pin_base, RK_FUNC_GPIO); if (rc != RK_FUNC_GPIO) @@ -3392,7 +3392,7 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_get_mux(bank, pin - bank->pin_base); if (rc != RK_FUNC_GPIO) return -EINVAL; diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index dadafc935dbb28..ffc2f0b460a641 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1440,7 +1440,7 @@ static int rp1_pinconf_set(struct pinctrl_dev *pctldev, unsigned int offset, rp1_output_enable(pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rp1_set_value(pin, arg); rp1_set_dir(pin, RP1_DIR_OUTPUT); rp1_set_fsel(pin, RP1_FSEL_GPIO); @@ -1623,12 +1623,94 @@ MODULE_DEVICE_TABLE(of, rp1_pinctrl_match); static struct rp1_pinctrl rp1_pinctrl_data = {}; -static const struct regmap_config rp1_pinctrl_regmap_cfg = { +static const struct regmap_range rp1_gpio_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x2004, 0x20dc), + regmap_reg_range(0x3004, 0x30dc), + regmap_reg_range(0x0004, 0x00dc), + regmap_reg_range(0x0124, 0x0124), + regmap_reg_range(0x211c, 0x211c), + regmap_reg_range(0x311c, 0x311c), + /* BANK 1 */ + regmap_reg_range(0x6004, 0x602c), + regmap_reg_range(0x7004, 0x702c), + regmap_reg_range(0x4004, 0x402c), + regmap_reg_range(0x4124, 0x4124), + regmap_reg_range(0x611c, 0x611c), + regmap_reg_range(0x711c, 0x711c), + /* BANK 2 */ + regmap_reg_range(0xa004, 0xa09c), + regmap_reg_range(0xb004, 0xb09c), + regmap_reg_range(0x8004, 0x809c), + regmap_reg_range(0x8124, 0x8124), + regmap_reg_range(0xa11c, 0xa11c), + regmap_reg_range(0xb11c, 0xb11c), +}; + +static const struct regmap_range rp1_rio_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x2000, 0x2004), + regmap_reg_range(0x3000, 0x3004), + regmap_reg_range(0x0004, 0x0008), + /* BANK 1 */ + regmap_reg_range(0x6000, 0x6004), + regmap_reg_range(0x7000, 0x7004), + regmap_reg_range(0x4004, 0x4008), + /* BANK 2 */ + regmap_reg_range(0xa000, 0xa004), + regmap_reg_range(0xb000, 0xb004), + regmap_reg_range(0x8004, 0x8008), +}; + +static const struct regmap_range rp1_pads_reg_ranges[] = { + /* BANK 0 */ + regmap_reg_range(0x0004, 0x0070), + /* BANK 1 */ + regmap_reg_range(0x4004, 0x4018), + /* BANK 2 */ + regmap_reg_range(0x8004, 0x8050), +}; + +static const struct regmap_access_table rp1_gpio_reg_table = { + .yes_ranges = rp1_gpio_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(rp1_gpio_reg_ranges), +}; + +static const struct regmap_access_table rp1_rio_reg_table = { + .yes_ranges = rp1_rio_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(rp1_rio_reg_ranges), +}; + +static const struct regmap_access_table rp1_pads_reg_table = { + .yes_ranges = rp1_pads_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(rp1_pads_reg_ranges), +}; + +static const struct regmap_config rp1_pinctrl_gpio_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .rd_table = &rp1_gpio_reg_table, + .name = "rp1-gpio", + .max_register = 0xb11c, +}; + +static const struct regmap_config rp1_pinctrl_rio_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .rd_table = &rp1_rio_reg_table, + .name = "rp1-rio", + .max_register = 0xb004, +}; + +static const 
struct regmap_config rp1_pinctrl_pads_regmap_cfg = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, - .fast_io = true, - .name = "rp1-pinctrl", + .rd_table = &rp1_pads_reg_table, + .name = "rp1-pads", + .max_register = 0x8050, }; static int rp1_gen_regfield(struct device *dev, @@ -1685,17 +1767,17 @@ static int rp1_pinctrl_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(pc->pads_base), "could not get PADS IO memory\n"); gpio_regmap = devm_regmap_init_mmio(dev, pc->gpio_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_gpio_regmap_cfg); if (IS_ERR(gpio_regmap)) return dev_err_probe(dev, PTR_ERR(gpio_regmap), "could not init GPIO regmap\n"); rio_regmap = devm_regmap_init_mmio(dev, pc->rio_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_rio_regmap_cfg); if (IS_ERR(rio_regmap)) return dev_err_probe(dev, PTR_ERR(rio_regmap), "could not init RIO regmap\n"); pads_regmap = devm_regmap_init_mmio(dev, pc->pads_base, - &rp1_pinctrl_regmap_cfg); + &rp1_pinctrl_pads_regmap_cfg); if (IS_ERR(pads_regmap)) return dev_err_probe(dev, PTR_ERR(pads_regmap), "could not init PADS regmap\n"); diff --git a/drivers/pinctrl/pinctrl-scmi.c b/drivers/pinctrl/pinctrl-scmi.c index 383681041e4c05..d14528b9aa31ef 100644 --- a/drivers/pinctrl/pinctrl-scmi.c +++ b/drivers/pinctrl/pinctrl-scmi.c @@ -253,7 +253,7 @@ static int pinctrl_scmi_map_pinconf_type(enum pin_config_param param, case PIN_CONFIG_MODE_LOW_POWER: *type = SCMI_PIN_LOW_POWER_MODE; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: *type = SCMI_PIN_OUTPUT_VALUE; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 5cda6201b60f53..6d580aa282ec98 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -336,7 +336,7 @@ static int pcs_get_function(struct pinctrl_dev *pctldev, unsigned pin, struct pcs_device *pcs = pinctrl_dev_get_drvdata(pctldev); struct pin_desc *pdesc = pin_desc_get(pctldev, pin); const struct pinctrl_setting_mux *setting; - struct function_desc *function; + const struct function_desc *function; unsigned fselector; /* If pin is not described in DTS & enabled, mux_setting is NULL. 
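 * Guard against that before using it to look up the selected function,
 * as pins that are simply left out of the DT have no mux setting.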
*/ @@ -360,7 +360,7 @@ static int pcs_set_mux(struct pinctrl_dev *pctldev, unsigned fselector, unsigned group) { struct pcs_device *pcs; - struct function_desc *function; + const struct function_desc *function; struct pcs_function *func; int i; @@ -589,8 +589,10 @@ static int pcs_pinconf_set(struct pinctrl_dev *pctldev, /* 4 parameters */ case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: - if (arg) + if (arg) { pcs_pinconf_clear_bias(pctldev, pin); + data = pcs->read(pcs->base + offset); + } fallthrough; case PIN_CONFIG_INPUT_SCHMITT_ENABLE: data &= ~func->conf[i].mask; diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index c89b99003b7111..03ee13844b5073 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -267,7 +267,7 @@ static int stmfx_pinconf_get(struct pinctrl_dev *pctldev, if ((!dir && !type) || (dir && type)) arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (dir) return -EINVAL; @@ -334,7 +334,7 @@ static int stmfx_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (ret) return ret; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = stmfx_gpio_direction_output(&pctl->gpio_chip, pin, arg); if (ret) diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index 53cf8168b274c5..1d6760ffe809a2 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -611,7 +611,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (sx150x_pin_is_oscio(pctl, pin)) { switch (param) { case PIN_CONFIG_DRIVE_PUSH_PULL: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = regmap_read(pctl->regmap, pctl->data->pri.x789.reg_clock, &data); @@ -705,7 +705,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_get_direction(&pctl->gpio, pin); if (ret < 0) return ret; @@ -744,7 +744,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[i]); if (sx150x_pin_is_oscio(pctl, pin)) { - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) @@ -816,7 +816,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) @@ -863,6 +863,7 @@ static const struct of_device_id sx150x_of_match[] = { { .compatible = "semtech,sx1509q", .data = &sx1509q_device_data }, {}, }; +MODULE_DEVICE_TABLE(of, sx150x_of_match); static int sx150x_reset(struct sx150x_pinctrl *pctl) { @@ -1266,3 +1267,6 @@ static int __init sx150x_init(void) return i2c_add_driver(&sx150x_driver); } subsys_initcall(sx150x_init); + +MODULE_DESCRIPTION("Semtech SX150x I2C GPIO expander pinctrl driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pinctrl/pinctrl-upboard.c b/drivers/pinctrl/pinctrl-upboard.c new file mode 100644 index 00000000000000..f8c8b9d8499003 --- /dev/null +++ b/drivers/pinctrl/pinctrl-upboard.c @@ -0,0 +1,1070 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * UP board pin control driver. 
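+ *
+ * The on-board FPGA exposes per-pin function, GPIO-enable and direction
+ * register fields (see struct upboard_pin below); muxing a pin group
+ * means programming those fields for every pin in the group.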
+ * + * Copyright (C) 2025 Bootlin + * + * Author: Thomas Richard + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "core.h" +#include "pinmux.h" + +enum upboard_pin_mode { + UPBOARD_PIN_MODE_FUNCTION, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_DISABLED, +}; + +struct upboard_pin { + struct regmap_field *funcbit; + struct regmap_field *enbit; + struct regmap_field *dirbit; +}; + +struct upboard_pingroup { + struct pingroup grp; + enum upboard_pin_mode mode; + const enum upboard_pin_mode *modes; +}; + +struct upboard_pinctrl_data { + const struct upboard_pingroup *groups; + size_t ngroups; + const struct pinfunction *funcs; + size_t nfuncs; + const unsigned int *pin_header; + size_t ngpio; +}; + +struct upboard_pinctrl { + struct device *dev; + struct pinctrl_dev *pctldev; + const struct upboard_pinctrl_data *pctrl_data; + struct gpio_pin_range pin_range; + struct upboard_pin *pins; +}; + +struct upboard_pinctrl_map { + const struct pinctrl_map *maps; + size_t nmaps; +}; + +enum upboard_func0_fpgabit { + UPBOARD_FUNC_I2C0_EN = 8, + UPBOARD_FUNC_I2C1_EN = 9, + UPBOARD_FUNC_CEC0_EN = 12, + UPBOARD_FUNC_ADC0_EN = 14, +}; + +static const struct reg_field upboard_i2c0_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_I2C0_EN, UPBOARD_FUNC_I2C0_EN); + +static const struct reg_field upboard_i2c1_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_I2C1_EN, UPBOARD_FUNC_I2C1_EN); + +static const struct reg_field upboard_adc0_reg = + REG_FIELD(UPBOARD_REG_FUNC_EN0, UPBOARD_FUNC_ADC0_EN, UPBOARD_FUNC_ADC0_EN); + +#define UPBOARD_UP_BIT_TO_PIN(bit) UPBOARD_UP_BIT_##bit + +#define UPBOARD_UP_PIN_NAME(id) \ + { \ + .number = UPBOARD_UP_BIT_##id, \ + .name = #id, \ + } + +#define UPBOARD_UP_PIN_MUX(bit, data) \ + { \ + .number = UPBOARD_UP_BIT_##bit, \ + .name = "PINMUX_"#bit, \ + .drv_data = (void *)(data), \ + } + +#define UPBOARD_UP_PIN_FUNC(id, data) \ + { \ + .number = UPBOARD_UP_BIT_##id, \ + .name = #id, \ + .drv_data = (void *)(data), \ + } + +enum upboard_up_fpgabit { + UPBOARD_UP_BIT_I2C1_SDA, + UPBOARD_UP_BIT_I2C1_SCL, + UPBOARD_UP_BIT_ADC0, + UPBOARD_UP_BIT_UART1_RTS, + UPBOARD_UP_BIT_GPIO27, + UPBOARD_UP_BIT_GPIO22, + UPBOARD_UP_BIT_SPI_MOSI, + UPBOARD_UP_BIT_SPI_MISO, + UPBOARD_UP_BIT_SPI_CLK, + UPBOARD_UP_BIT_I2C0_SDA, + UPBOARD_UP_BIT_GPIO5, + UPBOARD_UP_BIT_GPIO6, + UPBOARD_UP_BIT_PWM1, + UPBOARD_UP_BIT_I2S_FRM, + UPBOARD_UP_BIT_GPIO26, + UPBOARD_UP_BIT_UART1_TX, + UPBOARD_UP_BIT_UART1_RX, + UPBOARD_UP_BIT_I2S_CLK, + UPBOARD_UP_BIT_GPIO23, + UPBOARD_UP_BIT_GPIO24, + UPBOARD_UP_BIT_GPIO25, + UPBOARD_UP_BIT_SPI_CS0, + UPBOARD_UP_BIT_SPI_CS1, + UPBOARD_UP_BIT_I2C0_SCL, + UPBOARD_UP_BIT_PWM0, + UPBOARD_UP_BIT_UART1_CTS, + UPBOARD_UP_BIT_I2S_DIN, + UPBOARD_UP_BIT_I2S_DOUT, +}; + +static const struct pinctrl_pin_desc upboard_up_pins[] = { + UPBOARD_UP_PIN_FUNC(I2C1_SDA, &upboard_i2c1_reg), + UPBOARD_UP_PIN_FUNC(I2C1_SCL, &upboard_i2c1_reg), + UPBOARD_UP_PIN_FUNC(ADC0, &upboard_adc0_reg), + UPBOARD_UP_PIN_NAME(UART1_RTS), + UPBOARD_UP_PIN_NAME(GPIO27), + UPBOARD_UP_PIN_NAME(GPIO22), + UPBOARD_UP_PIN_NAME(SPI_MOSI), + UPBOARD_UP_PIN_NAME(SPI_MISO), + UPBOARD_UP_PIN_NAME(SPI_CLK), + UPBOARD_UP_PIN_FUNC(I2C0_SDA, &upboard_i2c0_reg), + UPBOARD_UP_PIN_NAME(GPIO5), + UPBOARD_UP_PIN_NAME(GPIO6), + UPBOARD_UP_PIN_NAME(PWM1), + UPBOARD_UP_PIN_NAME(I2S_FRM), + UPBOARD_UP_PIN_NAME(GPIO26), + 
UPBOARD_UP_PIN_NAME(UART1_TX), + UPBOARD_UP_PIN_NAME(UART1_RX), + UPBOARD_UP_PIN_NAME(I2S_CLK), + UPBOARD_UP_PIN_NAME(GPIO23), + UPBOARD_UP_PIN_NAME(GPIO24), + UPBOARD_UP_PIN_NAME(GPIO25), + UPBOARD_UP_PIN_NAME(SPI_CS0), + UPBOARD_UP_PIN_NAME(SPI_CS1), + UPBOARD_UP_PIN_FUNC(I2C0_SCL, &upboard_i2c0_reg), + UPBOARD_UP_PIN_NAME(PWM0), + UPBOARD_UP_PIN_NAME(UART1_CTS), + UPBOARD_UP_PIN_NAME(I2S_DIN), + UPBOARD_UP_PIN_NAME(I2S_DOUT), +}; + +static const unsigned int upboard_up_pin_header[] = { + UPBOARD_UP_BIT_TO_PIN(I2C0_SDA), + UPBOARD_UP_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C1_SDA), + UPBOARD_UP_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP_BIT_TO_PIN(ADC0), + UPBOARD_UP_BIT_TO_PIN(GPIO5), + UPBOARD_UP_BIT_TO_PIN(GPIO6), + UPBOARD_UP_BIT_TO_PIN(SPI_CS1), + UPBOARD_UP_BIT_TO_PIN(SPI_CS0), + UPBOARD_UP_BIT_TO_PIN(SPI_MISO), + UPBOARD_UP_BIT_TO_PIN(SPI_MOSI), + UPBOARD_UP_BIT_TO_PIN(SPI_CLK), + UPBOARD_UP_BIT_TO_PIN(PWM0), + UPBOARD_UP_BIT_TO_PIN(PWM1), + UPBOARD_UP_BIT_TO_PIN(UART1_TX), + UPBOARD_UP_BIT_TO_PIN(UART1_RX), + UPBOARD_UP_BIT_TO_PIN(UART1_CTS), + UPBOARD_UP_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP_BIT_TO_PIN(I2S_CLK), + UPBOARD_UP_BIT_TO_PIN(I2S_FRM), + UPBOARD_UP_BIT_TO_PIN(I2S_DIN), + UPBOARD_UP_BIT_TO_PIN(I2S_DOUT), + UPBOARD_UP_BIT_TO_PIN(GPIO22), + UPBOARD_UP_BIT_TO_PIN(GPIO23), + UPBOARD_UP_BIT_TO_PIN(GPIO24), + UPBOARD_UP_BIT_TO_PIN(GPIO25), + UPBOARD_UP_BIT_TO_PIN(GPIO26), + UPBOARD_UP_BIT_TO_PIN(GPIO27), +}; + +static const unsigned int upboard_up_uart1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(UART1_TX), + UPBOARD_UP_BIT_TO_PIN(UART1_RX), + UPBOARD_UP_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP_BIT_TO_PIN(UART1_CTS), +}; + +static const enum upboard_pin_mode upboard_up_uart1_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, +}; + +static_assert(ARRAY_SIZE(upboard_up_uart1_modes) == ARRAY_SIZE(upboard_up_uart1_pins)); + +static const unsigned int upboard_up_i2c0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C0_SDA), +}; + +static const unsigned int upboard_up_i2c1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP_BIT_TO_PIN(I2C1_SDA), +}; + +static const unsigned int upboard_up_spi2_pins[] = { + UPBOARD_UP_BIT_TO_PIN(SPI_MOSI), + UPBOARD_UP_BIT_TO_PIN(SPI_MISO), + UPBOARD_UP_BIT_TO_PIN(SPI_CLK), + UPBOARD_UP_BIT_TO_PIN(SPI_CS0), + UPBOARD_UP_BIT_TO_PIN(SPI_CS1), +}; + +static const enum upboard_pin_mode upboard_up_spi2_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up_spi2_modes) == ARRAY_SIZE(upboard_up_spi2_pins)); + +static const unsigned int upboard_up_i2s0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(I2S_FRM), + UPBOARD_UP_BIT_TO_PIN(I2S_CLK), + UPBOARD_UP_BIT_TO_PIN(I2S_DIN), + UPBOARD_UP_BIT_TO_PIN(I2S_DOUT), +}; + +static const enum upboard_pin_mode upboard_up_i2s0_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up_i2s0_pins) == ARRAY_SIZE(upboard_up_i2s0_modes)); + +static const unsigned int upboard_up_pwm0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(PWM0), +}; + +static const unsigned int upboard_up_pwm1_pins[] = { + UPBOARD_UP_BIT_TO_PIN(PWM1), +}; + +static const unsigned int upboard_up_adc0_pins[] = { + UPBOARD_UP_BIT_TO_PIN(ADC0), +}; + +#define UPBOARD_PINGROUP(n, p, m) \ +{ \ + .grp = PINCTRL_PINGROUP(n, p, ARRAY_SIZE(p)), 
\ + .mode = __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(m), const enum upboard_pin_mode *), \ + 0, m), \ + .modes = __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(m), const enum upboard_pin_mode *), \ + m, NULL), \ +} + +static const struct upboard_pingroup upboard_up_pin_groups[] = { + UPBOARD_PINGROUP("uart1_grp", upboard_up_uart1_pins, &upboard_up_uart1_modes[0]), + UPBOARD_PINGROUP("i2c0_grp", upboard_up_i2c0_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("i2c1_grp", upboard_up_i2c1_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("spi2_grp", upboard_up_spi2_pins, &upboard_up_spi2_modes[0]), + UPBOARD_PINGROUP("i2s0_grp", upboard_up_i2s0_pins, &upboard_up_i2s0_modes[0]), + UPBOARD_PINGROUP("pwm0_grp", upboard_up_pwm0_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("pwm1_grp", upboard_up_pwm1_pins, UPBOARD_PIN_MODE_GPIO_OUT), + UPBOARD_PINGROUP("adc0_grp", upboard_up_adc0_pins, UPBOARD_PIN_MODE_GPIO_IN), +}; + +static const char * const upboard_up_uart1_groups[] = { "uart1_grp" }; +static const char * const upboard_up_i2c0_groups[] = { "i2c0_grp" }; +static const char * const upboard_up_i2c1_groups[] = { "i2c1_grp" }; +static const char * const upboard_up_spi2_groups[] = { "spi2_grp" }; +static const char * const upboard_up_i2s0_groups[] = { "i2s0_grp" }; +static const char * const upboard_up_pwm0_groups[] = { "pwm0_grp" }; +static const char * const upboard_up_pwm1_groups[] = { "pwm1_grp" }; +static const char * const upboard_up_adc0_groups[] = { "adc0_grp" }; + +#define UPBOARD_FUNCTION(func, groups) PINCTRL_PINFUNCTION(func, groups, ARRAY_SIZE(groups)) + +static const struct pinfunction upboard_up_pin_functions[] = { + UPBOARD_FUNCTION("uart1", upboard_up_uart1_groups), + UPBOARD_FUNCTION("i2c0", upboard_up_i2c0_groups), + UPBOARD_FUNCTION("i2c1", upboard_up_i2c1_groups), + UPBOARD_FUNCTION("spi2", upboard_up_spi2_groups), + UPBOARD_FUNCTION("i2s0", upboard_up_i2s0_groups), + UPBOARD_FUNCTION("pwm0", upboard_up_pwm0_groups), + UPBOARD_FUNCTION("pwm1", upboard_up_pwm1_groups), + UPBOARD_FUNCTION("adc0", upboard_up_adc0_groups), +}; + +static const struct upboard_pinctrl_data upboard_up_pinctrl_data = { + .groups = &upboard_up_pin_groups[0], + .ngroups = ARRAY_SIZE(upboard_up_pin_groups), + .funcs = &upboard_up_pin_functions[0], + .nfuncs = ARRAY_SIZE(upboard_up_pin_functions), + .pin_header = &upboard_up_pin_header[0], + .ngpio = ARRAY_SIZE(upboard_up_pin_header), +}; + +#define UPBOARD_UP2_BIT_TO_PIN(bit) UPBOARD_UP2_BIT_##bit + +#define UPBOARD_UP2_PIN_NAME(id) \ + { \ + .number = UPBOARD_UP2_BIT_##id, \ + .name = #id, \ + } + +#define UPBOARD_UP2_PIN_MUX(bit, data) \ + { \ + .number = UPBOARD_UP2_BIT_##bit, \ + .name = "PINMUX_"#bit, \ + .drv_data = (void *)(data), \ + } + +#define UPBOARD_UP2_PIN_FUNC(id, data) \ + { \ + .number = UPBOARD_UP2_BIT_##id, \ + .name = #id, \ + .drv_data = (void *)(data), \ + } + +enum upboard_up2_fpgabit { + UPBOARD_UP2_BIT_UART1_TXD, + UPBOARD_UP2_BIT_UART1_RXD, + UPBOARD_UP2_BIT_UART1_RTS, + UPBOARD_UP2_BIT_UART1_CTS, + UPBOARD_UP2_BIT_GPIO3_ADC0, + UPBOARD_UP2_BIT_GPIO5_ADC2, + UPBOARD_UP2_BIT_GPIO6_ADC3, + UPBOARD_UP2_BIT_GPIO11, + UPBOARD_UP2_BIT_EXHAT_LVDS1n, + UPBOARD_UP2_BIT_EXHAT_LVDS1p, + UPBOARD_UP2_BIT_SPI2_TXD, + UPBOARD_UP2_BIT_SPI2_RXD, + UPBOARD_UP2_BIT_SPI2_FS1, + UPBOARD_UP2_BIT_SPI2_FS0, + UPBOARD_UP2_BIT_SPI2_CLK, + UPBOARD_UP2_BIT_SPI1_TXD, + UPBOARD_UP2_BIT_SPI1_RXD, + UPBOARD_UP2_BIT_SPI1_FS1, + UPBOARD_UP2_BIT_SPI1_FS0, + UPBOARD_UP2_BIT_SPI1_CLK, + UPBOARD_UP2_BIT_I2C0_SCL, + 
UPBOARD_UP2_BIT_I2C0_SDA, + UPBOARD_UP2_BIT_I2C1_SCL, + UPBOARD_UP2_BIT_I2C1_SDA, + UPBOARD_UP2_BIT_PWM1, + UPBOARD_UP2_BIT_PWM0, + UPBOARD_UP2_BIT_EXHAT_LVDS0n, + UPBOARD_UP2_BIT_EXHAT_LVDS0p, + UPBOARD_UP2_BIT_GPIO24, + UPBOARD_UP2_BIT_GPIO10, + UPBOARD_UP2_BIT_GPIO2, + UPBOARD_UP2_BIT_GPIO1, + UPBOARD_UP2_BIT_EXHAT_LVDS3n, + UPBOARD_UP2_BIT_EXHAT_LVDS3p, + UPBOARD_UP2_BIT_EXHAT_LVDS4n, + UPBOARD_UP2_BIT_EXHAT_LVDS4p, + UPBOARD_UP2_BIT_EXHAT_LVDS5n, + UPBOARD_UP2_BIT_EXHAT_LVDS5p, + UPBOARD_UP2_BIT_I2S_SDO, + UPBOARD_UP2_BIT_I2S_SDI, + UPBOARD_UP2_BIT_I2S_WS_SYNC, + UPBOARD_UP2_BIT_I2S_BCLK, + UPBOARD_UP2_BIT_EXHAT_LVDS6n, + UPBOARD_UP2_BIT_EXHAT_LVDS6p, + UPBOARD_UP2_BIT_EXHAT_LVDS7n, + UPBOARD_UP2_BIT_EXHAT_LVDS7p, + UPBOARD_UP2_BIT_EXHAT_LVDS2n, + UPBOARD_UP2_BIT_EXHAT_LVDS2p, +}; + +static const struct pinctrl_pin_desc upboard_up2_pins[] = { + UPBOARD_UP2_PIN_NAME(UART1_TXD), + UPBOARD_UP2_PIN_NAME(UART1_RXD), + UPBOARD_UP2_PIN_NAME(UART1_RTS), + UPBOARD_UP2_PIN_NAME(UART1_CTS), + UPBOARD_UP2_PIN_NAME(GPIO3_ADC0), + UPBOARD_UP2_PIN_NAME(GPIO5_ADC2), + UPBOARD_UP2_PIN_NAME(GPIO6_ADC3), + UPBOARD_UP2_PIN_NAME(GPIO11), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS1n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS1p), + UPBOARD_UP2_PIN_NAME(SPI2_TXD), + UPBOARD_UP2_PIN_NAME(SPI2_RXD), + UPBOARD_UP2_PIN_NAME(SPI2_FS1), + UPBOARD_UP2_PIN_NAME(SPI2_FS0), + UPBOARD_UP2_PIN_NAME(SPI2_CLK), + UPBOARD_UP2_PIN_NAME(SPI1_TXD), + UPBOARD_UP2_PIN_NAME(SPI1_RXD), + UPBOARD_UP2_PIN_NAME(SPI1_FS1), + UPBOARD_UP2_PIN_NAME(SPI1_FS0), + UPBOARD_UP2_PIN_NAME(SPI1_CLK), + UPBOARD_UP2_PIN_MUX(I2C0_SCL, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(I2C0_SDA, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(I2C1_SCL, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_MUX(I2C1_SDA, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_NAME(PWM1), + UPBOARD_UP2_PIN_NAME(PWM0), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS0n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS0p), + UPBOARD_UP2_PIN_MUX(GPIO24, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(GPIO10, &upboard_i2c0_reg), + UPBOARD_UP2_PIN_MUX(GPIO2, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_MUX(GPIO1, &upboard_i2c1_reg), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS3n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS3p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS4n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS4p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS5n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS5p), + UPBOARD_UP2_PIN_NAME(I2S_SDO), + UPBOARD_UP2_PIN_NAME(I2S_SDI), + UPBOARD_UP2_PIN_NAME(I2S_WS_SYNC), + UPBOARD_UP2_PIN_NAME(I2S_BCLK), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS6n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS6p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS7n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS7p), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS2n), + UPBOARD_UP2_PIN_NAME(EXHAT_LVDS2p), +}; + +static const unsigned int upboard_up2_pin_header[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO10), + UPBOARD_UP2_BIT_TO_PIN(GPIO24), + UPBOARD_UP2_BIT_TO_PIN(GPIO1), + UPBOARD_UP2_BIT_TO_PIN(GPIO2), + UPBOARD_UP2_BIT_TO_PIN(GPIO3_ADC0), + UPBOARD_UP2_BIT_TO_PIN(GPIO11), + UPBOARD_UP2_BIT_TO_PIN(SPI2_CLK), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI1_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_CLK), + UPBOARD_UP2_BIT_TO_PIN(PWM0), + UPBOARD_UP2_BIT_TO_PIN(PWM1), + UPBOARD_UP2_BIT_TO_PIN(UART1_TXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_CTS), + UPBOARD_UP2_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP2_BIT_TO_PIN(I2S_BCLK), + UPBOARD_UP2_BIT_TO_PIN(I2S_WS_SYNC), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDI), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDO), + 
UPBOARD_UP2_BIT_TO_PIN(GPIO6_ADC3), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI2_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS0), + UPBOARD_UP2_BIT_TO_PIN(GPIO5_ADC2), +}; + +static const unsigned int upboard_up2_uart1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(UART1_TXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RXD), + UPBOARD_UP2_BIT_TO_PIN(UART1_RTS), + UPBOARD_UP2_BIT_TO_PIN(UART1_CTS), +}; + +static const enum upboard_pin_mode upboard_up2_uart1_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, +}; + +static_assert(ARRAY_SIZE(upboard_up2_uart1_modes) == ARRAY_SIZE(upboard_up2_uart1_pins)); + +static const unsigned int upboard_up2_i2c0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2C0_SCL), + UPBOARD_UP2_BIT_TO_PIN(I2C0_SDA), + UPBOARD_UP2_BIT_TO_PIN(GPIO24), + UPBOARD_UP2_BIT_TO_PIN(GPIO10), +}; + +static const unsigned int upboard_up2_i2c1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2C1_SCL), + UPBOARD_UP2_BIT_TO_PIN(I2C1_SDA), + UPBOARD_UP2_BIT_TO_PIN(GPIO2), + UPBOARD_UP2_BIT_TO_PIN(GPIO1), +}; + +static const unsigned int upboard_up2_spi1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(SPI1_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI1_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI1_CLK), +}; + +static const unsigned int upboard_up2_spi2_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(SPI2_TXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_RXD), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS1), + UPBOARD_UP2_BIT_TO_PIN(SPI2_FS0), + UPBOARD_UP2_BIT_TO_PIN(SPI2_CLK), +}; + +static const enum upboard_pin_mode upboard_up2_spi_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up2_spi_modes) == ARRAY_SIZE(upboard_up2_spi1_pins)); + +static_assert(ARRAY_SIZE(upboard_up2_spi_modes) == ARRAY_SIZE(upboard_up2_spi2_pins)); + +static const unsigned int upboard_up2_i2s0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(I2S_BCLK), + UPBOARD_UP2_BIT_TO_PIN(I2S_WS_SYNC), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDI), + UPBOARD_UP2_BIT_TO_PIN(I2S_SDO), +}; + +static const enum upboard_pin_mode upboard_up2_i2s0_modes[] = { + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_OUT, + UPBOARD_PIN_MODE_GPIO_IN, + UPBOARD_PIN_MODE_GPIO_OUT, +}; + +static_assert(ARRAY_SIZE(upboard_up2_i2s0_modes) == ARRAY_SIZE(upboard_up2_i2s0_pins)); + +static const unsigned int upboard_up2_pwm0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(PWM0), +}; + +static const unsigned int upboard_up2_pwm1_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(PWM1), +}; + +static const unsigned int upboard_up2_adc0_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO3_ADC0), +}; + +static const unsigned int upboard_up2_adc2_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO5_ADC2), +}; + +static const unsigned int upboard_up2_adc3_pins[] = { + UPBOARD_UP2_BIT_TO_PIN(GPIO6_ADC3), +}; + +static const struct upboard_pingroup upboard_up2_pin_groups[] = { + UPBOARD_PINGROUP("uart1_grp", upboard_up2_uart1_pins, &upboard_up2_uart1_modes[0]), + UPBOARD_PINGROUP("i2c0_grp", upboard_up2_i2c0_pins, UPBOARD_PIN_MODE_FUNCTION), + UPBOARD_PINGROUP("i2c1_grp", upboard_up2_i2c1_pins, UPBOARD_PIN_MODE_FUNCTION), + UPBOARD_PINGROUP("spi1_grp", upboard_up2_spi1_pins, &upboard_up2_spi_modes[0]), + UPBOARD_PINGROUP("spi2_grp", upboard_up2_spi2_pins, &upboard_up2_spi_modes[0]), + UPBOARD_PINGROUP("i2s0_grp", upboard_up2_i2s0_pins, &upboard_up2_i2s0_modes[0]), + UPBOARD_PINGROUP("pwm0_grp", 
upboard_up2_pwm0_pins, UPBOARD_PIN_MODE_GPIO_OUT),
+	UPBOARD_PINGROUP("pwm1_grp", upboard_up2_pwm1_pins, UPBOARD_PIN_MODE_GPIO_OUT),
+	UPBOARD_PINGROUP("adc0_grp", upboard_up2_adc0_pins, UPBOARD_PIN_MODE_GPIO_IN),
+	UPBOARD_PINGROUP("adc2_grp", upboard_up2_adc2_pins, UPBOARD_PIN_MODE_GPIO_IN),
+	UPBOARD_PINGROUP("adc3_grp", upboard_up2_adc3_pins, UPBOARD_PIN_MODE_GPIO_IN),
+};
+
+static const char * const upboard_up2_uart1_groups[] = { "uart1_grp" };
+static const char * const upboard_up2_i2c0_groups[] = { "i2c0_grp" };
+static const char * const upboard_up2_i2c1_groups[] = { "i2c1_grp" };
+static const char * const upboard_up2_spi1_groups[] = { "spi1_grp" };
+static const char * const upboard_up2_spi2_groups[] = { "spi2_grp" };
+static const char * const upboard_up2_i2s0_groups[] = { "i2s0_grp" };
+static const char * const upboard_up2_pwm0_groups[] = { "pwm0_grp" };
+static const char * const upboard_up2_pwm1_groups[] = { "pwm1_grp" };
+static const char * const upboard_up2_adc0_groups[] = { "adc0_grp" };
+static const char * const upboard_up2_adc2_groups[] = { "adc2_grp" };
+static const char * const upboard_up2_adc3_groups[] = { "adc3_grp" };
+
+static const struct pinfunction upboard_up2_pin_functions[] = {
+	UPBOARD_FUNCTION("uart1", upboard_up2_uart1_groups),
+	UPBOARD_FUNCTION("i2c0", upboard_up2_i2c0_groups),
+	UPBOARD_FUNCTION("i2c1", upboard_up2_i2c1_groups),
+	UPBOARD_FUNCTION("spi1", upboard_up2_spi1_groups),
+	UPBOARD_FUNCTION("spi2", upboard_up2_spi2_groups),
+	UPBOARD_FUNCTION("i2s0", upboard_up2_i2s0_groups),
+	UPBOARD_FUNCTION("pwm0", upboard_up2_pwm0_groups),
+	UPBOARD_FUNCTION("pwm1", upboard_up2_pwm1_groups),
+	UPBOARD_FUNCTION("adc0", upboard_up2_adc0_groups),
+	UPBOARD_FUNCTION("adc2", upboard_up2_adc2_groups),
+	UPBOARD_FUNCTION("adc3", upboard_up2_adc3_groups),
+};
+
+static const struct upboard_pinctrl_data upboard_up2_pinctrl_data = {
+	.groups = &upboard_up2_pin_groups[0],
+	.ngroups = ARRAY_SIZE(upboard_up2_pin_groups),
+	.funcs = &upboard_up2_pin_functions[0],
+	.nfuncs = ARRAY_SIZE(upboard_up2_pin_functions),
+	.pin_header = &upboard_up2_pin_header[0],
+	.ngpio = ARRAY_SIZE(upboard_up2_pin_header),
+};
+
+static int upboard_pinctrl_set_function(struct pinctrl_dev *pctldev, unsigned int offset)
+{
+	struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+	struct upboard_pin *p = &pctrl->pins[offset];
+	int ret;
+
+	if (!p->funcbit)
+		return -EPERM;
+
+	ret = regmap_field_write(p->enbit, 0);
+	if (ret)
+		return ret;
+
+	return regmap_field_write(p->funcbit, 1);
+}
+
+static int upboard_pinctrl_gpio_commit_enable(struct pinctrl_dev *pctldev, unsigned int offset)
+{
+	struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+	struct upboard_pin *p = &pctrl->pins[offset];
+	int ret;
+
+	if (p->funcbit) {
+		ret = regmap_field_write(p->funcbit, 0);
+		if (ret)
+			return ret;
+	}
+
+	return regmap_field_write(p->enbit, 1);
+}
+
+static int upboard_pinctrl_gpio_request_enable(struct pinctrl_dev *pctldev,
+					       struct pinctrl_gpio_range *range,
+					       unsigned int offset)
+{
+	return upboard_pinctrl_gpio_commit_enable(pctldev, offset);
+}
+
+static void upboard_pinctrl_gpio_commit_disable(struct pinctrl_dev *pctldev, unsigned int offset)
+{
+	struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev);
+	struct upboard_pin *p = &pctrl->pins[offset];
+
+	regmap_field_write(p->enbit, 0);
+}
+
+static void upboard_pinctrl_gpio_disable_free(struct pinctrl_dev *pctldev,
+					      struct pinctrl_gpio_range *range, unsigned int offset)
+{
+	
upboard_pinctrl_gpio_commit_disable(pctldev, offset); +} + +static int upboard_pinctrl_gpio_commit_direction(struct pinctrl_dev *pctldev, unsigned int offset, + bool input) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[offset]; + + return regmap_field_write(p->dirbit, input); +} + +static int upboard_pinctrl_gpio_set_direction(struct pinctrl_dev *pctldev, + struct pinctrl_gpio_range *range, + unsigned int offset, bool input) +{ + return upboard_pinctrl_gpio_commit_direction(pctldev, offset, input); +} + +static int upboard_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned int func_selector, + unsigned int group_selector) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + const struct upboard_pinctrl_data *pctrl_data = pctrl->pctrl_data; + const struct upboard_pingroup *upgroups = pctrl_data->groups; + struct group_desc *grp; + unsigned int mode, i; + int ret; + + grp = pinctrl_generic_get_group(pctldev, group_selector); + if (!grp) + return -EINVAL; + + for (i = 0; i < grp->grp.npins; i++) { + mode = upgroups[group_selector].mode ?: upgroups[group_selector].modes[i]; + if (mode == UPBOARD_PIN_MODE_FUNCTION) { + ret = upboard_pinctrl_set_function(pctldev, grp->grp.pins[i]); + if (ret) + return ret; + + continue; + } + + ret = upboard_pinctrl_gpio_commit_enable(pctldev, grp->grp.pins[i]); + if (ret) + return ret; + + ret = upboard_pinctrl_gpio_commit_direction(pctldev, grp->grp.pins[i], + mode == UPBOARD_PIN_MODE_GPIO_IN); + if (ret) + return ret; + } + + return 0; +} + +static const struct pinmux_ops upboard_pinmux_ops = { + .get_functions_count = pinmux_generic_get_function_count, + .get_function_name = pinmux_generic_get_function_name, + .get_function_groups = pinmux_generic_get_function_groups, + .set_mux = upboard_pinctrl_set_mux, + .gpio_request_enable = upboard_pinctrl_gpio_request_enable, + .gpio_disable_free = upboard_pinctrl_gpio_disable_free, + .gpio_set_direction = upboard_pinctrl_gpio_set_direction, +}; + +static int upboard_pinctrl_pin_get_mode(struct pinctrl_dev *pctldev, unsigned int pin) +{ + struct upboard_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct upboard_pin *p = &pctrl->pins[pin]; + unsigned int val; + int ret; + + if (p->funcbit) { + ret = regmap_field_read(p->funcbit, &val); + if (ret) + return ret; + if (val) + return UPBOARD_PIN_MODE_FUNCTION; + } + + ret = regmap_field_read(p->enbit, &val); + if (ret) + return ret; + if (!val) + return UPBOARD_PIN_MODE_DISABLED; + + ret = regmap_field_read(p->dirbit, &val); + if (ret) + return ret; + + return val ? 
UPBOARD_PIN_MODE_GPIO_IN : UPBOARD_PIN_MODE_GPIO_OUT;
+}
+
+static void upboard_pinctrl_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s,
+				     unsigned int offset)
+{
+	int ret;
+
+	ret = upboard_pinctrl_pin_get_mode(pctldev, offset);
+	if (ret == UPBOARD_PIN_MODE_FUNCTION)
+		seq_puts(s, "function mode ");
+	else if (ret == UPBOARD_PIN_MODE_DISABLED)
+		seq_puts(s, "HIGH-Z ");
+	else if (ret < 0)
+		seq_puts(s, "N/A ");
+	else
+		seq_printf(s, "GPIO (%s) ", str_input_output(ret == UPBOARD_PIN_MODE_GPIO_IN));
+}
+
+static const struct pinctrl_ops upboard_pinctrl_ops = {
+	.get_groups_count = pinctrl_generic_get_group_count,
+	.get_group_name = pinctrl_generic_get_group_name,
+	.get_group_pins = pinctrl_generic_get_group_pins,
+	.pin_dbg_show = upboard_pinctrl_dbg_show,
+};
+
+static int upboard_gpio_request(struct gpio_chip *gc, unsigned int offset)
+{
+	struct gpiochip_fwd *fwd = gpiochip_get_data(gc);
+	struct upboard_pinctrl *pctrl = gpiochip_fwd_get_data(fwd);
+	unsigned int pin = pctrl->pctrl_data->pin_header[offset];
+	struct gpio_desc *desc;
+	int ret;
+
+	ret = pinctrl_gpio_request(gc, offset);
+	if (ret)
+		return ret;
+
+	desc = gpiod_get_index(pctrl->dev, "external", pin, 0);
+	if (IS_ERR(desc)) {
+		pinctrl_gpio_free(gc, offset);
+		return PTR_ERR(desc);
+	}
+
+	return gpiochip_fwd_desc_add(fwd, desc, offset);
+}
+
+static void upboard_gpio_free(struct gpio_chip *gc, unsigned int offset)
+{
+	struct gpiochip_fwd *fwd = gpiochip_get_data(gc);
+
+	gpiochip_fwd_desc_free(fwd, offset);
+	pinctrl_gpio_free(gc, offset);
+}
+
+static int upboard_gpio_get_direction(struct gpio_chip *gc, unsigned int offset)
+{
+	struct gpiochip_fwd *fwd = gpiochip_get_data(gc);
+	struct upboard_pinctrl *pctrl = gpiochip_fwd_get_data(fwd);
+	unsigned int pin = pctrl->pctrl_data->pin_header[offset];
+	int mode;
+
+	/* Pins in function mode or high-Z are reported as inputs */
+	mode = upboard_pinctrl_pin_get_mode(pctrl->pctldev, pin);
+	if (mode < 0)
+		return mode;
+
+	if (mode == UPBOARD_PIN_MODE_GPIO_OUT)
+		return GPIO_LINE_DIRECTION_OUT;
+
+	return GPIO_LINE_DIRECTION_IN;
+}
+
+static int upboard_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
+{
+	struct gpiochip_fwd *fwd = gpiochip_get_data(gc);
+	int ret;
+
+	ret = pinctrl_gpio_direction_input(gc, offset);
+	if (ret)
+		return ret;
+
+	return gpiochip_fwd_gpio_direction_input(fwd, offset);
+}
+
+static int upboard_gpio_direction_output(struct gpio_chip *gc, unsigned int offset, int value)
+{
+	struct gpiochip_fwd *fwd = gpiochip_get_data(gc);
+	int ret;
+
+	ret = pinctrl_gpio_direction_output(gc, offset);
+	if (ret)
+		return ret;
+
+	return gpiochip_fwd_gpio_direction_output(fwd, offset, value);
+}
+
+static int upboard_pinctrl_register_groups(struct upboard_pinctrl *pctrl)
+{
+	const struct upboard_pingroup *groups = pctrl->pctrl_data->groups;
+	size_t ngroups = pctrl->pctrl_data->ngroups;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < ngroups; i++) {
+		ret = pinctrl_generic_add_group(pctrl->pctldev, groups[i].grp.name,
+						groups[i].grp.pins, groups[i].grp.npins, pctrl);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
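A detail worth calling out in upboard_gpio_request() above: gpiod_get_index(pctrl->dev, "external", pin, 0) can only succeed if machine descriptors with the "external" con_id have already been registered for this device, presumably by the parent FPGA driver (that code is not part of this hunk). A minimal sketch of what such a table could look like; the chip label, hardware offsets and both entries are placeholders, not the real mapping:

	/* Sketch only: maps "external" index N (an FPGA pin number) to a SoC GPIO. */
	static struct gpiod_lookup_table upboard_external_gpios = {
		.dev_id = "upboard-pinctrl",	/* assumed to match dev_name() of the consumer */
		.table = {
			GPIO_LOOKUP_IDX("INT3452:00", 22, "external", 0, GPIO_ACTIVE_HIGH),
			GPIO_LOOKUP_IDX("INT3452:00", 23, "external", 1, GPIO_ACTIVE_HIGH),
			/* ... one entry per FPGA pin ... */
			{ }
		},
	};

Registered once with gpiod_add_lookup_table(&upboard_external_gpios), a table of this shape is what would let the forwarder hand out real SoC descriptors line by line.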
+
+static int upboard_pinctrl_register_functions(struct upboard_pinctrl *pctrl)
+{
+	const struct pinfunction *funcs = pctrl->pctrl_data->funcs;
+	size_t nfuncs = pctrl->pctrl_data->nfuncs;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < nfuncs; i++) {
+		ret = pinmux_generic_add_function(pctrl->pctldev, funcs[i].name,
+						  funcs[i].groups, funcs[i].ngroups, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static const struct pinctrl_map pinctrl_map_apl01[] = {
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "pwm0_grp", "pwm0"),
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "pwm1_grp", "pwm1"),
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:00", "uart1_grp", "uart1"),
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:02", "i2c0_grp", "i2c0"),
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:02", "i2c1_grp", "i2c1"),
+	PIN_MAP_MUX_GROUP_DEFAULT("upboard-pinctrl", "INT3452:01", "ssp0_grp", "ssp0"),
+};
+
+static const struct upboard_pinctrl_map upboard_pinctrl_map_apl01 = {
+	.maps = &pinctrl_map_apl01[0],
+	.nmaps = ARRAY_SIZE(pinctrl_map_apl01),
+};
+
+static const struct dmi_system_id dmi_platform_info[] = {
+	{
+		/* UP Squared */
+		.matches = {
+			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AAEON"),
+			DMI_EXACT_MATCH(DMI_BOARD_NAME, "UP-APL01"),
+		},
+		.driver_data = (void *)&upboard_pinctrl_map_apl01,
+	},
+	{ }
+};
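These maps are consumed by the driver itself: probe (below) calls devm_pinctrl_get_select_default(dev), which resolves each entry against the Apollo Lake SoC pin controller (ACPI HID INT3452) and muxes the named SoC group to the named SoC function. Open-coded, the first entry expands to the equivalent of this:

	static const struct pinctrl_map example_map = {
		.dev_name = "upboard-pinctrl",		/* consumer: this platform device */
		.name = PINCTRL_STATE_DEFAULT,
		.type = PIN_MAP_TYPE_MUX_GROUP,
		.ctrl_dev_name = "INT3452:00",		/* provider: the SoC pinctrl */
		.data.mux = {
			.group = "pwm0_grp",
			.function = "pwm0",
		},
	};

Note that the group and function names in the map belong to the SoC controller, while the identically named groups defined in this file belong to the FPGA controller; the two namespaces only meet because the driver selects its default state at probe time.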
+
+static int upboard_pinctrl_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct upboard_fpga *fpga = dev_get_drvdata(dev->parent);
+	const struct upboard_pinctrl_map *board_map;
+	const struct dmi_system_id *dmi_id;
+	struct pinctrl_desc *pctldesc;
+	struct upboard_pinctrl *pctrl;
+	struct upboard_pin *pins;
+	struct gpiochip_fwd *fwd;
+	struct pinctrl *pinctrl;
+	struct gpio_chip *chip;
+	unsigned int i;
+	int ret;
+
+	pctldesc = devm_kzalloc(dev, sizeof(*pctldesc), GFP_KERNEL);
+	if (!pctldesc)
+		return -ENOMEM;
+
+	pctrl = devm_kzalloc(dev, sizeof(*pctrl), GFP_KERNEL);
+	if (!pctrl)
+		return -ENOMEM;
+
+	switch (fpga->fpga_data->type) {
+	case UPBOARD_UP_FPGA:
+		pctldesc->pins = upboard_up_pins;
+		pctldesc->npins = ARRAY_SIZE(upboard_up_pins);
+		pctrl->pctrl_data = &upboard_up_pinctrl_data;
+		break;
+	case UPBOARD_UP2_FPGA:
+		pctldesc->pins = upboard_up2_pins;
+		pctldesc->npins = ARRAY_SIZE(upboard_up2_pins);
+		pctrl->pctrl_data = &upboard_up2_pinctrl_data;
+		break;
+	default:
+		return dev_err_probe(dev, -ENODEV, "Unsupported device type %d\n",
+				     fpga->fpga_data->type);
+	}
+
+	dmi_id = dmi_first_match(dmi_platform_info);
+	if (!dmi_id)
+		return dev_err_probe(dev, -ENODEV, "Unsupported board\n");
+
+	board_map = (const struct upboard_pinctrl_map *)dmi_id->driver_data;
+
+	pctldesc->name = dev_name(dev);
+	pctldesc->owner = THIS_MODULE;
+	pctldesc->pctlops = &upboard_pinctrl_ops;
+	pctldesc->pmxops = &upboard_pinmux_ops;
+
+	pctrl->dev = dev;
+
+	pins = devm_kcalloc(dev, pctldesc->npins, sizeof(*pins), GFP_KERNEL);
+	if (!pins)
+		return -ENOMEM;
+
+	/* Initialize pins */
+	for (i = 0; i < pctldesc->npins; i++) {
+		const struct pinctrl_pin_desc *pin_desc = &pctldesc->pins[i];
+		unsigned int regoff = pin_desc->number / UPBOARD_REGISTER_SIZE;
+		unsigned int lsb = pin_desc->number % UPBOARD_REGISTER_SIZE;
+		struct reg_field * const fld_func = pin_desc->drv_data;
+		struct upboard_pin *pin = &pins[i];
+		struct reg_field fldconf = {};
+
+		if (fld_func) {
+			pin->funcbit = devm_regmap_field_alloc(dev, fpga->regmap, *fld_func);
+			if (IS_ERR(pin->funcbit))
+				return PTR_ERR(pin->funcbit);
+		}
+
+		fldconf.reg = UPBOARD_REG_GPIO_EN0 + regoff;
+		fldconf.lsb = lsb;
+		fldconf.msb = lsb;
+		pin->enbit = devm_regmap_field_alloc(dev, fpga->regmap, fldconf);
+		if (IS_ERR(pin->enbit))
+			return PTR_ERR(pin->enbit);
+
+		fldconf.reg = UPBOARD_REG_GPIO_DIR0 + regoff;
+		fldconf.lsb = lsb;
+		fldconf.msb = lsb;
+		pin->dirbit = devm_regmap_field_alloc(dev, fpga->regmap, fldconf);
+		if (IS_ERR(pin->dirbit))
+			return PTR_ERR(pin->dirbit);
+	}
+
+	pctrl->pins = pins;
+
+	ret = devm_pinctrl_register_and_init(dev, pctldesc, pctrl, &pctrl->pctldev);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register pinctrl\n");
+
+	ret = upboard_pinctrl_register_groups(pctrl);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register groups\n");
+
+	ret = upboard_pinctrl_register_functions(pctrl);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register functions\n");
+
+	ret = devm_pinctrl_register_mappings(dev, board_map->maps, board_map->nmaps);
+	if (ret)
+		return ret;
+
+	pinctrl = devm_pinctrl_get_select_default(dev);
+	if (IS_ERR(pinctrl))
+		return dev_err_probe(dev, PTR_ERR(pinctrl), "Failed to select pinctrl\n");
+
+	ret = pinctrl_enable(pctrl->pctldev);
+	if (ret)
+		return ret;
+
+	fwd = devm_gpiochip_fwd_alloc(dev, pctrl->pctrl_data->ngpio);
+	if (IS_ERR(fwd))
+		return dev_err_probe(dev, PTR_ERR(fwd), "Failed to allocate the gpiochip forwarder\n");
+
+	chip = gpiochip_fwd_get_gpiochip(fwd);
+	chip->request = upboard_gpio_request;
+	chip->free = upboard_gpio_free;
+	chip->get_direction = upboard_gpio_get_direction;
+	chip->direction_output = upboard_gpio_direction_output;
+	chip->direction_input = upboard_gpio_direction_input;
+
+	ret = gpiochip_fwd_register(fwd, pctrl);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register the gpiochip forwarder\n");
+
+	return gpiochip_add_sparse_pin_range(chip, dev_name(dev), 0, pctrl->pctrl_data->pin_header,
+					     pctrl->pctrl_data->ngpio);
+}
+
+static struct platform_driver upboard_pinctrl_driver = {
+	.driver = {
+		.name = "upboard-pinctrl",
+	},
+	.probe = upboard_pinctrl_probe,
+};
+module_platform_driver(upboard_pinctrl_driver);
+
+MODULE_AUTHOR("Thomas Richard <thomas.richard@bootlin.com>");
+MODULE_DESCRIPTION("UP board FPGA pin controller driver");
+MODULE_LICENSE("GPL");
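Before the patch moves on to the core pinmux changes, note what gpiochip_add_sparse_pin_range() buys the driver above: the exposed gpio_chip is numbered like the board's 40-pin header (28 GPIO-capable lines for the UP2 table), while pin_header[] supplies the FPGA pin behind each line. A sketch of that contract; the assertion and helper are illustrative, not part of the driver:

	static_assert(ARRAY_SIZE(upboard_up2_pin_header) == 28);

	/* line 0 -> UPBOARD_UP2_BIT_TO_PIN(GPIO10), line 27 -> UPBOARD_UP2_BIT_TO_PIN(GPIO5_ADC2) */
	static unsigned int up2_line_to_fpga_pin(unsigned int line)
	{
		return upboard_up2_pin_header[line];
	}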
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ ... @@ bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin)
 {
 	struct pin_desc *desc = pin_desc_get(pctldev, pin);
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
+	const struct pinctrl_setting_mux *mux_setting;
+	bool func_is_gpio = false;
 
 	/* Can't inspect pin, assume it can be used */
 	if (!desc || !ops)
 		return true;
 
+	mux_setting = desc->mux_setting;
+
 	guard(mutex)(&desc->mux_lock);
-	if (ops->strict && desc->mux_usecount)
+	if (mux_setting && ops->function_is_gpio)
+		func_is_gpio = ops->function_is_gpio(pctldev, mux_setting->func);
+
+	if (ops->strict && desc->mux_usecount && !func_is_gpio)
 		return false;
 
 	return !(ops->strict && !!desc->gpio_owner);
@@ -116,7 +123,9 @@ static int pin_request(struct pinctrl_dev *pctldev,
 {
 	struct pin_desc *desc;
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
+	const struct pinctrl_setting_mux *mux_setting;
 	int status = -EINVAL;
+	bool gpio_ok = false;
 
 	desc = pin_desc_get(pctldev, pin);
 	if (desc == NULL) {
@@ -126,11 +135,21 @@ static int pin_request(struct pinctrl_dev *pctldev,
 		goto out;
 	}
 
+	mux_setting = desc->mux_setting;
+
 	dev_dbg(pctldev->dev, "request pin %d (%s) for %s\n",
 		pin, desc->name, owner);
 
 	scoped_guard(mutex, &desc->mux_lock) {
-		if ((!gpio_range || ops->strict) &&
+		if (mux_setting) {
+			if (ops->function_is_gpio)
+				gpio_ok = ops->function_is_gpio(pctldev,
+								mux_setting->func);
+		} else {
+			gpio_ok = true;
+		}
+
+		if ((!gpio_range || ops->strict) && !gpio_ok &&
 		    desc->mux_usecount && strcmp(desc->mux_owner, owner)) {
 			dev_err(pctldev->dev,
 				"pin %s already requested by %s; cannot claim for %s\n",
@@ -138,7 +157,7 @@ static int pin_request(struct pinctrl_dev *pctldev,
 			goto out;
 		}
 
-		if ((gpio_range || ops->strict) && desc->gpio_owner) {
+		if ((gpio_range || ops->strict) && !gpio_ok && desc->gpio_owner) {
 			dev_err(pctldev->dev,
 				"pin %s already requested by %s; cannot claim for %s\n",
 				desc->name, desc->gpio_owner, owner);
@@ -337,7 +356,7 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev,
 	while (selector < nfuncs) {
 		const char *fname = ops->get_function_name(pctldev, selector);
 
-		if (!strcmp(function, fname))
+		if (fname && !strcmp(function, fname))
 			return selector;
 
 		selector++;
@@ -810,7 +829,7 @@ pinmux_generic_get_function_name(struct pinctrl_dev *pctldev,
 	if (!function)
 		return NULL;
 
-	return function->func.name;
+	return function->func->name;
 }
 EXPORT_SYMBOL_GPL(pinmux_generic_get_function_name);
 
@@ -835,8 +854,8 @@ int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev,
 			__func__, selector);
 		return -EINVAL;
 	}
-	*groups = function->func.groups;
-	*ngroups = function->func.ngroups;
+	*groups = function->func->groups;
+	*ngroups = function->func->ngroups;
 
 	return 0;
 }
@@ -847,8 +866,8 @@ EXPORT_SYMBOL_GPL(pinmux_generic_get_function_groups);
  * @pctldev: pin controller device
  * @selector: function number
  */
-struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev,
-						  unsigned int selector)
+const struct function_desc *
+pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector)
 {
 	struct function_desc *function;
 
@@ -861,6 +880,27 @@ struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev,
 }
 EXPORT_SYMBOL_GPL(pinmux_generic_get_function);
 
+/**
+ * pinmux_generic_function_is_gpio() - returns true if given function is a GPIO
+ * @pctldev: pin controller device
+ * @selector: function number
+ *
+ * Returns:
+ * True if given function is a GPIO, false otherwise.
+ */
+bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev,
+				     unsigned int selector)
+{
+	struct function_desc *function;
+
+	function = radix_tree_lookup(&pctldev->pin_function_tree, selector);
+	if (!function)
+		return false;
+
+	return function->func->flags & PINFUNCTION_FLAG_GPIO;
+}
+EXPORT_SYMBOL_GPL(pinmux_generic_function_is_gpio);
+
 /**
  * pinmux_generic_add_function() - adds a function group
  * @pctldev: pin controller device
@@ -903,7 +943,17 @@ int pinmux_generic_add_pinfunction(struct pinctrl_dev *pctldev,
 	if (!function)
 		return -ENOMEM;
 
-	function->func = *func;
+	/*
+	 * FIXME: It's generally a bad idea to use devres in subsystem core
+	 * code - managed interfaces are aimed at drivers - but pinctrl already
+	 * uses it all over the place so it's a larger piece of technical debt
+	 * to fix.
+ */ + function->func = devm_kmemdup_const(pctldev->dev, func, + sizeof(*func), GFP_KERNEL); + if (!function->func) + return -ENOMEM; + function->data = data; error = radix_tree_insert(&pctldev->pin_function_tree, selector, function); diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index bdb5be1a636ead..4e826c1a5246cf 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -137,7 +137,7 @@ static inline void pinmux_init_device_debugfs(struct dentry *devroot, * @data: pin controller driver specific data */ struct function_desc { - struct pinfunction func; + const struct pinfunction *func; void *data; }; @@ -152,8 +152,8 @@ int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev, const char * const **groups, unsigned int * const ngroups); -struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev, - unsigned int selector); +const struct function_desc * +pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector); int pinmux_generic_add_function(struct pinctrl_dev *pctldev, const char *name, @@ -169,6 +169,9 @@ int pinmux_generic_remove_function(struct pinctrl_dev *pctldev, void pinmux_generic_free_functions(struct pinctrl_dev *pctldev); +bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector); + #else static inline void pinmux_generic_free_functions(struct pinctrl_dev *pctldev) diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index dd9bbe8f3e11c3..c480e8b7850329 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -8,6 +8,7 @@ config PINCTRL_MSM depends on OF select QCOM_SCM select PINMUX + select GENERIC_PINMUX_FUNCTIONS select PINCONF select GENERIC_PINCONF select GPIOLIB_IRQCHIP @@ -68,6 +69,16 @@ config PINCTRL_SC7280_LPASS_LPI Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI (Low Power Island) found on the Qualcomm Technologies Inc SC7280 platform. +config PINCTRL_SDM660_LPASS_LPI + tristate "Qualcomm Technologies Inc SDM660 LPASS LPI pin controller driver" + depends on GPIOLIB + depends on ARM64 || COMPILE_TEST + depends on PINCTRL_LPASS_LPI + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc LPASS (Low Power Audio SubSystem) LPI + (Low Power Island) found on the Qualcomm Technologies Inc SDM660 platform. + config PINCTRL_SM4250_LPASS_LPI tristate "Qualcomm Technologies Inc SM4250 LPASS LPI pin controller driver" depends on ARM64 || COMPILE_TEST diff --git a/drivers/pinctrl/qcom/Kconfig.msm b/drivers/pinctrl/qcom/Kconfig.msm index 6dad942b00a35f..69a5b47adedc2a 100644 --- a/drivers/pinctrl/qcom/Kconfig.msm +++ b/drivers/pinctrl/qcom/Kconfig.msm @@ -15,6 +15,16 @@ config PINCTRL_APQ8084 This is the pinctrl, pinmux, pinconf and gpiolib driver for the Qualcomm TLMM block found in the Qualcomm APQ8084 platform. +config PINCTRL_GLYMUR + tristate "Qualcomm Technologies Inc Glymur pin controller driver" + depends on ARM64 || COMPILE_TEST + help + This is the pinctrl, pinmux, pinconf and gpiolib driver for the + Qualcomm Technologies Inc Top Level Mode Multiplexer block (TLMM) + block found on the Qualcomm Technologies Inc Glymur platform. + Say Y here to compile statically, or M here to compile it as a module. + If unsure, say N. 
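Taken together, the pinmux.c and pinmux.h hunks above do two things: function descriptors become immutable (the pinfunction is now copied with devm_kmemdup_const() and exposed const), and a strict pin controller gains a way to declare that a pin's current function is itself GPIO, so pin_request() and pinmux_can_be_used_for_gpio() no longer refuse the GPIO side for such pins. A minimal sketch of how a driver built on the generic helpers might wire this up, assuming its "gpio" pinfunction carries PINFUNCTION_FLAG_GPIO (how that flag gets set on the descriptor is outside these hunks, and example_set_mux is a placeholder):

	static const struct pinmux_ops example_pinmux_ops = {
		.get_functions_count	= pinmux_generic_get_function_count,
		.get_function_name	= pinmux_generic_get_function_name,
		.get_function_groups	= pinmux_generic_get_function_groups,
		.set_mux		= example_set_mux,
		/* new hook: reuses the generic flag check added above */
		.function_is_gpio	= pinmux_generic_function_is_gpio,
		.strict			= true,
	};

With CONFIG_PINCTRL_MSM now selecting GENERIC_PINMUX_FUNCTIONS (see the Kconfig hunk above), the qcom TLMM drivers appear to be the intended first users of this pattern.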
+
 config PINCTRL_IPQ4019
 	tristate "Qualcomm IPQ4019 pin controller driver"
 	depends on ARM || COMPILE_TEST
diff --git a/drivers/pinctrl/qcom/Makefile b/drivers/pinctrl/qcom/Makefile
index 2acff520a285a4..567d3051e760dd 100644
--- a/drivers/pinctrl/qcom/Makefile
+++ b/drivers/pinctrl/qcom/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_PINCTRL_MSM) += pinctrl-msm.o
 obj-$(CONFIG_PINCTRL_APQ8064)	+= pinctrl-apq8064.o
 obj-$(CONFIG_PINCTRL_APQ8084)	+= pinctrl-apq8084.o
+obj-$(CONFIG_PINCTRL_GLYMUR)	+= pinctrl-glymur.o
 obj-$(CONFIG_PINCTRL_IPQ4019)	+= pinctrl-ipq4019.o
 obj-$(CONFIG_PINCTRL_IPQ5018)	+= pinctrl-ipq5018.o
 obj-$(CONFIG_PINCTRL_IPQ8064)	+= pinctrl-ipq8064.o
@@ -44,6 +45,7 @@ obj-$(CONFIG_PINCTRL_SC7280_LPASS_LPI) += pinctrl-sc7280-lpass-lpi.o
 obj-$(CONFIG_PINCTRL_SC8180X)	+= pinctrl-sc8180x.o
 obj-$(CONFIG_PINCTRL_SC8280XP)	+= pinctrl-sc8280xp.o
 obj-$(CONFIG_PINCTRL_SDM660)	+= pinctrl-sdm660.o
+obj-$(CONFIG_PINCTRL_SDM660_LPASS_LPI) += pinctrl-sdm660-lpass-lpi.o
 obj-$(CONFIG_PINCTRL_SDM670)	+= pinctrl-sdm670.o
 obj-$(CONFIG_PINCTRL_SDM845)	+= pinctrl-sdm845.o
 obj-$(CONFIG_PINCTRL_SDX55)	+= pinctrl-sdx55.o
diff --git a/drivers/pinctrl/qcom/pinctrl-glymur.c b/drivers/pinctrl/qcom/pinctrl-glymur.c
new file mode 100644
index 00000000000000..9913f98e953110
--- /dev/null
+++ b/drivers/pinctrl/qcom/pinctrl-glymur.c
@@ -0,0 +1,1777 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025 Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pinctrl-msm.h"
+
+#define REG_SIZE 0x1000
+#define PINGROUP(id, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11)	\
+	{							\
+		.grp = PINCTRL_PINGROUP("gpio" #id,		\
+			gpio##id##_pins,			\
+			ARRAY_SIZE(gpio##id##_pins)),		\
+		.ctl_reg = REG_SIZE * id,			\
+		.io_reg = 0x4 + REG_SIZE * id,			\
+		.intr_cfg_reg = 0x8 + REG_SIZE * id,		\
+		.intr_status_reg = 0xc + REG_SIZE * id,		\
+		.intr_target_reg = 0x8 + REG_SIZE * id,		\
+		.mux_bit = 2,					\
+		.pull_bit = 0,					\
+		.drv_bit = 6,					\
+		.egpio_enable = 12,				\
+		.egpio_present = 11,				\
+		.oe_bit = 9,					\
+		.in_bit = 0,					\
+		.out_bit = 1,					\
+		.intr_enable_bit = 0,				\
+		.intr_status_bit = 0,				\
+		.intr_target_bit = 5,				\
+		.intr_target_kpss_val = 3,			\
+		.intr_raw_status_bit = 4,			\
+		.intr_polarity_bit = 1,				\
+		.intr_detection_bit = 2,			\
+		.intr_detection_width = 2,			\
+		.funcs = (int[]){				\
+			msm_mux_gpio, /* gpio mode */		\
+			msm_mux_##f1,				\
+			msm_mux_##f2,				\
+			msm_mux_##f3,				\
+			msm_mux_##f4,				\
+			msm_mux_##f5,				\
+			msm_mux_##f6,				\
+			msm_mux_##f7,				\
+			msm_mux_##f8,				\
+			msm_mux_##f9,				\
+			msm_mux_##f10,				\
+			msm_mux_##f11 /* egpio mode */		\
+		},						\
+		.nfuncs = 12,					\
+	}
+
+#define SDC_QDSD_PINGROUP(pg_name, ctl, pull, drv)		\
+	{							\
+		.grp = PINCTRL_PINGROUP(#pg_name,		\
+			pg_name##_pins,				\
+			ARRAY_SIZE(pg_name##_pins)),		\
+		.ctl_reg = ctl,					\
+		.io_reg = 0,					\
+		.intr_cfg_reg = 0,				\
+		.intr_status_reg = 0,				\
+		.intr_target_reg = 0,				\
+		.mux_bit = -1,					\
+		.pull_bit = pull,				\
+		.drv_bit = drv,					\
+		.oe_bit = -1,					\
+		.in_bit = -1,					\
+		.out_bit = -1,					\
+		.intr_enable_bit = -1,				\
+		.intr_status_bit = -1,				\
+		.intr_target_bit = -1,				\
+		.intr_raw_status_bit = -1,			\
+		.intr_polarity_bit = -1,			\
+		.intr_detection_bit = -1,			\
+		.intr_detection_width = -1,			\
+	}
+
+#define UFS_RESET(pg_name, ctl, io)				\
+	{							\
+		.grp = PINCTRL_PINGROUP(#pg_name,		\
+			pg_name##_pins,				\
+			ARRAY_SIZE(pg_name##_pins)),		\
+		.ctl_reg = ctl,					\
+		.io_reg = io,					\
+		.intr_cfg_reg = 0,				\
+		.intr_status_reg = 0,				\
+		.intr_target_reg = 0,				\
+		.mux_bit = -1,					\
+		.pull_bit = 3,					\
+		.drv_bit = 0,					\
+		.oe_bit = -1,					\
+		.in_bit = -1,					\
+		.out_bit = 0,					\
+		.intr_enable_bit = -1,				\
+		.intr_status_bit = -1,				\
+		.intr_target_bit = -1,				\
+		.intr_raw_status_bit = -1,			\
+		.intr_polarity_bit = -1,			\
+		.intr_detection_bit = -1,			\
+		.intr_detection_width = -1,			\
+	}
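The offsets in PINGROUP() encode the TLMM register layout: every GPIO owns a 4 KiB register tile (REG_SIZE is 0x1000), so all addresses derive from the GPIO index alone. A worked example for gpio5, using only the macro above:

	#define EXAMPLE_CTL_REG		(REG_SIZE * 5)		/* 0x5000: mux_bit 2, pull_bit 0, drv_bit 6 */
	#define EXAMPLE_IO_REG		(0x4 + REG_SIZE * 5)	/* 0x5004: in_bit 0, out_bit 1 */
	#define EXAMPLE_INTR_CFG_REG	(0x8 + REG_SIZE * 5)	/* 0x5008: doubles as intr_target_reg */
	#define EXAMPLE_INTR_STATUS_REG	(0xc + REG_SIZE * 5)	/* 0x500c */

Note also the funcs[] convention: slot 0 is always msm_mux_gpio and slot 11 the egpio function, so a row's function index doubles as the value the pinctrl-msm core programs into the mux field (4 bits wide here, since nfuncs is 12).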
+
+static const struct pinctrl_pin_desc glymur_pins[] = {
+	PINCTRL_PIN(0, "GPIO_0"),
+	PINCTRL_PIN(1, "GPIO_1"),
+	PINCTRL_PIN(2, "GPIO_2"),
+	PINCTRL_PIN(3, "GPIO_3"),
+	PINCTRL_PIN(4, "GPIO_4"),
+	PINCTRL_PIN(5, "GPIO_5"),
+	PINCTRL_PIN(6, "GPIO_6"),
+	PINCTRL_PIN(7, "GPIO_7"),
+	PINCTRL_PIN(8, "GPIO_8"),
+	PINCTRL_PIN(9, "GPIO_9"),
+	PINCTRL_PIN(10, "GPIO_10"),
+	PINCTRL_PIN(11, "GPIO_11"),
+	PINCTRL_PIN(12, "GPIO_12"),
+	PINCTRL_PIN(13, "GPIO_13"),
+	PINCTRL_PIN(14, "GPIO_14"),
+	PINCTRL_PIN(15, "GPIO_15"),
+	PINCTRL_PIN(16, "GPIO_16"),
+	PINCTRL_PIN(17, "GPIO_17"),
+	PINCTRL_PIN(18, "GPIO_18"),
+	PINCTRL_PIN(19, "GPIO_19"),
+	PINCTRL_PIN(20, "GPIO_20"),
+	PINCTRL_PIN(21, "GPIO_21"),
+	PINCTRL_PIN(22, "GPIO_22"),
+	PINCTRL_PIN(23, "GPIO_23"),
+	PINCTRL_PIN(24, "GPIO_24"),
+	PINCTRL_PIN(25, "GPIO_25"),
+	PINCTRL_PIN(26, "GPIO_26"),
+	PINCTRL_PIN(27, "GPIO_27"),
+	PINCTRL_PIN(28, "GPIO_28"),
+	PINCTRL_PIN(29, "GPIO_29"),
+	PINCTRL_PIN(30, "GPIO_30"),
+	PINCTRL_PIN(31, "GPIO_31"),
+	PINCTRL_PIN(32, "GPIO_32"),
+	PINCTRL_PIN(33, "GPIO_33"),
+	PINCTRL_PIN(34, "GPIO_34"),
+	PINCTRL_PIN(35, "GPIO_35"),
+	PINCTRL_PIN(36, "GPIO_36"),
+	PINCTRL_PIN(37, "GPIO_37"),
+	PINCTRL_PIN(38, "GPIO_38"),
+	PINCTRL_PIN(39, "GPIO_39"),
+	PINCTRL_PIN(40, "GPIO_40"),
+	PINCTRL_PIN(41, "GPIO_41"),
+	PINCTRL_PIN(42, "GPIO_42"),
+	PINCTRL_PIN(43, "GPIO_43"),
+	PINCTRL_PIN(44, "GPIO_44"),
+	PINCTRL_PIN(45, "GPIO_45"),
+	PINCTRL_PIN(46, "GPIO_46"),
+	PINCTRL_PIN(47, "GPIO_47"),
+	PINCTRL_PIN(48, "GPIO_48"),
+	PINCTRL_PIN(49, "GPIO_49"),
+	PINCTRL_PIN(50, "GPIO_50"),
+	PINCTRL_PIN(51, "GPIO_51"),
+	PINCTRL_PIN(52, "GPIO_52"),
+	PINCTRL_PIN(53, "GPIO_53"),
+	PINCTRL_PIN(54, "GPIO_54"),
+	PINCTRL_PIN(55, "GPIO_55"),
+	PINCTRL_PIN(56, "GPIO_56"),
+	PINCTRL_PIN(57, "GPIO_57"),
+	PINCTRL_PIN(58, "GPIO_58"),
+	PINCTRL_PIN(59, "GPIO_59"),
+	PINCTRL_PIN(60, "GPIO_60"),
+	PINCTRL_PIN(61, "GPIO_61"),
+	PINCTRL_PIN(62, "GPIO_62"),
+	PINCTRL_PIN(63, "GPIO_63"),
+	PINCTRL_PIN(64, "GPIO_64"),
+	PINCTRL_PIN(65, "GPIO_65"),
+	PINCTRL_PIN(66, "GPIO_66"),
+	PINCTRL_PIN(67, "GPIO_67"),
+	PINCTRL_PIN(68, "GPIO_68"),
+	PINCTRL_PIN(69, "GPIO_69"),
+	PINCTRL_PIN(70, "GPIO_70"),
+	PINCTRL_PIN(71, "GPIO_71"),
+	PINCTRL_PIN(72, "GPIO_72"),
+	PINCTRL_PIN(73, "GPIO_73"),
+	PINCTRL_PIN(74, "GPIO_74"),
+	PINCTRL_PIN(75, "GPIO_75"),
+	PINCTRL_PIN(76, "GPIO_76"),
+	PINCTRL_PIN(77, "GPIO_77"),
+	PINCTRL_PIN(78, "GPIO_78"),
+	PINCTRL_PIN(79, "GPIO_79"),
+	PINCTRL_PIN(80, "GPIO_80"),
+	PINCTRL_PIN(81, "GPIO_81"),
+	PINCTRL_PIN(82, "GPIO_82"),
+	PINCTRL_PIN(83, "GPIO_83"),
+	PINCTRL_PIN(84, "GPIO_84"),
+	PINCTRL_PIN(85, "GPIO_85"),
+	PINCTRL_PIN(86, "GPIO_86"),
+	PINCTRL_PIN(87, "GPIO_87"),
+	PINCTRL_PIN(88, "GPIO_88"),
+	PINCTRL_PIN(89, "GPIO_89"),
+	PINCTRL_PIN(90, "GPIO_90"),
+	PINCTRL_PIN(91, "GPIO_91"),
+	PINCTRL_PIN(92, "GPIO_92"),
+	PINCTRL_PIN(93, "GPIO_93"),
+	PINCTRL_PIN(94, "GPIO_94"),
+	PINCTRL_PIN(95, "GPIO_95"),
+	PINCTRL_PIN(96, "GPIO_96"),
+	PINCTRL_PIN(97, "GPIO_97"),
+	PINCTRL_PIN(98, "GPIO_98"),
+	PINCTRL_PIN(99, "GPIO_99"),
+	PINCTRL_PIN(100, "GPIO_100"),
+	PINCTRL_PIN(101, "GPIO_101"),
+	PINCTRL_PIN(102, "GPIO_102"),
+	PINCTRL_PIN(103, "GPIO_103"),
+	PINCTRL_PIN(104, "GPIO_104"),
+	PINCTRL_PIN(105, "GPIO_105"),
+	PINCTRL_PIN(106, "GPIO_106"),
+	PINCTRL_PIN(107, "GPIO_107"),
+	PINCTRL_PIN(108, "GPIO_108"),
PINCTRL_PIN(109, "GPIO_109"), + PINCTRL_PIN(110, "GPIO_110"), + PINCTRL_PIN(111, "GPIO_111"), + PINCTRL_PIN(112, "GPIO_112"), + PINCTRL_PIN(113, "GPIO_113"), + PINCTRL_PIN(114, "GPIO_114"), + PINCTRL_PIN(115, "GPIO_115"), + PINCTRL_PIN(116, "GPIO_116"), + PINCTRL_PIN(117, "GPIO_117"), + PINCTRL_PIN(118, "GPIO_118"), + PINCTRL_PIN(119, "GPIO_119"), + PINCTRL_PIN(120, "GPIO_120"), + PINCTRL_PIN(121, "GPIO_121"), + PINCTRL_PIN(122, "GPIO_122"), + PINCTRL_PIN(123, "GPIO_123"), + PINCTRL_PIN(124, "GPIO_124"), + PINCTRL_PIN(125, "GPIO_125"), + PINCTRL_PIN(126, "GPIO_126"), + PINCTRL_PIN(127, "GPIO_127"), + PINCTRL_PIN(128, "GPIO_128"), + PINCTRL_PIN(129, "GPIO_129"), + PINCTRL_PIN(130, "GPIO_130"), + PINCTRL_PIN(131, "GPIO_131"), + PINCTRL_PIN(132, "GPIO_132"), + PINCTRL_PIN(133, "GPIO_133"), + PINCTRL_PIN(134, "GPIO_134"), + PINCTRL_PIN(135, "GPIO_135"), + PINCTRL_PIN(136, "GPIO_136"), + PINCTRL_PIN(137, "GPIO_137"), + PINCTRL_PIN(138, "GPIO_138"), + PINCTRL_PIN(139, "GPIO_139"), + PINCTRL_PIN(140, "GPIO_140"), + PINCTRL_PIN(141, "GPIO_141"), + PINCTRL_PIN(142, "GPIO_142"), + PINCTRL_PIN(143, "GPIO_143"), + PINCTRL_PIN(144, "GPIO_144"), + PINCTRL_PIN(145, "GPIO_145"), + PINCTRL_PIN(146, "GPIO_146"), + PINCTRL_PIN(147, "GPIO_147"), + PINCTRL_PIN(148, "GPIO_148"), + PINCTRL_PIN(149, "GPIO_149"), + PINCTRL_PIN(150, "GPIO_150"), + PINCTRL_PIN(151, "GPIO_151"), + PINCTRL_PIN(152, "GPIO_152"), + PINCTRL_PIN(153, "GPIO_153"), + PINCTRL_PIN(154, "GPIO_154"), + PINCTRL_PIN(155, "GPIO_155"), + PINCTRL_PIN(156, "GPIO_156"), + PINCTRL_PIN(157, "GPIO_157"), + PINCTRL_PIN(158, "GPIO_158"), + PINCTRL_PIN(159, "GPIO_159"), + PINCTRL_PIN(160, "GPIO_160"), + PINCTRL_PIN(161, "GPIO_161"), + PINCTRL_PIN(162, "GPIO_162"), + PINCTRL_PIN(163, "GPIO_163"), + PINCTRL_PIN(164, "GPIO_164"), + PINCTRL_PIN(165, "GPIO_165"), + PINCTRL_PIN(166, "GPIO_166"), + PINCTRL_PIN(167, "GPIO_167"), + PINCTRL_PIN(168, "GPIO_168"), + PINCTRL_PIN(169, "GPIO_169"), + PINCTRL_PIN(170, "GPIO_170"), + PINCTRL_PIN(171, "GPIO_171"), + PINCTRL_PIN(172, "GPIO_172"), + PINCTRL_PIN(173, "GPIO_173"), + PINCTRL_PIN(174, "GPIO_174"), + PINCTRL_PIN(175, "GPIO_175"), + PINCTRL_PIN(176, "GPIO_176"), + PINCTRL_PIN(177, "GPIO_177"), + PINCTRL_PIN(178, "GPIO_178"), + PINCTRL_PIN(179, "GPIO_179"), + PINCTRL_PIN(180, "GPIO_180"), + PINCTRL_PIN(181, "GPIO_181"), + PINCTRL_PIN(182, "GPIO_182"), + PINCTRL_PIN(183, "GPIO_183"), + PINCTRL_PIN(184, "GPIO_184"), + PINCTRL_PIN(185, "GPIO_185"), + PINCTRL_PIN(186, "GPIO_186"), + PINCTRL_PIN(187, "GPIO_187"), + PINCTRL_PIN(188, "GPIO_188"), + PINCTRL_PIN(189, "GPIO_189"), + PINCTRL_PIN(190, "GPIO_190"), + PINCTRL_PIN(191, "GPIO_191"), + PINCTRL_PIN(192, "GPIO_192"), + PINCTRL_PIN(193, "GPIO_193"), + PINCTRL_PIN(194, "GPIO_194"), + PINCTRL_PIN(195, "GPIO_195"), + PINCTRL_PIN(196, "GPIO_196"), + PINCTRL_PIN(197, "GPIO_197"), + PINCTRL_PIN(198, "GPIO_198"), + PINCTRL_PIN(199, "GPIO_199"), + PINCTRL_PIN(200, "GPIO_200"), + PINCTRL_PIN(201, "GPIO_201"), + PINCTRL_PIN(202, "GPIO_202"), + PINCTRL_PIN(203, "GPIO_203"), + PINCTRL_PIN(204, "GPIO_204"), + PINCTRL_PIN(205, "GPIO_205"), + PINCTRL_PIN(206, "GPIO_206"), + PINCTRL_PIN(207, "GPIO_207"), + PINCTRL_PIN(208, "GPIO_208"), + PINCTRL_PIN(209, "GPIO_209"), + PINCTRL_PIN(210, "GPIO_210"), + PINCTRL_PIN(211, "GPIO_211"), + PINCTRL_PIN(212, "GPIO_212"), + PINCTRL_PIN(213, "GPIO_213"), + PINCTRL_PIN(214, "GPIO_214"), + PINCTRL_PIN(215, "GPIO_215"), + PINCTRL_PIN(216, "GPIO_216"), + PINCTRL_PIN(217, "GPIO_217"), + PINCTRL_PIN(218, "GPIO_218"), + PINCTRL_PIN(219, "GPIO_219"), + 
PINCTRL_PIN(220, "GPIO_220"), + PINCTRL_PIN(221, "GPIO_221"), + PINCTRL_PIN(222, "GPIO_222"), + PINCTRL_PIN(223, "GPIO_223"), + PINCTRL_PIN(224, "GPIO_224"), + PINCTRL_PIN(225, "GPIO_225"), + PINCTRL_PIN(226, "GPIO_226"), + PINCTRL_PIN(227, "GPIO_227"), + PINCTRL_PIN(228, "GPIO_228"), + PINCTRL_PIN(229, "GPIO_229"), + PINCTRL_PIN(230, "GPIO_230"), + PINCTRL_PIN(231, "GPIO_231"), + PINCTRL_PIN(232, "GPIO_232"), + PINCTRL_PIN(233, "GPIO_233"), + PINCTRL_PIN(234, "GPIO_234"), + PINCTRL_PIN(235, "GPIO_235"), + PINCTRL_PIN(236, "GPIO_236"), + PINCTRL_PIN(237, "GPIO_237"), + PINCTRL_PIN(238, "GPIO_238"), + PINCTRL_PIN(239, "GPIO_239"), + PINCTRL_PIN(240, "GPIO_240"), + PINCTRL_PIN(241, "GPIO_241"), + PINCTRL_PIN(242, "GPIO_242"), + PINCTRL_PIN(243, "GPIO_243"), + PINCTRL_PIN(244, "GPIO_244"), + PINCTRL_PIN(245, "GPIO_245"), + PINCTRL_PIN(246, "GPIO_246"), + PINCTRL_PIN(247, "GPIO_247"), + PINCTRL_PIN(248, "GPIO_248"), + PINCTRL_PIN(249, "GPIO_249"), +}; + +#define DECLARE_MSM_GPIO_PINS(pin) \ + static const unsigned int gpio##pin##_pins[] = { pin } +DECLARE_MSM_GPIO_PINS(0); +DECLARE_MSM_GPIO_PINS(1); +DECLARE_MSM_GPIO_PINS(2); +DECLARE_MSM_GPIO_PINS(3); +DECLARE_MSM_GPIO_PINS(4); +DECLARE_MSM_GPIO_PINS(5); +DECLARE_MSM_GPIO_PINS(6); +DECLARE_MSM_GPIO_PINS(7); +DECLARE_MSM_GPIO_PINS(8); +DECLARE_MSM_GPIO_PINS(9); +DECLARE_MSM_GPIO_PINS(10); +DECLARE_MSM_GPIO_PINS(11); +DECLARE_MSM_GPIO_PINS(12); +DECLARE_MSM_GPIO_PINS(13); +DECLARE_MSM_GPIO_PINS(14); +DECLARE_MSM_GPIO_PINS(15); +DECLARE_MSM_GPIO_PINS(16); +DECLARE_MSM_GPIO_PINS(17); +DECLARE_MSM_GPIO_PINS(18); +DECLARE_MSM_GPIO_PINS(19); +DECLARE_MSM_GPIO_PINS(20); +DECLARE_MSM_GPIO_PINS(21); +DECLARE_MSM_GPIO_PINS(22); +DECLARE_MSM_GPIO_PINS(23); +DECLARE_MSM_GPIO_PINS(24); +DECLARE_MSM_GPIO_PINS(25); +DECLARE_MSM_GPIO_PINS(26); +DECLARE_MSM_GPIO_PINS(27); +DECLARE_MSM_GPIO_PINS(28); +DECLARE_MSM_GPIO_PINS(29); +DECLARE_MSM_GPIO_PINS(30); +DECLARE_MSM_GPIO_PINS(31); +DECLARE_MSM_GPIO_PINS(32); +DECLARE_MSM_GPIO_PINS(33); +DECLARE_MSM_GPIO_PINS(34); +DECLARE_MSM_GPIO_PINS(35); +DECLARE_MSM_GPIO_PINS(36); +DECLARE_MSM_GPIO_PINS(37); +DECLARE_MSM_GPIO_PINS(38); +DECLARE_MSM_GPIO_PINS(39); +DECLARE_MSM_GPIO_PINS(40); +DECLARE_MSM_GPIO_PINS(41); +DECLARE_MSM_GPIO_PINS(42); +DECLARE_MSM_GPIO_PINS(43); +DECLARE_MSM_GPIO_PINS(44); +DECLARE_MSM_GPIO_PINS(45); +DECLARE_MSM_GPIO_PINS(46); +DECLARE_MSM_GPIO_PINS(47); +DECLARE_MSM_GPIO_PINS(48); +DECLARE_MSM_GPIO_PINS(49); +DECLARE_MSM_GPIO_PINS(50); +DECLARE_MSM_GPIO_PINS(51); +DECLARE_MSM_GPIO_PINS(52); +DECLARE_MSM_GPIO_PINS(53); +DECLARE_MSM_GPIO_PINS(54); +DECLARE_MSM_GPIO_PINS(55); +DECLARE_MSM_GPIO_PINS(56); +DECLARE_MSM_GPIO_PINS(57); +DECLARE_MSM_GPIO_PINS(58); +DECLARE_MSM_GPIO_PINS(59); +DECLARE_MSM_GPIO_PINS(60); +DECLARE_MSM_GPIO_PINS(61); +DECLARE_MSM_GPIO_PINS(62); +DECLARE_MSM_GPIO_PINS(63); +DECLARE_MSM_GPIO_PINS(64); +DECLARE_MSM_GPIO_PINS(65); +DECLARE_MSM_GPIO_PINS(66); +DECLARE_MSM_GPIO_PINS(67); +DECLARE_MSM_GPIO_PINS(68); +DECLARE_MSM_GPIO_PINS(69); +DECLARE_MSM_GPIO_PINS(70); +DECLARE_MSM_GPIO_PINS(71); +DECLARE_MSM_GPIO_PINS(72); +DECLARE_MSM_GPIO_PINS(73); +DECLARE_MSM_GPIO_PINS(74); +DECLARE_MSM_GPIO_PINS(75); +DECLARE_MSM_GPIO_PINS(76); +DECLARE_MSM_GPIO_PINS(77); +DECLARE_MSM_GPIO_PINS(78); +DECLARE_MSM_GPIO_PINS(79); +DECLARE_MSM_GPIO_PINS(80); +DECLARE_MSM_GPIO_PINS(81); +DECLARE_MSM_GPIO_PINS(82); +DECLARE_MSM_GPIO_PINS(83); +DECLARE_MSM_GPIO_PINS(84); +DECLARE_MSM_GPIO_PINS(85); +DECLARE_MSM_GPIO_PINS(86); +DECLARE_MSM_GPIO_PINS(87); +DECLARE_MSM_GPIO_PINS(88); 
+DECLARE_MSM_GPIO_PINS(89); +DECLARE_MSM_GPIO_PINS(90); +DECLARE_MSM_GPIO_PINS(91); +DECLARE_MSM_GPIO_PINS(92); +DECLARE_MSM_GPIO_PINS(93); +DECLARE_MSM_GPIO_PINS(94); +DECLARE_MSM_GPIO_PINS(95); +DECLARE_MSM_GPIO_PINS(96); +DECLARE_MSM_GPIO_PINS(97); +DECLARE_MSM_GPIO_PINS(98); +DECLARE_MSM_GPIO_PINS(99); +DECLARE_MSM_GPIO_PINS(100); +DECLARE_MSM_GPIO_PINS(101); +DECLARE_MSM_GPIO_PINS(102); +DECLARE_MSM_GPIO_PINS(103); +DECLARE_MSM_GPIO_PINS(104); +DECLARE_MSM_GPIO_PINS(105); +DECLARE_MSM_GPIO_PINS(106); +DECLARE_MSM_GPIO_PINS(107); +DECLARE_MSM_GPIO_PINS(108); +DECLARE_MSM_GPIO_PINS(109); +DECLARE_MSM_GPIO_PINS(110); +DECLARE_MSM_GPIO_PINS(111); +DECLARE_MSM_GPIO_PINS(112); +DECLARE_MSM_GPIO_PINS(113); +DECLARE_MSM_GPIO_PINS(114); +DECLARE_MSM_GPIO_PINS(115); +DECLARE_MSM_GPIO_PINS(116); +DECLARE_MSM_GPIO_PINS(117); +DECLARE_MSM_GPIO_PINS(118); +DECLARE_MSM_GPIO_PINS(119); +DECLARE_MSM_GPIO_PINS(120); +DECLARE_MSM_GPIO_PINS(121); +DECLARE_MSM_GPIO_PINS(122); +DECLARE_MSM_GPIO_PINS(123); +DECLARE_MSM_GPIO_PINS(124); +DECLARE_MSM_GPIO_PINS(125); +DECLARE_MSM_GPIO_PINS(126); +DECLARE_MSM_GPIO_PINS(127); +DECLARE_MSM_GPIO_PINS(128); +DECLARE_MSM_GPIO_PINS(129); +DECLARE_MSM_GPIO_PINS(130); +DECLARE_MSM_GPIO_PINS(131); +DECLARE_MSM_GPIO_PINS(132); +DECLARE_MSM_GPIO_PINS(133); +DECLARE_MSM_GPIO_PINS(134); +DECLARE_MSM_GPIO_PINS(135); +DECLARE_MSM_GPIO_PINS(136); +DECLARE_MSM_GPIO_PINS(137); +DECLARE_MSM_GPIO_PINS(138); +DECLARE_MSM_GPIO_PINS(139); +DECLARE_MSM_GPIO_PINS(140); +DECLARE_MSM_GPIO_PINS(141); +DECLARE_MSM_GPIO_PINS(142); +DECLARE_MSM_GPIO_PINS(143); +DECLARE_MSM_GPIO_PINS(144); +DECLARE_MSM_GPIO_PINS(145); +DECLARE_MSM_GPIO_PINS(146); +DECLARE_MSM_GPIO_PINS(147); +DECLARE_MSM_GPIO_PINS(148); +DECLARE_MSM_GPIO_PINS(149); +DECLARE_MSM_GPIO_PINS(150); +DECLARE_MSM_GPIO_PINS(151); +DECLARE_MSM_GPIO_PINS(152); +DECLARE_MSM_GPIO_PINS(153); +DECLARE_MSM_GPIO_PINS(154); +DECLARE_MSM_GPIO_PINS(155); +DECLARE_MSM_GPIO_PINS(156); +DECLARE_MSM_GPIO_PINS(157); +DECLARE_MSM_GPIO_PINS(158); +DECLARE_MSM_GPIO_PINS(159); +DECLARE_MSM_GPIO_PINS(160); +DECLARE_MSM_GPIO_PINS(161); +DECLARE_MSM_GPIO_PINS(162); +DECLARE_MSM_GPIO_PINS(163); +DECLARE_MSM_GPIO_PINS(164); +DECLARE_MSM_GPIO_PINS(165); +DECLARE_MSM_GPIO_PINS(166); +DECLARE_MSM_GPIO_PINS(167); +DECLARE_MSM_GPIO_PINS(168); +DECLARE_MSM_GPIO_PINS(169); +DECLARE_MSM_GPIO_PINS(170); +DECLARE_MSM_GPIO_PINS(171); +DECLARE_MSM_GPIO_PINS(172); +DECLARE_MSM_GPIO_PINS(173); +DECLARE_MSM_GPIO_PINS(174); +DECLARE_MSM_GPIO_PINS(175); +DECLARE_MSM_GPIO_PINS(176); +DECLARE_MSM_GPIO_PINS(177); +DECLARE_MSM_GPIO_PINS(178); +DECLARE_MSM_GPIO_PINS(179); +DECLARE_MSM_GPIO_PINS(180); +DECLARE_MSM_GPIO_PINS(181); +DECLARE_MSM_GPIO_PINS(182); +DECLARE_MSM_GPIO_PINS(183); +DECLARE_MSM_GPIO_PINS(184); +DECLARE_MSM_GPIO_PINS(185); +DECLARE_MSM_GPIO_PINS(186); +DECLARE_MSM_GPIO_PINS(187); +DECLARE_MSM_GPIO_PINS(188); +DECLARE_MSM_GPIO_PINS(189); +DECLARE_MSM_GPIO_PINS(190); +DECLARE_MSM_GPIO_PINS(191); +DECLARE_MSM_GPIO_PINS(192); +DECLARE_MSM_GPIO_PINS(193); +DECLARE_MSM_GPIO_PINS(194); +DECLARE_MSM_GPIO_PINS(195); +DECLARE_MSM_GPIO_PINS(196); +DECLARE_MSM_GPIO_PINS(197); +DECLARE_MSM_GPIO_PINS(198); +DECLARE_MSM_GPIO_PINS(199); +DECLARE_MSM_GPIO_PINS(200); +DECLARE_MSM_GPIO_PINS(201); +DECLARE_MSM_GPIO_PINS(202); +DECLARE_MSM_GPIO_PINS(203); +DECLARE_MSM_GPIO_PINS(204); +DECLARE_MSM_GPIO_PINS(205); +DECLARE_MSM_GPIO_PINS(206); +DECLARE_MSM_GPIO_PINS(207); +DECLARE_MSM_GPIO_PINS(208); +DECLARE_MSM_GPIO_PINS(209); +DECLARE_MSM_GPIO_PINS(210); 
+DECLARE_MSM_GPIO_PINS(211); +DECLARE_MSM_GPIO_PINS(212); +DECLARE_MSM_GPIO_PINS(213); +DECLARE_MSM_GPIO_PINS(214); +DECLARE_MSM_GPIO_PINS(215); +DECLARE_MSM_GPIO_PINS(216); +DECLARE_MSM_GPIO_PINS(217); +DECLARE_MSM_GPIO_PINS(218); +DECLARE_MSM_GPIO_PINS(219); +DECLARE_MSM_GPIO_PINS(220); +DECLARE_MSM_GPIO_PINS(221); +DECLARE_MSM_GPIO_PINS(222); +DECLARE_MSM_GPIO_PINS(223); +DECLARE_MSM_GPIO_PINS(224); +DECLARE_MSM_GPIO_PINS(225); +DECLARE_MSM_GPIO_PINS(226); +DECLARE_MSM_GPIO_PINS(227); +DECLARE_MSM_GPIO_PINS(228); +DECLARE_MSM_GPIO_PINS(229); +DECLARE_MSM_GPIO_PINS(230); +DECLARE_MSM_GPIO_PINS(231); +DECLARE_MSM_GPIO_PINS(232); +DECLARE_MSM_GPIO_PINS(233); +DECLARE_MSM_GPIO_PINS(234); +DECLARE_MSM_GPIO_PINS(235); +DECLARE_MSM_GPIO_PINS(236); +DECLARE_MSM_GPIO_PINS(237); +DECLARE_MSM_GPIO_PINS(238); +DECLARE_MSM_GPIO_PINS(239); +DECLARE_MSM_GPIO_PINS(240); +DECLARE_MSM_GPIO_PINS(241); +DECLARE_MSM_GPIO_PINS(242); +DECLARE_MSM_GPIO_PINS(243); +DECLARE_MSM_GPIO_PINS(244); +DECLARE_MSM_GPIO_PINS(245); +DECLARE_MSM_GPIO_PINS(246); +DECLARE_MSM_GPIO_PINS(247); +DECLARE_MSM_GPIO_PINS(248); +DECLARE_MSM_GPIO_PINS(249); + +static const unsigned int ufs_reset_pins[] = { 250 }; +static const unsigned int sdc2_clk_pins[] = { 251 }; +static const unsigned int sdc2_cmd_pins[] = { 252 }; +static const unsigned int sdc2_data_pins[] = { 253 }; + +enum glymur_functions { + msm_mux_gpio, + msm_mux_resout_gpio_n, + msm_mux_aoss_cti, + msm_mux_asc_cci, + msm_mux_atest_char, + msm_mux_atest_usb, + msm_mux_audio_ext_mclk0, + msm_mux_audio_ext_mclk1, + msm_mux_audio_ref_clk, + msm_mux_cam_asc_mclk4, + msm_mux_cam_mclk, + msm_mux_cci_async_in, + msm_mux_cci_i2c_scl, + msm_mux_cci_i2c_sda, + msm_mux_cci_timer, + msm_mux_cmu_rng, + msm_mux_cri_trng, + msm_mux_dbg_out_clk, + msm_mux_ddr_bist_complete, + msm_mux_ddr_bist_fail, + msm_mux_ddr_bist_start, + msm_mux_ddr_bist_stop, + msm_mux_ddr_pxi, + msm_mux_edp0_hot, + msm_mux_edp0_lcd, + msm_mux_edp1_lcd, + msm_mux_egpio, + msm_mux_eusb_ac_en, + msm_mux_gcc_gp1, + msm_mux_gcc_gp2, + msm_mux_gcc_gp3, + msm_mux_host2wlan_sol, + msm_mux_i2c0_s_scl, + msm_mux_i2c0_s_sda, + msm_mux_i2s0_data, + msm_mux_i2s0_sck, + msm_mux_i2s0_ws, + msm_mux_i2s1_data, + msm_mux_i2s1_sck, + msm_mux_i2s1_ws, + msm_mux_ibi_i3c, + msm_mux_jitter_bist, + msm_mux_mdp_vsync_out, + msm_mux_mdp_vsync_e, + msm_mux_mdp_vsync_p, + msm_mux_mdp_vsync_s, + msm_mux_pcie3a_clk, + msm_mux_pcie3a_rst_n, + msm_mux_pcie3b_clk, + msm_mux_pcie4_clk_req_n, + msm_mux_pcie5_clk_req_n, + msm_mux_pcie6_clk_req_n, + msm_mux_phase_flag, + msm_mux_pll_bist_sync, + msm_mux_pll_clk_aux, + msm_mux_pmc_oca_n, + msm_mux_pmc_uva_n, + msm_mux_prng_rosc, + msm_mux_qdss_cti, + msm_mux_qdss_gpio, + msm_mux_qspi0, + msm_mux_qup0_se0, + msm_mux_qup0_se1, + msm_mux_qup0_se2, + msm_mux_qup0_se3, + msm_mux_qup0_se4, + msm_mux_qup0_se5, + msm_mux_qup0_se6, + msm_mux_qup0_se7, + msm_mux_qup1_se0, + msm_mux_qup1_se1, + msm_mux_qup1_se2, + msm_mux_qup1_se3, + msm_mux_qup1_se4, + msm_mux_qup1_se5, + msm_mux_qup1_se6, + msm_mux_qup1_se7, + msm_mux_qup2_se0, + msm_mux_qup2_se1, + msm_mux_qup2_se2, + msm_mux_qup2_se3, + msm_mux_qup2_se4, + msm_mux_qup2_se5, + msm_mux_qup2_se6, + msm_mux_qup2_se7, + msm_mux_qup3_se0, + msm_mux_qup3_se1, + msm_mux_sd_write_protect, + msm_mux_sdc4_clk, + msm_mux_sdc4_cmd, + msm_mux_sdc4_data, + msm_mux_smb_acok_n, + msm_mux_sys_throttle, + msm_mux_tb_trig_sdc2, + msm_mux_tb_trig_sdc4, + msm_mux_tmess_prng, + msm_mux_tsense_pwm, + msm_mux_tsense_therm, + msm_mux_usb0_dp, + msm_mux_usb0_phy_ps, + 
msm_mux_usb0_sbrx, + msm_mux_usb0_sbtx, + msm_mux_usb0_tmu, + msm_mux_usb1_dbg, + msm_mux_usb1_dp, + msm_mux_usb1_phy_ps, + msm_mux_usb1_sbrx, + msm_mux_usb1_sbtx, + msm_mux_usb1_tmu, + msm_mux_usb2_dp, + msm_mux_usb2_phy_ps, + msm_mux_usb2_sbrx, + msm_mux_usb2_sbtx, + msm_mux_usb2_tmu, + msm_mux_vsense_trigger_mirnat, + msm_mux_wcn_sw, + msm_mux_wcn_sw_ctrl, + msm_mux__, +}; + +static const char *const gpio_groups[] = { + "gpio0", "gpio1", "gpio2", "gpio3", "gpio4", "gpio5", + "gpio6", "gpio7", "gpio8", "gpio9", "gpio10", "gpio11", + "gpio12", "gpio13", "gpio14", "gpio15", "gpio16", "gpio17", + "gpio18", "gpio19", "gpio20", "gpio21", "gpio22", "gpio23", + "gpio24", "gpio25", "gpio26", "gpio27", "gpio28", "gpio29", + "gpio30", "gpio31", "gpio32", "gpio33", "gpio34", "gpio35", + "gpio36", "gpio37", "gpio38", "gpio39", "gpio40", "gpio41", + "gpio42", "gpio43", "gpio44", "gpio45", "gpio46", "gpio47", + "gpio48", "gpio49", "gpio50", "gpio51", "gpio52", "gpio53", + "gpio54", "gpio55", "gpio56", "gpio57", "gpio58", "gpio59", + "gpio60", "gpio61", "gpio62", "gpio63", "gpio64", "gpio65", + "gpio66", "gpio67", "gpio68", "gpio69", "gpio70", "gpio71", + "gpio72", "gpio73", "gpio74", "gpio75", "gpio76", "gpio77", + "gpio78", "gpio79", "gpio80", "gpio81", "gpio82", "gpio83", + "gpio84", "gpio85", "gpio86", "gpio87", "gpio88", "gpio89", + "gpio90", "gpio91", "gpio92", "gpio93", "gpio94", "gpio95", + "gpio96", "gpio97", "gpio98", "gpio99", "gpio100", "gpio101", + "gpio102", "gpio103", "gpio104", "gpio105", "gpio106", "gpio107", + "gpio108", "gpio109", "gpio110", "gpio111", "gpio112", "gpio113", + "gpio114", "gpio115", "gpio116", "gpio117", "gpio118", "gpio119", + "gpio120", "gpio121", "gpio122", "gpio123", "gpio124", "gpio125", + "gpio126", "gpio127", "gpio128", "gpio129", "gpio130", "gpio131", + "gpio132", "gpio133", "gpio134", "gpio135", "gpio136", "gpio137", + "gpio138", "gpio139", "gpio140", "gpio141", "gpio142", "gpio143", + "gpio144", "gpio145", "gpio146", "gpio147", "gpio148", "gpio149", + "gpio150", "gpio151", "gpio152", "gpio153", "gpio154", "gpio155", + "gpio156", "gpio157", "gpio158", "gpio159", "gpio160", "gpio161", + "gpio162", "gpio163", "gpio164", "gpio165", "gpio166", "gpio167", + "gpio168", "gpio169", "gpio170", "gpio171", "gpio172", "gpio173", + "gpio174", "gpio175", "gpio176", "gpio177", "gpio178", "gpio179", + "gpio180", "gpio181", "gpio182", "gpio183", "gpio184", "gpio185", + "gpio186", "gpio187", "gpio188", "gpio189", "gpio190", "gpio191", + "gpio192", "gpio193", "gpio194", "gpio195", "gpio196", "gpio197", + "gpio198", "gpio199", "gpio200", "gpio201", "gpio202", "gpio203", + "gpio204", "gpio205", "gpio206", "gpio207", "gpio208", "gpio209", + "gpio210", "gpio211", "gpio212", "gpio213", "gpio214", "gpio215", + "gpio216", "gpio217", "gpio218", "gpio219", "gpio220", "gpio221", + "gpio222", "gpio223", "gpio224", "gpio225", "gpio226", "gpio227", + "gpio228", "gpio229", "gpio230", "gpio231", "gpio232", "gpio233", + "gpio234", "gpio235", "gpio236", "gpio237", "gpio238", "gpio239", + "gpio240", "gpio241", "gpio242", "gpio243", "gpio244", "gpio245", + "gpio246", "gpio247", "gpio248", "gpio249", +}; + +static const char *const resout_gpio_n_groups[] = { + "gpio160", +}; + +static const char *const aoss_cti_groups[] = { + "gpio60", + "gpio61", + "gpio62", + "gpio63", +}; + +static const char *const asc_cci_groups[] = { + "gpio235", + "gpio236", +}; + +static const char *const atest_char_groups[] = { + "gpio172", "gpio184", "gpio188", "gpio164", + "gpio163", +}; + +static const char *const 
atest_usb_groups[] = { + "gpio39", "gpio40", "gpio41", "gpio38", + "gpio44", "gpio45", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", "gpio48", + "gpio54", "gpio55", "gpio52", "gpio53", + "gpio65", "gpio66", "gpio46", "gpio47", + "gpio72", "gpio73", "gpio80", "gpio81", +}; + +static const char *const audio_ext_mclk0_groups[] = { + "gpio134", +}; + +static const char *const audio_ext_mclk1_groups[] = { + "gpio142", +}; + +static const char *const audio_ref_clk_groups[] = { + "gpio142", +}; + +static const char *const cam_asc_mclk4_groups[] = { + "gpio100", +}; + +static const char *const cam_mclk_groups[] = { + "gpio96", + "gpio97", + "gpio98", + "gpio99", +}; + +static const char *const cci_async_in_groups[] = { + "gpio113", "gpio112", "gpio111", +}; + +static const char *const cci_i2c_scl_groups[] = { + "gpio102", "gpio104", "gpio106", +}; + +static const char *const cci_i2c_sda_groups[] = { + "gpio101", "gpio103", "gpio105", +}; + +static const char *const cci_timer_groups[] = { + "gpio109", "gpio110", "gpio111", "gpio112", + "gpio113", +}; + +static const char *const cmu_rng_groups[] = { + "gpio48", "gpio47", "gpio46", "gpio45", +}; + +static const char *const cri_trng_groups[] = { + "gpio173", +}; + +static const char *const dbg_out_clk_groups[] = { + "gpio51", +}; + +static const char *const ddr_bist_complete_groups[] = { + "gpio57", +}; + +static const char *const ddr_bist_fail_groups[] = { + "gpio56", +}; + +static const char *const ddr_bist_start_groups[] = { + "gpio54", +}; + +static const char *const ddr_bist_stop_groups[] = { + "gpio55", +}; + +static const char *const ddr_pxi_groups[] = { + "gpio38", "gpio39", "gpio40", "gpio41", + "gpio72", "gpio73", "gpio80", "gpio81", + "gpio42", "gpio43", "gpio44", "gpio45", + "gpio46", "gpio47", "gpio48", "gpio49", + "gpio50", "gpio51", "gpio52", "gpio53", + "gpio54", "gpio55", "gpio65", "gpio66", +}; + +static const char *const edp0_hot_groups[] = { + "gpio119", +}; + +static const char *const edp0_lcd_groups[] = { + "gpio120", +}; + +static const char *const edp1_lcd_groups[] = { + "gpio115", + "gpio119", +}; + +static const char *const egpio_groups[] = { + "gpio192", "gpio193", "gpio194", "gpio195", "gpio196", "gpio197", + "gpio198", "gpio199", "gpio200", "gpio201", "gpio202", "gpio203", + "gpio204", "gpio205", "gpio206", "gpio207", "gpio208", "gpio209", + "gpio210", "gpio211", "gpio212", "gpio213", "gpio214", "gpio215", + "gpio216", "gpio217", "gpio218", "gpio219", "gpio220", "gpio221", + "gpio222", "gpio223", "gpio224", "gpio225", "gpio226", "gpio227", + "gpio228", "gpio229", "gpio230", "gpio231", "gpio232", "gpio233", + "gpio234", "gpio235", "gpio236", "gpio237", "gpio238", "gpio239", + "gpio240", "gpio241", "gpio242", "gpio243", "gpio244", +}; + +static const char *const eusb_ac_en_groups[] = { + "gpio168", "gpio177", "gpio186", "gpio69", + "gpio187", "gpio178", +}; + +static const char *const gcc_gp1_groups[] = { + "gpio71", + "gpio72", +}; + +static const char *const gcc_gp2_groups[] = { + "gpio64", + "gpio73", +}; + +static const char *const gcc_gp3_groups[] = { + "gpio74", + "gpio82", +}; + +static const char *const host2wlan_sol_groups[] = { + "gpio118", +}; + +static const char *const i2c0_s_scl_groups[] = { + "gpio7", +}; + +static const char *const i2c0_s_sda_groups[] = { + "gpio6", +}; + +static const char *const i2s0_data_groups[] = { + "gpio136", "gpio137", +}; + +static const char *const i2s0_sck_groups[] = { + "gpio135", +}; + +static const char *const i2s0_ws_groups[] = { + "gpio138", +}; + +static const char 
*const i2s1_data_groups[] = { + "gpio140", "gpio142", +}; + +static const char *const i2s1_sck_groups[] = { + "gpio139", +}; + +static const char *const i2s1_ws_groups[] = { + "gpio141", +}; + +static const char *const ibi_i3c_groups[] = { + "gpio0", "gpio1", "gpio4", "gpio5", "gpio32", "gpio33", + "gpio36", "gpio37", "gpio64", "gpio65", "gpio68", "gpio69", +}; + +static const char *const jitter_bist_groups[] = { + "gpio52", +}; + +static const char *const mdp_vsync_out_groups[] = { + "gpio114", "gpio114", "gpio115", "gpio115", + "gpio109", "gpio110", "gpio111", "gpio112", + "gpio113", +}; + +static const char *const mdp_vsync_e_groups[] = { + "gpio106", +}; + +static const char *const mdp_vsync_p_groups[] = { + "gpio98", +}; + +static const char *const mdp_vsync_s_groups[] = { + "gpio105", +}; + +static const char *const pcie3a_clk_groups[] = { + "gpio144", +}; + +static const char *const pcie3a_rst_n_groups[] = { + "gpio143", +}; + +static const char *const pcie3b_clk_groups[] = { + "gpio156", +}; + +static const char *const pcie4_clk_req_n_groups[] = { + "gpio147", +}; + +static const char *const pcie5_clk_req_n_groups[] = { + "gpio153", +}; + +static const char *const pcie6_clk_req_n_groups[] = { + "gpio150", +}; + +static const char *const phase_flag_groups[] = { + "gpio6", "gpio7", "gpio16", "gpio17", + "gpio18", "gpio19", "gpio20", "gpio21", + "gpio22", "gpio23", "gpio24", "gpio25", + "gpio8", "gpio26", "gpio27", "gpio163", + "gpio164", "gpio188", "gpio184", "gpio172", + "gpio186", "gpio173", "gpio76", "gpio9", + "gpio77", "gpio78", "gpio10", "gpio11", + "gpio12", "gpio13", "gpio14", "gpio15", +}; + +static const char *const pll_bist_sync_groups[] = { + "gpio28", +}; + +static const char *const pll_clk_aux_groups[] = { + "gpio35", +}; + +static const char *const pmc_oca_n_groups[] = { + "gpio249", +}; + +static const char *const pmc_uva_n_groups[] = { + "gpio248", +}; + +static const char *const prng_rosc_groups[] = { + "gpio186", "gpio188", "gpio164", "gpio163", +}; + +static const char *const qdss_cti_groups[] = { + "gpio18", "gpio19", "gpio23", "gpio27", + "gpio161", "gpio162", "gpio215", "gpio217", +}; + +static const char *const qdss_gpio_groups[] = { + "gpio104", "gpio151", "gpio227", "gpio228", + "gpio96", "gpio219", "gpio97", "gpio220", + "gpio108", "gpio231", "gpio109", "gpio232", + "gpio110", "gpio233", "gpio111", "gpio234", + "gpio112", "gpio235", "gpio113", "gpio236", + "gpio149", "gpio221", "gpio99", "gpio222", + "gpio100", "gpio223", "gpio101", "gpio224", + "gpio102", "gpio225", "gpio103", "gpio226", + "gpio152", "gpio237", "gpio107", "gpio238", +}; + +static const char *const qspi0_groups[] = { + "gpio127", "gpio132", "gpio133", "gpio128", + "gpio129", "gpio130", "gpio131", +}; + +static const char *const qup0_se0_groups[] = { + "gpio0", "gpio1", "gpio2", "gpio3", +}; + +static const char *const qup0_se1_groups[] = { + "gpio4", "gpio5", "gpio6", "gpio7", +}; + +static const char *const qup0_se2_groups[] = { + "gpio8", "gpio9", "gpio10", "gpio11", + "gpio17", "gpio18", "gpio19", +}; + +static const char *const qup0_se3_groups[] = { + "gpio12", "gpio13", "gpio14", "gpio15", + "gpio21", "gpio22", "gpio23", +}; + +static const char *const qup0_se4_groups[] = { + "gpio16", "gpio17", "gpio18", "gpio19", +}; + +static const char *const qup0_se5_groups[] = { + "gpio20", "gpio21", "gpio22", "gpio23", +}; + +static const char *const qup0_se6_groups[] = { + "gpio6", "gpio7", "gpio4", "gpio5", +}; + +static const char *const qup0_se7_groups[] = { + "gpio14", "gpio15", "gpio12", 
"gpio13", +}; + +static const char *const qup1_se0_groups[] = { + "gpio32", "gpio33", "gpio34", "gpio35", +}; + +static const char *const qup1_se1_groups[] = { + "gpio36", "gpio37", "gpio38", "gpio39", +}; + +static const char *const qup1_se2_groups[] = { + "gpio40", "gpio41", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", +}; + +static const char *const qup1_se3_groups[] = { + "gpio44", "gpio45", "gpio46", "gpio47", + "gpio33", "gpio34", "gpio35", +}; + +static const char *const qup1_se4_groups[] = { + "gpio48", "gpio49", "gpio50", "gpio51", +}; + +static const char *const qup1_se5_groups[] = { + "gpio52", "gpio53", "gpio54", "gpio55", +}; + +static const char *const qup1_se6_groups[] = { + "gpio56", "gpio57", "gpio58", "gpio59", +}; + +static const char *const qup1_se7_groups[] = { + "gpio54", "gpio55", "gpio52", "gpio53", +}; + +static const char *const qup2_se0_groups[] = { + "gpio64", "gpio65", "gpio66", "gpio67", +}; + +static const char *const qup2_se1_groups[] = { + "gpio68", "gpio69", "gpio70", "gpio71", +}; + +static const char *const qup2_se2_groups[] = { + "gpio72", "gpio73", "gpio74", "gpio75", + "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup2_se3_groups[] = { + "gpio76", "gpio77", "gpio78", "gpio79", + "gpio65", "gpio66", "gpio67", +}; + +static const char *const qup2_se4_groups[] = { + "gpio80", "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup2_se5_groups[] = { + "gpio84", "gpio85", "gpio86", "gpio87", +}; + +static const char *const qup2_se6_groups[] = { + "gpio88", "gpio89", "gpio90", "gpio91", +}; + +static const char *const qup2_se7_groups[] = { + "gpio80", "gpio81", "gpio82", "gpio83", +}; + +static const char *const qup3_se0_groups[] = { + "gpio128", "gpio129", "gpio127", "gpio132", + "gpio130", "gpio131", "gpio133", "gpio247", +}; + +static const char *const qup3_se1_groups[] = { + "gpio40", "gpio41", "gpio42", "gpio43", + "gpio49", "gpio50", "gpio51", "gpio48", +}; + +static const char *const sd_write_protect_groups[] = { + "gpio162", +}; + +static const char *const sdc4_clk_groups[] = { + "gpio127", +}; + +static const char *const sdc4_cmd_groups[] = { + "gpio132", +}; + +static const char *const sdc4_data_groups[] = { + "gpio128", + "gpio129", + "gpio130", + "gpio131", +}; + +static const char *const smb_acok_n_groups[] = { + "gpio245", +}; + +static const char *const sys_throttle_groups[] = { + "gpio39", + "gpio94", +}; + +static const char *const tb_trig_sdc2_groups[] = { + "gpio137", +}; + +static const char *const tb_trig_sdc4_groups[] = { + "gpio133", +}; + +static const char *const tmess_prng_groups[] = { + "gpio92", "gpio93", "gpio94", "gpio95", +}; + +static const char *const tsense_pwm_groups[] = { + "gpio28", "gpio29", "gpio30", "gpio31", + "gpio34", "gpio138", "gpio139", "gpio140", +}; + +static const char *const tsense_therm_groups[] = { + "gpio141", +}; + +static const char *const usb0_dp_groups[] = { + "gpio122", +}; + +static const char *const usb0_phy_ps_groups[] = { + "gpio121", +}; + +static const char *const usb0_sbrx_groups[] = { + "gpio163", +}; + +static const char *const usb0_sbtx_groups[] = { + "gpio164", + "gpio165", +}; + +static const char *const usb0_tmu_groups[] = { + "gpio98", +}; + +static const char *const usb1_dbg_groups[] = { + "gpio105", + "gpio106", +}; + +static const char *const usb1_dp_groups[] = { + "gpio124", +}; + +static const char *const usb1_phy_ps_groups[] = { + "gpio123", +}; + +static const char *const usb1_sbrx_groups[] = { + "gpio172", +}; + +static const char *const 
usb1_sbtx_groups[] = { + "gpio173", + "gpio174", +}; + +static const char *const usb1_tmu_groups[] = { + "gpio98", +}; + +static const char *const usb2_dp_groups[] = { + "gpio126", +}; + +static const char *const usb2_phy_ps_groups[] = { + "gpio125", +}; + +static const char *const usb2_sbrx_groups[] = { + "gpio181", +}; + +static const char *const usb2_sbtx_groups[] = { + "gpio182", + "gpio183", +}; + +static const char *const usb2_tmu_groups[] = { + "gpio98", +}; + +static const char *const vsense_trigger_mirnat_groups[] = { + "gpio38", +}; + +static const char *const wcn_sw_groups[] = { + "gpio221", +}; + +static const char *const wcn_sw_ctrl_groups[] = { + "gpio214", +}; + +static const struct pinfunction glymur_functions[] = { + MSM_PIN_FUNCTION(gpio), + MSM_PIN_FUNCTION(resout_gpio_n), + MSM_PIN_FUNCTION(aoss_cti), + MSM_PIN_FUNCTION(asc_cci), + MSM_PIN_FUNCTION(atest_char), + MSM_PIN_FUNCTION(atest_usb), + MSM_PIN_FUNCTION(audio_ext_mclk0), + MSM_PIN_FUNCTION(audio_ext_mclk1), + MSM_PIN_FUNCTION(audio_ref_clk), + MSM_PIN_FUNCTION(cam_asc_mclk4), + MSM_PIN_FUNCTION(cam_mclk), + MSM_PIN_FUNCTION(cci_async_in), + MSM_PIN_FUNCTION(cci_i2c_scl), + MSM_PIN_FUNCTION(cci_i2c_sda), + MSM_PIN_FUNCTION(cci_timer), + MSM_PIN_FUNCTION(cmu_rng), + MSM_PIN_FUNCTION(cri_trng), + MSM_PIN_FUNCTION(dbg_out_clk), + MSM_PIN_FUNCTION(ddr_bist_complete), + MSM_PIN_FUNCTION(ddr_bist_fail), + MSM_PIN_FUNCTION(ddr_bist_start), + MSM_PIN_FUNCTION(ddr_bist_stop), + MSM_PIN_FUNCTION(ddr_pxi), + MSM_PIN_FUNCTION(edp0_hot), + MSM_PIN_FUNCTION(edp0_lcd), + MSM_PIN_FUNCTION(edp1_lcd), + MSM_PIN_FUNCTION(egpio), + MSM_PIN_FUNCTION(eusb_ac_en), + MSM_PIN_FUNCTION(gcc_gp1), + MSM_PIN_FUNCTION(gcc_gp2), + MSM_PIN_FUNCTION(gcc_gp3), + MSM_PIN_FUNCTION(host2wlan_sol), + MSM_PIN_FUNCTION(i2c0_s_scl), + MSM_PIN_FUNCTION(i2c0_s_sda), + MSM_PIN_FUNCTION(i2s0_data), + MSM_PIN_FUNCTION(i2s0_sck), + MSM_PIN_FUNCTION(i2s0_ws), + MSM_PIN_FUNCTION(i2s1_data), + MSM_PIN_FUNCTION(i2s1_sck), + MSM_PIN_FUNCTION(i2s1_ws), + MSM_PIN_FUNCTION(ibi_i3c), + MSM_PIN_FUNCTION(jitter_bist), + MSM_PIN_FUNCTION(mdp_vsync_out), + MSM_PIN_FUNCTION(mdp_vsync_e), + MSM_PIN_FUNCTION(mdp_vsync_p), + MSM_PIN_FUNCTION(mdp_vsync_s), + MSM_PIN_FUNCTION(pcie3a_clk), + MSM_PIN_FUNCTION(pcie3a_rst_n), + MSM_PIN_FUNCTION(pcie3b_clk), + MSM_PIN_FUNCTION(pcie4_clk_req_n), + MSM_PIN_FUNCTION(pcie5_clk_req_n), + MSM_PIN_FUNCTION(pcie6_clk_req_n), + MSM_PIN_FUNCTION(phase_flag), + MSM_PIN_FUNCTION(pll_bist_sync), + MSM_PIN_FUNCTION(pll_clk_aux), + MSM_PIN_FUNCTION(pmc_oca_n), + MSM_PIN_FUNCTION(pmc_uva_n), + MSM_PIN_FUNCTION(prng_rosc), + MSM_PIN_FUNCTION(qdss_cti), + MSM_PIN_FUNCTION(qdss_gpio), + MSM_PIN_FUNCTION(qspi0), + MSM_PIN_FUNCTION(qup0_se0), + MSM_PIN_FUNCTION(qup0_se1), + MSM_PIN_FUNCTION(qup0_se2), + MSM_PIN_FUNCTION(qup0_se3), + MSM_PIN_FUNCTION(qup0_se4), + MSM_PIN_FUNCTION(qup0_se5), + MSM_PIN_FUNCTION(qup0_se6), + MSM_PIN_FUNCTION(qup0_se7), + MSM_PIN_FUNCTION(qup1_se0), + MSM_PIN_FUNCTION(qup1_se1), + MSM_PIN_FUNCTION(qup1_se2), + MSM_PIN_FUNCTION(qup1_se3), + MSM_PIN_FUNCTION(qup1_se4), + MSM_PIN_FUNCTION(qup1_se5), + MSM_PIN_FUNCTION(qup1_se6), + MSM_PIN_FUNCTION(qup1_se7), + MSM_PIN_FUNCTION(qup2_se0), + MSM_PIN_FUNCTION(qup2_se1), + MSM_PIN_FUNCTION(qup2_se2), + MSM_PIN_FUNCTION(qup2_se3), + MSM_PIN_FUNCTION(qup2_se4), + MSM_PIN_FUNCTION(qup2_se5), + MSM_PIN_FUNCTION(qup2_se6), + MSM_PIN_FUNCTION(qup2_se7), + MSM_PIN_FUNCTION(qup3_se0), + MSM_PIN_FUNCTION(qup3_se1), + MSM_PIN_FUNCTION(sd_write_protect), + MSM_PIN_FUNCTION(sdc4_clk), + 
MSM_PIN_FUNCTION(sdc4_cmd), + MSM_PIN_FUNCTION(sdc4_data), + MSM_PIN_FUNCTION(smb_acok_n), + MSM_PIN_FUNCTION(sys_throttle), + MSM_PIN_FUNCTION(tb_trig_sdc2), + MSM_PIN_FUNCTION(tb_trig_sdc4), + MSM_PIN_FUNCTION(tmess_prng), + MSM_PIN_FUNCTION(tsense_pwm), + MSM_PIN_FUNCTION(tsense_therm), + MSM_PIN_FUNCTION(usb0_dp), + MSM_PIN_FUNCTION(usb0_phy_ps), + MSM_PIN_FUNCTION(usb0_sbrx), + MSM_PIN_FUNCTION(usb0_sbtx), + MSM_PIN_FUNCTION(usb0_tmu), + MSM_PIN_FUNCTION(usb1_dbg), + MSM_PIN_FUNCTION(usb1_dp), + MSM_PIN_FUNCTION(usb1_phy_ps), + MSM_PIN_FUNCTION(usb1_sbrx), + MSM_PIN_FUNCTION(usb1_sbtx), + MSM_PIN_FUNCTION(usb1_tmu), + MSM_PIN_FUNCTION(usb2_dp), + MSM_PIN_FUNCTION(usb2_phy_ps), + MSM_PIN_FUNCTION(usb2_sbrx), + MSM_PIN_FUNCTION(usb2_sbtx), + MSM_PIN_FUNCTION(usb2_tmu), + MSM_PIN_FUNCTION(vsense_trigger_mirnat), + MSM_PIN_FUNCTION(wcn_sw), + MSM_PIN_FUNCTION(wcn_sw_ctrl), +}; + +static const struct msm_pingroup glymur_groups[] = { + [0] = PINGROUP(0, qup0_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [1] = PINGROUP(1, qup0_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [2] = PINGROUP(2, qup0_se0, _, _, _, _, _, _, _, _, _, _), + [3] = PINGROUP(3, qup0_se0, _, _, _, _, _, _, _, _, _, _), + [4] = PINGROUP(4, qup0_se1, qup0_se6, ibi_i3c, _, _, _, _, _, _, _, _), + [5] = PINGROUP(5, qup0_se1, qup0_se6, ibi_i3c, _, _, _, _, _, _, _, _), + [6] = PINGROUP(6, qup0_se1, qup0_se6, i2c0_s_sda, phase_flag, _, _, _, _, _, _, _), + [7] = PINGROUP(7, qup0_se1, qup0_se6, i2c0_s_scl, phase_flag, _, _, _, _, _, _, _), + [8] = PINGROUP(8, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [9] = PINGROUP(9, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [10] = PINGROUP(10, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [11] = PINGROUP(11, qup0_se2, phase_flag, _, _, _, _, _, _, _, _, _), + [12] = PINGROUP(12, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [13] = PINGROUP(13, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [14] = PINGROUP(14, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [15] = PINGROUP(15, qup0_se3, qup0_se7, phase_flag, _, _, _, _, _, _, _, _), + [16] = PINGROUP(16, qup0_se4, phase_flag, _, _, _, _, _, _, _, _, _), + [17] = PINGROUP(17, qup0_se4, qup0_se2, phase_flag, _, _, _, _, _, _, _, _), + [18] = PINGROUP(18, qup0_se4, qup0_se2, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [19] = PINGROUP(19, qup0_se4, qup0_se2, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [20] = PINGROUP(20, qup0_se5, _, phase_flag, _, _, _, _, _, _, _, _), + [21] = PINGROUP(21, qup0_se5, qup0_se3, _, phase_flag, _, _, _, _, _, _, _), + [22] = PINGROUP(22, qup0_se5, qup0_se3, _, phase_flag, _, _, _, _, _, _, _), + [23] = PINGROUP(23, qup0_se5, qup0_se3, phase_flag, _, qdss_cti, _, _, _, _, _, _), + [24] = PINGROUP(24, phase_flag, _, _, _, _, _, _, _, _, _, _), + [25] = PINGROUP(25, phase_flag, _, _, _, _, _, _, _, _, _, _), + [26] = PINGROUP(26, phase_flag, _, _, _, _, _, _, _, _, _, _), + [27] = PINGROUP(27, phase_flag, _, qdss_cti, _, _, _, _, _, _, _, _), + [28] = PINGROUP(28, pll_bist_sync, tsense_pwm, _, _, _, _, _, _, _, _, _), + [29] = PINGROUP(29, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [30] = PINGROUP(30, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [31] = PINGROUP(31, tsense_pwm, _, _, _, _, _, _, _, _, _, _), + [32] = PINGROUP(32, qup1_se0, ibi_i3c, _, _, _, _, _, _, _, _, _), + [33] = PINGROUP(33, qup1_se0, ibi_i3c, qup1_se3, _, _, _, _, _, _, _, _), + [34] = PINGROUP(34, qup1_se0, qup1_se3, tsense_pwm, _, _, _, _, _, _, _, _), + [35] = PINGROUP(35, 
qup1_se0, qup1_se3, pll_clk_aux, _, _, _, _, _, _, _, _), + [36] = PINGROUP(36, qup1_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [37] = PINGROUP(37, qup1_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [38] = PINGROUP(38, qup1_se1, atest_usb, ddr_pxi, vsense_trigger_mirnat, _, _, _, _, + _, _, _), + [39] = PINGROUP(39, qup1_se1, sys_throttle, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [40] = PINGROUP(40, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [41] = PINGROUP(41, qup1_se2, qup3_se1, qup3_se0, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [42] = PINGROUP(42, qup1_se2, qup3_se1, qup0_se1, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [43] = PINGROUP(43, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [44] = PINGROUP(44, qup1_se3, _, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [45] = PINGROUP(45, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [46] = PINGROUP(46, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [47] = PINGROUP(47, qup1_se3, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [48] = PINGROUP(48, qup1_se4, qup3_se1, cmu_rng, _, atest_usb, ddr_pxi, _, _, _, + _, _), + [49] = PINGROUP(49, qup1_se4, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, + _, _, _), + [50] = PINGROUP(50, qup1_se4, qup1_se2, qup3_se1, _, atest_usb, ddr_pxi, _, _, + _, _, _), + [51] = PINGROUP(51, qup1_se4, qup1_se2, qup3_se1, dbg_out_clk, atest_usb, + ddr_pxi, _, _, _, _, _), + [52] = PINGROUP(52, qup1_se5, qup1_se7, jitter_bist, atest_usb, ddr_pxi, _, _, _, + _, _, _), + [53] = PINGROUP(53, qup1_se5, qup1_se7, _, atest_usb, ddr_pxi, _, _, _, _, _, _), + [54] = PINGROUP(54, qup1_se5, qup1_se7, ddr_bist_start, atest_usb, ddr_pxi, _, _, + _, _, _, _), + [55] = PINGROUP(55, qup1_se5, qup1_se7, ddr_bist_stop, atest_usb, ddr_pxi, _, _, + _, _, _, _), + [56] = PINGROUP(56, qup1_se6, ddr_bist_fail, _, _, _, _, _, _, _, _, _), + [57] = PINGROUP(57, qup1_se6, ddr_bist_complete, _, _, _, _, _, _, _, _, _), + [58] = PINGROUP(58, qup1_se6, _, _, _, _, _, _, _, _, _, _), + [59] = PINGROUP(59, qup1_se6, _, _, _, _, _, _, _, _, _, _), + [60] = PINGROUP(60, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [61] = PINGROUP(61, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [62] = PINGROUP(62, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [63] = PINGROUP(63, aoss_cti, _, _, _, _, _, _, _, _, _, _), + [64] = PINGROUP(64, qup2_se0, ibi_i3c, gcc_gp2, _, _, _, _, _, _, _, _), + [65] = PINGROUP(65, qup2_se0, qup2_se3, ibi_i3c, atest_usb, ddr_pxi, _, _, _, _, + _, _), + [66] = PINGROUP(66, qup2_se0, qup2_se3, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [67] = PINGROUP(67, qup2_se0, qup2_se3, _, _, _, _, _, _, _, _, _), + [68] = PINGROUP(68, qup2_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [69] = PINGROUP(69, qup2_se1, ibi_i3c, _, _, _, _, _, _, _, _, _), + [70] = PINGROUP(70, qup2_se1, _, _, _, _, _, _, _, _, _, _), + [71] = PINGROUP(71, qup2_se1, gcc_gp1, _, _, _, _, _, _, _, _, _), + [72] = PINGROUP(72, qup2_se2, gcc_gp1, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [73] = PINGROUP(73, qup2_se2, gcc_gp2, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [74] = PINGROUP(74, qup2_se2, gcc_gp3, _, _, _, _, _, _, _, _, _), + [75] = PINGROUP(75, qup2_se2, _, _, _, _, _, _, _, _, _, _), + [76] = PINGROUP(76, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [77] = PINGROUP(77, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [78] = PINGROUP(78, qup2_se3, phase_flag, _, _, _, _, _, _, _, _, _), + [79] = PINGROUP(79, qup2_se3, _, _, _, _, _, _, _, _, _, _), + [80] = PINGROUP(80, qup2_se4, 
qup2_se7, atest_usb, ddr_pxi, _, _, _, _, _, _, _), + [81] = PINGROUP(81, qup2_se4, qup2_se2, qup2_se7, atest_usb, ddr_pxi, _, _, _, + _, _, _), + [82] = PINGROUP(82, qup2_se4, qup2_se2, qup2_se7, gcc_gp3, _, _, _, _, _, _, _), + [83] = PINGROUP(83, qup2_se4, qup2_se2, qup2_se7, _, _, _, _, _, _, _, _), + [84] = PINGROUP(84, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [85] = PINGROUP(85, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [86] = PINGROUP(86, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [87] = PINGROUP(87, qup2_se5, _, _, _, _, _, _, _, _, _, _), + [88] = PINGROUP(88, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [89] = PINGROUP(89, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [90] = PINGROUP(90, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [91] = PINGROUP(91, qup2_se6, _, _, _, _, _, _, _, _, _, _), + [92] = PINGROUP(92, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [93] = PINGROUP(93, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [94] = PINGROUP(94, sys_throttle, tmess_prng, _, _, _, _, _, _, _, _, _), + [95] = PINGROUP(95, tmess_prng, _, _, _, _, _, _, _, _, _, _), + [96] = PINGROUP(96, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [97] = PINGROUP(97, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [98] = PINGROUP(98, cam_mclk, mdp_vsync_p, usb0_tmu, usb1_tmu, usb2_tmu, _, _, _, _, _, _), + [99] = PINGROUP(99, cam_mclk, qdss_gpio, _, _, _, _, _, _, _, _, _), + [100] = PINGROUP(100, cam_asc_mclk4, qdss_gpio, _, _, _, _, _, _, _, _, _), + [101] = PINGROUP(101, cci_i2c_sda, qdss_gpio, _, _, _, _, _, _, _, _, _), + [102] = PINGROUP(102, cci_i2c_scl, qdss_gpio, _, _, _, _, _, _, _, _, _), + [103] = PINGROUP(103, cci_i2c_sda, qdss_gpio, _, _, _, _, _, _, _, _, _), + [104] = PINGROUP(104, cci_i2c_scl, qdss_gpio, _, _, _, _, _, _, _, _, _), + [105] = PINGROUP(105, cci_i2c_sda, mdp_vsync_s, usb1_dbg, _, _, _, _, _, _, _, _), + [106] = PINGROUP(106, cci_i2c_scl, mdp_vsync_e, usb1_dbg, _, _, _, _, _, _, _, _), + [107] = PINGROUP(107, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [108] = PINGROUP(108, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [109] = PINGROUP(109, cci_timer, mdp_vsync_out, qdss_gpio, _, _, _, _, _, _, _, _), + [110] = PINGROUP(110, cci_timer, mdp_vsync_out, qdss_gpio, _, _, _, _, _, _, _, _), + [111] = PINGROUP(111, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [112] = PINGROUP(112, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [113] = PINGROUP(113, cci_timer, cci_async_in, mdp_vsync_out, qdss_gpio, _, _, _, _, + _, _, _), + [114] = PINGROUP(114, mdp_vsync_out, mdp_vsync_out, _, _, _, _, _, _, _, _, _), + [115] = PINGROUP(115, mdp_vsync_out, mdp_vsync_out, edp1_lcd, _, _, _, _, _, _, _, _), + [116] = PINGROUP(116, _, _, _, _, _, _, _, _, _, _, _), + [117] = PINGROUP(117, _, _, _, _, _, _, _, _, _, _, _), + [118] = PINGROUP(118, host2wlan_sol, _, _, _, _, _, _, _, _, _, _), + [119] = PINGROUP(119, edp0_hot, edp1_lcd, _, _, _, _, _, _, _, _, _), + [120] = PINGROUP(120, edp0_lcd, _, _, _, _, _, _, _, _, _, _), + [121] = PINGROUP(121, usb0_phy_ps, _, _, _, _, _, _, _, _, _, _), + [122] = PINGROUP(122, usb0_dp, _, _, _, _, _, _, _, _, _, _), + [123] = PINGROUP(123, usb1_phy_ps, _, _, _, _, _, _, _, _, _, _), + [124] = PINGROUP(124, usb1_dp, _, _, _, _, _, _, _, _, _, _), + [125] = PINGROUP(125, usb2_phy_ps, _, _, _, _, _, _, _, _, _, _), + [126] = PINGROUP(126, usb2_dp, _, _, _, _, _, _, _, _, _, _), + [127] = PINGROUP(127, qspi0, sdc4_clk, qup3_se0, _, _, _, _, _, _, _, _), + [128] = PINGROUP(128, qspi0, sdc4_data, 
qup3_se0, _, _, _, _, _, _, _, _), + [129] = PINGROUP(129, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [130] = PINGROUP(130, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [131] = PINGROUP(131, qspi0, sdc4_data, qup3_se0, _, _, _, _, _, _, _, _), + [132] = PINGROUP(132, qspi0, sdc4_cmd, qup3_se0, _, _, _, _, _, _, _, _), + [133] = PINGROUP(133, qspi0, tb_trig_sdc4, qup3_se0, _, _, _, _, _, _, _, _), + [134] = PINGROUP(134, audio_ext_mclk0, _, _, _, _, _, _, _, _, _, _), + [135] = PINGROUP(135, i2s0_sck, _, _, _, _, _, _, _, _, _, _), + [136] = PINGROUP(136, i2s0_data, _, _, _, _, _, _, _, _, _, _), + [137] = PINGROUP(137, i2s0_data, tb_trig_sdc2, _, _, _, _, _, _, _, _, _), + [138] = PINGROUP(138, i2s0_ws, tsense_pwm, _, _, _, _, _, _, _, _, _), + [139] = PINGROUP(139, i2s1_sck, tsense_pwm, _, _, _, _, _, _, _, _, _), + [140] = PINGROUP(140, i2s1_data, tsense_pwm, _, _, _, _, _, _, _, _, _), + [141] = PINGROUP(141, i2s1_ws, tsense_therm, _, _, _, _, _, _, _, _, _), + [142] = PINGROUP(142, i2s1_data, audio_ext_mclk1, audio_ref_clk, _, _, _, _, _, _, _, _), + [143] = PINGROUP(143, pcie3a_rst_n, _, _, _, _, _, _, _, _, _, _), + [144] = PINGROUP(144, pcie3a_clk, _, _, _, _, _, _, _, _, _, _), + [145] = PINGROUP(145, _, _, _, _, _, _, _, _, _, _, _), + [146] = PINGROUP(146, _, _, _, _, _, _, _, _, _, _, _), + [147] = PINGROUP(147, pcie4_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [148] = PINGROUP(148, _, _, _, _, _, _, _, _, _, _, _), + [149] = PINGROUP(149, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [150] = PINGROUP(150, pcie6_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [151] = PINGROUP(151, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [152] = PINGROUP(152, qdss_gpio, _, _, _, _, _, _, _, _, _, _), + [153] = PINGROUP(153, pcie5_clk_req_n, _, _, _, _, _, _, _, _, _, _), + [154] = PINGROUP(154, _, _, _, _, _, _, _, _, _, _, _), + [155] = PINGROUP(155, _, _, _, _, _, _, _, _, _, _, _), + [156] = PINGROUP(156, pcie3b_clk, _, _, _, _, _, _, _, _, _, _), + [157] = PINGROUP(157, _, _, _, _, _, _, _, _, _, _, _), + [158] = PINGROUP(158, _, _, _, _, _, _, _, _, _, _, _), + [159] = PINGROUP(159, _, _, _, _, _, _, _, _, _, _, _), + [160] = PINGROUP(160, resout_gpio_n, _, _, _, _, _, _, _, _, _, _), + [161] = PINGROUP(161, qdss_cti, _, _, _, _, _, _, _, _, _, _), + [162] = PINGROUP(162, sd_write_protect, qdss_cti, _, _, _, _, _, _, _, _, _), + [163] = PINGROUP(163, usb0_sbrx, prng_rosc, phase_flag, _, atest_char, _, _, _, + _, _, _), + [164] = PINGROUP(164, usb0_sbtx, prng_rosc, phase_flag, _, atest_char, _, _, _, _, _, + _), + [165] = PINGROUP(165, usb0_sbtx, _, _, _, _, _, _, _, _, _, _), + [166] = PINGROUP(166, _, _, _, _, _, _, _, _, _, _, _), + [167] = PINGROUP(167, _, _, _, _, _, _, _, _, _, _, _), + [168] = PINGROUP(168, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [169] = PINGROUP(169, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [170] = PINGROUP(170, _, _, _, _, _, _, _, _, _, _, _), + [171] = PINGROUP(171, _, _, _, _, _, _, _, _, _, _, _), + [172] = PINGROUP(172, usb1_sbrx, phase_flag, _, atest_char, _, _, _, _, _, _, _), + [173] = PINGROUP(173, usb1_sbtx, cri_trng, phase_flag, _, _, _, _, _, _, _, _), + [174] = PINGROUP(174, usb1_sbtx, _, _, _, _, _, _, _, _, _, _), + [175] = PINGROUP(175, _, _, _, _, _, _, _, _, _, _, _), + [176] = PINGROUP(176, _, _, _, _, _, _, _, _, _, _, _), + [177] = PINGROUP(177, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [178] = PINGROUP(178, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [179] = PINGROUP(179, _, _, _, _, _, _, _, _, _, _, _), + [180] 
= PINGROUP(180, _, _, _, _, _, _, _, _, _, _, _), + [181] = PINGROUP(181, usb2_sbrx, _, _, _, _, _, _, _, _, _, _), + [182] = PINGROUP(182, usb2_sbtx, _, _, _, _, _, _, _, _, _, _), + [183] = PINGROUP(183, usb2_sbtx, _, _, _, _, _, _, _, _, _, _), + [184] = PINGROUP(184, phase_flag, _, atest_char, _, _, _, _, _, _, _, _), + [185] = PINGROUP(185, _, _, _, _, _, _, _, _, _, _, _), + [186] = PINGROUP(186, eusb_ac_en, prng_rosc, phase_flag, _, _, _, _, _, _, _, _), + [187] = PINGROUP(187, eusb_ac_en, _, _, _, _, _, _, _, _, _, _), + [188] = PINGROUP(188, prng_rosc, phase_flag, _, atest_char, _, _, _, _, _, _, _), + [189] = PINGROUP(189, _, _, _, _, _, _, _, _, _, _, _), + [190] = PINGROUP(190, _, _, _, _, _, _, _, _, _, _, _), + [191] = PINGROUP(191, _, _, _, _, _, _, _, _, _, _, _), + [192] = PINGROUP(192, _, _, _, _, _, _, _, _, _, _, egpio), + [193] = PINGROUP(193, _, _, _, _, _, _, _, _, _, _, egpio), + [194] = PINGROUP(194, _, _, _, _, _, _, _, _, _, _, egpio), + [195] = PINGROUP(195, _, _, _, _, _, _, _, _, _, _, egpio), + [196] = PINGROUP(196, _, _, _, _, _, _, _, _, _, _, egpio), + [197] = PINGROUP(197, _, _, _, _, _, _, _, _, _, _, egpio), + [198] = PINGROUP(198, _, _, _, _, _, _, _, _, _, _, egpio), + [199] = PINGROUP(199, _, _, _, _, _, _, _, _, _, _, egpio), + [200] = PINGROUP(200, _, _, _, _, _, _, _, _, _, _, egpio), + [201] = PINGROUP(201, _, _, _, _, _, _, _, _, _, _, egpio), + [202] = PINGROUP(202, _, _, _, _, _, _, _, _, _, _, egpio), + [203] = PINGROUP(203, _, _, _, _, _, _, _, _, _, _, egpio), + [204] = PINGROUP(204, _, _, _, _, _, _, _, _, _, _, egpio), + [205] = PINGROUP(205, _, _, _, _, _, _, _, _, _, _, egpio), + [206] = PINGROUP(206, _, _, _, _, _, _, _, _, _, _, egpio), + [207] = PINGROUP(207, _, _, _, _, _, _, _, _, _, _, egpio), + [208] = PINGROUP(208, _, _, _, _, _, _, _, _, _, _, egpio), + [209] = PINGROUP(209, _, _, _, _, _, _, _, _, _, _, egpio), + [210] = PINGROUP(210, _, _, _, _, _, _, _, _, _, _, egpio), + [211] = PINGROUP(211, _, _, _, _, _, _, _, _, _, _, egpio), + [212] = PINGROUP(212, _, _, _, _, _, _, _, _, _, _, egpio), + [213] = PINGROUP(213, _, _, _, _, _, _, _, _, _, _, egpio), + [214] = PINGROUP(214, wcn_sw_ctrl, _, _, _, _, _, _, _, _, _, egpio), + [215] = PINGROUP(215, _, qdss_cti, _, _, _, _, _, _, _, _, egpio), + [216] = PINGROUP(216, _, _, _, _, _, _, _, _, _, _, egpio), + [217] = PINGROUP(217, _, qdss_cti, _, _, _, _, _, _, _, _, egpio), + [218] = PINGROUP(218, _, _, _, _, _, _, _, _, _, _, egpio), + [219] = PINGROUP(219, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [220] = PINGROUP(220, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [221] = PINGROUP(221, wcn_sw, _, qdss_gpio, _, _, _, _, _, _, _, egpio), + [222] = PINGROUP(222, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [223] = PINGROUP(223, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [224] = PINGROUP(224, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [225] = PINGROUP(225, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [226] = PINGROUP(226, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [227] = PINGROUP(227, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [228] = PINGROUP(228, _, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [229] = PINGROUP(229, _, _, _, _, _, _, _, _, _, _, egpio), + [230] = PINGROUP(230, _, _, _, _, _, _, _, _, _, _, egpio), + [231] = PINGROUP(231, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [232] = PINGROUP(232, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [233] = PINGROUP(233, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [234] = PINGROUP(234, 
qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [235] = PINGROUP(235, asc_cci, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [236] = PINGROUP(236, asc_cci, qdss_gpio, _, _, _, _, _, _, _, _, egpio), + [237] = PINGROUP(237, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [238] = PINGROUP(238, qdss_gpio, _, _, _, _, _, _, _, _, _, egpio), + [239] = PINGROUP(239, _, _, _, _, _, _, _, _, _, _, egpio), + [240] = PINGROUP(240, _, _, _, _, _, _, _, _, _, _, egpio), + [241] = PINGROUP(241, _, _, _, _, _, _, _, _, _, _, egpio), + [242] = PINGROUP(242, _, _, _, _, _, _, _, _, _, _, egpio), + [243] = PINGROUP(243, _, _, _, _, _, _, _, _, _, _, egpio), + [244] = PINGROUP(244, _, _, _, _, _, _, _, _, _, _, egpio), + [245] = PINGROUP(245, smb_acok_n, _, _, _, _, _, _, _, _, _, _), + [246] = PINGROUP(246, _, _, _, _, _, _, _, _, _, _, _), + [247] = PINGROUP(247, qup3_se0, _, _, _, _, _, _, _, _, _, _), + [248] = PINGROUP(248, pmc_uva_n, _, _, _, _, _, _, _, _, _, _), + [249] = PINGROUP(249, pmc_oca_n, _, _, _, _, _, _, _, _, _, _), + [250] = UFS_RESET(ufs_reset, 0x104004, 0x105000), + [251] = SDC_QDSD_PINGROUP(sdc2_clk, 0xff000, 14, 6), + [252] = SDC_QDSD_PINGROUP(sdc2_cmd, 0xff000, 11, 3), + [253] = SDC_QDSD_PINGROUP(sdc2_data, 0xff000, 9, 0), +}; + +static const struct msm_gpio_wakeirq_map glymur_pdc_map[] = { + { 0, 116 }, { 2, 114 }, { 3, 115 }, { 4, 175 }, { 5, 176 }, + { 7, 111 }, { 11, 129 }, { 13, 130 }, { 15, 112 }, { 19, 113 }, + { 23, 187 }, { 27, 188 }, { 28, 121 }, { 29, 122 }, { 30, 136 }, + { 31, 203 }, { 32, 189 }, { 34, 174 }, { 35, 190 }, { 36, 191 }, + { 39, 124 }, { 43, 192 }, { 47, 193 }, { 51, 123 }, { 53, 133 }, + { 55, 125 }, { 59, 131 }, { 64, 134 }, { 65, 150 }, { 66, 186 }, + { 67, 132 }, { 68, 195 }, { 71, 135 }, { 75, 196 }, { 79, 197 }, + { 83, 198 }, { 84, 181 }, { 85, 199 }, { 87, 200 }, { 91, 201 }, + { 92, 182 }, { 93, 183 }, { 94, 184 }, { 95, 185 }, { 98, 202 }, + { 105, 157 }, { 113, 128 }, { 121, 117 }, { 123, 118 }, { 125, 119 }, + { 129, 120 }, { 131, 126 }, { 132, 160 }, { 133, 194 }, { 134, 127 }, + { 141, 137 }, { 143, 159 }, { 144, 138 }, { 145, 139 }, { 147, 140 }, + { 148, 141 }, { 150, 146 }, { 151, 147 }, { 153, 148 }, { 154, 144 }, + { 156, 149 }, { 157, 151 }, { 163, 142 }, { 172, 143 }, { 181, 145 }, + { 193, 161 }, { 196, 152 }, { 203, 177 }, { 208, 178 }, { 215, 162 }, + { 217, 153 }, { 220, 154 }, { 221, 155 }, { 228, 179 }, { 230, 180 }, + { 232, 206 }, { 234, 172 }, { 235, 173 }, { 242, 158 }, { 244, 156 }, +}; + +static const struct msm_pinctrl_soc_data glymur_tlmm = { + .pins = glymur_pins, + .npins = ARRAY_SIZE(glymur_pins), + .functions = glymur_functions, + .nfunctions = ARRAY_SIZE(glymur_functions), + .groups = glymur_groups, + .ngroups = ARRAY_SIZE(glymur_groups), + .ngpios = 251, + .wakeirq_map = glymur_pdc_map, + .nwakeirq_map = ARRAY_SIZE(glymur_pdc_map), + .egpio_func = 11, +}; + +static const struct of_device_id glymur_tlmm_of_match[] = { + { .compatible = "qcom,glymur-tlmm", .data = &glymur_tlmm }, + { } +}; + +static int glymur_tlmm_probe(struct platform_device *pdev) +{ + return msm_pinctrl_probe(pdev, &glymur_tlmm); +} + +static struct platform_driver glymur_tlmm_driver = { + .driver = { + .name = "glymur-tlmm", + .of_match_table = glymur_tlmm_of_match, + }, + .probe = glymur_tlmm_probe, +}; + +static int __init glymur_tlmm_init(void) +{ + return platform_driver_register(&glymur_tlmm_driver); +} +arch_initcall(glymur_tlmm_init); + +static void __exit glymur_tlmm_exit(void) +{ + platform_driver_unregister(&glymur_tlmm_driver); +} 
+module_exit(glymur_tlmm_exit); + +MODULE_DESCRIPTION("QTI GLYMUR TLMM driver"); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(of, glymur_tlmm_of_match); diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5018.c b/drivers/pinctrl/qcom/pinctrl-ipq5018.c index 10b99d5d8a11db..cbf34854f88265 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5018.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5018.c @@ -630,7 +630,7 @@ static const struct pinfunction ipq5018_functions[] = { MSM_PIN_FUNCTION(eud_gpio), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(led0), MSM_PIN_FUNCTION(led2), MSM_PIN_FUNCTION(mac0), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5332.c b/drivers/pinctrl/qcom/pinctrl-ipq5332.c index 1ac2fc09c11923..239cbe75f198d3 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5332.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5332.c @@ -692,7 +692,7 @@ static const struct pinfunction ipq5332_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(lock_det), MSM_PIN_FUNCTION(mac0), MSM_PIN_FUNCTION(mac1), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq5424.c b/drivers/pinctrl/qcom/pinctrl-ipq5424.c index 7ff1f8acc1a3a8..67b452a033d623 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq5424.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq5424.c @@ -641,7 +641,7 @@ static const struct pinfunction ipq5424_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2c0_scl), MSM_PIN_FUNCTION(i2c0_sda), MSM_PIN_FUNCTION(i2c1_scl), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq6018.c b/drivers/pinctrl/qcom/pinctrl-ipq6018.c index a4ba980252e187..be177fb0a92d95 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq6018.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq6018.c @@ -891,7 +891,7 @@ static const struct pinfunction ipq6018_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(lpass_aud), MSM_PIN_FUNCTION(lpass_aud0), MSM_PIN_FUNCTION(lpass_aud1), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq8074.c b/drivers/pinctrl/qcom/pinctrl-ipq8074.c index 482f13282fc2be..e94de90833140c 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq8074.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq8074.c @@ -838,7 +838,7 @@ static const struct pinfunction ipq8074_functions[] = { MSM_PIN_FUNCTION(dbg_out), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ldo_en), MSM_PIN_FUNCTION(ldo_update), MSM_PIN_FUNCTION(led0), diff --git a/drivers/pinctrl/qcom/pinctrl-ipq9574.c b/drivers/pinctrl/qcom/pinctrl-ipq9574.c index 89c05d8eb55034..3ed093ea8eb907 100644 --- a/drivers/pinctrl/qcom/pinctrl-ipq9574.c +++ b/drivers/pinctrl/qcom/pinctrl-ipq9574.c @@ -651,7 +651,7 @@ static const struct pinfunction ipq9574_functions[] = { MSM_PIN_FUNCTION(dwc_ddrphy), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(mac), MSM_PIN_FUNCTION(mdc), MSM_PIN_FUNCTION(mdio), diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c index 54c77e0b96e91d..1c97ec44aa5ff7 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c @@ -41,13 
+41,27 @@ struct lpi_pinctrl { static int lpi_gpio_read(struct lpi_pinctrl *state, unsigned int pin, unsigned int addr) { - return ioread32(state->tlmm_base + LPI_TLMM_REG_OFFSET * pin + addr); + u32 pin_offset; + + if (state->data->flags & LPI_FLAG_USE_PREDEFINED_PIN_OFFSET) + pin_offset = state->data->groups[pin].pin_offset; + else + pin_offset = LPI_TLMM_REG_OFFSET * pin; + + return ioread32(state->tlmm_base + pin_offset + addr); } static int lpi_gpio_write(struct lpi_pinctrl *state, unsigned int pin, unsigned int addr, unsigned int val) { - iowrite32(val, state->tlmm_base + LPI_TLMM_REG_OFFSET * pin + addr); + u32 pin_offset; + + if (state->data->flags & LPI_FLAG_USE_PREDEFINED_PIN_OFFSET) + pin_offset = state->data->groups[pin].pin_offset; + else + pin_offset = LPI_TLMM_REG_OFFSET * pin; + + iowrite32(val, state->tlmm_base + pin_offset + addr); return 0; } @@ -174,7 +188,7 @@ static int lpi_config_get(struct pinctrl_dev *pctldev, arg = 1; break; case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (is_out) arg = 1; break; @@ -252,7 +266,7 @@ static int lpi_config_set(struct pinctrl_dev *pctldev, unsigned int group, case PIN_CONFIG_INPUT_ENABLE: output_enabled = false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: output_enabled = true; value = arg; break; @@ -314,7 +328,7 @@ static int lpi_gpio_direction_output(struct gpio_chip *chip, struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return lpi_config_set(state->ctrl, pin, &config, 1); } @@ -332,7 +346,7 @@ static int lpi_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return lpi_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h index a9b2f65c1ebe0f..f4836849286134 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.h @@ -55,6 +55,22 @@ struct pinctrl_pin_desc; LPI_MUX_##f4, \ }, \ .nfuncs = 5, \ + .pin_offset = 0, \ + } + +#define LPI_PINGROUP_OFFSET(id, soff, f1, f2, f3, f4, poff) \ + { \ + .pin = id, \ + .slew_offset = soff, \ + .funcs = (int[]){ \ + LPI_MUX_gpio, \ + LPI_MUX_##f1, \ + LPI_MUX_##f2, \ + LPI_MUX_##f3, \ + LPI_MUX_##f4, \ + }, \ + .nfuncs = 5, \ + .pin_offset = poff, \ } /* @@ -62,6 +78,7 @@ struct pinctrl_pin_desc; * pin configuration. 
*/ #define LPI_FLAG_SLEW_RATE_SAME_REG BIT(0) +#define LPI_FLAG_USE_PREDEFINED_PIN_OFFSET BIT(1) struct lpi_pingroup { unsigned int pin; @@ -69,6 +86,7 @@ struct lpi_pingroup { int slew_offset; unsigned int *funcs; unsigned int nfuncs; + unsigned int pin_offset; }; struct lpi_function { diff --git a/drivers/pinctrl/qcom/pinctrl-mdm9607.c b/drivers/pinctrl/qcom/pinctrl-mdm9607.c index 3e18ba124fede9..cef330547ce78d 100644 --- a/drivers/pinctrl/qcom/pinctrl-mdm9607.c +++ b/drivers/pinctrl/qcom/pinctrl-mdm9607.c @@ -861,7 +861,7 @@ static const struct pinfunction mdm9607_functions[] = { MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), MSM_PIN_FUNCTION(gmac_mdio), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(lcd_rst), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-mdm9615.c b/drivers/pinctrl/qcom/pinctrl-mdm9615.c index bea1ca3d1b7f84..729fe3d7e14efc 100644 --- a/drivers/pinctrl/qcom/pinctrl-mdm9615.c +++ b/drivers/pinctrl/qcom/pinctrl-mdm9615.c @@ -313,7 +313,7 @@ static const char * const cdc_mclk_groups[] = { }; static const struct pinfunction mdm9615_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsbi2_i2c), MSM_PIN_FUNCTION(gsbi3), MSM_PIN_FUNCTION(gsbi4), diff --git a/drivers/pinctrl/qcom/pinctrl-milos.c b/drivers/pinctrl/qcom/pinctrl-milos.c index d11a7bbcd73316..19abd5233a2c54 100644 --- a/drivers/pinctrl/qcom/pinctrl-milos.c +++ b/drivers/pinctrl/qcom/pinctrl-milos.c @@ -974,7 +974,7 @@ static const char *const wcn_sw_ctrl_groups[] = { }; static const struct pinfunction milos_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 83eb075b6bfa17..67525d542c5b7b 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -31,6 +31,7 @@ #include "../core.h" #include "../pinconf.h" #include "../pinctrl-utils.h" +#include "../pinmux.h" #include "pinctrl-msm.h" @@ -150,33 +151,6 @@ static int msm_pinmux_request(struct pinctrl_dev *pctldev, unsigned offset) return gpiochip_line_is_valid(chip, offset) ? 
0 : -EINVAL; } -static int msm_get_functions_count(struct pinctrl_dev *pctldev) -{ - struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); - - return pctrl->soc->nfunctions; -} - -static const char *msm_get_function_name(struct pinctrl_dev *pctldev, - unsigned function) -{ - struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); - - return pctrl->soc->functions[function].name; -} - -static int msm_get_function_groups(struct pinctrl_dev *pctldev, - unsigned function, - const char * const **groups, - unsigned * const num_groups) -{ - struct msm_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); - - *groups = pctrl->soc->functions[function].groups; - *num_groups = pctrl->soc->functions[function].ngroups; - return 0; -} - static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, unsigned function, unsigned group) @@ -288,11 +262,13 @@ static int msm_pinmux_request_gpio(struct pinctrl_dev *pctldev, static const struct pinmux_ops msm_pinmux_ops = { .request = msm_pinmux_request, - .get_functions_count = msm_get_functions_count, - .get_function_name = msm_get_function_name, - .get_function_groups = msm_get_function_groups, + .get_functions_count = pinmux_generic_get_function_count, + .get_function_name = pinmux_generic_get_function_name, + .get_function_groups = pinmux_generic_get_function_groups, + .function_is_gpio = pinmux_generic_function_is_gpio, .gpio_request_enable = msm_pinmux_request_gpio, .set_mux = msm_pinmux_set_mux, + .strict = true, }; static int msm_config_reg(struct msm_pinctrl *pctrl, @@ -319,7 +295,7 @@ static int msm_config_reg(struct msm_pinctrl *pctrl, *bit = g->drv_bit; *mask = 7; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: case PIN_CONFIG_OUTPUT_ENABLE: *bit = g->oe_bit; @@ -409,7 +385,7 @@ static int msm_config_group_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_DRIVE_STRENGTH: arg = msm_regval_to_drive(arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* Pin is not output */ if (!arg) return -EINVAL; @@ -488,7 +464,7 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev, else arg = (arg / 2) - 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* set output value */ raw_spin_lock_irqsave(&pctrl->lock, flags); val = msm_readl_io(pctrl, g); @@ -1552,6 +1528,7 @@ EXPORT_SYMBOL(msm_pinctrl_dev_pm_ops); int msm_pinctrl_probe(struct platform_device *pdev, const struct msm_pinctrl_soc_data *soc_data) { + const struct pinfunction *func; struct msm_pinctrl *pctrl; struct resource *res; int ret; @@ -1606,6 +1583,14 @@ int msm_pinctrl_probe(struct platform_device *pdev, return PTR_ERR(pctrl->pctrl); } + for (i = 0; i < soc_data->nfunctions; i++) { + func = &soc_data->functions[i]; + + ret = pinmux_generic_add_pinfunction(pctrl->pctrl, func, NULL); + if (ret < 0) + return ret; + } + ret = msm_gpio_init(pctrl); if (ret) return ret; diff --git a/drivers/pinctrl/qcom/pinctrl-msm.h b/drivers/pinctrl/qcom/pinctrl-msm.h index d7dc0947bb1618..4625fa5320a95a 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.h +++ b/drivers/pinctrl/qcom/pinctrl-msm.h @@ -29,6 +29,11 @@ struct pinctrl_pin_desc; fname##_groups, \ ARRAY_SIZE(fname##_groups)) +#define MSM_GPIO_PIN_FUNCTION(fname) \ + [msm_mux_##fname] = PINCTRL_GPIO_PINFUNCTION(#fname, \ + fname##_groups, \ + ARRAY_SIZE(fname##_groups)) + #define QCA_PIN_FUNCTION(fname) \ [qca_mux_##fname] = PINCTRL_PINFUNCTION(#fname, \ fname##_groups, \ diff --git a/drivers/pinctrl/qcom/pinctrl-msm8226.c b/drivers/pinctrl/qcom/pinctrl-msm8226.c index 
f9a95734734081..a81aa092ef1240 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8226.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8226.c @@ -483,7 +483,7 @@ static const struct pinfunction msm8226_functions[] = { MSM_PIN_FUNCTION(cci_i2c0), MSM_PIN_FUNCTION(gp0_clk), MSM_PIN_FUNCTION(gp1_clk), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(sdc3), MSM_PIN_FUNCTION(wlan), }; diff --git a/drivers/pinctrl/qcom/pinctrl-msm8660.c b/drivers/pinctrl/qcom/pinctrl-msm8660.c index 4dbc19ffd80efc..5ded00396cd949 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8660.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8660.c @@ -714,7 +714,7 @@ static const char * const ebi2_groups[] = { }; static const struct pinfunction msm8660_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cam_mclk), MSM_PIN_FUNCTION(dsub), MSM_PIN_FUNCTION(ext_gps), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8909.c b/drivers/pinctrl/qcom/pinctrl-msm8909.c index 0aa4f77b774f45..544a52fb8f3d6e 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8909.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8909.c @@ -696,7 +696,7 @@ static const struct pinfunction msm8909_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(ldo_en), MSM_PIN_FUNCTION(ldo_update), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8916.c b/drivers/pinctrl/qcom/pinctrl-msm8916.c index 0dfc6dd33d58b2..b1b6934bb4b635 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8916.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8916.c @@ -743,7 +743,7 @@ static const struct pinfunction msm8916_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk_b), MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx0), MSM_PIN_FUNCTION(gsm0_tx1), MSM_PIN_FUNCTION(gsm1_tx0), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8917.c b/drivers/pinctrl/qcom/pinctrl-msm8917.c index 2e1a94ab18b219..f23d92d6615b89 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8917.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8917.c @@ -1302,7 +1302,7 @@ static const struct pinfunction msm8917_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(key_focus), MSM_PIN_FUNCTION(key_snapshot), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8953.c b/drivers/pinctrl/qcom/pinctrl-msm8953.c index 956383341a7a77..67db062fdf5628 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8953.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8953.c @@ -1533,7 +1533,7 @@ static const struct pinfunction msm8953_functions[] = { MSM_PIN_FUNCTION(gcc_gp3_clk_b), MSM_PIN_FUNCTION(gcc_plltest), MSM_PIN_FUNCTION(gcc_tlmm), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm0_tx), MSM_PIN_FUNCTION(gsm1_tx), MSM_PIN_FUNCTION(gyro_int), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8960.c b/drivers/pinctrl/qcom/pinctrl-msm8960.c index a937ea867de709..2fb15208aba050 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8960.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8960.c @@ -974,7 +974,7 @@ static const struct pinfunction msm8960_functions[] = { MSM_PIN_FUNCTION(gp_pdm_1b), MSM_PIN_FUNCTION(gp_pdm_2a), MSM_PIN_FUNCTION(gp_pdm_2b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsbi1), MSM_PIN_FUNCTION(gsbi1_spi_cs1_n), 
MSM_PIN_FUNCTION(gsbi1_spi_cs2a_n), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8976.c b/drivers/pinctrl/qcom/pinctrl-msm8976.c index 3bcb03387781f8..345539b9e696f0 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8976.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8976.c @@ -812,7 +812,7 @@ static const char * const ss_switch_groups[] = { }; static const struct pinfunction msm8976_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(blsp_spi1), MSM_PIN_FUNCTION(smb_int), MSM_PIN_FUNCTION(blsp_i2c1), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8994.c b/drivers/pinctrl/qcom/pinctrl-msm8994.c index 7a3b6cbccb687c..94e042d1f4b2a5 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8994.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8994.c @@ -1071,7 +1071,7 @@ static const struct pinfunction msm8994_functions[] = { MSM_PIN_FUNCTION(uim2), MSM_PIN_FUNCTION(uim3), MSM_PIN_FUNCTION(uim4), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), }; static const struct msm_pingroup msm8994_groups[] = { diff --git a/drivers/pinctrl/qcom/pinctrl-msm8996.c b/drivers/pinctrl/qcom/pinctrl-msm8996.c index d86d83106d3ba1..e5b55693d02377 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8996.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8996.c @@ -1532,7 +1532,7 @@ static const struct pinfunction msm8996_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk_b), MSM_PIN_FUNCTION(gcc_gp3_clk_a), MSM_PIN_FUNCTION(gcc_gp3_clk_b), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gsm_tx), MSM_PIN_FUNCTION(hdmi_cec), MSM_PIN_FUNCTION(hdmi_ddc), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8998.c b/drivers/pinctrl/qcom/pinctrl-msm8998.c index 1daee815888f54..b727593af34af9 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8998.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8998.c @@ -1160,7 +1160,7 @@ static const char * const mss_lte_groups[] = { }; static const struct pinfunction msm8998_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-msm8x74.c b/drivers/pinctrl/qcom/pinctrl-msm8x74.c index 8253aa25775b24..202bec003e96f1 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm8x74.c +++ b/drivers/pinctrl/qcom/pinctrl-msm8x74.c @@ -778,7 +778,7 @@ static const char * const slimbus_groups[] = { "gpio70", "gpio71" }; static const char * const hsic_ctl_groups[] = { "hsic_strobe", "hsic_data" }; static const struct pinfunction msm8x74_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cci_i2c0), MSM_PIN_FUNCTION(cci_i2c1), MSM_PIN_FUNCTION(uim1), diff --git a/drivers/pinctrl/qcom/pinctrl-qcm2290.c b/drivers/pinctrl/qcom/pinctrl-qcm2290.c index eeeec6434f6a68..38200957451e19 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcm2290.c +++ b/drivers/pinctrl/qcom/pinctrl-qcm2290.c @@ -870,11 +870,11 @@ static const struct pinfunction qcm2290_functions[] = { MSM_PIN_FUNCTION(ddr_pxi1), MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs404.c b/drivers/pinctrl/qcom/pinctrl-qcs404.c index 54e3b44353494e..0b8db2c7e58a9b 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs404.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs404.c @@ -1296,7 
+1296,7 @@ static const char * const i2s_3_ws_a_groups[] = { }; static const struct pinfunction qcs404_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hdmi_tx), MSM_PIN_FUNCTION(hdmi_ddc), MSM_PIN_FUNCTION(blsp_uart_tx_a2), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs615.c b/drivers/pinctrl/qcom/pinctrl-qcs615.c index 2a943bc46a6299..4dfa820d4e77ce 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs615.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs615.c @@ -819,7 +819,7 @@ static const char *const wsa_data_groups[] = { }; static const struct pinfunction qcs615_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(aoss_cti), diff --git a/drivers/pinctrl/qcom/pinctrl-qcs8300.c b/drivers/pinctrl/qcom/pinctrl-qcs8300.c index d6437e26392b60..f1af1a620684cd 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs8300.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs8300.c @@ -929,7 +929,7 @@ static const char *const vsense_trigger_groups[] = { }; static const struct pinfunction qcs8300_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb2), @@ -949,7 +949,7 @@ static const struct pinfunction qcs8300_functions[] = { MSM_PIN_FUNCTION(edp0_hot), MSM_PIN_FUNCTION(edp0_lcd), MSM_PIN_FUNCTION(edp1_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), MSM_PIN_FUNCTION(emac0_mcg2), diff --git a/drivers/pinctrl/qcom/pinctrl-qdu1000.c b/drivers/pinctrl/qcom/pinctrl-qdu1000.c index eacb89fa388850..7c535698a78004 100644 --- a/drivers/pinctrl/qcom/pinctrl-qdu1000.c +++ b/drivers/pinctrl/qcom/pinctrl-qdu1000.c @@ -904,7 +904,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction qdu1000_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(cmo_pri), MSM_PIN_FUNCTION(si5518_int), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sa8775p.c b/drivers/pinctrl/qcom/pinctrl-sa8775p.c index 1b62eb3e6620c9..53f28b9c49ba2d 100644 --- a/drivers/pinctrl/qcom/pinctrl-sa8775p.c +++ b/drivers/pinctrl/qcom/pinctrl-sa8775p.c @@ -1181,7 +1181,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sa8775p_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb2), MSM_PIN_FUNCTION(audio_ref), @@ -1217,7 +1217,7 @@ static const struct pinfunction sa8775p_functions[] = { MSM_PIN_FUNCTION(edp2_lcd), MSM_PIN_FUNCTION(edp3_hot), MSM_PIN_FUNCTION(edp3_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), MSM_PIN_FUNCTION(emac0_mcg2), diff --git a/drivers/pinctrl/qcom/pinctrl-sar2130p.c b/drivers/pinctrl/qcom/pinctrl-sar2130p.c index 3dd1b5e5cfee48..4a53f4ee20418e 100644 --- a/drivers/pinctrl/qcom/pinctrl-sar2130p.c +++ b/drivers/pinctrl/qcom/pinctrl-sar2130p.c @@ -1128,7 +1128,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sar2130p_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(qup0), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sc7180.c b/drivers/pinctrl/qcom/pinctrl-sc7180.c index c43fe10b71add7..3eae51472b1373 100644 --- 
a/drivers/pinctrl/qcom/pinctrl-sc7180.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7180.c @@ -903,7 +903,7 @@ static const struct pinfunction sc7180_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-sc7280.c b/drivers/pinctrl/qcom/pinctrl-sc7280.c index 1b070e9d41f597..44e09608aad07a 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc7280.c +++ b/drivers/pinctrl/qcom/pinctrl-sc7280.c @@ -1153,11 +1153,11 @@ static const struct pinfunction sc7280_functions[] = { MSM_PIN_FUNCTION(dp_lcd), MSM_PIN_FUNCTION(edp_hot), MSM_PIN_FUNCTION(edp_lcd), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(host2wlan_sol), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sc8180x.c b/drivers/pinctrl/qcom/pinctrl-sc8180x.c index 26dd165d154348..d9f9e3dd9dd176 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc8180x.c +++ b/drivers/pinctrl/qcom/pinctrl-sc8180x.c @@ -1272,7 +1272,7 @@ static const struct pinfunction sc8180x_functions[] = { MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_gp4), MSM_PIN_FUNCTION(gcc_gp5), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps), MSM_PIN_FUNCTION(grfc), MSM_PIN_FUNCTION(hs1_mi2s), @@ -1634,7 +1634,7 @@ static int sc8180x_pinctrl_add_tile_resources(struct platform_device *pdev) return 0; /* Allocate for new resources */ - nres = devm_kzalloc(&pdev->dev, sizeof(*nres) * nres_num, GFP_KERNEL); + nres = devm_kcalloc(&pdev->dev, nres_num, sizeof(*nres), GFP_KERNEL); if (!nres) return -ENOMEM; diff --git a/drivers/pinctrl/qcom/pinctrl-sc8280xp.c b/drivers/pinctrl/qcom/pinctrl-sc8280xp.c index 6ccd7e5648d420..cf8297e8b8f8c9 100644 --- a/drivers/pinctrl/qcom/pinctrl-sc8280xp.c +++ b/drivers/pinctrl/qcom/pinctrl-sc8280xp.c @@ -1506,7 +1506,7 @@ static const struct pinfunction sc8280xp_functions[] = { MSM_PIN_FUNCTION(edp2_lcd), MSM_PIN_FUNCTION(edp3_lcd), MSM_PIN_FUNCTION(edp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(emac0_dll), MSM_PIN_FUNCTION(emac0_mcg0), MSM_PIN_FUNCTION(emac0_mcg1), @@ -1527,7 +1527,7 @@ static const struct pinfunction sc8280xp_functions[] = { MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_gp4), MSM_PIN_FUNCTION(gcc_gp5), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hs1_mi2s), MSM_PIN_FUNCTION(hs2_mi2s), MSM_PIN_FUNCTION(hs3_mi2s), diff --git a/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c new file mode 100644 index 00000000000000..d93af5f0e8d301 --- /dev/null +++ b/drivers/pinctrl/qcom/pinctrl-sdm660-lpass-lpi.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This driver is solely based on the limited information in downstream code. + * Any verification with schematics would be greatly appreciated. + * + * Copyright (c) 2023, Richard Acayan. All rights reserved. 
+ */
+
+#include <linux/array_size.h>
+#include <linux/gpio/driver.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include "pinctrl-lpass-lpi.h"
+
+enum lpass_lpi_functions {
+	LPI_MUX_comp_rx,
+	LPI_MUX_dmic1_clk,
+	LPI_MUX_dmic1_data,
+	LPI_MUX_dmic2_clk,
+	LPI_MUX_dmic2_data,
+	LPI_MUX_mclk0,
+	LPI_MUX_pdm_tx,
+	LPI_MUX_pdm_clk,
+	LPI_MUX_pdm_rx,
+	LPI_MUX_pdm_sync,
+
+	LPI_MUX_gpio,
+	LPI_MUX__,
+};
+
+static const struct pinctrl_pin_desc sdm660_lpi_pinctrl_pins[] = {
+	PINCTRL_PIN(0, "gpio0"),
+	PINCTRL_PIN(1, "gpio1"),
+	PINCTRL_PIN(2, "gpio2"),
+	PINCTRL_PIN(3, "gpio3"),
+	PINCTRL_PIN(4, "gpio4"),
+	PINCTRL_PIN(5, "gpio5"),
+	PINCTRL_PIN(6, "gpio6"),
+	PINCTRL_PIN(7, "gpio7"),
+	PINCTRL_PIN(8, "gpio8"),
+	PINCTRL_PIN(9, "gpio9"),
+	PINCTRL_PIN(10, "gpio10"),
+	PINCTRL_PIN(11, "gpio11"),
+	PINCTRL_PIN(12, "gpio12"),
+	PINCTRL_PIN(13, "gpio13"),
+	PINCTRL_PIN(14, "gpio14"),
+	PINCTRL_PIN(15, "gpio15"),
+	PINCTRL_PIN(16, "gpio16"),
+	PINCTRL_PIN(17, "gpio17"),
+	PINCTRL_PIN(18, "gpio18"),
+	PINCTRL_PIN(19, "gpio19"),
+	PINCTRL_PIN(20, "gpio20"),
+	PINCTRL_PIN(21, "gpio21"),
+	PINCTRL_PIN(22, "gpio22"),
+	PINCTRL_PIN(23, "gpio23"),
+	PINCTRL_PIN(24, "gpio24"),
+	PINCTRL_PIN(25, "gpio25"),
+	PINCTRL_PIN(26, "gpio26"),
+	PINCTRL_PIN(27, "gpio27"),
+	PINCTRL_PIN(28, "gpio28"),
+	PINCTRL_PIN(29, "gpio29"),
+	PINCTRL_PIN(30, "gpio30"),
+	PINCTRL_PIN(31, "gpio31"),
+};
+
+static const char * const comp_rx_groups[] = { "gpio22", "gpio24" };
+static const char * const dmic1_clk_groups[] = { "gpio26" };
+static const char * const dmic1_data_groups[] = { "gpio27" };
+static const char * const dmic2_clk_groups[] = { "gpio28" };
+static const char * const dmic2_data_groups[] = { "gpio29" };
+static const char * const mclk0_groups[] = { "gpio18" };
+static const char * const pdm_tx_groups[] = { "gpio20" };
+static const char * const pdm_clk_groups[] = { "gpio18" };
+static const char * const pdm_rx_groups[] = { "gpio21", "gpio23", "gpio25" };
+static const char * const pdm_sync_groups[] = { "gpio19" };
+
+static const struct lpi_pingroup sdm660_lpi_pinctrl_groups[] = {
+	LPI_PINGROUP_OFFSET(0, LPI_NO_SLEW, _, _, _, _, 0x0000),
+	LPI_PINGROUP_OFFSET(1, LPI_NO_SLEW, _, _, _, _, 0x1000),
+	LPI_PINGROUP_OFFSET(2, LPI_NO_SLEW, _, _, _, _, 0x2000),
+	LPI_PINGROUP_OFFSET(3, LPI_NO_SLEW, _, _, _, _, 0x2010),
+	LPI_PINGROUP_OFFSET(4, LPI_NO_SLEW, _, _, _, _, 0x3000),
+	LPI_PINGROUP_OFFSET(5, LPI_NO_SLEW, _, _, _, _, 0x3010),
+	LPI_PINGROUP_OFFSET(6, LPI_NO_SLEW, _, _, _, _, 0x4000),
+	LPI_PINGROUP_OFFSET(7, LPI_NO_SLEW, _, _, _, _, 0x4010),
+	LPI_PINGROUP_OFFSET(8, LPI_NO_SLEW, _, _, _, _, 0x5000),
+	LPI_PINGROUP_OFFSET(9, LPI_NO_SLEW, _, _, _, _, 0x5010),
+	LPI_PINGROUP_OFFSET(10, LPI_NO_SLEW, _, _, _, _, 0x5020),
+	LPI_PINGROUP_OFFSET(11, LPI_NO_SLEW, _, _, _, _, 0x5030),
+	LPI_PINGROUP_OFFSET(12, LPI_NO_SLEW, _, _, _, _, 0x6000),
+	LPI_PINGROUP_OFFSET(13, LPI_NO_SLEW, _, _, _, _, 0x6010),
+	LPI_PINGROUP_OFFSET(14, LPI_NO_SLEW, _, _, _, _, 0x7000),
+	LPI_PINGROUP_OFFSET(15, LPI_NO_SLEW, _, _, _, _, 0x7010),
+	LPI_PINGROUP_OFFSET(16, LPI_NO_SLEW, _, _, _, _, 0x5040),
+	LPI_PINGROUP_OFFSET(17, LPI_NO_SLEW, _, _, _, _, 0x5050),
+
+	LPI_PINGROUP_OFFSET(18, LPI_NO_SLEW, pdm_clk, mclk0, _, _, 0x8000),
+	LPI_PINGROUP_OFFSET(19, LPI_NO_SLEW, pdm_sync, _, _, _, 0x8010),
+	LPI_PINGROUP_OFFSET(20, LPI_NO_SLEW, pdm_tx, _, _, _, 0x8020),
+	LPI_PINGROUP_OFFSET(21, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8030),
+	LPI_PINGROUP_OFFSET(22, LPI_NO_SLEW, comp_rx, _, _, _, 0x8040),
+	LPI_PINGROUP_OFFSET(23, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8050),
+	LPI_PINGROUP_OFFSET(24, LPI_NO_SLEW, comp_rx, _, _, _, 0x8060),
+	LPI_PINGROUP_OFFSET(25, LPI_NO_SLEW, pdm_rx, _, _, _, 0x8070),
+	LPI_PINGROUP_OFFSET(26, LPI_NO_SLEW, dmic1_clk, _, _, _, 0x9000),
+	LPI_PINGROUP_OFFSET(27, LPI_NO_SLEW, dmic1_data, _, _, _, 0x9010),
+	LPI_PINGROUP_OFFSET(28, LPI_NO_SLEW, dmic2_clk, _, _, _, 0xa000),
+	LPI_PINGROUP_OFFSET(29, LPI_NO_SLEW, dmic2_data, _, _, _, 0xa010),
+
+	LPI_PINGROUP_OFFSET(30, LPI_NO_SLEW, _, _, _, _, 0xb000),
+	LPI_PINGROUP_OFFSET(31, LPI_NO_SLEW, _, _, _, _, 0xb010),
+};
+
+static const struct lpi_function sdm660_lpi_pinctrl_functions[] = {
+	LPI_FUNCTION(comp_rx),
+	LPI_FUNCTION(dmic1_clk),
+	LPI_FUNCTION(dmic1_data),
+	LPI_FUNCTION(dmic2_clk),
+	LPI_FUNCTION(dmic2_data),
+	LPI_FUNCTION(mclk0),
+	LPI_FUNCTION(pdm_tx),
+	LPI_FUNCTION(pdm_clk),
+	LPI_FUNCTION(pdm_rx),
+	LPI_FUNCTION(pdm_sync),
+};
+
+static const struct lpi_pinctrl_variant_data sdm660_lpi_pinctrl_data = {
+	.pins = sdm660_lpi_pinctrl_pins,
+	.npins = ARRAY_SIZE(sdm660_lpi_pinctrl_pins),
+	.groups = sdm660_lpi_pinctrl_groups,
+	.ngroups = ARRAY_SIZE(sdm660_lpi_pinctrl_groups),
+	.functions = sdm660_lpi_pinctrl_functions,
+	.nfunctions = ARRAY_SIZE(sdm660_lpi_pinctrl_functions),
+	.flags = LPI_FLAG_SLEW_RATE_SAME_REG | LPI_FLAG_USE_PREDEFINED_PIN_OFFSET
+};
+
+static const struct of_device_id sdm660_lpi_pinctrl_of_match[] = {
+	{
+		.compatible = "qcom,sdm660-lpass-lpi-pinctrl",
+		.data = &sdm660_lpi_pinctrl_data,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, sdm660_lpi_pinctrl_of_match);
+
+static struct platform_driver sdm660_lpi_pinctrl_driver = {
+	.driver = {
+		.name = "qcom-sdm660-lpass-lpi-pinctrl",
+		.of_match_table = sdm660_lpi_pinctrl_of_match,
+	},
+	.probe = lpi_pinctrl_probe,
+	.remove = lpi_pinctrl_remove,
+};
+module_platform_driver(sdm660_lpi_pinctrl_driver);
+
+MODULE_AUTHOR("Richard Acayan <mailingradian@gmail.com>");
+MODULE_DESCRIPTION("QTI SDM660 LPI GPIO pin control driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pinctrl/qcom/pinctrl-sdm660.c b/drivers/pinctrl/qcom/pinctrl-sdm660.c
index 1a78288f1bc832..687d986de75c4d 100644
--- a/drivers/pinctrl/qcom/pinctrl-sdm660.c
+++ b/drivers/pinctrl/qcom/pinctrl-sdm660.c
@@ -1157,7 +1157,7 @@ static const struct pinfunction sdm660_functions[] = {
 	MSM_PIN_FUNCTION(gcc_gp1),
 	MSM_PIN_FUNCTION(gcc_gp2),
 	MSM_PIN_FUNCTION(gcc_gp3),
-	MSM_PIN_FUNCTION(gpio),
+	MSM_GPIO_PIN_FUNCTION(gpio),
 	MSM_PIN_FUNCTION(gps_tx_a),
 	MSM_PIN_FUNCTION(gps_tx_b),
 	MSM_PIN_FUNCTION(gps_tx_c),
diff --git a/drivers/pinctrl/qcom/pinctrl-sdm670.c b/drivers/pinctrl/qcom/pinctrl-sdm670.c
index 0fe1fa94cd6da1..486b72edf7b4ec 100644
--- a/drivers/pinctrl/qcom/pinctrl-sdm670.c
+++ b/drivers/pinctrl/qcom/pinctrl-sdm670.c
@@ -991,7 +991,7 @@ static const char * const mss_lte_groups[] = {
 };
 
 static const struct pinfunction sdm670_functions[] = {
-	MSM_PIN_FUNCTION(gpio),
+	MSM_GPIO_PIN_FUNCTION(gpio),
 	MSM_PIN_FUNCTION(adsp_ext),
 	MSM_PIN_FUNCTION(agera_pll),
 	MSM_PIN_FUNCTION(atest_char),
diff --git a/drivers/pinctrl/qcom/pinctrl-sdm845.c b/drivers/pinctrl/qcom/pinctrl-sdm845.c
index 0446e291aa4831..4cf8575797a0f4 100644
--- a/drivers/pinctrl/qcom/pinctrl-sdm845.c
+++ b/drivers/pinctrl/qcom/pinctrl-sdm845.c
@@ -976,7 +976,7 @@ static const char * const tsif1_sync_groups[] = {
 };
 
 static const struct pinfunction sdm845_functions[] = {
-	MSM_PIN_FUNCTION(gpio),
+	MSM_GPIO_PIN_FUNCTION(gpio),
 	MSM_PIN_FUNCTION(adsp_ext),
 	MSM_PIN_FUNCTION(agera_pll),
 	MSM_PIN_FUNCTION(atest_char),
diff --git a/drivers/pinctrl/qcom/pinctrl-sdx55.c b/drivers/pinctrl/qcom/pinctrl-sdx55.c
index 2c17bf88914636..79a7010b73f187 100644
---
a/drivers/pinctrl/qcom/pinctrl-sdx55.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx55.c @@ -796,7 +796,7 @@ static const struct pinfunction sdx55_functions[] = { MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sdx65.c b/drivers/pinctrl/qcom/pinctrl-sdx65.c index 85b5c0206dbd19..cc8a99a6a91ed2 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdx65.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx65.c @@ -732,7 +732,7 @@ static const struct pinfunction sdx65_functions[] = { MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sdx75.c b/drivers/pinctrl/qcom/pinctrl-sdx75.c index ab13a3a57a8307..4078d83d818c33 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdx75.c +++ b/drivers/pinctrl/qcom/pinctrl-sdx75.c @@ -852,7 +852,7 @@ static const struct pinfunction sdx75_functions[] = { MSM_PIN_FUNCTION(gcc_gp2_clk), MSM_PIN_FUNCTION(gcc_gp3_clk), MSM_PIN_FUNCTION(gcc_plltest), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(i2s_mclk), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(ldo_en), diff --git a/drivers/pinctrl/qcom/pinctrl-sm4450.c b/drivers/pinctrl/qcom/pinctrl-sm4450.c index 1ecdf1ab4f275e..d51e271e336101 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm4450.c +++ b/drivers/pinctrl/qcom/pinctrl-sm4450.c @@ -722,7 +722,7 @@ static const char * const wlan1_adc_dtest1_groups[] = { }; static const struct pinfunction sm4450_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb0), MSM_PIN_FUNCTION(audio_ref_clk), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6115.c b/drivers/pinctrl/qcom/pinctrl-sm6115.c index c273efa4399630..06700685ea2a38 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6115.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6115.c @@ -687,7 +687,7 @@ static const struct pinfunction sm6115_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6125.c b/drivers/pinctrl/qcom/pinctrl-sm6125.c index 5092f20e0c1bde..5d3d1e402345eb 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6125.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6125.c @@ -943,7 +943,7 @@ static const char * const dmic1_data_groups[] = { static const struct pinfunction sm6125_functions[] = { MSM_PIN_FUNCTION(qup00), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(qdss), MSM_PIN_FUNCTION(qup01), MSM_PIN_FUNCTION(qup02), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6350.c b/drivers/pinctrl/qcom/pinctrl-sm6350.c index ba4686c86c54b8..220fb582cac9fc 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6350.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6350.c @@ -1048,7 +1048,7 @@ static const struct pinfunction sm6350_functions[] = { MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps_tx), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sm6375.c 
b/drivers/pinctrl/qcom/pinctrl-sm6375.c index 49031571e65ee3..08b8ef6efaf097 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm6375.c +++ b/drivers/pinctrl/qcom/pinctrl-sm6375.c @@ -1172,7 +1172,7 @@ static const struct pinfunction sm6375_functions[] = { MSM_PIN_FUNCTION(gp_pdm0), MSM_PIN_FUNCTION(gp_pdm1), MSM_PIN_FUNCTION(gp_pdm2), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(gps_tx), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), diff --git a/drivers/pinctrl/qcom/pinctrl-sm7150.c b/drivers/pinctrl/qcom/pinctrl-sm7150.c index 6e89966cd70e34..78dd8153a4d4e5 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm7150.c +++ b/drivers/pinctrl/qcom/pinctrl-sm7150.c @@ -960,7 +960,7 @@ static const char * const wsa_data_groups[] = { }; static const struct pinfunction sm7150_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(adsp_ext), MSM_PIN_FUNCTION(agera_pll), MSM_PIN_FUNCTION(aoss_cti), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8150.c b/drivers/pinctrl/qcom/pinctrl-sm8150.c index 794ed99463f760..ad861cd66958c4 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8150.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8150.c @@ -1217,7 +1217,7 @@ static const struct pinfunction sm8150_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(hs1_mi2s), MSM_PIN_FUNCTION(hs2_mi2s), MSM_PIN_FUNCTION(hs3_mi2s), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8250.c b/drivers/pinctrl/qcom/pinctrl-sm8250.c index fb6f005d64f53f..f05361f3100db5 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8250.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8250.c @@ -49,6 +49,8 @@ enum { .mux_bit = 2, \ .pull_bit = 0, \ .drv_bit = 6, \ + .egpio_enable = 12, \ + .egpio_present = 11, \ .oe_bit = 9, \ .in_bit = 0, \ .out_bit = 1, \ @@ -511,6 +513,7 @@ enum sm8250_functions { msm_mux_ddr_pxi2, msm_mux_ddr_pxi3, msm_mux_dp_hot, + msm_mux_egpio, msm_mux_dp_lcd, msm_mux_gcc_gp1, msm_mux_gcc_gp2, @@ -830,6 +833,14 @@ static const char * const gpio_groups[] = { "gpio171", "gpio172", "gpio173", "gpio174", "gpio175", "gpio176", "gpio177", "gpio178", "gpio179", }; +static const char * const egpio_groups[] = { + "gpio146", "gpio147", "gpio148", "gpio149", "gpio150", "gpio151", + "gpio152", "gpio153", "gpio154", "gpio155", "gpio156", "gpio157", + "gpio158", "gpio159", "gpio160", "gpio161", "gpio162", "gpio163", + "gpio164", "gpio165", "gpio166", "gpio167", "gpio168", "gpio169", + "gpio170", "gpio171", "gpio172", "gpio173", "gpio174", "gpio175", + "gpio176", "gpio177", "gpio178", "gpio179", +}; static const char * const qdss_cti_groups[] = { "gpio0", "gpio2", "gpio2", "gpio44", "gpio45", "gpio46", "gpio92", "gpio93", @@ -1018,10 +1029,11 @@ static const struct pinfunction sm8250_functions[] = { MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot), MSM_PIN_FUNCTION(dp_lcd), + MSM_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(lpass_slimbus), @@ -1265,40 +1277,40 @@ static const struct msm_pingroup sm8250_groups[] = { [143] = PINGROUP(143, WEST, lpass_slimbus, mi2s1_data0, ddr_bist, _, _, _, _, _, _), [144] = PINGROUP(144, WEST, lpass_slimbus, mi2s1_data1, ddr_bist, _, _, _, _, _, _), [145] = PINGROUP(145, WEST, lpass_slimbus, mi2s1_ws, _, _, _, _, _, _, _), - [146] = PINGROUP(146, WEST, _, _, _, _, _, _, _, 
_, _), - [147] = PINGROUP(147, WEST, _, _, _, _, _, _, _, _, _), - [148] = PINGROUP(148, WEST, _, _, _, _, _, _, _, _, _), - [149] = PINGROUP(149, WEST, _, _, _, _, _, _, _, _, _), - [150] = PINGROUP(150, WEST, _, _, _, _, _, _, _, _, _), - [151] = PINGROUP(151, WEST, _, _, _, _, _, _, _, _, _), - [152] = PINGROUP(152, WEST, _, _, _, _, _, _, _, _, _), - [153] = PINGROUP(153, WEST, _, _, _, _, _, _, _, _, _), - [154] = PINGROUP(154, WEST, _, _, _, _, _, _, _, _, _), - [155] = PINGROUP(155, WEST, _, _, _, _, _, _, _, _, _), - [156] = PINGROUP(156, WEST, _, _, _, _, _, _, _, _, _), - [157] = PINGROUP(157, WEST, _, _, _, _, _, _, _, _, _), - [158] = PINGROUP(158, WEST, _, _, _, _, _, _, _, _, _), - [159] = PINGROUP(159, WEST, cri_trng0, _, _, _, _, _, _, _, _), - [160] = PINGROUP(160, WEST, cri_trng1, qdss_gpio, _, _, _, _, _, _, _), - [161] = PINGROUP(161, WEST, cri_trng, qdss_gpio, _, _, _, _, _, _, _), - [162] = PINGROUP(162, WEST, sp_cmu, qdss_gpio, _, _, _, _, _, _, _), - [163] = PINGROUP(163, WEST, prng_rosc, qdss_gpio, _, _, _, _, _, _, _), - [164] = PINGROUP(164, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [165] = PINGROUP(165, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [166] = PINGROUP(166, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [167] = PINGROUP(167, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [168] = PINGROUP(168, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [169] = PINGROUP(169, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [170] = PINGROUP(170, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [171] = PINGROUP(171, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [172] = PINGROUP(172, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [173] = PINGROUP(173, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [174] = PINGROUP(174, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [175] = PINGROUP(175, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [176] = PINGROUP(176, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [177] = PINGROUP(177, WEST, qdss_gpio, _, _, _, _, _, _, _, _), - [178] = PINGROUP(178, WEST, _, _, _, _, _, _, _, _, _), - [179] = PINGROUP(179, WEST, _, _, _, _, _, _, _, _, _), + [146] = PINGROUP(146, WEST, _, _, _, _, _, _, _, _, egpio), + [147] = PINGROUP(147, WEST, _, _, _, _, _, _, _, _, egpio), + [148] = PINGROUP(148, WEST, _, _, _, _, _, _, _, _, egpio), + [149] = PINGROUP(149, WEST, _, _, _, _, _, _, _, _, egpio), + [150] = PINGROUP(150, WEST, _, _, _, _, _, _, _, _, egpio), + [151] = PINGROUP(151, WEST, _, _, _, _, _, _, _, _, egpio), + [152] = PINGROUP(152, WEST, _, _, _, _, _, _, _, _, egpio), + [153] = PINGROUP(153, WEST, _, _, _, _, _, _, _, _, egpio), + [154] = PINGROUP(154, WEST, _, _, _, _, _, _, _, _, egpio), + [155] = PINGROUP(155, WEST, _, _, _, _, _, _, _, _, egpio), + [156] = PINGROUP(156, WEST, _, _, _, _, _, _, _, _, egpio), + [157] = PINGROUP(157, WEST, _, _, _, _, _, _, _, _, egpio), + [158] = PINGROUP(158, WEST, _, _, _, _, _, _, _, _, egpio), + [159] = PINGROUP(159, WEST, cri_trng0, _, _, _, _, _, _, _, egpio), + [160] = PINGROUP(160, WEST, cri_trng1, qdss_gpio, _, _, _, _, _, _, egpio), + [161] = PINGROUP(161, WEST, cri_trng, qdss_gpio, _, _, _, _, _, _, egpio), + [162] = PINGROUP(162, WEST, sp_cmu, qdss_gpio, _, _, _, _, _, _, egpio), + [163] = PINGROUP(163, WEST, prng_rosc, qdss_gpio, _, _, _, _, _, _, egpio), + [164] = PINGROUP(164, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [165] = PINGROUP(165, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [166] = PINGROUP(166, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [167] = PINGROUP(167, WEST, qdss_gpio, _, _, _, _, _, _, 
_, egpio), + [168] = PINGROUP(168, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [169] = PINGROUP(169, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [170] = PINGROUP(170, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [171] = PINGROUP(171, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [172] = PINGROUP(172, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [173] = PINGROUP(173, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [174] = PINGROUP(174, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [175] = PINGROUP(175, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [176] = PINGROUP(176, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [177] = PINGROUP(177, WEST, qdss_gpio, _, _, _, _, _, _, _, egpio), + [178] = PINGROUP(178, WEST, _, _, _, _, _, _, _, _, egpio), + [179] = PINGROUP(179, WEST, _, _, _, _, _, _, _, _, egpio), [180] = UFS_RESET(ufs_reset, 0xb8000), [181] = SDC_PINGROUP(sdc2_clk, 0xb7000, 14, 6), [182] = SDC_PINGROUP(sdc2_cmd, 0xb7000, 11, 3), @@ -1333,6 +1345,7 @@ static const struct msm_pinctrl_soc_data sm8250_pinctrl = { .ntiles = ARRAY_SIZE(sm8250_tiles), .wakeirq_map = sm8250_pdc_map, .nwakeirq_map = ARRAY_SIZE(sm8250_pdc_map), + .egpio_func = 9, }; static int sm8250_pinctrl_probe(struct platform_device *pdev) diff --git a/drivers/pinctrl/qcom/pinctrl-sm8350.c b/drivers/pinctrl/qcom/pinctrl-sm8350.c index c8a3f39ce6f1b8..99949b55202113 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8350.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8350.c @@ -1267,7 +1267,7 @@ static const struct pinfunction sm8350_functions[] = { MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(ibi_i3c), MSM_PIN_FUNCTION(jitter_bist), MSM_PIN_FUNCTION(lpass_slimbus), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8450.c b/drivers/pinctrl/qcom/pinctrl-sm8450.c index f2e52d5a0f9369..9889fc5dc2cd20 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8450.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8450.c @@ -1269,7 +1269,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction sm8450_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aon_cam), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), @@ -1291,7 +1291,7 @@ static const struct pinfunction sm8450_functions[] = { MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8550.c b/drivers/pinctrl/qcom/pinctrl-sm8550.c index 1b4496cb39eb46..10a62031fdfd04 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8550.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8550.c @@ -1340,7 +1340,7 @@ static const char *const vsense_trigger_mirnat_groups[] = { }; static const struct pinfunction sm8550_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aon_cci), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8650.c b/drivers/pinctrl/qcom/pinctrl-sm8650.c index 449a0077f4b106..e2ae038002060d 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8650.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8650.c @@ -1328,7 +1328,7 @@ static const char *const vsense_trigger_mirnat_groups[] = { }; static const struct pinfunction sm8650_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), 
MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), @@ -1359,7 +1359,7 @@ static const struct pinfunction sm8650_functions[] = { MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(do_not), MSM_PIN_FUNCTION(dp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), diff --git a/drivers/pinctrl/qcom/pinctrl-sm8750.c b/drivers/pinctrl/qcom/pinctrl-sm8750.c index 8516693d1db51d..6f92f176edd459 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8750.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8750.c @@ -1290,7 +1290,7 @@ static const char *const wcn_sw_ctrl_groups[] = { }; static const struct pinfunction sm8750_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(aoss_cti), MSM_PIN_FUNCTION(atest_char), MSM_PIN_FUNCTION(atest_usb), @@ -1319,7 +1319,7 @@ static const struct pinfunction sm8750_functions[] = { MSM_PIN_FUNCTION(ddr_pxi2), MSM_PIN_FUNCTION(ddr_pxi3), MSM_PIN_FUNCTION(dp_hot), - MSM_PIN_FUNCTION(egpio), + MSM_GPIO_PIN_FUNCTION(egpio), MSM_PIN_FUNCTION(gcc_gp1), MSM_PIN_FUNCTION(gcc_gp2), MSM_PIN_FUNCTION(gcc_gp3), diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index b7b15874e488a1..485b68cc93f8ed 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -438,7 +438,7 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: arg = pad->output_enabled; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_GPIO_CONF_PULL_UP: @@ -530,7 +530,7 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: pad->output_enabled = arg ? true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -737,7 +737,7 @@ static int pmic_gpio_direction_output(struct gpio_chip *chip, struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } @@ -769,7 +769,7 @@ static int pmic_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c index 22d76b1013a313..64f8024a865cd3 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c @@ -370,7 +370,7 @@ static int pmic_mpp_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_MPP_CONF_DTEST_SELECTOR: @@ -447,7 +447,7 @@ static int pmic_mpp_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_INPUT_ENABLE: pad->input_enabled = arg ? 
true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -576,7 +576,7 @@ static int pmic_mpp_direction_output(struct gpio_chip *chip, struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } @@ -605,7 +605,7 @@ static int pmic_mpp_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c index fb37b1c1acb41f..5c966d51eda7bb 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c @@ -282,7 +282,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (pin->mode & PM8XXX_GPIO_MODE_OUTPUT) arg = pin->output_value; else @@ -364,7 +364,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, pin->mode = PM8XXX_GPIO_MODE_INPUT; banks |= BIT(0) | BIT(1); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->mode = PM8XXX_GPIO_MODE_OUTPUT; pin->output_value = !!arg; banks |= BIT(0) | BIT(1); diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c index 6103849af042d6..7970fa6e15579e 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c @@ -337,7 +337,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: arg = pin->input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pin->output_value; break; case PIN_CONFIG_POWER_SOURCE: @@ -392,7 +392,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: pin->input = true; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->output = true; pin->output_value = !!arg; break; diff --git a/drivers/pinctrl/qcom/pinctrl-x1e80100.c b/drivers/pinctrl/qcom/pinctrl-x1e80100.c index d4b215f34c39bf..bb36f40b19fa53 100644 --- a/drivers/pinctrl/qcom/pinctrl-x1e80100.c +++ b/drivers/pinctrl/qcom/pinctrl-x1e80100.c @@ -1407,7 +1407,7 @@ static const char * const vsense_trigger_groups[] = { }; static const struct pinfunction x1e80100_functions[] = { - MSM_PIN_FUNCTION(gpio), + MSM_GPIO_PIN_FUNCTION(gpio), MSM_PIN_FUNCTION(RESOUT_GPIO), MSM_PIN_FUNCTION(aon_cci), MSM_PIN_FUNCTION(aoss_cti), diff --git a/drivers/pinctrl/renesas/Kconfig b/drivers/pinctrl/renesas/Kconfig index 99ae34a56871c2..8cbd79a1341468 100644 --- a/drivers/pinctrl/renesas/Kconfig +++ b/drivers/pinctrl/renesas/Kconfig @@ -44,6 +44,8 @@ config PINCTRL_RENESAS select PINCTRL_RZG2L if ARCH_R9A09G047 select PINCTRL_RZG2L if ARCH_R9A09G056 select PINCTRL_RZG2L if ARCH_R9A09G057 + select PINCTRL_RZT2H if ARCH_R9A09G077 + select PINCTRL_RZT2H if ARCH_R9A09G087 select PINCTRL_PFC_SH7203 if CPU_SUBTYPE_SH7203 select PINCTRL_PFC_SH7264 if CPU_SUBTYPE_SH7264 select PINCTRL_PFC_SH7269 if CPU_SUBTYPE_SH7269 @@ -302,6 +304,17 @@ config PINCTRL_RZN1 help This selects pinctrl driver for Renesas RZ/N1 devices. 
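Aside: the PINCTRL_RZT2H entry added next enables the new RZ/T2H driver (drivers/pinctrl/renesas/pinctrl-rzt2h.c, introduced further down in this series). That driver decodes each device-tree "pinmux" cell with FIELD_GET(): pin identifier in bits [15:0], mux function in bits [23:16]. A minimal sketch of the matching encode side, assuming 8 pins per port and a hypothetical helper name (illustration only, not part of the patch):

	#include <linux/bitfield.h>
	#include <linux/bits.h>
	#include <linux/types.h>

	#define MUX_PIN_ID_MASK	GENMASK(15, 0)	/* pin identifier, per pinctrl-rzt2h.c */
	#define MUX_FUNC_MASK	GENMASK(23, 16)	/* pin mux function, per pinctrl-rzt2h.c */
	#define PINS_PER_PORT	8		/* assumed; matches the per-port 8-bit pin masks */

	/* Hypothetical helper: pack one "pinmux" cell for a port/pin/function triple. */
	static inline u32 rzt2h_pinmux_cell(u32 port, u32 pin, u32 func)
	{
		return FIELD_PREP(MUX_FUNC_MASK, func) |
		       FIELD_PREP(MUX_PIN_ID_MASK, port * PINS_PER_PORT + pin);
	}

rzt2h_dt_subnode_to_map() in the new driver unpacks such cells back into one pin group plus one PSEL function per node.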
+config PINCTRL_RZT2H + bool "pin control support for RZ/N2H and RZ/T2H" if COMPILE_TEST + depends on 64BIT && OF + select GPIOLIB + select GENERIC_PINCTRL_GROUPS + select GENERIC_PINMUX_FUNCTIONS + select GENERIC_PINCONF + help + This selects GPIO and pinctrl driver for Renesas RZ/T2H + platforms. + config PINCTRL_RZV2M bool "pin control support for RZ/V2M" if COMPILE_TEST depends on OF diff --git a/drivers/pinctrl/renesas/Makefile b/drivers/pinctrl/renesas/Makefile index 2ba623e04bf8c7..1c5144a1c4b8f0 100644 --- a/drivers/pinctrl/renesas/Makefile +++ b/drivers/pinctrl/renesas/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_PINCTRL_RZA1) += pinctrl-rza1.o obj-$(CONFIG_PINCTRL_RZA2) += pinctrl-rza2.o obj-$(CONFIG_PINCTRL_RZG2L) += pinctrl-rzg2l.o obj-$(CONFIG_PINCTRL_RZN1) += pinctrl-rzn1.o +obj-$(CONFIG_PINCTRL_RZT2H) += pinctrl-rzt2h.o obj-$(CONFIG_PINCTRL_RZV2M) += pinctrl-rzv2m.o ifeq ($(CONFIG_COMPILE_TEST),y) diff --git a/drivers/pinctrl/renesas/pfc-r8a779g0.c b/drivers/pinctrl/renesas/pfc-r8a779g0.c index cae3e65534997d..218c5eff9b67fe 100644 --- a/drivers/pinctrl/renesas/pfc-r8a779g0.c +++ b/drivers/pinctrl/renesas/pfc-r8a779g0.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * R8A779A0 processor support - PFC hardware block. + * R8A779G0 processor support - PFC hardware block. * * Copyright (C) 2021 Renesas Electronics Corp. * diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c index 23812116ef4268..f24e5915cbe4b4 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza1.c +++ b/drivers/pinctrl/renesas/pinctrl-rza1.c @@ -933,7 +933,7 @@ static int rza1_parse_pinmux_node(struct rza1_pinctrl *rza1_pctl, case PIN_CONFIG_INPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_INPUT; break; - case PIN_CONFIG_OUTPUT: /* for DT backwards compatibility */ + case PIN_CONFIG_LEVEL: /* for DT backwards compatibility */ case PIN_CONFIG_OUTPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_OUTPUT; break; @@ -1120,7 +1120,7 @@ static int rza1_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, { struct rza1_pinctrl *rza1_pctl = pinctrl_dev_get_drvdata(pctldev); struct rza1_mux_conf *mux_confs; - struct function_desc *func; + const struct function_desc *func; struct group_desc *grp; int i; diff --git a/drivers/pinctrl/renesas/pinctrl-rza2.c b/drivers/pinctrl/renesas/pinctrl-rza2.c index b78b5b4ec5afd9..29a9db19759906 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza2.c +++ b/drivers/pinctrl/renesas/pinctrl-rza2.c @@ -442,7 +442,7 @@ static int rza2_set_mux(struct pinctrl_dev *pctldev, unsigned int selector, unsigned int group) { struct rza2_pinctrl_priv *priv = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; unsigned int i, *psel_val; struct group_desc *grp; diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index c52263c2a7b093..f524af6f586f4a 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -146,8 +146,6 @@ #define SD_CH(off, ch) ((off) + (ch) * 4) #define ETH_POC(off, ch) ((off) + (ch) * 4) #define QSPI (0x3008) -#define ETH_MODE (0x3018) -#define PFC_OEN (0x3C40) /* known on RZ/V2H(P) only */ #define PVDD_2500 2 /* I/O domain voltage 2.5V */ #define PVDD_1800 1 /* I/O domain voltage <= 1.8V */ @@ -221,11 +219,13 @@ static const struct pin_config_item renesas_rzv2h_conf_items[] = { * @pwpr: PWPR register offset * @sd_ch: SD_CH register offset * @eth_poc: ETH_POC register offset + * @oen: OEN register offset */ struct 
rzg2l_register_offsets { u16 pwpr; u16 sd_ch; u16 eth_poc; + u16 oen; }; /** @@ -254,6 +254,7 @@ enum rzg2l_iolh_index { * @iolh_groupb_oi: IOLH group B output impedance specific values * @tint_start_index: the start index for the TINT interrupts * @drive_strength_ua: drive strength in uA is supported (otherwise mA is supported) + * @oen_pwpr_lock: flag indicating if the OEN register is locked by PWPR * @func_base: base number for port function (see register PFC) * @oen_max_pin: the maximum pin number supporting output enable * @oen_max_port: the maximum port number supporting output enable @@ -266,6 +267,7 @@ struct rzg2l_hwcfg { u16 iolh_groupb_oi[4]; u16 tint_start_index; bool drive_strength_ua; + bool oen_pwpr_lock; u8 func_base; u8 oen_max_pin; u8 oen_max_port; @@ -295,8 +297,7 @@ struct rzg2l_pinctrl_data { #endif void (*pwpr_pfc_lock_unlock)(struct rzg2l_pinctrl *pctrl, bool lock); void (*pmc_writeb)(struct rzg2l_pinctrl *pctrl, u8 val, u16 offset); - u32 (*oen_read)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); - int (*oen_write)(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen); + int (*pin_to_oen_bit)(struct rzg2l_pinctrl *pctrl, unsigned int _pin); int (*hw_to_bias_param)(unsigned int val); int (*bias_param_to_hw)(enum pin_config_param param); }; @@ -320,9 +321,10 @@ struct rzg2l_pinctrl_pin_settings { * @iolh: IOLH registers cache * @pupd: PUPD registers cache * @ien: IEN registers cache + * @smt: SMT registers cache * @sd_ch: SD_CH registers cache * @eth_poc: ET_POC registers cache - * @eth_mode: ETH_MODE register cache + * @oen: Output Enable register cache * @qspi: QSPI registers cache */ struct rzg2l_pinctrl_reg_cache { @@ -333,9 +335,10 @@ struct rzg2l_pinctrl_reg_cache { u32 *iolh[2]; u32 *ien[2]; u32 *pupd[2]; + u32 *smt; u8 sd_ch[2]; u8 eth_poc[2]; - u8 eth_mode; + u8 oen; u8 qspi; }; @@ -394,6 +397,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 5, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 6, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PA, 7, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 0, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 2, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 5, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PB, 7, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 0, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 1, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 2, RZV2H_MPXED_PIN_FUNCS), @@ -402,6 +413,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 5, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 6, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PD, 7, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 0, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 2, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 5, RZV2H_MPXED_PIN_FUNCS), + 
RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PE, 7, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 0, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PG, 2, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_IEN), @@ -421,6 +440,14 @@ static const u64 r9a09g047_variable_pin_cfg[] = { RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 2, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 3, RZV2H_MPXED_PIN_FUNCS), RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PJ, 4, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 0, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 1, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 2, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 3, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 4, RZV2H_MPXED_PIN_FUNCS | PIN_CFG_OEN), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 5, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 6, RZV2H_MPXED_PIN_FUNCS), + RZG2L_VARIABLE_PIN_CFG_PACK(RZG3E_PL, 7, RZV2H_MPXED_PIN_FUNCS), }; static const u64 r9a09g057_variable_pin_cfg[] = { @@ -549,7 +576,7 @@ static int rzg2l_pinctrl_set_mux(struct pinctrl_dev *pctldev, { struct rzg2l_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg; - struct function_desc *func; + const struct function_desc *func; unsigned int i, *psel_val; struct group_desc *group; const unsigned int *pins; @@ -1065,34 +1092,48 @@ static int rzg2l_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) return -EINVAL; } -static u32 rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzg2l_read_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { int bit; - bit = rzg2l_pin_to_oen_bit(pctrl, _pin); + if (!pctrl->data->pin_to_oen_bit) + return -EOPNOTSUPP; + + bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) - return 0; + return -EINVAL; - return !(readb(pctrl->base + ETH_MODE) & BIT(bit)); + return !(readb(pctrl->base + pctrl->data->hwcfg->regs.oen) & BIT(bit)); } static int rzg2l_write_oen(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) { + const struct rzg2l_register_offsets *regs = &pctrl->data->hwcfg->regs; + u16 oen_offset = pctrl->data->hwcfg->regs.oen; unsigned long flags; + u8 val, pwpr; int bit; - u8 val; - bit = rzg2l_pin_to_oen_bit(pctrl, _pin); + if (!pctrl->data->pin_to_oen_bit) + return -EOPNOTSUPP; + + bit = pctrl->data->pin_to_oen_bit(pctrl, _pin); if (bit < 0) - return bit; + return -EINVAL; spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + ETH_MODE); + val = readb(pctrl->base + oen_offset); if (oen) val &= ~BIT(bit); else val |= BIT(bit); - writeb(val, pctrl->base + ETH_MODE); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + pwpr = readb(pctrl->base + regs->pwpr); + writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); + } + writeb(val, pctrl->base + oen_offset); + if (pctrl->data->hwcfg->oen_pwpr_lock) + writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); spin_unlock_irqrestore(&pctrl->lock, flags); return 0; @@ -1118,39 +1159,6 @@ static int rzg3s_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) return bit; } -static u32 rzg3s_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) -{ - int bit; - - bit = rzg3s_pin_to_oen_bit(pctrl, _pin); - if (bit < 0) - return bit; - - return !(readb(pctrl->base + ETH_MODE) 
& BIT(bit)); -} - -static int rzg3s_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) -{ - unsigned long flags; - int bit; - u8 val; - - bit = rzg3s_pin_to_oen_bit(pctrl, _pin); - if (bit < 0) - return bit; - - spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + ETH_MODE); - if (oen) - val &= ~BIT(bit); - else - val |= BIT(bit); - writeb(val, pctrl->base + ETH_MODE); - spin_unlock_irqrestore(&pctrl->lock, flags); - - return 0; -} - static int rzg2l_hw_to_bias_param(unsigned int bias) { switch (bias) { @@ -1216,55 +1224,37 @@ static int rzv2h_bias_param_to_hw(enum pin_config_param param) return -EINVAL; } -static u8 rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzg2l_pin_names_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin, + const char * const pin_names[], unsigned int count) { - static const char * const pin_names[] = { "ET0_TXC_TXCLK", "ET1_TXC_TXCLK", - "XSPI0_RESET0N", "XSPI0_CS0N", - "XSPI0_CKN", "XSPI0_CKP" }; const struct pinctrl_pin_desc *pin_desc = &pctrl->desc.pins[_pin]; unsigned int i; - for (i = 0; i < ARRAY_SIZE(pin_names); i++) { + for (i = 0; i < count; i++) { if (!strcmp(pin_desc->name, pin_names[i])) return i; } - /* Should not happen. */ - return 0; + return -EINVAL; } -static u32 rzv2h_oen_read(struct rzg2l_pinctrl *pctrl, unsigned int _pin) +static int rzv2h_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { - u8 bit; - - bit = rzv2h_pin_to_oen_bit(pctrl, _pin); + static const char * const pin_names[] = { + "ET0_TXC_TXCLK", "ET1_TXC_TXCLK", "XSPI0_RESET0N", + "XSPI0_CS0N", "XSPI0_CKN", "XSPI0_CKP" + }; - return !(readb(pctrl->base + PFC_OEN) & BIT(bit)); + return rzg2l_pin_names_to_oen_bit(pctrl, _pin, pin_names, ARRAY_SIZE(pin_names)); } -static int rzv2h_oen_write(struct rzg2l_pinctrl *pctrl, unsigned int _pin, u8 oen) +static int rzg3e_pin_to_oen_bit(struct rzg2l_pinctrl *pctrl, unsigned int _pin) { - const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg; - const struct rzg2l_register_offsets *regs = &hwcfg->regs; - unsigned long flags; - u8 val, bit; - u8 pwpr; - - bit = rzv2h_pin_to_oen_bit(pctrl, _pin); - spin_lock_irqsave(&pctrl->lock, flags); - val = readb(pctrl->base + PFC_OEN); - if (oen) - val &= ~BIT(bit); - else - val |= BIT(bit); - - pwpr = readb(pctrl->base + regs->pwpr); - writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); - writeb(val, pctrl->base + PFC_OEN); - writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); - spin_unlock_irqrestore(&pctrl->lock, flags); + static const char * const pin_names[] = { + "PB1", "PE1", "PL4", "PL1", "PL2", "PL0" + }; - return 0; + return rzg2l_pin_names_to_oen_bit(pctrl, _pin, pin_names, ARRAY_SIZE(pin_names)); } static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev, @@ -1308,11 +1298,10 @@ static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: if (!(cfg & PIN_CFG_OEN)) return -EINVAL; - if (!pctrl->data->oen_read) - return -EOPNOTSUPP; - arg = pctrl->data->oen_read(pctrl, _pin); - if (!arg) - return -EINVAL; + ret = rzg2l_read_oen(pctrl, _pin); + if (ret < 0) + return ret; + arg = ret; break; case PIN_CONFIG_POWER_SOURCE: @@ -1471,9 +1460,7 @@ static int rzg2l_pinctrl_pinconf_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: if (!(cfg & PIN_CFG_OEN)) return -EINVAL; - if (!pctrl->data->oen_write) - return -EOPNOTSUPP; - ret = pctrl->data->oen_write(pctrl, _pin, !!arg); + ret = rzg2l_write_oen(pctrl, _pin, !!arg); if (ret) return ret; break; @@ -2058,17 
+2045,17 @@ static const u64 r9a09g047_gpio_configs[] = { RZG2L_GPIO_PORT_PACK(6, 0x28, RZV2H_MPXED_PIN_FUNCS), /* P8 */ 0x0, RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2a), /* PA */ - RZG2L_GPIO_PORT_PACK(8, 0x2b, RZV2H_MPXED_PIN_FUNCS), /* PB */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2b), /* PB */ RZG2L_GPIO_PORT_PACK(3, 0x2c, RZV2H_MPXED_PIN_FUNCS), /* PC */ RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2d), /* PD */ - RZG2L_GPIO_PORT_PACK(8, 0x2e, RZV2H_MPXED_PIN_FUNCS), /* PE */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x2e), /* PE */ RZG2L_GPIO_PORT_PACK(3, 0x2f, RZV2H_MPXED_PIN_FUNCS), /* PF */ RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x30), /* PG */ RZG2L_GPIO_PORT_PACK_VARIABLE(6, 0x31), /* PH */ 0x0, RZG2L_GPIO_PORT_PACK_VARIABLE(5, 0x33), /* PJ */ RZG2L_GPIO_PORT_PACK(4, 0x34, RZV2H_MPXED_PIN_FUNCS), /* PK */ - RZG2L_GPIO_PORT_PACK(8, 0x35, RZV2H_MPXED_PIN_FUNCS), /* PL */ + RZG2L_GPIO_PORT_PACK_VARIABLE(8, 0x35), /* PL */ RZG2L_GPIO_PORT_PACK(8, 0x36, RZV2H_MPXED_PIN_FUNCS), /* PM */ 0x0, 0x0, @@ -2719,6 +2706,10 @@ static int rzg2l_pinctrl_reg_cache_alloc(struct rzg2l_pinctrl *pctrl) if (!cache->pfc) return -ENOMEM; + cache->smt = devm_kcalloc(pctrl->dev, nports, sizeof(*cache->smt), GFP_KERNEL); + if (!cache->smt) + return -ENOMEM; + for (u8 i = 0; i < 2; i++) { u32 n_dedicated_pins = pctrl->data->n_dedicated_pins; @@ -2980,7 +2971,7 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen struct rzg2l_pinctrl_reg_cache *cache = pctrl->cache; for (u32 port = 0; port < nports; port++) { - bool has_iolh, has_ien, has_pupd; + bool has_iolh, has_ien, has_pupd, has_smt; u32 off, caps; u8 pincnt; u64 cfg; @@ -2993,6 +2984,7 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen has_iolh = !!(caps & (PIN_CFG_IOLH_A | PIN_CFG_IOLH_B | PIN_CFG_IOLH_C)); has_ien = !!(caps & PIN_CFG_IEN); has_pupd = !!(caps & PIN_CFG_PUPD); + has_smt = !!(caps & PIN_CFG_SMT); if (suspend) RZG2L_PCTRL_REG_ACCESS32(suspend, pctrl->base + PFC(off), cache->pfc[port]); @@ -3031,6 +3023,9 @@ static void rzg2l_pinctrl_pm_setup_regs(struct rzg2l_pinctrl *pctrl, bool suspen cache->ien[1][port]); } } + + if (has_smt) + RZG2L_PCTRL_REG_ACCESS32(suspend, pctrl->base + SMT(off), cache->smt[port]); } } @@ -3164,7 +3159,7 @@ static int rzg2l_pinctrl_suspend_noirq(struct device *dev) } cache->qspi = readb(pctrl->base + QSPI); - cache->eth_mode = readb(pctrl->base + ETH_MODE); + cache->oen = readb(pctrl->base + pctrl->data->hwcfg->regs.oen); if (!atomic_read(&pctrl->wakeup_path)) clk_disable_unprepare(pctrl->clk); @@ -3180,6 +3175,8 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) const struct rzg2l_hwcfg *hwcfg = pctrl->data->hwcfg; const struct rzg2l_register_offsets *regs = &hwcfg->regs; struct rzg2l_pinctrl_reg_cache *cache = pctrl->cache; + unsigned long flags; + u8 pwpr; int ret; if (!atomic_read(&pctrl->wakeup_path)) { @@ -3189,7 +3186,16 @@ static int rzg2l_pinctrl_resume_noirq(struct device *dev) } writeb(cache->qspi, pctrl->base + QSPI); - writeb(cache->eth_mode, pctrl->base + ETH_MODE); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + spin_lock_irqsave(&pctrl->lock, flags); + pwpr = readb(pctrl->base + regs->pwpr); + writeb(pwpr | PWPR_REGWE_B, pctrl->base + regs->pwpr); + } + writeb(cache->oen, pctrl->base + pctrl->data->hwcfg->regs.oen); + if (pctrl->data->hwcfg->oen_pwpr_lock) { + writeb(pwpr & ~PWPR_REGWE_B, pctrl->base + regs->pwpr); + spin_unlock_irqrestore(&pctrl->lock, flags); + } for (u8 i = 0; i < 2; i++) { if (regs->sd_ch) writeb(cache->sd_ch[i], pctrl->base + 
SD_CH(regs->sd_ch, i)); @@ -3241,6 +3247,7 @@ static const struct rzg2l_hwcfg rzg2l_hwcfg = { .pwpr = 0x3014, .sd_ch = 0x3000, .eth_poc = 0x300c, + .oen = 0x3018, }, .iolh_groupa_ua = { /* 3v3 power source */ @@ -3256,6 +3263,7 @@ static const struct rzg2l_hwcfg rzg3s_hwcfg = { .pwpr = 0x3000, .sd_ch = 0x3004, .eth_poc = 0x3010, + .oen = 0x3018, }, .iolh_groupa_ua = { /* 1v8 power source */ @@ -3287,8 +3295,10 @@ static const struct rzg2l_hwcfg rzg3s_hwcfg = { static const struct rzg2l_hwcfg rzv2h_hwcfg = { .regs = { .pwpr = 0x3c04, + .oen = 0x3c40, }, .tint_start_index = 17, + .oen_pwpr_lock = true, }; static struct rzg2l_pinctrl_data r9a07g043_data = { @@ -3305,8 +3315,7 @@ static struct rzg2l_pinctrl_data r9a07g043_data = { #endif .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, + .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3322,8 +3331,7 @@ static struct rzg2l_pinctrl_data r9a07g044_data = { .hwcfg = &rzg2l_hwcfg, .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, - .oen_read = &rzg2l_read_oen, - .oen_write = &rzg2l_write_oen, + .pin_to_oen_bit = &rzg2l_pin_to_oen_bit, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3338,8 +3346,7 @@ static struct rzg2l_pinctrl_data r9a08g045_data = { .hwcfg = &rzg3s_hwcfg, .pwpr_pfc_lock_unlock = &rzg2l_pwpr_pfc_lock_unlock, .pmc_writeb = &rzg2l_pmc_writeb, - .oen_read = &rzg3s_oen_read, - .oen_write = &rzg3s_oen_write, + .pin_to_oen_bit = &rzg3s_pin_to_oen_bit, .hw_to_bias_param = &rzg2l_hw_to_bias_param, .bias_param_to_hw = &rzg2l_bias_param_to_hw, }; @@ -3361,8 +3368,7 @@ static struct rzg2l_pinctrl_data r9a09g047_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, + .pin_to_oen_bit = &rzg3e_pin_to_oen_bit, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; @@ -3384,8 +3390,7 @@ static struct rzg2l_pinctrl_data r9a09g056_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, + .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; @@ -3408,8 +3413,7 @@ static struct rzg2l_pinctrl_data r9a09g057_data = { #endif .pwpr_pfc_lock_unlock = &rzv2h_pwpr_pfc_lock_unlock, .pmc_writeb = &rzv2h_pmc_writeb, - .oen_read = &rzv2h_oen_read, - .oen_write = &rzv2h_oen_write, + .pin_to_oen_bit = &rzv2h_pin_to_oen_bit, .hw_to_bias_param = &rzv2h_hw_to_bias_param, .bias_param_to_hw = &rzv2h_bias_param_to_hw, }; diff --git a/drivers/pinctrl/renesas/pinctrl-rzt2h.c b/drivers/pinctrl/renesas/pinctrl-rzt2h.c new file mode 100644 index 00000000000000..3872638f5ebb39 --- /dev/null +++ b/drivers/pinctrl/renesas/pinctrl-rzt2h.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/T2H Pin Control and GPIO driver core + * + * Based on drivers/pinctrl/renesas/pinctrl-rzg2l.c + * + * Copyright (C) 2025 Renesas Electronics Corporation. 
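+ * + * The hardware exposes a normal ("nsr") register region and an + * optional safety ("srs") region; see rzt2h_pinctrl_cfg_regions().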
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "../core.h" +#include "../pinconf.h" +#include "../pinmux.h" + +#define DRV_NAME "pinctrl-rzt2h" + +#define P(m) (0x001 * (m)) +#define PM(m) (0x200 + 2 * (m)) +#define PMC(m) (0x400 + (m)) +#define PFC(m) (0x600 + 8 * (m)) +#define PIN(m) (0x800 + (m)) +#define RSELP(m) (0xc00 + (m)) + +#define PM_MASK GENMASK(1, 0) +#define PM_PIN_MASK(pin) (PM_MASK << ((pin) * 2)) +#define PM_INPUT BIT(0) +#define PM_OUTPUT BIT(1) + +#define PFC_MASK GENMASK_ULL(5, 0) +#define PFC_PIN_MASK(pin) (PFC_MASK << ((pin) * 8)) + +/* + * Use 16 lower bits [15:0] for pin identifier + * Use 8 higher bits [23:16] for pin mux function + */ +#define MUX_PIN_ID_MASK GENMASK(15, 0) +#define MUX_FUNC_MASK GENMASK(23, 16) + +#define RZT2H_PIN_ID_TO_PORT(id) ((id) / RZT2H_PINS_PER_PORT) +#define RZT2H_PIN_ID_TO_PIN(id) ((id) % RZT2H_PINS_PER_PORT) + +#define RZT2H_MAX_SAFETY_PORTS 12 + +struct rzt2h_pinctrl_data { + unsigned int n_port_pins; + const u8 *port_pin_configs; + unsigned int n_ports; +}; + +struct rzt2h_pinctrl { + struct pinctrl_dev *pctl; + struct pinctrl_desc desc; + struct pinctrl_pin_desc *pins; + const struct rzt2h_pinctrl_data *data; + void __iomem *base0, *base1; + struct device *dev; + struct gpio_chip gpio_chip; + struct pinctrl_gpio_range gpio_range; + spinlock_t lock; /* lock read/write registers */ + struct mutex mutex; /* serialize adding groups and functions */ + bool safety_port_enabled; +}; + +#define RZT2H_GET_BASE(pctrl, port) \ + ((port) > RZT2H_MAX_SAFETY_PORTS ? (pctrl)->base0 : (pctrl)->base1) + +#define RZT2H_PINCTRL_REG_ACCESS(size, type) \ +static inline void rzt2h_pinctrl_write##size(struct rzt2h_pinctrl *pctrl, u8 port, \ + type val, unsigned int offset) \ +{ \ + write##size(val, RZT2H_GET_BASE(pctrl, port) + offset); \ +} \ +static inline type rzt2h_pinctrl_read##size(struct rzt2h_pinctrl *pctrl, u8 port, \ + unsigned int offset) \ +{ \ + return read##size(RZT2H_GET_BASE(pctrl, port) + offset); \ +} + +RZT2H_PINCTRL_REG_ACCESS(b, u8) +RZT2H_PINCTRL_REG_ACCESS(w, u16) +RZT2H_PINCTRL_REG_ACCESS(q, u64) + +static int rzt2h_validate_pin(struct rzt2h_pinctrl *pctrl, unsigned int offset) +{ + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 pin = RZT2H_PIN_ID_TO_PIN(offset); + u8 pincfg; + + if (offset >= pctrl->data->n_port_pins || port >= pctrl->data->n_ports) + return -EINVAL; + + if (!pctrl->safety_port_enabled && port <= RZT2H_MAX_SAFETY_PORTS) + return -EINVAL; + + pincfg = pctrl->data->port_pin_configs[port]; + return (pincfg & BIT(pin)) ? 
0 : -EINVAL; +} + +static void rzt2h_pinctrl_set_pfc_mode(struct rzt2h_pinctrl *pctrl, + u8 port, u8 pin, u8 func) +{ + u64 reg64; + u16 reg16; + + guard(spinlock_irqsave)(&pctrl->lock); + + /* Set pin to 'Non-use (Hi-Z input protection)' */ + reg16 = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg16 &= ~PM_PIN_MASK(pin); + rzt2h_pinctrl_writew(pctrl, port, reg16, PM(port)); + + /* Temporarily switch to GPIO mode with PMC register */ + reg16 = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + rzt2h_pinctrl_writeb(pctrl, port, reg16 & ~BIT(pin), PMC(port)); + + /* Select Pin function mode with PFC register */ + reg64 = rzt2h_pinctrl_readq(pctrl, port, PFC(port)); + reg64 &= ~PFC_PIN_MASK(pin); + rzt2h_pinctrl_writeq(pctrl, port, reg64 | ((u64)func << (pin * 8)), PFC(port)); + + /* Switch to Peripheral pin function with PMC register */ + reg16 = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + rzt2h_pinctrl_writeb(pctrl, port, reg16 | BIT(pin), PMC(port)); +}; + +static int rzt2h_pinctrl_set_mux(struct pinctrl_dev *pctldev, + unsigned int func_selector, + unsigned int group_selector) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + const struct function_desc *func; + struct group_desc *group; + const unsigned int *pins; + unsigned int i; + u8 *psel_val; + int ret; + + func = pinmux_generic_get_function(pctldev, func_selector); + if (!func) + return -EINVAL; + + group = pinctrl_generic_get_group(pctldev, group_selector); + if (!group) + return -EINVAL; + + psel_val = func->data; + pins = group->grp.pins; + + for (i = 0; i < group->grp.npins; i++) { + dev_dbg(pctrl->dev, "port:%u pin:%u PSEL:%u\n", + RZT2H_PIN_ID_TO_PORT(pins[i]), RZT2H_PIN_ID_TO_PIN(pins[i]), + psel_val[i]); + ret = rzt2h_validate_pin(pctrl, pins[i]); + if (ret) + return ret; + + rzt2h_pinctrl_set_pfc_mode(pctrl, RZT2H_PIN_ID_TO_PORT(pins[i]), + RZT2H_PIN_ID_TO_PIN(pins[i]), psel_val[i]); + } + + return 0; +}; + +static int rzt2h_map_add_config(struct pinctrl_map *map, + const char *group_or_pin, + enum pinctrl_map_type type, + unsigned long *configs, + unsigned int num_configs) +{ + unsigned long *cfgs; + + cfgs = kmemdup_array(configs, num_configs, sizeof(*cfgs), GFP_KERNEL); + if (!cfgs) + return -ENOMEM; + + map->type = type; + map->data.configs.group_or_pin = group_or_pin; + map->data.configs.configs = cfgs; + map->data.configs.num_configs = num_configs; + + return 0; +} + +static int rzt2h_dt_subnode_to_map(struct pinctrl_dev *pctldev, + struct device_node *np, + struct device_node *parent, + struct pinctrl_map **map, + unsigned int *num_maps, + unsigned int *index) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + struct pinctrl_map *maps = *map; + unsigned int nmaps = *num_maps; + unsigned long *configs = NULL; + unsigned int num_pinmux = 0; + unsigned int idx = *index; + unsigned int num_pins, i; + unsigned int num_configs; + struct property *pinmux; + struct property *prop; + int ret, gsel, fsel; + const char **pin_fn; + unsigned int *pins; + const char *name; + const char *pin; + u8 *psel_val; + + pinmux = of_find_property(np, "pinmux", NULL); + if (pinmux) + num_pinmux = pinmux->length / sizeof(u32); + + ret = of_property_count_strings(np, "pins"); + if (ret == -EINVAL) { + num_pins = 0; + } else if (ret < 0) { + dev_err(pctrl->dev, "Invalid pins list in DT\n"); + return ret; + } else { + num_pins = ret; + } + + if (!num_pinmux && !num_pins) + return 0; + + if (num_pinmux && num_pins) { + dev_err(pctrl->dev, + "DT node must contain either a pinmux or pins and not both\n"); + 
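+ /* A node carries either "pins" (configs only) or "pinmux" (mux plus optional configs). */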
return -EINVAL; + } + + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &num_configs); + if (ret < 0) + return ret; + + if (num_pins && !num_configs) { + dev_err(pctrl->dev, "DT node must contain a config\n"); + ret = -ENODEV; + goto done; + } + + if (num_pinmux) { + nmaps += 1; + if (num_configs) + nmaps += 1; + } + + if (num_pins) + nmaps += num_pins; + + maps = krealloc_array(maps, nmaps, sizeof(*maps), GFP_KERNEL); + if (!maps) { + ret = -ENOMEM; + goto done; + } + + *map = maps; + *num_maps = nmaps; + if (num_pins) { + of_property_for_each_string(np, "pins", prop, pin) { + ret = rzt2h_map_add_config(&maps[idx], pin, + PIN_MAP_TYPE_CONFIGS_PIN, + configs, num_configs); + if (ret < 0) + goto done; + + idx++; + } + ret = 0; + goto done; + } + + pins = devm_kcalloc(pctrl->dev, num_pinmux, sizeof(*pins), GFP_KERNEL); + psel_val = devm_kcalloc(pctrl->dev, num_pinmux, sizeof(*psel_val), + GFP_KERNEL); + pin_fn = devm_kzalloc(pctrl->dev, sizeof(*pin_fn), GFP_KERNEL); + if (!pins || !psel_val || !pin_fn) { + ret = -ENOMEM; + goto done; + } + + /* Collect pin locations and mux settings from DT properties */ + for (i = 0; i < num_pinmux; ++i) { + u32 value; + + ret = of_property_read_u32_index(np, "pinmux", i, &value); + if (ret) + goto done; + pins[i] = FIELD_GET(MUX_PIN_ID_MASK, value); + psel_val[i] = FIELD_GET(MUX_FUNC_MASK, value); + } + + if (parent) { + name = devm_kasprintf(pctrl->dev, GFP_KERNEL, "%pOFn.%pOFn", + parent, np); + if (!name) { + ret = -ENOMEM; + goto done; + } + } else { + name = np->name; + } + + if (num_configs) { + ret = rzt2h_map_add_config(&maps[idx], name, + PIN_MAP_TYPE_CONFIGS_GROUP, + configs, num_configs); + if (ret < 0) + goto done; + + idx++; + } + + scoped_guard(mutex, &pctrl->mutex) { + /* Register a single pin group listing all the pins we read from DT */ + gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL); + if (gsel < 0) { + ret = gsel; + goto done; + } + + /* + * Register a single group function where the 'data' is an array of PSEL + * register values read from DT.
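+ * (rzt2h_pinctrl_set_mux() above retrieves this array through func->data.)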
+ */ + pin_fn[0] = name; + fsel = pinmux_generic_add_function(pctldev, name, pin_fn, 1, psel_val); + if (fsel < 0) { + ret = fsel; + goto remove_group; + } + } + + maps[idx].type = PIN_MAP_TYPE_MUX_GROUP; + maps[idx].data.mux.group = name; + maps[idx].data.mux.function = name; + idx++; + + dev_dbg(pctrl->dev, "Parsed %pOF with %d pins\n", np, num_pinmux); + ret = 0; + goto done; + +remove_group: + pinctrl_generic_remove_group(pctldev, gsel); +done: + *index = idx; + kfree(configs); + return ret; +} + +static void rzt2h_dt_free_map(struct pinctrl_dev *pctldev, + struct pinctrl_map *map, + unsigned int num_maps) +{ + unsigned int i; + + if (!map) + return; + + for (i = 0; i < num_maps; ++i) { + if (map[i].type == PIN_MAP_TYPE_CONFIGS_GROUP || + map[i].type == PIN_MAP_TYPE_CONFIGS_PIN) + kfree(map[i].data.configs.configs); + } + kfree(map); +} + +static int rzt2h_dt_node_to_map(struct pinctrl_dev *pctldev, + struct device_node *np, + struct pinctrl_map **map, + unsigned int *num_maps) +{ + struct rzt2h_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); + unsigned int index; + int ret; + + *map = NULL; + *num_maps = 0; + index = 0; + + for_each_child_of_node_scoped(np, child) { + ret = rzt2h_dt_subnode_to_map(pctldev, child, np, map, + num_maps, &index); + if (ret < 0) + goto done; + } + + if (*num_maps == 0) { + ret = rzt2h_dt_subnode_to_map(pctldev, np, NULL, map, + num_maps, &index); + if (ret < 0) + goto done; + } + + if (*num_maps) + return 0; + + dev_err(pctrl->dev, "no mapping found in node %pOF\n", np); + ret = -EINVAL; + +done: + rzt2h_dt_free_map(pctldev, *map, *num_maps); + return ret; +} + +static const struct pinctrl_ops rzt2h_pinctrl_pctlops = { + .get_groups_count = pinctrl_generic_get_group_count, + .get_group_name = pinctrl_generic_get_group_name, + .get_group_pins = pinctrl_generic_get_group_pins, + .dt_node_to_map = rzt2h_dt_node_to_map, + .dt_free_map = rzt2h_dt_free_map, +}; + +static const struct pinmux_ops rzt2h_pinctrl_pmxops = { + .get_functions_count = pinmux_generic_get_function_count, + .get_function_name = pinmux_generic_get_function_name, + .get_function_groups = pinmux_generic_get_function_groups, + .set_mux = rzt2h_pinctrl_set_mux, + .strict = true, +}; + +static int rzt2h_gpio_request(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + int ret; + u8 reg; + + ret = rzt2h_validate_pin(pctrl, offset); + if (ret) + return ret; + + ret = pinctrl_gpio_request(chip, offset); + if (ret) + return ret; + + guard(spinlock_irqsave)(&pctrl->lock); + + /* Select GPIO mode in PMC Register */ + reg = rzt2h_pinctrl_readb(pctrl, port, PMC(port)); + reg &= ~BIT(bit); + rzt2h_pinctrl_writeb(pctrl, port, reg, PMC(port)); + + return 0; +} + +static void rzt2h_gpio_set_direction(struct rzt2h_pinctrl *pctrl, u32 port, + u8 bit, bool output) +{ + u16 reg; + + guard(spinlock_irqsave)(&pctrl->lock); + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg &= ~PM_PIN_MASK(bit); + + reg |= (output ? 
PM_OUTPUT : PM_INPUT) << (bit * 2); + rzt2h_pinctrl_writew(pctrl, port, reg, PM(port)); +} + +static int rzt2h_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u16 reg; + int ret; + + ret = rzt2h_validate_pin(pctrl, offset); + if (ret) + return ret; + + if (rzt2h_pinctrl_readb(pctrl, port, PMC(port)) & BIT(bit)) + return -EINVAL; + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg = (reg >> (bit * 2)) & PM_MASK; + if (reg & PM_OUTPUT) + return GPIO_LINE_DIRECTION_OUT; + if (reg & PM_INPUT) + return GPIO_LINE_DIRECTION_IN; + + return -EINVAL; +} + +static int rzt2h_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u8 reg; + + guard(spinlock_irqsave)(&pctrl->lock); + + reg = rzt2h_pinctrl_readb(pctrl, port, P(port)); + if (value) + rzt2h_pinctrl_writeb(pctrl, port, reg | BIT(bit), P(port)); + else + rzt2h_pinctrl_writeb(pctrl, port, reg & ~BIT(bit), P(port)); + + return 0; +} + +static int rzt2h_gpio_get(struct gpio_chip *chip, unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + u16 reg; + + reg = rzt2h_pinctrl_readw(pctrl, port, PM(port)); + reg = (reg >> (bit * 2)) & PM_MASK; + if (reg & PM_INPUT) + return !!(rzt2h_pinctrl_readb(pctrl, port, PIN(port)) & BIT(bit)); + if (reg & PM_OUTPUT) + return !!(rzt2h_pinctrl_readb(pctrl, port, P(port)) & BIT(bit)); + + return -EINVAL; +} + +static int rzt2h_gpio_direction_input(struct gpio_chip *chip, + unsigned int offset) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + + rzt2h_gpio_set_direction(pctrl, port, bit, false); + + return 0; +} + +static int rzt2h_gpio_direction_output(struct gpio_chip *chip, + unsigned int offset, int value) +{ + struct rzt2h_pinctrl *pctrl = gpiochip_get_data(chip); + u8 port = RZT2H_PIN_ID_TO_PORT(offset); + u8 bit = RZT2H_PIN_ID_TO_PIN(offset); + + rzt2h_gpio_set(chip, offset, value); + rzt2h_gpio_set_direction(pctrl, port, bit, true); + + return 0; +} + +static void rzt2h_gpio_free(struct gpio_chip *chip, unsigned int offset) +{ + pinctrl_gpio_free(chip, offset); + + /* + * Set the GPIO as an input to ensure that the next GPIO request won't + * drive the GPIO pin as an output. 
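+ * (Only the PM register is touched here; the PMC GPIO/peripheral + * selection is left unchanged.)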
+ */ + rzt2h_gpio_direction_input(chip, offset); +} + +static const char * const rzt2h_gpio_names[] = { + "P00_0", "P00_1", "P00_2", "P00_3", "P00_4", "P00_5", "P00_6", "P00_7", + "P01_0", "P01_1", "P01_2", "P01_3", "P01_4", "P01_5", "P01_6", "P01_7", + "P02_0", "P02_1", "P02_2", "P02_3", "P02_4", "P02_5", "P02_6", "P02_7", + "P03_0", "P03_1", "P03_2", "P03_3", "P03_4", "P03_5", "P03_6", "P03_7", + "P04_0", "P04_1", "P04_2", "P04_3", "P04_4", "P04_5", "P04_6", "P04_7", + "P05_0", "P05_1", "P05_2", "P05_3", "P05_4", "P05_5", "P05_6", "P05_7", + "P06_0", "P06_1", "P06_2", "P06_3", "P06_4", "P06_5", "P06_6", "P06_7", + "P07_0", "P07_1", "P07_2", "P07_3", "P07_4", "P07_5", "P07_6", "P07_7", + "P08_0", "P08_1", "P08_2", "P08_3", "P08_4", "P08_5", "P08_6", "P08_7", + "P09_0", "P09_1", "P09_2", "P09_3", "P09_4", "P09_5", "P09_6", "P09_7", + "P10_0", "P10_1", "P10_2", "P10_3", "P10_4", "P10_5", "P10_6", "P10_7", + "P11_0", "P11_1", "P11_2", "P11_3", "P11_4", "P11_5", "P11_6", "P11_7", + "P12_0", "P12_1", "P12_2", "P12_3", "P12_4", "P12_5", "P12_6", "P12_7", + "P13_0", "P13_1", "P13_2", "P13_3", "P13_4", "P13_5", "P13_6", "P13_7", + "P14_0", "P14_1", "P14_2", "P14_3", "P14_4", "P14_5", "P14_6", "P14_7", + "P15_0", "P15_1", "P15_2", "P15_3", "P15_4", "P15_5", "P15_6", "P15_7", + "P16_0", "P16_1", "P16_2", "P16_3", "P16_4", "P16_5", "P16_6", "P16_7", + "P17_0", "P17_1", "P17_2", "P17_3", "P17_4", "P17_5", "P17_6", "P17_7", + "P18_0", "P18_1", "P18_2", "P18_3", "P18_4", "P18_5", "P18_6", "P18_7", + "P19_0", "P19_1", "P19_2", "P19_3", "P19_4", "P19_5", "P19_6", "P19_7", + "P20_0", "P20_1", "P20_2", "P20_3", "P20_4", "P20_5", "P20_6", "P20_7", + "P21_0", "P21_1", "P21_2", "P21_3", "P21_4", "P21_5", "P21_6", "P21_7", + "P22_0", "P22_1", "P22_2", "P22_3", "P22_4", "P22_5", "P22_6", "P22_7", + "P23_0", "P23_1", "P23_2", "P23_3", "P23_4", "P23_5", "P23_6", "P23_7", + "P24_0", "P24_1", "P24_2", "P24_3", "P24_4", "P24_5", "P24_6", "P24_7", + "P25_0", "P25_1", "P25_2", "P25_3", "P25_4", "P25_5", "P25_6", "P25_7", + "P26_0", "P26_1", "P26_2", "P26_3", "P26_4", "P26_5", "P26_6", "P26_7", + "P27_0", "P27_1", "P27_2", "P27_3", "P27_4", "P27_5", "P27_6", "P27_7", + "P28_0", "P28_1", "P28_2", "P28_3", "P28_4", "P28_5", "P28_6", "P28_7", + "P29_0", "P29_1", "P29_2", "P29_3", "P29_4", "P29_5", "P29_6", "P29_7", + "P30_0", "P30_1", "P30_2", "P30_3", "P30_4", "P30_5", "P30_6", "P30_7", + "P31_0", "P31_1", "P31_2", "P31_3", "P31_4", "P31_5", "P31_6", "P31_7", + "P32_0", "P32_1", "P32_2", "P32_3", "P32_4", "P32_5", "P32_6", "P32_7", + "P33_0", "P33_1", "P33_2", "P33_3", "P33_4", "P33_5", "P33_6", "P33_7", + "P34_0", "P34_1", "P34_2", "P34_3", "P34_4", "P34_5", "P34_6", "P34_7", + "P35_0", "P35_1", "P35_2", "P35_3", "P35_4", "P35_5", "P35_6", "P35_7", +}; + +static int rzt2h_gpio_register(struct rzt2h_pinctrl *pctrl) +{ + struct pinctrl_gpio_range *range = &pctrl->gpio_range; + struct gpio_chip *chip = &pctrl->gpio_chip; + struct device *dev = pctrl->dev; + struct of_phandle_args of_args; + int ret; + + ret = of_parse_phandle_with_fixed_args(dev->of_node, "gpio-ranges", 3, 0, &of_args); + if (ret) + return dev_err_probe(dev, ret, "Unable to parse gpio-ranges\n"); + + if (of_args.args[0] != 0 || of_args.args[1] != 0 || + of_args.args[2] != pctrl->data->n_port_pins) + return dev_err_probe(dev, -EINVAL, + "gpio-ranges does not match selected SOC\n"); + + chip->base = -1; + chip->parent = dev; + chip->owner = THIS_MODULE; + chip->ngpio = of_args.args[2]; + chip->names = rzt2h_gpio_names; + chip->request = rzt2h_gpio_request; + 
chip->free = rzt2h_gpio_free; + chip->get_direction = rzt2h_gpio_get_direction; + chip->direction_input = rzt2h_gpio_direction_input; + chip->direction_output = rzt2h_gpio_direction_output; + chip->get = rzt2h_gpio_get; + chip->set = rzt2h_gpio_set; + chip->label = dev_name(dev); + + range->id = 0; + range->pin_base = 0; + range->base = 0; + range->npins = chip->ngpio; + range->name = chip->label; + range->gc = chip; + + ret = devm_gpiochip_add_data(dev, chip, pctrl); + if (ret) + return dev_err_probe(dev, ret, "gpiochip registration failed\n"); + + return ret; +} + +static int rzt2h_pinctrl_register(struct rzt2h_pinctrl *pctrl) +{ + struct pinctrl_desc *desc = &pctrl->desc; + struct device *dev = pctrl->dev; + struct pinctrl_pin_desc *pins; + unsigned int i, j; + int ret; + + desc->name = DRV_NAME; + desc->npins = pctrl->data->n_port_pins; + desc->pctlops = &rzt2h_pinctrl_pctlops; + desc->pmxops = &rzt2h_pinctrl_pmxops; + desc->owner = THIS_MODULE; + + pins = devm_kcalloc(dev, desc->npins, sizeof(*pins), GFP_KERNEL); + if (!pins) + return -ENOMEM; + + pctrl->pins = pins; + desc->pins = pins; + + for (i = 0, j = 0; i < pctrl->data->n_port_pins; i++) { + pins[i].number = i; + pins[i].name = rzt2h_gpio_names[i]; + if (i && !(i % RZT2H_PINS_PER_PORT)) + j++; + } + + ret = devm_pinctrl_register_and_init(dev, desc, pctrl, &pctrl->pctl); + if (ret) + return dev_err_probe(dev, ret, "pinctrl registration failed\n"); + + ret = pinctrl_enable(pctrl->pctl); + if (ret) + return dev_err_probe(dev, ret, "pinctrl enable failed\n"); + + return rzt2h_gpio_register(pctrl); +} + +static int rzt2h_pinctrl_cfg_regions(struct platform_device *pdev, + struct rzt2h_pinctrl *pctrl) +{ + struct resource *res; + + pctrl->base0 = devm_platform_ioremap_resource_byname(pdev, "nsr"); + if (IS_ERR(pctrl->base0)) + return PTR_ERR(pctrl->base0); + + /* + * Open-coded instead of using devm_platform_ioremap_resource_byname() + * because the "srs" region is optional. 
+ */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "srs"); + if (res) { + u8 port; + + pctrl->base1 = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pctrl->base1)) + return PTR_ERR(pctrl->base1); + + pctrl->safety_port_enabled = true; + + /* Configure to select safety region 0x812c0xxx */ + for (port = 0; port <= RZT2H_MAX_SAFETY_PORTS; port++) + writeb(0x0, pctrl->base1 + RSELP(port)); + } + + return 0; +} + +static int rzt2h_pinctrl_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzt2h_pinctrl *pctrl; + int ret; + + pctrl = devm_kzalloc(dev, sizeof(*pctrl), GFP_KERNEL); + if (!pctrl) + return -ENOMEM; + + pctrl->dev = dev; + pctrl->data = of_device_get_match_data(dev); + + ret = rzt2h_pinctrl_cfg_regions(pdev, pctrl); + if (ret) + return ret; + + spin_lock_init(&pctrl->lock); + mutex_init(&pctrl->mutex); + platform_set_drvdata(pdev, pctrl); + + return rzt2h_pinctrl_register(pctrl); +} + +static const u8 r9a09g077_gpio_configs[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, +}; + +static const u8 r9a09g087_gpio_configs[] = { + 0x1f, 0xff, 0xff, 0x1f, 0x00, 0xfe, 0xff, 0x00, 0x7e, 0xf0, 0xff, 0x01, + 0xff, 0xff, 0xff, 0x00, 0xe0, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, + 0xe0, 0xff, 0xff, 0x7f, 0x00, 0xfe, 0xff, 0x7f, 0x00, 0xfc, 0x7f, +}; + +static struct rzt2h_pinctrl_data r9a09g077_data = { + .n_port_pins = ARRAY_SIZE(r9a09g077_gpio_configs) * RZT2H_PINS_PER_PORT, + .port_pin_configs = r9a09g077_gpio_configs, + .n_ports = ARRAY_SIZE(r9a09g077_gpio_configs), +}; + +static struct rzt2h_pinctrl_data r9a09g087_data = { + .n_port_pins = ARRAY_SIZE(r9a09g087_gpio_configs) * RZT2H_PINS_PER_PORT, + .port_pin_configs = r9a09g087_gpio_configs, + .n_ports = ARRAY_SIZE(r9a09g087_gpio_configs), +}; + +static const struct of_device_id rzt2h_pinctrl_of_table[] = { + { + .compatible = "renesas,r9a09g077-pinctrl", + .data = &r9a09g077_data, + }, + { + .compatible = "renesas,r9a09g087-pinctrl", + .data = &r9a09g087_data, + }, + { /* sentinel */ } +}; + +static struct platform_driver rzt2h_pinctrl_driver = { + .driver = { + .name = DRV_NAME, + .of_match_table = of_match_ptr(rzt2h_pinctrl_of_table), + .suppress_bind_attrs = true, + }, + .probe = rzt2h_pinctrl_probe, +}; + +static int __init rzt2h_pinctrl_init(void) +{ + return platform_driver_register(&rzt2h_pinctrl_driver); +} +core_initcall(rzt2h_pinctrl_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thierry Bultel "); +MODULE_AUTHOR("Lad Prabhakar "); +MODULE_DESCRIPTION("Pin and gpio controller driver for the RZ/T2H family"); diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c index daaa986d994dac..dce68f93d2d57f 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c +++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c @@ -162,7 +162,7 @@ static int rzv2m_pinctrl_set_mux(struct pinctrl_dev *pctldev, unsigned int group_selector) { struct rzv2m_pinctrl *pctrl = pinctrl_dev_get_drvdata(pctldev); - struct function_desc *func; + const struct function_desc *func; unsigned int i, *psel_val; struct group_desc *group; const unsigned int *pins; diff --git a/drivers/pinctrl/renesas/pinctrl.c b/drivers/pinctrl/renesas/pinctrl.c index 29d16c9c1bd194..3a742f74ecd1dc 100644 --- a/drivers/pinctrl/renesas/pinctrl.c +++ b/drivers/pinctrl/renesas/pinctrl.c @@ -726,7 +726,8 @@ static int 
sh_pfc_pinconf_group_set(struct pinctrl_dev *pctldev, unsigned group, struct sh_pfc_pinctrl *pmx = pinctrl_dev_get_drvdata(pctldev); const unsigned int *pins; unsigned int num_pins; - unsigned int i, ret; + unsigned int i; + int ret; pins = pmx->pfc->info->groups[group].pins; num_pins = pmx->pfc->info->groups[group].nr_pins; diff --git a/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c b/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c index 5fe7c4b9f7bd42..323487dfa8c2cb 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c +++ b/drivers/pinctrl/samsung/pinctrl-exynos-arm64.c @@ -76,6 +76,15 @@ static const struct samsung_pin_bank_type exynos8895_bank_type_off = { .reg_offset = { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, }, }; +/* + * Bank type for non-alive type. Bit fields: + * CON: 4, DAT: 1, PUD: 4, DRV: 4 + */ +static const struct samsung_pin_bank_type artpec_bank_type_off = { + .fld_width = { 4, 1, 4, 4, }, + .reg_offset = { 0x00, 0x04, 0x08, 0x0c, }, +}; + /* Pad retention control code for accessing PMU regmap */ static atomic_t exynos_shared_retention_refcnt; @@ -1816,3 +1825,44 @@ const struct samsung_pinctrl_of_match_data gs101_of_data __initconst = { .ctrl = gs101_pin_ctrl, .num_ctrl = ARRAY_SIZE(gs101_pin_ctrl), }; + +/* pin banks of artpec8 pin-controller (FSYS0) */ +static const struct samsung_pin_bank_data artpec8_pin_banks0[] __initconst = { + ARTPEC_PIN_BANK_EINTG(5, 0x000, "gpf0", 0x00), + ARTPEC_PIN_BANK_EINTG(4, 0x020, "gpf1", 0x04), + ARTPEC_PIN_BANK_EINTG(8, 0x040, "gpf2", 0x08), + ARTPEC_PIN_BANK_EINTG(4, 0x060, "gpf3", 0x0c), + ARTPEC_PIN_BANK_EINTG(7, 0x080, "gpf4", 0x10), + ARTPEC_PIN_BANK_EINTG(8, 0x0a0, "gpe0", 0x14), + ARTPEC_PIN_BANK_EINTG(8, 0x0c0, "gpe1", 0x18), + ARTPEC_PIN_BANK_EINTG(6, 0x0e0, "gpe2", 0x1c), + ARTPEC_PIN_BANK_EINTG(8, 0x100, "gps0", 0x20), + ARTPEC_PIN_BANK_EINTG(8, 0x120, "gps1", 0x24), +}; + +/* pin banks of artpec8 pin-controller (PERIC) */ +static const struct samsung_pin_bank_data artpec8_pin_banks1[] __initconst = { + ARTPEC_PIN_BANK_EINTG(8, 0x000, "gpa0", 0x00), + ARTPEC_PIN_BANK_EINTG(8, 0x020, "gpa1", 0x04), + ARTPEC_PIN_BANK_EINTG(8, 0x040, "gpa2", 0x08), + ARTPEC_PIN_BANK_EINTG(2, 0x060, "gpk0", 0x0c), +}; + +static const struct samsung_pin_ctrl artpec8_pin_ctrl[] __initconst = { + { + /* pin-controller instance 0 FSYS data */ + .pin_banks = artpec8_pin_banks0, + .nr_banks = ARRAY_SIZE(artpec8_pin_banks0), + .eint_gpio_init = exynos_eint_gpio_init, + }, { + /* pin-controller instance 1 PERIC data */ + .pin_banks = artpec8_pin_banks1, + .nr_banks = ARRAY_SIZE(artpec8_pin_banks1), + .eint_gpio_init = exynos_eint_gpio_init, + }, +}; + +const struct samsung_pinctrl_of_match_data artpec8_of_data __initconst = { + .ctrl = artpec8_pin_ctrl, + .num_ctrl = ARRAY_SIZE(artpec8_pin_ctrl), +}; diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.h b/drivers/pinctrl/samsung/pinctrl-exynos.h index 362dc533186fb4..c9c38f8988dd2c 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos.h +++ b/drivers/pinctrl/samsung/pinctrl-exynos.h @@ -236,6 +236,16 @@ .name = id \ } +#define ARTPEC_PIN_BANK_EINTG(pins, reg, id, offs) \ + { \ + .type = &artpec_bank_type_off, \ + .pctl_offset = reg, \ + .nr_pins = pins, \ + .eint_type = EINT_TYPE_GPIO, \ + .eint_offset = offs, \ + .name = id \ + } + /** * struct exynos_weint_data: irq specific data for all the wakeup interrupts * generated by the external wakeup interrupt controller. 
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index 24745e1d78cec5..c099195fc464e3 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -1482,6 +1482,8 @@ static const struct of_device_id samsung_pinctrl_dt_match[] = { .data = &s5pv210_of_data }, #endif #ifdef CONFIG_PINCTRL_EXYNOS_ARM64 + { .compatible = "axis,artpec8-pinctrl", + .data = &artpec8_of_data }, { .compatible = "google,gs101-pinctrl", .data = &gs101_of_data }, { .compatible = "samsung,exynos2200-pinctrl", diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h index 1cabcbe1401a61..3e8ef91d94a36e 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.h +++ b/drivers/pinctrl/samsung/pinctrl-samsung.h @@ -381,6 +381,7 @@ struct samsung_pmx_func { }; /* list of all exported SoC specific data */ +extern const struct samsung_pinctrl_of_match_data artpec8_of_data; extern const struct samsung_pinctrl_of_match_data exynos2200_of_data; extern const struct samsung_pinctrl_of_match_data exynos3250_of_data; extern const struct samsung_pinctrl_of_match_data exynos4210_of_data; @@ -402,10 +403,6 @@ extern const struct samsung_pinctrl_of_match_data exynosautov920_of_data; extern const struct samsung_pinctrl_of_match_data fsd_of_data; extern const struct samsung_pinctrl_of_match_data gs101_of_data; extern const struct samsung_pinctrl_of_match_data s3c64xx_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2412_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2416_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2440_of_data; -extern const struct samsung_pinctrl_of_match_data s3c2450_of_data; extern const struct samsung_pinctrl_of_match_data s5pv210_of_data; #endif /* __PINCTRL_SAMSUNG_H */ diff --git a/drivers/pinctrl/spacemit/pinctrl-k1.c b/drivers/pinctrl/spacemit/pinctrl-k1.c index 9996b1c4a07e72..33af9b5791c110 100644 --- a/drivers/pinctrl/spacemit/pinctrl-k1.c +++ b/drivers/pinctrl/spacemit/pinctrl-k1.c @@ -707,7 +707,7 @@ static void spacemit_pinconf_dbg_show(struct pinctrl_dev *pctldev, spacemit_get_drive_strength_mA(IO_TYPE_1V8, tmp), spacemit_get_drive_strength_mA(IO_TYPE_3V3, tmp)); - seq_printf(seq, ", register (0x%04x)\n", value); + seq_printf(seq, ", register (0x%04x)", value); } static const struct pinconf_ops spacemit_pinconf_ops = { @@ -847,7 +847,7 @@ static const struct pinctrl_pin_desc k1_pin_desc[] = { PINCTRL_PIN(67, "GPIO_67"), PINCTRL_PIN(68, "GPIO_68"), PINCTRL_PIN(69, "GPIO_69"), - PINCTRL_PIN(70, "GPIO_70/PRI_DTI"), + PINCTRL_PIN(70, "GPIO_70/PRI_TDI"), PINCTRL_PIN(71, "GPIO_71/PRI_TMS"), PINCTRL_PIN(72, "GPIO_72/PRI_TCK"), PINCTRL_PIN(73, "GPIO_73/PRI_TDO"), diff --git a/drivers/pinctrl/sprd/pinctrl-sprd.c b/drivers/pinctrl/sprd/pinctrl-sprd.c index c4a1d99dfed043..16cf9d15f24707 100644 --- a/drivers/pinctrl/sprd/pinctrl-sprd.c +++ b/drivers/pinctrl/sprd/pinctrl-sprd.c @@ -258,8 +258,7 @@ static int sprd_dt_node_to_map(struct pinctrl_dev *pctldev, grp = sprd_pinctrl_find_group_by_name(pctl, np->name); if (!grp) { - dev_err(pctl->dev, "unable to find group for node %s\n", - of_node_full_name(np)); + dev_err(pctl->dev, "unable to find group for node %pOF\n", np); return -EINVAL; } @@ -276,16 +275,14 @@ static int sprd_dt_node_to_map(struct pinctrl_dev *pctldev, if (ret < 0) { if (ret != -EINVAL) dev_err(pctl->dev, - "%s: could not parse property function\n", - of_node_full_name(np)); + "%pOF: could not parse property function\n", np); 
function = NULL; } ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &num_configs); if (ret < 0) { - dev_err(pctl->dev, "%s: could not parse node property\n", - of_node_full_name(np)); + dev_err(pctl->dev, "%pOF: could not parse node property\n", np); return ret; } diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index e91442eb566bb2..0b1dff01e04c11 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -45,7 +46,7 @@ struct stm32_hdp { void __iomem *base; struct clk *clk; struct pinctrl_dev *pctl_dev; - struct gpio_chip gpio_chip; + struct gpio_generic_chip gpio_chip; u32 mux_conf; u32 gposet_conf; const char * const *func_name; @@ -575,7 +576,7 @@ static const struct pinmux_ops stm32_hdp_pinmux_ops = { .gpio_set_direction = NULL, }; -static struct pinctrl_desc stm32_hdp_pdesc = { +static const struct pinctrl_desc stm32_hdp_pdesc = { .name = DRIVER_NAME, .pins = stm32_hdp_pins, .npins = ARRAY_SIZE(stm32_hdp_pins), @@ -603,6 +604,7 @@ MODULE_DEVICE_TABLE(of, stm32_hdp_of_match); static int stm32_hdp_probe(struct platform_device *pdev) { + struct gpio_generic_chip_config config; struct device *dev = &pdev->dev; struct stm32_hdp *hdp; u8 version; @@ -635,21 +637,25 @@ static int stm32_hdp_probe(struct platform_device *pdev) if (err) return dev_err_probe(dev, err, "Failed to enable pinctrl\n"); - hdp->gpio_chip.get_direction = stm32_hdp_gpio_get_direction; - hdp->gpio_chip.ngpio = ARRAY_SIZE(stm32_hdp_pins); - hdp->gpio_chip.can_sleep = true; - hdp->gpio_chip.names = stm32_hdp_pins_group; - - err = bgpio_init(&hdp->gpio_chip, dev, 4, - hdp->base + HDP_GPOVAL, - hdp->base + HDP_GPOSET, - hdp->base + HDP_GPOCLR, - NULL, NULL, BGPIOF_NO_INPUT); + hdp->gpio_chip.gc.get_direction = stm32_hdp_gpio_get_direction; + hdp->gpio_chip.gc.ngpio = ARRAY_SIZE(stm32_hdp_pins); + hdp->gpio_chip.gc.can_sleep = true; + hdp->gpio_chip.gc.names = stm32_hdp_pins_group; + + config = (struct gpio_generic_chip_config) { + .dev = dev, + .sz = 4, + .dat = hdp->base + HDP_GPOVAL, + .set = hdp->base + HDP_GPOSET, + .clr = hdp->base + HDP_GPOCLR, + .flags = GPIO_GENERIC_NO_INPUT, + }; + + err = gpio_generic_chip_init(&hdp->gpio_chip, &config); if (err) - return dev_err_probe(dev, err, "Failed to init bgpio\n"); - + return dev_err_probe(dev, err, "Failed to init the generic GPIO chip\n"); - err = devm_gpiochip_add_data(dev, &hdp->gpio_chip, hdp); + err = devm_gpiochip_add_data(dev, &hdp->gpio_chip.gc, hdp); if (err) return dev_err_probe(dev, err, "Failed to add gpiochip\n"); diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 823c8fe758e2c0..3ebb468de830db 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1236,7 +1236,7 @@ static int stm32_pconf_parse_conf(struct pinctrl_dev *pctldev, case PIN_CONFIG_BIAS_PULL_DOWN: ret = stm32_pconf_set_bias(bank, offset, 2); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: __stm32_gpio_set(bank, offset, arg); ret = stm32_pmx_gpio_set_direction(pctldev, range, pin, false); break; diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c index 3e924aa86cc2fa..fabe7efaa837a4 100644 --- a/drivers/pinctrl/sunplus/sppctl.c +++ b/drivers/pinctrl/sunplus/sppctl.c @@ -488,7 +488,7 @@ static int sppctl_gpio_set_config(struct gpio_chip *chip, unsigned int offset, case 
PIN_CONFIG_INPUT_ENABLE: break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: return sppctl_gpio_direction_output(chip, offset, 0); case PIN_CONFIG_PERSIST_STATE: @@ -580,7 +580,7 @@ static int sppctl_pin_config_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 0; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (!sppctl_first_get(&pctl->spp_gchip->chip, pin)) return -EINVAL; if (!sppctl_master_get(&pctl->spp_gchip->chip, pin)) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c index 4e34b0cd3b73aa..50a16f3bd13161 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi-dt.c @@ -103,7 +103,7 @@ static struct sunxi_desc_pin *init_pins_table(struct device *dev, return ERR_PTR(-EINVAL); } - pins = devm_kzalloc(dev, desc->npins * sizeof(*pins), GFP_KERNEL); + pins = devm_kcalloc(dev, desc->npins, sizeof(*pins), GFP_KERNEL); if (!pins) return ERR_PTR(-ENOMEM); @@ -199,7 +199,7 @@ static int prepare_function_table(struct device *dev, struct device_node *pnode, * Allocate the memory needed for the functions in one table. * We later use pointers into this table to mark each pin. */ - func = devm_kzalloc(dev, num_funcs * sizeof(*func), GFP_KERNEL); + func = devm_kcalloc(dev, num_funcs, sizeof(*func), GFP_KERNEL); if (!func) return -ENOMEM; @@ -274,8 +274,7 @@ static void fill_pin_function(struct device *dev, struct device_node *node, if (!strcmp(pins[pin].pin.name, name)) break; if (pin == npins) { - dev_warn(dev, "%s: cannot find pin %s\n", - of_node_full_name(node), name); + dev_warn(dev, "%pOF: cannot find pin %s\n", node, name); index++; continue; } @@ -283,8 +282,8 @@ static void fill_pin_function(struct device *dev, struct device_node *node, /* Read the associated mux value. */ muxval = sunxi_pinctrl_dt_read_pinmux(node, index); if (muxval == INVALID_MUX) { - dev_warn(dev, "%s: invalid mux value for pin %s\n", - of_node_full_name(node), name); + dev_warn(dev, "%pOF: invalid mux value for pin %s\n", + node, name); index++; continue; } diff --git a/drivers/pinctrl/tegra/Kconfig b/drivers/pinctrl/tegra/Kconfig index 4e87d19323ba88..660d101ea3679a 100644 --- a/drivers/pinctrl/tegra/Kconfig +++ b/drivers/pinctrl/tegra/Kconfig @@ -24,6 +24,10 @@ config PINCTRL_TEGRA210 bool select PINCTRL_TEGRA +config PINCTRL_TEGRA186 + bool + select PINCTRL_TEGRA + config PINCTRL_TEGRA194 bool select PINCTRL_TEGRA diff --git a/drivers/pinctrl/tegra/Makefile b/drivers/pinctrl/tegra/Makefile index a93973701d4cce..82176526549e7d 100644 --- a/drivers/pinctrl/tegra/Makefile +++ b/drivers/pinctrl/tegra/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_PINCTRL_TEGRA30) += pinctrl-tegra30.o obj-$(CONFIG_PINCTRL_TEGRA114) += pinctrl-tegra114.o obj-$(CONFIG_PINCTRL_TEGRA124) += pinctrl-tegra124.o obj-$(CONFIG_PINCTRL_TEGRA210) += pinctrl-tegra210.o +obj-$(CONFIG_PINCTRL_TEGRA186) += pinctrl-tegra186.o obj-$(CONFIG_PINCTRL_TEGRA194) += pinctrl-tegra194.o obj-$(CONFIG_PINCTRL_TEGRA234) += pinctrl-tegra234.o obj-$(CONFIG_PINCTRL_TEGRA_XUSB) += pinctrl-tegra-xusb.o diff --git a/drivers/pinctrl/tegra/pinctrl-tegra186.c b/drivers/pinctrl/tegra/pinctrl-tegra186.c new file mode 100644 index 00000000000000..4a1d6476af9bb4 --- /dev/null +++ b/drivers/pinctrl/tegra/pinctrl-tegra186.c @@ -0,0 +1,1979 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Pinctrl data for the NVIDIA Tegra186 pinmux + * + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include + +#include "pinctrl-tegra.h" + +/* Define unique ID for each pins */ +enum { + TEGRA_PIN_PEX_L0_RST_N_PA0, + TEGRA_PIN_PEX_L0_CLKREQ_N_PA1, + TEGRA_PIN_PEX_WAKE_N_PA2, + TEGRA_PIN_PEX_L1_RST_N_PA3, + TEGRA_PIN_PEX_L1_CLKREQ_N_PA4, + TEGRA_PIN_PEX_L2_RST_N_PA5, + TEGRA_PIN_PEX_L2_CLKREQ_N_PA6, + TEGRA_PIN_UART4_TX_PB0, + TEGRA_PIN_UART4_RX_PB1, + TEGRA_PIN_UART4_RTS_PB2, + TEGRA_PIN_UART4_CTS_PB3, + TEGRA_PIN_GPIO_WAN1_PB4, + TEGRA_PIN_GPIO_WAN2_PB5, + TEGRA_PIN_GPIO_WAN3_PB6, + TEGRA_PIN_GPIO_WAN4_PC0, + TEGRA_PIN_DAP2_SCLK_PC1, + TEGRA_PIN_DAP2_DOUT_PC2, + TEGRA_PIN_DAP2_DIN_PC3, + TEGRA_PIN_DAP2_FS_PC4, + TEGRA_PIN_GEN1_I2C_SCL_PC5, + TEGRA_PIN_GEN1_I2C_SDA_PC6, + TEGRA_PIN_SDMMC1_CLK_PD0, + TEGRA_PIN_SDMMC1_CMD_PD1, + TEGRA_PIN_SDMMC1_DAT0_PD2, + TEGRA_PIN_SDMMC1_DAT1_PD3, + TEGRA_PIN_SDMMC1_DAT2_PD4, + TEGRA_PIN_SDMMC1_DAT3_PD5, + TEGRA_PIN_EQOS_TXC_PE0, + TEGRA_PIN_EQOS_TD0_PE1, + TEGRA_PIN_EQOS_TD1_PE2, + TEGRA_PIN_EQOS_TD2_PE3, + TEGRA_PIN_EQOS_TD3_PE4, + TEGRA_PIN_EQOS_TX_CTL_PE5, + TEGRA_PIN_EQOS_RD0_PE6, + TEGRA_PIN_EQOS_RD1_PE7, + TEGRA_PIN_EQOS_RD2_PF0, + TEGRA_PIN_EQOS_RD3_PF1, + TEGRA_PIN_EQOS_RX_CTL_PF2, + TEGRA_PIN_EQOS_RXC_PF3, + TEGRA_PIN_EQOS_MDIO_PF4, + TEGRA_PIN_EQOS_MDC_PF5, + TEGRA_PIN_SDMMC3_CLK_PG0, + TEGRA_PIN_SDMMC3_CMD_PG1, + TEGRA_PIN_SDMMC3_DAT0_PG2, + TEGRA_PIN_SDMMC3_DAT1_PG3, + TEGRA_PIN_SDMMC3_DAT2_PG4, + TEGRA_PIN_SDMMC3_DAT3_PG5, + TEGRA_PIN_GPIO_WAN5_PH0, + TEGRA_PIN_GPIO_WAN6_PH1, + TEGRA_PIN_GPIO_WAN7_PH2, + TEGRA_PIN_GPIO_WAN8_PH3, + TEGRA_PIN_BCPU_PWR_REQ_PH4, + TEGRA_PIN_MCPU_PWR_REQ_PH5, + TEGRA_PIN_GPU_PWR_REQ_PH6, + TEGRA_PIN_GPIO_PQ0_PI0, + TEGRA_PIN_GPIO_PQ1_PI1, + TEGRA_PIN_GPIO_PQ2_PI2, + TEGRA_PIN_GPIO_PQ3_PI3, + TEGRA_PIN_GPIO_PQ4_PI4, + TEGRA_PIN_GPIO_PQ5_PI5, + TEGRA_PIN_GPIO_PQ6_PI6, + TEGRA_PIN_GPIO_PQ7_PI7, + TEGRA_PIN_DAP1_SCLK_PJ0, + TEGRA_PIN_DAP1_DOUT_PJ1, + TEGRA_PIN_DAP1_DIN_PJ2, + TEGRA_PIN_DAP1_FS_PJ3, + TEGRA_PIN_AUD_MCLK_PJ4, + TEGRA_PIN_GPIO_AUD0_PJ5, + TEGRA_PIN_GPIO_AUD1_PJ6, + TEGRA_PIN_GPIO_AUD2_PJ7, + TEGRA_PIN_GPIO_AUD3_PK0, + TEGRA_PIN_GEN7_I2C_SCL_PL0, + TEGRA_PIN_GEN7_I2C_SDA_PL1, + TEGRA_PIN_GEN9_I2C_SCL_PL2, + TEGRA_PIN_GEN9_I2C_SDA_PL3, + TEGRA_PIN_USB_VBUS_EN0_PL4, + TEGRA_PIN_USB_VBUS_EN1_PL5, + TEGRA_PIN_GP_PWM6_PL6, + TEGRA_PIN_GP_PWM7_PL7, + TEGRA_PIN_DMIC1_DAT_PM0, + TEGRA_PIN_DMIC1_CLK_PM1, + TEGRA_PIN_DMIC2_DAT_PM2, + TEGRA_PIN_DMIC2_CLK_PM3, + TEGRA_PIN_DMIC4_DAT_PM4, + TEGRA_PIN_DMIC4_CLK_PM5, + TEGRA_PIN_GPIO_CAM1_PN0, + TEGRA_PIN_GPIO_CAM2_PN1, + TEGRA_PIN_GPIO_CAM3_PN2, + TEGRA_PIN_GPIO_CAM4_PN3, + TEGRA_PIN_GPIO_CAM5_PN4, + TEGRA_PIN_GPIO_CAM6_PN5, + TEGRA_PIN_GPIO_CAM7_PN6, + TEGRA_PIN_EXTPERIPH1_CLK_PO0, + TEGRA_PIN_EXTPERIPH2_CLK_PO1, + TEGRA_PIN_CAM_I2C_SCL_PO2, + TEGRA_PIN_CAM_I2C_SDA_PO3, + TEGRA_PIN_DP_AUX_CH0_HPD_PP0, + TEGRA_PIN_DP_AUX_CH1_HPD_PP1, + TEGRA_PIN_HDMI_CEC_PP2, + TEGRA_PIN_GPIO_EDP0_PP3, + TEGRA_PIN_GPIO_EDP1_PP4, + TEGRA_PIN_GPIO_EDP2_PP5, + TEGRA_PIN_GPIO_EDP3_PP6, + TEGRA_PIN_DIRECTDC1_CLK_PQ0, + TEGRA_PIN_DIRECTDC1_IN_PQ1, + TEGRA_PIN_DIRECTDC1_OUT0_PQ2, + 
TEGRA_PIN_DIRECTDC1_OUT1_PQ3, + TEGRA_PIN_DIRECTDC1_OUT2_PQ4, + TEGRA_PIN_DIRECTDC1_OUT3_PQ5, + TEGRA_PIN_QSPI_SCK_PR0, + TEGRA_PIN_QSPI_IO0_PR1, + TEGRA_PIN_QSPI_IO1_PR2, + TEGRA_PIN_QSPI_IO2_PR3, + TEGRA_PIN_QSPI_IO3_PR4, + TEGRA_PIN_QSPI_CS_N_PR5, + TEGRA_PIN_UART1_TX_PT0, + TEGRA_PIN_UART1_RX_PT1, + TEGRA_PIN_UART1_RTS_PT2, + TEGRA_PIN_UART1_CTS_PT3, + TEGRA_PIN_UART2_TX_PX0, + TEGRA_PIN_UART2_RX_PX1, + TEGRA_PIN_UART2_RTS_PX2, + TEGRA_PIN_UART2_CTS_PX3, + TEGRA_PIN_UART5_TX_PX4, + TEGRA_PIN_UART5_RX_PX5, + TEGRA_PIN_UART5_RTS_PX6, + TEGRA_PIN_UART5_CTS_PX7, + TEGRA_PIN_GPIO_MDM1_PY0, + TEGRA_PIN_GPIO_MDM2_PY1, + TEGRA_PIN_GPIO_MDM3_PY2, + TEGRA_PIN_GPIO_MDM4_PY3, + TEGRA_PIN_GPIO_MDM5_PY4, + TEGRA_PIN_GPIO_MDM6_PY5, + TEGRA_PIN_GPIO_MDM7_PY6, + TEGRA_PIN_UFS0_REF_CLK_PBB0, + TEGRA_PIN_UFS0_RST_PBB1, + TEGRA_PIN_DAP4_SCLK_PCC0, + TEGRA_PIN_DAP4_DOUT_PCC1, + TEGRA_PIN_DAP4_DIN_PCC2, + TEGRA_PIN_DAP4_FS_PCC3, + TEGRA_PIN_DIRECTDC_COMP, + TEGRA_PIN_SDMMC1_COMP, + TEGRA_PIN_EQOS_COMP, + TEGRA_PIN_SDMMC3_COMP, + TEGRA_PIN_QSPI_COMP, +}; + +enum { + TEGRA_PIN_PWR_I2C_SCL_PS0, + TEGRA_PIN_PWR_I2C_SDA_PS1, + TEGRA_PIN_BATT_OC_PS2, + TEGRA_PIN_SAFE_STATE_PS3, + TEGRA_PIN_VCOMP_ALERT_PS4, + TEGRA_PIN_GPIO_DIS0_PU0, + TEGRA_PIN_GPIO_DIS1_PU1, + TEGRA_PIN_GPIO_DIS2_PU2, + TEGRA_PIN_GPIO_DIS3_PU3, + TEGRA_PIN_GPIO_DIS4_PU4, + TEGRA_PIN_GPIO_DIS5_PU5, + TEGRA_PIN_GPIO_SEN0_PV0, + TEGRA_PIN_GPIO_SEN1_PV1, + TEGRA_PIN_GPIO_SEN2_PV2, + TEGRA_PIN_GPIO_SEN3_PV3, + TEGRA_PIN_GPIO_SEN4_PV4, + TEGRA_PIN_GPIO_SEN5_PV5, + TEGRA_PIN_GPIO_SEN6_PV6, + TEGRA_PIN_GPIO_SEN7_PV7, + TEGRA_PIN_GEN8_I2C_SCL_PW0, + TEGRA_PIN_GEN8_I2C_SDA_PW1, + TEGRA_PIN_UART3_TX_PW2, + TEGRA_PIN_UART3_RX_PW3, + TEGRA_PIN_UART3_RTS_PW4, + TEGRA_PIN_UART3_CTS_PW5, + TEGRA_PIN_UART7_TX_PW6, + TEGRA_PIN_UART7_RX_PW7, + TEGRA_PIN_CAN1_DOUT_PZ0, + TEGRA_PIN_CAN1_DIN_PZ1, + TEGRA_PIN_CAN0_DOUT_PZ2, + TEGRA_PIN_CAN0_DIN_PZ3, + TEGRA_PIN_CAN_GPIO0_PAA0, + TEGRA_PIN_CAN_GPIO1_PAA1, + TEGRA_PIN_CAN_GPIO2_PAA2, + TEGRA_PIN_CAN_GPIO3_PAA3, + TEGRA_PIN_CAN_GPIO4_PAA4, + TEGRA_PIN_CAN_GPIO5_PAA5, + TEGRA_PIN_CAN_GPIO6_PAA6, + TEGRA_PIN_CAN_GPIO7_PAA7, + TEGRA_PIN_GPIO_SEN8_PEE0, + TEGRA_PIN_GPIO_SEN9_PEE1, + TEGRA_PIN_TOUCH_CLK_PEE2, + TEGRA_PIN_POWER_ON_PFF0, + TEGRA_PIN_GPIO_SW1_PFF1, + TEGRA_PIN_GPIO_SW2_PFF2, + TEGRA_PIN_GPIO_SW3_PFF3, + TEGRA_PIN_GPIO_SW4_PFF4, + TEGRA_PIN_SHUTDOWN, + TEGRA_PIN_PMU_INT, + TEGRA_PIN_SOC_PWR_REQ, + TEGRA_PIN_CLK_32K_IN, +}; + +/* Table for pin descriptor */ +static const struct pinctrl_pin_desc tegra186_pins[] = { + PINCTRL_PIN(TEGRA_PIN_PEX_L0_RST_N_PA0, "PEX_L0_RST_N_PA0"), + PINCTRL_PIN(TEGRA_PIN_PEX_L0_CLKREQ_N_PA1, "PEX_L0_CLKREQ_N_PA1"), + PINCTRL_PIN(TEGRA_PIN_PEX_WAKE_N_PA2, "PEX_WAKE_N_PA2"), + PINCTRL_PIN(TEGRA_PIN_PEX_L1_RST_N_PA3, "PEX_L1_RST_N_PA3"), + PINCTRL_PIN(TEGRA_PIN_PEX_L1_CLKREQ_N_PA4, "PEX_L1_CLKREQ_N_PA4"), + PINCTRL_PIN(TEGRA_PIN_PEX_L2_RST_N_PA5, "PEX_L2_RST_N_PA5"), + PINCTRL_PIN(TEGRA_PIN_PEX_L2_CLKREQ_N_PA6, "PEX_L2_CLKREQ_N_PA6"), + PINCTRL_PIN(TEGRA_PIN_UART4_TX_PB0, "UART4_TX_PB0"), + PINCTRL_PIN(TEGRA_PIN_UART4_RX_PB1, "UART4_RX_PB1"), + PINCTRL_PIN(TEGRA_PIN_UART4_RTS_PB2, "UART4_RTS_PB2"), + PINCTRL_PIN(TEGRA_PIN_UART4_CTS_PB3, "UART4_CTS_PB3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN1_PB4, "GPIO_WAN1_PB4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN2_PB5, "GPIO_WAN2_PB5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN3_PB6, "GPIO_WAN3_PB6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN4_PC0, "GPIO_WAN4_PC0"), + PINCTRL_PIN(TEGRA_PIN_DAP2_SCLK_PC1, "DAP2_SCLK_PC1"), + PINCTRL_PIN(TEGRA_PIN_DAP2_DOUT_PC2, 
"DAP2_DOUT_PC2"), + PINCTRL_PIN(TEGRA_PIN_DAP2_DIN_PC3, "DAP2_DIN_PC3"), + PINCTRL_PIN(TEGRA_PIN_DAP2_FS_PC4, "DAP2_FS_PC4"), + PINCTRL_PIN(TEGRA_PIN_GEN1_I2C_SCL_PC5, "GEN1_I2C_SCL_PC5"), + PINCTRL_PIN(TEGRA_PIN_GEN1_I2C_SDA_PC6, "GEN1_I2C_SDA_PC6"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_CLK_PD0, "SDMMC1_CLK_PD0"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_CMD_PD1, "SDMMC1_CMD_PD1"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT0_PD2, "SDMMC1_DAT0_PD2"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT1_PD3, "SDMMC1_DAT1_PD3"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT2_PD4, "SDMMC1_DAT2_PD4"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_DAT3_PD5, "SDMMC1_DAT3_PD5"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TXC_PE0, "EQOS_TXC_PE0"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD0_PE1, "EQOS_TD0_PE1"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD1_PE2, "EQOS_TD1_PE2"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD2_PE3, "EQOS_TD2_PE3"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TD3_PE4, "EQOS_TD3_PE4"), + PINCTRL_PIN(TEGRA_PIN_EQOS_TX_CTL_PE5, "EQOS_TX_CTL_PE5"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD0_PE6, "EQOS_RD0_PE6"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD1_PE7, "EQOS_RD1_PE7"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD2_PF0, "EQOS_RD2_PF0"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RD3_PF1, "EQOS_RD3_PF1"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RX_CTL_PF2, "EQOS_RX_CTL_PF2"), + PINCTRL_PIN(TEGRA_PIN_EQOS_RXC_PF3, "EQOS_RXC_PF3"), + PINCTRL_PIN(TEGRA_PIN_EQOS_MDIO_PF4, "EQOS_MDIO_PF4"), + PINCTRL_PIN(TEGRA_PIN_EQOS_MDC_PF5, "EQOS_MDC_PF5"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_CLK_PG0, "SDMMC3_CLK_PG0"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_CMD_PG1, "SDMMC3_CMD_PG1"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT0_PG2, "SDMMC3_DAT0_PG2"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT1_PG3, "SDMMC3_DAT1_PG3"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT2_PG4, "SDMMC3_DAT2_PG4"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_DAT3_PG5, "SDMMC3_DAT3_PG5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN5_PH0, "GPIO_WAN5_PH0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN6_PH1, "GPIO_WAN6_PH1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN7_PH2, "GPIO_WAN7_PH2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_WAN8_PH3, "GPIO_WAN8_PH3"), + PINCTRL_PIN(TEGRA_PIN_BCPU_PWR_REQ_PH4, "BCPU_PWR_REQ_PH4"), + PINCTRL_PIN(TEGRA_PIN_MCPU_PWR_REQ_PH5, "MCPU_PWR_REQ_PH5"), + PINCTRL_PIN(TEGRA_PIN_GPU_PWR_REQ_PH6, "GPU_PWR_REQ_PH6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ0_PI0, "GPIO_PQ0_PI0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ1_PI1, "GPIO_PQ1_PI1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ2_PI2, "GPIO_PQ2_PI2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ3_PI3, "GPIO_PQ3_PI3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ4_PI4, "GPIO_PQ4_PI4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ5_PI5, "GPIO_PQ5_PI5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ6_PI6, "GPIO_PQ6_PI6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_PQ7_PI7, "GPIO_PQ7_PI7"), + PINCTRL_PIN(TEGRA_PIN_DAP1_SCLK_PJ0, "DAP1_SCLK_PJ0"), + PINCTRL_PIN(TEGRA_PIN_DAP1_DOUT_PJ1, "DAP1_DOUT_PJ1"), + PINCTRL_PIN(TEGRA_PIN_DAP1_DIN_PJ2, "DAP1_DIN_PJ2"), + PINCTRL_PIN(TEGRA_PIN_DAP1_FS_PJ3, "DAP1_FS_PJ3"), + PINCTRL_PIN(TEGRA_PIN_AUD_MCLK_PJ4, "AUD_MCLK_PJ4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD0_PJ5, "GPIO_AUD0_PJ5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD1_PJ6, "GPIO_AUD1_PJ6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD2_PJ7, "GPIO_AUD2_PJ7"), + PINCTRL_PIN(TEGRA_PIN_GPIO_AUD3_PK0, "GPIO_AUD3_PK0"), + PINCTRL_PIN(TEGRA_PIN_GEN7_I2C_SCL_PL0, "GEN7_I2C_SCL_PL0"), + PINCTRL_PIN(TEGRA_PIN_GEN7_I2C_SDA_PL1, "GEN7_I2C_SDA_PL1"), + PINCTRL_PIN(TEGRA_PIN_GEN9_I2C_SCL_PL2, "GEN9_I2C_SCL_PL2"), + PINCTRL_PIN(TEGRA_PIN_GEN9_I2C_SDA_PL3, "GEN9_I2C_SDA_PL3"), + PINCTRL_PIN(TEGRA_PIN_USB_VBUS_EN0_PL4, "USB_VBUS_EN0_PL4"), + PINCTRL_PIN(TEGRA_PIN_USB_VBUS_EN1_PL5, "USB_VBUS_EN1_PL5"), + PINCTRL_PIN(TEGRA_PIN_GP_PWM6_PL6, "GP_PWM6_PL6"), + 
PINCTRL_PIN(TEGRA_PIN_GP_PWM7_PL7, "GP_PWM7_PL7"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC1_DAT_PM0, "DMIC1_DAT_PM0"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC1_CLK_PM1, "DMIC1_CLK_PM1"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC2_DAT_PM2, "DMIC2_DAT_PM2"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC2_CLK_PM3, "DMIC2_CLK_PM3"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC4_DAT_PM4, "DMIC4_DAT_PM4"),
+ PINCTRL_PIN(TEGRA_PIN_DMIC4_CLK_PM5, "DMIC4_CLK_PM5"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM1_PN0, "GPIO_CAM1_PN0"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM2_PN1, "GPIO_CAM2_PN1"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM3_PN2, "GPIO_CAM3_PN2"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM4_PN3, "GPIO_CAM4_PN3"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM5_PN4, "GPIO_CAM5_PN4"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM6_PN5, "GPIO_CAM6_PN5"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_CAM7_PN6, "GPIO_CAM7_PN6"),
+ PINCTRL_PIN(TEGRA_PIN_EXTPERIPH1_CLK_PO0, "EXTPERIPH1_CLK_PO0"),
+ PINCTRL_PIN(TEGRA_PIN_EXTPERIPH2_CLK_PO1, "EXTPERIPH2_CLK_PO1"),
+ PINCTRL_PIN(TEGRA_PIN_CAM_I2C_SCL_PO2, "CAM_I2C_SCL_PO2"),
+ PINCTRL_PIN(TEGRA_PIN_CAM_I2C_SDA_PO3, "CAM_I2C_SDA_PO3"),
+ PINCTRL_PIN(TEGRA_PIN_DP_AUX_CH0_HPD_PP0, "DP_AUX_CH0_HPD_PP0"),
+ PINCTRL_PIN(TEGRA_PIN_DP_AUX_CH1_HPD_PP1, "DP_AUX_CH1_HPD_PP1"),
+ PINCTRL_PIN(TEGRA_PIN_HDMI_CEC_PP2, "HDMI_CEC_PP2"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_EDP0_PP3, "GPIO_EDP0_PP3"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_EDP1_PP4, "GPIO_EDP1_PP4"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_EDP2_PP5, "GPIO_EDP2_PP5"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_EDP3_PP6, "GPIO_EDP3_PP6"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_CLK_PQ0, "DIRECTDC1_CLK_PQ0"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_IN_PQ1, "DIRECTDC1_IN_PQ1"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT0_PQ2, "DIRECTDC1_OUT0_PQ2"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT1_PQ3, "DIRECTDC1_OUT1_PQ3"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT2_PQ4, "DIRECTDC1_OUT2_PQ4"),
+ PINCTRL_PIN(TEGRA_PIN_DIRECTDC1_OUT3_PQ5, "DIRECTDC1_OUT3_PQ5"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_SCK_PR0, "QSPI_SCK_PR0"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_IO0_PR1, "QSPI_IO0_PR1"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_IO1_PR2, "QSPI_IO1_PR2"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_IO2_PR3, "QSPI_IO2_PR3"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_IO3_PR4, "QSPI_IO3_PR4"),
+ PINCTRL_PIN(TEGRA_PIN_QSPI_CS_N_PR5, "QSPI_CS_N_PR5"),
+ PINCTRL_PIN(TEGRA_PIN_UART1_TX_PT0, "UART1_TX_PT0"),
+ PINCTRL_PIN(TEGRA_PIN_UART1_RX_PT1, "UART1_RX_PT1"),
+ PINCTRL_PIN(TEGRA_PIN_UART1_RTS_PT2, "UART1_RTS_PT2"),
+ PINCTRL_PIN(TEGRA_PIN_UART1_CTS_PT3, "UART1_CTS_PT3"),
+ PINCTRL_PIN(TEGRA_PIN_UART2_TX_PX0, "UART2_TX_PX0"),
+ PINCTRL_PIN(TEGRA_PIN_UART2_RX_PX1, "UART2_RX_PX1"),
+ PINCTRL_PIN(TEGRA_PIN_UART2_RTS_PX2, "UART2_RTS_PX2"),
+ PINCTRL_PIN(TEGRA_PIN_UART2_CTS_PX3, "UART2_CTS_PX3"),
+ PINCTRL_PIN(TEGRA_PIN_UART5_TX_PX4, "UART5_TX_PX4"),
+ PINCTRL_PIN(TEGRA_PIN_UART5_RX_PX5, "UART5_RX_PX5"),
+ PINCTRL_PIN(TEGRA_PIN_UART5_RTS_PX6, "UART5_RTS_PX6"),
+ PINCTRL_PIN(TEGRA_PIN_UART5_CTS_PX7, "UART5_CTS_PX7"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM1_PY0, "GPIO_MDM1_PY0"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM2_PY1, "GPIO_MDM2_PY1"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM3_PY2, "GPIO_MDM3_PY2"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM4_PY3, "GPIO_MDM4_PY3"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM5_PY4, "GPIO_MDM5_PY4"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM6_PY5, "GPIO_MDM6_PY5"),
+ PINCTRL_PIN(TEGRA_PIN_GPIO_MDM7_PY6, "GPIO_MDM7_PY6"),
+ PINCTRL_PIN(TEGRA_PIN_UFS0_REF_CLK_PBB0, "UFS0_REF_CLK_PBB0"),
+ PINCTRL_PIN(TEGRA_PIN_UFS0_RST_PBB1, "UFS0_RST_PBB1"),
+ PINCTRL_PIN(TEGRA_PIN_DAP4_SCLK_PCC0, "DAP4_SCLK_PCC0"),
+ PINCTRL_PIN(TEGRA_PIN_DAP4_DOUT_PCC1, "DAP4_DOUT_PCC1"),
+ PINCTRL_PIN(TEGRA_PIN_DAP4_DIN_PCC2,
"DAP4_DIN_PCC2"), + PINCTRL_PIN(TEGRA_PIN_DAP4_FS_PCC3, "DAP4_FS_PCC3"), + PINCTRL_PIN(TEGRA_PIN_DIRECTDC_COMP, "DIRECTDC_COMP"), + PINCTRL_PIN(TEGRA_PIN_SDMMC1_COMP, "SDMMC1_COMP"), + PINCTRL_PIN(TEGRA_PIN_EQOS_COMP, "EQOS_COMP"), + PINCTRL_PIN(TEGRA_PIN_SDMMC3_COMP, "SDMMC3_COMP"), + PINCTRL_PIN(TEGRA_PIN_QSPI_COMP, "QSPI_COMP"), +}; + +static const unsigned int pex_l0_rst_n_pa0_pins[] = { + TEGRA_PIN_PEX_L0_RST_N_PA0, +}; + +static const unsigned int pex_l0_clkreq_n_pa1_pins[] = { + TEGRA_PIN_PEX_L0_CLKREQ_N_PA1, +}; + +static const unsigned int pex_wake_n_pa2_pins[] = { + TEGRA_PIN_PEX_WAKE_N_PA2, +}; + +static const unsigned int pex_l1_rst_n_pa3_pins[] = { + TEGRA_PIN_PEX_L1_RST_N_PA3, +}; + +static const unsigned int pex_l1_clkreq_n_pa4_pins[] = { + TEGRA_PIN_PEX_L1_CLKREQ_N_PA4, +}; + +static const unsigned int pex_l2_rst_n_pa5_pins[] = { + TEGRA_PIN_PEX_L2_RST_N_PA5, +}; + +static const unsigned int pex_l2_clkreq_n_pa6_pins[] = { + TEGRA_PIN_PEX_L2_CLKREQ_N_PA6, +}; + +static const unsigned int uart4_tx_pb0_pins[] = { + TEGRA_PIN_UART4_TX_PB0, +}; + +static const unsigned int uart4_rx_pb1_pins[] = { + TEGRA_PIN_UART4_RX_PB1, +}; + +static const unsigned int uart4_rts_pb2_pins[] = { + TEGRA_PIN_UART4_RTS_PB2, +}; + +static const unsigned int uart4_cts_pb3_pins[] = { + TEGRA_PIN_UART4_CTS_PB3, +}; + +static const unsigned int gpio_wan1_pb4_pins[] = { + TEGRA_PIN_GPIO_WAN1_PB4, +}; + +static const unsigned int gpio_wan2_pb5_pins[] = { + TEGRA_PIN_GPIO_WAN2_PB5, +}; + +static const unsigned int gpio_wan3_pb6_pins[] = { + TEGRA_PIN_GPIO_WAN3_PB6, +}; + +static const unsigned int gpio_wan4_pc0_pins[] = { + TEGRA_PIN_GPIO_WAN4_PC0, +}; + +static const unsigned int dap2_sclk_pc1_pins[] = { + TEGRA_PIN_DAP2_SCLK_PC1, +}; + +static const unsigned int dap2_dout_pc2_pins[] = { + TEGRA_PIN_DAP2_DOUT_PC2, +}; + +static const unsigned int dap2_din_pc3_pins[] = { + TEGRA_PIN_DAP2_DIN_PC3, +}; + +static const unsigned int dap2_fs_pc4_pins[] = { + TEGRA_PIN_DAP2_FS_PC4, +}; + +static const unsigned int gen1_i2c_scl_pc5_pins[] = { + TEGRA_PIN_GEN1_I2C_SCL_PC5, +}; + +static const unsigned int gen1_i2c_sda_pc6_pins[] = { + TEGRA_PIN_GEN1_I2C_SDA_PC6, +}; + +static const unsigned int sdmmc1_clk_pd0_pins[] = { + TEGRA_PIN_SDMMC1_CLK_PD0, +}; + +static const unsigned int sdmmc1_cmd_pd1_pins[] = { + TEGRA_PIN_SDMMC1_CMD_PD1, +}; + +static const unsigned int sdmmc1_dat0_pd2_pins[] = { + TEGRA_PIN_SDMMC1_DAT0_PD2, +}; + +static const unsigned int sdmmc1_dat1_pd3_pins[] = { + TEGRA_PIN_SDMMC1_DAT1_PD3, +}; + +static const unsigned int sdmmc1_dat2_pd4_pins[] = { + TEGRA_PIN_SDMMC1_DAT2_PD4, +}; + +static const unsigned int sdmmc1_dat3_pd5_pins[] = { + TEGRA_PIN_SDMMC1_DAT3_PD5, +}; + +static const unsigned int eqos_txc_pe0_pins[] = { + TEGRA_PIN_EQOS_TXC_PE0, +}; + +static const unsigned int eqos_td0_pe1_pins[] = { + TEGRA_PIN_EQOS_TD0_PE1, +}; + +static const unsigned int eqos_td1_pe2_pins[] = { + TEGRA_PIN_EQOS_TD1_PE2, +}; + +static const unsigned int eqos_td2_pe3_pins[] = { + TEGRA_PIN_EQOS_TD2_PE3, +}; + +static const unsigned int eqos_td3_pe4_pins[] = { + TEGRA_PIN_EQOS_TD3_PE4, +}; + +static const unsigned int eqos_tx_ctl_pe5_pins[] = { + TEGRA_PIN_EQOS_TX_CTL_PE5, +}; + +static const unsigned int eqos_rd0_pe6_pins[] = { + TEGRA_PIN_EQOS_RD0_PE6, +}; + +static const unsigned int eqos_rd1_pe7_pins[] = { + TEGRA_PIN_EQOS_RD1_PE7, +}; + +static const unsigned int eqos_rd2_pf0_pins[] = { + TEGRA_PIN_EQOS_RD2_PF0, +}; + +static const unsigned int eqos_rd3_pf1_pins[] = { + TEGRA_PIN_EQOS_RD3_PF1, +}; + 
+static const unsigned int eqos_rx_ctl_pf2_pins[] = { + TEGRA_PIN_EQOS_RX_CTL_PF2, +}; + +static const unsigned int eqos_rxc_pf3_pins[] = { + TEGRA_PIN_EQOS_RXC_PF3, +}; + +static const unsigned int eqos_mdio_pf4_pins[] = { + TEGRA_PIN_EQOS_MDIO_PF4, +}; + +static const unsigned int eqos_mdc_pf5_pins[] = { + TEGRA_PIN_EQOS_MDC_PF5, +}; + +static const unsigned int sdmmc3_clk_pg0_pins[] = { + TEGRA_PIN_SDMMC3_CLK_PG0, +}; + +static const unsigned int sdmmc3_cmd_pg1_pins[] = { + TEGRA_PIN_SDMMC3_CMD_PG1, +}; + +static const unsigned int sdmmc3_dat0_pg2_pins[] = { + TEGRA_PIN_SDMMC3_DAT0_PG2, +}; + +static const unsigned int sdmmc3_dat1_pg3_pins[] = { + TEGRA_PIN_SDMMC3_DAT1_PG3, +}; + +static const unsigned int sdmmc3_dat2_pg4_pins[] = { + TEGRA_PIN_SDMMC3_DAT2_PG4, +}; + +static const unsigned int sdmmc3_dat3_pg5_pins[] = { + TEGRA_PIN_SDMMC3_DAT3_PG5, +}; + +static const unsigned int gpio_wan5_ph0_pins[] = { + TEGRA_PIN_GPIO_WAN5_PH0, +}; + +static const unsigned int gpio_wan6_ph1_pins[] = { + TEGRA_PIN_GPIO_WAN6_PH1, +}; + +static const unsigned int gpio_wan7_ph2_pins[] = { + TEGRA_PIN_GPIO_WAN7_PH2, +}; + +static const unsigned int gpio_wan8_ph3_pins[] = { + TEGRA_PIN_GPIO_WAN8_PH3, +}; + +static const unsigned int bcpu_pwr_req_ph4_pins[] = { + TEGRA_PIN_BCPU_PWR_REQ_PH4, +}; + +static const unsigned int mcpu_pwr_req_ph5_pins[] = { + TEGRA_PIN_MCPU_PWR_REQ_PH5, +}; + +static const unsigned int gpu_pwr_req_ph6_pins[] = { + TEGRA_PIN_GPU_PWR_REQ_PH6, +}; + +static const unsigned int gpio_pq0_pi0_pins[] = { + TEGRA_PIN_GPIO_PQ0_PI0, +}; + +static const unsigned int gpio_pq1_pi1_pins[] = { + TEGRA_PIN_GPIO_PQ1_PI1, +}; + +static const unsigned int gpio_pq2_pi2_pins[] = { + TEGRA_PIN_GPIO_PQ2_PI2, +}; + +static const unsigned int gpio_pq3_pi3_pins[] = { + TEGRA_PIN_GPIO_PQ3_PI3, +}; + +static const unsigned int gpio_pq4_pi4_pins[] = { + TEGRA_PIN_GPIO_PQ4_PI4, +}; + +static const unsigned int gpio_pq5_pi5_pins[] = { + TEGRA_PIN_GPIO_PQ5_PI5, +}; + +static const unsigned int gpio_pq6_pi6_pins[] = { + TEGRA_PIN_GPIO_PQ6_PI6, +}; + +static const unsigned int gpio_pq7_pi7_pins[] = { + TEGRA_PIN_GPIO_PQ7_PI7, +}; + +static const unsigned int dap1_sclk_pj0_pins[] = { + TEGRA_PIN_DAP1_SCLK_PJ0, +}; + +static const unsigned int dap1_dout_pj1_pins[] = { + TEGRA_PIN_DAP1_DOUT_PJ1, +}; + +static const unsigned int dap1_din_pj2_pins[] = { + TEGRA_PIN_DAP1_DIN_PJ2, +}; + +static const unsigned int dap1_fs_pj3_pins[] = { + TEGRA_PIN_DAP1_FS_PJ3, +}; + +static const unsigned int aud_mclk_pj4_pins[] = { + TEGRA_PIN_AUD_MCLK_PJ4, +}; + +static const unsigned int gpio_aud0_pj5_pins[] = { + TEGRA_PIN_GPIO_AUD0_PJ5, +}; + +static const unsigned int gpio_aud1_pj6_pins[] = { + TEGRA_PIN_GPIO_AUD1_PJ6, +}; + +static const unsigned int gpio_aud2_pj7_pins[] = { + TEGRA_PIN_GPIO_AUD2_PJ7, +}; + +static const unsigned int gpio_aud3_pk0_pins[] = { + TEGRA_PIN_GPIO_AUD3_PK0, +}; + +static const unsigned int gen7_i2c_scl_pl0_pins[] = { + TEGRA_PIN_GEN7_I2C_SCL_PL0, +}; + +static const unsigned int gen7_i2c_sda_pl1_pins[] = { + TEGRA_PIN_GEN7_I2C_SDA_PL1, +}; + +static const unsigned int gen9_i2c_scl_pl2_pins[] = { + TEGRA_PIN_GEN9_I2C_SCL_PL2, +}; + +static const unsigned int gen9_i2c_sda_pl3_pins[] = { + TEGRA_PIN_GEN9_I2C_SDA_PL3, +}; + +static const unsigned int usb_vbus_en0_pl4_pins[] = { + TEGRA_PIN_USB_VBUS_EN0_PL4, +}; + +static const unsigned int usb_vbus_en1_pl5_pins[] = { + TEGRA_PIN_USB_VBUS_EN1_PL5, +}; + +static const unsigned int gp_pwm6_pl6_pins[] = { + TEGRA_PIN_GP_PWM6_PL6, +}; + +static const unsigned 
int gp_pwm7_pl7_pins[] = { + TEGRA_PIN_GP_PWM7_PL7, +}; + +static const unsigned int dmic1_dat_pm0_pins[] = { + TEGRA_PIN_DMIC1_DAT_PM0, +}; + +static const unsigned int dmic1_clk_pm1_pins[] = { + TEGRA_PIN_DMIC1_CLK_PM1, +}; + +static const unsigned int dmic2_dat_pm2_pins[] = { + TEGRA_PIN_DMIC2_DAT_PM2, +}; + +static const unsigned int dmic2_clk_pm3_pins[] = { + TEGRA_PIN_DMIC2_CLK_PM3, +}; + +static const unsigned int dmic4_dat_pm4_pins[] = { + TEGRA_PIN_DMIC4_DAT_PM4, +}; + +static const unsigned int dmic4_clk_pm5_pins[] = { + TEGRA_PIN_DMIC4_CLK_PM5, +}; + +static const unsigned int gpio_cam1_pn0_pins[] = { + TEGRA_PIN_GPIO_CAM1_PN0, +}; + +static const unsigned int gpio_cam2_pn1_pins[] = { + TEGRA_PIN_GPIO_CAM2_PN1, +}; + +static const unsigned int gpio_cam3_pn2_pins[] = { + TEGRA_PIN_GPIO_CAM3_PN2, +}; + +static const unsigned int gpio_cam4_pn3_pins[] = { + TEGRA_PIN_GPIO_CAM4_PN3, +}; + +static const unsigned int gpio_cam5_pn4_pins[] = { + TEGRA_PIN_GPIO_CAM5_PN4, +}; + +static const unsigned int gpio_cam6_pn5_pins[] = { + TEGRA_PIN_GPIO_CAM6_PN5, +}; + +static const unsigned int gpio_cam7_pn6_pins[] = { + TEGRA_PIN_GPIO_CAM7_PN6, +}; + +static const unsigned int extperiph1_clk_po0_pins[] = { + TEGRA_PIN_EXTPERIPH1_CLK_PO0, +}; + +static const unsigned int extperiph2_clk_po1_pins[] = { + TEGRA_PIN_EXTPERIPH2_CLK_PO1, +}; + +static const unsigned int cam_i2c_scl_po2_pins[] = { + TEGRA_PIN_CAM_I2C_SCL_PO2, +}; + +static const unsigned int cam_i2c_sda_po3_pins[] = { + TEGRA_PIN_CAM_I2C_SDA_PO3, +}; + +static const unsigned int dp_aux_ch0_hpd_pp0_pins[] = { + TEGRA_PIN_DP_AUX_CH0_HPD_PP0, +}; + +static const unsigned int dp_aux_ch1_hpd_pp1_pins[] = { + TEGRA_PIN_DP_AUX_CH1_HPD_PP1, +}; + +static const unsigned int hdmi_cec_pp2_pins[] = { + TEGRA_PIN_HDMI_CEC_PP2, +}; + +static const unsigned int gpio_edp0_pp3_pins[] = { + TEGRA_PIN_GPIO_EDP0_PP3, +}; + +static const unsigned int gpio_edp1_pp4_pins[] = { + TEGRA_PIN_GPIO_EDP1_PP4, +}; + +static const unsigned int gpio_edp2_pp5_pins[] = { + TEGRA_PIN_GPIO_EDP2_PP5, +}; + +static const unsigned int gpio_edp3_pp6_pins[] = { + TEGRA_PIN_GPIO_EDP3_PP6, +}; + +static const unsigned int directdc1_clk_pq0_pins[] = { + TEGRA_PIN_DIRECTDC1_CLK_PQ0, +}; + +static const unsigned int directdc1_in_pq1_pins[] = { + TEGRA_PIN_DIRECTDC1_IN_PQ1, +}; + +static const unsigned int directdc1_out0_pq2_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT0_PQ2, +}; + +static const unsigned int directdc1_out1_pq3_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT1_PQ3, +}; + +static const unsigned int directdc1_out2_pq4_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT2_PQ4, +}; + +static const unsigned int directdc1_out3_pq5_pins[] = { + TEGRA_PIN_DIRECTDC1_OUT3_PQ5, +}; + +static const unsigned int qspi_sck_pr0_pins[] = { + TEGRA_PIN_QSPI_SCK_PR0, +}; + +static const unsigned int qspi_io0_pr1_pins[] = { + TEGRA_PIN_QSPI_IO0_PR1, +}; + +static const unsigned int qspi_io1_pr2_pins[] = { + TEGRA_PIN_QSPI_IO1_PR2, +}; + +static const unsigned int qspi_io2_pr3_pins[] = { + TEGRA_PIN_QSPI_IO2_PR3, +}; + +static const unsigned int qspi_io3_pr4_pins[] = { + TEGRA_PIN_QSPI_IO3_PR4, +}; + +static const unsigned int qspi_cs_n_pr5_pins[] = { + TEGRA_PIN_QSPI_CS_N_PR5, +}; + +static const unsigned int pwr_i2c_scl_ps0_pins[] = { + TEGRA_PIN_PWR_I2C_SCL_PS0, +}; + +static const unsigned int pwr_i2c_sda_ps1_pins[] = { + TEGRA_PIN_PWR_I2C_SDA_PS1, +}; + +static const unsigned int batt_oc_ps2_pins[] = { + TEGRA_PIN_BATT_OC_PS2, +}; + +static const unsigned int safe_state_ps3_pins[] = { + TEGRA_PIN_SAFE_STATE_PS3, +}; + 
+static const unsigned int vcomp_alert_ps4_pins[] = { + TEGRA_PIN_VCOMP_ALERT_PS4, +}; + +static const unsigned int uart1_tx_pt0_pins[] = { + TEGRA_PIN_UART1_TX_PT0, +}; + +static const unsigned int uart1_rx_pt1_pins[] = { + TEGRA_PIN_UART1_RX_PT1, +}; + +static const unsigned int uart1_rts_pt2_pins[] = { + TEGRA_PIN_UART1_RTS_PT2, +}; + +static const unsigned int uart1_cts_pt3_pins[] = { + TEGRA_PIN_UART1_CTS_PT3, +}; + +static const unsigned int gpio_dis0_pu0_pins[] = { + TEGRA_PIN_GPIO_DIS0_PU0, +}; + +static const unsigned int gpio_dis1_pu1_pins[] = { + TEGRA_PIN_GPIO_DIS1_PU1, +}; + +static const unsigned int gpio_dis2_pu2_pins[] = { + TEGRA_PIN_GPIO_DIS2_PU2, +}; + +static const unsigned int gpio_dis3_pu3_pins[] = { + TEGRA_PIN_GPIO_DIS3_PU3, +}; + +static const unsigned int gpio_dis4_pu4_pins[] = { + TEGRA_PIN_GPIO_DIS4_PU4, +}; + +static const unsigned int gpio_dis5_pu5_pins[] = { + TEGRA_PIN_GPIO_DIS5_PU5, +}; + +static const unsigned int gpio_sen0_pv0_pins[] = { + TEGRA_PIN_GPIO_SEN0_PV0, +}; + +static const unsigned int gpio_sen1_pv1_pins[] = { + TEGRA_PIN_GPIO_SEN1_PV1, +}; + +static const unsigned int gpio_sen2_pv2_pins[] = { + TEGRA_PIN_GPIO_SEN2_PV2, +}; + +static const unsigned int gpio_sen3_pv3_pins[] = { + TEGRA_PIN_GPIO_SEN3_PV3, +}; + +static const unsigned int gpio_sen4_pv4_pins[] = { + TEGRA_PIN_GPIO_SEN4_PV4, +}; + +static const unsigned int gpio_sen5_pv5_pins[] = { + TEGRA_PIN_GPIO_SEN5_PV5, +}; + +static const unsigned int gpio_sen6_pv6_pins[] = { + TEGRA_PIN_GPIO_SEN6_PV6, +}; + +static const unsigned int gpio_sen7_pv7_pins[] = { + TEGRA_PIN_GPIO_SEN7_PV7, +}; + +static const unsigned int gen8_i2c_scl_pw0_pins[] = { + TEGRA_PIN_GEN8_I2C_SCL_PW0, +}; + +static const unsigned int gen8_i2c_sda_pw1_pins[] = { + TEGRA_PIN_GEN8_I2C_SDA_PW1, +}; + +static const unsigned int uart3_tx_pw2_pins[] = { + TEGRA_PIN_UART3_TX_PW2, +}; + +static const unsigned int uart3_rx_pw3_pins[] = { + TEGRA_PIN_UART3_RX_PW3, +}; + +static const unsigned int uart3_rts_pw4_pins[] = { + TEGRA_PIN_UART3_RTS_PW4, +}; + +static const unsigned int uart3_cts_pw5_pins[] = { + TEGRA_PIN_UART3_CTS_PW5, +}; + +static const unsigned int uart7_tx_pw6_pins[] = { + TEGRA_PIN_UART7_TX_PW6, +}; + +static const unsigned int uart7_rx_pw7_pins[] = { + TEGRA_PIN_UART7_RX_PW7, +}; + +static const unsigned int uart2_tx_px0_pins[] = { + TEGRA_PIN_UART2_TX_PX0, +}; + +static const unsigned int uart2_rx_px1_pins[] = { + TEGRA_PIN_UART2_RX_PX1, +}; + +static const unsigned int uart2_rts_px2_pins[] = { + TEGRA_PIN_UART2_RTS_PX2, +}; + +static const unsigned int uart2_cts_px3_pins[] = { + TEGRA_PIN_UART2_CTS_PX3, +}; + +static const unsigned int uart5_tx_px4_pins[] = { + TEGRA_PIN_UART5_TX_PX4, +}; + +static const unsigned int uart5_rx_px5_pins[] = { + TEGRA_PIN_UART5_RX_PX5, +}; + +static const unsigned int uart5_rts_px6_pins[] = { + TEGRA_PIN_UART5_RTS_PX6, +}; + +static const unsigned int uart5_cts_px7_pins[] = { + TEGRA_PIN_UART5_CTS_PX7, +}; + +static const unsigned int gpio_mdm1_py0_pins[] = { + TEGRA_PIN_GPIO_MDM1_PY0, +}; + +static const unsigned int gpio_mdm2_py1_pins[] = { + TEGRA_PIN_GPIO_MDM2_PY1, +}; + +static const unsigned int gpio_mdm3_py2_pins[] = { + TEGRA_PIN_GPIO_MDM3_PY2, +}; + +static const unsigned int gpio_mdm4_py3_pins[] = { + TEGRA_PIN_GPIO_MDM4_PY3, +}; + +static const unsigned int gpio_mdm5_py4_pins[] = { + TEGRA_PIN_GPIO_MDM5_PY4, +}; + +static const unsigned int gpio_mdm6_py5_pins[] = { + TEGRA_PIN_GPIO_MDM6_PY5, +}; + +static const unsigned int gpio_mdm7_py6_pins[] = { + 
TEGRA_PIN_GPIO_MDM7_PY6, +}; + +static const unsigned int can1_dout_pz0_pins[] = { + TEGRA_PIN_CAN1_DOUT_PZ0, +}; + +static const unsigned int can1_din_pz1_pins[] = { + TEGRA_PIN_CAN1_DIN_PZ1, +}; + +static const unsigned int can0_dout_pz2_pins[] = { + TEGRA_PIN_CAN0_DOUT_PZ2, +}; + +static const unsigned int can0_din_pz3_pins[] = { + TEGRA_PIN_CAN0_DIN_PZ3, +}; + +static const unsigned int can_gpio0_paa0_pins[] = { + TEGRA_PIN_CAN_GPIO0_PAA0, +}; + +static const unsigned int can_gpio1_paa1_pins[] = { + TEGRA_PIN_CAN_GPIO1_PAA1, +}; + +static const unsigned int can_gpio2_paa2_pins[] = { + TEGRA_PIN_CAN_GPIO2_PAA2, +}; + +static const unsigned int can_gpio3_paa3_pins[] = { + TEGRA_PIN_CAN_GPIO3_PAA3, +}; + +static const unsigned int can_gpio4_paa4_pins[] = { + TEGRA_PIN_CAN_GPIO4_PAA4, +}; + +static const unsigned int can_gpio5_paa5_pins[] = { + TEGRA_PIN_CAN_GPIO5_PAA5, +}; + +static const unsigned int can_gpio6_paa6_pins[] = { + TEGRA_PIN_CAN_GPIO6_PAA6, +}; + +static const unsigned int can_gpio7_paa7_pins[] = { + TEGRA_PIN_CAN_GPIO7_PAA7, +}; + +static const unsigned int ufs0_ref_clk_pbb0_pins[] = { + TEGRA_PIN_UFS0_REF_CLK_PBB0, +}; + +static const unsigned int ufs0_rst_pbb1_pins[] = { + TEGRA_PIN_UFS0_RST_PBB1, +}; + +static const unsigned int dap4_sclk_pcc0_pins[] = { + TEGRA_PIN_DAP4_SCLK_PCC0, +}; + +static const unsigned int dap4_dout_pcc1_pins[] = { + TEGRA_PIN_DAP4_DOUT_PCC1, +}; + +static const unsigned int dap4_din_pcc2_pins[] = { + TEGRA_PIN_DAP4_DIN_PCC2, +}; + +static const unsigned int dap4_fs_pcc3_pins[] = { + TEGRA_PIN_DAP4_FS_PCC3, +}; + +static const unsigned int gpio_sen8_pee0_pins[] = { + TEGRA_PIN_GPIO_SEN8_PEE0, +}; + +static const unsigned int gpio_sen9_pee1_pins[] = { + TEGRA_PIN_GPIO_SEN9_PEE1, +}; + +static const unsigned int touch_clk_pee2_pins[] = { + TEGRA_PIN_TOUCH_CLK_PEE2, +}; + +static const unsigned int power_on_pff0_pins[] = { + TEGRA_PIN_POWER_ON_PFF0, +}; + +static const unsigned int gpio_sw1_pff1_pins[] = { + TEGRA_PIN_GPIO_SW1_PFF1, +}; + +static const unsigned int gpio_sw2_pff2_pins[] = { + TEGRA_PIN_GPIO_SW2_PFF2, +}; + +static const unsigned int gpio_sw3_pff3_pins[] = { + TEGRA_PIN_GPIO_SW3_PFF3, +}; + +static const unsigned int gpio_sw4_pff4_pins[] = { + TEGRA_PIN_GPIO_SW4_PFF4, +}; + +static const unsigned int directdc_comp_pins[] = { + TEGRA_PIN_DIRECTDC_COMP, +}; + +static const unsigned int sdmmc1_comp_pins[] = { + TEGRA_PIN_SDMMC1_COMP, +}; + +static const unsigned int eqos_comp_pins[] = { + TEGRA_PIN_EQOS_COMP, +}; + +static const unsigned int sdmmc3_comp_pins[] = { + TEGRA_PIN_SDMMC3_COMP, +}; + +static const unsigned int qspi_comp_pins[] = { + TEGRA_PIN_QSPI_COMP, +}; + +static const unsigned int shutdown_pins[] = { + TEGRA_PIN_SHUTDOWN, +}; + +static const unsigned int pmu_int_pins[] = { + TEGRA_PIN_PMU_INT, +}; + +static const unsigned int soc_pwr_req_pins[] = { + TEGRA_PIN_SOC_PWR_REQ, +}; + +static const unsigned int clk_32k_in_pins[] = { + TEGRA_PIN_CLK_32K_IN, +}; + +static const unsigned int sdmmc4_clk_pins[] = {}; + +static const unsigned int sdmmc4_cmd_pins[] = {}; + +static const unsigned int sdmmc4_dqs_pins[] = {}; + +static const unsigned int sdmmc4_dat7_pins[] = {}; + +static const unsigned int sdmmc4_dat6_pins[] = {}; + +static const unsigned int sdmmc4_dat5_pins[] = {}; + +static const unsigned int sdmmc4_dat4_pins[] = {}; + +static const unsigned int sdmmc4_dat3_pins[] = {}; + +static const unsigned int sdmmc4_dat2_pins[] = {}; + +static const unsigned int sdmmc4_dat1_pins[] = {}; + +static const unsigned int 
sdmmc4_dat0_pins[] = {}; + +/* Define unique ID for each function */ +enum tegra_mux_dt { + TEGRA_MUX_RSVD0, + TEGRA_MUX_RSVD1, + TEGRA_MUX_RSVD2, + TEGRA_MUX_RSVD3, + TEGRA_MUX_TOUCH, + TEGRA_MUX_UARTC, + TEGRA_MUX_I2C8, + TEGRA_MUX_UARTG, + TEGRA_MUX_SPI2, + TEGRA_MUX_GP, + TEGRA_MUX_DCA, + TEGRA_MUX_WDT, + TEGRA_MUX_I2C2, + TEGRA_MUX_CAN1, + TEGRA_MUX_CAN0, + TEGRA_MUX_DMIC3, + TEGRA_MUX_DMIC5, + TEGRA_MUX_GPIO, + TEGRA_MUX_DSPK1, + TEGRA_MUX_DSPK0, + TEGRA_MUX_SPDIF, + TEGRA_MUX_AUD, + TEGRA_MUX_I2S1, + TEGRA_MUX_DMIC1, + TEGRA_MUX_DMIC2, + TEGRA_MUX_I2S3, + TEGRA_MUX_DMIC4, + TEGRA_MUX_I2S4, + TEGRA_MUX_EXTPERIPH2, + TEGRA_MUX_EXTPERIPH1, + TEGRA_MUX_I2C3, + TEGRA_MUX_VGP1, + TEGRA_MUX_VGP2, + TEGRA_MUX_VGP3, + TEGRA_MUX_VGP4, + TEGRA_MUX_VGP5, + TEGRA_MUX_VGP6, + TEGRA_MUX_EXTPERIPH3, + TEGRA_MUX_EXTPERIPH4, + TEGRA_MUX_SPI4, + TEGRA_MUX_I2S2, + TEGRA_MUX_UARTD, + TEGRA_MUX_I2C1, + TEGRA_MUX_UARTA, + TEGRA_MUX_DIRECTDC1, + TEGRA_MUX_DIRECTDC, + TEGRA_MUX_IQC0, + TEGRA_MUX_IQC1, + TEGRA_MUX_I2S6, + TEGRA_MUX_DTV, + TEGRA_MUX_UARTF, + TEGRA_MUX_SDMMC3, + TEGRA_MUX_SDMMC4, + TEGRA_MUX_SDMMC1, + TEGRA_MUX_DP, + TEGRA_MUX_HDMI, + TEGRA_MUX_PE2, + TEGRA_MUX_SATA, + TEGRA_MUX_PE, + TEGRA_MUX_PE1, + TEGRA_MUX_PE0, + TEGRA_MUX_SOC, + TEGRA_MUX_EQOS, + TEGRA_MUX_SDMMC2, + TEGRA_MUX_QSPI, + TEGRA_MUX_SCE, + TEGRA_MUX_I2C5, + TEGRA_MUX_DISPLAYA, + TEGRA_MUX_DISPLAYB, + TEGRA_MUX_DCC, + TEGRA_MUX_DCB, + TEGRA_MUX_SPI1, + TEGRA_MUX_UARTB, + TEGRA_MUX_UARTE, + TEGRA_MUX_SPI3, + TEGRA_MUX_NV, + TEGRA_MUX_CCLA, + TEGRA_MUX_I2C7, + TEGRA_MUX_I2C9, + TEGRA_MUX_I2S5, + TEGRA_MUX_USB, + TEGRA_MUX_UFS0, +}; + +/* Make list of each function name */ +#define TEGRA_PIN_FUNCTION(lid) #lid + +static const char * const tegra186_functions[] = { + TEGRA_PIN_FUNCTION(rsvd0), + TEGRA_PIN_FUNCTION(rsvd1), + TEGRA_PIN_FUNCTION(rsvd2), + TEGRA_PIN_FUNCTION(rsvd3), + TEGRA_PIN_FUNCTION(touch), + TEGRA_PIN_FUNCTION(uartc), + TEGRA_PIN_FUNCTION(i2c8), + TEGRA_PIN_FUNCTION(uartg), + TEGRA_PIN_FUNCTION(spi2), + TEGRA_PIN_FUNCTION(gp), + TEGRA_PIN_FUNCTION(dca), + TEGRA_PIN_FUNCTION(wdt), + TEGRA_PIN_FUNCTION(i2c2), + TEGRA_PIN_FUNCTION(can1), + TEGRA_PIN_FUNCTION(can0), + TEGRA_PIN_FUNCTION(dmic3), + TEGRA_PIN_FUNCTION(dmic5), + TEGRA_PIN_FUNCTION(gpio), + TEGRA_PIN_FUNCTION(dspk1), + TEGRA_PIN_FUNCTION(dspk0), + TEGRA_PIN_FUNCTION(spdif), + TEGRA_PIN_FUNCTION(aud), + TEGRA_PIN_FUNCTION(i2s1), + TEGRA_PIN_FUNCTION(dmic1), + TEGRA_PIN_FUNCTION(dmic2), + TEGRA_PIN_FUNCTION(i2s3), + TEGRA_PIN_FUNCTION(dmic4), + TEGRA_PIN_FUNCTION(i2s4), + TEGRA_PIN_FUNCTION(extperiph2), + TEGRA_PIN_FUNCTION(extperiph1), + TEGRA_PIN_FUNCTION(i2c3), + TEGRA_PIN_FUNCTION(vgp1), + TEGRA_PIN_FUNCTION(vgp2), + TEGRA_PIN_FUNCTION(vgp3), + TEGRA_PIN_FUNCTION(vgp4), + TEGRA_PIN_FUNCTION(vgp5), + TEGRA_PIN_FUNCTION(vgp6), + TEGRA_PIN_FUNCTION(extperiph3), + TEGRA_PIN_FUNCTION(extperiph4), + TEGRA_PIN_FUNCTION(spi4), + TEGRA_PIN_FUNCTION(i2s2), + TEGRA_PIN_FUNCTION(uartd), + TEGRA_PIN_FUNCTION(i2c1), + TEGRA_PIN_FUNCTION(uarta), + TEGRA_PIN_FUNCTION(directdc1), + TEGRA_PIN_FUNCTION(directdc), + TEGRA_PIN_FUNCTION(iqc0), + TEGRA_PIN_FUNCTION(iqc1), + TEGRA_PIN_FUNCTION(i2s6), + TEGRA_PIN_FUNCTION(dtv), + TEGRA_PIN_FUNCTION(uartf), + TEGRA_PIN_FUNCTION(sdmmc3), + TEGRA_PIN_FUNCTION(sdmmc4), + TEGRA_PIN_FUNCTION(sdmmc1), + TEGRA_PIN_FUNCTION(dp), + TEGRA_PIN_FUNCTION(hdmi), + TEGRA_PIN_FUNCTION(pe2), + TEGRA_PIN_FUNCTION(sata), + TEGRA_PIN_FUNCTION(pe), + TEGRA_PIN_FUNCTION(pe1), + TEGRA_PIN_FUNCTION(pe0), + TEGRA_PIN_FUNCTION(soc), + 
TEGRA_PIN_FUNCTION(eqos), + TEGRA_PIN_FUNCTION(sdmmc2), + TEGRA_PIN_FUNCTION(qspi), + TEGRA_PIN_FUNCTION(sce), + TEGRA_PIN_FUNCTION(i2c5), + TEGRA_PIN_FUNCTION(displaya), + TEGRA_PIN_FUNCTION(displayb), + TEGRA_PIN_FUNCTION(dcc), + TEGRA_PIN_FUNCTION(dcb), + TEGRA_PIN_FUNCTION(spi1), + TEGRA_PIN_FUNCTION(uartb), + TEGRA_PIN_FUNCTION(uarte), + TEGRA_PIN_FUNCTION(spi3), + TEGRA_PIN_FUNCTION(nv), + TEGRA_PIN_FUNCTION(ccla), + TEGRA_PIN_FUNCTION(i2c7), + TEGRA_PIN_FUNCTION(i2c9), + TEGRA_PIN_FUNCTION(i2s5), + TEGRA_PIN_FUNCTION(usb), + TEGRA_PIN_FUNCTION(ufs0), +}; + +#define PINGROUP_REG_Y(r) ((r)) +#define PINGROUP_REG_N(r) -1 + +#define DRV_PINGROUP_Y(r) ((r)) +#define DRV_PINGROUP_N(r) -1 + +#define DRV_PINGROUP_ENTRY_N(pg_name) \ + .drv_reg = -1, \ + .drv_bank = -1, \ + .drvdn_bit = -1, \ + .drvdn_width = -1, \ + .drvup_bit = -1, \ + .drvup_width = -1, \ + .slwr_bit = -1, \ + .slwr_width = -1, \ + .slwf_bit = -1, \ + .slwf_width = -1 + +#define DRV_PINGROUP_ENTRY_Y(r, drvdn_b, drvdn_w, drvup_b, \ + drvup_w, slwr_b, slwr_w, slwf_b, \ + slwf_w, bank) \ + .drv_reg = ((r)), \ + .drv_bank = bank, \ + .drvdn_bit = drvdn_b, \ + .drvdn_width = drvdn_w, \ + .drvup_bit = drvup_b, \ + .drvup_width = drvup_w, \ + .slwr_bit = slwr_b, \ + .slwr_width = slwr_w, \ + .slwf_bit = slwf_b, \ + .slwf_width = slwf_w + +#define PIN_PINGROUP_ENTRY_N(pg_name) \ + .mux_reg = -1, \ + .pupd_reg = -1, \ + .tri_reg = -1, \ + .einput_bit = -1, \ + .e_io_hv_bit = -1, \ + .odrain_bit = -1, \ + .lock_bit = -1, \ + .parked_bit = -1, \ + .lpmd_bit = -1, \ + .drvtype_bit = -1, \ + .lpdr_bit = -1, \ + .pbias_buf_bit = -1, \ + .preemp_bit = -1, \ + .rfu_in_bit = -1 + +#define PIN_PINGROUP_ENTRY_Y(r, bank, pupd, e_io_hv, e_lpbk, e_input, \ + e_lpdr, e_pbias_buf, gpio_sfio_sel, \ + e_od, schmitt_b, drvtype, epreemp, \ + io_reset, rfu_in) \ + .mux_reg = PINGROUP_REG_Y(r), \ + .lpmd_bit = -1, \ + .lock_bit = -1, \ + .hsm_bit = -1, \ + .mux_bank = bank, \ + .mux_bit = 0, \ + .pupd_reg = PINGROUP_REG_##pupd(r), \ + .pupd_bank = bank, \ + .pupd_bit = 2, \ + .tri_reg = PINGROUP_REG_Y(r), \ + .tri_bank = bank, \ + .tri_bit = 4, \ + .einput_bit = e_input, \ + .sfsel_bit = gpio_sfio_sel, \ + .odrain_bit = e_od, \ + .schmitt_bit = schmitt_b, \ + .drvtype_bit = 13, \ + .lpdr_bit = e_lpdr, \ + +/* main drive pin groups */ +#define drive_gpio_aud3_pk0 DRV_PINGROUP_ENTRY_Y(0x1004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud2_pj7 DRV_PINGROUP_ENTRY_Y(0x100c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud1_pj6 DRV_PINGROUP_ENTRY_Y(0x1014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_aud0_pj5 DRV_PINGROUP_ENTRY_Y(0x101c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_aud_mclk_pj4 DRV_PINGROUP_ENTRY_Y(0x1024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_fs_pj3 DRV_PINGROUP_ENTRY_Y(0x102c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_din_pj2 DRV_PINGROUP_ENTRY_Y(0x1034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_dout_pj1 DRV_PINGROUP_ENTRY_Y(0x103c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap1_sclk_pj0 DRV_PINGROUP_ENTRY_Y(0x1044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dmic1_clk_pm1 DRV_PINGROUP_ENTRY_Y(0x2004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic1_dat_pm0 DRV_PINGROUP_ENTRY_Y(0x200c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic2_dat_pm2 DRV_PINGROUP_ENTRY_Y(0x2014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic2_clk_pm3 DRV_PINGROUP_ENTRY_Y(0x201c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic4_dat_pm4 DRV_PINGROUP_ENTRY_Y(0x2024, -1, 
-1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dmic4_clk_pm5 DRV_PINGROUP_ENTRY_Y(0x202c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_fs_pcc3 DRV_PINGROUP_ENTRY_Y(0x2034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_din_pcc2 DRV_PINGROUP_ENTRY_Y(0x203c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_dout_pcc1 DRV_PINGROUP_ENTRY_Y(0x2044, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_dap4_sclk_pcc0 DRV_PINGROUP_ENTRY_Y(0x204c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_extperiph2_clk_po1 DRV_PINGROUP_ENTRY_Y(0x0004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_extperiph1_clk_po0 DRV_PINGROUP_ENTRY_Y(0x000c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_cam_i2c_sda_po3 DRV_PINGROUP_ENTRY_Y(0x0014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_cam_i2c_scl_po2 DRV_PINGROUP_ENTRY_Y(0x001c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam1_pn0 DRV_PINGROUP_ENTRY_Y(0x0024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam2_pn1 DRV_PINGROUP_ENTRY_Y(0x002c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam3_pn2 DRV_PINGROUP_ENTRY_Y(0x0034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam4_pn3 DRV_PINGROUP_ENTRY_Y(0x003c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam5_pn4 DRV_PINGROUP_ENTRY_Y(0x0044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam6_pn5 DRV_PINGROUP_ENTRY_Y(0x004c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_cam7_pn6 DRV_PINGROUP_ENTRY_Y(0x0054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_din_pc3 DRV_PINGROUP_ENTRY_Y(0x4004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_dout_pc2 DRV_PINGROUP_ENTRY_Y(0x400c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_fs_pc4 DRV_PINGROUP_ENTRY_Y(0x4014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dap2_sclk_pc1 DRV_PINGROUP_ENTRY_Y(0x401c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_cts_pb3 DRV_PINGROUP_ENTRY_Y(0x4024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_rts_pb2 DRV_PINGROUP_ENTRY_Y(0x402c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_rx_pb1 DRV_PINGROUP_ENTRY_Y(0x4034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart4_tx_pb0 DRV_PINGROUP_ENTRY_Y(0x403c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan4_pc0 DRV_PINGROUP_ENTRY_Y(0x4044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan3_pb6 DRV_PINGROUP_ENTRY_Y(0x404c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan2_pb5 DRV_PINGROUP_ENTRY_Y(0x4054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan1_pb4 DRV_PINGROUP_ENTRY_Y(0x405c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen1_i2c_scl_pc5 DRV_PINGROUP_ENTRY_Y(0x4064, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen1_i2c_sda_pc6 DRV_PINGROUP_ENTRY_Y(0x406c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_cts_pt3 DRV_PINGROUP_ENTRY_Y(0x5004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_rts_pt2 DRV_PINGROUP_ENTRY_Y(0x500c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_rx_pt1 DRV_PINGROUP_ENTRY_Y(0x5014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart1_tx_pt0 DRV_PINGROUP_ENTRY_Y(0x501c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_directdc1_out3_pq5 DRV_PINGROUP_ENTRY_Y(0x502c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out2_pq4 DRV_PINGROUP_ENTRY_Y(0x5034, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out1_pq3 DRV_PINGROUP_ENTRY_Y(0x503c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_out0_pq2 DRV_PINGROUP_ENTRY_Y(0x5044, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_in_pq1 
DRV_PINGROUP_ENTRY_Y(0x504c, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_directdc1_clk_pq0 DRV_PINGROUP_ENTRY_Y(0x5054, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_gpio_pq0_pi0 DRV_PINGROUP_ENTRY_Y(0x3004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq1_pi1 DRV_PINGROUP_ENTRY_Y(0x300c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq2_pi2 DRV_PINGROUP_ENTRY_Y(0x3014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq3_pi3 DRV_PINGROUP_ENTRY_Y(0x301c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq4_pi4 DRV_PINGROUP_ENTRY_Y(0x3024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq5_pi5 DRV_PINGROUP_ENTRY_Y(0x302c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq6_pi6 DRV_PINGROUP_ENTRY_Y(0x3034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_pq7_pi7 DRV_PINGROUP_ENTRY_Y(0x303c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_edp2_pp5 DRV_PINGROUP_ENTRY_Y(0x10004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp3_pp6 DRV_PINGROUP_ENTRY_Y(0x1000c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp0_pp3 DRV_PINGROUP_ENTRY_Y(0x10014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_edp1_pp4 DRV_PINGROUP_ENTRY_Y(0x1001c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dp_aux_ch0_hpd_pp0 DRV_PINGROUP_ENTRY_Y(0x10024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_dp_aux_ch1_hpd_pp1 DRV_PINGROUP_ENTRY_Y(0x1002c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_hdmi_cec_pp2 DRV_PINGROUP_ENTRY_Y(0x10034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l2_clkreq_n_pa6 DRV_PINGROUP_ENTRY_Y(0x7004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_wake_n_pa2 DRV_PINGROUP_ENTRY_Y(0x700c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l1_clkreq_n_pa4 DRV_PINGROUP_ENTRY_Y(0x7014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l1_rst_n_pa3 DRV_PINGROUP_ENTRY_Y(0x701c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l0_clkreq_n_pa1 DRV_PINGROUP_ENTRY_Y(0x7024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l0_rst_n_pa0 DRV_PINGROUP_ENTRY_Y(0x702c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_pex_l2_rst_n_pa5 DRV_PINGROUP_ENTRY_Y(0x7034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_sdmmc1_clk_pd0 DRV_PINGROUP_ENTRY_Y(0x8004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_cmd_pd1 DRV_PINGROUP_ENTRY_Y(0x800c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat3_pd5 DRV_PINGROUP_ENTRY_Y(0x8018, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat2_pd4 DRV_PINGROUP_ENTRY_Y(0x8020, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat1_pd3 DRV_PINGROUP_ENTRY_Y(0x8028, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc1_dat0_pd2 DRV_PINGROUP_ENTRY_Y(0x8030, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td3_pe4 DRV_PINGROUP_ENTRY_Y(0x9004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td2_pe3 DRV_PINGROUP_ENTRY_Y(0x900c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td1_pe2 DRV_PINGROUP_ENTRY_Y(0x9014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_td0_pe1 DRV_PINGROUP_ENTRY_Y(0x901c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd3_pf1 DRV_PINGROUP_ENTRY_Y(0x9024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd2_pf0 DRV_PINGROUP_ENTRY_Y(0x902c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd1_pe7 DRV_PINGROUP_ENTRY_Y(0x9034, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_mdio_pf4 DRV_PINGROUP_ENTRY_Y(0x903c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rd0_pe6 DRV_PINGROUP_ENTRY_Y(0x9044, -1, -1, -1, -1, 28, 2, 
30, 2, 0) +#define drive_eqos_mdc_pf5 DRV_PINGROUP_ENTRY_Y(0x904c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_txc_pe0 DRV_PINGROUP_ENTRY_Y(0x9058, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rxc_pf3 DRV_PINGROUP_ENTRY_Y(0x9060, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_tx_ctl_pe5 DRV_PINGROUP_ENTRY_Y(0x9068, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_eqos_rx_ctl_pf2 DRV_PINGROUP_ENTRY_Y(0x9070, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat3_pg5 DRV_PINGROUP_ENTRY_Y(0xa004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat2_pg4 DRV_PINGROUP_ENTRY_Y(0xa00c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat1_pg3 DRV_PINGROUP_ENTRY_Y(0xa014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_dat0_pg2 DRV_PINGROUP_ENTRY_Y(0xa01c, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_cmd_pg1 DRV_PINGROUP_ENTRY_Y(0xa028, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_sdmmc3_clk_pg0 DRV_PINGROUP_ENTRY_Y(0xa030, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io3_pr4 DRV_PINGROUP_ENTRY_Y(0xB004, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io2_pr3 DRV_PINGROUP_ENTRY_Y(0xB00C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io1_pr2 DRV_PINGROUP_ENTRY_Y(0xB014, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_io0_pr1 DRV_PINGROUP_ENTRY_Y(0xB01C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_sck_pr0 DRV_PINGROUP_ENTRY_Y(0xB024, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_qspi_cs_n_pr5 DRV_PINGROUP_ENTRY_Y(0xB02C, -1, -1, -1, -1, 28, 2, 30, 2, 0) +#define drive_gpio_wan8_ph3 DRV_PINGROUP_ENTRY_Y(0xd004, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan7_ph2 DRV_PINGROUP_ENTRY_Y(0xd00c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan6_ph1 DRV_PINGROUP_ENTRY_Y(0xd014, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_wan5_ph0 DRV_PINGROUP_ENTRY_Y(0xd01c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_tx_px0 DRV_PINGROUP_ENTRY_Y(0xd024, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_rx_px1 DRV_PINGROUP_ENTRY_Y(0xd02c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_rts_px2 DRV_PINGROUP_ENTRY_Y(0xd034, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart2_cts_px3 DRV_PINGROUP_ENTRY_Y(0xd03c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_rx_px5 DRV_PINGROUP_ENTRY_Y(0xd044, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_tx_px4 DRV_PINGROUP_ENTRY_Y(0xd04c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_rts_px6 DRV_PINGROUP_ENTRY_Y(0xd054, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_uart5_cts_px7 DRV_PINGROUP_ENTRY_Y(0xd05c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm1_py0 DRV_PINGROUP_ENTRY_Y(0xd064, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm2_py1 DRV_PINGROUP_ENTRY_Y(0xd06c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm3_py2 DRV_PINGROUP_ENTRY_Y(0xd074, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm4_py3 DRV_PINGROUP_ENTRY_Y(0xd07c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm5_py4 DRV_PINGROUP_ENTRY_Y(0xd084, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm6_py5 DRV_PINGROUP_ENTRY_Y(0xd08c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpio_mdm7_py6 DRV_PINGROUP_ENTRY_Y(0xd094, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_bcpu_pwr_req_ph4 DRV_PINGROUP_ENTRY_Y(0xd09c, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_mcpu_pwr_req_ph5 DRV_PINGROUP_ENTRY_Y(0xd0a4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gpu_pwr_req_ph6 DRV_PINGROUP_ENTRY_Y(0xd0ac, 12, 5, 20, 5, -1, 
-1, -1, -1, 0) +#define drive_gen7_i2c_scl_pl0 DRV_PINGROUP_ENTRY_Y(0xd0b4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen7_i2c_sda_pl1 DRV_PINGROUP_ENTRY_Y(0xd0bc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen9_i2c_sda_pl3 DRV_PINGROUP_ENTRY_Y(0xd0c4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gen9_i2c_scl_pl2 DRV_PINGROUP_ENTRY_Y(0xd0cc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_usb_vbus_en0_pl4 DRV_PINGROUP_ENTRY_Y(0xd0d4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_usb_vbus_en1_pl5 DRV_PINGROUP_ENTRY_Y(0xd0dc, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gp_pwm7_pl7 DRV_PINGROUP_ENTRY_Y(0xd0e4, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_gp_pwm6_pl6 DRV_PINGROUP_ENTRY_Y(0xd0ec, 12, 5, 20, 5, -1, -1, -1, -1, 0) +#define drive_ufs0_rst_pbb1 DRV_PINGROUP_ENTRY_Y(0x11004, 12, 9, 24, 8, -1, -1, -1, -1, 0) +#define drive_ufs0_ref_clk_pbb0 DRV_PINGROUP_ENTRY_Y(0x1100c, 12, 9, 24, 8, -1, -1, -1, -1, 0) + +#define drive_directdc_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc1_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_eqos_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc3_comp DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_clk DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_cmd DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dqs DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat7 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat6 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat5 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat4 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat3 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat2 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat1 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_sdmmc4_dat0 DRV_PINGROUP_ENTRY_N(no_entry) +#define drive_qspi_comp DRV_PINGROUP_ENTRY_N(no_entry) + +/* AON drive pin groups */ +#define drive_touch_clk_pee2 DRV_PINGROUP_ENTRY_Y(0x2004, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_cts_pw5 DRV_PINGROUP_ENTRY_Y(0x200c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_rts_pw4 DRV_PINGROUP_ENTRY_Y(0x2014, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_rx_pw3 DRV_PINGROUP_ENTRY_Y(0x201c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart3_tx_pw2 DRV_PINGROUP_ENTRY_Y(0x2024, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gen8_i2c_sda_pw1 DRV_PINGROUP_ENTRY_Y(0x202c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gen8_i2c_scl_pw0 DRV_PINGROUP_ENTRY_Y(0x2034, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart7_rx_pw7 DRV_PINGROUP_ENTRY_Y(0x203c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_uart7_tx_pw6 DRV_PINGROUP_ENTRY_Y(0x2044, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen0_pv0 DRV_PINGROUP_ENTRY_Y(0x204c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen1_pv1 DRV_PINGROUP_ENTRY_Y(0x2054, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen2_pv2 DRV_PINGROUP_ENTRY_Y(0x205c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen3_pv3 DRV_PINGROUP_ENTRY_Y(0x2064, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen4_pv4 DRV_PINGROUP_ENTRY_Y(0x206c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen5_pv5 DRV_PINGROUP_ENTRY_Y(0x2074, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen6_pv6 DRV_PINGROUP_ENTRY_Y(0x207c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen7_pv7 DRV_PINGROUP_ENTRY_Y(0x2084, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sen8_pee0 DRV_PINGROUP_ENTRY_Y(0x208c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define 
drive_gpio_sen9_pee1 DRV_PINGROUP_ENTRY_Y(0x2094, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_can_gpio7_paa7 DRV_PINGROUP_ENTRY_Y(0x3004, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can1_dout_pz0 DRV_PINGROUP_ENTRY_Y(0x300C, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can1_din_pz1 DRV_PINGROUP_ENTRY_Y(0x3014, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can0_dout_pz2 DRV_PINGROUP_ENTRY_Y(0x301c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can0_din_pz3 DRV_PINGROUP_ENTRY_Y(0x3024, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio0_paa0 DRV_PINGROUP_ENTRY_Y(0x302c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio1_paa1 DRV_PINGROUP_ENTRY_Y(0x3034, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio2_paa2 DRV_PINGROUP_ENTRY_Y(0x303c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio3_paa3 DRV_PINGROUP_ENTRY_Y(0x3044, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio4_paa4 DRV_PINGROUP_ENTRY_Y(0x304c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio5_paa5 DRV_PINGROUP_ENTRY_Y(0x3054, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_can_gpio6_paa6 DRV_PINGROUP_ENTRY_Y(0x305c, -1, -1, -1, -1, 28, 2, 30, 2, 1) +#define drive_gpio_sw1_pff1 DRV_PINGROUP_ENTRY_Y(0x1004, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw2_pff2 DRV_PINGROUP_ENTRY_Y(0x100c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw3_pff3 DRV_PINGROUP_ENTRY_Y(0x1014, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_sw4_pff4 DRV_PINGROUP_ENTRY_Y(0x101c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_shutdown DRV_PINGROUP_ENTRY_Y(0x1024, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pmu_int DRV_PINGROUP_ENTRY_Y(0x102C, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_safe_state_ps3 DRV_PINGROUP_ENTRY_Y(0x1034, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_vcomp_alert_ps4 DRV_PINGROUP_ENTRY_Y(0x103c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_soc_pwr_req DRV_PINGROUP_ENTRY_Y(0x1044, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_batt_oc_ps2 DRV_PINGROUP_ENTRY_Y(0x104c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_clk_32k_in DRV_PINGROUP_ENTRY_Y(0x1054, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_power_on_pff0 DRV_PINGROUP_ENTRY_Y(0x105c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pwr_i2c_scl_ps0 DRV_PINGROUP_ENTRY_Y(0x1064, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_pwr_i2c_sda_ps1 DRV_PINGROUP_ENTRY_Y(0x106c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis0_pu0 DRV_PINGROUP_ENTRY_Y(0x1084, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis1_pu1 DRV_PINGROUP_ENTRY_Y(0x108c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis2_pu2 DRV_PINGROUP_ENTRY_Y(0x1094, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis3_pu3 DRV_PINGROUP_ENTRY_Y(0x109c, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis4_pu4 DRV_PINGROUP_ENTRY_Y(0x10a4, 12, 5, 20, 5, -1, -1, -1, -1, 1) +#define drive_gpio_dis5_pu5 DRV_PINGROUP_ENTRY_Y(0x10ac, 12, 5, 20, 5, -1, -1, -1, -1, 1) + +#define PINGROUP(pg_name, f0, f1, f2, f3, r, bank, pupd, e_io_hv, e_lpbk, e_input, e_lpdr, e_pbias_buf, \ + gpio_sfio_sel, e_od, schmitt_b, drvtype, epreemp, io_reset, rfu_in) \ + { \ + .name = #pg_name, \ + .pins = pg_name##_pins, \ + .npins = ARRAY_SIZE(pg_name##_pins), \ + .funcs = { \ + TEGRA_MUX_##f0, \ + TEGRA_MUX_##f1, \ + TEGRA_MUX_##f2, \ + TEGRA_MUX_##f3, \ + }, \ + PIN_PINGROUP_ENTRY_Y(r, bank, pupd, e_io_hv, e_lpbk, \ + e_input, e_lpdr, e_pbias_buf, \ + gpio_sfio_sel, e_od, \ + schmitt_b, drvtype, \ + epreemp, io_reset, \ + rfu_in) \ + 
drive_##pg_name, \ + } + +static const struct tegra_pingroup tegra186_groups[] = { + PINGROUP(gpio_aud3_pk0, RSVD0, DSPK1, SPDIF, RSVD3, 0x1000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud2_pj7, RSVD0, DSPK1, SPDIF, RSVD3, 0x1008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud1_pj6, RSVD0, RSVD1, RSVD2, RSVD3, 0x1010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_aud0_pj5, RSVD0, RSVD1, RSVD2, RSVD3, 0x1018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(aud_mclk_pj4, AUD, RSVD1, RSVD2, RSVD3, 0x1020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_fs_pj3, I2S1, RSVD1, RSVD2, RSVD3, 0x1028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_din_pj2, I2S1, RSVD1, RSVD2, RSVD3, 0x1030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_dout_pj1, I2S1, RSVD1, RSVD2, RSVD3, 0x1038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap1_sclk_pj0, I2S1, RSVD1, RSVD2, RSVD3, 0x1040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dmic1_clk_pm1, DMIC1, I2S3, RSVD2, RSVD3, 0x2000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic1_dat_pm0, DMIC1, I2S3, RSVD2, RSVD3, 0x2008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic2_dat_pm2, DMIC2, I2S3, RSVD2, RSVD3, 0x2010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic2_clk_pm3, DMIC2, I2S3, RSVD2, RSVD3, 0x2018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic4_dat_pm4, DMIC4, DSPK0, RSVD2, RSVD3, 0x2020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dmic4_clk_pm5, DMIC4, DSPK0, RSVD2, RSVD3, 0x2028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_fs_pcc3, I2S4, RSVD1, RSVD2, RSVD3, 0x2030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_din_pcc2, I2S4, RSVD1, RSVD2, RSVD3, 0x2038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_dout_pcc1, I2S4, RSVD1, RSVD2, RSVD3, 0x2040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(dap4_sclk_pcc0, I2S4, RSVD1, RSVD2, RSVD3, 0x2048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(extperiph2_clk_po1, EXTPERIPH2, RSVD1, RSVD2, RSVD3, 0x0000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(extperiph1_clk_po0, EXTPERIPH1, RSVD1, RSVD2, RSVD3, 0x0008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(cam_i2c_sda_po3, I2C3, RSVD1, RSVD2, RSVD3, 0x0010, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(cam_i2c_scl_po2, I2C3, RSVD1, RSVD2, RSVD3, 0x0018, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam1_pn0, VGP1, RSVD1, RSVD2, RSVD3, 0x0020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam2_pn1, VGP2, EXTPERIPH3, RSVD2, RSVD3, 0x0028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam3_pn2, VGP3, EXTPERIPH4, RSVD2, RSVD3, 0x0030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam4_pn3, VGP4, SPI4, RSVD2, RSVD3, 0x0038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam5_pn4, VGP5, SPI4, RSVD2, RSVD3, 0x0040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam6_pn5, VGP6, SPI4, RSVD2, RSVD3, 0x0048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_cam7_pn6, RSVD0, SPI4, RSVD2, RSVD3, 0x0050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_din_pc3, I2S2, RSVD1, RSVD2, RSVD3, 0x4000, 0, Y, -1, -1, 6, 
8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_dout_pc2, I2S2, RSVD1, RSVD2, RSVD3, 0x4008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_fs_pc4, I2S2, RSVD1, RSVD2, RSVD3, 0x4010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dap2_sclk_pc1, I2S2, RSVD1, RSVD2, RSVD3, 0x4018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_cts_pb3, UARTD, RSVD1, RSVD2, RSVD3, 0x4020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_rts_pb2, UARTD, RSVD1, RSVD2, RSVD3, 0x4028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_rx_pb1, UARTD, RSVD1, RSVD2, RSVD3, 0x4030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart4_tx_pb0, UARTD, RSVD1, RSVD2, RSVD3, 0x4038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan4_pc0, RSVD0, RSVD1, RSVD2, RSVD3, 0x4040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan3_pb6, RSVD0, RSVD1, RSVD2, RSVD3, 0x4048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan2_pb5, RSVD0, RSVD1, RSVD2, RSVD3, 0x4050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan1_pb4, RSVD0, RSVD1, RSVD2, RSVD3, 0x4058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen1_i2c_scl_pc5, I2C1, RSVD1, RSVD2, RSVD3, 0x4060, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen1_i2c_sda_pc6, I2C1, RSVD1, RSVD2, RSVD3, 0x4068, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_cts_pt3, UARTA, RSVD1, RSVD2, RSVD3, 0x5000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_rts_pt2, UARTA, RSVD1, RSVD2, RSVD3, 0x5008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_rx_pt1, UARTA, RSVD1, RSVD2, RSVD3, 0x5010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart1_tx_pt0, UARTA, RSVD1, RSVD2, RSVD3, 0x5018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(directdc1_out3_pq5, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out2_pq4, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out1_pq3, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_out0_pq2, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_in_pq1, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc1_clk_pq0, DIRECTDC1, RSVD1, RSVD2, RSVD3, 0x5050, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(directdc_comp, DIRECTDC, RSVD1, RSVD2, RSVD3, 0x5058, 0, Y, -1, -1, -1, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq0_pi0, RSVD0, IQC0, I2S6, RSVD3, 0x3000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq1_pi1, RSVD0, IQC0, I2S6, RSVD3, 0x3008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq2_pi2, RSVD0, IQC0, I2S6, RSVD3, 0x3010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq3_pi3, RSVD0, IQC0, I2S6, RSVD3, 0x3018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq4_pi4, RSVD0, IQC1, DTV, RSVD3, 0x3020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq5_pi5, RSVD0, IQC1, DTV, RSVD3, 0x3028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_pq6_pi6, RSVD0, IQC1, DTV, RSVD3, 0x3030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, 
-1, Y), + PINGROUP(gpio_pq7_pi7, RSVD0, IQC1, DTV, RSVD3, 0x3038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_edp2_pp5, RSVD0, UARTF, SDMMC3, RSVD3, 0x10000, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_edp3_pp6, RSVD0, UARTF, SDMMC1, RSVD3, 0x10008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_edp0_pp3, RSVD0, UARTF, SDMMC3, RSVD3, 0x10010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_edp1_pp4, RSVD0, UARTF, SDMMC1, RSVD3, 0x10018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dp_aux_ch0_hpd_pp0, DP, RSVD1, RSVD2, RSVD3, 0x10020, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(dp_aux_ch1_hpd_pp1, DP, RSVD1, RSVD2, RSVD3, 0x10028, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(hdmi_cec_pp2, HDMI, RSVD1, RSVD2, RSVD3, 0x10030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l2_clkreq_n_pa6, PE2, GP, SATA, RSVD3, 0x7000, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_wake_n_pa2, PE, RSVD1, RSVD2, RSVD3, 0x7008, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l1_clkreq_n_pa4, PE1, RSVD1, RSVD2, RSVD3, 0x7010, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l1_rst_n_pa3, PE1, RSVD1, RSVD2, RSVD3, 0x7018, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l0_clkreq_n_pa1, PE0, RSVD1, RSVD2, RSVD3, 0x7020, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l0_rst_n_pa0, PE0, RSVD1, RSVD2, RSVD3, 0x7028, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pex_l2_rst_n_pa5, PE2, SOC, SATA, RSVD3, 0x7030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(sdmmc1_clk_pd0, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8000, 0, Y, 5, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_cmd_pd1, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_comp, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8010, 0, Y, -1, -1, 6, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(sdmmc1_dat3_pd5, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8014, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat2_pd4, SDMMC1, RSVD1, RSVD2, RSVD3, 0x801c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat1_pd3, SDMMC1, RSVD1, RSVD2, RSVD3, 0x8024, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc1_dat0_pd2, SDMMC1, RSVD1, RSVD2, RSVD3, 0x802c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td3_pe4, EQOS, SDMMC2, RSVD2, RSVD3, 0x9000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td2_pe3, EQOS, SDMMC2, RSVD2, RSVD3, 0x9008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td1_pe2, EQOS, SDMMC2, RSVD2, RSVD3, 0x9010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_td0_pe1, EQOS, SDMMC2, RSVD2, RSVD3, 0x9018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd3_pf1, EQOS, SDMMC2, RSVD2, RSVD3, 0x9020, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd2_pf0, EQOS, SDMMC2, RSVD2, RSVD3, 0x9028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd1_pe7, EQOS, SDMMC2, RSVD2, RSVD3, 0x9030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_mdio_pf4, EQOS, SOC, RSVD2, RSVD3, 0x9038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rd0_pe6, EQOS, SDMMC2, RSVD2, RSVD3, 0x9040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_mdc_pf5, EQOS, 
RSVD1, RSVD2, RSVD3, 0x9048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_comp, EQOS, SDMMC2, RSVD2, RSVD3, 0x9050, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(eqos_txc_pe0, EQOS, SDMMC2, RSVD2, RSVD3, 0x9054, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rxc_pf3, EQOS, SDMMC2, RSVD2, RSVD3, 0x905c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_tx_ctl_pe5, EQOS, SDMMC2, RSVD2, RSVD3, 0x9064, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(eqos_rx_ctl_pf2, EQOS, SDMMC2, RSVD2, RSVD3, 0x906c, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat3_pg5, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat2_pg4, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat1_pg3, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_dat0_pg2, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_comp, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa020, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(sdmmc3_cmd_pg1, SDMMC3, RSVD1, RSVD2, RSVD3, 0xa024, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc3_clk_pg0, SDMMC3, RSVD1, RSVD1, RSVD3, 0xa02c, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_clk, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6004, 0, Y, -1, 5, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_cmd, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6008, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dqs, SDMMC4, RSVD1, RSVD2, RSVD3, 0x600c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat7, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6010, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat6, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6014, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat5, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6018, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat4, SDMMC4, RSVD1, RSVD2, RSVD3, 0x601c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat3, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6020, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat2, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6024, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat1, SDMMC4, RSVD1, RSVD2, RSVD3, 0x6028, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(sdmmc4_dat0, SDMMC4, RSVD1, RSVD2, RSVD3, 0x602c, 0, Y, -1, -1, 6, -1, 9, -1, -1, 12, Y, -1, -1, Y), + PINGROUP(qspi_io3_pr4, QSPI, RSVD1, RSVD2, RSVD3, 0xB000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io2_pr3, QSPI, RSVD1, RSVD2, RSVD3, 0xB008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io1_pr2, QSPI, RSVD1, RSVD2, RSVD3, 0xB010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_io0_pr1, QSPI, RSVD1, RSVD2, RSVD3, 0xB018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_sck_pr0, QSPI, RSVD1, RSVD2, RSVD3, 0xB020, 0, Y, -1, 5, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_cs_n_pr5, QSPI, RSVD1, RSVD2, RSVD3, 0xB028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(qspi_comp, QSPI, RSVD1, RSVD2, RSVD3, 0xB030, 0, Y, -1, -1, -1, -1, -1, -1, -1, -1, Y, -1, -1, Y), + PINGROUP(gpio_wan8_ph3, RSVD0, RSVD1, SPI1, RSVD3, 0xd000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, 
N, -1, -1, N), + PINGROUP(gpio_wan7_ph2, RSVD0, RSVD1, SPI1, RSVD3, 0xd008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan6_ph1, RSVD0, RSVD1, SPI1, RSVD3, 0xd010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_wan5_ph0, RSVD0, RSVD1, SPI1, RSVD3, 0xd018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_tx_px0, UARTB, RSVD1, RSVD2, RSVD3, 0xd020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_rx_px1, UARTB, RSVD1, RSVD2, RSVD3, 0xd028, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_rts_px2, UARTB, RSVD1, RSVD2, RSVD3, 0xd030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart2_cts_px3, UARTB, RSVD1, RSVD2, RSVD3, 0xd038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_rx_px5, UARTE, SPI3, GP, RSVD3, 0xd040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_tx_px4, UARTE, SPI3, NV, RSVD3, 0xd048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_rts_px6, UARTE, SPI3, RSVD2, RSVD3, 0xd050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart5_cts_px7, UARTE, SPI3, RSVD2, RSVD3, 0xd058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm1_py0, RSVD0, RSVD1, RSVD2, RSVD3, 0xd060, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm2_py1, RSVD0, RSVD1, RSVD2, RSVD3, 0xd068, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm3_py2, RSVD0, RSVD1, RSVD2, RSVD3, 0xd070, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm4_py3, RSVD0, SPI1, CCLA, RSVD3, 0xd078, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm5_py4, RSVD0, SPI1, RSVD2, RSVD3, 0xd080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm6_py5, SOC, RSVD1, RSVD2, RSVD3, 0xd088, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_mdm7_py6, RSVD0, RSVD1, RSVD2, RSVD3, 0xd090, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(bcpu_pwr_req_ph4, RSVD0, RSVD1, RSVD2, RSVD3, 0xd098, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(mcpu_pwr_req_ph5, RSVD0, RSVD1, RSVD2, RSVD3, 0xd0a0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpu_pwr_req_ph6, RSVD0, RSVD1, RSVD2, RSVD3, 0xd0a8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen7_i2c_scl_pl0, I2C7, I2S5, RSVD2, RSVD3, 0xd0b0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen7_i2c_sda_pl1, I2C7, I2S5, RSVD2, RSVD3, 0xd0b8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen9_i2c_sda_pl3, I2C9, I2S5, RSVD2, RSVD3, 0xd0c0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen9_i2c_scl_pl2, I2C9, I2S5, RSVD2, RSVD3, 0xd0c8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(usb_vbus_en0_pl4, USB, RSVD1, RSVD2, RSVD3, 0xd0d0, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(usb_vbus_en1_pl5, USB, RSVD1, RSVD2, RSVD3, 0xd0d8, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gp_pwm7_pl7, GP, RSVD1, RSVD2, RSVD3, 0xd0e0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gp_pwm6_pl6, GP, RSVD1, RSVD2, RSVD3, 0xd0e8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(ufs0_rst_pbb1, UFS0, RSVD1, RSVD2, RSVD3, 0x11000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), + PINGROUP(ufs0_ref_clk_pbb0, UFS0, RSVD1, RSVD2, RSVD3, 0x11008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, 15, 17, Y), +}; + +static const struct tegra_pinctrl_soc_data 
tegra186_pinctrl = { + .pins = tegra186_pins, + .npins = ARRAY_SIZE(tegra186_pins), + .functions = tegra186_functions, + .nfunctions = ARRAY_SIZE(tegra186_functions), + .groups = tegra186_groups, + .ngroups = ARRAY_SIZE(tegra186_groups), + .hsm_in_mux = false, + .schmitt_in_mux = true, + .drvtype_in_mux = true, + .sfsel_in_mux = true, +}; + +static const struct pinctrl_pin_desc tegra186_aon_pins[] = { + PINCTRL_PIN(TEGRA_PIN_PWR_I2C_SCL_PS0, "PWR_I2C_SCL_PS0"), + PINCTRL_PIN(TEGRA_PIN_PWR_I2C_SDA_PS1, "PWR_I2C_SDA_PS1"), + PINCTRL_PIN(TEGRA_PIN_BATT_OC_PS2, "BATT_OC_PS2"), + PINCTRL_PIN(TEGRA_PIN_SAFE_STATE_PS3, "SAFE_STATE_PS3"), + PINCTRL_PIN(TEGRA_PIN_VCOMP_ALERT_PS4, "VCOMP_ALERT_PS4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS0_PU0, "GPIO_DIS0_PU0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS1_PU1, "GPIO_DIS1_PU1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS2_PU2, "GPIO_DIS2_PU2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS3_PU3, "GPIO_DIS3_PU3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS4_PU4, "GPIO_DIS4_PU4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_DIS5_PU5, "GPIO_DIS5_PU5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN0_PV0, "GPIO_SEN0_PV0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN1_PV1, "GPIO_SEN1_PV1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN2_PV2, "GPIO_SEN2_PV2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN3_PV3, "GPIO_SEN3_PV3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN4_PV4, "GPIO_SEN4_PV4"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN5_PV5, "GPIO_SEN5_PV5"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN6_PV6, "GPIO_SEN6_PV6"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN7_PV7, "GPIO_SEN7_PV7"), + PINCTRL_PIN(TEGRA_PIN_GEN8_I2C_SCL_PW0, "GEN8_I2C_SCL_PW0"), + PINCTRL_PIN(TEGRA_PIN_GEN8_I2C_SDA_PW1, "GEN8_I2C_SDA_PW1"), + PINCTRL_PIN(TEGRA_PIN_UART3_TX_PW2, "UART3_TX_PW2"), + PINCTRL_PIN(TEGRA_PIN_UART3_RX_PW3, "UART3_RX_PW3"), + PINCTRL_PIN(TEGRA_PIN_UART3_RTS_PW4, "UART3_RTS_PW4"), + PINCTRL_PIN(TEGRA_PIN_UART3_CTS_PW5, "UART3_CTS_PW5"), + PINCTRL_PIN(TEGRA_PIN_UART7_TX_PW6, "UART7_TX_PW6"), + PINCTRL_PIN(TEGRA_PIN_UART7_RX_PW7, "UART7_RX_PW7"), + PINCTRL_PIN(TEGRA_PIN_CAN1_DOUT_PZ0, "CAN1_DOUT_PZ0"), + PINCTRL_PIN(TEGRA_PIN_CAN1_DIN_PZ1, "CAN1_DIN_PZ1"), + PINCTRL_PIN(TEGRA_PIN_CAN0_DOUT_PZ2, "CAN0_DOUT_PZ2"), + PINCTRL_PIN(TEGRA_PIN_CAN0_DIN_PZ3, "CAN0_DIN_PZ3"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO0_PAA0, "CAN_GPIO0_PAA0"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO1_PAA1, "CAN_GPIO1_PAA1"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO2_PAA2, "CAN_GPIO2_PAA2"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO3_PAA3, "CAN_GPIO3_PAA3"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO4_PAA4, "CAN_GPIO4_PAA4"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO5_PAA5, "CAN_GPIO5_PAA5"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO6_PAA6, "CAN_GPIO6_PAA6"), + PINCTRL_PIN(TEGRA_PIN_CAN_GPIO7_PAA7, "CAN_GPIO7_PAA7"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN8_PEE0, "GPIO_SEN8_PEE0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SEN9_PEE1, "GPIO_SEN9_PEE1"), + PINCTRL_PIN(TEGRA_PIN_TOUCH_CLK_PEE2, "TOUCH_CLK_PEE2"), + PINCTRL_PIN(TEGRA_PIN_POWER_ON_PFF0, "POWER_ON_PFF0"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW1_PFF1, "GPIO_SW1_PFF1"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW2_PFF2, "GPIO_SW2_PFF2"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW3_PFF3, "GPIO_SW3_PFF3"), + PINCTRL_PIN(TEGRA_PIN_GPIO_SW4_PFF4, "GPIO_SW4_PFF4"), + PINCTRL_PIN(TEGRA_PIN_SHUTDOWN, "SHUTDOWN"), + PINCTRL_PIN(TEGRA_PIN_PMU_INT, "PMU_INT"), + PINCTRL_PIN(TEGRA_PIN_SOC_PWR_REQ, "SOC_PWR_REQ"), + PINCTRL_PIN(TEGRA_PIN_CLK_32K_IN, "CLK_32K_IN"), +}; + +static const struct tegra_pingroup tegra186_aon_groups[] = { + PINGROUP(touch_clk_pee2, TOUCH, RSVD1, RSVD2, RSVD3, 0x2000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_cts_pw5, UARTC, RSVD1, RSVD2, RSVD3, 
0x2008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_rts_pw4, UARTC, RSVD1, RSVD2, RSVD3, 0x2010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_rx_pw3, UARTC, RSVD1, RSVD2, RSVD3, 0x2018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart3_tx_pw2, UARTC, RSVD1, RSVD2, RSVD3, 0x2020, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen8_i2c_sda_pw1, I2C8, RSVD1, RSVD2, RSVD3, 0x2028, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gen8_i2c_scl_pw0, I2C8, RSVD1, RSVD2, RSVD3, 0x2030, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart7_rx_pw7, UARTG, RSVD1, RSVD2, RSVD3, 0x2038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(uart7_tx_pw6, UARTG, RSVD1, RSVD2, RSVD3, 0x2040, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen0_pv0, RSVD0, RSVD1, RSVD2, RSVD3, 0x2048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen1_pv1, SPI2, RSVD1, RSVD2, RSVD3, 0x2050, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen2_pv2, SPI2, RSVD1, RSVD2, RSVD3, 0x2058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen3_pv3, SPI2, RSVD1, RSVD2, RSVD3, 0x2060, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen4_pv4, SPI2, RSVD1, RSVD2, RSVD3, 0x2068, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen5_pv5, RSVD0, RSVD1, RSVD2, RSVD3, 0x2070, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen6_pv6, RSVD0, GP, RSVD2, RSVD3, 0x2078, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen7_pv7, RSVD0, WDT, RSVD2, RSVD3, 0x2080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen8_pee0, RSVD0, I2C2, RSVD2, RSVD3, 0x2088, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sen9_pee1, RSVD0, I2C2, RSVD2, RSVD3, 0x2090, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(can_gpio7_paa7, RSVD0, WDT, RSVD2, RSVD3, 0x3000, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can1_dout_pz0, CAN1, RSVD1, RSVD2, RSVD3, 0x3008, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can1_din_pz1, CAN1, RSVD1, RSVD2, RSVD3, 0x3010, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can0_dout_pz2, CAN0, RSVD1, RSVD2, RSVD3, 0x3018, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can0_din_pz3, CAN0, RSVD1, RSVD2, RSVD3, 0x3020, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio0_paa0, RSVD0, DMIC3, DMIC5, RSVD3, 0x3028, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio1_paa1, RSVD0, DMIC3, DMIC5, RSVD3, 0x3030, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio2_paa2, GPIO, RSVD1, RSVD2, RSVD3, 0x3038, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio3_paa3, RSVD0, RSVD1, RSVD2, RSVD3, 0x3040, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio4_paa4, RSVD0, RSVD1, RSVD2, RSVD3, 0x3048, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio5_paa5, RSVD0, RSVD1, RSVD2, RSVD3, 0x3050, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(can_gpio6_paa6, RSVD0, RSVD1, RSVD2, RSVD3, 0x3058, 0, Y, -1, -1, 6, -1, 9, 10, -1, 12, Y, -1, -1, Y), + PINGROUP(gpio_sw1_pff1, RSVD0, RSVD1, RSVD2, RSVD3, 0x1000, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sw2_pff2, RSVD0, RSVD1, RSVD2, RSVD3, 0x1008, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + 
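/*
 * Argument order, as taken by the PINGROUP() macro above: pg_name,
 * f0, f1, f2, f3, r (mux register offset), bank, pupd, e_io_hv,
 * e_lpbk, e_input, e_lpdr, e_pbias_buf, gpio_sfio_sel, e_od,
 * schmitt_b, drvtype, epreemp, io_reset, rfu_in. A -1 argument marks
 * a bit field that does not exist for that pin group.
 */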
PINGROUP(gpio_sw3_pff3, RSVD0, RSVD1, RSVD2, RSVD3, 0x1010, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_sw4_pff4, RSVD0, RSVD1, RSVD2, RSVD3, 0x1018, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(shutdown, RSVD0, RSVD1, RSVD2, RSVD3, 0x1020, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(pmu_int, RSVD0, RSVD1, RSVD2, RSVD3, 0x1028, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(safe_state_ps3, SCE, RSVD1, RSVD2, RSVD3, 0x1030, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(vcomp_alert_ps4, SOC, RSVD1, RSVD2, RSVD3, 0x1038, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(soc_pwr_req, RSVD0, RSVD1, RSVD2, RSVD3, 0x1040, 0, Y, -1, -1, 6, 8, -1, -1, -1, 12, N, -1, -1, N), + PINGROUP(batt_oc_ps2, SOC, RSVD1, RSVD2, RSVD3, 0x1048, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(clk_32k_in, RSVD0, RSVD1, RSVD2, RSVD3, 0x1050, 0, Y, -1, -1, 6, 8, -1, -1, -1, -1, N, -1, -1, N), + PINGROUP(power_on_pff0, RSVD0, RSVD1, RSVD2, RSVD3, 0x1058, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pwr_i2c_scl_ps0, I2C5, RSVD1, RSVD2, RSVD3, 0x1060, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(pwr_i2c_sda_ps1, I2C5, RSVD1, RSVD2, RSVD3, 0x1068, 0, Y, 5, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis0_pu0, RSVD0, GP, DCB, DCC, 0x1080, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis1_pu1, RSVD0, RSVD1, DISPLAYA, RSVD3, 0x1088, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis2_pu2, RSVD0, GP, DCA, RSVD3, 0x1090, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis3_pu3, RSVD0, RSVD1, DISPLAYB, DCC, 0x1098, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis4_pu4, RSVD0, SOC, DCA, RSVD3, 0x10a0, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), + PINGROUP(gpio_dis5_pu5, RSVD0, GP, DCC, DCB, 0x10a8, 0, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N), +}; + +static const struct tegra_pinctrl_soc_data tegra186_pinctrl_aon = { + .pins = tegra186_aon_pins, + .npins = ARRAY_SIZE(tegra186_aon_pins), + .functions = tegra186_functions, + .nfunctions = ARRAY_SIZE(tegra186_functions), + .groups = tegra186_aon_groups, + .ngroups = ARRAY_SIZE(tegra186_aon_groups), + .hsm_in_mux = false, + .schmitt_in_mux = true, + .drvtype_in_mux = true, + .sfsel_in_mux = true, +}; + +static int tegra186_pinctrl_probe(struct platform_device *pdev) +{ + const struct tegra_pinctrl_soc_data *soc = of_device_get_match_data(&pdev->dev); + + return tegra_pinctrl_probe(pdev, soc); +} + +static const struct of_device_id tegra186_pinctrl_of_match[] = { + { .compatible = "nvidia,tegra186-pinmux", .data = &tegra186_pinctrl }, + { .compatible = "nvidia,tegra186-pinmux-aon", .data = &tegra186_pinctrl_aon }, + { }, +}; + +static struct platform_driver tegra186_pinctrl_driver = { + .driver = { + .name = "tegra186-pinctrl", + .of_match_table = tegra186_pinctrl_of_match, + }, + .probe = tegra186_pinctrl_probe, +}; + +static int __init tegra186_pinctrl_init(void) +{ + return platform_driver_register(&tegra186_pinctrl_driver); +} +arch_initcall(tegra186_pinctrl_init); diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index fd58781a2fb7da..1da79e3d215bfa 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -9,6 +9,7 @@ * battery charging and regulator control, firmware update. 
*/ +#include #include #include #include @@ -30,6 +31,56 @@ static struct cros_ec_platform pd_p = { .cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX), }; +static void cros_ec_device_free(void *data) +{ + struct cros_ec_device *ec_dev = data; + + mutex_destroy(&ec_dev->lock); + lockdep_unregister_key(&ec_dev->lockdep_key); +} + +struct cros_ec_device *cros_ec_device_alloc(struct device *dev) +{ + struct cros_ec_device *ec_dev; + + ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + if (!ec_dev) + return NULL; + + ec_dev->din_size = sizeof(struct ec_host_response) + + sizeof(struct ec_response_get_protocol_info) + + EC_MAX_RESPONSE_OVERHEAD; + ec_dev->dout_size = sizeof(struct ec_host_request) + + sizeof(struct ec_params_rwsig_action) + + EC_MAX_REQUEST_OVERHEAD; + + ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL); + if (!ec_dev->din) + return NULL; + + ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL); + if (!ec_dev->dout) + return NULL; + + ec_dev->dev = dev; + ec_dev->max_response = sizeof(struct ec_response_get_protocol_info); + ec_dev->max_request = sizeof(struct ec_params_rwsig_action); + ec_dev->suspend_timeout_ms = EC_HOST_SLEEP_TIMEOUT_DEFAULT; + + BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->event_notifier); + BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->panic_notifier); + + lockdep_register_key(&ec_dev->lockdep_key); + mutex_init(&ec_dev->lock); + lockdep_set_class(&ec_dev->lock, &ec_dev->lockdep_key); + + if (devm_add_action_or_reset(dev, cros_ec_device_free, ec_dev)) + return NULL; + + return ec_dev; +} +EXPORT_SYMBOL(cros_ec_device_alloc); + /** * cros_ec_irq_handler() - top half part of the interrupt handler * @irq: IRQ id @@ -102,14 +153,13 @@ EXPORT_SYMBOL(cros_ec_irq_thread); static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event) { int ret; - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, union { struct ec_params_host_sleep_event req0; struct ec_params_host_sleep_event_v1 req1; struct ec_response_host_sleep_event_v1 resp1; } u; - } __packed buf; + ) __packed buf; memset(&buf, 0, sizeof(buf)); @@ -180,29 +230,7 @@ static int cros_ec_ready_event(struct notifier_block *nb, int cros_ec_register(struct cros_ec_device *ec_dev) { struct device *dev = ec_dev->dev; - int err = 0; - - BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->event_notifier); - BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->panic_notifier); - - ec_dev->max_request = sizeof(struct ec_params_hello); - ec_dev->max_response = sizeof(struct ec_response_get_protocol_info); - ec_dev->max_passthru = 0; - ec_dev->ec = NULL; - ec_dev->pd = NULL; - ec_dev->suspend_timeout_ms = EC_HOST_SLEEP_TIMEOUT_DEFAULT; - - ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL); - if (!ec_dev->din) - return -ENOMEM; - - ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL); - if (!ec_dev->dout) - return -ENOMEM; - - lockdep_register_key(&ec_dev->lockdep_key); - mutex_init(&ec_dev->lock); - lockdep_set_class(&ec_dev->lock, &ec_dev->lockdep_key); + int err; /* Send RWSIG continue to jump to RW for devices using RWSIG. 
*/ err = cros_ec_rwsig_continue(ec_dev); @@ -289,6 +317,9 @@ int cros_ec_register(struct cros_ec_device *ec_dev) goto exit; } + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = true; + dev_info(dev, "Chrome EC device registered\n"); /* @@ -302,8 +333,6 @@ int cros_ec_register(struct cros_ec_device *ec_dev) exit: platform_device_unregister(ec_dev->ec); platform_device_unregister(ec_dev->pd); - mutex_destroy(&ec_dev->lock); - lockdep_unregister_key(&ec_dev->lockdep_key); return err; } EXPORT_SYMBOL(cros_ec_register); @@ -318,13 +347,14 @@ EXPORT_SYMBOL(cros_ec_register); */ void cros_ec_unregister(struct cros_ec_device *ec_dev) { + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = false; + if (ec_dev->mkbp_event_supported) blocking_notifier_chain_unregister(&ec_dev->event_notifier, &ec_dev->notifier_ready); platform_device_unregister(ec_dev->pd); platform_device_unregister(ec_dev->ec); - mutex_destroy(&ec_dev->lock); - lockdep_unregister_key(&ec_dev->lockdep_key); } EXPORT_SYMBOL(cros_ec_unregister); diff --git a/drivers/platform/chrome/cros_ec.h b/drivers/platform/chrome/cros_ec.h index 6b95f1e0bace3d..cd4643bc536710 100644 --- a/drivers/platform/chrome/cros_ec.h +++ b/drivers/platform/chrome/cros_ec.h @@ -11,6 +11,9 @@ #include struct cros_ec_device; +struct device; + +struct cros_ec_device *cros_ec_device_alloc(struct device *dev); int cros_ec_register(struct cros_ec_device *ec_dev); void cros_ec_unregister(struct cros_ec_device *ec_dev); diff --git a/drivers/platform/chrome/cros_ec_chardev.c b/drivers/platform/chrome/cros_ec_chardev.c index 21a484385fc5db..c9d80ad5b57e18 100644 --- a/drivers/platform/chrome/cros_ec_chardev.c +++ b/drivers/platform/chrome/cros_ec_chardev.c @@ -31,18 +31,14 @@ /* Arbitrary bounded size for the event queue */ #define CROS_MAX_EVENT_LEN PAGE_SIZE -struct chardev_data { - struct cros_ec_dev *ec_dev; - struct miscdevice misc; -}; - struct chardev_priv { - struct cros_ec_dev *ec_dev; + struct cros_ec_device *ec_dev; struct notifier_block notifier; wait_queue_head_t wait_event; unsigned long event_mask; struct list_head events; size_t event_len; + u16 cmd_offset; }; struct ec_event { @@ -52,7 +48,7 @@ struct ec_event { u8 data[]; }; -static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen) +static int ec_get_version(struct chardev_priv *priv, char *str, int maxlen) { static const char * const current_image_name[] = { "unknown", "read-only", "read-write", "invalid", @@ -65,10 +61,10 @@ static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen) if (!msg) return -ENOMEM; - msg->command = EC_CMD_GET_VERSION + ec->cmd_offset; + msg->command = EC_CMD_GET_VERSION + priv->cmd_offset; msg->insize = sizeof(*resp); - ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); + ret = cros_ec_cmd_xfer_status(priv->ec_dev, msg); if (ret < 0) { snprintf(str, maxlen, "Unknown EC version, returned error: %d\n", @@ -96,7 +92,7 @@ static int cros_ec_chardev_mkbp_event(struct notifier_block *nb, { struct chardev_priv *priv = container_of(nb, struct chardev_priv, notifier); - struct cros_ec_device *ec_dev = priv->ec_dev->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct ec_event *event; unsigned long event_bit = 1 << ec_dev->event_data.event_type; int total_size = sizeof(*event) + ec_dev->event_size; @@ -161,7 +157,8 @@ static struct ec_event *cros_ec_chardev_fetch_event(struct chardev_priv *priv, static int cros_ec_chardev_open(struct inode *inode, struct file *filp) { struct miscdevice *mdev = filp->private_data; - struct 
cros_ec_dev *ec_dev = dev_get_drvdata(mdev->parent); + struct cros_ec_dev *ec = dev_get_drvdata(mdev->parent); + struct cros_ec_device *ec_dev = ec->ec_dev; struct chardev_priv *priv; int ret; @@ -170,13 +167,14 @@ static int cros_ec_chardev_open(struct inode *inode, struct file *filp) return -ENOMEM; priv->ec_dev = ec_dev; + priv->cmd_offset = ec->cmd_offset; filp->private_data = priv; INIT_LIST_HEAD(&priv->events); init_waitqueue_head(&priv->wait_event); nonseekable_open(inode, filp); priv->notifier.notifier_call = cros_ec_chardev_mkbp_event; - ret = blocking_notifier_chain_register(&ec_dev->ec_dev->event_notifier, + ret = blocking_notifier_chain_register(&ec_dev->event_notifier, &priv->notifier); if (ret) { dev_err(ec_dev->dev, "failed to register event notifier\n"); @@ -204,7 +202,6 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, char msg[sizeof(struct ec_response_get_version) + sizeof(CROS_EC_DEV_VERSION)]; struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec_dev = priv->ec_dev; size_t count; int ret; @@ -238,7 +235,7 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, if (*offset != 0) return 0; - ret = ec_get_version(ec_dev, msg, sizeof(msg)); + ret = ec_get_version(priv, msg, sizeof(msg)); if (ret) return ret; @@ -254,10 +251,10 @@ static ssize_t cros_ec_chardev_read(struct file *filp, char __user *buffer, static int cros_ec_chardev_release(struct inode *inode, struct file *filp) { struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec_dev = priv->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct ec_event *event, *e; - blocking_notifier_chain_unregister(&ec_dev->ec_dev->event_notifier, + blocking_notifier_chain_unregister(&ec_dev->event_notifier, &priv->notifier); list_for_each_entry_safe(event, e, &priv->events, node) { @@ -272,7 +269,7 @@ static int cros_ec_chardev_release(struct inode *inode, struct file *filp) /* * Ioctls */ -static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) +static long cros_ec_chardev_ioctl_xcmd(struct chardev_priv *priv, void __user *arg) { struct cros_ec_command *s_cmd; struct cros_ec_command u_cmd; @@ -301,8 +298,8 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) goto exit; } - s_cmd->command += ec->cmd_offset; - ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd); + s_cmd->command += priv->cmd_offset; + ret = cros_ec_cmd_xfer(priv->ec_dev, s_cmd); /* Only copy data to userland if data was received. 
*/ if (ret < 0) goto exit; @@ -314,10 +311,9 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg) return ret; } -static long cros_ec_chardev_ioctl_readmem(struct cros_ec_dev *ec, - void __user *arg) +static long cros_ec_chardev_ioctl_readmem(struct chardev_priv *priv, void __user *arg) { - struct cros_ec_device *ec_dev = ec->ec_dev; + struct cros_ec_device *ec_dev = priv->ec_dev; struct cros_ec_readmem s_mem = { }; long num; @@ -346,16 +342,15 @@ static long cros_ec_chardev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct chardev_priv *priv = filp->private_data; - struct cros_ec_dev *ec = priv->ec_dev; if (_IOC_TYPE(cmd) != CROS_EC_DEV_IOC) return -ENOTTY; switch (cmd) { case CROS_EC_DEV_IOCXCMD: - return cros_ec_chardev_ioctl_xcmd(ec, (void __user *)arg); + return cros_ec_chardev_ioctl_xcmd(priv, (void __user *)arg); case CROS_EC_DEV_IOCRDMEM: - return cros_ec_chardev_ioctl_readmem(ec, (void __user *)arg); + return cros_ec_chardev_ioctl_readmem(priv, (void __user *)arg); case CROS_EC_DEV_IOCEVENTMASK: priv->event_mask = arg; return 0; @@ -377,31 +372,30 @@ static const struct file_operations chardev_fops = { static int cros_ec_chardev_probe(struct platform_device *pdev) { - struct cros_ec_dev *ec_dev = dev_get_drvdata(pdev->dev.parent); - struct cros_ec_platform *ec_platform = dev_get_platdata(ec_dev->dev); - struct chardev_data *data; + struct cros_ec_dev *ec = dev_get_drvdata(pdev->dev.parent); + struct cros_ec_platform *ec_platform = dev_get_platdata(ec->dev); + struct miscdevice *misc; /* Create a char device: we want to create it anew */ - data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); - if (!data) + misc = devm_kzalloc(&pdev->dev, sizeof(*misc), GFP_KERNEL); + if (!misc) return -ENOMEM; - data->ec_dev = ec_dev; - data->misc.minor = MISC_DYNAMIC_MINOR; - data->misc.fops = &chardev_fops; - data->misc.name = ec_platform->ec_name; - data->misc.parent = pdev->dev.parent; + misc->minor = MISC_DYNAMIC_MINOR; + misc->fops = &chardev_fops; + misc->name = ec_platform->ec_name; + misc->parent = pdev->dev.parent; - dev_set_drvdata(&pdev->dev, data); + dev_set_drvdata(&pdev->dev, misc); - return misc_register(&data->misc); + return misc_register(misc); } static void cros_ec_chardev_remove(struct platform_device *pdev) { - struct chardev_data *data = dev_get_drvdata(&pdev->dev); + struct miscdevice *misc = dev_get_drvdata(&pdev->dev); - misc_deregister(&data->misc); + misc_deregister(misc); } static const struct platform_device_id cros_ec_chardev_id[] = { diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c index 38af97cdaab229..def1144a077ea6 100644 --- a/drivers/platform/chrome/cros_ec_i2c.c +++ b/drivers/platform/chrome/cros_ec_i2c.c @@ -289,24 +289,19 @@ static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev, static int cros_ec_i2c_probe(struct i2c_client *client) { struct device *dev = &client->dev; - struct cros_ec_device *ec_dev = NULL; + struct cros_ec_device *ec_dev; int err; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; i2c_set_clientdata(client, ec_dev); - ec_dev->dev = dev; ec_dev->priv = client; ec_dev->irq = client->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c; ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c; ec_dev->phys_name = client->adapter->name; - ec_dev->din_size = sizeof(struct ec_host_response_i2c) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct 
ec_host_request_i2c) + - sizeof(struct ec_params_rwsig_action); err = cros_ec_register(ec_dev); if (err) { diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index 7e7190b30cbb97..4e74e702c5a24d 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -543,21 +543,17 @@ static int cros_ec_dev_init(struct ishtp_cl_data *client_data) struct cros_ec_device *ec_dev; struct device *dev = cl_data_to_dev(client_data); - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; client_data->ec_dev = ec_dev; dev->driver_data = ec_dev; - ec_dev->dev = dev; ec_dev->priv = client_data->cros_ish_cl; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_pkt_xfer_ish; ec_dev->phys_name = dev_name(dev); - ec_dev->din_size = sizeof(struct cros_ish_in_msg) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct cros_ish_out_msg) + sizeof(struct ec_params_rwsig_action); return cros_ec_register(ec_dev); } diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 7d9a78289c9626..78cfff80cdeafe 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -637,19 +637,15 @@ static int cros_ec_lpc_probe(struct platform_device *pdev) } } - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; platform_set_drvdata(pdev, ec_dev); - ec_dev->dev = dev; ec_dev->phys_name = dev_name(dev); ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc; ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc; ec_dev->cmd_readmem = cros_ec_lpc_readmem; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_dev->priv = ec_lpc; /* diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 3e94a0a82173db..1d8d9168ec1aab 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -3,6 +3,7 @@ // // Copyright (C) 2015 Google, Inc +#include #include #include #include @@ -1153,5 +1154,19 @@ int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd) } EXPORT_SYMBOL_GPL(cros_ec_get_cmd_versions); +/** + * cros_ec_device_registered - Return if the ec_dev is registered. + * + * @ec_dev: EC device + * + * Return: true if registered. Otherwise, false. 
+ */ +bool cros_ec_device_registered(struct cros_ec_device *ec_dev) +{ + guard(mutex)(&ec_dev->lock); + return ec_dev->registered; +} +EXPORT_SYMBOL_GPL(cros_ec_device_registered); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ChromeOS EC communication protocol helpers"); diff --git a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c index bc2666491db1f1..09bd9e49464e62 100644 --- a/drivers/platform/chrome/cros_ec_rpmsg.c +++ b/drivers/platform/chrome/cros_ec_rpmsg.c @@ -216,7 +216,7 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) struct cros_ec_device *ec_dev; int ret; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; @@ -224,14 +224,10 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) if (!ec_rpmsg) return -ENOMEM; - ec_dev->dev = dev; ec_dev->priv = ec_rpmsg; ec_dev->cmd_xfer = cros_ec_cmd_xfer_rpmsg; ec_dev->pkt_xfer = cros_ec_pkt_xfer_rpmsg; ec_dev->phys_name = dev_name(&rpdev->dev); - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); dev_set_drvdata(dev, ec_dev); ec_rpmsg->rpdev = rpdev; diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index 8ca0f854e7ac56..28fa82f8cb07e0 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -749,7 +749,7 @@ static int cros_ec_spi_probe(struct spi_device *spi) if (ec_spi == NULL) return -ENOMEM; ec_spi->spi = spi; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; @@ -757,16 +757,11 @@ static int cros_ec_spi_probe(struct spi_device *spi) cros_ec_spi_dt_probe(ec_spi, dev); spi_set_drvdata(spi, ec_dev); - ec_dev->dev = dev; ec_dev->priv = ec_spi; ec_dev->irq = spi->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi; ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi; ec_dev->phys_name = dev_name(&ec_spi->spi->dev); - ec_dev->din_size = EC_MSG_PREAMBLE_COUNT + - sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_spi->last_transfer_ns = ktime_get_ns(); diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 19c179d49c90df..d5b37414ff1245 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -259,7 +259,7 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) if (!ec_uart) return -ENOMEM; - ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + ec_dev = cros_ec_device_alloc(dev); if (!ec_dev) return -ENOMEM; @@ -276,14 +276,10 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); - ec_dev->dev = dev; ec_dev->priv = ec_uart; ec_dev->irq = ec_uart->irq; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_uart_pkt_xfer; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c index 7d8ae2cbf72f9e..b18043e31ae4fa 100644 --- 
a/drivers/platform/chrome/wilco_ec/telemetry.c +++ b/drivers/platform/chrome/wilco_ec/telemetry.c @@ -388,7 +388,7 @@ static int telem_device_probe(struct platform_device *pdev) dev_set_name(&dev_data->dev, TELEM_DEV_NAME_FMT, minor); device_initialize(&dev_data->dev); - /* Initialize the character device and add it to userspace */; + /* Initialize the character device and add it to userspace */ cdev_init(&dev_data->cdev, &telem_fops); error = cdev_device_add(&dev_data->cdev, &dev_data->dev); if (error) { diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index 18fb44139de251..d63aaad7ef5998 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -239,6 +239,14 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"), } }, + { + .ident = "MECHREVO Yilong15Pro Series GM5HG7A", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MECHREVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "Yilong15Pro Series GM5HG7A"), + } + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=220116 */ { .ident = "PCSpecialist Lafite Pro V 14M", @@ -248,6 +256,13 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Lafite Pro V 14M"), } }, + { + .ident = "TUXEDO Stellaris Slim 15 AMD Gen6", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GMxHGxx"), + } + }, { .ident = "TUXEDO InfinityBook Pro 14/15 AMD Gen10", .driver_data = &quirk_spurious_8042, diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ef988605c4da63..bc544a4a5266ee 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -403,6 +403,7 @@ static const struct acpi_device_id amd_pmf_acpi_ids[] = { {"AMDI0103", 0}, {"AMDI0105", 0}, {"AMDI0107", 0}, + {"AMDI0108", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmf_acpi_ids); diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 3a488cf9ca06c9..6a62bc5b02fda7 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -673,6 +673,8 @@ static void asus_nb_wmi_key_filter(struct asus_wmi_driver *asus_wmi, int *code, if (atkbd_reports_vol_keys) *code = ASUS_WMI_KEY_IGNORE; break; + case 0x5D: /* Wireless console Toggle */ + case 0x5E: /* Wireless console Enable / Keyboard Attach, Detach */ case 0x5F: /* Wireless console Disable / Special Key */ if (quirks->key_wlan_event) *code = quirks->key_wlan_event; diff --git a/drivers/platform/x86/dell/dell-lis3lv02d.c b/drivers/platform/x86/dell/dell-lis3lv02d.c index 732de5f556f83b..77905a9ddde9dd 100644 --- a/drivers/platform/x86/dell/dell-lis3lv02d.c +++ b/drivers/platform/x86/dell/dell-lis3lv02d.c @@ -48,6 +48,7 @@ static const struct dmi_system_id lis3lv02d_devices[] __initconst = { DELL_LIS3LV02D_DMI_ENTRY("Latitude 5500", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Latitude E6330", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Latitude E6430", 0x29), + DELL_LIS3LV02D_DMI_ENTRY("Latitude E6530", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision 3540", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision 3551", 0x29), DELL_LIS3LV02D_DMI_ENTRY("Precision M6800", 0x29), diff --git a/drivers/platform/x86/dell/dell-pc.c b/drivers/platform/x86/dell/dell-pc.c index 48cc7511905a62..becdd9aaef2970 100644 --- a/drivers/platform/x86/dell/dell-pc.c +++ b/drivers/platform/x86/dell/dell-pc.c @@ -228,6 +228,8 @@ static int thermal_platform_profile_get(struct device 
*dev, static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices) { + int current_mode; + if (supported_modes & DELL_QUIET) __set_bit(PLATFORM_PROFILE_QUIET, choices); if (supported_modes & DELL_COOL_BOTTOM) @@ -237,6 +239,13 @@ static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices) if (supported_modes & DELL_PERFORMANCE) __set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + /* Make sure that ACPI is in sync with the profile set by USTT */ + current_mode = thermal_get_mode(); + if (current_mode < 0) + return current_mode; + + thermal_set_mode(current_mode); + return 0; } diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 71e104a068e9e2..7449873c3d4068 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -790,7 +790,7 @@ static const struct x86_cpu_id isst_cpu_ids[] = { X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_ICELAKE_D, 0), X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, SST_HPM_SUPPORTED), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), X86_MATCH_VFM(INTEL_SKYLAKE_X, SST_MBOX_SUPPORTED), {} diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 8641353b2e0617..7d93119a4c30cc 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -85,7 +85,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, tpmi_cpu_ids); diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index 4b57102c7f6270..6af6cf477c5b5b 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -75,6 +76,9 @@ MODULE_PARM_DESC(fw_debug, "Enable printing of firmware debug messages"); #define WMBB_USB_CHARGE 0x10B #define WMBB_BATT_LIMIT 0x10C +#define FAN_MODE_LOWER GENMASK(1, 0) +#define FAN_MODE_UPPER GENMASK(5, 4) + #define PLATFORM_NAME "lg-laptop" MODULE_ALIAS("wmi:" WMI_EVENT_GUID0); @@ -274,29 +278,19 @@ static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr, const char *buffer, size_t count) { - bool value; + unsigned long value; union acpi_object *r; - u32 m; int ret; - ret = kstrtobool(buffer, &value); + ret = kstrtoul(buffer, 10, &value); if (ret) return ret; + if (value >= 3) + return -EINVAL; - r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); - if (!r) - return -EIO; - - if (r->type != ACPI_TYPE_INTEGER) { - kfree(r); - return -EIO; - } - - m = r->integer.value; - kfree(r); - r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xffffff0f) | (value << 4)); - kfree(r); - r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xfffffff0) | value); + r = lg_wmab(dev, WM_FAN_MODE, WM_SET, + FIELD_PREP(FAN_MODE_LOWER, value) | + FIELD_PREP(FAN_MODE_UPPER, value)); kfree(r); return count; @@ -305,7 +299,7 @@ static ssize_t fan_mode_store(struct device *dev, static ssize_t fan_mode_show(struct device *dev, struct device_attribute *attr, char *buffer) { - unsigned int 
status; + unsigned int mode; union acpi_object *r; r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0); @@ -317,10 +311,10 @@ static ssize_t fan_mode_show(struct device *dev, return -EIO; } - status = r->integer.value & 0x01; + mode = FIELD_GET(FAN_MODE_LOWER, r->integer.value); kfree(r); - return sysfs_emit(buffer, "%d\n", status); + return sysfs_emit(buffer, "%d\n", mode); } static ssize_t usb_charge_store(struct device *dev, diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c index eb076bb4099bed..54377b282ff885 100644 --- a/drivers/platform/x86/oxpec.c +++ b/drivers/platform/x86/oxpec.c @@ -124,6 +124,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)aok_zoe_a1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "AOKZOE"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "AOKZOE A1X"), + }, + .driver_data = (void *)oxp_fly, + }, { .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"), @@ -306,6 +313,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)oxp_x1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ONE-NETBOOK"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "ONEXPLAYER X1Pro EVA-02"), + }, + .driver_data = (void *)oxp_x1, + }, {}, }; diff --git a/drivers/pmdomain/Kconfig b/drivers/pmdomain/Kconfig index 91f04ace35d4b0..23076ae90e6641 100644 --- a/drivers/pmdomain/Kconfig +++ b/drivers/pmdomain/Kconfig @@ -7,6 +7,7 @@ source "drivers/pmdomain/apple/Kconfig" source "drivers/pmdomain/arm/Kconfig" source "drivers/pmdomain/bcm/Kconfig" source "drivers/pmdomain/imx/Kconfig" +source "drivers/pmdomain/marvell/Kconfig" source "drivers/pmdomain/mediatek/Kconfig" source "drivers/pmdomain/qcom/Kconfig" source "drivers/pmdomain/renesas/Kconfig" diff --git a/drivers/pmdomain/Makefile b/drivers/pmdomain/Makefile index 7030f44a49df9e..ebc802f13eb953 100644 --- a/drivers/pmdomain/Makefile +++ b/drivers/pmdomain/Makefile @@ -5,6 +5,7 @@ obj-y += apple/ obj-y += arm/ obj-y += bcm/ obj-y += imx/ +obj-y += marvell/ obj-y += mediatek/ obj-y += qcom/ obj-y += renesas/ diff --git a/drivers/pmdomain/amlogic/meson-secure-pwrc.c b/drivers/pmdomain/amlogic/meson-secure-pwrc.c index e8bda60078c455..1d2f371d2d7f04 100644 --- a/drivers/pmdomain/amlogic/meson-secure-pwrc.c +++ b/drivers/pmdomain/amlogic/meson-secure-pwrc.c @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -201,6 +204,71 @@ static const struct meson_secure_pwrc_domain_desc s4_pwrc_domains[] = { SEC_PD(S4_AUDIO, 0), }; +static const struct meson_secure_pwrc_domain_desc s6_pwrc_domains[] = { + SEC_PD(S6_DSPA, 0), + SEC_PD(S6_DOS_HEVC, 0), + SEC_PD(S6_DOS_VDEC, 0), + SEC_PD(S6_VPU_HDMI, 0), + SEC_PD(S6_U2DRD, 0), + SEC_PD(S6_U3DRD, 0), + SEC_PD(S6_SD_EMMC_C, 0), + SEC_PD(S6_GE2D, 0), + SEC_PD(S6_AMFC, 0), + SEC_PD(S6_VC9000E, 0), + SEC_PD(S6_DEWARP, 0), + SEC_PD(S6_VICP, 0), + SEC_PD(S6_SD_EMMC_A, 0), + SEC_PD(S6_SD_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S6_ETH, GENPD_FLAG_ALWAYS_ON), + SEC_PD(S6_PCIE, 0), + SEC_PD(S6_NNA_4T, 0), + SEC_PD(S6_AUDIO, 0), + SEC_PD(S6_AUCPU, 0), + SEC_PD(S6_ADAPT, 0), +}; + +static const struct meson_secure_pwrc_domain_desc s7_pwrc_domains[] = { + SEC_PD(S7_DOS_HEVC, 0), + SEC_PD(S7_DOS_VDEC, 0), + SEC_PD(S7_VPU_HDMI, 0), + SEC_PD(S7_USB_COMB, 0), + SEC_PD(S7_SD_EMMC_C, 0), + SEC_PD(S7_GE2D, 0), + SEC_PD(S7_SD_EMMC_A, 0), + SEC_PD(S7_SD_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S7_ETH, GENPD_FLAG_ALWAYS_ON), + 
SEC_PD(S7_AUCPU, 0), + SEC_PD(S7_AUDIO, 0), +}; + +static const struct meson_secure_pwrc_domain_desc s7d_pwrc_domains[] = { + SEC_PD(S7D_DOS_HCODEC, 0), + SEC_PD(S7D_DOS_HEVC, 0), + SEC_PD(S7D_DOS_VDEC, 0), + SEC_PD(S7D_VPU_HDMI, 0), + SEC_PD(S7D_USB_U2DRD, 0), + SEC_PD(S7D_USB_U2H, 0), + SEC_PD(S7D_SSD_EMMC_C, 0), + SEC_PD(S7D_GE2D, 0), + SEC_PD(S7D_AMFC, 0), + SEC_PD(S7D_EMMC_A, 0), + SEC_PD(S7D_EMMC_B, 0), + /* ETH is for ethernet online wakeup, and should be always on */ + SEC_PD(S7D_ETH, GENPD_FLAG_ALWAYS_ON), + SEC_PD(S7D_AUCPU, 0), + SEC_PD(S7D_AUDIO, 0), + /* SRAMA is used as ATF runtime memory, and should be always on */ + SEC_PD(S7D_SRAMA, GENPD_FLAG_ALWAYS_ON), + /* DMC0 is for DDR PHY ana/dig and DMC, and should be always on */ + SEC_PD(S7D_DMC0, GENPD_FLAG_ALWAYS_ON), + /* DMC1 is for DDR PHY ana/dig and DMC, and should be always on */ + SEC_PD(S7D_DMC1, GENPD_FLAG_ALWAYS_ON), + /* DDR should be always on */ + SEC_PD(S7D_DDR, GENPD_FLAG_ALWAYS_ON), +}; + static const struct meson_secure_pwrc_domain_desc t7_pwrc_domains[] = { SEC_PD(T7_DSPA, 0), SEC_PD(T7_DSPB, 0), @@ -367,6 +435,21 @@ static const struct meson_secure_pwrc_domain_data meson_secure_s4_pwrc_data = { .count = ARRAY_SIZE(s4_pwrc_domains), }; +static const struct meson_secure_pwrc_domain_data amlogic_secure_s6_pwrc_data = { + .domains = s6_pwrc_domains, + .count = ARRAY_SIZE(s6_pwrc_domains), +}; + +static const struct meson_secure_pwrc_domain_data amlogic_secure_s7_pwrc_data = { + .domains = s7_pwrc_domains, + .count = ARRAY_SIZE(s7_pwrc_domains), +}; + +static const struct meson_secure_pwrc_domain_data amlogic_secure_s7d_pwrc_data = { + .domains = s7d_pwrc_domains, + .count = ARRAY_SIZE(s7d_pwrc_domains), +}; + static const struct meson_secure_pwrc_domain_data amlogic_secure_t7_pwrc_data = { .domains = t7_pwrc_domains, .count = ARRAY_SIZE(t7_pwrc_domains), @@ -393,6 +476,18 @@ static const struct of_device_id meson_secure_pwrc_match_table[] = { .compatible = "amlogic,meson-s4-pwrc", .data = &meson_secure_s4_pwrc_data, }, + { + .compatible = "amlogic,s6-pwrc", + .data = &amlogic_secure_s6_pwrc_data, + }, + { + .compatible = "amlogic,s7-pwrc", + .data = &amlogic_secure_s7_pwrc_data, + }, + { + .compatible = "amlogic,s7d-pwrc", + .data = &amlogic_secure_s7d_pwrc_data, + }, { .compatible = "amlogic,t7-pwrc", .data = &amlogic_secure_t7_pwrc_data, diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 0006ab3d078972..61c2277c9ce39f 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = { #define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW) #define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW) #define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE) +#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON) static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -1357,7 +1358,6 @@ static int genpd_runtime_resume(struct device *dev) return ret; } -#ifndef CONFIG_PM_GENERIC_DOMAINS_OF static bool pd_ignore_unused; static int __init pd_ignore_unused_setup(char *__unused) { @@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void) mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) { - genpd_lock(genpd); - genpd->stay_on = false; - genpd_unlock(genpd); genpd_queue_power_off_work(genpd); } @@ -1393,7 +1390,6 @@ static int __init 
genpd_power_off_unused(void) return 0; } late_initcall_sync(genpd_power_off_unused); -#endif #ifdef CONFIG_PM_SLEEP @@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd) } } +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off; +} +#else +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = false; +} +#endif + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; - genpd->stay_on = !is_off; + genpd_set_stay_on(genpd, is_off); genpd->sync_state = GENPD_SYNC_STATE_OFF; genpd->device_count = 0; genpd->provider = NULL; diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c index f18c7e6e75ddc5..33991f3c6b5564 100644 --- a/drivers/pmdomain/imx/gpc.c +++ b/drivers/pmdomain/imx/gpc.c @@ -343,7 +343,6 @@ static const struct regmap_config imx_gpc_regmap_config = { .rd_table = &access_table, .wr_table = &access_table, .max_register = 0x2ac, - .fast_io = true, }; static struct generic_pm_domain *imx_gpc_onecell_domains[] = { diff --git a/drivers/pmdomain/imx/imx93-blk-ctrl.c b/drivers/pmdomain/imx/imx93-blk-ctrl.c index 0e2ba8ec55d757..e094fe5a42bf64 100644 --- a/drivers/pmdomain/imx/imx93-blk-ctrl.c +++ b/drivers/pmdomain/imx/imx93-blk-ctrl.c @@ -86,6 +86,7 @@ struct imx93_blk_ctrl_domain { struct imx93_blk_ctrl_data { const struct imx93_blk_ctrl_domain_data *domains; + u32 skip_mask; int num_domains; const char * const *clk_names; int num_clks; @@ -250,6 +251,8 @@ static int imx93_blk_ctrl_probe(struct platform_device *pdev) int j; domain->data = data; + if (bc_data->skip_mask & BIT(i)) + continue; for (j = 0; j < data->num_clks; j++) domain->clks[j].id = data->clk_names[j]; @@ -418,16 +421,32 @@ static const struct regmap_access_table imx93_media_blk_ctl_access_table = { .n_yes_ranges = ARRAY_SIZE(imx93_media_blk_ctl_yes_ranges), }; +static const char * const media_blk_clk_names[] = { + "axi", "apb", "nic" +}; + +static const struct imx93_blk_ctrl_data imx91_media_blk_ctl_dev_data = { + .domains = imx93_media_blk_ctl_domain_data, + .skip_mask = BIT(IMX93_MEDIABLK_PD_MIPI_DSI) | BIT(IMX93_MEDIABLK_PD_PXP), + .num_domains = ARRAY_SIZE(imx93_media_blk_ctl_domain_data), + .clk_names = media_blk_clk_names, + .num_clks = ARRAY_SIZE(media_blk_clk_names), + .reg_access_table = &imx93_media_blk_ctl_access_table, +}; + static const struct imx93_blk_ctrl_data imx93_media_blk_ctl_dev_data = { .domains = imx93_media_blk_ctl_domain_data, .num_domains = ARRAY_SIZE(imx93_media_blk_ctl_domain_data), - .clk_names = (const char *[]){ "axi", "apb", "nic", }, - .num_clks = 3, + .clk_names = media_blk_clk_names, + .num_clks = ARRAY_SIZE(media_blk_clk_names), .reg_access_table = &imx93_media_blk_ctl_access_table, }; static const struct of_device_id imx93_blk_ctrl_of_match[] = { { + .compatible = "fsl,imx91-media-blk-ctrl", + .data = &imx91_media_blk_ctl_dev_data + }, { .compatible = "fsl,imx93-media-blk-ctrl", .data = &imx93_media_blk_ctl_dev_data }, { diff --git a/drivers/pmdomain/marvell/Kconfig b/drivers/pmdomain/marvell/Kconfig new file mode 100644 index 00000000000000..6c4084c8266702 --- /dev/null +++ b/drivers/pmdomain/marvell/Kconfig 
@@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Marvell PM Domains" + depends on ARCH_MMP || COMPILE_TEST + +config PXA1908_PM_DOMAINS + tristate "Marvell PXA1908 power domains" + depends on OF + depends on PM + default y if ARCH_MMP && ARM64 + select AUXILIARY_BUS + select MFD_SYSCON + select PM_GENERIC_DOMAINS + select PM_GENERIC_DOMAINS_OF + help + Say Y here to enable support for Marvell PXA1908's power domains. + +endmenu diff --git a/drivers/pmdomain/marvell/Makefile b/drivers/pmdomain/marvell/Makefile new file mode 100644 index 00000000000000..22c25013f6c856 --- /dev/null +++ b/drivers/pmdomain/marvell/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_PXA1908_PM_DOMAINS) += pxa1908-power-controller.o diff --git a/drivers/pmdomain/marvell/pxa1908-power-controller.c b/drivers/pmdomain/marvell/pxa1908-power-controller.c new file mode 100644 index 00000000000000..ff5e6e82d3f8df --- /dev/null +++ b/drivers/pmdomain/marvell/pxa1908-power-controller.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Duje Mihanović + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* VPU, GPU, ISP */ +#define APMU_PWR_CTRL_REG 0xd8 +#define APMU_PWR_BLK_TMR_REG 0xdc +#define APMU_PWR_STATUS_REG 0xf0 + +/* DSI */ +#define APMU_DEBUG 0x88 +#define DSI_PHY_DVM_MASK BIT(31) + +#define POWER_ON_LATENCY_US 300 +#define POWER_OFF_LATENCY_US 20 +#define POWER_POLL_TIMEOUT_US (25 * USEC_PER_MSEC) +#define POWER_POLL_SLEEP_US 6 + +#define NR_DOMAINS 5 + +#define to_pxa1908_pd(_genpd) container_of(_genpd, struct pxa1908_pd, genpd) + +struct pxa1908_pd_ctrl { + struct generic_pm_domain *domains[NR_DOMAINS]; + struct genpd_onecell_data onecell_data; + struct regmap *base; + struct device *dev; +}; + +struct pxa1908_pd_data { + u32 reg_clk_res_ctrl; + u32 pwr_state; + u32 hw_mode; + bool keep_on; + int id; +}; + +struct pxa1908_pd { + const struct pxa1908_pd_data data; + struct pxa1908_pd_ctrl *ctrl; + struct generic_pm_domain genpd; + bool initialized; +}; + +static inline bool pxa1908_pd_is_on(struct pxa1908_pd *pd) +{ + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return pd->data.id != PXA1908_POWER_DOMAIN_DSI + ?
regmap_test_bits(ctrl->base, APMU_PWR_STATUS_REG, pd->data.pwr_state) + : regmap_test_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +static int pxa1908_pd_power_on(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + const struct pxa1908_pd_data *data = &pd->data; + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + unsigned int status; + int ret = 0; + + regmap_set_bits(ctrl->base, data->reg_clk_res_ctrl, data->hw_mode); + if (data->id != PXA1908_POWER_DOMAIN_ISP) + regmap_write(ctrl->base, APMU_PWR_BLK_TMR_REG, 0x20001fff); + regmap_set_bits(ctrl->base, APMU_PWR_CTRL_REG, data->pwr_state); + + ret = regmap_read_poll_timeout(ctrl->base, APMU_PWR_STATUS_REG, status, + status & data->pwr_state, POWER_POLL_SLEEP_US, + POWER_ON_LATENCY_US + POWER_POLL_TIMEOUT_US); + if (ret == -ETIMEDOUT) + dev_err(ctrl->dev, "timed out powering on domain '%s'\n", pd->genpd.name); + + return ret; +} + +static int pxa1908_pd_power_off(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + const struct pxa1908_pd_data *data = &pd->data; + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + unsigned int status; + int ret; + + regmap_clear_bits(ctrl->base, APMU_PWR_CTRL_REG, data->pwr_state); + + ret = regmap_read_poll_timeout(ctrl->base, APMU_PWR_STATUS_REG, status, + !(status & data->pwr_state), POWER_POLL_SLEEP_US, + POWER_OFF_LATENCY_US + POWER_POLL_TIMEOUT_US); + if (ret == -ETIMEDOUT) { + dev_err(ctrl->dev, "timed out powering off domain '%s'\n", pd->genpd.name); + return ret; + } + + return regmap_clear_bits(ctrl->base, data->reg_clk_res_ctrl, data->hw_mode); +} + +static inline int pxa1908_dsi_power_on(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return regmap_set_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +static inline int pxa1908_dsi_power_off(struct generic_pm_domain *genpd) +{ + struct pxa1908_pd *pd = to_pxa1908_pd(genpd); + struct pxa1908_pd_ctrl *ctrl = pd->ctrl; + + return regmap_clear_bits(ctrl->base, APMU_DEBUG, DSI_PHY_DVM_MASK); +} + +#define DOMAIN(_id, _name, ctrl, mode, state) \ + [_id] = { \ + .data = { \ + .reg_clk_res_ctrl = ctrl, \ + .hw_mode = BIT(mode), \ + .pwr_state = BIT(state), \ + .id = _id, \ + }, \ + .genpd = { \ + .name = _name, \ + .power_on = pxa1908_pd_power_on, \ + .power_off = pxa1908_pd_power_off, \ + }, \ + } + +static struct pxa1908_pd domains[NR_DOMAINS] = { + DOMAIN(PXA1908_POWER_DOMAIN_VPU, "vpu", 0xa4, 19, 2), + DOMAIN(PXA1908_POWER_DOMAIN_GPU, "gpu", 0xcc, 11, 0), + DOMAIN(PXA1908_POWER_DOMAIN_GPU2D, "gpu2d", 0xf4, 11, 6), + DOMAIN(PXA1908_POWER_DOMAIN_ISP, "isp", 0x38, 15, 4), + [PXA1908_POWER_DOMAIN_DSI] = { + .genpd = { + .name = "dsi", + .power_on = pxa1908_dsi_power_on, + .power_off = pxa1908_dsi_power_off, + /* + * TODO: There is no DSI driver written yet and until then we probably + * don't want to power off the DSI PHY ever. + */ + .flags = GENPD_FLAG_ALWAYS_ON, + }, + .data = { + /* See above. 
*/ + .keep_on = true, + }, + }, +}; + +static void pxa1908_pd_remove(struct auxiliary_device *auxdev) +{ + struct pxa1908_pd *pd; + int ret; + + for (int i = NR_DOMAINS - 1; i >= 0; i--) { + pd = &domains[i]; + + if (!pd->initialized) + continue; + + if (pxa1908_pd_is_on(pd) && !pd->data.keep_on) + pxa1908_pd_power_off(&pd->genpd); + + ret = pm_genpd_remove(&pd->genpd); + if (ret) + dev_err(&auxdev->dev, "failed to remove domain '%s': %d\n", + pd->genpd.name, ret); + } +} + +static int +pxa1908_pd_init(struct pxa1908_pd_ctrl *ctrl, int id, struct device *dev) +{ + struct pxa1908_pd *pd = &domains[id]; + int ret; + + ctrl->domains[id] = &pd->genpd; + + pd->ctrl = ctrl; + + /* Make sure the state of the hardware is synced with the domain table above. */ + if (pd->data.keep_on) { + ret = pd->genpd.power_on(&pd->genpd); + if (ret) + return dev_err_probe(dev, ret, "failed to power on domain '%s'\n", + pd->genpd.name); + } else { + if (pxa1908_pd_is_on(pd)) { + dev_warn(dev, + "domain '%s' is on despite being default off; powering off\n", + pd->genpd.name); + + ret = pd->genpd.power_off(&pd->genpd); + if (ret) + return dev_err_probe(dev, ret, + "failed to power off domain '%s'\n", + pd->genpd.name); + } + } + + ret = pm_genpd_init(&pd->genpd, NULL, !pd->data.keep_on); + if (ret) + return dev_err_probe(dev, ret, "domain '%s' failed to initialize\n", + pd->genpd.name); + + pd->initialized = true; + + return 0; +} + +static int +pxa1908_pd_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *aux_id) +{ + struct pxa1908_pd_ctrl *ctrl; + struct device *dev = &auxdev->dev; + int ret; + + ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return -ENOMEM; + + auxiliary_set_drvdata(auxdev, ctrl); + + ctrl->base = syscon_node_to_regmap(dev->parent->of_node); + if (IS_ERR(ctrl->base)) + return dev_err_probe(dev, PTR_ERR(ctrl->base), "no regmap available\n"); + + ctrl->dev = dev; + ctrl->onecell_data.domains = ctrl->domains; + ctrl->onecell_data.num_domains = NR_DOMAINS; + + for (int i = 0; i < NR_DOMAINS; i++) { + ret = pxa1908_pd_init(ctrl, i, dev); + if (ret) + goto err; + } + + return of_genpd_add_provider_onecell(dev->parent->of_node, &ctrl->onecell_data); + +err: + pxa1908_pd_remove(auxdev); + return ret; +} + +static const struct auxiliary_device_id pxa1908_pd_id[] = { + { .name = "clk_pxa1908_apmu.power" }, + { } +}; +MODULE_DEVICE_TABLE(auxiliary, pxa1908_pd_id); + +static struct auxiliary_driver pxa1908_pd_driver = { + .probe = pxa1908_pd_probe, + .remove = pxa1908_pd_remove, + .id_table = pxa1908_pd_id, +}; +module_auxiliary_driver(pxa1908_pd_driver); + +MODULE_AUTHOR("Duje Mihanović "); +MODULE_DESCRIPTION("Marvell PXA1908 power domain driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c b/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c index 0fd88d2f9ac29d..3b1d202f89dc51 100644 --- a/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c +++ b/drivers/pmdomain/mediatek/airoha-cpu-pmdomain.c @@ -21,10 +21,10 @@ struct airoha_cpu_pmdomain_priv { struct generic_pm_domain pd; }; -static long airoha_cpu_pmdomain_clk_round(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate) +static int airoha_cpu_pmdomain_clk_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) { - return rate; + return 0; } static unsigned long airoha_cpu_pmdomain_clk_get(struct clk_hw *hw, @@ -48,7 +48,7 @@ static int airoha_cpu_pmdomain_clk_is_enabled(struct clk_hw *hw) static const struct clk_ops airoha_cpu_pmdomain_clk_ops = { 
.recalc_rate = airoha_cpu_pmdomain_clk_get, .is_enabled = airoha_cpu_pmdomain_clk_is_enabled, - .round_rate = airoha_cpu_pmdomain_clk_round, + .determine_rate = airoha_cpu_pmdomain_clk_determine_rate, }; static int airoha_cpu_pmdomain_set_performance_state(struct generic_pm_domain *domain, diff --git a/drivers/pmdomain/mediatek/mt6795-pm-domains.h b/drivers/pmdomain/mediatek/mt6795-pm-domains.h index a3f7785b04bd38..dc8e9f8877addd 100644 --- a/drivers/pmdomain/mediatek/mt6795-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt6795-pm-domains.h @@ -9,6 +9,9 @@ /* * MT6795 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt6795[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt6795[] = { [MT6795_POWER_DOMAIN_VDEC] = { @@ -107,6 +110,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt6795[] = { static const struct scpsys_soc_data mt6795_scpsys_data = { .domains_data = scpsys_domain_data_mt6795, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt6795), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt6795, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt6795), }; #endif /* __SOC_MEDIATEK_MT6795_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8167-pm-domains.h b/drivers/pmdomain/mediatek/mt8167-pm-domains.h index 8a0e898b79ab8b..f6ee48a711a16a 100644 --- a/drivers/pmdomain/mediatek/mt8167-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8167-pm-domains.h @@ -12,6 +12,9 @@ /* * MT8167 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8167[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8167[] = { [MT8167_POWER_DOMAIN_MM] = { @@ -99,6 +102,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8167[] = { static const struct scpsys_soc_data mt8167_scpsys_data = { .domains_data = scpsys_domain_data_mt8167, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8167), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8167, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8167), }; #endif /* __SOC_MEDIATEK_MT8167_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8173-pm-domains.h b/drivers/pmdomain/mediatek/mt8173-pm-domains.h index 7be0f47f521404..561a644b5d1cb2 100644 --- a/drivers/pmdomain/mediatek/mt8173-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8173-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8173 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8173[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8173[] = { [MT8173_POWER_DOMAIN_VDEC] = { @@ -118,6 +121,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8173[] = { static const struct scpsys_soc_data mt8173_scpsys_data = { .domains_data = scpsys_domain_data_mt8173, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8173), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8173, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8173), }; #endif /* __SOC_MEDIATEK_MT8173_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8183-pm-domains.h b/drivers/pmdomain/mediatek/mt8183-pm-domains.h index c4c1b63d85b194..3742782a2702e4 100644 --- a/drivers/pmdomain/mediatek/mt8183-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8183-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8183 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8183[] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_SMI +}; static const struct scpsys_domain_data 
scpsys_domain_data_mt8183[] = { [MT8183_POWER_DOMAIN_AUDIO] = { @@ -290,6 +293,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8183[] = { static const struct scpsys_soc_data mt8183_scpsys_data = { .domains_data = scpsys_domain_data_mt8183, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8183), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8183, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8183), }; #endif /* __SOC_MEDIATEK_MT8183_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8186-pm-domains.h b/drivers/pmdomain/mediatek/mt8186-pm-domains.h index cbac715c38fac2..00b9861af7c9c0 100644 --- a/drivers/pmdomain/mediatek/mt8186-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8186-pm-domains.h @@ -13,6 +13,9 @@ /* * MT8186 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8186[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8186[] = { [MT8186_POWER_DOMAIN_MFG0] = { @@ -361,6 +364,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8186[] = { static const struct scpsys_soc_data mt8186_scpsys_data = { .domains_data = scpsys_domain_data_mt8186, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8186), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8186, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8186), }; #endif /* __SOC_MEDIATEK_MT8186_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8188-pm-domains.h b/drivers/pmdomain/mediatek/mt8188-pm-domains.h index 007235be9efe59..3a989e83e9b791 100644 --- a/drivers/pmdomain/mediatek/mt8188-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8188-pm-domains.h @@ -14,6 +14,10 @@ * MT8188 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8188[] = { + BUS_PROT_BLOCK_INFRA +}; + static const struct scpsys_domain_data scpsys_domain_data_mt8188[] = { [MT8188_POWER_DOMAIN_MFG0] = { .name = "mfg0", @@ -685,6 +689,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8188[] = { static const struct scpsys_soc_data mt8188_scpsys_data = { .domains_data = scpsys_domain_data_mt8188, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8188), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8188, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8188), }; #endif /* __SOC_MEDIATEK_MT8188_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8192-pm-domains.h b/drivers/pmdomain/mediatek/mt8192-pm-domains.h index 6f139eed376937..5d62fac5f68231 100644 --- a/drivers/pmdomain/mediatek/mt8192-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8192-pm-domains.h @@ -9,6 +9,9 @@ /* * MT8192 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8192[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { [MT8192_POWER_DOMAIN_AUDIO] = { @@ -380,6 +383,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { static const struct scpsys_soc_data mt8192_scpsys_data = { .domains_data = scpsys_domain_data_mt8192, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8192), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8192, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8192), }; #endif /* __SOC_MEDIATEK_MT8192_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8195-pm-domains.h b/drivers/pmdomain/mediatek/mt8195-pm-domains.h index 59aa031ae6323b..1d3ca195ac7580 100644 --- a/drivers/pmdomain/mediatek/mt8195-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8195-pm-domains.h @@ -13,6 +13,9 @@ /* 
* MT8195 power domain support */ +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8195[] = { + BUS_PROT_BLOCK_INFRA +}; static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { [MT8195_POWER_DOMAIN_PCIE_MAC_P0] = { @@ -123,6 +126,7 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { MT8195_TOP_AXI_PROT_EN_2_CLR, MT8195_TOP_AXI_PROT_EN_2_STA1), }, + .caps = MTK_SCPD_KEEP_DEFAULT_OFF | MTK_SCPD_ACTIVE_WAKEUP, }, [MT8195_POWER_DOMAIN_MFG0] = { .name = "mfg0", @@ -661,6 +665,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = { static const struct scpsys_soc_data mt8195_scpsys_data = { .domains_data = scpsys_domain_data_mt8195, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8195), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8195, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8195), }; #endif /* __SOC_MEDIATEK_MT8195_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mt8365-pm-domains.h b/drivers/pmdomain/mediatek/mt8365-pm-domains.h index 3d83d49eaa7c8d..33265ab8ce76f2 100644 --- a/drivers/pmdomain/mediatek/mt8365-pm-domains.h +++ b/drivers/pmdomain/mediatek/mt8365-pm-domains.h @@ -29,11 +29,13 @@ MT8365_SMI_COMMON_CLAMP_EN) #define MT8365_BUS_PROT_WAY_EN(_set_mask, _set, _sta_mask, _sta) \ - _BUS_PROT(_set_mask, _set, _set, _sta_mask, _sta, \ - BUS_PROT_COMPONENT_INFRA | \ - BUS_PROT_STA_COMPONENT_INFRA_NAO | \ - BUS_PROT_INVERTED | \ - BUS_PROT_REG_UPDATE) + _BUS_PROT_STA(INFRA, INFRA_NAO, _set_mask, _set, _set, \ + _sta_mask, _sta, \ + BUS_PROT_INVERTED | BUS_PROT_REG_UPDATE) + +static enum scpsys_bus_prot_block scpsys_bus_prot_blocks_mt8365[] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_INFRA_NAO, BUS_PROT_BLOCK_SMI +}; static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = { [MT8365_POWER_DOMAIN_MM] = { @@ -192,6 +194,8 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = { static const struct scpsys_soc_data mt8365_scpsys_data = { .domains_data = scpsys_domain_data_mt8365, .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8365), + .bus_prot_blocks = scpsys_bus_prot_blocks_mt8365, + .num_bus_prot_blocks = ARRAY_SIZE(scpsys_bus_prot_blocks_mt8365), }; #endif /* __SOC_MEDIATEK_MT8365_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index a58ed7e2d9a479..0ebe7379b94e50 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -39,6 +39,12 @@ #define PWR_SRAM_CLKISO_BIT BIT(5) #define PWR_SRAM_ISOINT_B_BIT BIT(6) +#define PWR_RTFF_SAVE BIT(24) +#define PWR_RTFF_NRESTORE BIT(25) +#define PWR_RTFF_CLK_DIS BIT(26) +#define PWR_RTFF_SAVE_FLAG BIT(27) +#define PWR_RTFF_UFS_CLK_DIS BIT(28) + struct scpsys_domain { struct generic_pm_domain genpd; const struct scpsys_domain_data *data; @@ -47,9 +53,6 @@ struct scpsys_domain { struct clk_bulk_data *clks; int num_subsys_clks; struct clk_bulk_data *subsys_clks; - struct regmap *infracfg_nao; - struct regmap *infracfg; - struct regmap *smi; struct regulator *supply; }; @@ -57,6 +60,8 @@ struct scpsys { struct device *dev; struct regmap *base; const struct scpsys_soc_data *soc_data; + u8 bus_prot_index[BUS_PROT_BLOCK_COUNT]; + struct regmap **bus_prot; struct genpd_onecell_data pd_data; struct generic_pm_domain *domains[]; }; @@ -80,16 +85,23 @@ static bool scpsys_domain_is_on(struct scpsys_domain *pd) static int scpsys_sram_enable(struct scpsys_domain *pd) { - u32 pdn_ack = pd->data->sram_pdn_ack_bits; + u32 expected_ack, 
pdn_ack = pd->data->sram_pdn_ack_bits; struct scpsys *scpsys = pd->scpsys; unsigned int tmp; int ret; - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_SRAM_PDN_INVERTED)) { + regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = pdn_ack; + } else { + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = 0; + } /* Either wait until SRAM_PDN_ACK all 1 or 0 */ ret = regmap_read_poll_timeout(scpsys->base, pd->data->ctl_offs, tmp, - (tmp & pdn_ack) == 0, MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); + (tmp & pdn_ack) == expected_ack, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret < 0) return ret; @@ -104,7 +116,7 @@ static int scpsys_sram_enable(struct scpsys_domain *pd) static int scpsys_sram_disable(struct scpsys_domain *pd) { - u32 pdn_ack = pd->data->sram_pdn_ack_bits; + u32 expected_ack, pdn_ack = pd->data->sram_pdn_ack_bits; struct scpsys *scpsys = pd->scpsys; unsigned int tmp; @@ -114,30 +126,36 @@ static int scpsys_sram_disable(struct scpsys_domain *pd) regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_SRAM_ISOINT_B_BIT); } - regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_SRAM_PDN_INVERTED)) { + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = 0; + } else { + regmap_set_bits(scpsys->base, pd->data->ctl_offs, pd->data->sram_pdn_bits); + expected_ack = pdn_ack; + } /* Either wait until SRAM_PDN_ACK all 1 or 0 */ return regmap_read_poll_timeout(scpsys->base, pd->data->ctl_offs, tmp, - (tmp & pdn_ack) == pdn_ack, MTK_POLL_DELAY_US, - MTK_POLL_TIMEOUT); + (tmp & pdn_ack) == expected_ack, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); } static struct regmap *scpsys_bus_protect_get_regmap(struct scpsys_domain *pd, const struct scpsys_bus_prot_data *bpd) { - if (bpd->flags & BUS_PROT_COMPONENT_SMI) - return pd->smi; - else - return pd->infracfg; + struct scpsys *scpsys = pd->scpsys; + unsigned short block_idx = scpsys->bus_prot_index[bpd->bus_prot_block]; + + return scpsys->bus_prot[block_idx]; } static struct regmap *scpsys_bus_protect_get_sta_regmap(struct scpsys_domain *pd, const struct scpsys_bus_prot_data *bpd) { - if (bpd->flags & BUS_PROT_STA_COMPONENT_INFRA_NAO) - return pd->infracfg_nao; - else - return scpsys_bus_protect_get_regmap(pd, bpd); + struct scpsys *scpsys = pd->scpsys; + int block_idx = scpsys->bus_prot_index[bpd->bus_prot_sta_block]; + + return scpsys->bus_prot[block_idx]; } static int scpsys_bus_protect_clear(struct scpsys_domain *pd, @@ -149,7 +167,7 @@ static int scpsys_bus_protect_clear(struct scpsys_domain *pd, u32 expected_ack; u32 val; - expected_ack = (bpd->flags & BUS_PROT_STA_COMPONENT_INFRA_NAO ? sta_mask : 0); + expected_ack = (bpd->bus_prot_sta_block == BUS_PROT_BLOCK_INFRA_NAO ? sta_mask : 0); if (bpd->flags & BUS_PROT_REG_UPDATE) regmap_clear_bits(regmap, bpd->bus_prot_clr, bpd->bus_prot_set_clr_mask); @@ -232,11 +250,161 @@ static int scpsys_regulator_disable(struct regulator *supply) return supply ? 
regulator_disable(supply) : 0; } +static int scpsys_ctl_pwrseq_on(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + bool do_rtff_nrestore, tmp; + int ret; + + /* subsys power on */ + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); + + /* wait until PWR_ACK = 1 */ + ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, + MTK_POLL_TIMEOUT); + if (ret < 0) + return ret; + + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + + /* Wait for RTFF HW to sync buck isolation state if this is PCIe PHY RTFF */ + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + udelay(5); + + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + + /* + * RTFF HW state may be modified by secure world or remote processors. + * + * With the only exception of STOR_UFS, which always needs save/restore, + * check if this power domain's RTFF is already on before trying to do + * the NRESTORE procedure, otherwise the system will lock up. + */ + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + case SCPSYS_RTFF_TYPE_PCIE_PHY: + { + u32 ctl_status; + + regmap_read(scpsys->base, pd->data->ctl_offs, &ctl_status); + do_rtff_nrestore = ctl_status & PWR_RTFF_SAVE_FLAG; + break; + } + case SCPSYS_RTFF_TYPE_STOR_UFS: + /* STOR_UFS always needs NRESTORE */ + do_rtff_nrestore = true; + break; + default: + do_rtff_nrestore = false; + break; + } + + /* Return early if RTFF NRESTORE shall not be done */ + if (!do_rtff_nrestore) + return 0; + + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + break; + case SCPSYS_RTFF_TYPE_PCIE_PHY: + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + break; + case SCPSYS_RTFF_TYPE_STOR_UFS: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_NRESTORE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + break; + default: + break; + } + + return 0; +} + +static void scpsys_ctl_pwrseq_off(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + + switch (pd->data->rtff_type) { + case SCPSYS_RTFF_TYPE_GENERIC: + case SCPSYS_RTFF_TYPE_PCIE_PHY: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE_FLAG); + break; + case 
SCPSYS_RTFF_TYPE_STOR_UFS: + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_SAVE); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RTFF_UFS_CLK_DIS); + break; + default: + break; + } + + /* subsys power off */ + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); + + /* Wait for RTFF HW to sync buck isolation state if this is PCIe PHY RTFF */ + if (pd->data->rtff_type == SCPSYS_RTFF_TYPE_PCIE_PHY) + udelay(1); + + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); +} + +static int scpsys_modem_pwrseq_on(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + bool tmp; + int ret; + + if (!MTK_SCPD_CAPS(pd, MTK_SCPD_SKIP_RESET_B)) + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); + + regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + + /* wait until PWR_ACK = 1 */ + ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, + MTK_POLL_TIMEOUT); + if (ret < 0) + return ret; + + return 0; +} + +static void scpsys_modem_pwrseq_off(struct scpsys_domain *pd) +{ + struct scpsys *scpsys = pd->scpsys; + + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + + if (!MTK_SCPD_CAPS(pd, MTK_SCPD_SKIP_RESET_B)) + regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); +} + static int scpsys_power_on(struct generic_pm_domain *genpd) { struct scpsys_domain *pd = container_of(genpd, struct scpsys_domain, genpd); struct scpsys *scpsys = pd->scpsys; - bool tmp; int ret; ret = scpsys_regulator_enable(pd->supply); @@ -251,20 +419,14 @@ static int scpsys_power_on(struct generic_pm_domain *genpd) regmap_clear_bits(scpsys->base, pd->data->ext_buck_iso_offs, pd->data->ext_buck_iso_mask); - /* subsys power on */ - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_MODEM_PWRSEQ)) + ret = scpsys_modem_pwrseq_on(pd); + else + ret = scpsys_ctl_pwrseq_on(pd); - /* wait until PWR_ACK = 1 */ - ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, tmp, MTK_POLL_DELAY_US, - MTK_POLL_TIMEOUT); - if (ret < 0) + if (ret) goto err_pwr_ack; - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); - /* * In few Mediatek platforms(e.g. 
MT6779), the bus protect policy is * stricter, which leads to bus protect release must be prior to bus @@ -330,12 +492,10 @@ static int scpsys_power_off(struct generic_pm_domain *genpd) clk_bulk_disable_unprepare(pd->num_subsys_clks, pd->subsys_clks); - /* subsys power off */ - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_ISO_BIT); - regmap_set_bits(scpsys->base, pd->data->ctl_offs, PWR_CLK_DIS_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_RST_B_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_2ND_BIT); - regmap_clear_bits(scpsys->base, pd->data->ctl_offs, PWR_ON_BIT); + if (MTK_SCPD_CAPS(pd, MTK_SCPD_MODEM_PWRSEQ)) + scpsys_modem_pwrseq_off(pd); + else + scpsys_ctl_pwrseq_off(pd); /* wait until PWR_ACK = 0 */ ret = readx_poll_timeout(scpsys_domain_is_on, pd, tmp, !tmp, MTK_POLL_DELAY_US, @@ -355,7 +515,6 @@ generic_pm_domain *scpsys_add_one_domain(struct scpsys *scpsys, struct device_no { const struct scpsys_domain_data *domain_data; struct scpsys_domain *pd; - struct device_node *smi_node; struct property *prop; const char *clk_name; int i, ret, num_clks; @@ -396,32 +555,6 @@ generic_pm_domain *scpsys_add_one_domain(struct scpsys *scpsys, struct device_no node); } - pd->infracfg = syscon_regmap_lookup_by_phandle_optional(node, "mediatek,infracfg"); - if (IS_ERR(pd->infracfg)) - return dev_err_cast_probe(scpsys->dev, pd->infracfg, - "%pOF: failed to get infracfg regmap\n", - node); - - smi_node = of_parse_phandle(node, "mediatek,smi", 0); - if (smi_node) { - pd->smi = device_node_to_regmap(smi_node); - of_node_put(smi_node); - if (IS_ERR(pd->smi)) - return dev_err_cast_probe(scpsys->dev, pd->smi, - "%pOF: failed to get SMI regmap\n", - node); - } - - if (MTK_SCPD_CAPS(pd, MTK_SCPD_HAS_INFRA_NAO)) { - pd->infracfg_nao = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); - if (IS_ERR(pd->infracfg_nao)) - return dev_err_cast_probe(scpsys->dev, pd->infracfg_nao, - "%pOF: failed to get infracfg-nao regmap\n", - node); - } else { - pd->infracfg_nao = NULL; - } - num_clks = of_clk_get_parent_count(node); if (num_clks > 0) { /* Calculate number of subsys_clks */ @@ -615,6 +748,136 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys) } } +static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *scpsys) +{ + const u8 bp_blocks[3] = { + BUS_PROT_BLOCK_INFRA, BUS_PROT_BLOCK_SMI, BUS_PROT_BLOCK_INFRA_NAO + }; + struct device_node *np = dev->of_node; + struct device_node *node, *smi_np; + int num_regmaps = 0, i, j; + struct regmap *regmap[3]; + + /* + * Legacy code retrieves a maximum of three bus protection handles: + * some may be optional, or may not be, so the array of bp blocks + * that is normally passed in as platform data must be dynamically + * built in this case. + * + * Here, try to retrieve all of the regmaps that the legacy code + * supported and then count the number of the ones that are present, + * this makes it then possible to allocate the array of bus_prot + * regmaps and convert all to the new style handling. 
+ */ + node = of_find_node_with_property(np, "mediatek,infracfg"); + if (node) { + regmap[0] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg"); + of_node_put(node); + num_regmaps++; + if (IS_ERR(regmap[0])) + return dev_err_probe(dev, PTR_ERR(regmap[0]), + "%pOF: failed to get infracfg regmap\n", + node); + } else { + regmap[0] = NULL; + } + + node = of_find_node_with_property(np, "mediatek,smi"); + if (node) { + smi_np = of_parse_phandle(node, "mediatek,smi", 0); + of_node_put(node); + if (!smi_np) + return -ENODEV; + + regmap[1] = device_node_to_regmap(smi_np); + num_regmaps++; + of_node_put(smi_np); + if (IS_ERR(regmap[1])) + return dev_err_probe(dev, PTR_ERR(regmap[1]), + "%pOF: failed to get SMI regmap\n", + node); + } else { + regmap[1] = NULL; + } + + node = of_find_node_with_property(np, "mediatek,infracfg-nao"); + if (node) { + regmap[2] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); + num_regmaps++; + of_node_put(node); + if (IS_ERR(regmap[2])) + return dev_err_probe(dev, PTR_ERR(regmap[2]), + "%pOF: failed to get infracfg regmap\n", + node); + } else { + regmap[2] = NULL; + } + + scpsys->bus_prot = devm_kmalloc_array(dev, num_regmaps, + sizeof(*scpsys->bus_prot), GFP_KERNEL); + if (!scpsys->bus_prot) + return -ENOMEM; + + for (i = 0, j = 0; i < ARRAY_SIZE(bp_blocks); i++) { + enum scpsys_bus_prot_block bp_type; + + if (!regmap[i]) + continue; + + bp_type = bp_blocks[i]; + scpsys->bus_prot_index[bp_type] = j; + scpsys->bus_prot[j] = regmap[i]; + + j++; + } + + return 0; +} + +static int scpsys_get_bus_protection(struct device *dev, struct scpsys *scpsys) +{ + const struct scpsys_soc_data *soc = scpsys->soc_data; + struct device_node *np = dev->of_node; + int i, num_handles; + + num_handles = of_count_phandle_with_args(np, "access-controllers", NULL); + if (num_handles < 0 || num_handles != soc->num_bus_prot_blocks) + return dev_err_probe(dev, -EINVAL, + "Cannot get access controllers: expected %u, got %d\n", + soc->num_bus_prot_blocks, num_handles); + + scpsys->bus_prot = devm_kmalloc_array(dev, soc->num_bus_prot_blocks, + sizeof(*scpsys->bus_prot), GFP_KERNEL); + if (!scpsys->bus_prot) + return -ENOMEM; + + for (i = 0; i < soc->num_bus_prot_blocks; i++) { + enum scpsys_bus_prot_block bp_type; + struct device_node *node; + + node = of_parse_phandle(np, "access-controllers", i); + if (!node) + return -EINVAL; + + /* + * Index the bus protection regmaps so that we don't have to + * find the right one by type with a loop at every execution + * of power sequence(s). + */ + bp_type = soc->bus_prot_blocks[i]; + scpsys->bus_prot_index[bp_type] = i; + + scpsys->bus_prot[i] = device_node_to_regmap(node); + of_node_put(node); + if (IS_ERR_OR_NULL(scpsys->bus_prot[i])) + return dev_err_probe(dev, scpsys->bus_prot[i] ? 
+ PTR_ERR(scpsys->bus_prot[i]) : -ENXIO, + "Cannot get regmap for access controller %d\n", i); + } + + return 0; +} + static const struct of_device_id scpsys_of_match[] = { { .compatible = "mediatek,mt6735-power-controller", @@ -701,6 +964,14 @@ static int scpsys_probe(struct platform_device *pdev) return PTR_ERR(scpsys->base); } + if (of_find_property(np, "access-controllers", NULL)) + ret = scpsys_get_bus_protection(dev, scpsys); + else + ret = scpsys_get_bus_protection_legacy(dev, scpsys); + + if (ret) + return ret; + ret = -ENODEV; for_each_available_child_of_node(np, node) { struct generic_pm_domain *domain; diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h index 7085fa2976e98b..b2e3dee0383119 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.h +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.h @@ -13,6 +13,9 @@ #define MTK_SCPD_EXT_BUCK_ISO BIT(6) #define MTK_SCPD_HAS_INFRA_NAO BIT(7) #define MTK_SCPD_STRICT_BUS_PROTECTION BIT(8) +#define MTK_SCPD_SRAM_PDN_INVERTED BIT(9) +#define MTK_SCPD_MODEM_PWRSEQ BIT(10) +#define MTK_SCPD_SKIP_RESET_B BIT(11) #define MTK_SCPD_CAPS(_scpd, _x) ((_scpd)->data->caps & (_x)) #define SPM_VDE_PWR_CON 0x0210 @@ -50,30 +53,43 @@ enum scpsys_bus_prot_flags { BUS_PROT_REG_UPDATE = BIT(1), BUS_PROT_IGNORE_CLR_ACK = BIT(2), BUS_PROT_INVERTED = BIT(3), - BUS_PROT_COMPONENT_INFRA = BIT(4), - BUS_PROT_COMPONENT_SMI = BIT(5), - BUS_PROT_STA_COMPONENT_INFRA_NAO = BIT(6), }; -#define _BUS_PROT(_set_clr_mask, _set, _clr, _sta_mask, _sta, _flags) { \ - .bus_prot_set_clr_mask = (_set_clr_mask), \ - .bus_prot_set = _set, \ - .bus_prot_clr = _clr, \ - .bus_prot_sta_mask = (_sta_mask), \ - .bus_prot_sta = _sta, \ - .flags = _flags \ +enum scpsys_bus_prot_block { + BUS_PROT_BLOCK_INFRA, + BUS_PROT_BLOCK_INFRA_NAO, + BUS_PROT_BLOCK_SMI, + BUS_PROT_BLOCK_COUNT, +}; + +#define _BUS_PROT_STA(_hwip, _sta_hwip, _set_clr_mask, _set, _clr, \ + _sta_mask, _sta, _flags) \ + { \ + .bus_prot_block = BUS_PROT_BLOCK_##_hwip, \ + .bus_prot_sta_block = BUS_PROT_BLOCK_##_sta_hwip, \ + .bus_prot_set_clr_mask = (_set_clr_mask), \ + .bus_prot_set = _set, \ + .bus_prot_clr = _clr, \ + .bus_prot_sta_mask = (_sta_mask), \ + .bus_prot_sta = _sta, \ + .flags = _flags \ } -#define BUS_PROT_WR(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, BUS_PROT_COMPONENT_##_hwip) +#define _BUS_PROT(_hwip, _set_clr_mask, _set, _clr, _sta_mask, \ + _sta, _flags) \ + _BUS_PROT_STA(_hwip, _hwip, _set_clr_mask, _set, _clr, \ + _sta_mask, _sta, _flags) -#define BUS_PROT_WR_IGN(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, \ - BUS_PROT_COMPONENT_##_hwip | BUS_PROT_IGNORE_CLR_ACK) +#define BUS_PROT_WR(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, 0) -#define BUS_PROT_UPDATE(_hwip, _mask, _set, _clr, _sta) \ - _BUS_PROT(_mask, _set, _clr, _mask, _sta, \ - BUS_PROT_COMPONENT_##_hwip | BUS_PROT_REG_UPDATE) +#define BUS_PROT_WR_IGN(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, \ + BUS_PROT_IGNORE_CLR_ACK) + +#define BUS_PROT_UPDATE(_hwip, _mask, _set, _clr, _sta) \ + _BUS_PROT(_hwip, _mask, _set, _clr, _mask, _sta, \ + BUS_PROT_REG_UPDATE) #define BUS_PROT_INFRA_UPDATE_TOPAXI(_mask) \ BUS_PROT_UPDATE(INFRA, _mask, \ @@ -82,6 +98,8 @@ enum scpsys_bus_prot_flags { INFRA_TOPAXI_PROTECTSTA1) struct scpsys_bus_prot_data { + u8 bus_prot_block; + u8 bus_prot_sta_block; u32 bus_prot_set_clr_mask; u32 bus_prot_set; u32 bus_prot_clr; @@ 
-90,6 +108,22 @@ struct scpsys_bus_prot_data { u8 flags; }; +/** + * enum scpsys_rtff_type - Type of RTFF Hardware for power domain + * @SCPSYS_RTFF_NONE: RTFF HW not present or domain not RTFF managed + * @SCPSYS_RTFF_TYPE_GENERIC: Non-CPU, peripheral-generic RTFF HW + * @SCPSYS_RTFF_TYPE_PCIE_PHY: PCI-Express PHY specific RTFF HW + * @SCPSYS_RTFF_TYPE_STOR_UFS: Storage (UFS) specific RTFF HW + * @SCPSYS_RTFF_TYPE_MAX: Number of supported RTFF HW Types + */ +enum scpsys_rtff_type { + SCPSYS_RTFF_NONE = 0, + SCPSYS_RTFF_TYPE_GENERIC, + SCPSYS_RTFF_TYPE_PCIE_PHY, + SCPSYS_RTFF_TYPE_STOR_UFS, + SCPSYS_RTFF_TYPE_MAX +}; + /** * struct scpsys_domain_data - scp domain data for power on/off flow * @name: The name of the power domain. @@ -100,6 +134,7 @@ struct scpsys_bus_prot_data { * @ext_buck_iso_offs: The offset for external buck isolation * @ext_buck_iso_mask: The mask for external buck isolation * @caps: The flag for active wake-up action. + * @rtff_type: The power domain RTFF HW type * @bp_cfg: bus protection configuration for any subsystem */ struct scpsys_domain_data { @@ -111,6 +146,7 @@ struct scpsys_domain_data { int ext_buck_iso_offs; u32 ext_buck_iso_mask; u16 caps; + enum scpsys_rtff_type rtff_type; const struct scpsys_bus_prot_data bp_cfg[SPM_MAX_BUS_PROT_DATA]; int pwr_sta_offs; int pwr_sta2nd_offs; @@ -119,6 +155,8 @@ struct scpsys_domain_data { struct scpsys_soc_data { const struct scpsys_domain_data *domains_data; int num_domains; + enum scpsys_bus_prot_block *bus_prot_blocks; + int num_bus_prot_blocks; }; #endif /* __SOC_MEDIATEK_MTK_PM_DOMAINS_H */ diff --git a/drivers/pmdomain/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c index 833c46944600fa..f8580ec0f73785 100644 --- a/drivers/pmdomain/qcom/rpmpd.c +++ b/drivers/pmdomain/qcom/rpmpd.c @@ -631,12 +631,12 @@ static struct rpmpd ssc_mx_rwsm0_vfl = { }; static struct rpmpd *mdm9607_rpmpds[] = { - [MDM9607_VDDCX] = &cx_s3a_lvl, - [MDM9607_VDDCX_AO] = &cx_s3a_lvl_ao, - [MDM9607_VDDCX_VFL] = &cx_s3a_vfl, - [MDM9607_VDDMX] = &mx_l12a_lvl, - [MDM9607_VDDMX_AO] = &mx_l12a_lvl_ao, - [MDM9607_VDDMX_VFL] = &mx_l12a_vfl, + [RPMPD_VDDCX] = &cx_s3a_lvl, + [RPMPD_VDDCX_AO] = &cx_s3a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s3a_vfl, + [RPMPD_VDDMX] = &mx_l12a_lvl, + [RPMPD_VDDMX_AO] = &mx_l12a_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_l12a_vfl, }; static const struct rpmpd_desc mdm9607_desc = { @@ -646,9 +646,9 @@ static const struct rpmpd_desc mdm9607_desc = { }; static struct rpmpd *msm8226_rpmpds[] = { - [MSM8226_VDDCX] = &cx_s1a_corner, - [MSM8226_VDDCX_AO] = &cx_s1a_corner_ao, - [MSM8226_VDDCX_VFC] = &cx_s1a_vfc, + [RPMPD_VDDCX] = &cx_s1a_corner, + [RPMPD_VDDCX_AO] = &cx_s1a_corner_ao, + [RPMPD_VDDCX_VFC] = &cx_s1a_vfc, }; static const struct rpmpd_desc msm8226_desc = { @@ -675,11 +675,11 @@ static const struct rpmpd_desc msm8939_desc = { }; static struct rpmpd *msm8916_rpmpds[] = { - [MSM8916_VDDCX] = &cx_s1a_corner, - [MSM8916_VDDCX_AO] = &cx_s1a_corner_ao, - [MSM8916_VDDCX_VFC] = &cx_s1a_vfc, - [MSM8916_VDDMX] = &mx_l3a_corner, - [MSM8916_VDDMX_AO] = &mx_l3a_corner_ao, + [RPMPD_VDDCX] = &cx_s1a_corner, + [RPMPD_VDDCX_AO] = &cx_s1a_corner_ao, + [RPMPD_VDDCX_VFC] = &cx_s1a_vfc, + [RPMPD_VDDMX] = &mx_l3a_corner, + [RPMPD_VDDMX_AO] = &mx_l3a_corner_ao, }; static const struct rpmpd_desc msm8916_desc = { @@ -689,11 +689,11 @@ static const struct rpmpd_desc msm8916_desc = { }; static struct rpmpd *msm8917_rpmpds[] = { - [MSM8917_VDDCX] = &cx_s2a_lvl, - [MSM8917_VDDCX_AO] = &cx_s2a_lvl_ao, - [MSM8917_VDDCX_VFL] = &cx_s2a_vfl, - [MSM8917_VDDMX] = 
&mx_l3a_lvl, - [MSM8917_VDDMX_AO] = &mx_l3a_lvl_ao, + [RPMPD_VDDCX] = &cx_s2a_lvl, + [RPMPD_VDDCX_AO] = &cx_s2a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s2a_vfl, + [RPMPD_VDDMX] = &mx_l3a_lvl, + [RPMPD_VDDMX_AO] = &mx_l3a_lvl_ao, }; static const struct rpmpd_desc msm8917_desc = { @@ -747,12 +747,12 @@ static const struct rpmpd_desc msm8974pro_pma8084_desc = { }; static struct rpmpd *msm8976_rpmpds[] = { - [MSM8976_VDDCX] = &cx_s2a_lvl, - [MSM8976_VDDCX_AO] = &cx_s2a_lvl_ao, - [MSM8976_VDDCX_VFL] = &cx_rwsc2_vfl, - [MSM8976_VDDMX] = &mx_s6a_lvl, - [MSM8976_VDDMX_AO] = &mx_s6a_lvl_ao, - [MSM8976_VDDMX_VFL] = &mx_rwsm6_vfl, + [RPMPD_VDDCX] = &cx_s2a_lvl, + [RPMPD_VDDCX_AO] = &cx_s2a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwsc2_vfl, + [RPMPD_VDDMX] = &mx_s6a_lvl, + [RPMPD_VDDMX_AO] = &mx_s6a_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwsm6_vfl, }; static const struct rpmpd_desc msm8976_desc = { @@ -796,16 +796,16 @@ static const struct rpmpd_desc msm8996_desc = { }; static struct rpmpd *msm8998_rpmpds[] = { - [MSM8998_VDDCX] = &cx_rwcx0_lvl, - [MSM8998_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [MSM8998_VDDCX_VFL] = &cx_rwcx0_vfl, - [MSM8998_VDDMX] = &mx_rwmx0_lvl, - [MSM8998_VDDMX_AO] = &mx_rwmx0_lvl_ao, - [MSM8998_VDDMX_VFL] = &mx_rwmx0_vfl, - [MSM8998_SSCCX] = &ssc_cx_rwsc0_lvl, - [MSM8998_SSCCX_VFL] = &ssc_cx_rwsc0_vfl, - [MSM8998_SSCMX] = &ssc_mx_rwsm0_lvl, - [MSM8998_SSCMX_VFL] = &ssc_mx_rwsm0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_SSCCX] = &ssc_cx_rwsc0_lvl, + [RPMPD_SSCCX_VFL] = &ssc_cx_rwsc0_vfl, + [RPMPD_SSCMX] = &ssc_mx_rwsm0_lvl, + [RPMPD_SSCMX_VFL] = &ssc_mx_rwsm0_vfl, }; static const struct rpmpd_desc msm8998_desc = { @@ -831,11 +831,11 @@ static const struct rpmpd_desc qcs404_desc = { }; static struct rpmpd *qm215_rpmpds[] = { - [QM215_VDDCX] = &cx_s1a_lvl, - [QM215_VDDCX_AO] = &cx_s1a_lvl_ao, - [QM215_VDDCX_VFL] = &cx_s1a_vfl, - [QM215_VDDMX] = &mx_l2a_lvl, - [QM215_VDDMX_AO] = &mx_l2a_lvl_ao, + [RPMPD_VDDCX] = &cx_s1a_lvl, + [RPMPD_VDDCX_AO] = &cx_s1a_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_s1a_vfl, + [RPMPD_VDDMX] = &mx_l2a_lvl, + [RPMPD_VDDMX_AO] = &mx_l2a_lvl_ao, }; static const struct rpmpd_desc qm215_desc = { @@ -845,16 +845,16 @@ static const struct rpmpd_desc qm215_desc = { }; static struct rpmpd *sdm660_rpmpds[] = { - [SDM660_VDDCX] = &cx_rwcx0_lvl, - [SDM660_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [SDM660_VDDCX_VFL] = &cx_rwcx0_vfl, - [SDM660_VDDMX] = &mx_rwmx0_lvl, - [SDM660_VDDMX_AO] = &mx_rwmx0_lvl_ao, - [SDM660_VDDMX_VFL] = &mx_rwmx0_vfl, - [SDM660_SSCCX] = &ssc_cx_rwlc0_lvl, - [SDM660_SSCCX_VFL] = &ssc_cx_rwlc0_vfl, - [SDM660_SSCMX] = &ssc_mx_rwlm0_lvl, - [SDM660_SSCMX_VFL] = &ssc_mx_rwlm0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_SSCCX] = &ssc_cx_rwlc0_lvl, + [RPMPD_SSCCX_VFL] = &ssc_cx_rwlc0_vfl, + [RPMPD_SSCMX] = &ssc_mx_rwlm0_lvl, + [RPMPD_SSCMX_VFL] = &ssc_mx_rwlm0_vfl, }; static const struct rpmpd_desc sdm660_desc = { @@ -881,12 +881,12 @@ static const struct rpmpd_desc sm6115_desc = { }; static struct rpmpd *sm6125_rpmpds[] = { - [SM6125_VDDCX] = &cx_rwcx0_lvl, - [SM6125_VDDCX_AO] = &cx_rwcx0_lvl_ao, - [SM6125_VDDCX_VFL] = &cx_rwcx0_vfl, - [SM6125_VDDMX] = &mx_rwmx0_lvl, - [SM6125_VDDMX_AO] = &mx_rwmx0_lvl_ao, - 
[SM6125_VDDMX_VFL] = &mx_rwmx0_vfl, + [RPMPD_VDDCX] = &cx_rwcx0_lvl, + [RPMPD_VDDCX_AO] = &cx_rwcx0_lvl_ao, + [RPMPD_VDDCX_VFL] = &cx_rwcx0_vfl, + [RPMPD_VDDMX] = &mx_rwmx0_lvl, + [RPMPD_VDDMX_AO] = &mx_rwmx0_lvl_ao, + [RPMPD_VDDMX_VFL] = &mx_rwmx0_vfl, }; static const struct rpmpd_desc sm6125_desc = { diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c index 5aa7fa1df8fe58..7434bf42d21562 100644 --- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c +++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c @@ -251,6 +251,7 @@ static int __init rcar_gen4_sysc_pd_setup(struct rcar_gen4_sysc_pd *pd) genpd->detach_dev = cpg_mssr_detach_dev; } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_gen4_sysc_pd_power_off; genpd->power_on = rcar_gen4_sysc_pd_power_on; diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c index 4b310c1d35fa6b..d8a8ffcde38d8b 100644 --- a/drivers/pmdomain/renesas/rcar-sysc.c +++ b/drivers/pmdomain/renesas/rcar-sysc.c @@ -241,6 +241,7 @@ static int __init rcar_sysc_pd_setup(struct rcar_sysc_pd *pd) } } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_sysc_pd_power_off; genpd->power_on = rcar_sysc_pd_power_on; @@ -342,7 +343,7 @@ struct rcar_pm_domains { }; static struct genpd_onecell_data *rcar_sysc_onecell_data; -static struct device_node *rcar_sysc_onecell_np; +static struct device_node *rcar_sysc_onecell_np __initdata = NULL; static int __init rcar_sysc_pd_init(void) { diff --git a/drivers/pmdomain/renesas/rmobile-sysc.c b/drivers/pmdomain/renesas/rmobile-sysc.c index 8eedc9a1d82501..a6bf7295e909e7 100644 --- a/drivers/pmdomain/renesas/rmobile-sysc.c +++ b/drivers/pmdomain/renesas/rmobile-sysc.c @@ -100,7 +100,8 @@ static void rmobile_init_pm_domain(struct rmobile_pm_domain *rmobile_pd) struct generic_pm_domain *genpd = &rmobile_pd->genpd; struct dev_power_governor *gov = rmobile_pd->gov; - genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP; + genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP | + GENPD_FLAG_NO_STAY_ON; genpd->attach_dev = cpg_mstp_attach_dev; genpd->detach_dev = cpg_mstp_detach_dev; diff --git a/drivers/pmdomain/rockchip/Kconfig b/drivers/pmdomain/rockchip/Kconfig index 218d43186e5b9a..17f2e6fe86b6f7 100644 --- a/drivers/pmdomain/rockchip/Kconfig +++ b/drivers/pmdomain/rockchip/Kconfig @@ -3,6 +3,7 @@ if ARCH_ROCKCHIP || COMPILE_TEST config ROCKCHIP_PM_DOMAINS bool "Rockchip generic power domain" + default ARCH_ROCKCHIP depends on PM depends on HAVE_ARM_SMCCC_DISCOVERY depends on REGULATOR diff --git a/drivers/pmdomain/rockchip/pm-domains.c b/drivers/pmdomain/rockchip/pm-domains.c index 242570c505fb65..1955c6d453e4f6 100644 --- a/drivers/pmdomain/rockchip/pm-domains.c +++ b/drivers/pmdomain/rockchip/pm-domains.c @@ -865,7 +865,7 @@ static int rockchip_pm_add_one_domain(struct rockchip_pmu *pmu, pd->genpd.power_on = rockchip_pd_power_on; pd->genpd.attach_dev = rockchip_pd_attach_dev; pd->genpd.detach_dev = rockchip_pd_detach_dev; - pd->genpd.flags = GENPD_FLAG_PM_CLK; + pd->genpd.flags = GENPD_FLAG_PM_CLK | GENPD_FLAG_NO_STAY_ON; if (pd_info->active_wakeup) pd->genpd.flags |= GENPD_FLAG_ACTIVE_WAKEUP; pm_genpd_init(&pd->genpd, NULL, diff --git a/drivers/pmdomain/thead/th1520-pm-domains.c b/drivers/pmdomain/thead/th1520-pm-domains.c index 9040b698e7f7f2..d7cb9633c7c8a3 100644 --- a/drivers/pmdomain/thead/th1520-pm-domains.c +++ b/drivers/pmdomain/thead/th1520-pm-domains.c @@ -173,6 +173,18 @@ static int th1520_pd_pwrseq_gpu_init(struct 
device *dev) adev); } +static int th1520_pd_reboot_init(struct device *dev, + struct th1520_aon_chan *aon_chan) +{ + struct auxiliary_device *adev; + + adev = devm_auxiliary_device_create(dev, "reboot", aon_chan); + if (!adev) + return -ENODEV; + + return 0; +} + static int th1520_pd_probe(struct platform_device *pdev) { struct generic_pm_domain **domains; @@ -235,6 +247,10 @@ static int th1520_pd_probe(struct platform_device *pdev) if (ret) goto err_clean_provider; + ret = th1520_pd_reboot_init(dev, aon_chan); + if (ret) + goto err_clean_provider; + return 0; err_clean_provider: diff --git a/drivers/pmdomain/ti/ti_sci_pm_domains.c b/drivers/pmdomain/ti/ti_sci_pm_domains.c index 82df7e44250bb6..e5d1934f78d9ee 100644 --- a/drivers/pmdomain/ti/ti_sci_pm_domains.c +++ b/drivers/pmdomain/ti/ti_sci_pm_domains.c @@ -200,6 +200,23 @@ static bool ti_sci_pm_idx_exists(struct ti_sci_genpd_provider *pd_provider, u32 return false; } +static bool ti_sci_pm_pd_is_on(struct ti_sci_genpd_provider *pd_provider, + int pd_idx) +{ + bool is_on; + int ret; + + if (!pd_provider->ti_sci->ops.dev_ops.is_on) + return false; + + ret = pd_provider->ti_sci->ops.dev_ops.is_on(pd_provider->ti_sci, + pd_idx, NULL, &is_on); + if (ret) + return false; + + return is_on; +} + static int ti_sci_pm_domain_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -231,6 +248,8 @@ static int ti_sci_pm_domain_probe(struct platform_device *pdev) index, &args)) { if (args.args_count >= 1 && args.np == dev->of_node) { + bool is_on; + of_node_put(args.np); if (args.args[0] > max_id) { max_id = args.args[0]; @@ -264,7 +283,10 @@ static int ti_sci_pm_domain_probe(struct platform_device *pdev) pd_provider->ti_sci->ops.pm_ops.set_latency_constraint) pd->pd.domain.ops.suspend = ti_sci_pd_suspend; - pm_genpd_init(&pd->pd, NULL, true); + is_on = ti_sci_pm_pd_is_on(pd_provider, + pd->idx); + + pm_genpd_init(&pd->pd, NULL, !is_on); list_add(&pd->node, &pd_provider->pd_list); } else { diff --git a/drivers/power/reset/Kconfig b/drivers/power/reset/Kconfig index 77ea3129c70806..8248895ca90389 100644 --- a/drivers/power/reset/Kconfig +++ b/drivers/power/reset/Kconfig @@ -225,6 +225,13 @@ config POWER_RESET_ST help Reset support for STMicroelectronics boards. +config POWER_RESET_TH1520_AON + tristate "T-Head TH1520 AON firmware poweroff and reset driver" + depends on TH1520_PM_DOMAINS + help + This driver supports power-off and reset operations for T-Head + TH1520 SoCs running the AON firmware. 
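The handle passed as the third argument to devm_auxiliary_device_create() in th1520_pd_reboot_init() above surfaces as platform_data on the resulting auxiliary device, and the auxiliary bus matches that device to a driver by "<KBUILD_MODNAME of the creator>.<name>". A minimal sketch of both ends of that handoff, using only calls already present in this patch:

/* Creator side (th1520-pm-domains, KBUILD_MODNAME "th1520_pm_domains"):
 * the resulting device matches the id "th1520_pm_domains.reboot". */
adev = devm_auxiliary_device_create(dev, "reboot", aon_chan);

/* Consumer side (th1520-aon-reboot probe): the same pointer comes back. */
struct th1520_aon_chan *aon_chan = adev->dev.platform_data;

This is why the id_table of the reboot driver added below lists "th1520_pm_domains.reboot".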
+ config POWER_RESET_TORADEX_EC tristate "Toradex Embedded Controller power-off and reset driver" depends on ARCH_MXC || COMPILE_TEST diff --git a/drivers/power/reset/Makefile b/drivers/power/reset/Makefile index b7c2b5940be997..51da87e05ce76b 100644 --- a/drivers/power/reset/Makefile +++ b/drivers/power/reset/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_POWER_RESET_QNAP) += qnap-poweroff.o obj-$(CONFIG_POWER_RESET_REGULATOR) += regulator-poweroff.o obj-$(CONFIG_POWER_RESET_RESTART) += restart-poweroff.o obj-$(CONFIG_POWER_RESET_ST) += st-poweroff.o +obj-$(CONFIG_POWER_RESET_TH1520_AON) += th1520-aon-reboot.o obj-$(CONFIG_POWER_RESET_TORADEX_EC) += tdx-ec-poweroff.o obj-$(CONFIG_POWER_RESET_TPS65086) += tps65086-restart.o obj-$(CONFIG_POWER_RESET_VERSATILE) += arm-versatile-reboot.o diff --git a/drivers/power/reset/th1520-aon-reboot.c b/drivers/power/reset/th1520-aon-reboot.c new file mode 100644 index 00000000000000..ec249667a0ffd7 --- /dev/null +++ b/drivers/power/reset/th1520-aon-reboot.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * T-HEAD TH1520 AON Firmware Reboot Driver + * + * Copyright (c) 2025 Icenowy Zheng + */ + +#include <linux/auxiliary_bus.h> +#include <linux/firmware/thead/thead,th1520-aon.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/types.h> + +#define TH1520_AON_REBOOT_PRIORITY 200 + +struct th1520_aon_msg_empty_body { + struct th1520_aon_rpc_msg_hdr hdr; + u16 reserved[12]; +} __packed __aligned(1); + +static int th1520_aon_pwroff_handler(struct sys_off_data *data) +{ + struct th1520_aon_chan *aon_chan = data->cb_data; + struct th1520_aon_msg_empty_body msg = {}; + + msg.hdr.svc = TH1520_AON_RPC_SVC_WDG; + msg.hdr.func = TH1520_AON_WDG_FUNC_POWER_OFF; + msg.hdr.size = TH1520_AON_RPC_MSG_NUM; + + th1520_aon_call_rpc(aon_chan, &msg); + + return NOTIFY_DONE; +} + +static int th1520_aon_restart_handler(struct sys_off_data *data) +{ + struct th1520_aon_chan *aon_chan = data->cb_data; + struct th1520_aon_msg_empty_body msg = {}; + + msg.hdr.svc = TH1520_AON_RPC_SVC_WDG; + msg.hdr.func = TH1520_AON_WDG_FUNC_RESTART; + msg.hdr.size = TH1520_AON_RPC_MSG_NUM; + + th1520_aon_call_rpc(aon_chan, &msg); + + return NOTIFY_DONE; +} + +static int th1520_aon_reboot_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct device *dev = &adev->dev; + int ret; + + /* Expect struct th1520_aon_chan to be passed via platform_data */ + ret = devm_register_sys_off_handler(dev, SYS_OFF_MODE_POWER_OFF, + TH1520_AON_REBOOT_PRIORITY, + th1520_aon_pwroff_handler, + adev->dev.platform_data); + + if (ret) { + dev_err(dev, "Failed to register power off handler\n"); + return ret; + } + + ret = devm_register_sys_off_handler(dev, SYS_OFF_MODE_RESTART, + TH1520_AON_REBOOT_PRIORITY, + th1520_aon_restart_handler, + adev->dev.platform_data); + + if (ret) { + dev_err(dev, "Failed to register restart handler\n"); + return ret; + } + + return 0; +} + +static const struct auxiliary_device_id th1520_aon_reboot_id_table[] = { + { .name = "th1520_pm_domains.reboot" }, + {}, +}; +MODULE_DEVICE_TABLE(auxiliary, th1520_aon_reboot_id_table); + +static struct auxiliary_driver th1520_aon_reboot_driver = { + .driver = { + .name = "th1520-aon-reboot", + }, + .probe = th1520_aon_reboot_probe, + .id_table = th1520_aon_reboot_id_table, +}; +module_auxiliary_driver(th1520_aon_reboot_driver); + +MODULE_AUTHOR("Icenowy Zheng "); +MODULE_DESCRIPTION("T-HEAD TH1520 AON-firmware-based reboot driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/88pm860x_charger.c b/drivers/power/supply/88pm860x_charger.c index 2b9fcb7e71d79e..8d99c6ff72edf6
100644 --- a/drivers/power/supply/88pm860x_charger.c +++ b/drivers/power/supply/88pm860x_charger.c @@ -284,8 +284,8 @@ static int set_charging_fsm(struct pm860x_charger_info *info) { struct power_supply *psy; union power_supply_propval data; - unsigned char fsm_state[][16] = { "init", "discharge", "precharge", - "fastcharge", + static const unsigned char fsm_state[][16] = { + "init", "discharge", "precharge", "fastcharge", }; int ret; int vbatt; @@ -313,7 +313,7 @@ static int set_charging_fsm(struct pm860x_charger_info *info) dev_dbg(info->dev, "Entering FSM:%s, Charger:%s, Battery:%s, " "Allowed:%d\n", - &fsm_state[info->state][0], + fsm_state[info->state], (info->online) ? "online" : "N/A", (info->present) ? "present" : "N/A", info->allowed); dev_dbg(info->dev, "set_charging_fsm:vbatt:%d(mV)\n", vbatt); @@ -385,7 +385,7 @@ static int set_charging_fsm(struct pm860x_charger_info *info) } dev_dbg(info->dev, "Out FSM:%s, Charger:%s, Battery:%s, Allowed:%d\n", - &fsm_state[info->state][0], + fsm_state[info->state], (info->online) ? "online" : "N/A", (info->present) ? "present" : "N/A", info->allowed); mutex_unlock(&info->lock); diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig index 79ddb006e2dad6..dca4be23ee7095 100644 --- a/drivers/power/supply/Kconfig +++ b/drivers/power/supply/Kconfig @@ -35,6 +35,9 @@ config APM_POWER Say Y here to enable support APM status emulation using battery class devices. +config ADC_BATTERY_HELPER + tristate + config GENERIC_ADC_BATTERY tristate "Generic battery support using IIO" depends on IIO @@ -244,6 +247,18 @@ config BATTERY_INGENIC This driver can also be built as a module. If so, the module will be called ingenic-battery. +config BATTERY_INTEL_DC_TI + tristate "Intel Bay / Cherry Trail Dollar Cove TI battery driver" + depends on INTEL_SOC_PMIC_CHTDC_TI && INTEL_DC_TI_ADC && IIO && ACPI + select ADC_BATTERY_HELPER + help + Choose this option if you want to monitor battery status on Intel + Bay Trail / Cherry Trail tablets using the Dollar Cove TI PMIC's + coulomb-counter as fuel-gauge. + + To compile this driver as a module, choose M here: the module will be + called intel_dc_ti_battery. + config BATTERY_IPAQ_MICRO tristate "iPAQ Atmel Micro ASIC battery driver" depends on MFD_IPAQ_MICRO @@ -767,6 +782,13 @@ config CHARGER_BQ2515X rail, ADC for battery and system monitoring, and push-button controller. +config CHARGER_BQ257XX + tristate "TI BQ257XX battery charger family" + depends on MFD_BQ257XX + help + Say Y to enable support for the TI BQ257XX family of battery + charging integrated circuits. + config CHARGER_BQ25890 tristate "TI BQ25890 battery charger driver" depends on I2C @@ -1043,6 +1065,7 @@ config CHARGER_SURFACE config BATTERY_UG3105 tristate "uPI uG3105 battery monitor driver" depends on I2C + select ADC_BATTERY_HELPER help Battery monitor driver for the uPI uG3105 battery monitor. 
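Both BATTERY_INTEL_DC_TI and BATTERY_UG3105 select the new ADC_BATTERY_HELPER symbol added above. A minimal sketch of what a driver using that helper looks like ("my_fg" names are hypothetical; the helper API is the one introduced by adc-battery-helper.c/.h later in this patch):

/*
 * Hypothetical fuel-gauge driver skeleton; only the helper wiring is
 * shown, the register reads are placeholders.
 */
struct my_fg_info {
	struct adc_battery_helper helper;	/* MUST be the first member */
	struct regmap *regmap;
};

static int my_fg_get_volt_and_curr(struct power_supply *psy, int *volt_uv,
				   int *curr_ua)
{
	/*
	 * Sample voltage-now and current-now back-to-back here; the fixed
	 * values below are placeholders for the real ADC reads.
	 */
	*volt_uv = 3800000;
	*curr_ua = -250000;
	return 0;
}

static const struct power_supply_desc my_fg_desc = {
	.name		= "my-fg-battery",
	.type		= POWER_SUPPLY_TYPE_BATTERY,
	.properties	= adc_battery_helper_properties,
	.num_properties	= ADC_HELPER_NUM_PROPERTIES,
	.get_property	= adc_battery_helper_get_property,
	.external_power_changed	= adc_battery_helper_external_power_changed,
};

/*
 * In probe(): point psy_cfg.drv_data at the my_fg_info (so the helper is
 * the drvdata), register the power supply, then:
 *	ret = adc_battery_helper_init(&info->helper, psy,
 *				      my_fg_get_volt_and_curr, NULL);
 */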
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile index f943c9150b326d..99a820d38197cd 100644 --- a/drivers/power/supply/Makefile +++ b/drivers/power/supply/Makefile @@ -7,6 +7,7 @@ power_supply-$(CONFIG_LEDS_TRIGGERS) += power_supply_leds.o obj-$(CONFIG_POWER_SUPPLY) += power_supply.o obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o +obj-$(CONFIG_ADC_BATTERY_HELPER) += adc-battery-helper.o obj-$(CONFIG_GENERIC_ADC_BATTERY) += generic-adc-battery.o obj-$(CONFIG_APM_POWER) += apm_power.o @@ -41,6 +42,7 @@ obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o obj-$(CONFIG_BATTERY_SAMSUNG_SDI) += samsung-sdi-battery.o obj-$(CONFIG_BATTERY_COLLIE) += collie_battery.o obj-$(CONFIG_BATTERY_INGENIC) += ingenic-battery.o +obj-$(CONFIG_BATTERY_INTEL_DC_TI) += intel_dc_ti_battery.o obj-$(CONFIG_BATTERY_IPAQ_MICRO) += ipaq_micro_battery.o obj-$(CONFIG_BATTERY_WM97XX) += wm97xx_battery.o obj-$(CONFIG_BATTERY_SBS) += sbs-battery.o @@ -97,6 +99,7 @@ obj-$(CONFIG_CHARGER_BQ24190) += bq24190_charger.o obj-$(CONFIG_CHARGER_BQ24257) += bq24257_charger.o obj-$(CONFIG_CHARGER_BQ24735) += bq24735-charger.o obj-$(CONFIG_CHARGER_BQ2515X) += bq2515x_charger.o +obj-$(CONFIG_CHARGER_BQ257XX) += bq257xx_charger.o obj-$(CONFIG_CHARGER_BQ25890) += bq25890_charger.o obj-$(CONFIG_CHARGER_BQ25980) += bq25980_charger.o obj-$(CONFIG_CHARGER_BQ256XX) += bq256xx_charger.o diff --git a/drivers/power/supply/ab8500_btemp.c b/drivers/power/supply/ab8500_btemp.c index b00c84fbc33c2a..e5202a7b6209b4 100644 --- a/drivers/power/supply/ab8500_btemp.c +++ b/drivers/power/supply/ab8500_btemp.c @@ -667,7 +667,8 @@ static int ab8500_btemp_bind(struct device *dev, struct device *master, /* Create a work queue for the btemp */ di->btemp_wq = - alloc_workqueue("ab8500_btemp_wq", WQ_MEM_RECLAIM, 0); + alloc_workqueue("ab8500_btemp_wq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (di->btemp_wq == NULL) { dev_err(dev, "failed to create work queue\n"); return -ENOMEM; } diff --git a/drivers/power/supply/adc-battery-helper.c b/drivers/power/supply/adc-battery-helper.c new file mode 100644 index 00000000000000..6e0f5b6d73d7c1 --- /dev/null +++ b/drivers/power/supply/adc-battery-helper.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Helper for batteries with accurate current and voltage measurement, but + * without temperature measurement or without a "resistance-temp-table". + * + * Some fuel-gauges are not full-featured autonomous fuel-gauges. + * These fuel-gauges offer accurate current and voltage measurements but + * their coulomb-counters are intended to work together with an always on + * micro-controller monitoring the fuel-gauge. + * + * This adc-battery-helper code offers open-circuit-voltage (OCV) estimation + * and, through that, capacity estimation for devices where such + * limited-functionality fuel-gauges are exposed directly to Linux. + * + * This helper requires the hw to provide accurate battery current_now and + * voltage_now measurement and this helper then provides the following + * properties on top of those readings: + * + * POWER_SUPPLY_PROP_STATUS + * POWER_SUPPLY_PROP_VOLTAGE_OCV + * POWER_SUPPLY_PROP_VOLTAGE_NOW + * POWER_SUPPLY_PROP_CURRENT_NOW + * POWER_SUPPLY_PROP_CAPACITY + * + * As well as, optionally, the following properties, assuming an always + * present system-scope battery, allowing direct use of + * adc_battery_helper_get_property() in this common case: + * POWER_SUPPLY_PROP_PRESENT + * POWER_SUPPLY_PROP_SCOPE + * + * Using this helper is as simple as: + * + * 1.
Embed a struct adc_battery_helper; this MUST be the first member + * of the battery driver's data struct. + * 2. Use adc_battery_helper_properties[] or add the above properties to + * the list of properties in power_supply_desc + * 3. Call adc_battery_helper_init() after registering the power_supply and + * before returning from the probe() function + * 4. Use adc_battery_helper_get_property() as the power-supply's get_property() + * method, or call it for the above properties. + * 5. Use adc_battery_helper_external_power_changed() as the power-supply's + * external_power_changed() method or call it from that method. + * 6. Use adc_battery_helper_[suspend|resume]() as suspend-resume methods or + * call them from the driver's suspend-resume methods. + * + * The provided get_voltage_and_current_now() method will be called by this + * helper at adc_battery_helper_init() time and periodically after that. + * + * Copyright (c) 2021-2025 Hans de Goede + */ + +#include <linux/devm-helpers.h> +#include <linux/gpio/consumer.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/power_supply.h> +#include <linux/workqueue.h> + +#include "adc-battery-helper.h" + +#define MOV_AVG_WINDOW_SIZE ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE +#define INIT_POLL_TIME (5 * HZ) +#define POLL_TIME (30 * HZ) +#define SETTLE_TIME (1 * HZ) + +#define INIT_POLL_COUNT 30 + +#define CURR_HYST_UA 65000 + +#define LOW_BAT_UV 3700000 +#define FULL_BAT_HYST_UV 38000 + +#define AMBIENT_TEMP_CELSIUS 25 + +static int adc_battery_helper_get_status(struct adc_battery_helper *help) +{ + int full_uv = + help->psy->battery_info->constant_charge_voltage_max_uv - FULL_BAT_HYST_UV; + + if (help->curr_ua > CURR_HYST_UA) + return POWER_SUPPLY_STATUS_CHARGING; + + if (help->curr_ua < -CURR_HYST_UA) + return POWER_SUPPLY_STATUS_DISCHARGING; + + if (help->supplied) { + bool full; + + if (help->charge_finished) + full = gpiod_get_value_cansleep(help->charge_finished); + else + full = help->ocv_avg_uv > full_uv; + + if (full) + return POWER_SUPPLY_STATUS_FULL; + } + + return POWER_SUPPLY_STATUS_NOT_CHARGING; +} + +static void adc_battery_helper_work(struct work_struct *work) +{ + struct adc_battery_helper *help = container_of(work, struct adc_battery_helper, + work.work); + int i, curr_diff_ua, volt_diff_uv, res_mohm, ret, win_size; + struct device *dev = help->psy->dev.parent; + int volt_uv, prev_volt_uv = help->volt_uv; + int curr_ua, prev_curr_ua = help->curr_ua; + bool prev_supplied = help->supplied; + int prev_status = help->status; + + guard(mutex)(&help->lock); + + ret = help->get_voltage_and_current_now(help->psy, &volt_uv, &curr_ua); + if (ret) + goto out; + + help->volt_uv = volt_uv; + help->curr_ua = curr_ua; + + help->ocv_uv[help->ocv_avg_index] = + help->volt_uv - help->curr_ua * help->intern_res_avg_mohm / 1000; + dev_dbg(dev, "volt-now: %d, curr-now: %d, volt-ocv: %d\n", + help->volt_uv, help->curr_ua, help->ocv_uv[help->ocv_avg_index]); + help->ocv_avg_index = (help->ocv_avg_index + 1) % MOV_AVG_WINDOW_SIZE; + help->poll_count++; + + help->ocv_avg_uv = 0; + win_size = min(help->poll_count, MOV_AVG_WINDOW_SIZE); + for (i = 0; i < win_size; i++) + help->ocv_avg_uv += help->ocv_uv[i]; + help->ocv_avg_uv /= win_size; + + help->supplied = power_supply_am_i_supplied(help->psy); + help->status = adc_battery_helper_get_status(help); + if (help->status == POWER_SUPPLY_STATUS_FULL) + help->capacity = 100; + else + help->capacity = power_supply_batinfo_ocv2cap(help->psy->battery_info, + help->ocv_avg_uv, + AMBIENT_TEMP_CELSIUS); + + /* + * Skip internal resistance calc on charger [un]plug and + * when the battery is almost empty (voltage low).
+ */ + if (help->supplied != prev_supplied || + help->volt_uv < LOW_BAT_UV || + help->poll_count < 2) + goto out; + + /* + * Assuming that the OCV voltage does not change significantly + * between 2 polls, then we can calculate the internal resistance + * on a significant current change by attributing all voltage + * change between the 2 readings to the internal resistance. + */ + curr_diff_ua = abs(help->curr_ua - prev_curr_ua); + if (curr_diff_ua < CURR_HYST_UA) + goto out; + + volt_diff_uv = abs(help->volt_uv - prev_volt_uv); + res_mohm = volt_diff_uv * 1000 / curr_diff_ua; + + if ((res_mohm < (help->intern_res_avg_mohm * 2 / 3)) || + (res_mohm > (help->intern_res_avg_mohm * 4 / 3))) { + dev_dbg(dev, "Ignoring outlier internal resistance %d mOhm\n", res_mohm); + goto out; + } + + dev_dbg(dev, "Internal resistance %d mOhm\n", res_mohm); + + help->intern_res_mohm[help->intern_res_avg_index] = res_mohm; + help->intern_res_avg_index = (help->intern_res_avg_index + 1) % MOV_AVG_WINDOW_SIZE; + help->intern_res_poll_count++; + + help->intern_res_avg_mohm = 0; + win_size = min(help->intern_res_poll_count, MOV_AVG_WINDOW_SIZE); + for (i = 0; i < win_size; i++) + help->intern_res_avg_mohm += help->intern_res_mohm[i]; + help->intern_res_avg_mohm /= win_size; + +out: + queue_delayed_work(system_percpu_wq, &help->work, + (help->poll_count <= INIT_POLL_COUNT) ? + INIT_POLL_TIME : POLL_TIME); + + if (help->status != prev_status) + power_supply_changed(help->psy); +} + +const enum power_supply_property adc_battery_helper_properties[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_OCV, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_SCOPE, +}; +EXPORT_SYMBOL_GPL(adc_battery_helper_properties); + +static_assert(ARRAY_SIZE(adc_battery_helper_properties) == + ADC_HELPER_NUM_PROPERTIES); + +int adc_battery_helper_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct adc_battery_helper *help = power_supply_get_drvdata(psy); + int dummy, ret = 0; + + /* + * Avoid racing with adc_battery_helper_work() while it is updating + * variables and avoid calling get_voltage_and_current_now() reentrantly. 
+ */ + guard(mutex)(&help->lock); + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + val->intval = help->status; + break; + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = help->get_voltage_and_current_now(psy, &val->intval, &dummy); + break; + case POWER_SUPPLY_PROP_VOLTAGE_OCV: + val->intval = help->ocv_avg_uv; + break; + case POWER_SUPPLY_PROP_CURRENT_NOW: + ret = help->get_voltage_and_current_now(psy, &dummy, &val->intval); + break; + case POWER_SUPPLY_PROP_CAPACITY: + val->intval = help->capacity; + break; + case POWER_SUPPLY_PROP_PRESENT: + val->intval = 1; + break; + case POWER_SUPPLY_PROP_SCOPE: + val->intval = POWER_SUPPLY_SCOPE_SYSTEM; + break; + default: + return -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_get_property); + +void adc_battery_helper_external_power_changed(struct power_supply *psy) +{ + struct adc_battery_helper *help = power_supply_get_drvdata(psy); + + dev_dbg(help->psy->dev.parent, "external power changed\n"); + mod_delayed_work(system_percpu_wq, &help->work, SETTLE_TIME); +} +EXPORT_SYMBOL_GPL(adc_battery_helper_external_power_changed); + +static void adc_battery_helper_start_work(struct adc_battery_helper *help) +{ + help->poll_count = 0; + help->ocv_avg_index = 0; + + queue_delayed_work(system_percpu_wq, &help->work, 0); + flush_delayed_work(&help->work); +} + +int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, + adc_battery_helper_get_func get_voltage_and_current_now, + struct gpio_desc *charge_finished_gpio) +{ + struct device *dev = psy->dev.parent; + int ret; + + help->psy = psy; + help->get_voltage_and_current_now = get_voltage_and_current_now; + help->charge_finished = charge_finished_gpio; + + ret = devm_mutex_init(dev, &help->lock); + if (ret) + return ret; + + ret = devm_delayed_work_autocancel(dev, &help->work, adc_battery_helper_work); + if (ret) + return ret; + + if (!help->psy->battery_info || + help->psy->battery_info->factory_internal_resistance_uohm == -EINVAL || + help->psy->battery_info->constant_charge_voltage_max_uv == -EINVAL || + !psy->battery_info->ocv_table[0]) { + dev_err(dev, "error required properties are missing\n"); + return -ENODEV; + } + + /* Use provided internal resistance as start point (in milli-ohm) */ + help->intern_res_avg_mohm = + help->psy->battery_info->factory_internal_resistance_uohm / 1000; + /* Also add it to the internal resistance moving average window */ + help->intern_res_mohm[0] = help->intern_res_avg_mohm; + help->intern_res_avg_index = 1; + help->intern_res_poll_count = 1; + + adc_battery_helper_start_work(help); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_init); + +int adc_battery_helper_suspend(struct device *dev) +{ + struct adc_battery_helper *help = dev_get_drvdata(dev); + + cancel_delayed_work_sync(&help->work); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_suspend); + +int adc_battery_helper_resume(struct device *dev) +{ + struct adc_battery_helper *help = dev_get_drvdata(dev); + + adc_battery_helper_start_work(help); + return 0; +} +EXPORT_SYMBOL_GPL(adc_battery_helper_resume); + +MODULE_AUTHOR("Hans de Goede "); +MODULE_DESCRIPTION("ADC battery capacity estimation helper"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/adc-battery-helper.h b/drivers/power/supply/adc-battery-helper.h new file mode 100644 index 00000000000000..4e42181c898375 --- /dev/null +++ b/drivers/power/supply/adc-battery-helper.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Helper for batteries with accurate current and 
voltage measurement, but + * without temperature measurement or without a "resistance-temp-table". + * Copyright (c) 2021-2025 Hans de Goede + */ + +#include <linux/mutex.h> +#include <linux/workqueue.h> + +#define ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE 8 + +struct power_supply; +struct gpio_desc; + +/* + * The adc battery helper code needs voltage- and current-now to be sampled as + * close to each other (in sample-time) as possible. A single getter function is + * used to allow the battery driver to handle this in the best way possible. + */ +typedef int (*adc_battery_helper_get_func)(struct power_supply *psy, int *volt, int *curr); + +struct adc_battery_helper { + struct power_supply *psy; + struct gpio_desc *charge_finished; + struct delayed_work work; + struct mutex lock; + adc_battery_helper_get_func get_voltage_and_current_now; + int ocv_uv[ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE]; /* micro-volt */ + int intern_res_mohm[ADC_BAT_HELPER_MOV_AVG_WINDOW_SIZE]; /* milli-ohm */ + int poll_count; + int ocv_avg_index; + int ocv_avg_uv; /* micro-volt */ + int intern_res_poll_count; + int intern_res_avg_index; + int intern_res_avg_mohm; /* milli-ohm */ + int volt_uv; /* micro-volt */ + int curr_ua; /* micro-ampere */ + int capacity; /* percent */ + int status; + bool supplied; +}; + +extern const enum power_supply_property adc_battery_helper_properties[]; +/* Must be a plain define since the array is only declared here; the count is asserted in adc-battery-helper.c */ +#define ADC_HELPER_NUM_PROPERTIES 7 + +int adc_battery_helper_init(struct adc_battery_helper *help, struct power_supply *psy, + adc_battery_helper_get_func get_voltage_and_current_now, + struct gpio_desc *charge_finished_gpio); +/* + * The below functions can be directly used as power-supply / suspend-resume + * callbacks. They cast the power_supply_get_drvdata() / dev_get_drvdata() data + * directly to struct adc_battery_helper. Therefore struct adc_battery_helper + * MUST be the first member of the battery driver's data struct. + */ +int adc_battery_helper_get_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val); +void adc_battery_helper_external_power_changed(struct power_supply *psy); +int adc_battery_helper_suspend(struct device *dev); +int adc_battery_helper_resume(struct device *dev); diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c index 917c26ee56bc9f..b50a28b9dd3867 100644 --- a/drivers/power/supply/bq2415x_charger.c +++ b/drivers/power/supply/bq2415x_charger.c @@ -842,7 +842,7 @@ static int bq2415x_notifier_call(struct notifier_block *nb, if (bq->automode < 1) return NOTIFY_OK; - mod_delayed_work(system_wq, &bq->work, 0); + mod_delayed_work(system_percpu_wq, &bq->work, 0); return NOTIFY_OK; } @@ -1516,7 +1516,7 @@ static int bq2415x_power_supply_init(struct bq2415x_device *bq) ret = bq2415x_detect_revision(bq); if (ret < 0) - strcpy(revstr, "unknown"); + strscpy(revstr, "unknown", sizeof(revstr)); else sprintf(revstr, "1.%d", ret); diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index e1510c7fdab3bb..ed0ceae8d90b14 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -1467,7 +1467,7 @@ static void bq24190_charger_external_power_changed(struct power_supply *psy) * too low default 500mA iinlim. Delay setting the input-current-limit * for 300ms to avoid this.
*/ - queue_delayed_work(system_wq, &bdi->input_current_limit_work, + queue_delayed_work(system_percpu_wq, &bdi->input_current_limit_work, msecs_to_jiffies(300)); } diff --git a/drivers/power/supply/bq257xx_charger.c b/drivers/power/supply/bq257xx_charger.c new file mode 100644 index 00000000000000..02c7d8b61e82b6 --- /dev/null +++ b/drivers/power/supply/bq257xx_charger.c @@ -0,0 +1,755 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Battery Charger Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include <linux/bitfield.h> +#include <linux/i2c.h> +#include <linux/interrupt.h> +#include <linux/mfd/bq257xx.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/power_supply.h> +#include <linux/regmap.h> + +/* Forward declaration of driver data. */ +struct bq257xx_chg; + +/** + * struct bq257xx_chip_info - chip specific routines + * @bq257xx_hw_init: init function for hw + * @bq257xx_hw_shutdown: shutdown function for hw + * @bq257xx_get_state: get and update state of hardware + * @bq257xx_set_ichg: set maximum charge current (in uA) + * @bq257xx_set_vbatreg: set maximum charge voltage (in uV) + * @bq257xx_set_iindpm: set maximum input current (in uA) + */ +struct bq257xx_chip_info { + int (*bq257xx_hw_init)(struct bq257xx_chg *pdata); + void (*bq257xx_hw_shutdown)(struct bq257xx_chg *pdata); + int (*bq257xx_get_state)(struct bq257xx_chg *pdata); + int (*bq257xx_set_ichg)(struct bq257xx_chg *pdata, int ichg); + int (*bq257xx_set_vbatreg)(struct bq257xx_chg *pdata, int vbatreg); + int (*bq257xx_set_iindpm)(struct bq257xx_chg *pdata, int iindpm); +}; + +/** + * struct bq257xx_chg - driver data for charger + * @chip: hw specific functions + * @bq: parent MFD device + * @charger: power supply device + * @online: charger input is present + * @fast_charge: charger is in fast charge mode + * @pre_charge: charger is in pre-charge mode + * @ov_fault: charger reports over voltage fault + * @batoc_fault: charger reports battery over current fault + * @oc_fault: charger reports over current fault + * @usb_type: USB type reported from parent power supply + * @supplied: Status of parent power supply + * @iindpm_max: maximum input current limit (uA) + * @vbat_max: maximum charge voltage (uV) + * @ichg_max: maximum charge current (uA) + * @vsys_min: minimum system voltage (uV) + */ +struct bq257xx_chg { + const struct bq257xx_chip_info *chip; + struct bq257xx_device *bq; + struct power_supply *charger; + bool online; + bool fast_charge; + bool pre_charge; + bool ov_fault; + bool batoc_fault; + bool oc_fault; + int usb_type; + int supplied; + u32 iindpm_max; + u32 vbat_max; + u32 ichg_max; + u32 vsys_min; +}; + +/** + * bq25703_get_state() - Get the current state of the device + * @pdata: driver platform data + * + * Get the current state of the charger. Check if the charger is + * powered, what kind of charge state (if any) the device is in, + * and if there are any active faults. + * + * Return: Returns 0 on success, or error on failure to read device.
+ */ +static int bq25703_get_state(struct bq257xx_chg *pdata) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_CHARGER_STATUS, ®); + if (ret) + return ret; + + pdata->online = reg & BQ25703_STS_AC_STAT; + pdata->fast_charge = reg & BQ25703_STS_IN_FCHRG; + pdata->pre_charge = reg & BQ25703_STS_IN_PCHRG; + pdata->ov_fault = reg & BQ25703_STS_FAULT_ACOV; + pdata->batoc_fault = reg & BQ25703_STS_FAULT_BATOC; + pdata->oc_fault = reg & BQ25703_STS_FAULT_ACOC; + + return 0; +} + +/** + * bq25703_get_min_vsys() - Get the minimum system voltage + * @pdata: driver platform data + * @intval: value for minimum voltage + * + * Return: Returns 0 on success or error on failure to read. + */ +static int bq25703_get_min_vsys(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_MIN_VSYS, + ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_MINVSYS_MASK, reg); + *intval = (reg * BQ25703_MINVSYS_STEP_UV) + BQ25703_MINVSYS_MIN_UV; + + return ret; +} + +/** + * bq25703_set_min_vsys() - Set the minimum system voltage + * @pdata: driver platform data + * @vsys: voltage value to set in uV. + * + * This function takes a requested minimum system voltage value, clamps + * it between the minimum supported value by the charger and a user + * defined minimum system value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_min_vsys(struct bq257xx_chg *pdata, int vsys) +{ + unsigned int reg; + int vsys_min = pdata->vsys_min; + + vsys = clamp(vsys, BQ25703_MINVSYS_MIN_UV, vsys_min); + reg = ((vsys - BQ25703_MINVSYS_MIN_UV) / BQ25703_MINVSYS_STEP_UV); + reg = FIELD_PREP(BQ25703_MINVSYS_MASK, reg); + + return regmap_write(pdata->bq->regmap, BQ25703_MIN_VSYS, + reg); +} + +/** + * bq25703_get_cur() - Get the reported current from the battery + * @pdata: driver platform data + * @intval: value of reported battery current + * + * Read the reported current from the battery. Since value is always + * positive set sign to negative if discharging. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_cur(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_ADCIBAT_CHG, ®); + if (ret < 0) + return ret; + + if (pdata->online) + *intval = FIELD_GET(BQ25703_ADCIBAT_CHG_MASK, reg) * + BQ25703_ADCIBAT_CHG_STEP_UA; + else + *intval = -(FIELD_GET(BQ25703_ADCIBAT_DISCHG_MASK, reg) * + BQ25703_ADCIBAT_DIS_STEP_UA); + + return ret; +} + +/** + * bq25703_get_ichg_cur() - Get the maximum reported charge current + * @pdata: driver platform data + * @intval: value of maximum reported charge current + * + * Get the maximum reported charge current from the battery. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_ichg_cur(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_CHARGE_CURRENT, ®); + if (ret) + return ret; + + *intval = FIELD_GET(BQ25703_ICHG_MASK, reg) * BQ25703_ICHG_STEP_UA; + + return ret; +} + +/** + * bq25703_set_ichg_cur() - Set the maximum charge current + * @pdata: driver platform data + * @ichg: current value to set in uA. 
+ * + * This function takes a requested maximum charge current value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum charging value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_ichg_cur(struct bq257xx_chg *pdata, int ichg) +{ + unsigned int reg; + int ichg_max = pdata->ichg_max; + + ichg = clamp(ichg, BQ25703_ICHG_MIN_UA, ichg_max); + reg = FIELD_PREP(BQ25703_ICHG_MASK, (ichg / BQ25703_ICHG_STEP_UA)); + + return regmap_write(pdata->bq->regmap, BQ25703_CHARGE_CURRENT, + reg); +} + +/** + * bq25703_get_chrg_volt() - Get the maximum set charge voltage + * @pdata: driver platform data + * @intval: maximum charge voltage value + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_chrg_volt(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_MAX_CHARGE_VOLT, + ®); + if (ret) + return ret; + + *intval = FIELD_GET(BQ25703_MAX_CHARGE_VOLT_MASK, reg) * + BQ25703_VBATREG_STEP_UV; + + return ret; +} + +/** + * bq25703_set_chrg_volt() - Set the maximum charge voltage + * @pdata: driver platform data + * @vbat: voltage value to set in uV. + * + * This function takes a requested maximum charge voltage value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum charging value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. + */ +static int bq25703_set_chrg_volt(struct bq257xx_chg *pdata, int vbat) +{ + unsigned int reg; + int vbat_max = pdata->vbat_max; + + vbat = clamp(vbat, BQ25703_VBATREG_MIN_UV, vbat_max); + + reg = FIELD_PREP(BQ25703_MAX_CHARGE_VOLT_MASK, + (vbat / BQ25703_VBATREG_STEP_UV)); + + return regmap_write(pdata->bq->regmap, BQ25703_MAX_CHARGE_VOLT, + reg); +} + +/** + * bq25703_get_iindpm() - Get the maximum set input current + * @pdata: driver platform data + * @intval: maximum input current value + * + * Read the actual input current limit from the device into intval. + * This can differ from the value programmed due to some autonomous + * functions that may be enabled (but are not currently). This is why + * there is a different register used. + * + * Return: Returns 0 on success or error if unable to read register + * value. + */ +static int bq25703_get_iindpm(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_IIN_DPM, ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_IINDPM_MASK, reg); + *intval = (reg * BQ25703_IINDPM_STEP_UA) + BQ25703_IINDPM_OFFSET_UA; + + return ret; +} + +/** + * bq25703_set_iindpm() - Set the maximum input current + * @pdata: driver platform data + * @iindpm: current value in uA. + * + * This function takes a requested maximum input current value, clamps + * it between the minimum supported value by the charger and a user + * defined maximum input value, and then writes the value to the + * appropriate register. + * + * Return: Returns 0 on success or error if an error occurs. 
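+ *
+ * Illustrative encoding example (editorial; the real step and offset are
+ * the BQ25703_IINDPM_STEP_UA and BQ25703_IINDPM_OFFSET_UA constants from
+ * the MFD header): assuming a 50 mA step and a 50 mA offset, a request
+ * of 3250000 uA would encode as (3250000 - 50000) / 50000 = 64.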
+ */ +static int bq25703_set_iindpm(struct bq257xx_chg *pdata, int iindpm) +{ + unsigned int reg; + int iindpm_max = pdata->iindpm_max; + + iindpm = clamp(iindpm, BQ25703_IINDPM_MIN_UA, iindpm_max); + + reg = ((iindpm - BQ25703_IINDPM_OFFSET_UA) / BQ25703_IINDPM_STEP_UA); + + return regmap_write(pdata->bq->regmap, BQ25703_IIN_HOST, + FIELD_PREP(BQ25703_IINDPM_MASK, reg)); +} + +/** + * bq25703_get_vbat() - Get the reported voltage from the battery + * @pdata: driver platform data + * @intval: value of reported battery voltage + * + * Read value of battery voltage into intval. + * + * Return: Returns 0 on success or error if unable to read value. + */ +static int bq25703_get_vbat(struct bq257xx_chg *pdata, int *intval) +{ + unsigned int reg; + int ret; + + ret = regmap_read(pdata->bq->regmap, BQ25703_ADCVSYSVBAT, ®); + if (ret) + return ret; + + reg = FIELD_GET(BQ25703_ADCVBAT_MASK, reg); + *intval = (reg * BQ25703_ADCVSYSVBAT_STEP) + BQ25703_ADCVSYSVBAT_OFFSET_UV; + + return ret; +} + +/** + * bq25703_hw_init() - Set all the required registers to init the charger + * @pdata: driver platform data + * + * Initialize the BQ25703 by first disabling the watchdog timer (which + * shuts off the charger in the absence of periodic writes). Then, set + * the charge current, charge voltage, minimum system voltage, and + * input current limit. Disable low power mode to allow ADCs and + * interrupts. Enable the ADC, start the ADC, set the ADC scale to + * full, and enable each individual ADC channel. + * + * Return: Returns 0 on success or error code on error. + */ +static int bq25703_hw_init(struct bq257xx_chg *pdata) +{ + struct regmap *regmap = pdata->bq->regmap; + int ret = 0; + + regmap_update_bits(regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_WDTMR_ADJ_MASK, + FIELD_PREP(BQ25703_WDTMR_ADJ_MASK, + BQ25703_WDTMR_DISABLE)); + + ret = pdata->chip->bq257xx_set_ichg(pdata, pdata->ichg_max); + if (ret) + return ret; + + ret = pdata->chip->bq257xx_set_vbatreg(pdata, pdata->vbat_max); + if (ret) + return ret; + + ret = bq25703_set_min_vsys(pdata, pdata->vsys_min); + if (ret) + return ret; + + ret = pdata->chip->bq257xx_set_iindpm(pdata, pdata->iindpm_max); + if (ret) + return ret; + + /* Disable low power mode by writing 0 to the register. */ + regmap_update_bits(regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_EN_LWPWR, 0); + + /* Enable the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_CONV_EN, BQ25703_ADC_CONV_EN); + + /* Start the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_START, BQ25703_ADC_START); + + /* Set the scale of the ADC. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_FULL_SCALE, BQ25703_ADC_FULL_SCALE); + + /* Enable each of the ADC channels available. */ + regmap_update_bits(regmap, BQ25703_ADC_OPTION, + BQ25703_ADC_CH_MASK, + (BQ25703_ADC_CMPIN_EN | BQ25703_ADC_VBUS_EN | + BQ25703_ADC_PSYS_EN | BQ25703_ADC_IIN_EN | + BQ25703_ADC_IDCHG_EN | BQ25703_ADC_ICHG_EN | + BQ25703_ADC_VSYS_EN | BQ25703_ADC_VBAT_EN)); + + return ret; +} + +/** + * bq25703_hw_shutdown() - Set registers for shutdown + * @pdata: driver platform data + * + * Enable low power mode for the device while in shutdown. 
+ */ +static void bq25703_hw_shutdown(struct bq257xx_chg *pdata) +{ + regmap_update_bits(pdata->bq->regmap, BQ25703_CHARGE_OPTION_0, + BQ25703_EN_LWPWR, BQ25703_EN_LWPWR); +} + +static int bq257xx_set_charger_property(struct power_supply *psy, + enum power_supply_property prop, + const union power_supply_propval *val) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + + switch (prop) { + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return pdata->chip->bq257xx_set_iindpm(pdata, val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return pdata->chip->bq257xx_set_vbatreg(pdata, val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return pdata->chip->bq257xx_set_ichg(pdata, val->intval); + + default: + break; + } + + return -EINVAL; +} + +static int bq257xx_get_charger_property(struct power_supply *psy, + enum power_supply_property psp, + union power_supply_propval *val) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + int ret = 0; + + ret = pdata->chip->bq257xx_get_state(pdata); + if (ret) + return ret; + + switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + if (!pdata->online) + val->intval = POWER_SUPPLY_STATUS_DISCHARGING; + else if (pdata->fast_charge || pdata->pre_charge) + val->intval = POWER_SUPPLY_STATUS_CHARGING; + else + val->intval = POWER_SUPPLY_STATUS_NOT_CHARGING; + break; + + case POWER_SUPPLY_PROP_HEALTH: + if (pdata->ov_fault || pdata->batoc_fault) + val->intval = POWER_SUPPLY_HEALTH_OVERVOLTAGE; + else if (pdata->oc_fault) + val->intval = POWER_SUPPLY_HEALTH_OVERCURRENT; + else + val->intval = POWER_SUPPLY_HEALTH_GOOD; + break; + + case POWER_SUPPLY_PROP_MANUFACTURER: + val->strval = "Texas Instruments"; + break; + + case POWER_SUPPLY_PROP_ONLINE: + val->intval = pdata->online; + break; + + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return bq25703_get_iindpm(pdata, &val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return bq25703_get_chrg_volt(pdata, &val->intval); + + case POWER_SUPPLY_PROP_CURRENT_NOW: + return bq25703_get_cur(pdata, &val->intval); + + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return bq25703_get_vbat(pdata, &val->intval); + + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return bq25703_get_ichg_cur(pdata, &val->intval); + + case POWER_SUPPLY_PROP_VOLTAGE_MIN: + return bq25703_get_min_vsys(pdata, &val->intval); + + case POWER_SUPPLY_PROP_USB_TYPE: + val->intval = pdata->usb_type; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static enum power_supply_property bq257xx_power_supply_props[] = { + POWER_SUPPLY_PROP_MANUFACTURER, + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, + POWER_SUPPLY_PROP_VOLTAGE_MIN, + POWER_SUPPLY_PROP_USB_TYPE, +}; + +static int bq257xx_property_is_writeable(struct power_supply *psy, + enum power_supply_property prop) +{ + switch (prop) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return true; + default: + return false; + } +} + +/** + * bq257xx_external_power_changed() - Handler for external power change + * @psy: Power supply data + * + * When the external power into the charger is changed, check the USB + * type so that it can be reported. 
Additionally, update the max input + * current and max charging current to the value reported if it is a + * USB PD charger, otherwise use the default value. Note that each time + * a charger is removed the max charge current register is erased, so + * it must be set again each time the input changes or the device will + * not charge. + */ +static void bq257xx_external_power_changed(struct power_supply *psy) +{ + struct bq257xx_chg *pdata = power_supply_get_drvdata(psy); + union power_supply_propval val; + int ret; + int imax = pdata->iindpm_max; + + pdata->chip->bq257xx_get_state(pdata); + + pdata->supplied = power_supply_am_i_supplied(pdata->charger); + if (pdata->supplied < 0) + return; + + if (pdata->supplied == 0) + goto out; + + ret = power_supply_get_property_from_supplier(psy, + POWER_SUPPLY_PROP_USB_TYPE, + &val); + if (ret) + return; + + pdata->usb_type = val.intval; + + if ((pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD) || + (pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD_DRP) || + (pdata->usb_type == POWER_SUPPLY_USB_TYPE_PD_PPS)) { + ret = power_supply_get_property_from_supplier(psy, + POWER_SUPPLY_PROP_CURRENT_MAX, + &val); + if (ret) + return; + + if (val.intval) + imax = val.intval; + } + + if (pdata->supplied) { + pdata->chip->bq257xx_set_ichg(pdata, pdata->ichg_max); + pdata->chip->bq257xx_set_iindpm(pdata, imax); + pdata->chip->bq257xx_set_vbatreg(pdata, pdata->vbat_max); + } + +out: + power_supply_changed(psy); +} + +static irqreturn_t bq257xx_irq_handler_thread(int irq, void *private) +{ + struct bq257xx_chg *pdata = private; + + bq257xx_external_power_changed(pdata->charger); + return IRQ_HANDLED; +} + +static const struct power_supply_desc bq257xx_power_supply_desc = { + .name = "bq257xx-charger", + .type = POWER_SUPPLY_TYPE_USB, + .usb_types = BIT(POWER_SUPPLY_USB_TYPE_C) | + BIT(POWER_SUPPLY_USB_TYPE_PD) | + BIT(POWER_SUPPLY_USB_TYPE_PD_DRP) | + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS) | + BIT(POWER_SUPPLY_USB_TYPE_UNKNOWN), + .properties = bq257xx_power_supply_props, + .num_properties = ARRAY_SIZE(bq257xx_power_supply_props), + .get_property = bq257xx_get_charger_property, + .set_property = bq257xx_set_charger_property, + .property_is_writeable = bq257xx_property_is_writeable, + .external_power_changed = bq257xx_external_power_changed, +}; + +static const struct bq257xx_chip_info bq25703_chip_info = { + .bq257xx_hw_init = &bq25703_hw_init, + .bq257xx_hw_shutdown = &bq25703_hw_shutdown, + .bq257xx_get_state = &bq25703_get_state, + .bq257xx_set_ichg = &bq25703_set_ichg_cur, + .bq257xx_set_vbatreg = &bq25703_set_chrg_volt, + .bq257xx_set_iindpm = &bq25703_set_iindpm, +}; + +/** + * bq257xx_parse_dt() - Parse the device tree for required properties + * @pdata: driver platform data + * @psy_cfg: power supply config data + * @dev: device struct + * + * Read the device tree to identify the minimum system voltage, the + * maximum charge current, the maximum charge voltage, and the maximum + * input current. + * + * Return: Returns 0 on success or error code on error. 
+ */
+static int bq257xx_parse_dt(struct bq257xx_chg *pdata,
+			    struct power_supply_config *psy_cfg, struct device *dev)
+{
+	struct power_supply_battery_info *bat_info;
+	int ret;
+
+	ret = power_supply_get_battery_info(pdata->charger,
+					    &bat_info);
+	if (ret)
+		return dev_err_probe(dev, ret,
+				     "Unable to get battery info\n");
+
+	if ((bat_info->voltage_min_design_uv <= 0) ||
+	    (bat_info->constant_charge_voltage_max_uv <= 0) ||
+	    (bat_info->constant_charge_current_max_ua <= 0))
+		return dev_err_probe(dev, -EINVAL,
+				     "Required bat info missing or invalid\n");
+
+	pdata->vsys_min = bat_info->voltage_min_design_uv;
+	pdata->vbat_max = bat_info->constant_charge_voltage_max_uv;
+	pdata->ichg_max = bat_info->constant_charge_current_max_ua;
+
+	power_supply_put_battery_info(pdata->charger, bat_info);
+
+	ret = device_property_read_u32(dev,
+				       "input-current-limit-microamp",
+				       &pdata->iindpm_max);
+	if (ret)
+		pdata->iindpm_max = BQ25703_IINDPM_DEFAULT_UA;
+
+	return 0;
+}
+
+static int bq257xx_charger_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct bq257xx_device *bq = dev_get_drvdata(pdev->dev.parent);
+	struct bq257xx_chg *pdata;
+	struct power_supply_config psy_cfg = { };
+	int ret;
+
+	device_set_of_node_from_dev(dev, pdev->dev.parent);
+
+	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
+
+	pdata->bq = bq;
+	pdata->chip = &bq25703_chip_info;
+
+	platform_set_drvdata(pdev, pdata);
+
+	psy_cfg.drv_data = pdata;
+	psy_cfg.fwnode = dev_fwnode(dev);
+
+	pdata->charger = devm_power_supply_register(dev,
+						    &bq257xx_power_supply_desc,
+						    &psy_cfg);
+	if (IS_ERR(pdata->charger))
+		return dev_err_probe(dev, PTR_ERR(pdata->charger),
+				     "Failed to register charger power supply\n");
+
+	ret = bq257xx_parse_dt(pdata, &psy_cfg, dev);
+	if (ret)
+		return ret;
+
+	ret = pdata->chip->bq257xx_hw_init(pdata);
+	if (ret)
+		return dev_err_probe(dev, ret, "Cannot initialize the charger\n");
+
+	if (bq->client->irq) {
+		ret = devm_request_threaded_irq(dev, bq->client->irq, NULL,
+						bq257xx_irq_handler_thread,
+						IRQF_TRIGGER_RISING |
+						IRQF_TRIGGER_FALLING |
+						IRQF_ONESHOT,
+						dev_name(&bq->client->dev), pdata);
+		if (ret < 0)
+			return dev_err_probe(dev, ret, "Failed to request charger IRQ\n");
+	}
+
+	return 0;
+}
+
+static void bq257xx_charger_shutdown(struct platform_device *pdev)
+{
+	struct bq257xx_chg *pdata = platform_get_drvdata(pdev);
+
+	pdata->chip->bq257xx_hw_shutdown(pdata);
+}
+
+static struct platform_driver bq257xx_chg_driver = {
+	.driver = {
+		.name = "bq257xx-charger",
+	},
+	.probe = bq257xx_charger_probe,
+	.shutdown = bq257xx_charger_shutdown,
+};
+module_platform_driver(bq257xx_chg_driver);
+
+MODULE_DESCRIPTION("bq257xx charger driver");
+MODULE_AUTHOR("Chris Morgan ");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 93dcebbe114175..19445e39651c71 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1127,7 +1127,7 @@ static int poll_interval_param_set(const char *val, const struct kernel_param *k
 	mutex_lock(&bq27xxx_list_lock);
 	list_for_each_entry(di, &bq27xxx_battery_devices, list)
-		mod_delayed_work(system_wq, &di->work, 0);
+		mod_delayed_work(system_percpu_wq, &di->work, 0);
 	mutex_unlock(&bq27xxx_list_lock);
 
 	return ret;
@@ -1919,8 +1919,8 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	bool has_singe_flag = di->opts & BQ27XXX_O_ZERO;
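The mod_delayed_work() hunk above moves the bq27xxx polling work from the legacy system_wq alias to system_percpu_wq; the rk817 and ipaq-micro hunks later in the series make the same substitution. This only names the default per-CPU workqueue explicitly and does not change queueing behavior. A minimal sketch of the polling pattern these drivers share, with illustrative names that are not part of the patch:

#include <linux/workqueue.h>

static struct delayed_work sketch_poll_work;
static unsigned int sketch_poll_interval = 8;	/* seconds, 0 disables polling */

static void sketch_poll(struct work_struct *work)
{
	/* ... read the fuel-gauge, then call power_supply_changed() ... */

	/* re-arm on the explicitly named per-CPU system workqueue */
	if (sketch_poll_interval)
		mod_delayed_work(system_percpu_wq, &sketch_poll_work,
				 sketch_poll_interval * HZ);
}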
cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); - if ((cache.flags & 0xff) == 0xff) - cache.flags = -1; /* read error */ + if (di->chip == BQ27000 && (cache.flags & 0xff) == 0xff) + cache.flags = -ENODEV; /* bq27000 hdq read error */ if (cache.flags >= 0) { cache.capacity = bq27xxx_battery_read_soc(di); @@ -1945,7 +1945,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) di->last_update = jiffies; if (!di->removed && poll_interval > 0) - mod_delayed_work(system_wq, &di->work, poll_interval * HZ); + mod_delayed_work(system_percpu_wq, &di->work, poll_interval * HZ); } void bq27xxx_battery_update(struct bq27xxx_device_info *di) @@ -2221,14 +2221,7 @@ static void bq27xxx_external_power_changed(struct power_supply *psy) struct bq27xxx_device_info *di = power_supply_get_drvdata(psy); /* After charger plug in/out wait 0.5s for things to stabilize */ - mod_delayed_work(system_wq, &di->work, HZ / 2); -} - -static void bq27xxx_battery_mutex_destroy(void *data) -{ - struct mutex *lock = data; - - mutex_destroy(lock); + mod_delayed_work(system_percpu_wq, &di->work, HZ / 2); } int bq27xxx_battery_setup(struct bq27xxx_device_info *di) @@ -2242,9 +2235,7 @@ int bq27xxx_battery_setup(struct bq27xxx_device_info *di) int ret; INIT_DELAYED_WORK(&di->work, bq27xxx_battery_poll); - mutex_init(&di->lock); - ret = devm_add_action_or_reset(di->dev, bq27xxx_battery_mutex_destroy, - &di->lock); + ret = devm_mutex_init(di->dev, &di->lock); if (ret) return ret; diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index f63c3c41045155..2263d5d3448fdf 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -506,10 +506,7 @@ static int cw_battery_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_CHARGE_FULL: case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: - if (cw_bat->battery->charge_full_design_uah > 0) - val->intval = cw_bat->battery->charge_full_design_uah; - else - val->intval = 0; + val->intval = max(cw_bat->battery->charge_full_design_uah, 0); break; case POWER_SUPPLY_PROP_CHARGE_NOW: @@ -702,8 +699,7 @@ static int cw_bat_probe(struct i2c_client *client) if (!cw_bat->battery_workqueue) return -ENOMEM; - devm_delayed_work_autocancel(&client->dev, - &cw_bat->battery_delay_work, cw_bat_work); + devm_delayed_work_autocancel(&client->dev, &cw_bat->battery_delay_work, cw_bat_work); queue_delayed_work(cw_bat->battery_workqueue, &cw_bat->battery_delay_work, msecs_to_jiffies(10)); return 0; diff --git a/drivers/power/supply/gpio-charger.c b/drivers/power/supply/gpio-charger.c index 1b2da9b5fb6541..2504190eba82e6 100644 --- a/drivers/power/supply/gpio-charger.c +++ b/drivers/power/supply/gpio-charger.c @@ -79,7 +79,8 @@ static int set_charge_current_limit(struct gpio_charger *gpio_charger, int val) for (i = 0; i < ndescs; i++) { bool val = (mapping.gpiodata >> i) & 1; - gpiod_set_value_cansleep(gpios[ndescs-i-1], val); + + gpiod_set_value_cansleep(gpios[ndescs - i - 1], val); } gpio_charger->charge_current_limit = mapping.limit_ua; @@ -226,14 +227,14 @@ static int init_charge_current_limit(struct device *dev, gpio_charger->current_limit_map_size = len / 2; len = device_property_read_u32_array(dev, "charge-current-limit-mapping", - (u32*) gpio_charger->current_limit_map, len); + (u32 *) gpio_charger->current_limit_map, len); if (len < 0) return len; set_def_limit = !device_property_read_u32(dev, "charge-current-limit-default-microamp", &def_limit); - for (i=0; i < 
gpio_charger->current_limit_map_size; i++) { + for (i = 0; i < gpio_charger->current_limit_map_size; i++) { if (gpio_charger->current_limit_map[i].limit_ua > cur_limit) { dev_err(dev, "charge-current-limit-mapping not sorted by current in descending order\n"); return -EINVAL; diff --git a/drivers/power/supply/intel_dc_ti_battery.c b/drivers/power/supply/intel_dc_ti_battery.c new file mode 100644 index 00000000000000..56b0c92e9d28a1 --- /dev/null +++ b/drivers/power/supply/intel_dc_ti_battery.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Battery driver for the coulomb-counter of the Intel Dollar Cove TI PMIC + * + * Note the Intel Dollar Cove TI PMIC coulomb-counter is not a full-featured + * autonomous fuel-gauge. It is intended to work together with an always on + * micro-controller monitoring it. + * + * Since Linux does not monitor coulomb-counter changes while the device + * is off or suspended, voltage based capacity estimation from + * the adc-battery-helper code is used. + * + * Copyright (C) 2024 Hans de Goede + * + * Register definitions and calibration code was taken from + * kernel/drivers/platform/x86/dc_ti_cc.c from the Acer A1-840 Android kernel + * which has the following copyright header: + * + * Copyright (C) 2014 Intel Corporation + * Author: Ramakrishna Pallala + * + * dc_ti_cc.c is part of the Acer A1-840 Android kernel source-code archive + * named: "App. Guide_Acer_20151221_A_A.zip" + * which is distributed by Acer from the Acer A1-840 support page: + * https://www.acer.com/us-en/support/product-support/A1-840/downloads + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "adc-battery-helper.h" + +#define DC_TI_PMIC_VERSION_REG 0x00 +#define PMIC_VERSION_A0 0xC0 +#define PMIC_VERSION_A1 0xC1 + +#define DC_TI_CC_CNTL_REG 0x60 +#define CC_CNTL_CC_CTR_EN BIT(0) +#define CC_CNTL_CC_CLR_EN BIT(1) +#define CC_CNTL_CC_CAL_EN BIT(2) +#define CC_CNTL_CC_OFFSET_EN BIT(3) +#define CC_CNTL_SMPL_INTVL GENMASK(5, 4) +#define CC_CNTL_SMPL_INTVL_15MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 0) +#define CC_CNTL_SMPL_INTVL_62MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 1) +#define CC_CNTL_SMPL_INTVL_125MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 2) +#define CC_CNTL_SMPL_INTVL_250MS FIELD_PREP(CC_CNTL_SMPL_INTVL, 3) + +#define DC_TI_SMPL_CTR0_REG 0x69 +#define DC_TI_SMPL_CTR1_REG 0x68 +#define DC_TI_SMPL_CTR2_REG 0x67 + +#define DC_TI_CC_OFFSET_HI_REG 0x61 +#define CC_OFFSET_HI_MASK 0x3F +#define DC_TI_CC_OFFSET_LO_REG 0x62 + +#define DC_TI_SW_OFFSET_REG 0x6C + +#define DC_TI_CC_ACC3_REG 0x63 +#define DC_TI_CC_ACC2_REG 0x64 +#define DC_TI_CC_ACC1_REG 0x65 +#define DC_TI_CC_ACC0_REG 0x66 + +#define DC_TI_CC_INTG1_REG 0x6A +#define DC_TI_CC_INTG1_MASK 0x3F +#define DC_TI_CC_INTG0_REG 0x6B + +#define DC_TI_EEPROM_ACCESS_CONTROL 0x88 +#define EEPROM_UNLOCK 0xDA +#define EEPROM_LOCK 0x00 + +#define DC_TI_EEPROM_CC_GAIN_REG 0xF4 +#define CC_TRIM_REVISION GENMASK(3, 0) +#define CC_GAIN_CORRECTION GENMASK(7, 4) + +#define PMIC_VERSION_A0_TRIM_REV 3 +#define PMIC_VERSION_A1_MIN_TRIM_REV 1 + +#define DC_TI_EEPROM_CC_OFFSET_REG 0xFD + +#define DC_TI_EEPROM_CTRL 0xFE +#define EEPROM_BANK0_SEL 0x01 +#define EEPROM_BANK1_SEL 0x02 + +#define SMPL_INTVL_US 15000 +#define SMPL_INTVL_MS (SMPL_INTVL_US / USEC_PER_MSEC) +#define CALIBRATION_TIME_US (10 * SMPL_INTVL_US) +#define SLEEP_SLACK_US 2500 + +/* CC gain correction is in 0.0025 increments */ +#define CC_GAIN_STEP 25 +#define CC_GAIN_DIV 10000 
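The sample-interval and gain constants above, together with the CC_ACC_TO_UA() macro defined just below, determine how raw accumulator counts become a current reading: one count is 366.2 µC (366200 µA*ms), and smpl_ctr samples span smpl_ctr * 15 ms. A standalone restatement of the same integer math, with an illustrative function name and example values (not part of the driver):

/* I(µA) = charge(µC) / time(s) = acc * 366.2 / (smpl_ctr * 0.015) */
static long cc_acc_to_ua_sketch(long acc, long smpl_ctr)
{
	/* 3662 * 1000 / 10 == 366200 µA*ms per accumulator count */
	return acc * (3662 * 1000 / 10) / (smpl_ctr * 15);
}

For example, acc = 1000 counts over smpl_ctr = 67 samples (roughly one second) gives 1000 * 366200 / 1005 = 364378, i.e. about 364 mA.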
+
+/* CC offset is in 0.5 units per 250ms (default sample interval) */
+#define CC_OFFSET_DIV			2
+#define CC_OFFSET_SMPL_INTVL_MS		250
+
+/* CC accumulator scale is 366.2 µC / unit */
+#define CC_ACC_TO_UA(acc, smpl_ctr) \
+	((acc) * (3662 * MSEC_PER_SEC / 10) / ((smpl_ctr) * SMPL_INTVL_MS))
+
+#define DEV_NAME "chtdc_ti_battery"
+
+struct dc_ti_battery_chip {
+	/* Must be the first member, see adc-battery-helper documentation */
+	struct adc_battery_helper helper;
+	struct device *dev;
+	struct regmap *regmap;
+	struct iio_channel *vbat_channel;
+	struct power_supply *psy;
+	int cc_gain;
+	int cc_offset;
+};
+
+static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr)
+{
+	struct dc_ti_battery_chip *chip = power_supply_get_drvdata(psy);
+	s64 cnt_start_usec, now_usec, sleep_usec;
+	unsigned int reg_val;
+	s32 acc, smpl_ctr;
+	int ret;
+
+	/*
+	 * Enable coulomb-counter before reading Vbat from ADC, so that the CC
+	 * samples are from the same time period as the Vbat reading.
+	 */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN | CC_CNTL_CC_CTR_EN);
+	if (ret)
+		goto out_err;
+
+	cnt_start_usec = ktime_get_ns() / NSEC_PER_USEC;
+
+	/* Read Vbat, convert IIO mV to power-supply µV */
+	ret = iio_read_channel_processed_scale(chip->vbat_channel, volt, 1000);
+	if (ret < 0)
+		goto out_err;
+
+	/* Sleep at least 3 sample-times + slack to get 3+ CC samples */
+	now_usec = ktime_get_ns() / NSEC_PER_USEC;
+	sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - (now_usec - cnt_start_usec);
+	if (sleep_usec > 0 && sleep_usec < 1000000)
+		usleep_range(sleep_usec, sleep_usec + SLEEP_SLACK_US);
+
+	/*
+	 * The PMIC latches the coulomb- and sample-counters upon reading the
+	 * CC_ACC0 register. Reading multiple registers at once is not supported.
+	 *
+	 * Step 1: Read CC_ACC0 - CC_ACC3
+	 */
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC0_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc = reg_val;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC1_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 8;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC2_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 16;
+
+	ret = regmap_read(chip->regmap, DC_TI_CC_ACC3_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	acc |= reg_val << 24;
+
+	/* Step 2: Read SMPL_CTR0 - SMPL_CTR2 */
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR0_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr = reg_val;
+
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR1_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr |= reg_val << 8;
+
+	ret = regmap_read(chip->regmap, DC_TI_SMPL_CTR2_REG, &reg_val);
+	if (ret)
+		goto out_err;
+
+	smpl_ctr |= reg_val << 16;
+
+	/* Disable the coulomb-counter again */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN);
+	if (ret)
+		goto out_err;
+
+	/* Apply calibration */
+	acc -= chip->cc_offset * smpl_ctr * SMPL_INTVL_MS /
+	       (CC_OFFSET_SMPL_INTVL_MS * CC_OFFSET_DIV);
+	acc = acc * (CC_GAIN_DIV - chip->cc_gain * CC_GAIN_STEP) / CC_GAIN_DIV;
+	*curr = CC_ACC_TO_UA(acc, smpl_ctr);
+
+	return 0;
+
+out_err:
+	dev_err(chip->dev, "IO-error %d communicating with PMIC\n", ret);
+	return ret;
+}
+
+static const struct power_supply_desc dc_ti_battery_psy_desc = {
+	.name = "intel_dc_ti_battery",
+	.type = POWER_SUPPLY_TYPE_BATTERY,
+	.get_property = adc_battery_helper_get_property,
+	.external_power_changed = adc_battery_helper_external_power_changed,
+	.properties = adc_battery_helper_properties,
+	.num_properties = ADC_HELPER_NUM_PROPERTIES,
+};
+
+static int dc_ti_battery_hw_init(struct dc_ti_battery_chip *chip)
+{
+	u8 pmic_version, cc_trim_rev;
+	unsigned int reg_val;
+	int ret;
+
+	/* Set sample rate to 15 ms and calibrate the coulomb-counter */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN |
+			   CC_CNTL_CC_CAL_EN | CC_CNTL_CC_CTR_EN);
+	if (ret)
+		goto out;
+
+	fsleep(CALIBRATION_TIME_US);
+
+	/* Disable the coulomb-counter; it is only used while getting the current */
+	ret = regmap_write(chip->regmap, DC_TI_CC_CNTL_REG,
+			   CC_CNTL_SMPL_INTVL_15MS | CC_CNTL_CC_OFFSET_EN);
+	if (ret)
+		goto out;
+
+	ret = regmap_read(chip->regmap, DC_TI_PMIC_VERSION_REG, &reg_val);
+	if (ret)
+		goto out;
+
+	pmic_version = reg_val;
+
+	/*
+	 * As per the PMIC vendor (TI), the calibration offset and gain error
+	 * values are stored in EEPROM Bank 0 and Bank 1 of the PMIC. Read the
+	 * stored offset and gain corrections and apply them to the raw
+	 * coulomb-counter value.
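+	 *
+	 * Worked example (values assumed): a stored cc_offset of 4 with
+	 * smpl_ctr = 67 (about one second of 15 ms samples) subtracts
+	 * 4 * 67 * 15 / (250 * 2) = 8 counts, and cc_gain = -2 scales the
+	 * remainder by (10000 + 2 * 25) / 10000 = 1.005 in the integer math
+	 * of dc_ti_battery_get_voltage_and_current_now() above.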
+ */ + + /* Unlock the EEPROM Access */ + ret = regmap_write(chip->regmap, DC_TI_EEPROM_ACCESS_CONTROL, EEPROM_UNLOCK); + if (ret) + goto out; + + /* Select Bank 1 to read CC GAIN Err correction */ + ret = regmap_write(chip->regmap, DC_TI_EEPROM_CTRL, EEPROM_BANK1_SEL); + if (ret) + goto out; + + ret = regmap_read(chip->regmap, DC_TI_EEPROM_CC_GAIN_REG, ®_val); + if (ret) + goto out; + + cc_trim_rev = FIELD_GET(CC_TRIM_REVISION, reg_val); + + dev_dbg(chip->dev, "pmic-ver 0x%02x trim-rev %d\n", pmic_version, cc_trim_rev); + + if (!(pmic_version == PMIC_VERSION_A0 && cc_trim_rev == PMIC_VERSION_A0_TRIM_REV) && + !(pmic_version == PMIC_VERSION_A1 && cc_trim_rev >= PMIC_VERSION_A1_MIN_TRIM_REV)) { + dev_dbg(chip->dev, "unsupported trim-revision, using uncalibrated CC values\n"); + goto out_relock; + } + + chip->cc_gain = 1 - (int)FIELD_GET(CC_GAIN_CORRECTION, reg_val); + + /* Select Bank 0 to read CC OFFSET Correction */ + ret = regmap_write(chip->regmap, DC_TI_EEPROM_CTRL, EEPROM_BANK0_SEL); + if (ret) + goto out_relock; + + ret = regmap_read(chip->regmap, DC_TI_EEPROM_CC_OFFSET_REG, ®_val); + if (ret) + goto out_relock; + + chip->cc_offset = (s8)reg_val; + + dev_dbg(chip->dev, "cc-offset %d cc-gain %d\n", chip->cc_offset, chip->cc_gain); + +out_relock: + /* Re-lock the EEPROM Access */ + regmap_write(chip->regmap, DC_TI_EEPROM_ACCESS_CONTROL, EEPROM_LOCK); +out: + if (ret) + dev_err(chip->dev, "IO-error %d initializing PMIC\n", ret); + + return ret; +} + +static int dc_ti_battery_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct intel_soc_pmic *pmic = dev_get_drvdata(dev->parent); + struct power_supply_config psy_cfg = {}; + struct fwnode_reference_args args; + struct gpio_desc *charge_finished; + struct dc_ti_battery_chip *chip; + int ret; + + /* On most devices with a Dollar Cove TI the battery is handled by ACPI */ + if (!acpi_quirk_skip_acpi_ac_and_battery()) + return -ENODEV; + + /* ACPI glue code adds a "monitored-battery" fwnode, wait for this */ + ret = fwnode_property_get_reference_args(dev_fwnode(dev), "monitored-battery", + NULL, 0, 0, &args); + if (ret) { + dev_dbg(dev, "fwnode_property_get_ref() ret %d\n", ret); + return dev_err_probe(dev, -EPROBE_DEFER, "Waiting for monitored-battery fwnode\n"); + } + + fwnode_handle_put(args.fwnode); + + chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->dev = dev; + chip->regmap = pmic->regmap; + + chip->vbat_channel = devm_iio_channel_get(dev, "VBAT"); + if (IS_ERR(chip->vbat_channel)) { + dev_dbg(dev, "devm_iio_channel_get() ret %ld\n", PTR_ERR(chip->vbat_channel)); + return dev_err_probe(dev, -EPROBE_DEFER, "Waiting for VBAT IIO channel\n"); + } + + charge_finished = devm_gpiod_get_optional(dev, "charged", GPIOD_IN); + if (IS_ERR(charge_finished)) + return dev_err_probe(dev, PTR_ERR(charge_finished), "Getting charged GPIO\n"); + + ret = dc_ti_battery_hw_init(chip); + if (ret) + return ret; + + platform_set_drvdata(pdev, chip); + + psy_cfg.drv_data = chip; + chip->psy = devm_power_supply_register(dev, &dc_ti_battery_psy_desc, &psy_cfg); + if (IS_ERR(chip->psy)) + return PTR_ERR(chip->psy); + + return adc_battery_helper_init(&chip->helper, chip->psy, + dc_ti_battery_get_voltage_and_current_now, + charge_finished); +} + +static DEFINE_RUNTIME_DEV_PM_OPS(dc_ti_battery_pm_ops, adc_battery_helper_suspend, + adc_battery_helper_resume, NULL); + +static struct platform_driver dc_ti_battery_driver = { + .driver = { + .name = DEV_NAME, + .pm = 
pm_sleep_ptr(&dc_ti_battery_pm_ops), + }, + .probe = dc_ti_battery_probe, +}; +module_platform_driver(dc_ti_battery_driver); + +MODULE_ALIAS("platform:" DEV_NAME); +MODULE_AUTHOR("Hans de Goede "); +MODULE_DESCRIPTION("Intel Dollar Cove (TI) battery driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/ipaq_micro_battery.c b/drivers/power/supply/ipaq_micro_battery.c index 7e0568a5353f16..ff8573a5ca6d09 100644 --- a/drivers/power/supply/ipaq_micro_battery.c +++ b/drivers/power/supply/ipaq_micro_battery.c @@ -232,7 +232,8 @@ static int micro_batt_probe(struct platform_device *pdev) return -ENOMEM; mb->micro = dev_get_drvdata(pdev->dev.parent); - mb->wq = alloc_workqueue("ipaq-battery-wq", WQ_MEM_RECLAIM, 0); + mb->wq = alloc_workqueue("ipaq-battery-wq", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!mb->wq) return -ENOMEM; diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 329b430d0e5065..b1a227bf72e26f 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -40,31 +40,30 @@ static enum power_supply_property max77705_charger_props[] = { POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, }; -static int max77705_chgin_irq(void *irq_drv_data) +static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) { - struct max77705_charger_data *charger = irq_drv_data; + struct max77705_charger_data *chg = irq_drv_data; - queue_work(charger->wqueue, &charger->chgin_work); + queue_work(chg->wqueue, &chg->chgin_work); - return 0; + return IRQ_HANDLED; } static const struct regmap_irq max77705_charger_irqs[] = { - { .mask = MAX77705_BYP_IM, }, - { .mask = MAX77705_INP_LIMIT_IM, }, - { .mask = MAX77705_BATP_IM, }, - { .mask = MAX77705_BAT_IM, }, - { .mask = MAX77705_CHG_IM, }, - { .mask = MAX77705_WCIN_IM, }, - { .mask = MAX77705_CHGIN_IM, }, - { .mask = MAX77705_AICL_IM, }, + REGMAP_IRQ_REG_LINE(MAX77705_BYP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_INP_LIMIT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BATP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BAT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHG_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_WCIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHGIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE), }; static struct regmap_irq_chip max77705_charger_irq_chip = { .name = "max77705-charger", .status_base = MAX77705_CHG_REG_INT, .mask_base = MAX77705_CHG_REG_INT_MASK, - .handle_post_irq = max77705_chgin_irq, .num_regs = 1, .irqs = max77705_charger_irqs, .num_irqs = ARRAY_SIZE(max77705_charger_irqs), @@ -74,8 +73,7 @@ static int max77705_charger_enable(struct max77705_charger_data *chg) { int rv; - rv = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, MAX77705_CHG_EN_MASK); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], 1); if (rv) dev_err(chg->dev, "unable to enable the charger: %d\n", rv); @@ -87,10 +85,7 @@ static void max77705_charger_disable(void *data) struct max77705_charger_data *chg = data; int rv; - rv = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, - MAX77705_CHG_DISABLE); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], MAX77705_CHG_DISABLE); if (rv) dev_err(chg->dev, "unable to disable the charger: %d\n", rv); } @@ -109,19 +104,30 @@ static int max77705_get_online(struct regmap *regmap, int *val) return 0; } -static int max77705_check_battery(struct max77705_charger_data *charger, int *val) +static int 
max77705_set_integer(struct max77705_charger_data *chg, enum max77705_field_idx fidx, + unsigned int clamp_min, unsigned int clamp_max, + unsigned int div, int val) +{ + unsigned int regval; + + regval = clamp_val(val, clamp_min, clamp_max) / div; + + return regmap_field_write(chg->rfield[fidx], regval); +} + +static int max77705_check_battery(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; unsigned int reg_data2; - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; regmap_read(regmap, MAX77705_CHG_REG_INT_OK, ®_data); - dev_dbg(charger->dev, "CHG_INT_OK(0x%x)\n", reg_data); + dev_dbg(chg->dev, "CHG_INT_OK(0x%x)\n", reg_data); regmap_read(regmap, MAX77705_CHG_REG_DETAILS_00, ®_data2); - dev_dbg(charger->dev, "CHG_DETAILS00(0x%x)\n", reg_data2); + dev_dbg(chg->dev, "CHG_DETAILS00(0x%x)\n", reg_data2); if ((reg_data & MAX77705_BATP_OK) || !(reg_data2 & MAX77705_BATP_DTLS)) *val = true; @@ -131,13 +137,13 @@ static int max77705_check_battery(struct max77705_charger_data *charger, int *va return 0; } -static int max77705_get_charge_type(struct max77705_charger_data *charger, int *val) +static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; - unsigned int reg_data; + struct regmap *regmap = chg->regmap; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -159,13 +165,13 @@ static int max77705_get_charge_type(struct max77705_charger_data *charger, int * return 0; } -static int max77705_get_status(struct max77705_charger_data *charger, int *val) +static int max77705_get_status(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; - unsigned int reg_data; + struct regmap *regmap = chg->regmap; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -234,10 +240,10 @@ static int max77705_get_vbus_state(struct regmap *regmap, int *value) return 0; } -static int max77705_get_battery_health(struct max77705_charger_data *charger, +static int max77705_get_battery_health(struct max77705_charger_data *chg, int *value) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; unsigned int bat_dtls; regmap_read(regmap, MAX77705_CHG_REG_DETAILS_01, &bat_dtls); @@ -245,16 +251,16 @@ static int max77705_get_battery_health(struct max77705_charger_data *charger, switch (bat_dtls) { case MAX77705_BATTERY_NOBAT: - dev_dbg(charger->dev, "%s: No battery and the charger is suspended\n", + dev_dbg(chg->dev, "%s: No battery and the chg is suspended\n", __func__); *value = POWER_SUPPLY_HEALTH_NO_BATTERY; break; case MAX77705_BATTERY_PREQUALIFICATION: - dev_dbg(charger->dev, "%s: battery is okay but its voltage is low(~VPQLB)\n", + dev_dbg(chg->dev, "%s: battery is okay but its voltage is low(~VPQLB)\n", __func__); break; case MAX77705_BATTERY_DEAD: - dev_dbg(charger->dev, "%s: battery dead\n", __func__); + dev_dbg(chg->dev, "%s: battery dead\n", __func__); *value = POWER_SUPPLY_HEALTH_DEAD; break; case MAX77705_BATTERY_GOOD: @@ -262,11 +268,11 @@ static int max77705_get_battery_health(struct max77705_charger_data 
*charger, *value = POWER_SUPPLY_HEALTH_GOOD; break; case MAX77705_BATTERY_OVERVOLTAGE: - dev_dbg(charger->dev, "%s: battery ovp\n", __func__); + dev_dbg(chg->dev, "%s: battery ovp\n", __func__); *value = POWER_SUPPLY_HEALTH_OVERVOLTAGE; break; default: - dev_dbg(charger->dev, "%s: battery unknown\n", __func__); + dev_dbg(chg->dev, "%s: battery unknown\n", __func__); *value = POWER_SUPPLY_HEALTH_UNSPEC_FAILURE; break; } @@ -274,9 +280,9 @@ static int max77705_get_battery_health(struct max77705_charger_data *charger, return 0; } -static int max77705_get_health(struct max77705_charger_data *charger, int *val) +static int max77705_get_health(struct max77705_charger_data *chg, int *val) { - struct regmap *regmap = charger->regmap; + struct regmap *regmap = chg->regmap; int ret, is_online = 0; ret = max77705_get_online(regmap, &is_online); @@ -287,24 +293,19 @@ static int max77705_get_health(struct max77705_charger_data *charger, int *val) if (ret || (*val != POWER_SUPPLY_HEALTH_GOOD)) return ret; } - return max77705_get_battery_health(charger, val); + return max77705_get_battery_health(chg, val); } -static int max77705_get_input_current(struct max77705_charger_data *charger, +static int max77705_get_input_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; int get_current = 0; - struct regmap *regmap = charger->regmap; - - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - reg_data &= MAX77705_CHG_CHGIN_LIM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®_data); if (reg_data <= 3) get_current = MAX77705_CURRENT_CHGIN_MIN; - else if (reg_data >= MAX77705_CHG_CHGIN_LIM_MASK) - get_current = MAX77705_CURRENT_CHGIN_MAX; else get_current = (reg_data + 1) * MAX77705_CURRENT_CHGIN_STEP; @@ -313,26 +314,23 @@ static int max77705_get_input_current(struct max77705_charger_data *charger, return 0; } -static int max77705_get_charge_current(struct max77705_charger_data *charger, +static int max77705_get_charge_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; - struct regmap *regmap = charger->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_02, ®_data); - reg_data &= MAX77705_CHG_CC; + regmap_field_read(chg->rfield[MAX77705_CHG_CC_LIM], ®_data); *val = reg_data <= 0x2 ? MAX77705_CURRENT_CHGIN_MIN : reg_data * MAX77705_CURRENT_CHG_STEP; return 0; } -static int max77705_set_float_voltage(struct max77705_charger_data *charger, +static int max77705_set_float_voltage(struct max77705_charger_data *chg, int float_voltage) { int float_voltage_mv; unsigned int reg_data = 0; - struct regmap *regmap = charger->regmap; float_voltage_mv = float_voltage / 1000; reg_data = float_voltage_mv <= 4000 ? 0x0 : @@ -340,20 +338,16 @@ static int max77705_set_float_voltage(struct max77705_charger_data *charger, (float_voltage_mv <= 4200) ? (float_voltage_mv - 4000) / 50 : (((float_voltage_mv - 4200) / 10) + 0x04); - return regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_04, - MAX77705_CHG_CV_PRM_MASK, - (reg_data << MAX77705_CHG_CV_PRM_SHIFT)); + return regmap_field_write(chg->rfield[MAX77705_CHG_CV_PRM], reg_data); } -static int max77705_get_float_voltage(struct max77705_charger_data *charger, +static int max77705_get_float_voltage(struct max77705_charger_data *chg, int *val) { unsigned int reg_data = 0; int voltage_mv; - struct regmap *regmap = charger->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_04, ®_data); - reg_data &= MAX77705_CHG_PRM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CV_PRM], ®_data); voltage_mv = reg_data <= 0x04 ? 
reg_data * 50 + 4000 : (reg_data - 4) * 10 + 4200; *val = voltage_mv * 1000; @@ -365,28 +359,28 @@ static int max77705_chg_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) { - struct max77705_charger_data *charger = power_supply_get_drvdata(psy); - struct regmap *regmap = charger->regmap; + struct max77705_charger_data *chg = power_supply_get_drvdata(psy); + struct regmap *regmap = chg->regmap; switch (psp) { case POWER_SUPPLY_PROP_ONLINE: return max77705_get_online(regmap, &val->intval); case POWER_SUPPLY_PROP_PRESENT: - return max77705_check_battery(charger, &val->intval); + return max77705_check_battery(chg, &val->intval); case POWER_SUPPLY_PROP_STATUS: - return max77705_get_status(charger, &val->intval); + return max77705_get_status(chg, &val->intval); case POWER_SUPPLY_PROP_CHARGE_TYPE: - return max77705_get_charge_type(charger, &val->intval); + return max77705_get_charge_type(chg, &val->intval); case POWER_SUPPLY_PROP_HEALTH: - return max77705_get_health(charger, &val->intval); + return max77705_get_health(chg, &val->intval); case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: - return max77705_get_input_current(charger, &val->intval); + return max77705_get_input_current(chg, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: - return max77705_get_charge_current(charger, &val->intval); + return max77705_get_charge_current(chg, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE: - return max77705_get_float_voltage(charger, &val->intval); + return max77705_get_float_voltage(chg, &val->intval); case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: - val->intval = charger->bat_info->voltage_max_design_uv; + val->intval = chg->bat_info->voltage_max_design_uv; break; case POWER_SUPPLY_PROP_MODEL_NAME: val->strval = max77705_charger_model; @@ -400,74 +394,131 @@ static int max77705_chg_get_property(struct power_supply *psy, return 0; } +static int max77705_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *val) +{ + struct max77705_charger_data *chg = power_supply_get_drvdata(psy); + int err = 0; + + switch (psp) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: + err = max77705_set_integer(chg, MAX77705_CHG_CC_LIM, + MAX77705_CURRENT_CHGIN_MIN, + MAX77705_CURRENT_CHGIN_MAX, + MAX77705_CURRENT_CHG_STEP, + val->intval); + break; + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + err = max77705_set_integer(chg, MAX77705_CHG_CHGIN_LIM, + MAX77705_CURRENT_CHGIN_MIN, + MAX77705_CURRENT_CHGIN_MAX, + MAX77705_CURRENT_CHGIN_STEP, + val->intval); + break; + default: + err = -EINVAL; + } + + return err; +}; + +static int max77705_property_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: + case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: + return true; + default: + return false; + } +} + static const struct power_supply_desc max77705_charger_psy_desc = { .name = "max77705-charger", - .type = POWER_SUPPLY_TYPE_USB, + .type = POWER_SUPPLY_TYPE_USB, .properties = max77705_charger_props, + .property_is_writeable = max77705_property_is_writeable, .num_properties = ARRAY_SIZE(max77705_charger_props), .get_property = max77705_chg_get_property, + .set_property = max77705_set_property, }; static void max77705_chgin_isr_work(struct work_struct *work) { - struct max77705_charger_data *charger = + struct max77705_charger_data *chg = container_of(work, struct max77705_charger_data, chgin_work); - 
power_supply_changed(charger->psy_chg); + power_supply_changed(chg->psy_chg); } -static void max77705_charger_initialize(struct max77705_charger_data *chg) +static int max77705_charger_initialize(struct max77705_charger_data *chg) { - u8 reg_data; struct power_supply_battery_info *info; struct regmap *regmap = chg->regmap; + int err; - if (power_supply_get_battery_info(chg->psy_chg, &info) < 0) - return; + err = power_supply_get_battery_info(chg->psy_chg, &info); + if (err) + return dev_err_probe(chg->dev, err, "error on getting battery info"); chg->bat_info = info; /* unlock charger setting protect */ /* slowest LX slope */ - reg_data = MAX77705_CHGPROT_MASK | MAX77705_SLOWEST_LX_SLOPE; - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_06, reg_data, - reg_data); + err = regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); + if (err) + goto err; /* fast charge timer disable */ /* restart threshold disable */ /* pre-qual charge disable */ - reg_data = (MAX77705_FCHGTIME_DISABLE << MAX77705_FCHGTIME_SHIFT) | - (MAX77705_CHG_RSTRT_DISABLE << MAX77705_CHG_RSTRT_SHIFT) | - (MAX77705_CHG_PQEN_DISABLE << MAX77705_PQEN_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_01, - (MAX77705_FCHGTIME_MASK | - MAX77705_CHG_RSTRT_MASK | - MAX77705_PQEN_MASK), - reg_data); - - /* OTG off(UNO on), boost off */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, - MAX77705_OTG_CTRL, 0); + err = regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_MODE], + MAX77705_CHG_MASK | MAX77705_BUCK_MASK); + if (err) + goto err; /* charge current 450mA(default) */ /* otg current limit 900mA */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_02, - MAX77705_OTG_ILIM_MASK, - MAX77705_OTG_ILIM_900 << MAX77705_OTG_ILIM_SHIFT); + err = regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); + if (err) + goto err; /* BAT to SYS OCP 4.80A */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_05, - MAX77705_REG_B2SOVRC_MASK, - MAX77705_B2SOVRC_4_8A << MAX77705_REG_B2SOVRC_SHIFT); + err = regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + if (err) + goto err; + /* top off current 150mA */ /* top off timer 30min */ - reg_data = (MAX77705_TO_ITH_150MA << MAX77705_TO_ITH_SHIFT) | - (MAX77705_TO_TIME_30M << MAX77705_TO_TIME_SHIFT) | - (MAX77705_SYS_TRACK_DISABLE << MAX77705_SYS_TRACK_DIS_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_03, - (MAX77705_TO_ITH_MASK | - MAX77705_TO_TIME_MASK | - MAX77705_SYS_TRACK_DIS_MASK), reg_data); + err = regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); + if (err) + goto err; /* cv voltage 4.2V or 4.35V */ /* MINVSYS 3.6V(default) */ @@ -478,28 +529,38 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) max77705_set_float_voltage(chg, info->voltage_max_design_uv); } - regmap_update_bits(regmap, 
MAX77705_CHG_REG_CNFG_12, - MAX77705_VCHGIN_REG_MASK, MAX77705_VCHGIN_4_5); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_WCIN_REG_MASK, MAX77705_WCIN_4_5); + err = regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); + if (err) + goto err; + + err = regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); + if (err) + goto err; /* Watchdog timer */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); - /* Active Discharge Enable */ - regmap_update_bits(regmap, MAX77705_PMIC_REG_MAINCTRL1, 1, 1); - /* VBYPSET=5.0V */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_11, MAX77705_VBYPSET_MASK, 0); + err = regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); + if (err) + goto err; /* Switching Frequency : 1.5MHz */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_08, MAX77705_REG_FSW_MASK, - (MAX77705_CHG_FSW_1_5MHz << MAX77705_REG_FSW_SHIFT)); + err = regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); + if (err) + goto err; /* Auto skip mode */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, MAX77705_REG_DISKIP_MASK, - (MAX77705_AUTO_SKIP << MAX77705_REG_DISKIP_SHIFT)); + err = regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); + if (err) + goto err; + + return 0; + +err: + return dev_err_probe(chg->dev, err, "error while configuring"); + } static int max77705_charger_probe(struct i2c_client *i2c) @@ -523,11 +584,13 @@ static int max77705_charger_probe(struct i2c_client *i2c) if (IS_ERR(chg->regmap)) return PTR_ERR(chg->regmap); - ret = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_INT_MASK, - MAX77705_CHGIN_IM, 0); - if (ret) - return ret; + for (int i = 0; i < MAX77705_N_REGMAP_FIELDS; i++) { + chg->rfield[i] = devm_regmap_field_alloc(dev, chg->regmap, + max77705_reg_field[i]); + if (IS_ERR(chg->rfield[i])) + return dev_err_probe(dev, PTR_ERR(chg->rfield[i]), + "cannot allocate regmap field\n"); + } pscfg.fwnode = dev_fwnode(dev); pscfg.drv_data = chg; @@ -538,7 +601,7 @@ static int max77705_charger_probe(struct i2c_client *i2c) max77705_charger_irq_chip.irq_drv_data = chg; ret = devm_regmap_add_irq_chip(chg->dev, chg->regmap, i2c->irq, - IRQF_ONESHOT | IRQF_SHARED, 0, + IRQF_ONESHOT, 0, &max77705_charger_irq_chip, &irq_data); if (ret) @@ -546,7 +609,7 @@ static int max77705_charger_probe(struct i2c_client *i2c) chg->wqueue = create_singlethread_workqueue(dev_name(dev)); if (!chg->wqueue) - return dev_err_probe(dev, -ENOMEM, "failed to create workqueue\n"); + return -ENOMEM; ret = devm_work_autocancel(dev, &chg->chgin_work, max77705_chgin_isr_work); if (ret) { @@ -554,7 +617,20 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } - max77705_charger_initialize(chg); + ret = max77705_charger_initialize(chg); + if (ret) { + dev_err_probe(dev, ret, "failed to initialize charger IC\n"); + goto destroy_wq; + } + + ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_CHGIN_I), + NULL, max77705_chgin_irq, + IRQF_TRIGGER_NONE, + "chgin-irq", chg); + if (ret) { + dev_err_probe(dev, ret, "Failed to Request chgin IRQ\n"); + goto destroy_wq; + } ret = max77705_charger_enable(chg); if (ret) { diff --git a/drivers/power/supply/max77976_charger.c b/drivers/power/supply/max77976_charger.c index e6fe68cebc32b6..3d6ff400553305 100644 --- a/drivers/power/supply/max77976_charger.c +++ b/drivers/power/supply/max77976_charger.c @@ -292,10 +292,10 @@ static int max77976_get_property(struct power_supply *psy, case 
POWER_SUPPLY_PROP_ONLINE: err = max77976_get_online(chg, &val->intval); break; - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: val->intval = MAX77976_CHG_CC_MAX; break; - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: err = max77976_get_integer(chg, CHG_CC, MAX77976_CHG_CC_MIN, MAX77976_CHG_CC_MAX, @@ -330,7 +330,7 @@ static int max77976_set_property(struct power_supply *psy, int err = 0; switch (psp) { - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: err = max77976_set_integer(chg, CHG_CC, MAX77976_CHG_CC_MIN, MAX77976_CHG_CC_MAX, @@ -355,7 +355,7 @@ static int max77976_property_is_writeable(struct power_supply *psy, enum power_supply_property psp) { switch (psp) { - case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT: + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: case POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT: return true; default: @@ -368,8 +368,8 @@ static enum power_supply_property max77976_psy_props[] = { POWER_SUPPLY_PROP_CHARGE_TYPE, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_ONLINE, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, - POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT, + POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, diff --git a/drivers/power/supply/mt6370-charger.c b/drivers/power/supply/mt6370-charger.c index 98579998b300db..e6db961d5818d7 100644 --- a/drivers/power/supply/mt6370-charger.c +++ b/drivers/power/supply/mt6370-charger.c @@ -761,13 +761,6 @@ static int mt6370_chg_init_psy(struct mt6370_priv *priv) return PTR_ERR_OR_ZERO(priv->psy); } -static void mt6370_chg_destroy_attach_lock(void *data) -{ - struct mutex *attach_lock = data; - - mutex_destroy(attach_lock); -} - static void mt6370_chg_destroy_wq(void *data) { struct workqueue_struct *wq = data; @@ -894,22 +887,19 @@ static int mt6370_chg_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "Failed to init psy\n"); - mutex_init(&priv->attach_lock); - ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_attach_lock, - &priv->attach_lock); + ret = devm_mutex_init(dev, &priv->attach_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init attach lock\n"); + return ret; priv->attach = MT6370_ATTACH_STAT_DETACH; priv->wq = create_singlethread_workqueue(dev_name(priv->dev)); if (!priv->wq) - return dev_err_probe(dev, -ENOMEM, - "Failed to create workqueue\n"); + return -ENOMEM; ret = devm_add_action_or_reset(dev, mt6370_chg_destroy_wq, priv->wq); if (ret) - return dev_err_probe(dev, ret, "Failed to init wq\n"); + return ret; ret = devm_work_autocancel(dev, &priv->bc12_work, mt6370_chg_bc12_work_func); if (ret) diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 18e5e84a81c634..198405f7126f96 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -223,6 +223,8 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), + POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), + POWER_SUPPLY_ATTR(STATE_OF_HEALTH), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/drivers/power/supply/qcom_battmgr.c 
b/drivers/power/supply/qcom_battmgr.c index 99808ea9851f6a..3c2837ef346173 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -2,10 +2,12 @@ /* * Copyright (c) 2019-2020, The Linux Foundation. All rights reserved. * Copyright (c) 2022, Linaro Ltd + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. */ #include #include #include +#include #include #include #include @@ -18,8 +20,10 @@ #define BATTMGR_STRING_LEN 128 enum qcom_battmgr_variant { - QCOM_BATTMGR_SM8350, QCOM_BATTMGR_SC8280XP, + QCOM_BATTMGR_SM8350, + QCOM_BATTMGR_SM8550, + QCOM_BATTMGR_X1E80100, }; #define BATTMGR_BAT_STATUS 0x1 @@ -30,8 +34,9 @@ enum qcom_battmgr_variant { #define NOTIF_BAT_PROPERTY 0x30 #define NOTIF_USB_PROPERTY 0x32 #define NOTIF_WLS_PROPERTY 0x34 -#define NOTIF_BAT_INFO 0x81 #define NOTIF_BAT_STATUS 0x80 +#define NOTIF_BAT_INFO 0x81 +#define NOTIF_BAT_CHARGING_STATE 0x83 #define BATTMGR_BAT_INFO 0x9 @@ -65,6 +70,9 @@ enum qcom_battmgr_variant { #define BATT_RESISTANCE 21 #define BATT_POWER_NOW 22 #define BATT_POWER_AVG 23 +#define BATT_CHG_CTRL_EN 24 +#define BATT_CHG_CTRL_START_THR 25 +#define BATT_CHG_CTRL_END_THR 26 #define BATTMGR_USB_PROPERTY_GET 0x32 #define BATTMGR_USB_PROPERTY_SET 0x33 @@ -89,6 +97,13 @@ enum qcom_battmgr_variant { #define WLS_TYPE 5 #define WLS_BOOST_EN 6 +#define BATTMGR_CHG_CTRL_LIMIT_EN 0x48 +#define CHARGE_CTRL_START_THR_MIN 50 +#define CHARGE_CTRL_START_THR_MAX 95 +#define CHARGE_CTRL_END_THR_MIN 55 +#define CHARGE_CTRL_END_THR_MAX 100 +#define CHARGE_CTRL_DELTA_SOC 5 + struct qcom_battmgr_enable_request { struct pmic_glink_hdr hdr; __le32 battery_id; @@ -123,6 +138,13 @@ struct qcom_battmgr_discharge_time_request { __le32 reserved; }; +struct qcom_battmgr_charge_ctrl_request { + struct pmic_glink_hdr hdr; + __le32 enable; + __le32 target_soc; + __le32 delta_soc; +}; + struct qcom_battmgr_message { struct pmic_glink_hdr hdr; union { @@ -235,6 +257,8 @@ struct qcom_battmgr_info { unsigned int capacity_warning; unsigned int cycle_count; unsigned int charge_count; + unsigned int charge_ctrl_start; + unsigned int charge_ctrl_end; char model_number[BATTMGR_STRING_LEN]; char serial_number[BATTMGR_STRING_LEN]; char oem_info[BATTMGR_STRING_LEN]; @@ -254,6 +278,8 @@ struct qcom_battmgr_status { unsigned int voltage_now; unsigned int voltage_ocv; unsigned int temperature; + unsigned int resistance; + unsigned int soh_percent; unsigned int discharge_time; unsigned int charge_time; @@ -418,7 +444,11 @@ static const u8 sm8350_bat_prop_map[] = { [POWER_SUPPLY_PROP_MODEL_NAME] = BATT_MODEL_NAME, [POWER_SUPPLY_PROP_TIME_TO_FULL_AVG] = BATT_TTF_AVG, [POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG] = BATT_TTE_AVG, + [POWER_SUPPLY_PROP_INTERNAL_RESISTANCE] = BATT_RESISTANCE, + [POWER_SUPPLY_PROP_STATE_OF_HEALTH] = BATT_SOH, [POWER_SUPPLY_PROP_POWER_NOW] = BATT_POWER_NOW, + [POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD] = BATT_CHG_CTRL_START_THR, + [POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD] = BATT_CHG_CTRL_END_THR, }; static int qcom_battmgr_bat_sm8350_update(struct qcom_battmgr *battmgr, @@ -489,7 +519,8 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_bat_sm8350_update(battmgr, psp); @@ -584,12 +615,24 @@ static int qcom_battmgr_bat_get_property(struct 
power_supply *psy, case POWER_SUPPLY_PROP_TEMP: val->intval = battmgr->status.temperature; break; + case POWER_SUPPLY_PROP_INTERNAL_RESISTANCE: + val->intval = battmgr->status.resistance; + break; + case POWER_SUPPLY_PROP_STATE_OF_HEALTH: + val->intval = battmgr->status.soh_percent; + break; case POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG: val->intval = battmgr->status.discharge_time; break; case POWER_SUPPLY_PROP_TIME_TO_FULL_AVG: val->intval = battmgr->status.charge_time; break; + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + val->intval = battmgr->info.charge_ctrl_start; + break; + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + val->intval = battmgr->info.charge_ctrl_end; + break; case POWER_SUPPLY_PROP_MANUFACTURE_YEAR: val->intval = battmgr->info.year; break; @@ -615,6 +658,149 @@ static int qcom_battmgr_bat_get_property(struct power_supply *psy, return 0; } +static int qcom_battmgr_set_charge_control(struct qcom_battmgr *battmgr, + u32 target_soc, u32 delta_soc) +{ + struct qcom_battmgr_charge_ctrl_request request = { + .hdr.owner = cpu_to_le32(PMIC_GLINK_OWNER_BATTMGR), + .hdr.type = cpu_to_le32(PMIC_GLINK_REQ_RESP), + .hdr.opcode = cpu_to_le32(BATTMGR_CHG_CTRL_LIMIT_EN), + .enable = cpu_to_le32(1), + .target_soc = cpu_to_le32(target_soc), + .delta_soc = cpu_to_le32(delta_soc), + }; + + return qcom_battmgr_request(battmgr, &request, sizeof(request)); +} + +static int qcom_battmgr_set_charge_start_threshold(struct qcom_battmgr *battmgr, int start_soc) +{ + u32 target_soc, delta_soc; + int ret; + + if (start_soc < CHARGE_CTRL_START_THR_MIN || + start_soc > CHARGE_CTRL_START_THR_MAX) { + dev_err(battmgr->dev, "charge control start threshold exceed range: [%u - %u]\n", + CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); + return -EINVAL; + } + + /* + * If the new start threshold is larger than the old end threshold, + * move the end threshold one step (DELTA_SOC) after the new start + * threshold. 
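+	 *
+	 * Worked example (values assumed): with the old end threshold at 80
+	 * and a requested start threshold of 90, target_soc becomes
+	 * min(90 + CHARGE_CTRL_DELTA_SOC, CHARGE_CTRL_END_THR_MAX) = 95 and
+	 * delta_soc becomes min(95 - 90, CHARGE_CTRL_DELTA_SOC) = 5, i.e.
+	 * the firmware stops charging at 95% and resumes 5% below that.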
+ */ + if (start_soc > battmgr->info.charge_ctrl_end) { + target_soc = start_soc + CHARGE_CTRL_DELTA_SOC; + target_soc = min_t(u32, target_soc, CHARGE_CTRL_END_THR_MAX); + delta_soc = target_soc - start_soc; + delta_soc = min_t(u32, delta_soc, CHARGE_CTRL_DELTA_SOC); + } else { + target_soc = battmgr->info.charge_ctrl_end; + delta_soc = battmgr->info.charge_ctrl_end - start_soc; + } + + mutex_lock(&battmgr->lock); + ret = qcom_battmgr_set_charge_control(battmgr, target_soc, delta_soc); + mutex_unlock(&battmgr->lock); + if (!ret) { + battmgr->info.charge_ctrl_start = start_soc; + battmgr->info.charge_ctrl_end = target_soc; + } + + return 0; +} + +static int qcom_battmgr_set_charge_end_threshold(struct qcom_battmgr *battmgr, int end_soc) +{ + u32 delta_soc = CHARGE_CTRL_DELTA_SOC; + int ret; + + if (end_soc < CHARGE_CTRL_END_THR_MIN || + end_soc > CHARGE_CTRL_END_THR_MAX) { + dev_err(battmgr->dev, "charge control end threshold exceed range: [%u - %u]\n", + CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); + return -EINVAL; + } + + if (battmgr->info.charge_ctrl_start && end_soc > battmgr->info.charge_ctrl_start) + delta_soc = end_soc - battmgr->info.charge_ctrl_start; + + mutex_lock(&battmgr->lock); + ret = qcom_battmgr_set_charge_control(battmgr, end_soc, delta_soc); + mutex_unlock(&battmgr->lock); + if (!ret) { + battmgr->info.charge_ctrl_start = end_soc - delta_soc; + battmgr->info.charge_ctrl_end = end_soc; + } + + return 0; +} + +static int qcom_battmgr_charge_control_thresholds_init(struct qcom_battmgr *battmgr) +{ + int ret; + u8 en, end_soc, start_soc, delta_soc; + + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_en", &en); + if (!ret && en != 0) { + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_end", &end_soc); + if (ret < 0) + return ret; + + ret = nvmem_cell_read_u8(battmgr->dev->parent, "charge_limit_delta", &delta_soc); + if (ret < 0) + return ret; + + if (delta_soc >= end_soc) + return -EINVAL; + + start_soc = end_soc - delta_soc; + end_soc = clamp(end_soc, CHARGE_CTRL_END_THR_MIN, CHARGE_CTRL_END_THR_MAX); + start_soc = clamp(start_soc, CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX); + + battmgr->info.charge_ctrl_start = start_soc; + battmgr->info.charge_ctrl_end = end_soc; + } + + return 0; +} + +static int qcom_battmgr_bat_is_writeable(struct power_supply *psy, + enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + return 1; + default: + return 0; + } + + return 0; +} + +static int qcom_battmgr_bat_set_property(struct power_supply *psy, + enum power_supply_property psp, + const union power_supply_propval *pval) +{ + struct qcom_battmgr *battmgr = power_supply_get_drvdata(psy); + + if (!battmgr->service_up) + return -EAGAIN; + + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD: + return qcom_battmgr_set_charge_start_threshold(battmgr, pval->intval); + case POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD: + return qcom_battmgr_set_charge_end_threshold(battmgr, pval->intval); + default: + return -EINVAL; + } + + return 0; +} + static const enum power_supply_property sc8280xp_bat_props[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_PRESENT, @@ -649,6 +835,43 @@ static const struct power_supply_desc sc8280xp_bat_psy_desc = { .get_property = qcom_battmgr_bat_get_property, }; +static const enum power_supply_property x1e80100_bat_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_PRESENT, + 
POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_CYCLE_COUNT, + POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, + POWER_SUPPLY_PROP_CHARGE_FULL, + POWER_SUPPLY_PROP_CHARGE_EMPTY, + POWER_SUPPLY_PROP_CHARGE_NOW, + POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN, + POWER_SUPPLY_PROP_ENERGY_FULL, + POWER_SUPPLY_PROP_ENERGY_EMPTY, + POWER_SUPPLY_PROP_ENERGY_NOW, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_MANUFACTURE_YEAR, + POWER_SUPPLY_PROP_MANUFACTURE_MONTH, + POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_MODEL_NAME, + POWER_SUPPLY_PROP_MANUFACTURER, + POWER_SUPPLY_PROP_SERIAL_NUMBER, + POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, + POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, +}; + +static const struct power_supply_desc x1e80100_bat_psy_desc = { + .name = "qcom-battmgr-bat", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = x1e80100_bat_props, + .num_properties = ARRAY_SIZE(x1e80100_bat_props), + .get_property = qcom_battmgr_bat_get_property, + .set_property = qcom_battmgr_bat_set_property, + .property_is_writeable = qcom_battmgr_bat_is_writeable, +}; + static const enum power_supply_property sm8350_bat_props[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_HEALTH, @@ -668,6 +891,8 @@ static const enum power_supply_property sm8350_bat_props[] = { POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_TIME_TO_FULL_AVG, POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, POWER_SUPPLY_PROP_POWER_NOW, }; @@ -679,6 +904,42 @@ static const struct power_supply_desc sm8350_bat_psy_desc = { .get_property = qcom_battmgr_bat_get_property, }; +static const enum power_supply_property sm8550_bat_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_HEALTH, + POWER_SUPPLY_PROP_PRESENT, + POWER_SUPPLY_PROP_CHARGE_TYPE, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_VOLTAGE_OCV, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_VOLTAGE_MAX, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_TEMP, + POWER_SUPPLY_PROP_TECHNOLOGY, + POWER_SUPPLY_PROP_CHARGE_COUNTER, + POWER_SUPPLY_PROP_CYCLE_COUNT, + POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, + POWER_SUPPLY_PROP_CHARGE_FULL, + POWER_SUPPLY_PROP_MODEL_NAME, + POWER_SUPPLY_PROP_TIME_TO_FULL_AVG, + POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, + POWER_SUPPLY_PROP_POWER_NOW, + POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD, + POWER_SUPPLY_PROP_CHARGE_CONTROL_END_THRESHOLD, +}; + +static const struct power_supply_desc sm8550_bat_psy_desc = { + .name = "qcom-battmgr-bat", + .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = sm8550_bat_props, + .num_properties = ARRAY_SIZE(sm8550_bat_props), + .get_property = qcom_battmgr_bat_get_property, + .set_property = qcom_battmgr_bat_set_property, + .property_is_writeable = qcom_battmgr_bat_is_writeable, +}; + static int qcom_battmgr_ac_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) @@ -754,7 +1015,8 @@ static int qcom_battmgr_usb_get_property(struct power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_usb_sm8350_update(battmgr, psp); @@ -876,7 +1138,8 @@ static int qcom_battmgr_wls_get_property(struct 
power_supply *psy, if (!battmgr->service_up) return -EAGAIN; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) ret = qcom_battmgr_bat_sc8280xp_update(battmgr, psp); else ret = qcom_battmgr_wls_sm8350_update(battmgr, psp); @@ -947,12 +1210,14 @@ static void qcom_battmgr_notification(struct qcom_battmgr *battmgr, } notification = le32_to_cpu(msg->notification); + notification &= 0xff; switch (notification) { case NOTIF_BAT_INFO: battmgr->info.valid = false; fallthrough; case NOTIF_BAT_STATUS: case NOTIF_BAT_PROPERTY: + case NOTIF_BAT_CHARGING_STATE: power_supply_changed(battmgr->bat_psy); break; case NOTIF_USB_PROPERTY: @@ -982,7 +1247,8 @@ static void qcom_battmgr_sc8280xp_strcpy(char *dest, const char *src) static unsigned int qcom_battmgr_sc8280xp_parse_technology(const char *chemistry) { - if (!strncmp(chemistry, "LIO", BATTMGR_CHEMISTRY_LEN)) + if ((!strncmp(chemistry, "LIO", BATTMGR_CHEMISTRY_LEN)) || + (!strncmp(chemistry, "OOI", BATTMGR_CHEMISTRY_LEN))) return POWER_SUPPLY_TECHNOLOGY_LION; if (!strncmp(chemistry, "LIP", BATTMGR_CHEMISTRY_LEN)) return POWER_SUPPLY_TECHNOLOGY_LIPO; @@ -1095,6 +1361,9 @@ static void qcom_battmgr_sc8280xp_callback(struct qcom_battmgr *battmgr, case BATTMGR_BAT_CHARGE_TIME: battmgr->status.charge_time = le32_to_cpu(resp->time); break; + case BATTMGR_CHG_CTRL_LIMIT_EN: + battmgr->error = 0; + break; default: dev_warn(battmgr->dev, "unknown message %#x\n", opcode); break; @@ -1159,6 +1428,9 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, case BATT_CAPACITY: battmgr->status.percent = le32_to_cpu(resp->intval.value) / 100; break; + case BATT_SOH: + battmgr->status.soh_percent = le32_to_cpu(resp->intval.value); + break; case BATT_VOLT_OCV: battmgr->status.voltage_ocv = le32_to_cpu(resp->intval.value); break; @@ -1199,9 +1471,18 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, case BATT_TTE_AVG: battmgr->status.discharge_time = le32_to_cpu(resp->intval.value); break; + case BATT_RESISTANCE: + battmgr->status.resistance = le32_to_cpu(resp->intval.value); + break; case BATT_POWER_NOW: battmgr->status.power_now = le32_to_cpu(resp->intval.value); break; + case BATT_CHG_CTRL_START_THR: + battmgr->info.charge_ctrl_start = le32_to_cpu(resp->intval.value); + break; + case BATT_CHG_CTRL_END_THR: + battmgr->info.charge_ctrl_end = le32_to_cpu(resp->intval.value); + break; default: dev_warn(battmgr->dev, "unknown property %#x\n", property); break; @@ -1284,6 +1565,7 @@ static void qcom_battmgr_sm8350_callback(struct qcom_battmgr *battmgr, } break; case BATTMGR_REQUEST_NOTIFICATION: + case BATTMGR_CHG_CTRL_LIMIT_EN: battmgr->error = 0; break; default: @@ -1303,7 +1585,8 @@ static void qcom_battmgr_callback(const void *data, size_t len, void *priv) if (opcode == BATTMGR_NOTIFICATION) qcom_battmgr_notification(battmgr, data, len); - else if (battmgr->variant == QCOM_BATTMGR_SC8280XP) + else if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) qcom_battmgr_sc8280xp_callback(battmgr, data, len); else qcom_battmgr_sm8350_callback(battmgr, data, len); @@ -1339,7 +1622,8 @@ static void qcom_battmgr_pdr_notify(void *priv, int state) static const struct of_device_id qcom_battmgr_of_variants[] = { { .compatible = "qcom,sc8180x-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, { .compatible = "qcom,sc8280xp-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, - { .compatible = 
"qcom,x1e80100-pmic-glink", .data = (void *)QCOM_BATTMGR_SC8280XP }, + { .compatible = "qcom,sm8550-pmic-glink", .data = (void *)QCOM_BATTMGR_SM8550 }, + { .compatible = "qcom,x1e80100-pmic-glink", .data = (void *)QCOM_BATTMGR_X1E80100 }, /* Unmatched devices falls back to QCOM_BATTMGR_SM8350 */ {} }; @@ -1349,11 +1633,13 @@ static char *qcom_battmgr_battery[] = { "battery" }; static int qcom_battmgr_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { + const struct power_supply_desc *psy_desc; struct power_supply_config psy_cfg_supply = {}; struct power_supply_config psy_cfg = {}; const struct of_device_id *match; struct qcom_battmgr *battmgr; struct device *dev = &adev->dev; + int ret; battmgr = devm_kzalloc(dev, sizeof(*battmgr), GFP_KERNEL); if (!battmgr) @@ -1379,8 +1665,19 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, else battmgr->variant = QCOM_BATTMGR_SM8350; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) { - battmgr->bat_psy = devm_power_supply_register(dev, &sc8280xp_bat_psy_desc, &psy_cfg); + ret = qcom_battmgr_charge_control_thresholds_init(battmgr); + if (ret < 0) + return dev_err_probe(dev, ret, + "failed to init battery charge control thresholds\n"); + + if (battmgr->variant == QCOM_BATTMGR_SC8280XP || + battmgr->variant == QCOM_BATTMGR_X1E80100) { + if (battmgr->variant == QCOM_BATTMGR_X1E80100) + psy_desc = &x1e80100_bat_psy_desc; + else + psy_desc = &sc8280xp_bat_psy_desc; + + battmgr->bat_psy = devm_power_supply_register(dev, psy_desc, &psy_cfg); if (IS_ERR(battmgr->bat_psy)) return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), "failed to register battery power supply\n"); @@ -1400,7 +1697,12 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, return dev_err_probe(dev, PTR_ERR(battmgr->wls_psy), "failed to register wireless charing power supply\n"); } else { - battmgr->bat_psy = devm_power_supply_register(dev, &sm8350_bat_psy_desc, &psy_cfg); + if (battmgr->variant == QCOM_BATTMGR_SM8550) + psy_desc = &sm8550_bat_psy_desc; + else + psy_desc = &sm8350_bat_psy_desc; + + battmgr->bat_psy = devm_power_supply_register(dev, psy_desc, &psy_cfg); if (IS_ERR(battmgr->bat_psy)) return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), "failed to register battery power supply\n"); diff --git a/drivers/power/supply/rk817_charger.c b/drivers/power/supply/rk817_charger.c index 1251022eb052c1..9436c6bbf51fb4 100644 --- a/drivers/power/supply/rk817_charger.c +++ b/drivers/power/supply/rk817_charger.c @@ -1046,7 +1046,7 @@ static void rk817_charging_monitor(struct work_struct *work) rk817_read_props(charger); /* Run every 8 seconds like the BSP driver did. */ - queue_delayed_work(system_wq, &charger->work, msecs_to_jiffies(8000)); + queue_delayed_work(system_percpu_wq, &charger->work, msecs_to_jiffies(8000)); } static void rk817_cleanup_node(void *data) @@ -1206,7 +1206,7 @@ static int rk817_charger_probe(struct platform_device *pdev) return ret; /* Force the first update immediately. 
*/ - mod_delayed_work(system_wq, &charger->work, 0); + mod_delayed_work(system_percpu_wq, &charger->work, 0); return 0; } @@ -1226,7 +1226,7 @@ static int __maybe_unused rk817_resume(struct device *dev) struct rk817_charger *charger = dev_get_drvdata(dev); /* force an immediate update */ - mod_delayed_work(system_wq, &charger->work, 0); + mod_delayed_work(system_percpu_wq, &charger->work, 0); return 0; } diff --git a/drivers/power/supply/rt9467-charger.c b/drivers/power/supply/rt9467-charger.c index e9aba9ad393c9c..fe773dd8b404f3 100644 --- a/drivers/power/supply/rt9467-charger.c +++ b/drivers/power/supply/rt9467-charger.c @@ -633,7 +633,9 @@ static int rt9467_psy_set_ieoc(struct rt9467_chg_data *data, int microamp) static const enum power_supply_property rt9467_chg_properties[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_ONLINE, + POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_CURRENT_MAX, + POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT, POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX, POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE, @@ -656,6 +658,8 @@ static int rt9467_psy_get_property(struct power_supply *psy, return rt9467_psy_get_status(data, &val->intval); case POWER_SUPPLY_PROP_ONLINE: return regmap_field_read(data->rm_field[F_PWR_RDY], &val->intval); + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + return rt9467_get_adc(data, RT9467_ADC_VBUS_DIV5, &val->intval); case POWER_SUPPLY_PROP_CURRENT_MAX: mutex_lock(&data->attach_lock); if (data->psy_usb_type == POWER_SUPPLY_USB_TYPE_UNKNOWN || @@ -665,6 +669,8 @@ static int rt9467_psy_get_property(struct power_supply *psy, val->intval = 1500000; mutex_unlock(&data->attach_lock); return 0; + case POWER_SUPPLY_PROP_CURRENT_NOW: + return rt9467_get_adc(data, RT9467_ADC_IBUS, &val->intval); case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT: mutex_lock(&data->ichg_ieoc_lock); val->intval = data->ichg_ua; @@ -1141,27 +1147,6 @@ static int rt9467_reset_chip(struct rt9467_chg_data *data) return regmap_field_write(data->rm_field[F_RST], 1); } -static void rt9467_chg_destroy_adc_lock(void *data) -{ - struct mutex *adc_lock = data; - - mutex_destroy(adc_lock); -} - -static void rt9467_chg_destroy_attach_lock(void *data) -{ - struct mutex *attach_lock = data; - - mutex_destroy(attach_lock); -} - -static void rt9467_chg_destroy_ichg_ieoc_lock(void *data) -{ - struct mutex *ichg_ieoc_lock = data; - - mutex_destroy(ichg_ieoc_lock); -} - static void rt9467_chg_complete_aicl_done(void *data) { struct completion *aicl_done = data; @@ -1214,29 +1199,23 @@ static int rt9467_charger_probe(struct i2c_client *i2c) if (ret) return dev_err_probe(dev, ret, "Failed to add irq chip\n"); - mutex_init(&data->adc_lock); - ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_adc_lock, - &data->adc_lock); + ret = devm_mutex_init(dev, &data->adc_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init ADC lock\n"); + return ret; - mutex_init(&data->attach_lock); - ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_attach_lock, - &data->attach_lock); + ret = devm_mutex_init(dev, &data->attach_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init attach lock\n"); + return ret; - mutex_init(&data->ichg_ieoc_lock); - ret = devm_add_action_or_reset(dev, rt9467_chg_destroy_ichg_ieoc_lock, - &data->ichg_ieoc_lock); + ret = devm_mutex_init(dev, &data->ichg_ieoc_lock); if (ret) - return dev_err_probe(dev, ret, "Failed to init ICHG/IEOC lock\n"); + return ret; init_completion(&data->aicl_done); ret = devm_add_action_or_reset(dev, 
rt9467_chg_complete_aicl_done, &data->aicl_done); if (ret) - return dev_err_probe(dev, ret, "Failed to init AICL done completion\n"); + return ret; ret = rt9467_do_charger_init(data); if (ret) diff --git a/drivers/power/supply/rx51_battery.c b/drivers/power/supply/rx51_battery.c index 7cdcd415e8684d..b0220ec2d92661 100644 --- a/drivers/power/supply/rx51_battery.c +++ b/drivers/power/supply/rx51_battery.c @@ -116,7 +116,7 @@ static int rx51_battery_read_temperature(struct rx51_device_info *di) int mid = (max + min) / 2; if (rx51_temp_table2[mid] <= raw) min = mid; - else if (rx51_temp_table2[mid] > raw) + else max = mid; if (rx51_temp_table2[mid] == raw) break; diff --git a/drivers/power/supply/sbs-charger.c b/drivers/power/supply/sbs-charger.c index 27764123b929e2..7d5e676205805d 100644 --- a/drivers/power/supply/sbs-charger.c +++ b/drivers/power/supply/sbs-charger.c @@ -154,8 +154,7 @@ static const struct regmap_config sbs_regmap = { .val_format_endian = REGMAP_ENDIAN_LITTLE, /* since based on SMBus */ }; -static const struct power_supply_desc sbs_desc = { - .name = "sbs-charger", +static const struct power_supply_desc sbs_default_desc = { .type = POWER_SUPPLY_TYPE_MAINS, .properties = sbs_properties, .num_properties = ARRAY_SIZE(sbs_properties), @@ -165,9 +164,20 @@ static const struct power_supply_desc sbs_desc = { static int sbs_probe(struct i2c_client *client) { struct power_supply_config psy_cfg = {}; + struct power_supply_desc *sbs_desc; struct sbs_info *chip; int ret, val; + sbs_desc = devm_kmemdup(&client->dev, &sbs_default_desc, + sizeof(*sbs_desc), GFP_KERNEL); + if (!sbs_desc) + return -ENOMEM; + + sbs_desc->name = devm_kasprintf(&client->dev, GFP_KERNEL, "sbs-%s", + dev_name(&client->dev)); + if (!sbs_desc->name) + return -ENOMEM; + chip = devm_kzalloc(&client->dev, sizeof(struct sbs_info), GFP_KERNEL); if (!chip) return -ENOMEM; @@ -191,7 +201,7 @@ static int sbs_probe(struct i2c_client *client) return dev_err_probe(&client->dev, ret, "Failed to get device status\n"); chip->last_state = val; - chip->power_supply = devm_power_supply_register(&client->dev, &sbs_desc, &psy_cfg); + chip->power_supply = devm_power_supply_register(&client->dev, sbs_desc, &psy_cfg); if (IS_ERR(chip->power_supply)) return dev_err_probe(&client->dev, PTR_ERR(chip->power_supply), "Failed to register power supply\n"); diff --git a/drivers/power/supply/sbs-manager.c b/drivers/power/supply/sbs-manager.c index 869729dfcd664c..6fe526222f7f4d 100644 --- a/drivers/power/supply/sbs-manager.c +++ b/drivers/power/supply/sbs-manager.c @@ -348,7 +348,7 @@ static int sbsm_probe(struct i2c_client *client) data->muxc = i2c_mux_alloc(adapter, dev, SBSM_MAX_BATS, 0, I2C_MUX_LOCKED, &sbsm_select, NULL); if (!data->muxc) - return dev_err_probe(dev, -ENOMEM, "failed to alloc i2c mux\n"); + return -ENOMEM; data->muxc->priv = data; ret = devm_add_action_or_reset(dev, sbsm_del_mux_adapter, data); diff --git a/drivers/power/supply/ucs1002_power.c b/drivers/power/supply/ucs1002_power.c index d32a7633f9e7d7..fe94435340de65 100644 --- a/drivers/power/supply/ucs1002_power.c +++ b/drivers/power/supply/ucs1002_power.c @@ -493,7 +493,7 @@ static irqreturn_t ucs1002_alert_irq(int irq, void *data) { struct ucs1002_info *info = data; - mod_delayed_work(system_wq, &info->health_poll, 0); + mod_delayed_work(system_percpu_wq, &info->health_poll, 0); return IRQ_HANDLED; } diff --git a/drivers/power/supply/ug3105_battery.c b/drivers/power/supply/ug3105_battery.c index e8a1de7cade0ce..210e0f9aa5e094 100644 --- 
a/drivers/power/supply/ug3105_battery.c +++ b/drivers/power/supply/ug3105_battery.c @@ -10,7 +10,22 @@ * is off or suspended, the coulomb counter is not used atm. * * Possible improvements: - * 1. Activate commented out total_coulomb_count code + * 1. Add coulomb counter reading, e.g. something like this: + * Read + reset coulomb counter every 10 polls (every 300 seconds) + * + * if ((chip->poll_count % 10) == 0) { + * val = ug3105_read_word(chip->client, UG3105_REG_COULOMB_CNT); + * if (val < 0) + * goto out; + * + * i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, + * UG3105_CTRL1_RESET_COULOMB_CNT); + * + * chip->total_coulomb_count += (s16)val; + * dev_dbg(&chip->client->dev, "coulomb count %d total %d\n", + * (s16)val, chip->total_coulomb_count); + * } + * * 2. Reset total_coulomb_count val to 0 when the battery is as good as empty * and remember that we did this (and clear the flag for this on susp/resume) * 3. When the battery is full check if the flag that we set total_coulomb_count @@ -31,24 +46,16 @@ * has shown that an estimated 7404mWh increase of the battery's energy results * in a total_coulomb_count increase of 3277 units with a 5 milli-ohm sense R. * - * Copyright (C) 2021 Hans de Goede + * Copyright (C) 2021 - 2025 Hans de Goede */ -#include #include -#include #include #include #include #include -#include - -#define UG3105_MOV_AVG_WINDOW 8 -#define UG3105_INIT_POLL_TIME (5 * HZ) -#define UG3105_POLL_TIME (30 * HZ) -#define UG3105_SETTLE_TIME (1 * HZ) -#define UG3105_INIT_POLL_COUNT 30 +#include "adc-battery-helper.h" #define UG3105_REG_MODE 0x00 #define UG3105_REG_CTRL1 0x01 @@ -61,34 +68,13 @@ #define UG3105_CTRL1_RESET_COULOMB_CNT 0x03 -#define UG3105_CURR_HYST_UA 65000 - -#define UG3105_LOW_BAT_UV 3700000 -#define UG3105_FULL_BAT_HYST_UV 38000 - -#define AMBIENT_TEMP_CELCIUS 25 - struct ug3105_chip { + /* Must be the first member, see adc-battery-helper documentation */ + struct adc_battery_helper helper; struct i2c_client *client; struct power_supply *psy; - struct delayed_work work; - struct mutex lock; - int ocv[UG3105_MOV_AVG_WINDOW]; /* micro-volt */ - int intern_res[UG3105_MOV_AVG_WINDOW]; /* milli-ohm */ - int poll_count; - int ocv_avg_index; - int ocv_avg; /* micro-volt */ - int intern_res_poll_count; - int intern_res_avg_index; - int intern_res_avg; /* milli-ohm */ - int volt; /* micro-volt */ - int curr; /* micro-ampere */ - int total_coulomb_count; int uv_per_unit; int ua_per_unit; - int status; - int capacity; - bool supplied; }; static int ug3105_read_word(struct i2c_client *client, u8 reg) @@ -102,230 +88,43 @@ static int ug3105_read_word(struct i2c_client *client, u8 reg) return val; } -static int ug3105_get_status(struct ug3105_chip *chip) -{ - int full = chip->psy->battery_info->constant_charge_voltage_max_uv - - UG3105_FULL_BAT_HYST_UV; - - if (chip->curr > UG3105_CURR_HYST_UA) - return POWER_SUPPLY_STATUS_CHARGING; - - if (chip->curr < -UG3105_CURR_HYST_UA) - return POWER_SUPPLY_STATUS_DISCHARGING; - - if (chip->supplied && chip->ocv_avg > full) - return POWER_SUPPLY_STATUS_FULL; - - return POWER_SUPPLY_STATUS_NOT_CHARGING; -} - -static void ug3105_work(struct work_struct *work) -{ - struct ug3105_chip *chip = container_of(work, struct ug3105_chip, - work.work); - int i, val, curr_diff, volt_diff, res, win_size; - bool prev_supplied = chip->supplied; - int prev_status = chip->status; - int prev_volt = chip->volt; - int prev_curr = chip->curr; - struct power_supply *psy; - - mutex_lock(&chip->lock); - - psy = chip->psy; - if (!psy) - goto out;
- - val = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); - if (val < 0) - goto out; - chip->volt = val * chip->uv_per_unit; - - val = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); - if (val < 0) - goto out; - chip->curr = (s16)val * chip->ua_per_unit; - - chip->ocv[chip->ocv_avg_index] = - chip->volt - chip->curr * chip->intern_res_avg / 1000; - chip->ocv_avg_index = (chip->ocv_avg_index + 1) % UG3105_MOV_AVG_WINDOW; - chip->poll_count++; - - /* - * See possible improvements comment above. - * - * Read + reset coulomb counter every 10 polls (every 300 seconds) - * if ((chip->poll_count % 10) == 0) { - * val = ug3105_read_word(chip->client, UG3105_REG_COULOMB_CNT); - * if (val < 0) - * goto out; - * - * i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, - * UG3105_CTRL1_RESET_COULOMB_CNT); - * - * chip->total_coulomb_count += (s16)val; - * dev_dbg(&chip->client->dev, "coulomb count %d total %d\n", - * (s16)val, chip->total_coulomb_count); - * } - */ - - chip->ocv_avg = 0; - win_size = min(chip->poll_count, UG3105_MOV_AVG_WINDOW); - for (i = 0; i < win_size; i++) - chip->ocv_avg += chip->ocv[i]; - chip->ocv_avg /= win_size; - - chip->supplied = power_supply_am_i_supplied(psy); - chip->status = ug3105_get_status(chip); - if (chip->status == POWER_SUPPLY_STATUS_FULL) - chip->capacity = 100; - else - chip->capacity = power_supply_batinfo_ocv2cap(chip->psy->battery_info, - chip->ocv_avg, - AMBIENT_TEMP_CELCIUS); - - /* - * Skip internal resistance calc on charger [un]plug and - * when the battery is almost empty (voltage low). - */ - if (chip->supplied != prev_supplied || - chip->volt < UG3105_LOW_BAT_UV || - chip->poll_count < 2) - goto out; - - /* - * Assuming that the OCV voltage does not change significantly - * between 2 polls, then we can calculate the internal resistance - * on a significant current change by attributing all voltage - * change between the 2 readings to the internal resistance. - */ - curr_diff = abs(chip->curr - prev_curr); - if (curr_diff < UG3105_CURR_HYST_UA) - goto out; - - volt_diff = abs(chip->volt - prev_volt); - res = volt_diff * 1000 / curr_diff; - - if ((res < (chip->intern_res_avg * 2 / 3)) || - (res > (chip->intern_res_avg * 4 / 3))) { - dev_dbg(&chip->client->dev, "Ignoring outlier internal resistance %d mOhm\n", res); - goto out; - } - - dev_dbg(&chip->client->dev, "Internal resistance %d mOhm\n", res); - - chip->intern_res[chip->intern_res_avg_index] = res; - chip->intern_res_avg_index = (chip->intern_res_avg_index + 1) % UG3105_MOV_AVG_WINDOW; - chip->intern_res_poll_count++; - - chip->intern_res_avg = 0; - win_size = min(chip->intern_res_poll_count, UG3105_MOV_AVG_WINDOW); - for (i = 0; i < win_size; i++) - chip->intern_res_avg += chip->intern_res[i]; - chip->intern_res_avg /= win_size; - -out: - mutex_unlock(&chip->lock); - - queue_delayed_work(system_wq, &chip->work, - (chip->poll_count <= UG3105_INIT_POLL_COUNT) ? 
- UG3105_INIT_POLL_TIME : UG3105_POLL_TIME); - - if (chip->status != prev_status && psy) - power_supply_changed(psy); -} - -static enum power_supply_property ug3105_battery_props[] = { - POWER_SUPPLY_PROP_STATUS, - POWER_SUPPLY_PROP_PRESENT, - POWER_SUPPLY_PROP_SCOPE, - POWER_SUPPLY_PROP_VOLTAGE_NOW, - POWER_SUPPLY_PROP_VOLTAGE_OCV, - POWER_SUPPLY_PROP_CURRENT_NOW, - POWER_SUPPLY_PROP_CAPACITY, -}; - -static int ug3105_get_property(struct power_supply *psy, - enum power_supply_property psp, - union power_supply_propval *val) +static int ug3105_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr) { struct ug3105_chip *chip = power_supply_get_drvdata(psy); - int ret = 0; - - mutex_lock(&chip->lock); + int ret; - if (!chip->psy) { - ret = -EAGAIN; - goto out; - } + ret = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); + if (ret < 0) + return ret; - switch (psp) { - case POWER_SUPPLY_PROP_STATUS: - val->intval = chip->status; - break; - case POWER_SUPPLY_PROP_PRESENT: - val->intval = 1; - break; - case POWER_SUPPLY_PROP_SCOPE: - val->intval = POWER_SUPPLY_SCOPE_SYSTEM; - break; - case POWER_SUPPLY_PROP_VOLTAGE_NOW: - ret = ug3105_read_word(chip->client, UG3105_REG_BAT_VOLT); - if (ret < 0) - break; - val->intval = ret * chip->uv_per_unit; - ret = 0; - break; - case POWER_SUPPLY_PROP_VOLTAGE_OCV: - val->intval = chip->ocv_avg; - break; - case POWER_SUPPLY_PROP_CURRENT_NOW: - ret = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); - if (ret < 0) - break; - val->intval = (s16)ret * chip->ua_per_unit; - ret = 0; - break; - case POWER_SUPPLY_PROP_CAPACITY: - val->intval = chip->capacity; - break; - default: - ret = -EINVAL; - } + *volt = ret * chip->uv_per_unit; -out: - mutex_unlock(&chip->lock); - return ret; -} - -static void ug3105_external_power_changed(struct power_supply *psy) -{ - struct ug3105_chip *chip = power_supply_get_drvdata(psy); + ret = ug3105_read_word(chip->client, UG3105_REG_BAT_CURR); + if (ret < 0) + return ret; - dev_dbg(&chip->client->dev, "external power changed\n"); - mod_delayed_work(system_wq, &chip->work, UG3105_SETTLE_TIME); + *curr = (s16)ret * chip->ua_per_unit; + return 0; } static const struct power_supply_desc ug3105_psy_desc = { .name = "ug3105_battery", .type = POWER_SUPPLY_TYPE_BATTERY, - .get_property = ug3105_get_property, - .external_power_changed = ug3105_external_power_changed, - .properties = ug3105_battery_props, - .num_properties = ARRAY_SIZE(ug3105_battery_props), + .get_property = adc_battery_helper_get_property, + .external_power_changed = adc_battery_helper_external_power_changed, + .properties = adc_battery_helper_properties, + .num_properties = ADC_HELPER_NUM_PROPERTIES, }; -static void ug3105_init(struct ug3105_chip *chip) +static void ug3105_start(struct i2c_client *client) +{ + i2c_smbus_write_byte_data(client, UG3105_REG_MODE, UG3105_MODE_RUN); + i2c_smbus_write_byte_data(client, UG3105_REG_CTRL1, UG3105_CTRL1_RESET_COULOMB_CNT); +} + +static void ug3105_stop(struct i2c_client *client) { - chip->poll_count = 0; - chip->ocv_avg_index = 0; - chip->total_coulomb_count = 0; - i2c_smbus_write_byte_data(chip->client, UG3105_REG_MODE, - UG3105_MODE_RUN); - i2c_smbus_write_byte_data(chip->client, UG3105_REG_CTRL1, - UG3105_CTRL1_RESET_COULOMB_CNT); - queue_delayed_work(system_wq, &chip->work, 0); - flush_delayed_work(&chip->work); + i2c_smbus_write_byte_data(client, UG3105_REG_MODE, UG3105_MODE_STANDBY); } static int ug3105_probe(struct i2c_client *client) @@ -333,7 +132,6 @@ static int ug3105_probe(struct i2c_client 
*client) struct power_supply_config psy_cfg = {}; struct device *dev = &client->dev; u32 curr_sense_res_uohm = 10000; - struct power_supply *psy; struct ug3105_chip *chip; int ret; @@ -342,23 +140,8 @@ static int ug3105_probe(struct i2c_client *client) return -ENOMEM; chip->client = client; - mutex_init(&chip->lock); - ret = devm_delayed_work_autocancel(dev, &chip->work, ug3105_work); - if (ret) - return ret; - psy_cfg.drv_data = chip; - psy = devm_power_supply_register(dev, &ug3105_psy_desc, &psy_cfg); - if (IS_ERR(psy)) - return PTR_ERR(psy); - - if (!psy->battery_info || - psy->battery_info->factory_internal_resistance_uohm == -EINVAL || - psy->battery_info->constant_charge_voltage_max_uv == -EINVAL || - !psy->battery_info->ocv_table[0]) { - dev_err(dev, "error required properties are missing\n"); - return -ENODEV; - } + ug3105_start(client); device_property_read_u32(dev, "upisemi,rsns-microohm", &curr_sense_res_uohm); @@ -366,35 +149,36 @@ static int ug3105_probe(struct i2c_client *client) * DAC maximum is 4.5V divided by 65536 steps + an unknown factor of 10 * coming from somewhere for some reason (verified with a volt-meter). */ - chip->uv_per_unit = 45000000/65536; + chip->uv_per_unit = 45000000 / 65536; /* Datasheet says 8.1 uV per unit for the current ADC */ chip->ua_per_unit = 8100000 / curr_sense_res_uohm; - /* Use provided internal resistance as start point (in milli-ohm) */ - chip->intern_res_avg = psy->battery_info->factory_internal_resistance_uohm / 1000; - /* Also add it to the internal resistance moving average window */ - chip->intern_res[0] = chip->intern_res_avg; - chip->intern_res_avg_index = 1; - chip->intern_res_poll_count = 1; - - mutex_lock(&chip->lock); - chip->psy = psy; - mutex_unlock(&chip->lock); + psy_cfg.drv_data = chip; + chip->psy = devm_power_supply_register(dev, &ug3105_psy_desc, &psy_cfg); + if (IS_ERR(chip->psy)) { + ret = PTR_ERR(chip->psy); + goto stop; + } - ug3105_init(chip); + ret = adc_battery_helper_init(&chip->helper, chip->psy, + ug3105_get_voltage_and_current_now, NULL); + if (ret) + goto stop; i2c_set_clientdata(client, chip); return 0; + +stop: + ug3105_stop(client); + return ret; } static int __maybe_unused ug3105_suspend(struct device *dev) { struct ug3105_chip *chip = dev_get_drvdata(dev); - cancel_delayed_work_sync(&chip->work); - i2c_smbus_write_byte_data(chip->client, UG3105_REG_MODE, - UG3105_MODE_STANDBY); - + adc_battery_helper_suspend(dev); + ug3105_stop(chip->client); return 0; } @@ -402,8 +186,8 @@ static int __maybe_unused ug3105_resume(struct device *dev) { struct ug3105_chip *chip = dev_get_drvdata(dev); - ug3105_init(chip); - + ug3105_start(chip->client); + adc_battery_helper_resume(dev); return 0; } @@ -422,10 +206,12 @@ static struct i2c_driver ug3105_i2c_driver = { .pm = &ug3105_pm_ops, }, .probe = ug3105_probe, + .remove = ug3105_stop, + .shutdown = ug3105_stop, .id_table = ug3105_id, }; module_i2c_driver(ug3105_i2c_driver); -MODULE_AUTHOR("Hans de Goede #include +#include #include #include @@ -265,7 +266,7 @@ u64 ps3stor_read_write_sectors(struct ps3_storage_device *dev, u64 lpar, u64 start_sector, u64 sectors, int write) { unsigned int region_id = dev->regions[dev->region_idx].id; - const char *op = write ? 
"write" : "read"; + const char *op = str_write_read(write); int res; dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n", diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index f00ce973dddf65..c2fd3f4b62d9ea 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -38,6 +38,15 @@ config PWM_DEBUG It is expected to introduce some runtime overhead and diagnostic output to the kernel log, so only enable while working on a driver. +config PWM_PROVIDE_GPIO + bool "Provide a GPIO chip for each PWM chip" + depends on GPIOLIB + help + Most PWMs can emit both a constant active high and a constant active + low signal and so they can be used as GPIO. Say Y here to let each + PWM chip provide a GPIO chip and so be easily plugged into consumers + that know how to handle GPIOs but not PWMs. + config PWM_AB8500 tristate "AB8500 PWM support" depends on AB8500_CORE && ARCH_U8500 @@ -432,6 +441,16 @@ config PWM_LPSS_PLATFORM To compile this driver as a module, choose M here: the module will be called pwm-lpss-platform. +config PWM_MAX7360 + tristate "MAX7360 PWMs" + depends on MFD_MAX7360 + help + PWM driver for Maxim Integrated MAX7360 multifunction device, with + support for up to 8 PWM outputs. + + To compile this driver as a module, choose M here: the module + will be called pwm-max7360. + config PWM_MC33XS2410 tristate "MC33XS2410 PWM support" depends on OF diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index ff4f47e5fb7a0d..dfa8b4966ee19a 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_PWM_LPC32XX) += pwm-lpc32xx.o obj-$(CONFIG_PWM_LPSS) += pwm-lpss.o obj-$(CONFIG_PWM_LPSS_PCI) += pwm-lpss-pci.o obj-$(CONFIG_PWM_LPSS_PLATFORM) += pwm-lpss-platform.o +obj-$(CONFIG_PWM_MAX7360) += pwm-max7360.o obj-$(CONFIG_PWM_MC33XS2410) += pwm-mc33xs2410.o obj-$(CONFIG_PWM_MEDIATEK) += pwm-mediatek.o obj-$(CONFIG_PWM_MESON) += pwm-meson.o diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 0d66376a83ec35..ea2ccf42e81441 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -276,7 +276,7 @@ int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform * if (IS_ENABLED(CONFIG_PWM_DEBUG) && ret_fromhw > 0) dev_err(&chip->dev, "Unexpected return value from __pwm_round_waveform_fromhw: requested %llu/%llu [+%llu], return value %d\n", - wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_tohw); + wf_req.duty_length_ns, wf_req.period_length_ns, wf_req.duty_offset_ns, ret_fromhw); if (IS_ENABLED(CONFIG_PWM_DEBUG) && (ret_tohw == 0) != pwm_check_rounding(&wf_req, wf)) @@ -496,6 +496,13 @@ static void pwm_apply_debug(struct pwm_device *pwm, if (!chip->ops->get_state) return; + /* + * If a disabled PWM was requested the result is unspecified, so nothing + * to check. + */ + if (!state->enabled) + return; + /* * *state was just applied. Read out the hardware state and do some * checks. @@ -507,26 +514,32 @@ static void pwm_apply_debug(struct pwm_device *pwm, /* If that failed there isn't much to debug */ return; + /* + * If the PWM was disabled that's maybe strange but there is nothing + * that can be sensibly checked then. So return early. + */ + if (!s1.enabled) + return; + /* * The lowlevel driver either ignored .polarity (which is a bug) or as * best effort inverted .polarity and fixed .duty_cycle respectively. * Undo this inversion and fixup for further tests. 
*/ - if (s1.enabled && s1.polarity != state->polarity) { + if (s1.polarity != state->polarity) { s2.polarity = state->polarity; s2.duty_cycle = s1.period - s1.duty_cycle; s2.period = s1.period; - s2.enabled = s1.enabled; + s2.enabled = true; } else { s2 = s1; } if (s2.polarity != state->polarity && - state->duty_cycle < state->period) + s2.duty_cycle < s2.period) dev_warn(pwmchip_parent(chip), ".apply ignored .polarity\n"); - if (state->enabled && s2.enabled && - last->polarity == state->polarity && + if (last->polarity == state->polarity && last->period > s2.period && last->period <= state->period) dev_warn(pwmchip_parent(chip), @@ -537,13 +550,12 @@ static void pwm_apply_debug(struct pwm_device *pwm, * Rounding period up is fine only if duty_cycle is 0 then, because a * flat line doesn't have a characteristic period. */ - if (state->enabled && s2.enabled && state->period < s2.period && s2.duty_cycle) + if (state->period < s2.period && s2.duty_cycle) dev_warn(pwmchip_parent(chip), ".apply is supposed to round down period (requested: %llu, applied: %llu)\n", state->period, s2.period); - if (state->enabled && - last->polarity == state->polarity && + if (last->polarity == state->polarity && last->period == s2.period && last->duty_cycle > s2.duty_cycle && last->duty_cycle <= state->duty_cycle) @@ -553,16 +565,12 @@ static void pwm_apply_debug(struct pwm_device *pwm, s2.duty_cycle, s2.period, last->duty_cycle, last->period); - if (state->enabled && s2.enabled && state->duty_cycle < s2.duty_cycle) + if (state->duty_cycle < s2.duty_cycle) dev_warn(pwmchip_parent(chip), ".apply is supposed to round down duty_cycle (requested: %llu/%llu, applied: %llu/%llu)\n", state->duty_cycle, state->period, s2.duty_cycle, s2.period); - if (!state->enabled && s2.enabled && s2.duty_cycle > 0) - dev_warn(pwmchip_parent(chip), - "requested disabled, but yielded enabled with duty > 0\n"); - /* reapply the state that the driver reported being configured. 
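 * Applying an already-applied state is expected to be a no-op, so the * hardware state read back afterwards can be compared against s1 to catch * drivers whose .apply() is not idempotent.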
*/ err = chip->ops->apply(chip, pwm, &s1); trace_pwm_apply(pwm, &s1, err); @@ -2383,6 +2391,51 @@ static const struct file_operations pwm_cdev_fileops = { static dev_t pwm_devt; +static int pwm_gpio_request(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm; + + pwm = pwm_request_from_chip(chip, offset, "pwm-gpio"); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + return 0; +} + +static void pwm_gpio_free(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + + pwm_put(&chip->pwms[offset]); +} + +static int pwm_gpio_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static int pwm_gpio_set(struct gpio_chip *gc, unsigned int offset, int value) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm = &chip->pwms[offset]; + int ret; + struct pwm_waveform wf = { + .period_length_ns = 1, + }; + + ret = pwm_round_waveform_might_sleep(pwm, &wf); + if (ret < 0) + return ret; + + if (value) + wf.duty_length_ns = wf.period_length_ns; + else + wf.duty_length_ns = 0; + + return pwm_set_waveform_might_sleep(pwm, &wf, true); +} + /** * __pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add @@ -2449,9 +2502,33 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) if (ret) goto err_device_add; + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) { + struct device *parent = pwmchip_parent(chip); + + chip->gpio = (typeof(chip->gpio)){ + .label = dev_name(parent), + .parent = parent, + .request = pwm_gpio_request, + .free = pwm_gpio_free, + .get_direction = pwm_gpio_get_direction, + .set = pwm_gpio_set, + .base = -1, + .ngpio = chip->npwm, + .can_sleep = true, + }; + + ret = gpiochip_add_data(&chip->gpio, chip); + if (ret) + goto err_gpiochip_add; + } + return 0; +err_gpiochip_add: + + cdev_device_del(&chip->cdev, &chip->dev); err_device_add: + scoped_guard(pwmchip, chip) chip->operational = false; @@ -2472,6 +2549,9 @@ EXPORT_SYMBOL_GPL(__pwmchip_add); */ void pwmchip_remove(struct pwm_chip *chip) { + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) + gpiochip_remove(&chip->gpio); + pwmchip_sysfs_unexport(chip); scoped_guard(mutex, &pwm_lock) { diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c index 831aed228cafcb..858d369913742c 100644 --- a/drivers/pwm/pwm-berlin.c +++ b/drivers/pwm/pwm-berlin.c @@ -234,7 +234,7 @@ static int berlin_pwm_suspend(struct device *dev) for (i = 0; i < chip->npwm; i++) { struct berlin_pwm_channel *channel = &bpc->channel[i]; - channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_ENABLE); + channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_EN); channel->ctrl = berlin_pwm_readl(bpc, i, BERLIN_PWM_CONTROL); channel->duty = berlin_pwm_readl(bpc, i, BERLIN_PWM_DUTY); channel->tcnt = berlin_pwm_readl(bpc, i, BERLIN_PWM_TCNT); @@ -262,7 +262,7 @@ static int berlin_pwm_resume(struct device *dev) berlin_pwm_writel(bpc, i, channel->ctrl, BERLIN_PWM_CONTROL); berlin_pwm_writel(bpc, i, channel->duty, BERLIN_PWM_DUTY); berlin_pwm_writel(bpc, i, channel->tcnt, BERLIN_PWM_TCNT); - berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_ENABLE); + berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_EN); } return 0; diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 189301dc395e25..67cfa17f58e0d5 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -49,10 +49,9 @@ static int 
cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, u16 duty) { struct cros_ec_device *ec = ec_pwm->ec; - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, struct ec_params_pwm_set_duty params; - } __packed buf; + ) __packed buf; struct ec_params_pwm_set_duty *params = &buf.params; struct cros_ec_command *msg = &buf.msg; int ret; @@ -83,13 +82,12 @@ static int cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, bool use_pwm_type, u8 index) { - struct { - struct cros_ec_command msg; + TRAILING_OVERLAP(struct cros_ec_command, msg, data, union { struct ec_params_pwm_get_duty params; struct ec_response_pwm_get_duty resp; }; - } __packed buf; + ) __packed buf; struct ec_params_pwm_get_duty *params = &buf.params; struct ec_response_pwm_get_duty *resp = &buf.resp; struct cros_ec_command *msg = &buf.msg; diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 6683931872fc74..35406b2e1925e8 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -3,6 +3,7 @@ * Freescale FlexTimer Module (FTM) PWM Driver * * Copyright 2012-2013 Freescale Semiconductor, Inc. + * Copyright 2020-2025 NXP */ #include @@ -30,6 +31,8 @@ enum fsl_pwm_clk { struct fsl_ftm_soc { bool has_enable_bits; + bool has_flt_reg; + unsigned int npwm; }; struct fsl_pwm_periodcfg { @@ -374,6 +377,20 @@ static bool fsl_pwm_volatile_reg(struct device *dev, unsigned int reg) return false; } +static bool fsl_pwm_is_reg(struct device *dev, unsigned int reg) +{ + struct pwm_chip *chip = dev_get_drvdata(dev); + struct fsl_pwm_chip *fpc = to_fsl_chip(chip); + + if (reg >= FTM_CSC(fpc->soc->npwm) && reg < FTM_CNTIN) + return false; + + if ((reg == FTM_FLTCTRL || reg == FTM_FLTPOL) && !fpc->soc->has_flt_reg) + return false; + + return true; +} + static const struct regmap_config fsl_pwm_regmap_config = { .reg_bits = 32, .reg_stride = 4, @@ -382,21 +399,24 @@ static const struct regmap_config fsl_pwm_regmap_config = { .max_register = FTM_PWMLOAD, .volatile_reg = fsl_pwm_volatile_reg, .cache_type = REGCACHE_FLAT, + .writeable_reg = fsl_pwm_is_reg, + .readable_reg = fsl_pwm_is_reg, }; static int fsl_pwm_probe(struct platform_device *pdev) { + const struct fsl_ftm_soc *soc = of_device_get_match_data(&pdev->dev); struct pwm_chip *chip; struct fsl_pwm_chip *fpc; void __iomem *base; int ret; - chip = devm_pwmchip_alloc(&pdev->dev, 8, sizeof(*fpc)); + chip = devm_pwmchip_alloc(&pdev->dev, soc->npwm, sizeof(*fpc)); if (IS_ERR(chip)) return PTR_ERR(chip); fpc = to_fsl_chip(chip); - fpc->soc = of_device_get_match_data(&pdev->dev); + fpc->soc = soc; base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(base)) @@ -512,15 +532,26 @@ static const struct dev_pm_ops fsl_pwm_pm_ops = { static const struct fsl_ftm_soc vf610_ftm_pwm = { .has_enable_bits = false, + .has_flt_reg = true, + .npwm = 8, }; static const struct fsl_ftm_soc imx8qm_ftm_pwm = { .has_enable_bits = true, + .has_flt_reg = true, + .npwm = 8, +}; + +static const struct fsl_ftm_soc s32g2_ftm_pwm = { + .has_enable_bits = true, + .has_flt_reg = false, + .npwm = 6, }; static const struct of_device_id fsl_pwm_dt_ids[] = { { .compatible = "fsl,vf610-ftm-pwm", .data = &vf610_ftm_pwm }, { .compatible = "fsl,imx8qm-ftm-pwm", .data = &imx8qm_ftm_pwm }, + { .compatible = "nxp,s32g2-ftm-pwm", .data = &s32g2_ftm_pwm }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, fsl_pwm_dt_ids); diff --git a/drivers/pwm/pwm-loongson.c 
b/drivers/pwm/pwm-loongson.c index 1ba16168cbb408..31a57edecfd0ba 100644 --- a/drivers/pwm/pwm-loongson.c +++ b/drivers/pwm/pwm-loongson.c @@ -49,7 +49,7 @@ #define LOONGSON_PWM_CTRL_REG_DZONE BIT(10) /* Anti-dead Zone Enable Bit */ /* default input clk frequency for the ACPI case */ -#define LOONGSON_PWM_FREQ_DEFAULT 50000 /* Hz */ +#define LOONGSON_PWM_FREQ_DEFAULT 50000000 /* Hz */ struct pwm_loongson_ddata { struct clk *clk; diff --git a/drivers/pwm/pwm-max7360.c b/drivers/pwm/pwm-max7360.c new file mode 100644 index 00000000000000..ebf93a7aee5be4 --- /dev/null +++ b/drivers/pwm/pwm-max7360.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025 Bootlin + * + * Author: Kamel BOUHARA + * Author: Mathieu Dubois-Briand + * + * PWM functionality of the MAX7360 multi-function device. + * https://www.analog.com/media/en/technical-documentation/data-sheets/MAX7360.pdf + * + * Limitations: + * - Only supports normal polarity. + * - The period is fixed to 2 ms. + * - Only the duty cycle can be changed, new values are applied at the beginning + * of the next cycle. + * - When disabled, the output is put in Hi-Z immediately. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX7360_NUM_PWMS 8 +#define MAX7360_PWM_MAX 255 +#define MAX7360_PWM_STEPS 256 +#define MAX7360_PWM_PERIOD_NS (2 * NSEC_PER_MSEC) + +struct max7360_pwm_waveform { + u8 duty_steps; + bool enabled; +}; + +static int max7360_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) +{ + struct regmap *regmap = pwmchip_get_drvdata(chip); + + /* + * Make sure we use the individual PWM configuration register and not + * the global one. + * We never need to use the global one, so there is no need to revert + * that in the .free() callback. + */ + return regmap_write_bits(regmap, MAX7360_REG_PWMCFG(pwm->hwpwm), + MAX7360_PORT_CFG_COMMON_PWM, 0); +} + +static int max7360_pwm_round_waveform_tohw(struct pwm_chip *chip, + struct pwm_device *pwm, + const struct pwm_waveform *wf, + void *_wfhw) +{ + struct max7360_pwm_waveform *wfhw = _wfhw; + u64 duty_steps; + + /* + * Ignore user provided values for period_length_ns and duty_offset_ns: + * we only support fixed period of MAX7360_PWM_PERIOD_NS and offset of 0. + * Values from 0 to 254 as duty_steps will provide duty cycles of 0/256 + * to 254/256, while value 255 will provide a duty cycle of 100%. + */ + if (wf->duty_length_ns >= MAX7360_PWM_PERIOD_NS) { + duty_steps = MAX7360_PWM_MAX; + } else { + duty_steps = (u32)wf->duty_length_ns * MAX7360_PWM_STEPS / MAX7360_PWM_PERIOD_NS; + if (duty_steps == MAX7360_PWM_MAX) + duty_steps = MAX7360_PWM_MAX - 1; + } + + wfhw->duty_steps = min(MAX7360_PWM_MAX, duty_steps); + wfhw->enabled = !!wf->period_length_ns; + + if (wf->period_length_ns && wf->period_length_ns < MAX7360_PWM_PERIOD_NS) + return 1; + else + return 0; +} + +static int max7360_pwm_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm, + const void *_wfhw, struct pwm_waveform *wf) +{ + const struct max7360_pwm_waveform *wfhw = _wfhw; + + wf->period_length_ns = wfhw->enabled ? 
MAX7360_PWM_PERIOD_NS : 0; + wf->duty_offset_ns = 0; + + if (wfhw->enabled) { + if (wfhw->duty_steps == MAX7360_PWM_MAX) + wf->duty_length_ns = MAX7360_PWM_PERIOD_NS; + else + wf->duty_length_ns = DIV_ROUND_UP(wfhw->duty_steps * MAX7360_PWM_PERIOD_NS, + MAX7360_PWM_STEPS); + } else { + wf->duty_length_ns = 0; + } + + return 0; +} + +static int max7360_pwm_write_waveform(struct pwm_chip *chip, + struct pwm_device *pwm, + const void *_wfhw) +{ + struct regmap *regmap = pwmchip_get_drvdata(chip); + const struct max7360_pwm_waveform *wfhw = _wfhw; + unsigned int val; + int ret; + + if (wfhw->enabled) { + ret = regmap_write(regmap, MAX7360_REG_PWM(pwm->hwpwm), wfhw->duty_steps); + if (ret) + return ret; + } + + val = wfhw->enabled ? BIT(pwm->hwpwm) : 0; + return regmap_write_bits(regmap, MAX7360_REG_GPIOCTRL, BIT(pwm->hwpwm), val); +} + +static int max7360_pwm_read_waveform(struct pwm_chip *chip, + struct pwm_device *pwm, + void *_wfhw) +{ + struct regmap *regmap = pwmchip_get_drvdata(chip); + struct max7360_pwm_waveform *wfhw = _wfhw; + unsigned int val; + int ret; + + ret = regmap_read(regmap, MAX7360_REG_GPIOCTRL, &val); + if (ret) + return ret; + + if (val & BIT(pwm->hwpwm)) { + wfhw->enabled = true; + ret = regmap_read(regmap, MAX7360_REG_PWM(pwm->hwpwm), &val); + if (ret) + return ret; + + wfhw->duty_steps = val; + } else { + wfhw->enabled = false; + wfhw->duty_steps = 0; + } + + return 0; +} + +static const struct pwm_ops max7360_pwm_ops = { + .request = max7360_pwm_request, + .round_waveform_tohw = max7360_pwm_round_waveform_tohw, + .round_waveform_fromhw = max7360_pwm_round_waveform_fromhw, + .read_waveform = max7360_pwm_read_waveform, + .write_waveform = max7360_pwm_write_waveform, +}; + +static int max7360_pwm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pwm_chip *chip; + struct regmap *regmap; + int ret; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return dev_err_probe(dev, -ENODEV, "Could not get parent regmap\n"); + + /* + * This MFD sub-device does not have any associated device tree node: + * properties are stored in the device node of the parent (MFD) device + * and this same node is used in phandles of client devices. + * Reuse this device tree node here, as otherwise the PWM subsystem + * would be confused by this topology. 
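+ * (device_set_of_node_from_dev() reuses the parent's of_node and marks it + * as reused, so node-bound resources, e.g. the pinctrl state, are not + * claimed a second time by this sub-device.)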
+ */ + device_set_of_node_from_dev(dev, dev->parent); + + chip = devm_pwmchip_alloc(dev, MAX7360_NUM_PWMS, 0); + if (IS_ERR(chip)) + return PTR_ERR(chip); + chip->ops = &max7360_pwm_ops; + + pwmchip_set_drvdata(chip, regmap); + + ret = devm_pwmchip_add(dev, chip); + if (ret) + return dev_err_probe(dev, ret, "Failed to add PWM chip\n"); + + return 0; +} + +static struct platform_driver max7360_pwm_driver = { + .driver = { + .name = "max7360-pwm", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .probe = max7360_pwm_probe, +}; +module_platform_driver(max7360_pwm_driver); + +MODULE_DESCRIPTION("MAX7360 PWM driver"); +MODULE_AUTHOR("Kamel BOUHARA "); +MODULE_AUTHOR("Mathieu Dubois-Briand "); +MODULE_LICENSE("GPL"); diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index e4b595fc5a5e04..4291072a13a7f6 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -7,6 +7,7 @@ * */ +#include #include #include #include @@ -21,24 +22,26 @@ /* PWM registers and bits definitions */ #define PWMCON 0x00 +#define PWMCON_CLKDIV GENMASK(2, 0) #define PWMHDUR 0x04 #define PWMLDUR 0x08 #define PWMGDUR 0x0c #define PWMWAVENUM 0x28 #define PWMDWIDTH 0x2c +#define PWMDWIDTH_PERIOD GENMASK(12, 0) #define PWM45DWIDTH_FIXUP 0x30 #define PWMTHRES 0x30 +#define PWMTHRES_DUTY GENMASK(12, 0) #define PWM45THRES_FIXUP 0x34 #define PWM_CK_26M_SEL_V3 0x74 #define PWM_CK_26M_SEL 0x210 -#define PWM_CLK_DIV_MAX 7 - struct pwm_mediatek_of_data { unsigned int num_pwms; bool pwm45_fixup; u16 pwm_ck_26m_sel_reg; - const unsigned int *reg_offset; + unsigned int chanreg_base; + unsigned int chanreg_width; }; /** @@ -46,28 +49,18 @@ struct pwm_mediatek_of_data { * @regs: base address of PWM chip * @clk_top: the top clock generator * @clk_main: the clock used by PWM core - * @clk_pwms: the clock used by each PWM channel * @soc: pointer to chip's platform data + * @clk_pwms: the clock and clkrate used by each PWM channel */ struct pwm_mediatek_chip { void __iomem *regs; struct clk *clk_top; struct clk *clk_main; - struct clk **clk_pwms; const struct pwm_mediatek_of_data *soc; -}; - -static const unsigned int mtk_pwm_reg_offset_v1[] = { - 0x0010, 0x0050, 0x0090, 0x00d0, 0x0110, 0x0150, 0x0190, 0x0220 -}; - -static const unsigned int mtk_pwm_reg_offset_v2[] = { - 0x0080, 0x00c0, 0x0100, 0x0140, 0x0180, 0x01c0, 0x0200, 0x0240 -}; - -/* PWM IP Version 3.0.2 */ -static const unsigned int mtk_pwm_reg_offset_v3[] = { - 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700, 0x0800 + struct { + struct clk *clk; + unsigned long rate; + } clk_pwms[]; }; static inline struct pwm_mediatek_chip * @@ -76,10 +69,9 @@ to_pwm_mediatek_chip(struct pwm_chip *chip) return pwmchip_get_drvdata(chip); } -static int pwm_mediatek_clk_enable(struct pwm_chip *chip, - struct pwm_device *pwm) +static int pwm_mediatek_clk_enable(struct pwm_mediatek_chip *pc, + unsigned int hwpwm) { - struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); int ret; ret = clk_prepare_enable(pc->clk_top); @@ -90,12 +82,28 @@ static int pwm_mediatek_clk_enable(struct pwm_chip *chip, if (ret < 0) goto disable_clk_top; - ret = clk_prepare_enable(pc->clk_pwms[pwm->hwpwm]); + ret = clk_prepare_enable(pc->clk_pwms[hwpwm].clk); if (ret < 0) goto disable_clk_main; + if (!pc->clk_pwms[hwpwm].rate) { + pc->clk_pwms[hwpwm].rate = clk_get_rate(pc->clk_pwms[hwpwm].clk); + + /* + * With the clk running with not more than 1 GHz the + * calculations in .apply() won't overflow. 
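+ * (With rate <= NSEC_PER_SEC, period_ns * rate / NSEC_PER_SEC can never + * exceed period_ns itself, so cnt_period always fits into an u64.)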
+ */ + if (!pc->clk_pwms[hwpwm].rate || + pc->clk_pwms[hwpwm].rate > 1000000000) { + ret = -EINVAL; + goto disable_clk_hwpwm; + } + } + return 0; +disable_clk_hwpwm: + clk_disable_unprepare(pc->clk_pwms[hwpwm].clk); disable_clk_main: clk_disable_unprepare(pc->clk_main); disable_clk_top: @@ -104,12 +112,10 @@ static int pwm_mediatek_clk_enable(struct pwm_chip *chip, return ret; } -static void pwm_mediatek_clk_disable(struct pwm_chip *chip, - struct pwm_device *pwm) +static void pwm_mediatek_clk_disable(struct pwm_mediatek_chip *pc, + unsigned int hwpwm) { - struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); - - clk_disable_unprepare(pc->clk_pwms[pwm->hwpwm]); + clk_disable_unprepare(pc->clk_pwms[hwpwm].clk); clk_disable_unprepare(pc->clk_main); clk_disable_unprepare(pc->clk_top); } @@ -118,7 +124,15 @@ static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip, unsigned int num, unsigned int offset, u32 value) { - writel(value, chip->regs + chip->soc->reg_offset[num] + offset); + writel(value, chip->regs + chip->soc->chanreg_base + + num * chip->soc->chanreg_width + offset); +} + +static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip, + unsigned int num, unsigned int offset) +{ + return readl(chip->regs + chip->soc->chanreg_base + + num * chip->soc->chanreg_width + offset); } static void pwm_mediatek_enable(struct pwm_chip *chip, struct pwm_device *pwm) @@ -142,50 +156,59 @@ static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm) } static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); - u32 clkdiv = 0, cnt_period, cnt_duty, reg_width = PWMDWIDTH, - reg_thres = PWMTHRES; + u32 clkdiv, enable; + u32 reg_width = PWMDWIDTH, reg_thres = PWMTHRES; + u64 cnt_period, cnt_duty; unsigned long clk_rate; - u64 resolution; int ret; - ret = pwm_mediatek_clk_enable(chip, pwm); + ret = pwm_mediatek_clk_enable(pc, pwm->hwpwm); if (ret < 0) return ret; - clk_rate = clk_get_rate(pc->clk_pwms[pwm->hwpwm]); - if (!clk_rate) { - ret = -EINVAL; - goto out; - } + clk_rate = pc->clk_pwms[pwm->hwpwm].rate; /* Make sure we use the bus clock and not the 26MHz clock */ if (pc->soc->pwm_ck_26m_sel_reg) writel(0, pc->regs + pc->soc->pwm_ck_26m_sel_reg); - /* Using resolution in picosecond gets accuracy higher */ - resolution = (u64)NSEC_PER_SEC * 1000; - do_div(resolution, clk_rate); - - cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, resolution); - if (!cnt_period) - return -EINVAL; + cnt_period = mul_u64_u64_div_u64(period_ns, clk_rate, NSEC_PER_SEC); + if (cnt_period == 0) { + ret = -ERANGE; + goto out; + } - while (cnt_period > 8192) { - resolution *= 2; - clkdiv++; - cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, - resolution); + if (cnt_period > FIELD_MAX(PWMDWIDTH_PERIOD) + 1) { + if (cnt_period >= ((FIELD_MAX(PWMDWIDTH_PERIOD) + 1) << FIELD_MAX(PWMCON_CLKDIV))) { + clkdiv = FIELD_MAX(PWMCON_CLKDIV); + cnt_period = FIELD_MAX(PWMDWIDTH_PERIOD) + 1; + } else { + clkdiv = ilog2(cnt_period) - ilog2(FIELD_MAX(PWMDWIDTH_PERIOD)); + cnt_period >>= clkdiv; + } + } else { + clkdiv = 0; } - if (clkdiv > PWM_CLK_DIV_MAX) { - dev_err(pwmchip_parent(chip), "period of %d ns not supported\n", period_ns); - ret = -EINVAL; - goto out; + cnt_duty = mul_u64_u64_div_u64(duty_ns, clk_rate, NSEC_PER_SEC) >> clkdiv; + if (cnt_duty > cnt_period) + cnt_duty = cnt_period; + + if (cnt_duty) { + cnt_duty -= 1; + enable = 
BIT(pwm->hwpwm); + } else { + enable = 0; } + cnt_period -= 1; + + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld @%lu -> CON: %x, PERIOD: %llx, DUTY: %llx\n", + pwm->hwpwm, duty_ns, period_ns, clk_rate, clkdiv, cnt_period, cnt_duty); + if (pc->soc->pwm45_fixup && pwm->hwpwm > 2) { /* * PWM[4,5] has distinct offset for PWMDWIDTH and PWMTHRES @@ -195,20 +218,18 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, reg_thres = PWM45THRES_FIXUP; } - cnt_duty = DIV_ROUND_CLOSEST_ULL((u64)duty_ns * 1000, resolution); - pwm_mediatek_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv); - pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period - 1); + pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period); - if (cnt_duty) { - pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty - 1); + if (enable) { + pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty); pwm_mediatek_enable(chip, pwm); } else { pwm_mediatek_disable(chip, pwm); } out: - pwm_mediatek_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(pc, pwm->hwpwm); return ret; } @@ -216,6 +237,7 @@ static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm, static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); int err; if (state->polarity != PWM_POLARITY_NORMAL) @@ -224,7 +246,7 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (!state->enabled) { if (pwm->state.enabled) { pwm_mediatek_disable(chip, pwm); - pwm_mediatek_clk_disable(chip, pwm); + pwm_mediatek_clk_disable(pc, pwm->hwpwm); } return 0; @@ -235,15 +257,115 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, return err; if (!pwm->state.enabled) - err = pwm_mediatek_clk_enable(chip, pwm); + err = pwm_mediatek_clk_enable(pc, pwm->hwpwm); return err; } +static int pwm_mediatek_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip); + int ret; + u32 enable; + u32 reg_width = PWMDWIDTH, reg_thres = PWMTHRES; + + if (pc->soc->pwm45_fixup && pwm->hwpwm > 2) { + /* + * PWM[4,5] has distinct offset for PWMDWIDTH and PWMTHRES + * from the other PWMs on MT7623. + */ + reg_width = PWM45DWIDTH_FIXUP; + reg_thres = PWM45THRES_FIXUP; + } + + ret = pwm_mediatek_clk_enable(pc, pwm->hwpwm); + if (ret < 0) + return ret; + + enable = readl(pc->regs); + if (enable & BIT(pwm->hwpwm)) { + u32 clkdiv, cnt_period, cnt_duty; + unsigned long clk_rate; + + clk_rate = pc->clk_pwms[pwm->hwpwm].rate; + + state->enabled = true; + state->polarity = PWM_POLARITY_NORMAL; + + clkdiv = FIELD_GET(PWMCON_CLKDIV, + pwm_mediatek_readl(pc, pwm->hwpwm, PWMCON)); + cnt_period = FIELD_GET(PWMDWIDTH_PERIOD, + pwm_mediatek_readl(pc, pwm->hwpwm, reg_width)); + cnt_duty = FIELD_GET(PWMTHRES_DUTY, + pwm_mediatek_readl(pc, pwm->hwpwm, reg_thres)); + + /* + * cnt_period is a 13 bit value, NSEC_PER_SEC is 30 bits wide + * and clkdiv is less than 8, so the multiplication doesn't + * overflow an u64. 
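+ * Worst case below: (2^13 - 1) * 10^9 << 7 needs fewer than + * 13 + 30 + 7 = 50 bits, comfortably within the 64 available.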
+ */ + state->period = + DIV_ROUND_UP_ULL((u64)cnt_period * NSEC_PER_SEC << clkdiv, clk_rate); + state->duty_cycle = + DIV_ROUND_UP_ULL((u64)cnt_duty * NSEC_PER_SEC << clkdiv, clk_rate); + } else { + state->enabled = false; + } + + pwm_mediatek_clk_disable(pc, pwm->hwpwm); + + return ret; +} + static const struct pwm_ops pwm_mediatek_ops = { .apply = pwm_mediatek_apply, + .get_state = pwm_mediatek_get_state, }; +static int pwm_mediatek_init_used_clks(struct pwm_mediatek_chip *pc) +{ + const struct pwm_mediatek_of_data *soc = pc->soc; + unsigned int hwpwm; + u32 enabled, handled = 0; + int ret; + + ret = clk_prepare_enable(pc->clk_top); + if (ret) + return ret; + + ret = clk_prepare_enable(pc->clk_main); + if (ret) + goto err_enable_main; + + enabled = readl(pc->regs) & GENMASK(soc->num_pwms - 1, 0); + + while (enabled & ~handled) { + hwpwm = ilog2(enabled & ~handled); + + ret = pwm_mediatek_clk_enable(pc, hwpwm); + if (ret) { + while (handled) { + hwpwm = ilog2(handled); + + pwm_mediatek_clk_disable(pc, hwpwm); + handled &= ~BIT(hwpwm); + } + + break; + } + + handled |= BIT(hwpwm); + } + + clk_disable_unprepare(pc->clk_main); +err_enable_main: + + clk_disable_unprepare(pc->clk_top); + + return ret; +} + static int pwm_mediatek_probe(struct platform_device *pdev) { struct pwm_chip *chip; @@ -254,7 +376,8 @@ static int pwm_mediatek_probe(struct platform_device *pdev) soc = of_device_get_match_data(&pdev->dev); - chip = devm_pwmchip_alloc(&pdev->dev, soc->num_pwms, sizeof(*pc)); + chip = devm_pwmchip_alloc(&pdev->dev, soc->num_pwms, + sizeof(*pc) + soc->num_pwms * sizeof(*pc->clk_pwms)); if (IS_ERR(chip)) return PTR_ERR(chip); pc = to_pwm_mediatek_chip(chip); @@ -265,11 +388,6 @@ static int pwm_mediatek_probe(struct platform_device *pdev) if (IS_ERR(pc->regs)) return PTR_ERR(pc->regs); - pc->clk_pwms = devm_kmalloc_array(&pdev->dev, soc->num_pwms, - sizeof(*pc->clk_pwms), GFP_KERNEL); - if (!pc->clk_pwms) - return -ENOMEM; - pc->clk_top = devm_clk_get(&pdev->dev, "top"); if (IS_ERR(pc->clk_top)) return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_top), @@ -285,12 +403,21 @@ static int pwm_mediatek_probe(struct platform_device *pdev) snprintf(name, sizeof(name), "pwm%d", i + 1); - pc->clk_pwms[i] = devm_clk_get(&pdev->dev, name); - if (IS_ERR(pc->clk_pwms[i])) - return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_pwms[i]), + pc->clk_pwms[i].clk = devm_clk_get(&pdev->dev, name); + if (IS_ERR(pc->clk_pwms[i].clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(pc->clk_pwms[i].clk), "Failed to get %s clock\n", name); + + ret = devm_clk_rate_exclusive_get(&pdev->dev, pc->clk_pwms[i].clk); + if (ret) + return dev_err_probe(&pdev->dev, ret, + "Failed to lock clock rate for %s\n", name); } + ret = pwm_mediatek_init_used_clks(pc); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to initialize used clocks\n"); + chip->ops = &pwm_mediatek_ops; ret = devm_pwmchip_add(&pdev->dev, chip); @@ -303,86 +430,99 @@ static int pwm_mediatek_probe(struct platform_device *pdev) static const struct pwm_mediatek_of_data mt2712_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt6795_pwm_data = { .num_pwms = 7, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = 
mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7623_pwm_data = { .num_pwms = 5, .pwm45_fixup = true, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7628_pwm_data = { .num_pwms = 4, .pwm45_fixup = true, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7629_pwm_data = { .num_pwms = 1, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7981_pwm_data = { .num_pwms = 3, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v2, + .chanreg_base = 0x80, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7986_pwm_data = { .num_pwms = 2, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt7988_pwm_data = { .num_pwms = 8, .pwm45_fixup = false, - .reg_offset = mtk_pwm_reg_offset_v2, + .chanreg_base = 0x80, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8183_pwm_data = { .num_pwms = 4, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8365_pwm_data = { .num_pwms = 3, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt8516_pwm_data = { .num_pwms = 5, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL, - .reg_offset = mtk_pwm_reg_offset_v1, + .chanreg_base = 0x10, + .chanreg_width = 0x40, }; static const struct pwm_mediatek_of_data mt6991_pwm_data = { .num_pwms = 4, .pwm45_fixup = false, .pwm_ck_26m_sel_reg = PWM_CK_26M_SEL_V3, - .reg_offset = mtk_pwm_reg_offset_v3, + .chanreg_base = 0x100, + .chanreg_width = 0x100, }; static const struct of_device_id pwm_mediatek_of_match[] = { diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 9ce75704a15f89..107bebec3546ed 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -26,7 +26,6 @@ * that is enabled is allowed to change the prescale register. * PWM channels requested afterwards must use a period that results in the same * prescale setting as the one set by the first requested channel. - * GPIOs do not count as enabled PWMs as they are not using the prescaler. */ #define PCA9685_MODE1 0x00 @@ -50,7 +49,14 @@ #define PCA9685_PRESCALE_MAX 0xFF /* => min. frequency of 24 Hz */ #define PCA9685_COUNTER_RANGE 4096 -#define PCA9685_OSC_CLOCK_MHZ 25 /* Internal oscillator with 25 MHz */ +#define PCA9685_OSC_CLOCK_HZ 25000000 /* Internal oscillator with 25 MHz */ + +/* + * The time value of one counter tick. Note that NSEC_PER_SEC is an integer + * multiple of PCA9685_OSC_CLOCK_HZ, so there is no rounding involved and we're + * not losing precision due to the early division.
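+ * For example, at the minimum prescale of 3 one tick is + * (1000000000 / 25000000) * 4 = 160 ns, so a full 4096-tick cycle takes about + * 655 us (~1526 Hz); at the maximum prescale of 255 a tick is 10240 ns and a + * cycle takes about 41.9 ms (~24 Hz, matching the PCA9685_PRESCALE_MAX + * comment above).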
+ */ +#define PCA9685_QUANTUM_NS(_prescale) ((NSEC_PER_SEC / PCA9685_OSC_CLOCK_HZ) * (_prescale + 1)) #define PCA9685_NUMREGS 0xFF #define PCA9685_MAXCHAN 0x10 @@ -61,6 +67,8 @@ #define MODE1_SUB2 BIT(2) #define MODE1_SUB1 BIT(3) #define MODE1_SLEEP BIT(4) +#define MODE1_AI BIT(5) + #define MODE2_INVRT BIT(4) #define MODE2_OUTDRV BIT(2) @@ -78,10 +86,6 @@ struct pca9685 { struct regmap *regmap; struct mutex lock; DECLARE_BITMAP(pwms_enabled, PCA9685_MAXCHAN + 1); -#if IS_ENABLED(CONFIG_GPIOLIB) - struct gpio_chip gpio; - DECLARE_BITMAP(pwms_inuse, PCA9685_MAXCHAN + 1); -#endif }; static inline struct pca9685 *to_pca(struct pwm_chip *chip) @@ -131,355 +135,232 @@ static int pca9685_write_reg(struct pwm_chip *chip, unsigned int reg, unsigned i return err; } -/* Helper function to set the duty cycle ratio to duty/4096 (e.g. duty=2048 -> 50%) */ -static void pca9685_pwm_set_duty(struct pwm_chip *chip, int channel, unsigned int duty) +static int pca9685_write_4reg(struct pwm_chip *chip, unsigned int reg, u8 val[4]) { - struct pwm_device *pwm = &chip->pwms[channel]; - unsigned int on, off; - - if (duty == 0) { - /* Set the full OFF bit, which has the highest precedence */ - pca9685_write_reg(chip, REG_OFF_H(channel), LED_FULL); - return; - } else if (duty >= PCA9685_COUNTER_RANGE) { - /* Set the full ON bit and clear the full OFF bit */ - pca9685_write_reg(chip, REG_ON_H(channel), LED_FULL); - pca9685_write_reg(chip, REG_OFF_H(channel), 0); - return; - } + struct pca9685 *pca = to_pca(chip); + struct device *dev = pwmchip_parent(chip); + int err; + err = regmap_bulk_write(pca->regmap, reg, val, 4); + if (err) + dev_err(dev, "regmap_write to register 0x%x failed: %pe\n", reg, ERR_PTR(err)); - if (pwm->state.usage_power && channel < PCA9685_MAXCHAN) { - /* - * If usage_power is set, the pca9685 driver will phase shift - * the individual channels relative to their channel number. - * This improves EMI because the enabled channels no longer - * turn on at the same time, while still maintaining the - * configured duty cycle / power output. - */ - on = channel * PCA9685_COUNTER_RANGE / PCA9685_MAXCHAN; - } else - on = 0; - - off = (on + duty) % PCA9685_COUNTER_RANGE; - - /* Set ON time (clears full ON bit) */ - pca9685_write_reg(chip, REG_ON_L(channel), on & 0xff); - pca9685_write_reg(chip, REG_ON_H(channel), (on >> 8) & 0xf); - /* Set OFF time (clears full OFF bit) */ - pca9685_write_reg(chip, REG_OFF_L(channel), off & 0xff); - pca9685_write_reg(chip, REG_OFF_H(channel), (off >> 8) & 0xf); + return err; } -static unsigned int pca9685_pwm_get_duty(struct pwm_chip *chip, int channel) +static int pca9685_set_sleep_mode(struct pwm_chip *chip, bool enable) { - struct pwm_device *pwm = &chip->pwms[channel]; - unsigned int off = 0, on = 0, val = 0; - - if (WARN_ON(channel >= PCA9685_MAXCHAN)) { - /* HW does not support reading state of "all LEDs" channel */ - return 0; - } + struct pca9685 *pca = to_pca(chip); + int err; - pca9685_read_reg(chip, LED_N_OFF_H(channel), &off); - if (off & LED_FULL) { - /* Full OFF bit is set */ - return 0; - } + err = regmap_update_bits(pca->regmap, PCA9685_MODE1, + MODE1_SLEEP, enable ? 
MODE1_SLEEP : 0); + if (err) + return err; - pca9685_read_reg(chip, LED_N_ON_H(channel), &on); - if (on & LED_FULL) { - /* Full ON bit is set */ - return PCA9685_COUNTER_RANGE; + if (!enable) { + /* Wait 500us for the oscillator to be back up */ + udelay(500); } - pca9685_read_reg(chip, LED_N_OFF_L(channel), &val); - off = ((off & 0xf) << 8) | (val & 0xff); - if (!pwm->state.usage_power) - return off; - - /* Read ON register to calculate duty cycle of staggered output */ - if (pca9685_read_reg(chip, LED_N_ON_L(channel), &val)) { - /* Reset val to 0 in case reading LED_N_ON_L failed */ - val = 0; - } - on = ((on & 0xf) << 8) | (val & 0xff); - return (off - on) & (PCA9685_COUNTER_RANGE - 1); + return 0; } -#if IS_ENABLED(CONFIG_GPIOLIB) -static bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, int pwm_idx) +struct pca9685_waveform { + u8 onoff[4]; + u8 prescale; +}; + +static int pca9685_round_waveform_tohw(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_waveform *wf, void *_wfhw) { - bool is_inuse; + struct pca9685_waveform *wfhw = _wfhw; + struct pca9685 *pca = to_pca(chip); + unsigned int best_prescale; + u8 prescale; + unsigned int period_ns, duty; + int ret_tohw = 0; - mutex_lock(&pca->lock); - if (pwm_idx >= PCA9685_MAXCHAN) { - /* - * "All LEDs" channel: - * pretend already in use if any of the PWMs are requested - */ - if (!bitmap_empty(pca->pwms_inuse, PCA9685_MAXCHAN)) { - is_inuse = true; - goto out; - } - } else { - /* - * Regular channel: - * pretend already in use if the "all LEDs" channel is requested - */ - if (test_bit(PCA9685_MAXCHAN, pca->pwms_inuse)) { - is_inuse = true; - goto out; - } + if (!wf->period_length_ns) { + *wfhw = (typeof(*wfhw)){ + .onoff = { 0, 0, 0, LED_FULL, }, + .prescale = 0, + }; + + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld [+%lld] -> [%hhx %hhx %hhx %hhx] PSC:%hhx\n", + pwm->hwpwm, wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns, + wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale); + + return 0; } - is_inuse = test_and_set_bit(pwm_idx, pca->pwms_inuse); -out: - mutex_unlock(&pca->lock); - return is_inuse; -} -static void pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx) -{ - mutex_lock(&pca->lock); - clear_bit(pwm_idx, pca->pwms_inuse); - mutex_unlock(&pca->lock); -} + if (wf->period_length_ns >= PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(255)) { + best_prescale = 255; + } else if (wf->period_length_ns < PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(3)) { + best_prescale = 3; + ret_tohw = 1; + } else { + best_prescale = (unsigned int)wf->period_length_ns / (PCA9685_COUNTER_RANGE * (NSEC_PER_SEC / PCA9685_OSC_CLOCK_HZ)) - 1; + } -static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - struct pca9685 *pca = to_pca(chip); + guard(mutex)(&pca->lock); - if (pca9685_pwm_test_and_set_inuse(pca, offset)) - return -EBUSY; - pm_runtime_get_sync(pwmchip_parent(chip)); - return 0; -} + if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) { + unsigned int current_prescale; + int ret; -static int pca9685_pwm_gpio_get(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, ¤t_prescale); + if (ret) + return ret; - return pca9685_pwm_get_duty(chip, offset) != 0; -} + if (current_prescale > best_prescale) + ret_tohw = 1; -static int pca9685_pwm_gpio_set(struct gpio_chip *gpio, unsigned int offset, - int value) -{ - struct pwm_chip 
*chip = gpiochip_get_data(gpio); + prescale = current_prescale; + } else { + prescale = best_prescale; + } - pca9685_pwm_set_duty(chip, offset, value ? PCA9685_COUNTER_RANGE : 0); + period_ns = PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); - return 0; -} + duty = (unsigned)min_t(u64, wf->duty_length_ns, period_ns) / PCA9685_QUANTUM_NS(prescale); -static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset) -{ - struct pwm_chip *chip = gpiochip_get_data(gpio); - struct pca9685 *pca = to_pca(chip); + if (duty < PCA9685_COUNTER_RANGE) { + unsigned int on, off; - pca9685_pwm_set_duty(chip, offset, 0); - pm_runtime_put(pwmchip_parent(chip)); - pca9685_pwm_clear_inuse(pca, offset); -} + on = (unsigned)min_t(u64, wf->duty_offset_ns, period_ns) / PCA9685_QUANTUM_NS(prescale); + off = (on + duty) % PCA9685_COUNTER_RANGE; -static int pca9685_pwm_gpio_get_direction(struct gpio_chip *chip, - unsigned int offset) -{ - /* Always out */ - return GPIO_LINE_DIRECTION_OUT; -} + /* + * With a zero duty cycle, it doesn't matter if period was + * rounded up + */ + if (!duty) + ret_tohw = 0; -static int pca9685_pwm_gpio_direction_input(struct gpio_chip *gpio, - unsigned int offset) -{ - return -EINVAL; -} + *wfhw = (typeof(*wfhw)){ + .onoff = { on & 0xff, (on >> 8) & 0xf, off & 0xff, (off >> 8) & 0xf }, + .prescale = prescale, + }; + } else { + *wfhw = (typeof(*wfhw)){ + .onoff = { 0, LED_FULL, 0, 0, }, + .prescale = prescale, + }; + } -static int pca9685_pwm_gpio_direction_output(struct gpio_chip *gpio, - unsigned int offset, int value) -{ - pca9685_pwm_gpio_set(gpio, offset, value); + dev_dbg(&chip->dev, "pwm#%u: %lld/%lld [+%lld] -> %s[%hhx %hhx %hhx %hhx] PSC:%hhx\n", + pwm->hwpwm, wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns, + ret_tohw ? "#" : "", wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale); - return 0; + return ret_tohw; } -/* - * The PCA9685 has a bit for turning the PWM output full off or on. Some - * boards like Intel Galileo actually uses these as normal GPIOs so we - * expose a GPIO chip here which can exclusively take over the underlying - * PWM channel. 
- */ -static int pca9685_pwm_gpio_probe(struct pwm_chip *chip) +static int pca9685_round_waveform_fromhw(struct pwm_chip *chip, struct pwm_device *pwm, + const void *_wfhw, struct pwm_waveform *wf) { + const struct pca9685_waveform *wfhw = _wfhw; struct pca9685 *pca = to_pca(chip); - struct device *dev = pwmchip_parent(chip); + unsigned int prescale; - pca->gpio.label = dev_name(dev); - pca->gpio.parent = dev; - pca->gpio.request = pca9685_pwm_gpio_request; - pca->gpio.free = pca9685_pwm_gpio_free; - pca->gpio.get_direction = pca9685_pwm_gpio_get_direction; - pca->gpio.direction_input = pca9685_pwm_gpio_direction_input; - pca->gpio.direction_output = pca9685_pwm_gpio_direction_output; - pca->gpio.get = pca9685_pwm_gpio_get; - pca->gpio.set = pca9685_pwm_gpio_set; - pca->gpio.base = -1; - pca->gpio.ngpio = PCA9685_MAXCHAN; - pca->gpio.can_sleep = true; - - return devm_gpiochip_add_data(dev, &pca->gpio, chip); -} -#else -static inline bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, - int pwm_idx) -{ - return false; -} + if (wfhw->prescale) + prescale = wfhw->prescale; + else + scoped_guard(mutex, &pca->lock) { + int ret; -static inline void -pca9685_pwm_clear_inuse(struct pca9685 *pca, int pwm_idx) -{ -} + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, &prescale); + if (ret) + return ret; + } -static inline int pca9685_pwm_gpio_probe(struct pwm_chip *chip) -{ - return 0; -} -#endif + wf->period_length_ns = PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); -static void pca9685_set_sleep_mode(struct pwm_chip *chip, bool enable) -{ - struct device *dev = pwmchip_parent(chip); - struct pca9685 *pca = to_pca(chip); - int err = regmap_update_bits(pca->regmap, PCA9685_MODE1, - MODE1_SLEEP, enable ? MODE1_SLEEP : 0); - if (err) { - dev_err(dev, "regmap_update_bits of register 0x%x failed: %pe\n", - PCA9685_MODE1, ERR_PTR(err)); - return; - } + if (wfhw->onoff[3] & LED_FULL) { + wf->duty_length_ns = 0; + wf->duty_offset_ns = 0; + } else if (wfhw->onoff[1] & LED_FULL) { + wf->duty_length_ns = wf->period_length_ns; + wf->duty_offset_ns = 0; + } else { + unsigned int on = wfhw->onoff[0] | (wfhw->onoff[1] & 0xf) << 8; + unsigned int off = wfhw->onoff[2] | (wfhw->onoff[3] & 0xf) << 8; - if (!enable) { - /* Wait 500us for the oscillator to be back up */ - udelay(500); + wf->duty_length_ns = (off - on) % PCA9685_COUNTER_RANGE * PCA9685_QUANTUM_NS(prescale); + wf->duty_offset_ns = on * PCA9685_QUANTUM_NS(prescale); } + + dev_dbg(&chip->dev, "pwm#%u: [%hhx %hhx %hhx %hhx] PSC:%hhx -> %lld/%lld [+%lld]\n", + pwm->hwpwm, + wfhw->onoff[0], wfhw->onoff[1], wfhw->onoff[2], wfhw->onoff[3], wfhw->prescale, + wf->duty_length_ns, wf->period_length_ns, wf->duty_offset_ns); + + return 0; } -static int __pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - const struct pwm_state *state) +static int pca9685_read_waveform(struct pwm_chip *chip, struct pwm_device *pwm, void *_wfhw) { + struct pca9685_waveform *wfhw = _wfhw; struct pca9685 *pca = to_pca(chip); - unsigned long long duty, prescale; - unsigned int val = 0; - - if (state->polarity != PWM_POLARITY_NORMAL) - return -EINVAL; - - prescale = DIV_ROUND_CLOSEST_ULL(PCA9685_OSC_CLOCK_MHZ * state->period, - PCA9685_COUNTER_RANGE * 1000) - 1; - if (prescale < PCA9685_PRESCALE_MIN || prescale > PCA9685_PRESCALE_MAX) { - dev_err(pwmchip_parent(chip), "pwm not changed: period out of bounds!\n"); - return -EINVAL; - } + unsigned int prescale; + int ret; - if (!state->enabled) { - pca9685_pwm_set_duty(chip, pwm->hwpwm, 0); - return 0; - } + 
guard(mutex)(&pca->lock); - pca9685_read_reg(chip, PCA9685_PRESCALE, &val); - if (prescale != val) { - if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) { - dev_err(pwmchip_parent(chip), - "pwm not changed: periods of enabled pwms must match!\n"); - return -EBUSY; - } + ret = regmap_bulk_read(pca->regmap, REG_ON_L(pwm->hwpwm), &wfhw->onoff, 4); + if (ret) + return ret; - /* - * Putting the chip briefly into SLEEP mode - * at this point won't interfere with the - * pm_runtime framework, because the pm_runtime - * state is guaranteed active here. - */ - /* Put chip into sleep mode */ - pca9685_set_sleep_mode(chip, true); + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, &prescale); + if (ret) + return ret; - /* Change the chip-wide output frequency */ - pca9685_write_reg(chip, PCA9685_PRESCALE, prescale); + wfhw->prescale = prescale; - /* Wake the chip up */ - pca9685_set_sleep_mode(chip, false); - } - - duty = PCA9685_COUNTER_RANGE * state->duty_cycle; - duty = DIV_ROUND_UP_ULL(duty, state->period); - pca9685_pwm_set_duty(chip, pwm->hwpwm, duty); return 0; } -static int pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, - const struct pwm_state *state) +static int pca9685_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, const void *_wfhw) { + const struct pca9685_waveform *wfhw = _wfhw; struct pca9685 *pca = to_pca(chip); + unsigned int current_prescale; int ret; - mutex_lock(&pca->lock); - ret = __pca9685_pwm_apply(chip, pwm, state); - if (ret == 0) { - if (state->enabled) - set_bit(pwm->hwpwm, pca->pwms_enabled); - else - clear_bit(pwm->hwpwm, pca->pwms_enabled); - } - mutex_unlock(&pca->lock); + guard(mutex)(&pca->lock); - return ret; -} + if (wfhw->prescale) { + ret = regmap_read(pca->regmap, PCA9685_PRESCALE, ¤t_prescale); + if (ret) + return ret; -static int pca9685_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state) -{ - unsigned long long duty; - unsigned int val = 0; + if (current_prescale != wfhw->prescale) { + if (!pca9685_prescaler_can_change(pca, pwm->hwpwm)) + return -EBUSY; - /* Calculate (chip-wide) period from prescale value */ - pca9685_read_reg(chip, PCA9685_PRESCALE, &val); - /* - * PCA9685_OSC_CLOCK_MHZ is 25, i.e. an integer divider of 1000. - * The following calculation is therefore only a multiplication - * and we are not losing precision. 
- */ - state->period = (PCA9685_COUNTER_RANGE * 1000 / PCA9685_OSC_CLOCK_MHZ) * - (val + 1); + /* Put chip into sleep mode */ + ret = pca9685_set_sleep_mode(chip, true); + if (ret) + return ret; - /* The (per-channel) polarity is fixed */ - state->polarity = PWM_POLARITY_NORMAL; + /* Change the chip-wide output frequency */ + ret = regmap_write(pca->regmap, PCA9685_PRESCALE, wfhw->prescale); + if (ret) + return ret; - if (pwm->hwpwm >= PCA9685_MAXCHAN) { - /* - * The "all LEDs" channel does not support HW readout - * Return 0 and disabled for backwards compatibility - */ - state->duty_cycle = 0; - state->enabled = false; - return 0; + /* Wake the chip up */ + ret = pca9685_set_sleep_mode(chip, false); + if (ret) + return ret; + } } - state->enabled = true; - duty = pca9685_pwm_get_duty(chip, pwm->hwpwm); - state->duty_cycle = DIV_ROUND_DOWN_ULL(duty * state->period, PCA9685_COUNTER_RANGE); - - return 0; + return regmap_bulk_write(pca->regmap, REG_ON_L(pwm->hwpwm), &wfhw->onoff, 4); } static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct pca9685 *pca = to_pca(chip); - if (pca9685_pwm_test_and_set_inuse(pca, pwm->hwpwm)) - return -EBUSY; - if (pwm->hwpwm < PCA9685_MAXCHAN) { /* PWMs - except the "all LEDs" channel - default to enabled */ mutex_lock(&pca->lock); @@ -497,26 +378,52 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) struct pca9685 *pca = to_pca(chip); mutex_lock(&pca->lock); - pca9685_pwm_set_duty(chip, pwm->hwpwm, 0); clear_bit(pwm->hwpwm, pca->pwms_enabled); mutex_unlock(&pca->lock); pm_runtime_put(pwmchip_parent(chip)); - pca9685_pwm_clear_inuse(pca, pwm->hwpwm); } static const struct pwm_ops pca9685_pwm_ops = { - .apply = pca9685_pwm_apply, - .get_state = pca9685_pwm_get_state, + .sizeof_wfhw = sizeof(struct pca9685_waveform), + .round_waveform_tohw = pca9685_round_waveform_tohw, + .round_waveform_fromhw = pca9685_round_waveform_fromhw, + .read_waveform = pca9685_read_waveform, + .write_waveform = pca9685_write_waveform, .request = pca9685_pwm_request, .free = pca9685_pwm_free, }; +static bool pca9685_readable_reg(struct device *dev, unsigned int reg) +{ + /* The ALL_LED registers are readable but read as zero */ + return reg <= REG_OFF_H(15) || reg >= PCA9685_PRESCALE; +} + +static bool pca9685_writeable_reg(struct device *dev, unsigned int reg) +{ + return reg <= REG_OFF_H(15) || reg >= PCA9685_ALL_LED_ON_L; +} + +static bool pca9685_volatile_reg(struct device *dev, unsigned int reg) +{ + /* + * Writing to an ALL_LED register affects all LEDi registers, so they + * are not cacheable. 
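+ * (In effect every register below PCA9685_PRESCALE is treated as volatile + * and essentially only the prescaler value profits from the regmap cache; + * an editorial note derived from the check below.)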
:-\ + */ + return reg < PCA9685_PRESCALE; +} + static const struct regmap_config pca9685_regmap_i2c_config = { .reg_bits = 8, .val_bits = 8, + + .readable_reg = pca9685_readable_reg, + .writeable_reg = pca9685_writeable_reg, + .volatile_reg = pca9685_volatile_reg, + .max_register = PCA9685_NUMREGS, - .cache_type = REGCACHE_NONE, + .cache_type = REGCACHE_MAPLE, }; static int pca9685_pwm_probe(struct i2c_client *client) @@ -544,9 +451,8 @@ static int pca9685_pwm_probe(struct i2c_client *client) mutex_init(&pca->lock); - ret = pca9685_read_reg(chip, PCA9685_MODE2, ®); - if (ret) - return ret; + /* clear MODE2_OCH */ + reg = 0; if (device_property_read_bool(&client->dev, "invert")) reg |= MODE2_INVRT; @@ -562,16 +468,19 @@ static int pca9685_pwm_probe(struct i2c_client *client) if (ret) return ret; - /* Disable all LED ALLCALL and SUBx addresses to avoid bus collisions */ + /* + * Disable all LED ALLCALL and SUBx addresses to avoid bus collisions, + * enable Auto-Increment. + */ pca9685_read_reg(chip, PCA9685_MODE1, ®); reg &= ~(MODE1_ALLCALL | MODE1_SUB1 | MODE1_SUB2 | MODE1_SUB3); + reg |= MODE1_AI; pca9685_write_reg(chip, PCA9685_MODE1, reg); /* Reset OFF/ON registers to POR default */ - pca9685_write_reg(chip, PCA9685_ALL_LED_OFF_L, 0); - pca9685_write_reg(chip, PCA9685_ALL_LED_OFF_H, LED_FULL); - pca9685_write_reg(chip, PCA9685_ALL_LED_ON_L, 0); - pca9685_write_reg(chip, PCA9685_ALL_LED_ON_H, LED_FULL); + ret = pca9685_write_4reg(chip, PCA9685_ALL_LED_ON_L, (u8[]){ 0, LED_FULL, 0, LED_FULL }); + if (ret < 0) + return dev_err_probe(&client->dev, ret, "Failed to reset ON/OFF registers\n"); chip->ops = &pca9685_pwm_ops; @@ -579,12 +488,6 @@ static int pca9685_pwm_probe(struct i2c_client *client) if (ret < 0) return ret; - ret = pca9685_pwm_gpio_probe(chip); - if (ret < 0) { - pwmchip_remove(chip); - return ret; - } - pm_runtime_enable(&client->dev); if (pm_runtime_enabled(&client->dev)) { diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index d91b2bdc88fce2..67cc5e8bdb0ef8 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -3,6 +3,10 @@ * ECAP PWM driver * * Copyright (C) 2012 Texas Instruments, Inc. - https://www.ti.com/ + * + * Hardware properties: + * - On disable the PWM pin becomes an input, so the behaviour depends on + * external wiring. 
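+ * (That is, the level seen while the PWM is disabled is determined by an + * external pull-up or pull-down, if any, not by the driver; an editorial + * restatement of the point above.)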
*/ #include diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 0125e73b98dfb4..7a86cb090f76f1 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -36,7 +36,7 @@ #define CLKDIV_MAX 7 #define HSPCLKDIV_MAX 7 -#define PERIOD_MAX 0xFFFF +#define PERIOD_MAX 0x10000 /* compare module registers */ #define CMPA 0x12 @@ -65,14 +65,10 @@ #define AQCTL_ZRO_FRCHIGH BIT(1) #define AQCTL_ZRO_FRCTOGGLE (BIT(1) | BIT(0)) -#define AQCTL_CHANA_POLNORMAL (AQCTL_CAU_FRCLOW | AQCTL_PRD_FRCHIGH | \ - AQCTL_ZRO_FRCHIGH) -#define AQCTL_CHANA_POLINVERSED (AQCTL_CAU_FRCHIGH | AQCTL_PRD_FRCLOW | \ - AQCTL_ZRO_FRCLOW) -#define AQCTL_CHANB_POLNORMAL (AQCTL_CBU_FRCLOW | AQCTL_PRD_FRCHIGH | \ - AQCTL_ZRO_FRCHIGH) -#define AQCTL_CHANB_POLINVERSED (AQCTL_CBU_FRCHIGH | AQCTL_PRD_FRCLOW | \ - AQCTL_ZRO_FRCLOW) +#define AQCTL_CHANA_POLNORMAL (AQCTL_CAU_FRCLOW | AQCTL_ZRO_FRCHIGH) +#define AQCTL_CHANA_POLINVERSED (AQCTL_CAU_FRCHIGH | AQCTL_ZRO_FRCLOW) +#define AQCTL_CHANB_POLNORMAL (AQCTL_CBU_FRCLOW | AQCTL_ZRO_FRCHIGH) +#define AQCTL_CHANB_POLINVERSED (AQCTL_CBU_FRCHIGH | AQCTL_ZRO_FRCLOW) #define AQSFRC_RLDCSF_MASK (BIT(7) | BIT(6)) #define AQSFRC_RLDCSF_ZRO 0 @@ -108,7 +104,6 @@ struct ehrpwm_pwm_chip { unsigned long clk_rate; void __iomem *mmio_base; unsigned long period_cycles[NUM_PWM_CHANNEL]; - enum pwm_polarity polarity[NUM_PWM_CHANNEL]; struct clk *tbclk; struct ehrpwm_context ctx; }; @@ -166,7 +161,7 @@ static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div, *prescale_div = (1 << clkdiv) * (hspclkdiv ? (hspclkdiv * 2) : 1); - if (*prescale_div > rqst_prescaler) { + if (*prescale_div >= rqst_prescaler) { *tb_clk_div = (clkdiv << TBCTL_CLKDIV_SHIFT) | (hspclkdiv << TBCTL_HSPCLKDIV_SHIFT); return 0; @@ -177,51 +172,20 @@ static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div, return 1; } -static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan) -{ - u16 aqctl_val, aqctl_mask; - unsigned int aqctl_reg; - - /* - * Configure PWM output to HIGH/LOW level on counter - * reaches compare register value and LOW/HIGH level - * on counter value reaches period register value and - * zero value on counter - */ - if (chan == 1) { - aqctl_reg = AQCTLB; - aqctl_mask = AQCTL_CBU_MASK; - - if (pc->polarity[chan] == PWM_POLARITY_INVERSED) - aqctl_val = AQCTL_CHANB_POLINVERSED; - else - aqctl_val = AQCTL_CHANB_POLNORMAL; - } else { - aqctl_reg = AQCTLA; - aqctl_mask = AQCTL_CAU_MASK; - - if (pc->polarity[chan] == PWM_POLARITY_INVERSED) - aqctl_val = AQCTL_CHANA_POLINVERSED; - else - aqctl_val = AQCTL_CHANA_POLNORMAL; - } - - aqctl_mask |= AQCTL_PRD_MASK | AQCTL_ZRO_MASK; - ehrpwm_modify(pc->mmio_base, aqctl_reg, aqctl_mask, aqctl_val); -} - /* * period_ns = 10^9 * (ps_divval * period_cycles) / PWM_CLK_RATE * duty_ns = 10^9 * (ps_divval * duty_cycles) / PWM_CLK_RATE */ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - u64 duty_ns, u64 period_ns) + u64 duty_ns, u64 period_ns, enum pwm_polarity polarity) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); u32 period_cycles, duty_cycles; u16 ps_divval, tb_divval; unsigned int i, cmp_reg; unsigned long long c; + u16 aqctl_val, aqctl_mask; + unsigned int aqctl_reg; if (period_ns > NSEC_PER_SEC) return -ERANGE; @@ -231,15 +195,10 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, do_div(c, NSEC_PER_SEC); period_cycles = (unsigned long)c; - if (period_cycles < 1) { - period_cycles = 1; - duty_cycles = 1; - } else { - c = pc->clk_rate; 
- c = c * duty_ns; - do_div(c, NSEC_PER_SEC); - duty_cycles = (unsigned long)c; - } + c = pc->clk_rate; + c = c * duty_ns; + do_div(c, NSEC_PER_SEC); + duty_cycles = (unsigned long)c; /* * Period values should be same for multiple PWM channels as IP uses @@ -265,52 +224,73 @@ pc->period_cycles[pwm->hwpwm] = period_cycles; /* Configure clock prescaler to support Low frequency PWM wave */ - if (set_prescale_div(period_cycles/PERIOD_MAX, &ps_divval, + if (set_prescale_div(DIV_ROUND_UP(period_cycles, PERIOD_MAX), &ps_divval, &tb_divval)) { dev_err(pwmchip_parent(chip), "Unsupported values\n"); return -EINVAL; } - pm_runtime_get_sync(pwmchip_parent(chip)); - - /* Update clock prescaler values */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CLKDIV_MASK, tb_divval); - /* Update period & duty cycle with prescaler division */ period_cycles = period_cycles / ps_divval; duty_cycles = duty_cycles / ps_divval; - /* Configure shadow loading on Period register */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_PRDLD_MASK, TBCTL_PRDLD_SHDW); + if (period_cycles < 1) + period_cycles = 1; - ehrpwm_write(pc->mmio_base, TBPRD, period_cycles); + pm_runtime_get_sync(pwmchip_parent(chip)); - /* Configure ehrpwm counter for up-count mode */ - ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK, - TBCTL_CTRMODE_UP); + /* Update clock prescaler values */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CLKDIV_MASK, tb_divval); - if (pwm->hwpwm == 1) + if (pwm->hwpwm == 1) { /* Channel 1 configured with compare B register */ cmp_reg = CMPB; + + aqctl_reg = AQCTLB; + aqctl_mask = AQCTL_CBU_MASK; + + if (polarity == PWM_POLARITY_INVERSED) + aqctl_val = AQCTL_CHANB_POLINVERSED; + else + aqctl_val = AQCTL_CHANB_POLNORMAL; + + /* if duty_cycle is big, don't toggle on CBU */ + if (duty_cycles > period_cycles) + aqctl_val &= ~AQCTL_CBU_MASK; + + } else { /* Channel 0 configured with compare A register */ cmp_reg = CMPA; - ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); + aqctl_reg = AQCTLA; + aqctl_mask = AQCTL_CAU_MASK; - pm_runtime_put_sync(pwmchip_parent(chip)); + if (polarity == PWM_POLARITY_INVERSED) + aqctl_val = AQCTL_CHANA_POLINVERSED; + else + aqctl_val = AQCTL_CHANA_POLNORMAL; - return 0; -} + /* if duty_cycle is big, don't toggle on CAU */ + if (duty_cycles > period_cycles) + aqctl_val &= ~AQCTL_CAU_MASK; + } -static int ehrpwm_pwm_set_polarity(struct pwm_chip *chip, - struct pwm_device *pwm, - enum pwm_polarity polarity) -{ - struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); + aqctl_mask |= AQCTL_PRD_MASK | AQCTL_ZRO_MASK; + ehrpwm_modify(pc->mmio_base, aqctl_reg, aqctl_mask, aqctl_val); + + /* Configure shadow loading on Period register */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_PRDLD_MASK, TBCTL_PRDLD_SHDW); + + ehrpwm_write(pc->mmio_base, TBPRD, period_cycles - 1); - /* Configuration of polarity in hardware delayed, do at enable */ - pc->polarity[pwm->hwpwm] = polarity; + /* Configure ehrpwm counter for up-count mode */ + ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK, + TBCTL_CTRMODE_UP); + + if (!(duty_cycles > period_cycles)) + ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); + + pm_runtime_put_sync(pwmchip_parent(chip)); return 0; } @@ -339,9 +319,6 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val); - /* Channels polarity can be configured from action qualifier module */ - configure_polarity(pc, pwm->hwpwm); - /* 
Enable TBCLK */ ret = clk_enable(pc->tbclk); if (ret) { @@ -391,12 +368,7 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - if (pwm_is_enabled(pwm)) { - dev_warn(pwmchip_parent(chip), "Removing PWM device without disabling\n"); - pm_runtime_put_sync(pwmchip_parent(chip)); - } - - /* set period value to zero on free */ + /* Don't let a pwm without consumer block requests to the other channel */ pc->period_cycles[pwm->hwpwm] = 0; } @@ -411,10 +383,6 @@ static int ehrpwm_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, ehrpwm_pwm_disable(chip, pwm); enabled = false; } - - err = ehrpwm_pwm_set_polarity(chip, pwm, state->polarity); - if (err) - return err; } if (!state->enabled) { @@ -423,7 +391,7 @@ static int ehrpwm_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } - err = ehrpwm_pwm_config(chip, pwm, state->duty_cycle, state->period); + err = ehrpwm_pwm_config(chip, pwm, state->duty_cycle, state->period, state->polarity); if (err) return err; diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index a6e4792a1b2e92..ac0e132ccc3eb9 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -51,6 +51,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, { trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } +EXPORT_SYMBOL_GPL(log_non_standard_event); void log_arm_hw_error(struct cper_sec_proc_arm *err) { diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f8066..d84f3d054c59d8 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -297,6 +297,14 @@ config REGULATOR_BD96801 This driver can also be built as a module. If so, the module will be called bd96801-regulator. +config REGULATOR_BQ257XX + tristate "TI BQ257XX regulator family" + depends on MFD_BQ257XX + depends on GPIOLIB || COMPILE_TEST + help + Say Y to enable support for the boost regulator function of + the BQ257XX family of charger circuits. + config REGULATOR_CPCAP tristate "Motorola CPCAP regulator" depends on MFD_CPCAP @@ -777,6 +785,15 @@ config REGULATOR_MAX77826 It includes support for control of output voltage. This regulator is found on the Samsung Galaxy S5 (klte) smartphone. +config REGULATOR_MAX77838 + tristate "Maxim 77838 regulator" + depends on I2C + select REGMAP_I2C + help + This driver controls a Maxim 77838 regulator via I2C bus. + The regulator includes 4 LDOs and a BUCK regulator. It's + present on the Samsung Galaxy S7 lineup of smartphones. + config REGULATOR_MC13XXX_CORE tristate @@ -1006,6 +1023,26 @@ config REGULATOR_PCAP This driver provides support for the voltage regulators of the PCAP2 PMIC. +config REGULATOR_PF0900 + tristate "NXP PF0900/PF0901/PF09XX regulator driver" + depends on I2C + select REGMAP_I2C + help + Say y here to support the NXP PF0900/PF0901/PF09XX PMIC + regulator driver. + +config REGULATOR_PF530X + tristate "NXP PF5300/PF5301/PF5302 regulator driver" + depends on I2C && OF + select REGMAP_I2C + help + Say y here to support the regulators found on the NXP + PF5300/PF5301/PF5302 PMIC. + + Say M here if you want support for the regulators found + on the NXP PF5300/PF5301/PF5302 PMIC. The module will be named + "pf530x-regulator". + config REGULATOR_PF8X00 tristate "NXP PF8100/PF8121A/PF8200 regulator driver" depends on I2C && OF @@ -1240,6 +1277,18 @@ config REGULATOR_RT5120 600mV to 1395mV, per step 6.250mV. The others are all fixed voltage by external hardware circuit. 
+config REGULATOR_RT5133 + tristate "Richtek RT5133 PMIC Regulators" + depends on I2C && GPIOLIB && OF + select REGMAP + select CRC8 + select OF_GPIO + help + This driver adds support for the RT5133 PMIC regulators. + The RT5133 is an integrated chip that includes 8 LDOs and + 3 GPOs which can be used to drive their outputs high or low. + The GPO block internally depends on the LDO1 voltage. + config REGULATOR_RT5190A tristate "Richtek RT5190A PMIC" depends on I2C @@ -1344,6 +1393,14 @@ config REGULATOR_RTQ2208 and two ldos. It features wide output voltage range from 0.4V to 2.05V and the capability to configure the corresponding power stages. +config REGULATOR_S2DOS05 + tristate "Samsung S2DOS05 voltage regulator" + depends on MFD_SEC_CORE || COMPILE_TEST + help + This driver provides support for the voltage regulators of the S2DOS05. + The S2DOS05 is a companion power management IC for smartphones. + The S2DOS05 has 4 LDO outputs and 1 BUCK output. + config REGULATOR_S2MPA01 tristate "Samsung S2MPA01 voltage regulator" depends on MFD_SEC_CORE || COMPILE_TEST @@ -1395,6 +1452,19 @@ config REGULATOR_SLG51000 The SLG51000 consists of seven compact and customizable low dropout regulators. +config REGULATOR_SPACEMIT_P1 + tristate "SpacemiT P1 regulators" + depends on ARCH_SPACEMIT || COMPILE_TEST + depends on I2C + select MFD_SPACEMIT_P1 + default ARCH_SPACEMIT + help + Enable support for regulators implemented by the SpacemiT P1 + power controller. The P1 implements 6 high-efficiency buck + converters and 12 programmable LDO regulators. To compile this + driver as a module, choose M here. The module will be called + "spacemit-pmic". + config REGULATOR_STM32_BOOSTER tristate "STMicroelectronics STM32 BOOSTER" depends on ARCH_STM32 || COMPILE_TEST diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675d8..b3101376029d71 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_REGULATOR_BD71828) += bd71828-regulator.o obj-$(CONFIG_REGULATOR_BD718XX) += bd718x7-regulator.o obj-$(CONFIG_REGULATOR_BD9571MWV) += bd9571mwv-regulator.o obj-$(CONFIG_REGULATOR_BD957XMUF) += bd9576-regulator.o +obj-$(CONFIG_REGULATOR_BQ257XX) += bq257xx-regulator.o obj-$(CONFIG_REGULATOR_DA903X) += da903x-regulator.o obj-$(CONFIG_REGULATOR_BD96801) += bd96801-regulator.o obj-$(CONFIG_REGULATOR_DA9052) += da9052-regulator.o @@ -92,6 +93,7 @@ obj-$(CONFIG_REGULATOR_MAX77686) += max77686-regulator.o obj-$(CONFIG_REGULATOR_MAX77693) += max77693-regulator.o obj-$(CONFIG_REGULATOR_MAX77802) += max77802-regulator.o obj-$(CONFIG_REGULATOR_MAX77826) += max77826-regulator.o +obj-$(CONFIG_REGULATOR_MAX77838) += max77838-regulator.o obj-$(CONFIG_REGULATOR_MAX77857) += max77857-regulator.o obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o @@ -124,7 +126,9 @@ obj-$(CONFIG_REGULATOR_QCOM_SPMI) += qcom_spmi-regulator.o obj-$(CONFIG_REGULATOR_QCOM_USB_VBUS) += qcom_usb_vbus-regulator.o obj-$(CONFIG_REGULATOR_PALMAS) += palmas-regulator.o obj-$(CONFIG_REGULATOR_PCA9450) += pca9450-regulator.o +obj-$(CONFIG_REGULATOR_PF0900) += pf0900-regulator.o obj-$(CONFIG_REGULATOR_PF9453) += pf9453-regulator.o +obj-$(CONFIG_REGULATOR_PF530X) += pf530x-regulator.o obj-$(CONFIG_REGULATOR_PF8X00) += pf8x00-regulator.o obj-$(CONFIG_REGULATOR_PFUZE100) += pfuze100-regulator.o obj-$(CONFIG_REGULATOR_PV88060) += pv88060-regulator.o @@ -146,6 +150,7 @@ obj-$(CONFIG_REGULATOR_RT4803) += rt4803.o obj-$(CONFIG_REGULATOR_RT4831) += 
rt4831-regulator.o obj-$(CONFIG_REGULATOR_RT5033) += rt5033-regulator.o obj-$(CONFIG_REGULATOR_RT5120) += rt5120-regulator.o +obj-$(CONFIG_REGULATOR_RT5133) += rt5133-regulator.o obj-$(CONFIG_REGULATOR_RT5190A) += rt5190a-regulator.o obj-$(CONFIG_REGULATOR_RT5739) += rt5739.o obj-$(CONFIG_REGULATOR_RT5759) += rt5759-regulator.o @@ -156,12 +161,14 @@ obj-$(CONFIG_REGULATOR_RTMV20) += rtmv20-regulator.o obj-$(CONFIG_REGULATOR_RTQ2134) += rtq2134-regulator.o obj-$(CONFIG_REGULATOR_RTQ6752) += rtq6752-regulator.o obj-$(CONFIG_REGULATOR_RTQ2208) += rtq2208-regulator.o +obj-$(CONFIG_REGULATOR_S2DOS05) += s2dos05-regulator.o obj-$(CONFIG_REGULATOR_S2MPA01) += s2mpa01.o obj-$(CONFIG_REGULATOR_S2MPS11) += s2mps11.o obj-$(CONFIG_REGULATOR_S5M8767) += s5m8767.o obj-$(CONFIG_REGULATOR_SC2731) += sc2731-regulator.o obj-$(CONFIG_REGULATOR_SKY81452) += sky81452-regulator.o obj-$(CONFIG_REGULATOR_SLG51000) += slg51000-regulator.o +obj-$(CONFIG_REGULATOR_SPACEMIT_P1) += spacemit-p1.o obj-$(CONFIG_REGULATOR_STM32_BOOSTER) += stm32-booster.o obj-$(CONFIG_REGULATOR_STM32_VREFBUF) += stm32-vrefbuf.o obj-$(CONFIG_REGULATOR_STM32_PWR) += stm32-pwr.o diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index e803cc59d68a5c..022d98f3c32a2d 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -1598,7 +1598,7 @@ static int setup_feedback_loop(struct device *dev, struct device_node *np, if (desc->n_linear_ranges && desc->linear_ranges) { struct linear_range *new; - new = devm_kzalloc(dev, desc->n_linear_ranges * + new = devm_kcalloc(dev, desc->n_linear_ranges, sizeof(struct linear_range), GFP_KERNEL); if (!new) diff --git a/drivers/regulator/bq257xx-regulator.c b/drivers/regulator/bq257xx-regulator.c new file mode 100644 index 00000000000000..fc1ccede446882 --- /dev/null +++ b/drivers/regulator/bq257xx-regulator.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Battery Charger Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bq257xx_reg_data { + struct bq257xx_device *bq; + struct regulator_dev *bq257xx_reg; + struct gpio_desc *otg_en_gpio; + struct regulator_desc desc; +}; + +static int bq25703_vbus_get_cur_limit(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + int ret; + unsigned int reg; + + ret = regmap_read(pdata->bq->regmap, BQ25703_OTG_CURRENT, ®); + if (ret) + return ret; + return FIELD_GET(BQ25703_OTG_CUR_MASK, reg) * BQ25703_OTG_CUR_STEP_UA; +} + +/* + * Check if the minimum current and maximum current requested are + * sane values, then set the register accordingly. + */ +static int bq25703_vbus_set_cur_limit(struct regulator_dev *rdev, + int min_uA, int max_uA) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + unsigned int reg; + + if ((min_uA > BQ25703_OTG_CUR_MAX_UA) || (max_uA < 0)) + return -EINVAL; + + reg = (max_uA / BQ25703_OTG_CUR_STEP_UA); + + /* Catch rounding errors since our step is 50000uA. 
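+ * For example (editorial illustration, not in the original patch): a request + * of min_uA = 120000 and max_uA = 140000 yields reg = 140000 / 50000 = 2, + * and 2 * 50000 = 100000 < min_uA, so no step multiple fits inside the + * requested window and we return -EINVAL.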
*/ + if ((reg * BQ25703_OTG_CUR_STEP_UA) < min_uA) + return -EINVAL; + + return regmap_write(pdata->bq->regmap, BQ25703_OTG_CURRENT, + FIELD_PREP(BQ25703_OTG_CUR_MASK, reg)); +} + +static int bq25703_vbus_enable(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + + if (pdata->otg_en_gpio) + gpiod_set_value_cansleep(pdata->otg_en_gpio, 1); + return regulator_enable_regmap(rdev); +} + +static int bq25703_vbus_disable(struct regulator_dev *rdev) +{ + struct bq257xx_reg_data *pdata = rdev_get_drvdata(rdev); + + if (pdata->otg_en_gpio) + gpiod_set_value_cansleep(pdata->otg_en_gpio, 0); + return regulator_disable_regmap(rdev); +} + +static const struct regulator_ops bq25703_vbus_ops = { + .enable = bq25703_vbus_enable, + .disable = bq25703_vbus_disable, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_current_limit = bq25703_vbus_get_cur_limit, + .set_current_limit = bq25703_vbus_set_cur_limit, +}; + +static const struct regulator_desc bq25703_vbus_desc = { + .name = "vbus", + .of_match = of_match_ptr("vbus"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &bq25703_vbus_ops, + .min_uV = BQ25703_OTG_VOLT_MIN_UV, + .uV_step = BQ25703_OTG_VOLT_STEP_UV, + .n_voltages = BQ25703_OTG_VOLT_NUM_VOLT, + .enable_mask = BQ25703_EN_OTG_MASK, + .enable_reg = BQ25703_CHARGE_OPTION_3, + .enable_val = BQ25703_EN_OTG_MASK, + .disable_val = 0, + .vsel_reg = BQ25703_OTG_VOLT, + .vsel_mask = BQ25703_OTG_VOLT_MASK, +}; + +/* Get optional GPIO for OTG regulator enable. */ +static void bq257xx_reg_dt_parse_gpio(struct platform_device *pdev) +{ + struct device_node *child, *subchild; + struct bq257xx_reg_data *pdata = platform_get_drvdata(pdev); + + child = of_get_child_by_name(pdev->dev.of_node, + pdata->desc.regulators_node); + if (!child) + return; + + subchild = of_get_child_by_name(child, pdata->desc.of_match); + of_node_put(child); + if (!subchild) + return; + + pdata->otg_en_gpio = devm_fwnode_gpiod_get_index(&pdev->dev, + of_fwnode_handle(subchild), + "enable", 0, + GPIOD_OUT_LOW, + pdata->desc.of_match); + + of_node_put(subchild); + + if (IS_ERR(pdata->otg_en_gpio)) { + dev_err(&pdev->dev, "Error getting enable gpio: %ld\n", + PTR_ERR(pdata->otg_en_gpio)); + pdata->otg_en_gpio = NULL; + return; + } +} + +static int bq257xx_regulator_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct bq257xx_device *bq = dev_get_drvdata(pdev->dev.parent); + struct bq257xx_reg_data *pdata; + struct device_node *np = dev->of_node; + struct regulator_config cfg = {}; + + pdev->dev.of_node = pdev->dev.parent->of_node; + pdev->dev.of_node_reused = true; + + pdata = devm_kzalloc(&pdev->dev, sizeof(struct bq257xx_reg_data), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + pdata->bq = bq; + pdata->desc = bq25703_vbus_desc; + + platform_set_drvdata(pdev, pdata); + bq257xx_reg_dt_parse_gpio(pdev); + + cfg.dev = &pdev->dev; + cfg.driver_data = pdata; + cfg.of_node = np; + cfg.regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!cfg.regmap) + return -ENODEV; + + pdata->bq257xx_reg = devm_regulator_register(dev, &pdata->desc, &cfg); + if (IS_ERR(pdata->bq257xx_reg)) { + return dev_err_probe(&pdev->dev, PTR_ERR(pdata->bq257xx_reg), + "error registering bq257xx regulator\n"); + } + + return 0; +} + +static struct platform_driver bq257xx_reg_driver = { + .driver = { + .name = 
"bq257xx-regulator", + }, + .probe = bq257xx_regulator_probe, +}; + +module_platform_driver(bq257xx_reg_driver); + +MODULE_DESCRIPTION("bq257xx regulator driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 554d83c4af0c1c..dd7b10e768c06c 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1586,8 +1586,8 @@ static int set_machine_constraints(struct regulator_dev *rdev) } if (rdev->constraints->active_discharge && ops->set_active_discharge) { - bool ad_state = (rdev->constraints->active_discharge == - REGULATOR_ACTIVE_DISCHARGE_ENABLE) ? true : false; + bool ad_state = rdev->constraints->active_discharge == + REGULATOR_ACTIVE_DISCHARGE_ENABLE; ret = ops->set_active_discharge(rdev, ad_state); if (ret < 0) { diff --git a/drivers/regulator/max77838-regulator.c b/drivers/regulator/max77838-regulator.c new file mode 100644 index 00000000000000..9faddbfd25fd80 --- /dev/null +++ b/drivers/regulator/max77838-regulator.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +// regulator driver for Maxim MAX77838 +// +// based on max77826-regulator.c +// +// Copyright (c) 2025, Ivaylo Ivanov + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum max77838_registers { + MAX77838_REG_DEVICE_ID = 0x00, + MAX77838_REG_TOPSYS_STAT, + MAX77838_REG_STAT, + MAX77838_REG_EN, + MAX77838_REG_GPIO_PD_CTRL, + MAX77838_REG_UVLO_CFG1, + /* 0x06 - 0x0B: reserved */ + MAX77838_REG_I2C_CFG = 0x0C, + /* 0x0D - 0x0F: reserved */ + MAX77838_REG_LDO1_CFG = 0x10, + MAX77838_REG_LDO2_CFG, + MAX77838_REG_LDO3_CFG, + MAX77838_REG_LDO4_CFG, + /* 0x14 - 0x1F: reserved */ + MAX77838_REG_BUCK_CFG1 = 0x20, + MAX77838_REG_BUCK_VOUT, +}; + +enum max77838_regulators { + MAX77838_LDO1 = 0, + MAX77838_LDO2, + MAX77838_LDO3, + MAX77838_LDO4, + MAX77838_BUCK, + MAX77838_MAX_REGULATORS, +}; + +#define MAX77838_MASK_LDO 0x7f +#define MAX77838_MASK_BUCK 0xff + +#define MAX77838_LDO1_EN BIT(0) +#define MAX77838_LDO2_EN BIT(1) +#define MAX77838_LDO3_EN BIT(2) +#define MAX77838_LDO4_EN BIT(3) +#define MAX77838_BUCK_EN BIT(4) + +#define MAX77838_BUCK_AD BIT(3) +#define MAX77838_LDO_AD BIT(7) + +#define MAX77838_LDO_VOLT_MIN 600000 +#define MAX77838_LDO_VOLT_MAX 3775000 +#define MAX77838_LDO_VOLT_STEP 25000 + +#define MAX77838_BUCK_VOLT_MIN 500000 +#define MAX77838_BUCK_VOLT_MAX 2093750 +#define MAX77838_BUCK_VOLT_STEP 6250 + +#define MAX77838_VOLT_RANGE(_type) \ + ((MAX77838_ ## _type ## _VOLT_MAX - \ + MAX77838_ ## _type ## _VOLT_MIN) / \ + MAX77838_ ## _type ## _VOLT_STEP + 1) + +#define MAX77838_LDO(_id) \ + [MAX77838_LDO ## _id] = { \ + .id = MAX77838_LDO ## _id, \ + .name = "ldo"#_id, \ + .of_match = of_match_ptr("ldo"#_id), \ + .regulators_node = "regulators", \ + .ops = &max77838_regulator_ops, \ + .min_uV = MAX77838_LDO_VOLT_MIN, \ + .uV_step = MAX77838_LDO_VOLT_STEP, \ + .n_voltages = MAX77838_VOLT_RANGE(LDO), \ + .enable_reg = MAX77838_REG_EN, \ + .enable_mask = MAX77838_LDO ## _id ## _EN, \ + .vsel_reg = MAX77838_REG_LDO ## _id ## _CFG, \ + .vsel_mask = MAX77838_MASK_LDO, \ + .active_discharge_off = 0, \ + .active_discharge_on = MAX77838_LDO_AD, \ + .active_discharge_mask = MAX77838_LDO_AD, \ + .active_discharge_reg = MAX77838_REG_LDO ## _id ## _CFG, \ + .owner = THIS_MODULE, \ + } + +#define MAX77838_BUCK_DESC \ + [MAX77838_BUCK] = { \ + .id = MAX77838_BUCK, \ + .name = "buck", \ + .of_match = of_match_ptr("buck"), \ + .regulators_node = "regulators", \ + .ops = 
&max77838_regulator_ops, \ + .min_uV = MAX77838_BUCK_VOLT_MIN, \ + .uV_step = MAX77838_BUCK_VOLT_STEP, \ + .n_voltages = MAX77838_VOLT_RANGE(BUCK), \ + .enable_reg = MAX77838_REG_EN, \ + .enable_mask = MAX77838_BUCK_EN, \ + .vsel_reg = MAX77838_REG_BUCK_VOUT, \ + .vsel_mask = MAX77838_MASK_BUCK, \ + .active_discharge_off = 0, \ + .active_discharge_on = MAX77838_BUCK_AD, \ + .active_discharge_mask = MAX77838_BUCK_AD, \ + .active_discharge_reg = MAX77838_REG_BUCK_CFG1, \ + .owner = THIS_MODULE, \ + } + +struct max77838_regulator_info { + struct regmap *regmap; +}; + +static const struct regmap_config max77838_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX77838_REG_BUCK_VOUT, +}; + +static const struct regulator_ops max77838_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc max77838_regulators_desc[] = { + MAX77838_LDO(1), + MAX77838_LDO(2), + MAX77838_LDO(3), + MAX77838_LDO(4), + MAX77838_BUCK_DESC, +}; + +static int max77838_read_device_id(struct regmap *regmap, struct device *dev) +{ + unsigned int device_id; + int ret; + + ret = regmap_read(regmap, MAX77838_REG_DEVICE_ID, &device_id); + if (!ret) + dev_dbg(dev, "DEVICE_ID: 0x%x\n", device_id); + + return ret; +} + +static int max77838_i2c_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct max77838_regulator_info *info; + struct regulator_config config = {}; + struct regulator_dev *rdev; + struct regmap *regmap; + int i; + + info = devm_kzalloc(dev, sizeof(struct max77838_regulator_info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + regmap = devm_regmap_init_i2c(client, &max77838_regmap_config); + if (IS_ERR(regmap)) { + dev_err(dev, "Failed to allocate regmap!\n"); + return PTR_ERR(regmap); + } + + info->regmap = regmap; + i2c_set_clientdata(client, info); + + config.dev = dev; + config.regmap = regmap; + config.driver_data = info; + + for (i = 0; i < MAX77838_MAX_REGULATORS; i++) { + rdev = devm_regulator_register(dev, + &max77838_regulators_desc[i], + &config); + if (IS_ERR(rdev)) { + dev_err(dev, "Failed to register regulator!\n"); + return PTR_ERR(rdev); + } + } + + return max77838_read_device_id(regmap, dev); +} + +static const struct of_device_id __maybe_unused max77838_of_match[] = { + { .compatible = "maxim,max77838" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, max77838_of_match); + +static const struct i2c_device_id max77838_id[] = { + { "max77838-regulator" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, max77838_id); + +static struct i2c_driver max77838_regulator_driver = { + .driver = { + .name = "max77838", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = of_match_ptr(max77838_of_match), + }, + .probe = max77838_i2c_probe, + .id_table = max77838_id, +}; +module_i2c_driver(max77838_regulator_driver); + +MODULE_AUTHOR("Ivaylo Ivanov "); +MODULE_DESCRIPTION("MAX77838 PMIC regulator driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/pf0900-regulator.c b/drivers/regulator/pf0900-regulator.c new file mode 100644 index 00000000000000..b5effee3291724 --- /dev/null +++ b/drivers/regulator/pf0900-regulator.c @@ -0,0 +1,975 @@ +// 
SPDX-License-Identifier: GPL-2.0 +// Copyright 2025 NXP. +// NXP PF0900 pmic driver + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum pf0900_regulators { + PF0900_SW1 = 0, + PF0900_SW2, + PF0900_SW3, + PF0900_SW4, + PF0900_SW5, + PF0900_LDO1, + PF0900_LDO2, + PF0900_LDO3, + PF0900_VAON, + PF0900_REGULATOR_CNT, +}; + +enum { + PF0900_DVS_LEVEL_RUN = 0, + PF0900_DVS_LEVEL_STANDBY, + PF0900_DVS_LEVEL_MAX, +}; + + +#define PF0900_VAON_VOLTAGE_NUM 0x03 +#define PF0900_SW_VOLTAGE_NUM 0x100 +#define PF0900_LDO_VOLTAGE_NUM 0x20 + +#define REGU_SW_CNT 0x5 +#define REGU_LDO_VAON_CNT 0x4 + +enum { + PF0900_REG_DEV_ID = 0x00, + PF0900_REG_DEV_FAM = 0x01, + PF0900_REG_REV_ID = 0x02, + PF0900_REG_PROG_ID1 = 0x03, + PF0900_REG_PROG_ID2 = 0x04, + PF0900_REG_SYSTEM_INT = 0x05, + PF0900_REG_STATUS1_INT = 0x06, + PF0900_REG_STATUS1_MSK = 0x07, + PF0900_REG_STATUS1_SNS = 0x08, + PF0900_REG_STATUS2_INT = 0x09, + PF0900_REG_STATUS2_MSK = 0x0A, + PF0900_REG_STATUS2_SNS = 0x0B, + PF0900_REG_STATUS3_INT = 0x0C, + PF0900_REG_STATUS3_MSK = 0x0D, + PF0900_REG_SW_MODE_INT = 0x0E, + PF0900_REG_SW_MODE_MSK = 0x0F, + PF0900_REG_SW_ILIM_INT = 0x10, + PF0900_REG_SW_ILIM_MSK = 0x11, + PF0900_REG_SW_ILIM_SNS = 0x12, + PF0900_REG_LDO_ILIM_INT = 0x13, + PF0900_REG_LDO_ILIM_MSK = 0x14, + PF0900_REG_LDO_ILIM_SNS = 0x15, + PF0900_REG_SW_UV_INT = 0x16, + PF0900_REG_SW_UV_MSK = 0x17, + PF0900_REG_SW_UV_SNS = 0x18, + PF0900_REG_SW_OV_INT = 0x19, + PF0900_REG_SW_OV_MSK = 0x1A, + PF0900_REG_SW_OV_SNS = 0x1B, + PF0900_REG_LDO_UV_INT = 0x1C, + PF0900_REG_LDO_UV_MSK = 0x1D, + PF0900_REG_LDO_UV_SNS = 0x1E, + PF0900_REG_LDO_OV_INT = 0x1F, + PF0900_REG_LDO_OV_MSK = 0x20, + PF0900_REG_LDO_OV_SNS = 0x21, + PF0900_REG_PWRON_INT = 0x22, + PF0900_REG_IO_INT = 0x24, + PF0900_REG_IO_MSK = 0x25, + PF0900_REG_IO_SNS = 0x26, + PF0900_REG_IOSHORT_SNS = 0x27, + PF0900_REG_ABIST_OV1 = 0x28, + PF0900_REG_ABIST_OV2 = 0x29, + PF0900_REG_ABIST_UV1 = 0x2A, + PF0900_REG_ABIST_UV2 = 0x2B, + PF0900_REG_ABIST_IO = 0x2C, + PF0900_REG_TEST_FLAGS = 0x2D, + PF0900_REG_HFAULT_FLAGS = 0x2E, + PF0900_REG_FAULT_FLAGS = 0x2F, + PF0900_REG_FS0B_CFG = 0x30, + PF0900_REG_FCCU_CFG = 0x31, + PF0900_REG_RSTB_CFG1 = 0x32, + PF0900_REG_SYSTEM_CMD = 0x33, + PF0900_REG_FS0B_CMD = 0x34, + PF0900_REG_SECURE_WR1 = 0x35, + PF0900_REG_SECURE_WR2 = 0x36, + PF0900_REG_VMON_CFG1 = 0x37, + PF0900_REG_SYS_CFG1 = 0x38, + PF0900_REG_GPO_CFG = 0x39, + PF0900_REG_GPO_CTRL = 0x3A, + PF0900_REG_PWRUP_CFG = 0x3B, + PF0900_REG_RSTB_PWRUP = 0x3C, + PF0900_REG_GPIO1_PWRUP = 0x3D, + PF0900_REG_GPIO2_PWRUP = 0x3E, + PF0900_REG_GPIO3_PWRUP = 0x3F, + PF0900_REG_GPIO4_PWRUP = 0x40, + PF0900_REG_VMON1_PWRUP = 0x41, + PF0900_REG_VMON2_PWRUP = 0x42, + PF0900_REG_SW1_PWRUP = 0x43, + PF0900_REG_SW2_PWRUP = 0x44, + PF0900_REG_SW3_PWRUP = 0x45, + PF0900_REG_SW4_PWRUP = 0x46, + PF0900_REG_SW5_PWRUP = 0x47, + PF0900_REG_LDO1_PWRUP = 0x48, + PF0900_REG_LDO2_PWRUP = 0x49, + PF0900_REG_LDO3_PWRUP = 0x4A, + PF0900_REG_VAON_PWRUP = 0x4B, + PF0900_REG_FREQ_CTRL = 0x4C, + PF0900_REG_PWRON_CFG = 0x4D, + PF0900_REG_WD_CTRL1 = 0x4E, + PF0900_REG_WD_CTRL2 = 0x4F, + PF0900_REG_WD_CFG1 = 0x50, + PF0900_REG_WD_CFG2 = 0x51, + PF0900_REG_WD_CNT1 = 0x52, + PF0900_REG_WD_CNT2 = 0x53, + PF0900_REG_FAULT_CFG = 0x54, + PF0900_REG_FAULT_CNT = 0x55, + PF0900_REG_DFS_CNT = 0x56, + PF0900_REG_AMUX_CFG = 0x57, + PF0900_REG_VMON1_RUN_CFG = 0x58, + PF0900_REG_VMON1_STBY_CFG = 0x59, + PF0900_REG_VMON1_CTRL = 0x5A, + 
PF0900_REG_VMON2_RUN_CFG = 0x5B, + PF0900_REG_VMON2_STBY_CFG = 0x5C, + PF0900_REG_VMON2_CTRL = 0x5D, + PF0900_REG_SW1_VRUN = 0x5E, + PF0900_REG_SW1_VSTBY = 0x5F, + PF0900_REG_SW1_MODE = 0x60, + PF0900_REG_SW1_CFG1 = 0x61, + PF0900_REG_SW1_CFG2 = 0x62, + PF0900_REG_SW2_VRUN = 0x63, + PF0900_REG_SW2_VSTBY = 0x64, + PF0900_REG_SW2_MODE = 0x65, + PF0900_REG_SW2_CFG1 = 0x66, + PF0900_REG_SW2_CFG2 = 0x67, + PF0900_REG_SW3_VRUN = 0x68, + PF0900_REG_SW3_VSTBY = 0x69, + PF0900_REG_SW3_MODE = 0x6A, + PF0900_REG_SW3_CFG1 = 0x6B, + PF0900_REG_SW3_CFG2 = 0x6C, + PF0900_REG_SW4_VRUN = 0x6D, + PF0900_REG_SW4_VSTBY = 0x6E, + PF0900_REG_SW4_MODE = 0x6F, + PF0900_REG_SW4_CFG1 = 0x70, + PF0900_REG_SW4_CFG2 = 0x71, + PF0900_REG_SW5_VRUN = 0x72, + PF0900_REG_SW5_VSTBY = 0x73, + PF0900_REG_SW5_MODE = 0x74, + PF0900_REG_SW5_CFG1 = 0x75, + PF0900_REG_SW5_CFG2 = 0x76, + PF0900_REG_LDO1_RUN = 0x77, + PF0900_REG_LDO1_STBY = 0x78, + PF0900_REG_LDO1_CFG2 = 0x79, + PF0900_REG_LDO2_RUN = 0x7A, + PF0900_REG_LDO2_STBY = 0x7B, + PF0900_REG_LDO2_CFG2 = 0x7C, + PF0900_REG_LDO3_RUN = 0x7D, + PF0900_REG_LDO3_STBY = 0x7E, + PF0900_REG_LDO3_CFG2 = 0x7F, + PF0900_REG_VAON_CFG1 = 0x80, + PF0900_REG_VAON_CFG2 = 0x81, + PF0900_REG_SYS_DIAG = 0x82, + PF0900_MAX_REGISTER, +}; + +/* PF0900 SW MODE */ +#define SW_RUN_MODE_OFF 0x00 +#define SW_RUN_MODE_PWM 0x01 +#define SW_RUN_MODE_PFM 0x02 +#define SW_STBY_MODE_OFF 0x00 +#define SW_STBY_MODE_PWM 0x04 +#define SW_STBY_MODE_PFM 0x08 + +/* PF0900 SW MODE MASK */ +#define SW_RUN_MODE_MASK GENMASK(1, 0) +#define SW_STBY_MODE_MASK GENMASK(3, 2) + +/* PF0900 SW VRUN/VSTBY MASK */ +#define PF0900_SW_VOL_MASK GENMASK(7, 0) + +/* PF0900_REG_VAON_CFG1 bits */ +#define PF0900_VAON_1P8V 0x01 + +#define PF0900_VAON_MASK GENMASK(1, 0) + +/* PF0900_REG_SWX_CFG1 MASK */ +#define PF0900_SW_DVS_MASK GENMASK(4, 3) + +/* PF0900_REG_LDO_RUN MASK */ +#define VLDO_RUN_MASK GENMASK(4, 0) +#define LDO_RUN_EN_MASK BIT(5) + +/* PF0900_REG_STATUS1_INT bits */ +#define PF0900_IRQ_PWRUP BIT(3) + +/* PF0900_REG_ILIM_INT bits */ +#define PF0900_IRQ_SW1_IL BIT(0) +#define PF0900_IRQ_SW2_IL BIT(1) +#define PF0900_IRQ_SW3_IL BIT(2) +#define PF0900_IRQ_SW4_IL BIT(3) +#define PF0900_IRQ_SW5_IL BIT(4) + +#define PF0900_IRQ_LDO1_IL BIT(0) +#define PF0900_IRQ_LDO2_IL BIT(1) +#define PF0900_IRQ_LDO3_IL BIT(2) + +/* PF0900_REG_UV_INT bits */ +#define PF0900_IRQ_SW1_UV BIT(0) +#define PF0900_IRQ_SW2_UV BIT(1) +#define PF0900_IRQ_SW3_UV BIT(2) +#define PF0900_IRQ_SW4_UV BIT(3) +#define PF0900_IRQ_SW5_UV BIT(4) + +#define PF0900_IRQ_LDO1_UV BIT(0) +#define PF0900_IRQ_LDO2_UV BIT(1) +#define PF0900_IRQ_LDO3_UV BIT(2) +#define PF0900_IRQ_VAON_UV BIT(3) + +/* PF0900_REG_OV_INT bits */ +#define PF0900_IRQ_SW1_OV BIT(0) +#define PF0900_IRQ_SW2_OV BIT(1) +#define PF0900_IRQ_SW3_OV BIT(2) +#define PF0900_IRQ_SW4_OV BIT(3) +#define PF0900_IRQ_SW5_OV BIT(4) + +#define PF0900_IRQ_LDO1_OV BIT(0) +#define PF0900_IRQ_LDO2_OV BIT(1) +#define PF0900_IRQ_LDO3_OV BIT(2) +#define PF0900_IRQ_VAON_OV BIT(3) + +struct pf0900_regulator_desc { + struct regulator_desc desc; + unsigned int suspend_enable_mask; + unsigned int suspend_voltage_reg; + unsigned int suspend_voltage_cache; +}; + +struct pf0900_drvdata { + const struct pf0900_regulator_desc *desc; + unsigned int rcnt; +}; + +struct pf0900 { + struct device *dev; + struct regmap *regmap; + const struct pf0900_drvdata *drvdata; + struct regulator_dev *rdevs[PF0900_REGULATOR_CNT]; + int irq; + unsigned short addr; + bool crc_en; +}; + +enum pf0900_regulator_type { + PF0900_SW = 0, + PF0900_LDO, +}; + 
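+/* + * Editorial sketch, not from the original patch: the PF0900_REGU_IRQ() helper + * below is expected to build interrupt table entries that pair a status + * register with a regulator type and a REGULATOR_EVENT_* code, e.g. + * PF0900_REGU_IRQ(PF0900_REG_SW_UV_INT, PF0900_SW, REGULATOR_EVENT_UNDER_VOLTAGE). + */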
+#define PF0900_REGU_IRQ(_reg, _type, _event) \ + { \ + .reg = _reg, \ + .type = _type, \ + .event = _event, \ + } + +struct pf0900_regulator_irq { + unsigned int reg; + unsigned int type; + unsigned int event; +}; + +static const struct regmap_range pf0900_range = { + .range_min = PF0900_REG_DEV_ID, + .range_max = PF0900_REG_SYS_DIAG, +}; + +static const struct regmap_access_table pf0900_volatile_regs = { + .yes_ranges = &pf0900_range, + .n_yes_ranges = 1, +}; + +static const struct regmap_config pf0900_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .volatile_table = &pf0900_volatile_regs, + .max_register = PF0900_MAX_REGISTER - 1, + .cache_type = REGCACHE_MAPLE, +}; + +static uint8_t crc8_j1850(unsigned short addr, unsigned int reg, + unsigned int val) +{ + uint8_t crcBuf[3]; + uint8_t t_crc; + uint8_t i, j; + + crcBuf[0] = addr; + crcBuf[1] = reg; + crcBuf[2] = val; + t_crc = 0xFF; + + /* + * The CRC calculation is based on the standard CRC-8-SAE as + * defined in the SAE-J1850 specification with the following + * characteristics: + * Polynomial = 0x1D + * Initial Value = 0xFF + * The CRC byte is calculated by shifting the 24-bit data through + * the CRC polynomial. The 24-bit package is built as follows: + * DEVICE_ADDR[b8] + REGISTER_ADDR[b8] + DATA[b8] + * The DEVICE_ADDR is the 7-bit slave address shifted left by one + * bit plus the corresponding read/write bit: + * (7-bit address[b7] << 1) + R/W = DEVICE_ADDR[b8] + */ + for (i = 0; i < sizeof(crcBuf); i++) { + t_crc ^= crcBuf[i]; + for (j = 0; j < 8; j++) { + if ((t_crc & 0x80) != 0) { + t_crc <<= 1; + t_crc ^= 0x1D; + } else { + t_crc <<= 1; + } + } + } + + return t_crc; +} + +static int pf0900_regmap_read(void *context, unsigned int reg, + unsigned int *val) +{ + struct device *dev = context; + struct i2c_client *i2c = to_i2c_client(dev); + struct pf0900 *pf0900 = dev_get_drvdata(dev); + int ret; + u8 crc; + + if (!pf0900 || !pf0900->dev) + return -EINVAL; + + if (reg >= PF0900_MAX_REGISTER) { + dev_err(pf0900->dev, "Invalid register address: 0x%x\n", reg); + return -EINVAL; + } + + if (pf0900->crc_en) { + ret = i2c_smbus_read_word_data(i2c, reg); + if (ret < 0) { + dev_err(pf0900->dev, "Read error at reg=0x%x: %d\n", reg, ret); + return ret; + } + + *val = (u16)ret; + crc = crc8_j1850(pf0900->addr << 1 | 0x1, reg, FIELD_GET(GENMASK(7, 0), *val)); + if (crc != FIELD_GET(GENMASK(15, 8), *val)) { + dev_err(pf0900->dev, "CRC check error\n"); + return -EINVAL; + } + *val = FIELD_GET(GENMASK(7, 0), *val); + } else { + ret = i2c_smbus_read_byte_data(i2c, reg); + if (ret < 0) { + dev_err(pf0900->dev, "Read error at reg=0x%x: %d\n", reg, ret); + return ret; + } + *val = ret; + } + + return 0; +} + +static int pf0900_regmap_write(void *context, unsigned int reg, + unsigned int val) +{ + struct device *dev = context; + struct i2c_client *i2c = to_i2c_client(dev); + struct pf0900 *pf0900 = dev_get_drvdata(dev); + uint8_t data[2]; + int ret; + + if (!pf0900 || !pf0900->dev) + return -EINVAL; + + if (reg >= PF0900_MAX_REGISTER) { + dev_err(pf0900->dev, "Invalid register address: 0x%x\n", reg); + return -EINVAL; + } + + data[0] = val; + if (pf0900->crc_en) { + /* Get CRC */ + data[1] = crc8_j1850(pf0900->addr << 1, reg, data[0]); + val = FIELD_PREP(GENMASK(15, 8), data[1]) | data[0]; + ret = i2c_smbus_write_word_data(i2c, reg, val); + } else { + ret = i2c_smbus_write_byte_data(i2c, reg, data[0]); + } + + if (ret) { + dev_err(pf0900->dev, "Write error at reg=0x%x\n", reg); + return ret; + } + + return 0; +} + +static int 
pf0900_suspend_enable(struct regulator_dev *rdev) +{ + struct pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + + return regmap_update_bits(rmap, rdata->desc.enable_reg, + rdata->suspend_enable_mask, SW_STBY_MODE_PFM); +} + +static int pf0900_suspend_disable(struct regulator_dev *rdev) +{ + struct pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + + return regmap_update_bits(rmap, rdata->desc.enable_reg, + rdata->suspend_enable_mask, SW_STBY_MODE_OFF); +} + +static int pf0900_set_suspend_voltage(struct regulator_dev *rdev, int uV) +{ + struct pf0900_regulator_desc *rdata = rdev_get_drvdata(rdev); + struct regmap *rmap = rdev_get_regmap(rdev); + int ret; + + if (rdata->suspend_voltage_cache == uV) + return 0; + + ret = regulator_map_voltage_iterate(rdev, uV, uV); + if (ret < 0) { + dev_err(rdev_get_dev(rdev), "failed to map %i uV\n", uV); + return ret; + } + + dev_dbg(rdev_get_dev(rdev), "uV: %i, reg: 0x%x, msk: 0x%x, val: 0x%x\n", + uV, rdata->suspend_voltage_reg, rdata->desc.vsel_mask, ret); + ret = regmap_update_bits(rmap, rdata->suspend_voltage_reg, + rdata->desc.vsel_mask, ret); + if (ret < 0) { + dev_err(rdev_get_dev(rdev), "failed to set %i uV\n", uV); + return ret; + } + + rdata->suspend_voltage_cache = uV; + + return 0; +} + +static const struct regmap_bus pf0900_regmap_bus = { + .reg_read = pf0900_regmap_read, + .reg_write = pf0900_regmap_write, +}; + +static const struct regulator_ops pf0900_avon_regulator_ops = { + .list_voltage = regulator_list_voltage_table, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, +}; + +static const struct regulator_ops pf0900_dvs_sw_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_ramp_delay = regulator_set_ramp_delay_regmap, + .set_suspend_enable = pf0900_suspend_enable, + .set_suspend_disable = pf0900_suspend_disable, + .set_suspend_voltage = pf0900_set_suspend_voltage, +}; + +static const struct regulator_ops pf0900_ldo_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, +}; + +/* + * SW1/2/3/4/5 + * SW1_DVS[1:0] SW1 DVS ramp rate setting + * 00: 15.6mV/8usec + * 01: 15.6mV/4usec + * 10: 15.6mV/2usec + * 11: 15.6mV/1usec + */ +static const unsigned int pf0900_dvs_sw_ramp_table[] = { + 1950, 3900, 7800, 15600 +}; + +/* VAON 1.8V, 3.0V, or 3.3V */ +static const int pf0900_vaon_voltages[] = { + 0, 1800000, 3000000, 3300000, +}; + +/* + * SW1 0.5V to 3.3V + * 0.5V to 1.35V (6.25mV step) + * 1.8V to 2.5V (12.5mV step) + * 2.8V to 3.3V (25mV step) + */ +static const struct linear_range pf0900_dvs_sw1_volts[] = { + REGULATOR_LINEAR_RANGE(0, 0x00, 0x08, 0), + REGULATOR_LINEAR_RANGE(500000, 0x09, 0x91, 6250), + REGULATOR_LINEAR_RANGE(0, 0x92, 0x9E, 0), + REGULATOR_LINEAR_RANGE(1500000, 0x9F, 0x9F, 0), + REGULATOR_LINEAR_RANGE(1800000, 0xA0, 0xD8, 12500), + REGULATOR_LINEAR_RANGE(0, 0xD9, 0xDF, 0), +
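+ /* + * Worked decode, for illustration only: the zero-step entries mark + * reserved selector gaps, not real voltages; a selector in a live + * range maps linearly, e.g. 0xB0 -> 1800000 + (0xB0 - 0xA0) * 12500 = + * 2000000 uV, confirming the 12.5mV step noted above. + */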
REGULATOR_LINEAR_RANGE(2800000, 0xE0, 0xF4, 25000), + REGULATOR_LINEAR_RANGE(0, 0xF5, 0xFF, 0), +}; + +/* + * SW2/3/4/5 0.3V to 3.3V + * 0.45V to 1.35V (6.25mV step) + * 1.8V to 2.5V (12.5mV step) + * 2.8V to 3.3V (25mV step) + */ +static const struct linear_range pf0900_dvs_sw2345_volts[] = { + REGULATOR_LINEAR_RANGE(300000, 0x00, 0x00, 0), + REGULATOR_LINEAR_RANGE(450000, 0x01, 0x91, 6250), + REGULATOR_LINEAR_RANGE(0, 0x92, 0x9E, 0), + REGULATOR_LINEAR_RANGE(1500000, 0x9F, 0x9F, 0), + REGULATOR_LINEAR_RANGE(1800000, 0xA0, 0xD8, 12500), + REGULATOR_LINEAR_RANGE(0, 0xD9, 0xDF, 0), + REGULATOR_LINEAR_RANGE(2800000, 0xE0, 0xF4, 25000), + REGULATOR_LINEAR_RANGE(0, 0xF5, 0xFF, 0), +}; + +/* + * LDO1 0.75V to 3.3V + * 0.75V to 1.5V (50mV step) + * 1.8V to 3.3V (100mV step) + */ +static const struct linear_range pf0900_ldo1_volts[] = { + REGULATOR_LINEAR_RANGE(750000, 0x00, 0x0F, 50000), + REGULATOR_LINEAR_RANGE(1800000, 0x10, 0x1F, 100000), +}; + +/* + * LDO2/3 0.65V to 3.3V + * 0.65V to 1.3V (50mV step) + * 1.4V to 1.5V, 1.8V to 3.3V (100mV step) + */ +static const struct linear_range pf0900_ldo23_volts[] = { + REGULATOR_LINEAR_RANGE(650000, 0x00, 0x0D, 50000), + REGULATOR_LINEAR_RANGE(1400000, 0x0E, 0x0F, 100000), + REGULATOR_LINEAR_RANGE(1800000, 0x10, 0x1F, 100000), +}; + +static const struct pf0900_regulator_desc pf0900_regulators[] = { + { + .desc = { + .name = "sw1", + .of_match = of_match_ptr("sw1"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW1, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw1_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw1_volts), + .vsel_reg = PF0900_REG_SW1_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW1_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW1_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW1_VSTBY, + }, + { + .desc = { + .name = "sw2", + .of_match = of_match_ptr("sw2"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW2, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW2_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW2_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW2_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW2_VSTBY, + }, + { + .desc = { + .name = "sw3", + .of_match = of_match_ptr("sw3"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW3, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW3_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW3_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW3_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values =
ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW3_VSTBY, + }, + { + .desc = { + .name = "sw4", + .of_match = of_match_ptr("sw4"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW4, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW4_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW4_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW4_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW4_VSTBY, + }, + { + .desc = { + .name = "sw5", + .of_match = of_match_ptr("sw5"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_SW5, + .ops = &pf0900_dvs_sw_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_SW_VOLTAGE_NUM, + .linear_ranges = pf0900_dvs_sw2345_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_dvs_sw2345_volts), + .vsel_reg = PF0900_REG_SW5_VRUN, + .vsel_mask = PF0900_SW_VOL_MASK, + .enable_reg = PF0900_REG_SW5_MODE, + .enable_mask = SW_RUN_MODE_MASK, + .enable_val = SW_RUN_MODE_PWM, + .ramp_reg = PF0900_REG_SW5_CFG1, + .ramp_mask = PF0900_SW_DVS_MASK, + .ramp_delay_table = pf0900_dvs_sw_ramp_table, + .n_ramp_values = ARRAY_SIZE(pf0900_dvs_sw_ramp_table), + .owner = THIS_MODULE, + }, + .suspend_enable_mask = SW_STBY_MODE_MASK, + .suspend_voltage_reg = PF0900_REG_SW5_VSTBY, + }, + { + .desc = { + .name = "ldo1", + .of_match = of_match_ptr("ldo1"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO1, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo1_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo1_volts), + .vsel_reg = PF0900_REG_LDO1_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO1_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "ldo2", + .of_match = of_match_ptr("ldo2"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO2, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo23_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo23_volts), + .vsel_reg = PF0900_REG_LDO2_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO2_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "ldo3", + .of_match = of_match_ptr("ldo3"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_LDO3, + .ops = &pf0900_ldo_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = PF0900_LDO_VOLTAGE_NUM, + .linear_ranges = pf0900_ldo23_volts, + .n_linear_ranges = ARRAY_SIZE(pf0900_ldo23_volts), + .vsel_reg = PF0900_REG_LDO3_RUN, + .vsel_mask = VLDO_RUN_MASK, + .enable_reg = PF0900_REG_LDO3_RUN, + .enable_mask = LDO_RUN_EN_MASK, + .owner = THIS_MODULE, + }, + }, + { + .desc = { + .name = "vaon", + .of_match = of_match_ptr("vaon"), + .regulators_node = of_match_ptr("regulators"), + .id = PF0900_VAON, + .ops = &pf0900_avon_regulator_ops, + .type = REGULATOR_VOLTAGE, + .n_voltages = 
PF0900_VAON_VOLTAGE_NUM, + .volt_table = pf0900_vaon_voltages, + .enable_reg = PF0900_REG_VAON_CFG1, + .enable_mask = PF0900_VAON_MASK, + .enable_val = PF0900_VAON_1P8V, + .vsel_reg = PF0900_REG_VAON_CFG1, + .vsel_mask = PF0900_VAON_MASK, + .owner = THIS_MODULE, + }, + }, +}; + +static const struct pf0900_regulator_irq regu_irqs[] = { + PF0900_REGU_IRQ(PF0900_REG_SW_ILIM_INT, PF0900_SW, REGULATOR_ERROR_OVER_CURRENT_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_ILIM_INT, PF0900_LDO, REGULATOR_ERROR_OVER_CURRENT_WARN), + PF0900_REGU_IRQ(PF0900_REG_SW_UV_INT, PF0900_SW, REGULATOR_ERROR_UNDER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_UV_INT, PF0900_LDO, REGULATOR_ERROR_UNDER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_SW_OV_INT, PF0900_SW, REGULATOR_ERROR_OVER_VOLTAGE_WARN), + PF0900_REGU_IRQ(PF0900_REG_LDO_OV_INT, PF0900_LDO, REGULATOR_ERROR_OVER_VOLTAGE_WARN), +}; + +static irqreturn_t pf0900_irq_handler(int irq, void *data) +{ + unsigned int val, regu, i, index; + struct pf0900 *pf0900 = data; + int ret; + + for (i = 0; i < ARRAY_SIZE(regu_irqs); i++) { + ret = regmap_read(pf0900->regmap, regu_irqs[i].reg, &val); + if (ret < 0) { + dev_err(pf0900->dev, "Failed to read interrupt status: %d\n", ret); + return IRQ_NONE; + } + if (val) { + ret = regmap_write_bits(pf0900->regmap, regu_irqs[i].reg, val, val); + if (ret < 0) { + dev_err(pf0900->dev, "Failed to clear interrupt status: %d\n", ret); + return IRQ_NONE; + } + + if (regu_irqs[i].type == PF0900_SW) { + for (index = 0; index < REGU_SW_CNT; index++) { + if (val & BIT(index)) { + regu = (enum pf0900_regulators)index; + regulator_notifier_call_chain(pf0900->rdevs[regu], + regu_irqs[i].event, + NULL); + } + } + } else if (regu_irqs[i].type == PF0900_LDO) { + for (index = 0; index < REGU_LDO_VAON_CNT; index++) { + if (val & BIT(index)) { + regu = (enum pf0900_regulators)index + PF0900_LDO1; + regulator_notifier_call_chain(pf0900->rdevs[regu], + regu_irqs[i].event, + NULL); + } + } + } + } + } + + return IRQ_HANDLED; +} + +static int pf0900_i2c_probe(struct i2c_client *i2c) +{ + const struct pf0900_regulator_desc *regulator_desc; + const struct pf0900_drvdata *drvdata = NULL; + struct device_node *np = i2c->dev.of_node; + unsigned int device_id, device_fam, i; + struct regulator_config config = { }; + struct pf0900 *pf0900; + int ret; + + if (!i2c->irq) + return dev_err_probe(&i2c->dev, -EINVAL, "No IRQ configured?\n"); + + pf0900 = devm_kzalloc(&i2c->dev, sizeof(struct pf0900), GFP_KERNEL); + if (!pf0900) + return -ENOMEM; + + drvdata = device_get_match_data(&i2c->dev); + if (!drvdata) + return dev_err_probe(&i2c->dev, -EINVAL, "unable to find driver data\n"); + + regulator_desc = drvdata->desc; + pf0900->drvdata = drvdata; + pf0900->crc_en = of_property_read_bool(np, "nxp,i2c-crc-enable"); + pf0900->irq = i2c->irq; + pf0900->dev = &i2c->dev; + pf0900->addr = i2c->addr; + + dev_set_drvdata(&i2c->dev, pf0900); + + pf0900->regmap = devm_regmap_init(&i2c->dev, &pf0900_regmap_bus, &i2c->dev, + &pf0900_regmap_config); + if (IS_ERR(pf0900->regmap)) + return dev_err_probe(&i2c->dev, PTR_ERR(pf0900->regmap), + "regmap initialization failed\n"); + ret = regmap_read(pf0900->regmap, PF0900_REG_DEV_ID, &device_id); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Read device id error\n"); + + ret = regmap_read(pf0900->regmap, PF0900_REG_DEV_FAM, &device_fam); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Read device family error\n"); + + /* Check that the board/devicetree matches the right PMIC */ + if (device_fam == 0x09 && (device_id & 0x1F) != 0x0) + return dev_err_probe(&i2c->dev, -EINVAL, "Device
id(%x) mismatched\n", + device_id & 0x1F); + + for (i = 0; i < drvdata->rcnt; i++) { + const struct regulator_desc *desc; + const struct pf0900_regulator_desc *r; + + r = &regulator_desc[i]; + desc = &r->desc; + config.regmap = pf0900->regmap; + config.driver_data = (void *)r; + config.dev = pf0900->dev; + + pf0900->rdevs[i] = devm_regulator_register(pf0900->dev, desc, &config); + if (IS_ERR(pf0900->rdevs[i])) + return dev_err_probe(pf0900->dev, PTR_ERR(pf0900->rdevs[i]), + "Failed to register regulator(%s)\n", desc->name); + } + + ret = devm_request_threaded_irq(pf0900->dev, pf0900->irq, NULL, + pf0900_irq_handler, + (IRQF_TRIGGER_FALLING | IRQF_ONESHOT), + "pf0900-irq", pf0900); + + if (ret != 0) + return dev_err_probe(pf0900->dev, ret, "Failed to request IRQ: %d\n", + pf0900->irq); + /* + * The PWRUP_M interrupt is unmasked by default. When the device enters + * the RUN state, it asserts the PWRUP_I interrupt and the INTB pin to + * inform the MCU that it has finished the power-up sequence properly. + */ + ret = regmap_write_bits(pf0900->regmap, PF0900_REG_STATUS1_INT, PF0900_IRQ_PWRUP, + PF0900_IRQ_PWRUP); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Failed to clear PWRUP_I\n"); + + /* Mask the PWRUP interrupt */ + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_STATUS1_MSK, PF0900_IRQ_PWRUP, + PF0900_IRQ_PWRUP); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Mask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_ILIM_MSK, PF0900_IRQ_SW1_IL | + PF0900_IRQ_SW2_IL | PF0900_IRQ_SW3_IL | PF0900_IRQ_SW4_IL | + PF0900_IRQ_SW5_IL, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_UV_MSK, PF0900_IRQ_SW1_UV | + PF0900_IRQ_SW2_UV | PF0900_IRQ_SW3_UV | PF0900_IRQ_SW4_UV | + PF0900_IRQ_SW5_UV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_SW_OV_MSK, PF0900_IRQ_SW1_OV | + PF0900_IRQ_SW2_OV | PF0900_IRQ_SW3_OV | PF0900_IRQ_SW4_OV | + PF0900_IRQ_SW5_OV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_ILIM_MSK, PF0900_IRQ_LDO1_IL | + PF0900_IRQ_LDO2_IL | PF0900_IRQ_LDO3_IL, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_UV_MSK, PF0900_IRQ_LDO1_UV | + PF0900_IRQ_LDO2_UV | PF0900_IRQ_LDO3_UV | PF0900_IRQ_VAON_UV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + ret = regmap_update_bits(pf0900->regmap, PF0900_REG_LDO_OV_MSK, PF0900_IRQ_LDO1_OV | + PF0900_IRQ_LDO2_OV | PF0900_IRQ_LDO3_OV | PF0900_IRQ_VAON_OV, 0); + if (ret) + return dev_err_probe(&i2c->dev, ret, "Unmask irq error\n"); + + return 0; +} + +static const struct pf0900_drvdata pf0900_drvdata = { + .desc = pf0900_regulators, + .rcnt = ARRAY_SIZE(pf0900_regulators), +}; + +static const struct of_device_id pf0900_of_match[] = { + { .compatible = "nxp,pf0900", .data = &pf0900_drvdata}, + { } +}; + +MODULE_DEVICE_TABLE(of, pf0900_of_match); + +static struct i2c_driver pf0900_i2c_driver = { + .driver = { + .name = "nxp-pf0900", + .of_match_table = pf0900_of_match, + }, + .probe = pf0900_i2c_probe, +}; + +module_i2c_driver(pf0900_i2c_driver); + +MODULE_AUTHOR("Joy Zou "); +MODULE_DESCRIPTION("NXP PF0900 Power Management IC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/pf530x-regulator.c b/drivers/regulator/pf530x-regulator.c new file
mode 100644 index 00000000000000..f789c4b6a499e1 --- /dev/null +++ b/drivers/regulator/pf530x-regulator.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0+ + +// Documentation for this device is available at +// https://www.nxp.com/docs/en/data-sheet/PF5300.pdf + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* registers */ +#define PF530X_DEVICEID 0x00 +#define PF530X_REV 0x01 +#define PF530X_EMREV 0x02 +#define PF530X_PROGID 0x03 +#define PF530X_CONFIG1 0x04 +#define PF530X_INT_STATUS1 0x05 +#define PF530X_INT_SENSE1 0x06 +#define PF530X_INT_STATUS2 0x07 +#define PF530X_INT_SENSE2 0x08 +#define PF530X_BIST_STAT1 0x09 +#define PF530X_BIST_CTRL 0x0a +#define PF530X_STATE 0x0b +#define PF530X_STATE_CTRL 0x0c +#define PF530X_SW1_VOLT 0x0d +#define PF530X_SW1_STBY_VOLT 0x0e +#define PF530X_SW1_CTRL1 0x0f +#define PF530X_SW1_CTRL2 0x10 +#define PF530X_CLK_CTRL 0x11 +#define PF530X_SEQ_CTRL1 0x12 +#define PF530X_SEQ_CTRL2 0x13 +#define PF530X_RANDOM_CHK 0x14 +#define PF530X_RANDOM_GEN 0x15 +#define PF530X_WD_CTRL1 0x16 +#define PF530X_WD_SEED 0x17 +#define PF530X_WD_ANSWER 0x18 +#define PF530X_FLT_CNT1 0x19 +#define PF530X_FLT_CNT2 0x1a +#define PF530X_OTP_MODE 0x2f + +enum pf530x_states { + PF530X_STATE_POF, + PF530X_STATE_FUSE_LOAD, + PF530X_STATE_LP_OFF, + PF530X_STATE_SELF_TEST, + PF530X_STATE_POWER_UP, + PF530X_STATE_INIT, + PF530X_STATE_IO_RELEASE, + PF530X_STATE_RUN, + PF530X_STATE_STANDBY, + PF530X_STATE_FAULT, + PF530X_STATE_FAILSAFE, + PF530X_STATE_POWER_DOWN, + PF530X_STATE_2MS_SELFTEST_RETRY, + PF530X_STATE_OFF_DLY, +}; + +#define PF530_FAM 0x50 +enum pf530x_devid { + PF5300 = 0x3, + PF5301 = 0x4, + PF5302 = 0x5, +}; + +#define PF530x_FAM 0x50 +#define PF530x_DEVICE_FAM_MASK GENMASK(7, 4) +#define PF530x_DEVICE_ID_MASK GENMASK(3, 0) + +#define PF530x_STATE_MASK GENMASK(3, 0) +#define PF530x_STATE_RUN 0x07 +#define PF530x_STATE_STANDBY 0x08 +#define PF530x_STATE_LP_OFF 0x02 + +#define PF530X_OTP_STBY_MODE GENMASK(3, 2) +#define PF530X_OTP_RUN_MODE GENMASK(1, 0) + +#define PF530X_INT_STATUS_OV BIT(1) +#define PF530X_INT_STATUS_UV BIT(2) +#define PF530X_INT_STATUS_ILIM BIT(3) + +#define SW1_ILIM_S BIT(0) +#define VMON_UV_S BIT(1) +#define VMON_OV_S BIT(2) +#define VIN_OVLO_S BIT(3) +#define BG_ERR_S BIT(6) + +#define THERM_155_S BIT(3) +#define THERM_140_S BIT(2) +#define THERM_125_S BIT(1) +#define THERM_110_S BIT(0) + +struct pf530x_chip { + struct regmap *regmap; + struct device *dev; +}; + +static const struct regmap_config pf530x_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = PF530X_OTP_MODE, + .cache_type = REGCACHE_MAPLE, +}; + +static int pf530x_get_status(struct regulator_dev *rdev) +{ + unsigned int state; + int ret; + + ret = regmap_read(rdev->regmap, PF530X_INT_SENSE1, &state); + if (ret != 0) + return ret; + + if ((state & (BG_ERR_S | SW1_ILIM_S | VMON_UV_S | VMON_OV_S | VIN_OVLO_S)) + != 0) + return REGULATOR_STATUS_ERROR; + + // no errors, check what non-error state we're in + ret = regmap_read(rdev->regmap, PF530X_STATE, &state); + if (ret != 0) + return ret; + + state &= PF530x_STATE_MASK; + + switch (state) { + case PF530x_STATE_RUN: + ret = REGULATOR_STATUS_NORMAL; + break; + case PF530x_STATE_STANDBY: + ret = REGULATOR_STATUS_STANDBY; + break; + case PF530x_STATE_LP_OFF: + ret = REGULATOR_STATUS_OFF; + break; + default: + ret = REGULATOR_STATUS_ERROR; + break; + } + return ret; +} + +static int pf530x_get_error_flags(struct regulator_dev *rdev, unsigned int *flags) +{ + unsigned
int status; + int ret; + + ret = regmap_read(rdev->regmap, PF530X_INT_STATUS1, &status); + + if (ret != 0) + return ret; + + *flags = 0; + + if (status & PF530X_INT_STATUS_OV) + *flags |= REGULATOR_ERROR_OVER_VOLTAGE_WARN; + + if (status & PF530X_INT_STATUS_UV) + *flags |= REGULATOR_ERROR_UNDER_VOLTAGE; + + if (status & PF530X_INT_STATUS_ILIM) + *flags |= REGULATOR_ERROR_OVER_CURRENT; + + ret = regmap_read(rdev->regmap, PF530X_INT_SENSE2, &status); + + if (ret != 0) + return ret; + + if ((status & (THERM_155_S | + THERM_140_S | + THERM_125_S | + THERM_110_S)) != 0) + *flags |= REGULATOR_ERROR_OVER_TEMP_WARN; + + return 0; +} + +static const struct regulator_ops pf530x_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .map_voltage = regulator_map_voltage_linear_range, + .list_voltage = regulator_list_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .get_status = pf530x_get_status, + .get_error_flags = pf530x_get_error_flags, + .set_bypass = regulator_set_bypass_regmap, + .get_bypass = regulator_get_bypass_regmap, +}; + +static const struct linear_range vrange = REGULATOR_LINEAR_RANGE(500000, 0, 140, 5000); + +static const struct regulator_desc pf530x_reg_desc = { + .name = "SW1", + .ops = &pf530x_regulator_ops, + .linear_ranges = &vrange, + .n_linear_ranges = 1, + .type = REGULATOR_VOLTAGE, + .id = 0, + .owner = THIS_MODULE, + .vsel_reg = PF530X_SW1_VOLT, + .vsel_mask = 0xFF, + .bypass_reg = PF530X_SW1_CTRL2, + .bypass_mask = 0x07, + .bypass_val_on = 0x07, + .bypass_val_off = 0x00, + .enable_reg = PF530X_SW1_CTRL1, + .enable_mask = GENMASK(5, 2), + .enable_val = GENMASK(5, 2), + .disable_val = 0, +}; + +static int pf530x_identify(struct pf530x_chip *chip) +{ + unsigned int value; + u8 dev_fam, dev_id, full_layer_rev, metal_layer_rev, prog_idh, prog_idl, emrev; + const char *name = NULL; + int ret; + + ret = regmap_read(chip->regmap, PF530X_DEVICEID, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip family\n"); + return ret; + } + + dev_fam = value & PF530x_DEVICE_FAM_MASK; + switch (dev_fam) { + case PF530x_FAM: + break; + default: + dev_err(chip->dev, + "Chip 0x%x is not from PF530X family\n", dev_fam); + return -ENODEV; + } + + dev_id = value & PF530x_DEVICE_ID_MASK; + switch (dev_id) { + case PF5300: + name = "PF5300"; + break; + case PF5301: + name = "PF5301"; + break; + case PF5302: + name = "PF5302"; + break; + default: + dev_err(chip->dev, "Unknown pf530x device id 0x%x\n", dev_id); + return -ENODEV; + } + + ret = regmap_read(chip->regmap, PF530X_REV, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip rev\n"); + return ret; + } + + full_layer_rev = ((value & 0xF0) == 0) ?
'0' : ((((value & 0xF0) >> 4) - 1) + 'A'); + metal_layer_rev = value & 0xF; + + ret = regmap_read(chip->regmap, PF530X_EMREV, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip emrev register\n"); + return ret; + } + + prog_idh = (value >> 4) + 'A'; + // prog_idh skips 'O', per page 96 of the datasheet + if (prog_idh >= 'O') + prog_idh += 1; + + emrev = value & 0x7; + + ret = regmap_read(chip->regmap, PF530X_PROGID, &value); + if (ret) { + dev_err(chip->dev, "failed to read chip progid register\n"); + return ret; + } + + if (value >= 0x22) { + dev_err(chip->dev, "invalid value for progid register\n"); + return -ENODEV; + } else if (value < 10) { + prog_idl = value + '0'; + } else { + prog_idl = (value - 10) + 'A'; + // prog_idl skips 'O', per page 97 of the datasheet + if (prog_idl >= 'O') + prog_idl += 1; + } + + dev_info(chip->dev, "%s Regulator found (Rev %c%d ProgID %c%c EMREV %x).\n", + name, full_layer_rev, metal_layer_rev, prog_idh, prog_idl, emrev); + + return 0; +} + +static int pf530x_i2c_probe(struct i2c_client *client) +{ + struct regulator_config config = { NULL, }; + struct pf530x_chip *chip; + int ret; + struct regulator_dev *rdev; + struct regulator_init_data *init_data; + + chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + i2c_set_clientdata(client, chip); + chip->dev = &client->dev; + + chip->regmap = devm_regmap_init_i2c(client, &pf530x_regmap_config); + if (IS_ERR(chip->regmap)) { + ret = PTR_ERR(chip->regmap); + dev_err(&client->dev, + "regmap allocation failed with err %d\n", ret); + return ret; + } + + ret = pf530x_identify(chip); + if (ret) + return ret; + + init_data = of_get_regulator_init_data(chip->dev, chip->dev->of_node, &pf530x_reg_desc); + if (!init_data) + return -ENODATA; + + config.dev = chip->dev; + config.of_node = chip->dev->of_node; + config.regmap = chip->regmap; + config.init_data = init_data; + + // the config parameter gets copied, it's ok to pass a pointer on the stack here + rdev = devm_regulator_register(&client->dev, &pf530x_reg_desc, &config); + if (IS_ERR(rdev)) { + dev_err(&client->dev, "failed to register %s regulator\n", pf530x_reg_desc.name); + return PTR_ERR(rdev); + } + + return 0; +} + +static const struct of_device_id pf530x_dt_ids[] = { + { .compatible = "nxp,pf5300",}, + { } +}; +MODULE_DEVICE_TABLE(of, pf530x_dt_ids); + +static const struct i2c_device_id pf530x_i2c_id[] = { + { "pf5300", 0 }, + { "pf5301", 0 }, + { "pf5302", 0 }, + {}, +}; +MODULE_DEVICE_TABLE(i2c, pf530x_i2c_id); + +static struct i2c_driver pf530x_regulator_driver = { + .id_table = pf530x_i2c_id, + .driver = { + .name = "pf530x", + .of_match_table = pf530x_dt_ids, + }, + .probe = pf530x_i2c_probe, +}; +module_i2c_driver(pf530x_regulator_driver); + +MODULE_AUTHOR("Woodrow Douglass "); +MODULE_DESCRIPTION("Regulator Driver for NXP's PF5300/PF5301/PF5302 PMIC"); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/qcom-refgen-regulator.c b/drivers/regulator/qcom-refgen-regulator.c index cfa72ce85bc898..299ac3c8c3bc3d 100644 --- a/drivers/regulator/qcom-refgen-regulator.c +++ b/drivers/regulator/qcom-refgen-regulator.c @@ -94,7 +94,6 @@ static const struct regmap_config qcom_refgen_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, }; static int qcom_refgen_probe(struct platform_device *pdev) diff --git a/drivers/regulator/rt5133-regulator.c b/drivers/regulator/rt5133-regulator.c new file mode 100644 index 00000000000000..129b1f13c88028 --- /dev/null +++
b/drivers/regulator/rt5133-regulator.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Richtek Technology Corp. +// Author: ChiYuan Huang +// Author: ShihChia Chang + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT5133_REG_CHIP_INFO 0x00 +#define RT5133_REG_RST_CTRL 0x06 +#define RT5133_REG_BASE_CTRL 0x09 +#define RT5133_REG_GPIO_CTRL 0x0B +#define RT5133_REG_BASE_EVT 0x10 +#define RT5133_REG_LDO_PGB_STAT 0x15 +#define RT5133_REG_BASE_MASK 0x16 +#define RT5133_REG_LDO_SHDN 0x19 +#define RT5133_REG_LDO_ON 0x1A +#define RT5133_REG_LDO_OFF 0x1B +#define RT5133_REG_LDO1_CTRL1 0x20 +#define RT5133_REG_LDO1_CTRL2 0x21 +#define RT5133_REG_LDO1_CTRL3 0x22 +#define RT5133_REG_LDO2_CTRL1 0x24 +#define RT5133_REG_LDO2_CTRL2 0x25 +#define RT5133_REG_LDO2_CTRL3 0x26 +#define RT5133_REG_LDO3_CTRL1 0x28 +#define RT5133_REG_LDO3_CTRL2 0x29 +#define RT5133_REG_LDO3_CTRL3 0x2A +#define RT5133_REG_LDO4_CTRL1 0x2C +#define RT5133_REG_LDO4_CTRL2 0x2D +#define RT5133_REG_LDO4_CTRL3 0x2E +#define RT5133_REG_LDO5_CTRL1 0x30 +#define RT5133_REG_LDO5_CTRL2 0x31 +#define RT5133_REG_LDO5_CTRL3 0x32 +#define RT5133_REG_LDO6_CTRL1 0x34 +#define RT5133_REG_LDO6_CTRL2 0x35 +#define RT5133_REG_LDO6_CTRL3 0x36 +#define RT5133_REG_LDO7_CTRL1 0x38 +#define RT5133_REG_LDO7_CTRL2 0x39 +#define RT5133_REG_LDO7_CTRL3 0x3A +#define RT5133_REG_LDO8_CTRL1 0x3C +#define RT5133_REG_LDO8_CTRL2 0x3D +#define RT5133_REG_LDO8_CTRL3 0x3E +#define RT5133_REG_LDO8_CTRL4 0x3F + +#define RT5133_LDO_REG_BASE(_id) (0x20 + ((_id) - 1) * 4) + +#define RT5133_VENDOR_ID_MASK GENMASK(7, 4) +#define RT5133_RESET_CODE 0xB1 + +#define RT5133_FOFF_BASE_MASK BIT(1) +#define RT5133_OCSHDN_ALL_MASK BIT(7) +#define RT5133_OCSHDN_ALL_SHIFT (7) +#define RT5133_PGBSHDN_ALL_MASK BIT(6) +#define RT5133_PGBSHDN_ALL_SHIFT (6) + +#define RT5133_OCPTSEL_MASK BIT(5) +#define RT5133_PGBPTSEL_MASK BIT(4) +#define RT5133_STBTDSEL_MASK GENMASK(1, 0) + +#define RT5133_LDO_ENABLE_MASK BIT(7) +#define RT5133_LDO_VSEL_MASK GENMASK(7, 5) +#define RT5133_LDO_AD_MASK BIT(2) +#define RT5133_LDO_SOFT_START_MASK GENMASK(1, 0) + +#define RT5133_GPIO_NR 3 + +#define RT5133_LDO_PGB_EVT_MASK GENMASK(23, 16) +#define RT5133_LDO_PGB_EVT_SHIFT 16 +#define RT5133_LDO_OC_EVT_MASK GENMASK(15, 8) +#define RT5133_LDO_OC_EVT_SHIFT 8 +#define RT5133_VREF_EVT_MASK BIT(6) +#define RT5133_BASE_EVT_MASK GENMASK(7, 0) +#define RT5133_INTR_CLR_MASK GENMASK(23, 0) +#define RT5133_INTR_BYTE_NR 3 + +#define RT5133_MAX_I2C_BLOCK_SIZE 1 + +#define RT5133_CRC8_POLYNOMIAL 0x7 + +#define RT5133_I2C_ADDR_LEN 1 +#define RT5133_PREDATA_LEN 2 +#define RT5133_I2C_CRC_LEN 1 +#define RT5133_REG_ADDR_LEN 1 +#define RT5133_I2C_DUMMY_LEN 1 + +#define I2C_ADDR_XLATE_8BIT(_addr, _rw) ((((_addr) & 0x7F) << 1) | (_rw)) + +enum { + RT5133_REGULATOR_BASE = 0, + RT5133_REGULATOR_LDO1, + RT5133_REGULATOR_LDO2, + RT5133_REGULATOR_LDO3, + RT5133_REGULATOR_LDO4, + RT5133_REGULATOR_LDO5, + RT5133_REGULATOR_LDO6, + RT5133_REGULATOR_LDO7, + RT5133_REGULATOR_LDO8, + RT5133_REGULATOR_MAX +}; + +struct chip_data { + const struct regulator_desc *regulators; + const u8 vendor_id; +}; + +struct rt5133_priv { + struct device *dev; + struct regmap *regmap; + struct gpio_desc *enable_gpio; + struct regulator_dev *rdev[RT5133_REGULATOR_MAX]; + struct gpio_chip gc; + const struct chip_data *cdata; + unsigned int gpio_output_flag; + u8 crc8_tbls[CRC8_TABLE_SIZE]; +}; + +static const unsigned int vout_type1_tables[] = { + 1800000, 2500000, 
2700000, 2800000, 2900000, 3000000, 3100000, 3200000 +}; + +static const unsigned int vout_type2_tables[] = { + 1700000, 1800000, 1900000, 2500000, 2700000, 2800000, 2900000, 3000000 +}; + +static const unsigned int vout_type3_tables[] = { + 900000, 950000, 1000000, 1050000, 1100000, 1150000, 1200000, 1800000 +}; + +static const unsigned int vout_type4_tables[] = { + 855000, 900000, 950000, 1000000, 1040000, 1090000, 1140000, 1710000 +}; + +static const struct regulator_ops rt5133_regulator_ops = { + .list_voltage = regulator_list_voltage_table, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_ops rt5133_base_regulator_ops = { + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, +}; + +#define RT5133_REGULATOR_DESC(_name, _node_name, vtables, _supply) \ +{\ + .name = #_name,\ + .id = RT5133_REGULATOR_##_name,\ + .of_match = of_match_ptr(#_node_name),\ + .regulators_node = of_match_ptr("regulators"),\ + .supply_name = _supply,\ + .type = REGULATOR_VOLTAGE,\ + .owner = THIS_MODULE,\ + .ops = &rt5133_regulator_ops,\ + .n_voltages = ARRAY_SIZE(vtables),\ + .volt_table = vtables,\ + .enable_reg = RT5133_REG_##_name##_CTRL1,\ + .enable_mask = RT5133_LDO_ENABLE_MASK,\ + .vsel_reg = RT5133_REG_##_name##_CTRL2,\ + .vsel_mask = RT5133_LDO_VSEL_MASK,\ + .active_discharge_reg = RT5133_REG_##_name##_CTRL3,\ + .active_discharge_mask = RT5133_LDO_AD_MASK,\ +} + +static const struct regulator_desc rt5133_regulators[] = { + /* For digital part, base current control */ + { + .name = "base", + .id = RT5133_REGULATOR_BASE, + .of_match = of_match_ptr("base"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &rt5133_base_regulator_ops, + .enable_reg = RT5133_REG_BASE_CTRL, + .enable_mask = RT5133_FOFF_BASE_MASK, + .enable_is_inverted = true, + }, + RT5133_REGULATOR_DESC(LDO1, ldo1, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO2, ldo2, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO3, ldo3, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO4, ldo4, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO5, ldo5, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO6, ldo6, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO7, ldo7, vout_type3_tables, "vin"), + RT5133_REGULATOR_DESC(LDO8, ldo8, vout_type3_tables, "vin"), +}; + +static const struct regulator_desc rt5133a_regulators[] = { + /* For digital part, base current control */ + { + .name = "base", + .id = RT5133_REGULATOR_BASE, + .of_match = of_match_ptr("base"), + .regulators_node = of_match_ptr("regulators"), + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &rt5133_base_regulator_ops, + .enable_reg = RT5133_REG_BASE_CTRL, + .enable_mask = RT5133_FOFF_BASE_MASK, + .enable_is_inverted = true, + }, + RT5133_REGULATOR_DESC(LDO1, ldo1, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO2, ldo2, vout_type1_tables, "base"), + RT5133_REGULATOR_DESC(LDO3, ldo3, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO4, ldo4, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO5, ldo5, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO6, ldo6, vout_type2_tables, "base"), + RT5133_REGULATOR_DESC(LDO7, ldo7, 
vout_type3_tables, "vin"), + RT5133_REGULATOR_DESC(LDO8, ldo8, vout_type4_tables, "vin"), +}; + +static const struct chip_data regulator_data[] = { + { rt5133_regulators, 0x70}, + { rt5133a_regulators, 0x80}, +}; + +static int rt5133_gpio_direction_output(struct gpio_chip *gpio, + unsigned int offset, int value) +{ + struct rt5133_priv *priv = gpiochip_get_data(gpio); + + if (offset >= RT5133_GPIO_NR) + return -EINVAL; + + return regmap_update_bits(priv->regmap, RT5133_REG_GPIO_CTRL, + BIT(7 - offset) | BIT(3 - offset), + value ? BIT(7 - offset) | BIT(3 - offset) : 0); +} + +static int rt5133_gpio_get(struct gpio_chip *chip, unsigned int offset) +{ + struct rt5133_priv *priv = gpiochip_get_data(chip); + + return !!(priv->gpio_output_flag & BIT(offset)); +} + +static int rt5133_get_gpioen_mask(unsigned int offset, unsigned int *mask) +{ + if (offset >= RT5133_GPIO_NR) + return -EINVAL; + + *mask = (BIT(7 - offset) | BIT(3 - offset)); + + return 0; +} + +static int rt5133_gpio_set(struct gpio_chip *chip, unsigned int offset, int set_val) +{ + struct rt5133_priv *priv = gpiochip_get_data(chip); + unsigned int mask = 0, val = 0, next_flag = priv->gpio_output_flag; + int ret = 0; + + ret = rt5133_get_gpioen_mask(offset, &mask); + if (ret) { + dev_err(priv->dev, "%s: get gpio en mask failed, offset(%d)\n", __func__, offset); + return ret; + } + + val = set_val ? mask : 0; + + if (set_val) + next_flag |= BIT(offset); + else + next_flag &= ~BIT(offset); + + ret = regmap_update_bits(priv->regmap, RT5133_REG_GPIO_CTRL, mask, val); + if (ret) { + dev_err(priv->dev, "Failed to set gpio [%d] val %d\n", offset, + set_val); + return ret; + } + + priv->gpio_output_flag = next_flag; + return 0; +} + +static irqreturn_t rt5133_intr_handler(int irq_number, void *data) +{ + struct rt5133_priv *priv = data; + u32 intr_evts = 0, handle_evts; + int i, ret; + + ret = regmap_bulk_read(priv->regmap, RT5133_REG_BASE_EVT, &intr_evts, + RT5133_INTR_BYTE_NR); + if (ret) { + dev_err(priv->dev, "%s, read event failed\n", __func__); + return IRQ_NONE; + } + + handle_evts = intr_evts & RT5133_BASE_EVT_MASK; + /* + * VREF_EVT is a special case: if the base regulator is off, + * this event will also be triggered.
Skip it + */ + if (handle_evts & ~RT5133_VREF_EVT_MASK) + dev_dbg(priv->dev, "base event occurred [0x%02x]\n", + handle_evts); + + handle_evts = (intr_evts & RT5133_LDO_OC_EVT_MASK) >> + RT5133_LDO_OC_EVT_SHIFT; + + for (i = RT5133_REGULATOR_LDO1; i < RT5133_REGULATOR_MAX && handle_evts; i++) { + if (!(handle_evts & BIT(i - 1))) + continue; + regulator_notifier_call_chain(priv->rdev[i], + REGULATOR_EVENT_OVER_CURRENT, + &i); + } + + handle_evts = (intr_evts & RT5133_LDO_PGB_EVT_MASK) >> + RT5133_LDO_PGB_EVT_SHIFT; + for (i = RT5133_REGULATOR_LDO1; i < RT5133_REGULATOR_MAX && handle_evts; i++) { + if (!(handle_evts & BIT(i - 1))) + continue; + regulator_notifier_call_chain(priv->rdev[i], + REGULATOR_EVENT_FAIL, &i); + } + + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_EVT, &intr_evts, + RT5133_INTR_BYTE_NR); + if (ret) + dev_err(priv->dev, "%s, clear event failed\n", __func__); + + return IRQ_HANDLED; +} + +static int rt5133_enable_interrupts(int irq_no, struct rt5133_priv *priv) +{ + u32 mask = RT5133_INTR_CLR_MASK; + int ret; + + /* Force to write clear all events */ + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_EVT, &mask, + RT5133_INTR_BYTE_NR); + if (ret) { + dev_err(priv->dev, "Failed to clear all interrupts\n"); + return ret; + } + + /* Unmask all interrupts */ + mask = 0; + ret = regmap_bulk_write(priv->regmap, RT5133_REG_BASE_MASK, &mask, + RT5133_INTR_BYTE_NR); + if (ret) { + dev_err(priv->dev, "Failed to unmask all interrupts\n"); + return ret; + } + + return devm_request_threaded_irq(priv->dev, irq_no, NULL, + rt5133_intr_handler, IRQF_ONESHOT, + dev_name(priv->dev), priv); +} + +static int rt5133_regmap_hw_read(void *context, const void *reg_buf, + size_t reg_size, void *val_buf, + size_t val_size) +{ + struct rt5133_priv *priv = context; + struct i2c_client *client = to_i2c_client(priv->dev); + u8 reg = *(u8 *)reg_buf, crc; + u8 *buf; + int buf_len = RT5133_PREDATA_LEN + val_size + RT5133_I2C_CRC_LEN; + int read_len, ret; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + buf[0] = I2C_ADDR_XLATE_8BIT(client->addr, I2C_SMBUS_READ); + buf[1] = reg; + + read_len = val_size + RT5133_I2C_CRC_LEN; + ret = i2c_smbus_read_i2c_block_data(client, reg, read_len, + buf + RT5133_PREDATA_LEN); + + if (ret < 0) + goto out_read_err; + + if (ret != read_len) { + ret = -EIO; + goto out_read_err; + } + + crc = crc8(priv->crc8_tbls, buf, RT5133_PREDATA_LEN + val_size, 0); + if (crc != buf[RT5133_PREDATA_LEN + val_size]) { + ret = -EIO; + goto out_read_err; + } + + memcpy(val_buf, buf + RT5133_PREDATA_LEN, val_size); + dev_dbg(priv->dev, "%s, reg = 0x%02x, data = 0x%02x\n", __func__, reg, *(u8 *)val_buf); + +out_read_err: + kfree(buf); + return (ret < 0) ? 
ret : 0; +} + +static int rt5133_regmap_hw_write(void *context, const void *data, size_t count) +{ + struct rt5133_priv *priv = context; + struct i2c_client *client = to_i2c_client(priv->dev); + u8 reg = *(u8 *)data, crc; + u8 *buf; + int buf_len = RT5133_I2C_ADDR_LEN + count + RT5133_I2C_CRC_LEN + + RT5133_I2C_DUMMY_LEN; + int write_len, ret; + + buf = kzalloc(buf_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + buf[0] = I2C_ADDR_XLATE_8BIT(client->addr, I2C_SMBUS_WRITE); + buf[1] = reg; + memcpy(buf + RT5133_PREDATA_LEN, data + RT5133_REG_ADDR_LEN, + count - RT5133_REG_ADDR_LEN); + + crc = crc8(priv->crc8_tbls, buf, RT5133_I2C_ADDR_LEN + count, 0); + buf[RT5133_I2C_ADDR_LEN + count] = crc; + + write_len = count - RT5133_REG_ADDR_LEN + RT5133_I2C_CRC_LEN + + RT5133_I2C_DUMMY_LEN; + ret = i2c_smbus_write_i2c_block_data(client, reg, write_len, + buf + RT5133_PREDATA_LEN); + + dev_dbg(priv->dev, "%s, reg = 0x%02x, data = 0x%02x\n", __func__, reg, + *(u8 *)(buf + RT5133_PREDATA_LEN)); + kfree(buf); + return ret; +} + +static const struct regmap_bus rt5133_regmap_bus = { + .read = rt5133_regmap_hw_read, + .write = rt5133_regmap_hw_write, + /* Due to the CRC byte, the raw block read/write length is limited */ + .max_raw_read = RT5133_MAX_I2C_BLOCK_SIZE, + .max_raw_write = RT5133_MAX_I2C_BLOCK_SIZE, +}; + +static bool rt5133_is_volatile_reg(struct device *dev, unsigned int reg) +{ + switch (reg) { + case RT5133_REG_CHIP_INFO: + case RT5133_REG_BASE_EVT ... RT5133_REG_LDO_PGB_STAT: + case RT5133_REG_LDO_ON ... RT5133_REG_LDO_OFF: + case RT5133_REG_LDO1_CTRL1: + case RT5133_REG_LDO2_CTRL1: + case RT5133_REG_LDO3_CTRL1: + case RT5133_REG_LDO4_CTRL1: + case RT5133_REG_LDO5_CTRL1: + case RT5133_REG_LDO6_CTRL1: + case RT5133_REG_LDO7_CTRL1: + case RT5133_REG_LDO8_CTRL1: + return true; + default: + return false; + } +} + +static const struct regmap_config rt5133_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = RT5133_REG_LDO8_CTRL4, + .cache_type = REGCACHE_FLAT, + .num_reg_defaults_raw = RT5133_REG_LDO8_CTRL4 + 1, + .volatile_reg = rt5133_is_volatile_reg, +}; + +static int rt5133_chip_reset(struct rt5133_priv *priv) +{ + int ret; + + ret = regmap_write(priv->regmap, RT5133_REG_RST_CTRL, + RT5133_RESET_CODE); + if (ret) + return ret; + + /* Wait for register reset to take effect */ + udelay(2); + + return 0; +} + +static int rt5133_validate_vendor_info(struct rt5133_priv *priv) +{ + unsigned int val = 0; + int i, ret; + + ret = regmap_read(priv->regmap, RT5133_REG_CHIP_INFO, &val); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(regulator_data); i++) { + if ((val & RT5133_VENDOR_ID_MASK) == + regulator_data[i].vendor_id) { + priv->cdata = &regulator_data[i]; + break; + } + } + if (!priv->cdata) { + dev_err(priv->dev, "Failed to find matching vendor ID\n"); + return -ENODEV; + } + + return 0; +} + +static int rt5133_parse_dt(struct rt5133_priv *priv) +{ + unsigned int val = 0; + int ret = 0; + + if (!device_property_read_bool(priv->dev, "richtek,oc-shutdown-all")) + val = 0; + else + val = 1 << RT5133_OCSHDN_ALL_SHIFT; + ret = regmap_update_bits(priv->regmap, RT5133_REG_LDO_SHDN, + RT5133_OCSHDN_ALL_MASK, val); + if (ret) + return ret; + + if (!device_property_read_bool(priv->dev, "richtek,pgb-shutdown-all")) + val = 0; + else + val = 1 << RT5133_PGBSHDN_ALL_SHIFT; + return regmap_update_bits(priv->regmap, RT5133_REG_LDO_SHDN, + RT5133_PGBSHDN_ALL_MASK, val); +} + +static int rt5133_probe(struct i2c_client *i2c) +{ + struct rt5133_priv *priv; + struct regulator_config config = {0}; +
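+ /* + * Byte-layout sketch of the CRC framing implemented above (editor's + * illustration, assuming a one-byte write of 0xAA to reg 0x20): crc8 + * with polynomial 0x07 (MSB first) is computed over + * { addr << 1 | R/W, 0x20, 0xAA }, and the SMBus block payload + * becomes { 0xAA, crc, dummy }, which is why max_raw_read/write are + * capped at a single byte. + */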
int i, ret; + + priv = devm_kzalloc(&i2c->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = &i2c->dev; + crc8_populate_msb(priv->crc8_tbls, RT5133_CRC8_POLYNOMIAL); + + priv->enable_gpio = devm_gpiod_get_optional(&i2c->dev, "enable", + GPIOD_OUT_HIGH); + if (IS_ERR(priv->enable_gpio)) + dev_err(&i2c->dev, "Failed to request HWEN gpio, check if default en=high\n"); + + priv->regmap = devm_regmap_init(&i2c->dev, &rt5133_regmap_bus, priv, + &rt5133_regmap_config); + if (IS_ERR(priv->regmap)) { + dev_err(&i2c->dev, "Failed to register regmap\n"); + return PTR_ERR(priv->regmap); + } + + ret = rt5133_validate_vendor_info(priv); + if (ret) { + dev_err(&i2c->dev, "Failed to check vendor info [%d]\n", ret); + return ret; + } + + ret = rt5133_chip_reset(priv); + if (ret) { + dev_err(&i2c->dev, "Failed to execute sw reset\n"); + return ret; + } + + config.dev = &i2c->dev; + config.driver_data = priv; + config.regmap = priv->regmap; + + for (i = 0; i < RT5133_REGULATOR_MAX; i++) { + priv->rdev[i] = devm_regulator_register(&i2c->dev, + priv->cdata->regulators + i, + &config); + if (IS_ERR(priv->rdev[i])) { + dev_err(&i2c->dev, + "Failed to register [%d] regulator\n", i); + return PTR_ERR(priv->rdev[i]); + } + } + + ret = rt5133_parse_dt(priv); + if (ret) { + dev_err(&i2c->dev, "%s, Failed to parse dt\n", __func__); + return ret; + } + + priv->gc.label = dev_name(&i2c->dev); + priv->gc.parent = &i2c->dev; + priv->gc.base = -1; + priv->gc.ngpio = RT5133_GPIO_NR; + priv->gc.set = rt5133_gpio_set; + priv->gc.get = rt5133_gpio_get; + priv->gc.direction_output = rt5133_gpio_direction_output; + priv->gc.can_sleep = true; + + ret = devm_gpiochip_add_data(&i2c->dev, &priv->gc, priv); + if (ret) + return ret; + + ret = rt5133_enable_interrupts(i2c->irq, priv); + if (ret) { + dev_err(&i2c->dev, "enable interrupt failed\n"); + return ret; + } + + i2c_set_clientdata(i2c, priv); + + return ret; +} + +static const struct of_device_id __maybe_unused rt5133_of_match_table[] = { + { .compatible = "richtek,rt5133", }, + { } +}; +MODULE_DEVICE_TABLE(of, rt5133_of_match_table); + +static struct i2c_driver rt5133_driver = { + .driver = { + .name = "rt5133", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = rt5133_of_match_table, + }, + .probe = rt5133_probe, +}; +module_i2c_driver(rt5133_driver); + +MODULE_DESCRIPTION("RT5133 Regulator Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/regulator/s2dos05-regulator.c b/drivers/regulator/s2dos05-regulator.c new file mode 100644 index 00000000000000..1463585c456520 --- /dev/null +++ b/drivers/regulator/s2dos05-regulator.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// s2dos05.c - Regulator driver for the Samsung s2dos05 +// +// Copyright (C) 2025 Dzmitry Sankouski + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct s2dos05_data { + struct regmap *regmap; + struct device *dev; +}; + +#define _BUCK(macro) S2DOS05_BUCK##macro +#define _buck_ops(num) s2dos05_ops##num +#define _LDO(macro) S2DOS05_LDO##macro +#define _REG(ctrl) S2DOS05_REG##ctrl +#define _ldo_ops(num) s2dos05_ops##num +#define _MASK(macro) S2DOS05_ENABLE_MASK##macro +#define _TIME(macro) S2DOS05_ENABLE_TIME##macro + +#define BUCK_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type 
= REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_BUCK_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_BUCK_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_BUCK_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_BUCK_FD_MASK \ +} + +#define LDO_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_LDO_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_LDO_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_LDO_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_LDO_FD_MASK \ +} + +static const struct regulator_ops s2dos05_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .is_enabled = regulator_is_enabled_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc regulators[S2DOS05_REGULATOR_MAX] = { + // name, id, ops, min_uV, uV_step, vsel_reg, enable_reg, enable_mask, enable_time, discharge_reg + LDO_DESC("ldo1", _LDO(1), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO1_CFG), + _REG(_EN), _MASK(_L1), _TIME(_LDO), _REG(_LDO1_CFG)), + LDO_DESC("ldo2", _LDO(2), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO2_CFG), + _REG(_EN), _MASK(_L2), _TIME(_LDO), _REG(_LDO2_CFG)), + LDO_DESC("ldo3", _LDO(3), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO3_CFG), + _REG(_EN), _MASK(_L3), _TIME(_LDO), _REG(_LDO3_CFG)), + LDO_DESC("ldo4", _LDO(4), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO4_CFG), + _REG(_EN), _MASK(_L4), _TIME(_LDO), _REG(_LDO4_CFG)), + BUCK_DESC("buck", _BUCK(1), &_buck_ops(), _BUCK(_MIN1), + _BUCK(_STEP1), _REG(_BUCK_VOUT), + _REG(_EN), _MASK(_B1), _TIME(_BUCK), _REG(_BUCK_CFG)), +}; + +static int s2dos05_pmic_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sec_pmic_dev *iodev = dev_get_drvdata(pdev->dev.parent); + struct s2dos05_data *s2dos05; + struct regulator_config config = { }; + unsigned int rdev_num = ARRAY_SIZE(regulators); + + s2dos05 = devm_kzalloc(dev, sizeof(*s2dos05), GFP_KERNEL); + if (!s2dos05) + return -ENOMEM; + + platform_set_drvdata(pdev, s2dos05); + + s2dos05->regmap = iodev->regmap_pmic; + s2dos05->dev = dev; + if (!dev->of_node) + dev->of_node = dev->parent->of_node; + + config.dev = dev; + config.driver_data = s2dos05; + + for (int i = 0; i < rdev_num; i++) { + struct regulator_dev *regulator; + + regulator = devm_regulator_register(&pdev->dev, + &regulators[i], &config); + if (IS_ERR(regulator)) { + return dev_err_probe(&pdev->dev, PTR_ERR(regulator), + "regulator init failed for %d\n", i); + } + } + + return 0; +} + +static const struct platform_device_id s2dos05_pmic_id[] = { + { "s2dos05-regulator" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, s2dos05_pmic_id); + +static struct platform_driver s2dos05_platform_driver =
{ + .driver = { + .name = "s2dos05", + }, + .probe = s2dos05_pmic_probe, + .id_table = s2dos05_pmic_id, +}; +module_platform_driver(s2dos05_platform_driver); + +MODULE_AUTHOR("Dzmitry Sankouski "); +MODULE_DESCRIPTION("Samsung S2DOS05 Regulator Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/regulator/scmi-regulator.c b/drivers/regulator/scmi-regulator.c index 9df726f10ad121..6d609c42e4793b 100644 --- a/drivers/regulator/scmi-regulator.c +++ b/drivers/regulator/scmi-regulator.c @@ -257,7 +257,8 @@ static int process_scmi_regulator_of_node(struct scmi_device *sdev, struct device_node *np, struct scmi_regulator_info *rinfo) { - u32 dom, ret; + u32 dom; + int ret; ret = of_property_read_u32(np, "reg", &dom); if (ret) diff --git a/drivers/regulator/spacemit-p1.c b/drivers/regulator/spacemit-p1.c new file mode 100644 index 00000000000000..d437e6738ea1e3 --- /dev/null +++ b/drivers/regulator/spacemit-p1.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for regulators found in the SpacemiT P1 PMIC + * + * Copyright (C) 2025 by RISCstar Solutions Corporation. All rights reserved. + * Derived from code from SpacemiT. + * Copyright (c) 2023, SPACEMIT Co., Ltd + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MOD_NAME "spacemit-p1-regulator" + +enum p1_regulator_id { + P1_BUCK1, + P1_BUCK2, + P1_BUCK3, + P1_BUCK4, + P1_BUCK5, + P1_BUCK6, + + P1_ALDO1, + P1_ALDO2, + P1_ALDO3, + P1_ALDO4, + + P1_DLDO1, + P1_DLDO2, + P1_DLDO3, + P1_DLDO4, + P1_DLDO5, + P1_DLDO6, + P1_DLDO7, +}; + +static const struct regulator_ops p1_regulator_ops = { + .list_voltage = regulator_list_voltage_linear_range, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, +}; + +/* Selector value 255 can be used to disable the buck converter on sleep */ +static const struct linear_range p1_buck_ranges[] = { + REGULATOR_LINEAR_RANGE(500000, 0, 170, 5000), + REGULATOR_LINEAR_RANGE(1375000, 171, 254, 25000), +}; + +/* Selector value 0 can be used for suspend */ +static const struct linear_range p1_ldo_ranges[] = { + REGULATOR_LINEAR_RANGE(500000, 11, 127, 25000), +}; + +/* These define the voltage selector field for buck and LDO regulators */ +#define BUCK_MASK GENMASK(7, 0) +#define LDO_MASK GENMASK(6, 0) + +#define P1_ID(_TYPE, _n) P1_ ## _TYPE ## _n +#define P1_ENABLE_REG(_off, _n) ((_off) + 3 * ((_n) - 1)) + +#define P1_REG_DESC(_TYPE, _type, _n, _s, _off, _mask, _nv, _ranges) \ + { \ + .name = #_type #_n, \ + .supply_name = _s, \ + .of_match = of_match_ptr(#_type #_n), \ + .regulators_node = of_match_ptr("regulators"), \ + .id = P1_ID(_TYPE, _n), \ + .n_voltages = _nv, \ + .ops = &p1_regulator_ops, \ + .owner = THIS_MODULE, \ + .linear_ranges = _ranges, \ + .n_linear_ranges = ARRAY_SIZE(_ranges), \ + .vsel_reg = P1_ENABLE_REG(_off, _n) + 1, \ + .vsel_mask = _mask, \ + .enable_reg = P1_ENABLE_REG(_off, _n), \ + .enable_mask = BIT(0), \ + } + +#define P1_BUCK_DESC(_n) \ + P1_REG_DESC(BUCK, buck, _n, "vcc", 0x47, BUCK_MASK, 254, p1_buck_ranges) + +#define P1_ALDO_DESC(_n) \ + P1_REG_DESC(ALDO, aldo, _n, "vcc", 0x5b, LDO_MASK, 117, p1_ldo_ranges) + +#define P1_DLDO_DESC(_n) \ + P1_REG_DESC(DLDO, dldo, _n, "buck5", 0x67, LDO_MASK, 117, p1_ldo_ranges) + +static const struct regulator_desc p1_regulator_desc[] = { + P1_BUCK_DESC(1), 
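+ /* + * Illustrative expansion of the macros above: the enable register + * strides by 3 per rail from each block's base, so buck2 uses + * enable_reg = 0x47 + 3 * (2 - 1) = 0x4a and vsel_reg = 0x4b; the + * aldo and dldo blocks repeat the same pattern from 0x5b and 0x67. + */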
+ P1_BUCK_DESC(2), + P1_BUCK_DESC(3), + P1_BUCK_DESC(4), + P1_BUCK_DESC(5), + P1_BUCK_DESC(6), + + P1_ALDO_DESC(1), + P1_ALDO_DESC(2), + P1_ALDO_DESC(3), + P1_ALDO_DESC(4), + + P1_DLDO_DESC(1), + P1_DLDO_DESC(2), + P1_DLDO_DESC(3), + P1_DLDO_DESC(4), + P1_DLDO_DESC(5), + P1_DLDO_DESC(6), + P1_DLDO_DESC(7), +}; + +static int p1_regulator_probe(struct platform_device *pdev) +{ + struct regulator_config config = { }; + struct device *dev = &pdev->dev; + u32 i; + + /* + * The parent device (PMIC) owns the regmap. Since we don't + * provide one in the config structure, that one will be used. + */ + config.dev = dev->parent; + + for (i = 0; i < ARRAY_SIZE(p1_regulator_desc); i++) { + const struct regulator_desc *desc = &p1_regulator_desc[i]; + struct regulator_dev *rdev; + + rdev = devm_regulator_register(dev, desc, &config); + if (IS_ERR(rdev)) + return dev_err_probe(dev, PTR_ERR(rdev), + "error registering regulator %s\n", + desc->name); + } + + return 0; +} + +static struct platform_driver p1_regulator_driver = { + .probe = p1_regulator_probe, + .driver = { + .name = MOD_NAME, + }, +}; + +module_platform_driver(p1_regulator_driver); + +MODULE_DESCRIPTION("SpacemiT P1 regulator driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" MOD_NAME); diff --git a/drivers/regulator/sy7636a-regulator.c b/drivers/regulator/sy7636a-regulator.c index d1e7ba1fb3e1af..27e3d939b7bb9e 100644 --- a/drivers/regulator/sy7636a-regulator.c +++ b/drivers/regulator/sy7636a-regulator.c @@ -83,9 +83,11 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) if (!regmap) return -EPROBE_DEFER; - gdp = devm_gpiod_get(pdev->dev.parent, "epd-pwr-good", GPIOD_IN); + device_set_of_node_from_dev(&pdev->dev, pdev->dev.parent); + + gdp = devm_gpiod_get(&pdev->dev, "epd-pwr-good", GPIOD_IN); if (IS_ERR(gdp)) { - dev_err(pdev->dev.parent, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); + dev_err(&pdev->dev, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); return PTR_ERR(gdp); } @@ -105,7 +107,6 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) } config.dev = &pdev->dev; - config.dev->of_node = pdev->dev.parent->of_node; config.regmap = regmap; rdev = devm_regulator_register(&pdev->dev, &desc, &config); diff --git a/drivers/regulator/tps6524x-regulator.c b/drivers/regulator/tps6524x-regulator.c index 3fee7e38c68bc9..6beb51293e8e1d 100644 --- a/drivers/regulator/tps6524x-regulator.c +++ b/drivers/regulator/tps6524x-regulator.c @@ -598,7 +598,6 @@ static int pmic_probe(struct spi_device *spi) spi_set_drvdata(spi, hw); - memset(hw, 0, sizeof(struct tps6524x)); hw->dev = dev; hw->spi = spi; mutex_init(&hw->lock); diff --git a/drivers/regulator/tps6594-regulator.c b/drivers/regulator/tps6594-regulator.c index ab882daec7c5fd..645e83462c645e 100644 --- a/drivers/regulator/tps6594-regulator.c +++ b/drivers/regulator/tps6594-regulator.c @@ -647,7 +647,7 @@ static int tps6594_regulator_probe(struct platform_device *pdev) default: dev_err(tps->dev, "unknown chip_id %lu\n", tps->chip_id); return -EINVAL; - }; + } enum { MULTI_BUCK12, diff --git a/drivers/reset/reset-eyeq.c b/drivers/reset/reset-eyeq.c index 02d50041048b42..2d3998368a1c5e 100644 --- a/drivers/reset/reset-eyeq.c +++ b/drivers/reset/reset-eyeq.c @@ -410,6 +410,13 @@ static int eqr_of_xlate_twocells(struct reset_controller_dev *rcdev, return eqr_of_xlate_internal(rcdev, reset_spec->args[0], reset_spec->args[1]); } +static void eqr_of_node_put(void *_dev) +{ + struct device *dev = _dev; + + of_node_put(dev->of_node); +} + static int eqr_probe(struct 
auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -428,6 +435,10 @@ static int eqr_probe(struct auxiliary_device *adev, if (!dev->of_node) return -ENODEV; + ret = devm_add_action_or_reset(dev, eqr_of_node_put, dev); + if (ret) + return ret; + /* * Using our newfound OF node, we can get match data. We cannot use * device_get_match_data() because it does not match reused OF nodes. diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 64f6e9756aff4a..4a8dc8d0a4b7c0 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -416,6 +416,16 @@ config RTC_DRV_NCT3018Y This driver can also be built as a module, if so, the module will be called "rtc-nct3018y". +config RTC_DRV_NCT6694 + tristate "Nuvoton NCT6694 RTC support" + depends on MFD_NCT6694 + help + If you say yes to this option, support will be included for the + Nuvoton NCT6694, a USB device that provides an RTC. + + This driver can also be built as a module. If so, the module will + be called rtc-nct6694. + config RTC_DRV_RK808 tristate "Rockchip RK805/RK808/RK809/RK817/RK818 RTC" depends on MFD_RK8XX diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 789bddfea99d8f..610a9ee5fd33c3 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -119,6 +119,7 @@ obj-$(CONFIG_RTC_DRV_MXC) += rtc-mxc.o obj-$(CONFIG_RTC_DRV_MXC_V2) += rtc-mxc_v2.o obj-$(CONFIG_RTC_DRV_GAMECUBE) += rtc-gamecube.o obj-$(CONFIG_RTC_DRV_NCT3018Y) += rtc-nct3018y.o +obj-$(CONFIG_RTC_DRV_NCT6694) += rtc-nct6694.o obj-$(CONFIG_RTC_DRV_NTXEC) += rtc-ntxec.o obj-$(CONFIG_RTC_DRV_OMAP) += rtc-omap.o obj-$(CONFIG_RTC_DRV_OPAL) += rtc-opal.o diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index e7b87130e6248d..2494d13fd767e9 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -137,10 +137,6 @@ static int mc13xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) } if (!priv->valid) { - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); - if (unlikely(ret)) - goto out; - ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); } @@ -208,10 +204,6 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (unlikely(ret)) goto out; - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TODA); - if (unlikely(ret)) - goto out; - s1970 = rtc_tm_to_time64(&alarm->time); dev_dbg(dev, "%s: %s %lld\n", __func__, alarm->enabled ? "on" : "off", @@ -239,12 +231,9 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) static irqreturn_t mc13xxx_rtc_alarm_handler(int irq, void *dev) { struct mc13xxx_rtc *priv = dev; - struct mc13xxx *mc13xxx = priv->mc13xxx; rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF); - mc13xxx_irq_ack(mc13xxx, irq); - return IRQ_HANDLED; } @@ -293,8 +282,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_lock(mc13xxx); - mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); - ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) diff --git a/drivers/rtc/rtc-nct6694.c b/drivers/rtc/rtc-nct6694.c new file mode 100644 index 00000000000000..35401a0d9cf53b --- /dev/null +++ b/drivers/rtc/rtc-nct6694.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nuvoton NCT6694 RTC driver based on USB interface. + * + * Copyright (C) 2025 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * USB command module type for NCT6694 RTC controller.
+ * This defines the module type used for communication with the NCT6694 + * RTC controller over the USB interface. + */ +#define NCT6694_RTC_MOD 0x08 + +/* Command 00h - RTC Time */ +#define NCT6694_RTC_TIME 0x0000 +#define NCT6694_RTC_TIME_SEL 0x00 + +/* Command 01h - RTC Alarm */ +#define NCT6694_RTC_ALARM 0x01 +#define NCT6694_RTC_ALARM_SEL 0x00 + +/* Command 02h - RTC Status */ +#define NCT6694_RTC_STATUS 0x02 +#define NCT6694_RTC_STATUS_SEL 0x00 + +#define NCT6694_RTC_IRQ_INT_EN BIT(0) /* Transmit a USB INT-in when the RTC alarm fires */ +#define NCT6694_RTC_IRQ_GPO_EN BIT(5) /* Trigger a GPO low pulse when the RTC alarm fires */ + +#define NCT6694_RTC_IRQ_EN (NCT6694_RTC_IRQ_INT_EN | NCT6694_RTC_IRQ_GPO_EN) +#define NCT6694_RTC_IRQ_STS BIT(0) /* Write 1 to clear IRQ status */ + +struct __packed nct6694_rtc_time { + u8 sec; + u8 min; + u8 hour; + u8 week; + u8 day; + u8 month; + u8 year; +}; + +struct __packed nct6694_rtc_alarm { + u8 sec; + u8 min; + u8 hour; + u8 alarm_en; + u8 alarm_pend; +}; + +struct __packed nct6694_rtc_status { + u8 irq_en; + u8 irq_pend; +}; + +union __packed nct6694_rtc_msg { + struct nct6694_rtc_time time; + struct nct6694_rtc_alarm alarm; + struct nct6694_rtc_status sts; +}; + +struct nct6694_rtc_data { + struct nct6694 *nct6694; + struct rtc_device *rtc; + union nct6694_rtc_msg *msg; + int irq; +}; + +static int nct6694_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_time *time = &data->msg->time; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_TIME, + .sel = NCT6694_RTC_TIME_SEL, + .len = cpu_to_le16(sizeof(*time)) + }; + int ret; + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, time); + if (ret) + return ret; + + tm->tm_sec = bcd2bin(time->sec); /* tm_sec expects 0 ~ 59 */ + tm->tm_min = bcd2bin(time->min); /* tm_min expects 0 ~ 59 */ + tm->tm_hour = bcd2bin(time->hour); /* tm_hour expects 0 ~ 23 */ + tm->tm_wday = bcd2bin(time->week) - 1; /* tm_wday expects 0 ~ 6 */ + tm->tm_mday = bcd2bin(time->day); /* tm_mday expects 1 ~ 31 */ + tm->tm_mon = bcd2bin(time->month) - 1; /* tm_mon expects 0 ~ 11 */ + tm->tm_year = bcd2bin(time->year) + 100; /* tm_year expects years since 1900 */ + + return ret; +} + +static int nct6694_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_time *time = &data->msg->time; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_TIME, + .sel = NCT6694_RTC_TIME_SEL, + .len = cpu_to_le16(sizeof(*time)) + }; + + time->sec = bin2bcd(tm->tm_sec); + time->min = bin2bcd(tm->tm_min); + time->hour = bin2bcd(tm->tm_hour); + time->week = bin2bcd(tm->tm_wday + 1); + time->day = bin2bcd(tm->tm_mday); + time->month = bin2bcd(tm->tm_mon + 1); + time->year = bin2bcd(tm->tm_year - 100); + + return nct6694_write_msg(data->nct6694, &cmd_hd, time); +} + +static int nct6694_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_alarm *alarm = &data->msg->alarm; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_ALARM, + .sel = NCT6694_RTC_ALARM_SEL, + .len = cpu_to_le16(sizeof(*alarm)) + }; + int ret; + + ret = nct6694_read_msg(data->nct6694, &cmd_hd, alarm); + if (ret) + return ret; + + alrm->time.tm_sec = bcd2bin(alarm->sec); + alrm->time.tm_min = bcd2bin(alarm->min); + alrm->time.tm_hour
= bcd2bin(alarm->hour); + alrm->enabled = alarm->alarm_en; + alrm->pending = alarm->alarm_pend; + + return ret; +} + +static int nct6694_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_alarm *alarm = &data->msg->alarm; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_ALARM, + .sel = NCT6694_RTC_ALARM_SEL, + .len = cpu_to_le16(sizeof(*alarm)) + }; + + alarm->sec = bin2bcd(alrm->time.tm_sec); + alarm->min = bin2bcd(alrm->time.tm_min); + alarm->hour = bin2bcd(alrm->time.tm_hour); + alarm->alarm_en = alrm->enabled ? NCT6694_RTC_IRQ_EN : 0; + alarm->alarm_pend = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, alarm); +} + +static int nct6694_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct nct6694_rtc_data *data = dev_get_drvdata(dev); + struct nct6694_rtc_status *sts = &data->msg->sts; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_STATUS, + .sel = NCT6694_RTC_STATUS_SEL, + .len = cpu_to_le16(sizeof(*sts)) + }; + + if (enabled) + sts->irq_en |= NCT6694_RTC_IRQ_EN; + else + sts->irq_en &= ~NCT6694_RTC_IRQ_EN; + + sts->irq_pend = 0; + + return nct6694_write_msg(data->nct6694, &cmd_hd, sts); +} + +static const struct rtc_class_ops nct6694_rtc_ops = { + .read_time = nct6694_rtc_read_time, + .set_time = nct6694_rtc_set_time, + .read_alarm = nct6694_rtc_read_alarm, + .set_alarm = nct6694_rtc_set_alarm, + .alarm_irq_enable = nct6694_rtc_alarm_irq_enable, +}; + +static irqreturn_t nct6694_irq(int irq, void *dev_id) +{ + struct nct6694_rtc_data *data = dev_id; + struct nct6694_rtc_status *sts = &data->msg->sts; + static const struct nct6694_cmd_header cmd_hd = { + .mod = NCT6694_RTC_MOD, + .cmd = NCT6694_RTC_STATUS, + .sel = NCT6694_RTC_STATUS_SEL, + .len = cpu_to_le16(sizeof(*sts)) + }; + int ret; + + rtc_lock(data->rtc); + + sts->irq_en = NCT6694_RTC_IRQ_EN; + sts->irq_pend = NCT6694_RTC_IRQ_STS; + ret = nct6694_write_msg(data->nct6694, &cmd_hd, sts); + if (ret) { + rtc_unlock(data->rtc); + return IRQ_NONE; + } + + rtc_update_irq(data->rtc, 1, RTC_IRQF | RTC_AF); + + rtc_unlock(data->rtc); + + return IRQ_HANDLED; +} + +static void nct6694_irq_dispose_mapping(void *d) +{ + struct nct6694_rtc_data *data = d; + + irq_dispose_mapping(data->irq); +} + +static int nct6694_rtc_probe(struct platform_device *pdev) +{ + struct nct6694_rtc_data *data; + struct nct6694 *nct6694 = dev_get_drvdata(pdev->dev.parent); + int ret; + + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->msg = devm_kzalloc(&pdev->dev, sizeof(union nct6694_rtc_msg), + GFP_KERNEL); + if (!data->msg) + return -ENOMEM; + + data->irq = irq_create_mapping(nct6694->domain, NCT6694_IRQ_RTC); + if (!data->irq) + return -EINVAL; + + ret = devm_add_action_or_reset(&pdev->dev, nct6694_irq_dispose_mapping, + data); + if (ret) + return ret; + + ret = devm_device_init_wakeup(&pdev->dev); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to init wakeup\n"); + + data->rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(data->rtc)) + return PTR_ERR(data->rtc); + + data->nct6694 = nct6694; + data->rtc->ops = &nct6694_rtc_ops; + data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; + data->rtc->range_max = RTC_TIMESTAMP_END_2099; + + platform_set_drvdata(pdev, data); + + ret = devm_request_threaded_irq(&pdev->dev, data->irq, NULL, + nct6694_irq, IRQF_ONESHOT, + "rtc-nct6694", 
data); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to request irq\n"); + + return devm_rtc_register_device(data->rtc); +} + +static struct platform_driver nct6694_rtc_driver = { + .driver = { + .name = "nct6694-rtc", + }, + .probe = nct6694_rtc_probe, +}; + +module_platform_driver(nct6694_rtc_driver); + +MODULE_DESCRIPTION("USB-RTC driver for NCT6694"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:nct6694-rtc"); diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 8c1c908d2c6e72..877a9bc7f04b93 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -5,19 +5,11 @@ comment "S/390 block device drivers" config DCSSBLK def_tristate m prompt "DCSSBLK support" - depends on S390 && BLOCK && (DAX || DAX=n) + depends on S390 && BLOCK && ZONE_DEVICE + select FS_DAX help Support for dcss block device -config DCSSBLK_DAX - def_bool y - depends on DCSSBLK - # requires S390 ZONE_DEVICE support - depends on BROKEN - prompt "DCSSBLK DAX support" - help - Enable DAX operation for the dcss block device - config DASD def_tristate y prompt "Support for DASD devices" diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 94fa5edecaddf8..86fef4b15015d1 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -79,6 +79,8 @@ struct dcssblk_dev_info { int num_of_segments; struct list_head seg_list; struct dax_device *dax_dev; + struct dev_pagemap pgmap; + void *pgmap_addr; }; struct segment_info { @@ -415,6 +417,8 @@ dcssblk_shared_store(struct device *dev, struct device_attribute *attr, const ch dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); del_gendisk(dev_info->gd); put_disk(dev_info->gd); @@ -537,9 +541,6 @@ static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info) { struct dax_device *dax_dev; - if (!IS_ENABLED(CONFIG_DCSSBLK_DAX)) - return 0; - dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); if (IS_ERR(dax_dev)) return PTR_ERR(dax_dev); @@ -562,6 +563,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char struct dcssblk_dev_info *dev_info; struct segment_info *seg_info, *temp; char *local_buf; + void *addr; unsigned long seg_byte_size; dev_info = NULL; @@ -687,9 +689,26 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; - rc = dcssblk_setup_dax(dev_info); - if (rc) - goto out_dax; + if (!IS_ALIGNED(dev_info->start, SUBSECTION_SIZE) || + !IS_ALIGNED(dev_info->end + 1, SUBSECTION_SIZE)) { + pr_info("DCSS %s is not aligned to %lu bytes, DAX support disabled\n", + local_buf, SUBSECTION_SIZE); + } else { + dev_info->pgmap.type = MEMORY_DEVICE_FS_DAX; + dev_info->pgmap.range.start = dev_info->start; + dev_info->pgmap.range.end = dev_info->end; + dev_info->pgmap.nr_range = 1; + addr = devm_memremap_pages(&dev_info->dev, &dev_info->pgmap); + if (IS_ERR(addr)) { + rc = PTR_ERR(addr); + goto put_dev; + } + dev_info->pgmap_addr = addr; + rc = dcssblk_setup_dax(dev_info); + if (rc) + goto out_dax; + pr_info("DAX support enabled for DCSS %s\n", local_buf); + } get_device(&dev_info->dev); rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); @@ -716,6 +735,8 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char out_dax: kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, 
&dev_info->pgmap); put_dev: list_del(&dev_info->lh); put_disk(dev_info->gd); @@ -801,6 +822,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); del_gendisk(dev_info->gd); put_disk(dev_info->gd); diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile index 81d6744e1861fd..dcbd51152ee3f8 100644 --- a/drivers/s390/char/Makefile +++ b/drivers/s390/char/Makefile @@ -21,6 +21,7 @@ obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \ sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \ sclp_early.o sclp_early_core.o sclp_sd.o +obj-$(CONFIG_MEMORY_HOTPLUG) += sclp_mem.o obj-$(CONFIG_TN3270) += raw3270.o con3270.o obj-$(CONFIG_TN3270_FS) += fs3270.o diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index e069dd6858995e..b26fcf6849f2ad 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -244,24 +244,17 @@ static ssize_t hmcdrv_dev_write(struct file *fp, const char __user *ubuf, size_t len, loff_t *pos) { ssize_t retlen; + void *pdata; pr_debug("writing file '/dev/%pD' at pos. %lld with length %zd\n", fp, (long long) *pos, len); if (!fp->private_data) { /* first expect a cmd write */ - fp->private_data = kmalloc(len + 1, GFP_KERNEL); - - if (!fp->private_data) - return -ENOMEM; - - if (!copy_from_user(fp->private_data, ubuf, len)) { - ((char *)fp->private_data)[len] = '\0'; - return len; - } - - kfree(fp->private_data); - fp->private_data = NULL; - return -EFAULT; + pdata = memdup_user_nul(ubuf, len); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); + fp->private_data = pdata; + return len; } retlen = hmcdrv_dev_transfer((char *) fp->private_data, diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 16469678548f2f..3480198eac0255 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -8,31 +8,46 @@ #define KMSG_COMPONENT "sclp_cmd" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include #include -#include -#include #include +#include +#include #include #include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include #include -#include -#include -#include #include "sclp.h" -#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 -#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 +/* CPU configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 +#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 +/* Channel path configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001 +#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 +#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 + +struct cpu_configure_sccb { + struct sccb_header header; +} __packed __aligned(8); + +struct chp_cfg_sccb { + struct sccb_header header; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; + +struct chp_info_sccb { + struct sccb_header header; + u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; + u8 standby[SCLP_CHP_INFO_MASK_SIZE]; + u8 configured[SCLP_CHP_INFO_MASK_SIZE]; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; static void sclp_sync_callback(struct sclp_req *req, void *data) { @@ -64,13 +79,11 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) request->callback_data = &completion; init_completion(&completion); - /* Perform sclp request. 
*/ rc = sclp_add_request(request); if (rc) goto out; wait_for_completion(&completion); - /* Check response. */ if (request->status != SCLP_REQ_DONE) { pr_warn("sync request failed (cmd=0x%08x, status=0x%02x)\n", cmd, request->status); @@ -81,22 +94,15 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) return rc; } -/* - * CPU configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 -#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 - int _sclp_get_core_info(struct sclp_core_info *info) { - int rc; - int length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE; struct read_cpu_info_sccb *sccb; + int rc, length; if (!SCLP_HAS_CPU_INFO) return -EOPNOTSUPP; + length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE; sccb = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA | __GFP_ZERO, get_order(length)); if (!sccb) return -ENOMEM; @@ -114,14 +120,10 @@ int _sclp_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - free_pages((unsigned long) sccb, get_order(length)); + free_pages((unsigned long)sccb, get_order(length)); return rc; } -struct cpu_configure_sccb { - struct sccb_header header; -} __attribute__((packed, aligned(8))); - static int do_core_configure(sclp_cmdw_t cmd) { struct cpu_configure_sccb *sccb; @@ -130,8 +132,8 @@ static int do_core_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CPU_RECONFIG) return -EOPNOTSUPP; /* - * This is not going to cross a page boundary since we force - * kmalloc to have a minimum alignment of 8 bytes on s390. + * Use kmalloc to have a minimum alignment of 8 bytes and ensure sccb + * is not going to cross a page boundary. */ sccb = kzalloc(sizeof(*sccb), GFP_KERNEL | GFP_DMA); if (!sccb) @@ -165,394 +167,6 @@ int sclp_core_deconfigure(u8 core) return do_core_configure(SCLP_CMDW_DECONFIGURE_CPU | core << 8); } -#ifdef CONFIG_MEMORY_HOTPLUG - -static DEFINE_MUTEX(sclp_mem_mutex); -static LIST_HEAD(sclp_mem_list); -static u8 sclp_max_storage_id; -static DECLARE_BITMAP(sclp_storage_ids, 256); - -struct memory_increment { - struct list_head list; - u16 rn; - int standby; -}; - -struct assign_storage_sccb { - struct sccb_header header; - u16 rn; -} __packed; - -int arch_get_memory_phys_device(unsigned long start_pfn) -{ - if (!sclp.rzm) - return 0; - return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); -} - -static unsigned long long rn2addr(u16 rn) -{ - return (unsigned long long) (rn - 1) * sclp.rzm; -} - -static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) -{ - struct assign_storage_sccb *sccb; - int rc; - - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - return -ENOMEM; - sccb->header.length = PAGE_SIZE; - sccb->rn = rn; - rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0020: - case 0x0120: - break; - default: - pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n", - cmd, sccb->header.response_code, rn); - rc = -EIO; - break; - } -out: - free_page((unsigned long) sccb); - return rc; -} - -static int sclp_assign_storage(u16 rn) -{ - unsigned long long start; - int rc; - - rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); - if (rc) - return rc; - start = rn2addr(rn); - storage_key_init_range(start, start + sclp.rzm); - return 0; -} - -static int sclp_unassign_storage(u16 rn) -{ - return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); -} - -struct attach_storage_sccb { - struct sccb_header header; - u16 :16; - u16 assigned; - u32 :32; - u32 entries[]; -} 
__packed; - -static int sclp_attach_storage(u8 id) -{ - struct attach_storage_sccb *sccb; - int rc; - int i; - - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - return -ENOMEM; - sccb->header.length = PAGE_SIZE; - sccb->header.function_code = 0x40; - rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, - SCLP_QUEUE_INTERVAL); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0020: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (sccb->entries[i]) - sclp_unassign_storage(sccb->entries[i] >> 16); - } - break; - default: - rc = -EIO; - break; - } -out: - free_page((unsigned long) sccb); - return rc; -} - -static int sclp_mem_change_state(unsigned long start, unsigned long size, - int online) -{ - struct memory_increment *incr; - unsigned long long istart; - int rc = 0; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (start + size - 1 < istart) - break; - if (start > istart + sclp.rzm - 1) - continue; - if (online) - rc |= sclp_assign_storage(incr->rn); - else - sclp_unassign_storage(incr->rn); - if (rc == 0) - incr->standby = online ? 0 : 1; - } - return rc ? -EIO : 0; -} - -static bool contains_standby_increment(unsigned long start, unsigned long end) -{ - struct memory_increment *incr; - unsigned long istart; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (end - 1 < istart) - continue; - if (start > istart + sclp.rzm - 1) - continue; - if (incr->standby) - return true; - } - return false; -} - -static int sclp_mem_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - unsigned long start, size; - struct memory_notify *arg; - unsigned char id; - int rc = 0; - - arg = data; - start = arg->start_pfn << PAGE_SHIFT; - size = arg->nr_pages << PAGE_SHIFT; - mutex_lock(&sclp_mem_mutex); - for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) - sclp_attach_storage(id); - switch (action) { - case MEM_GOING_OFFLINE: - /* - * We do not allow to set memory blocks offline that contain - * standby memory. This is done to simplify the "memory online" - * case. - */ - if (contains_standby_increment(start, start + size)) - rc = -EPERM; - break; - case MEM_PREPARE_ONLINE: - /* - * Access the altmap_start_pfn and altmap_nr_pages fields - * within the struct memory_notify specifically when dealing - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - * - * When altmap is in use, take the specified memory range - * online, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - rc = sclp_mem_change_state(start, size, 1); - if (rc || !arg->altmap_nr_pages) - break; - /* - * Set CMMA state to nodat here, since the struct page memory - * at the beginning of the memory block will not go through the - * buddy allocator later. - */ - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); - break; - case MEM_FINISH_OFFLINE: - /* - * When altmap is in use, take the specified memory range - * offline, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - sclp_mem_change_state(start, size, 0); - break; - default: - break; - } - mutex_unlock(&sclp_mem_mutex); - return rc ? 
NOTIFY_BAD : NOTIFY_OK; -} - -static struct notifier_block sclp_mem_nb = { - .notifier_call = sclp_mem_notifier, -}; - -static void __init align_to_block_size(unsigned long long *start, - unsigned long long *size, - unsigned long long alignment) -{ - unsigned long long start_align, size_align; - - start_align = roundup(*start, alignment); - size_align = rounddown(*start + *size, alignment) - start_align; - - pr_info("Standby memory at 0x%llx (%lluM of %lluM usable)\n", - *start, size_align >> 20, *size >> 20); - *start = start_align; - *size = size_align; -} - -static void __init add_memory_merged(u16 rn) -{ - unsigned long long start, size, addr, block_size; - static u16 first_rn, num; - - if (rn && first_rn && (first_rn + num == rn)) { - num++; - return; - } - if (!first_rn) - goto skip_add; - start = rn2addr(first_rn); - size = (unsigned long long) num * sclp.rzm; - if (start >= ident_map_size) - goto skip_add; - if (start + size > ident_map_size) - size = ident_map_size - start; - block_size = memory_block_size_bytes(); - align_to_block_size(&start, &size, block_size); - if (!size) - goto skip_add; - for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size, - cpu_has_edat1() ? - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); -skip_add: - first_rn = rn; - num = 1; -} - -static void __init sclp_add_standby_memory(void) -{ - struct memory_increment *incr; - - list_for_each_entry(incr, &sclp_mem_list, list) - if (incr->standby) - add_memory_merged(incr->rn); - add_memory_merged(0); -} - -static void __init insert_increment(u16 rn, int standby, int assigned) -{ - struct memory_increment *incr, *new_incr; - struct list_head *prev; - u16 last_rn; - - new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); - if (!new_incr) - return; - new_incr->rn = rn; - new_incr->standby = standby; - last_rn = 0; - prev = &sclp_mem_list; - list_for_each_entry(incr, &sclp_mem_list, list) { - if (assigned && incr->rn > rn) - break; - if (!assigned && incr->rn - last_rn > 1) - break; - last_rn = incr->rn; - prev = &incr->list; - } - if (!assigned) - new_incr->rn = last_rn + 1; - if (new_incr->rn > sclp.rnmax) { - kfree(new_incr); - return; - } - list_add(&new_incr->list, prev); -} - -static int __init sclp_detect_standby_memory(void) -{ - struct read_storage_sccb *sccb; - int i, id, assigned, rc; - - if (oldmem_data.start) /* No standby memory in kdump mode */ - return 0; - if ((sclp.facilities & 0xe00000000000ULL) != 0xe00000000000ULL) - return 0; - rc = -ENOMEM; - sccb = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - goto out; - assigned = 0; - for (id = 0; id <= sclp_max_storage_id; id++) { - memset(sccb, 0, PAGE_SIZE); - sccb->header.length = PAGE_SIZE; - rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0010: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 0, 1); - } - break; - case 0x0310: - break; - case 0x0410: - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 1, 1); - } - break; - default: - rc = -EIO; - break; - } - if (!rc) - sclp_max_storage_id = sccb->max_id; - } - if (rc || list_empty(&sclp_mem_list)) - goto out; - for (i = 1; i <= sclp.rnmax - assigned; i++) - insert_increment(0, 1, 0); - rc = register_memory_notifier(&sclp_mem_nb); - if (rc) - 
goto out; - sclp_add_standby_memory(); -out: - free_page((unsigned long) sccb); - return rc; -} -__initcall(sclp_detect_standby_memory); - -#endif /* CONFIG_MEMORY_HOTPLUG */ - -/* - * Channel path configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001 -#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 -#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 - -struct chp_cfg_sccb { - struct sccb_header header; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - static int do_chp_configure(sclp_cmdw_t cmd) { struct chp_cfg_sccb *sccb; @@ -560,8 +174,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CHP_RECONFIG) return -EOPNOTSUPP; - /* Prepare sccb. */ - sccb = (struct chp_cfg_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_cfg_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -581,7 +194,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } @@ -609,16 +222,6 @@ int sclp_chp_deconfigure(struct chp_id chpid) return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8); } -struct chp_info_sccb { - struct sccb_header header; - u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; - u8 standby[SCLP_CHP_INFO_MASK_SIZE]; - u8 configured[SCLP_CHP_INFO_MASK_SIZE]; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - /** * sclp_chp_read_info - perform read channel-path information sclp command * @info: resulting channel-path information data @@ -634,8 +237,7 @@ int sclp_chp_read_info(struct sclp_chp_info *info) if (!SCLP_HAS_CHP_INFO) return -EOPNOTSUPP; - /* Prepare sccb. */ - sccb = (struct chp_info_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_info_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -652,6 +254,6 @@ int sclp_chp_read_info(struct sclp_chp_info *info) memcpy(info->standby, sccb->standby, SCLP_CHP_INFO_MASK_SIZE); memcpy(info->configured, sccb->configured, SCLP_CHP_INFO_MASK_SIZE); out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c new file mode 100644 index 00000000000000..27f49f5fd35849 --- /dev/null +++ b/drivers/s390/char/sclp_mem.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory hotplug support via sclp + * + * Copyright IBM Corp. 
2025 + */ + +#define KMSG_COMPONENT "sclp_mem" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sclp.h" + +#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 +#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 + +static DEFINE_MUTEX(sclp_mem_mutex); +static LIST_HEAD(sclp_mem_list); +static u8 sclp_max_storage_id; +static DECLARE_BITMAP(sclp_storage_ids, 256); + +struct memory_increment { + struct list_head list; + u16 rn; + int standby; +}; + +struct assign_storage_sccb { + struct sccb_header header; + u16 rn; +} __packed; + +struct attach_storage_sccb { + struct sccb_header header; + u16 :16; + u16 assigned; + u32 :32; + u32 entries[]; +} __packed; + +int arch_get_memory_phys_device(unsigned long start_pfn) +{ + if (!sclp.rzm) + return 0; + return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); +} + +static unsigned long rn2addr(u16 rn) +{ + return (unsigned long)(rn - 1) * sclp.rzm; +} + +static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) +{ + struct assign_storage_sccb *sccb; + int rc; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->rn = rn; + rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + case 0x0120: + break; + default: + pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n", + cmd, sccb->header.response_code, rn); + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_assign_storage(u16 rn) +{ + unsigned long start; + int rc; + + rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); + if (rc) + return rc; + start = rn2addr(rn); + storage_key_init_range(start, start + sclp.rzm); + return 0; +} + +static int sclp_unassign_storage(u16 rn) +{ + return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); +} + +static int sclp_attach_storage(u8 id) +{ + struct attach_storage_sccb *sccb; + int rc, i; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->header.function_code = 0x40; + rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, + SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (sccb->entries[i]) + sclp_unassign_storage(sccb->entries[i] >> 16); + } + break; + default: + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_mem_change_state(unsigned long start, unsigned long size, + int online) +{ + struct memory_increment *incr; + unsigned long istart; + int rc = 0; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (start + size - 1 < istart) + break; + if (start > istart + sclp.rzm - 1) + continue; + if (online) + rc |= sclp_assign_storage(incr->rn); + else + sclp_unassign_storage(incr->rn); + if (rc == 0) + incr->standby = online ? 0 : 1; + } + return rc ? 
-EIO : 0; +} + +static bool contains_standby_increment(unsigned long start, unsigned long end) +{ + struct memory_increment *incr; + unsigned long istart; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (end - 1 < istart) + continue; + if (start > istart + sclp.rzm - 1) + continue; + if (incr->standby) + return true; + } + return false; +} + +static int sclp_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long start, size; + struct memory_notify *arg; + unsigned char id; + int rc = 0; + + arg = data; + start = arg->start_pfn << PAGE_SHIFT; + size = arg->nr_pages << PAGE_SHIFT; + mutex_lock(&sclp_mem_mutex); + for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) + sclp_attach_storage(id); + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Do not allow to set memory blocks offline that contain + * standby memory. This is done to simplify the "memory online" + * case. + */ + if (contains_standby_increment(start, start + size)) + rc = -EPERM; + break; + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. + */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); + break; + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + sclp_mem_change_state(start, size, 0); + break; + default: + break; + } + mutex_unlock(&sclp_mem_mutex); + return rc ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block sclp_mem_nb = { + .notifier_call = sclp_mem_notifier, +}; + +static void __init align_to_block_size(unsigned long *start, + unsigned long *size, + unsigned long alignment) +{ + unsigned long start_align, size_align; + + start_align = roundup(*start, alignment); + size_align = rounddown(*start + *size, alignment) - start_align; + + pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", + *start, size_align >> 20, *size >> 20); + *start = start_align; + *size = size_align; +} + +static void __init add_memory_merged(u16 rn) +{ + unsigned long start, size, addr, block_size; + static u16 first_rn, num; + + if (rn && first_rn && (first_rn + num == rn)) { + num++; + return; + } + if (!first_rn) + goto skip_add; + start = rn2addr(first_rn); + size = (unsigned long)num * sclp.rzm; + if (start >= ident_map_size) + goto skip_add; + if (start + size > ident_map_size) + size = ident_map_size - start; + block_size = memory_block_size_bytes(); + align_to_block_size(&start, &size, block_size); + if (!size) + goto skip_add; + for (addr = start; addr < start + size; addr += block_size) { + add_memory(0, addr, block_size, + cpu_has_edat1() ? 
+ MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + } +skip_add: + first_rn = rn; + num = 1; +} + +static void __init sclp_add_standby_memory(void) +{ + struct memory_increment *incr; + + list_for_each_entry(incr, &sclp_mem_list, list) { + if (incr->standby) + add_memory_merged(incr->rn); + } + add_memory_merged(0); +} + +static void __init insert_increment(u16 rn, int standby, int assigned) +{ + struct memory_increment *incr, *new_incr; + struct list_head *prev; + u16 last_rn; + + new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); + if (!new_incr) + return; + new_incr->rn = rn; + new_incr->standby = standby; + last_rn = 0; + prev = &sclp_mem_list; + list_for_each_entry(incr, &sclp_mem_list, list) { + if (assigned && incr->rn > rn) + break; + if (!assigned && incr->rn - last_rn > 1) + break; + last_rn = incr->rn; + prev = &incr->list; + } + if (!assigned) + new_incr->rn = last_rn + 1; + if (new_incr->rn > sclp.rnmax) { + kfree(new_incr); + return; + } + list_add(&new_incr->list, prev); +} + +static int __init sclp_detect_standby_memory(void) +{ + struct read_storage_sccb *sccb; + int i, id, assigned, rc; + + /* No standby memory in kdump mode */ + if (oldmem_data.start) + return 0; + if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) + return 0; + rc = -ENOMEM; + sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + goto out; + assigned = 0; + for (id = 0; id <= sclp_max_storage_id; id++) { + memset(sccb, 0, PAGE_SIZE); + sccb->header.length = PAGE_SIZE; + rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0010: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + insert_increment(sccb->entries[i] >> 16, 0, 1); + } + break; + case 0x0310: + break; + case 0x0410: + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + insert_increment(sccb->entries[i] >> 16, 1, 1); + } + break; + default: + rc = -EIO; + break; + } + if (!rc) + sclp_max_storage_id = sccb->max_id; + } + if (rc || list_empty(&sclp_mem_list)) + goto out; + for (i = 1; i <= sclp.rnmax - assigned; i++) + insert_increment(0, 1, 0); + rc = register_memory_notifier(&sclp_mem_nb); + if (rc) + goto out; + sclp_add_standby_memory(); +out: + free_page((unsigned long)sccb); + return rc; +} +__initcall(sclp_detect_standby_memory); diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c index a1bafaf73f87ac..2a2931d303cb94 100644 --- a/drivers/s390/char/tape_3590.c +++ b/drivers/s390/char/tape_3590.c @@ -1671,7 +1671,7 @@ tape_3590_init(void) DBF_EVENT(3, "3590 init\n"); - tape_3590_wq = alloc_workqueue("tape_3590", 0, 0); + tape_3590_wq = alloc_workqueue("tape_3590", WQ_PERCPU, 0); if (!tape_3590_wq) return -ENOMEM; diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 3bf09a89a08940..e92e2fd8ce5da0 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -1405,7 +1405,9 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, /* Step 3: import the encrypted key value as a new key */ rc = ep11_unwrapkey(card, domain, kek, keklen, encbuf, encbuflen, 0, def_iv, - keybitsize, 0, keybuf, keybufsize, keytype, xflags); + keybitsize, keygenflags, + keybuf, keybufsize, + keytype, xflags); if (rc) { ZCRYPT_DBF_ERR("%s importing key value as new key failed, rc=%d\n", __func__, rc); diff 
--git a/drivers/soc/fsl/qe/qmc.c b/drivers/soc/fsl/qe/qmc.c index 36c0ccc06151f3..da5ea6d3561840 100644 --- a/drivers/soc/fsl/qe/qmc.c +++ b/drivers/soc/fsl/qe/qmc.c @@ -461,9 +461,16 @@ int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, ctrl = qmc_read16(&bd->cbd_sc); if (ctrl & (QMC_BD_TX_R | QMC_BD_TX_UB)) { - /* We are full ... */ - ret = -EBUSY; - goto end; + if (!(ctrl & (QMC_BD_TX_R | QMC_BD_TX_I)) && bd == chan->txbd_done) { + if (ctrl & QMC_BD_TX_W) + chan->txbd_done = chan->txbds; + else + chan->txbd_done++; + } else { + /* We are full ... */ + ret = -EBUSY; + goto end; + } } qmc_write16(&bd->cbd_datlen, length); @@ -475,6 +482,10 @@ int qmc_chan_write_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, /* Activate the descriptor */ ctrl |= (QMC_BD_TX_R | QMC_BD_TX_UB); + if (complete) + ctrl |= QMC_BD_TX_I; + else + ctrl &= ~QMC_BD_TX_I; wmb(); /* Be sure to flush the descriptor before control update */ qmc_write16(&bd->cbd_sc, ctrl); @@ -569,9 +580,16 @@ int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, ctrl = qmc_read16(&bd->cbd_sc); if (ctrl & (QMC_BD_RX_E | QMC_BD_RX_UB)) { - /* We are full ... */ - ret = -EBUSY; - goto end; + if (!(ctrl & (QMC_BD_RX_E | QMC_BD_RX_I)) && bd == chan->rxbd_done) { + if (ctrl & QMC_BD_RX_W) + chan->rxbd_done = chan->rxbds; + else + chan->rxbd_done++; + } else { + /* We are full ... */ + ret = -EBUSY; + goto end; + } } qmc_write16(&bd->cbd_datlen, 0); /* data length is updated by the QMC */ @@ -587,6 +605,10 @@ int qmc_chan_read_submit(struct qmc_chan *chan, dma_addr_t addr, size_t length, /* Activate the descriptor */ ctrl |= (QMC_BD_RX_E | QMC_BD_RX_UB); + if (complete) + ctrl |= QMC_BD_RX_I; + else + ctrl &= ~QMC_BD_RX_I; wmb(); /* Be sure to flush data before descriptor activation */ qmc_write16(&bd->cbd_sc, ctrl); @@ -1482,19 +1504,19 @@ static int qmc_setup_chan(struct qmc *qmc, struct qmc_chan *chan) /* Init Rx BDs and set Wrap bit on last descriptor */ BUILD_BUG_ON(QMC_NB_RXBDS == 0); - val = QMC_BD_RX_I; for (i = 0; i < QMC_NB_RXBDS; i++) { bd = chan->rxbds + i; - qmc_write16(&bd->cbd_sc, val); + qmc_write16(&bd->cbd_sc, 0); } bd = chan->rxbds + QMC_NB_RXBDS - 1; - qmc_write16(&bd->cbd_sc, val | QMC_BD_RX_W); + qmc_write16(&bd->cbd_sc, QMC_BD_RX_W); /* Init Tx BDs and set Wrap bit on last descriptor */ BUILD_BUG_ON(QMC_NB_TXBDS == 0); - val = QMC_BD_TX_I; if (chan->mode == QMC_HDLC) - val |= QMC_BD_TX_L | QMC_BD_TX_TC; + val = QMC_BD_TX_L | QMC_BD_TX_TC; + else + val = 0; for (i = 0; i < QMC_NB_TXBDS; i++) { bd = chan->txbds + i; qmc_write16(&bd->cbd_sc, val); diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index 9392c2c43cc80e..c0fc54c3cd35e4 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -96,6 +96,7 @@ config ARCH_TEGRA_210_SOC config ARCH_TEGRA_186_SOC bool "NVIDIA Tegra186 SoC" depends on !CPU_BIG_ENDIAN + select PINCTRL_TEGRA186 select MAILBOX select SOC_TEGRA_PMC help diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 4fd5cac799c547..55c1db81653400 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1360,6 +1360,18 @@ int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base) } EXPORT_SYMBOL(sdw_slave_get_scale_index); +int sdw_slave_get_current_bank(struct sdw_slave *slave) +{ + int tmp; + + tmp = sdw_read(slave, SDW_SCP_CTRL); + if (tmp < 0) + return tmp; + + return FIELD_GET(SDW_SCP_STAT_CURR_BANK, tmp); +} +EXPORT_SYMBOL_GPL(sdw_slave_get_current_bank); + static int 
sdw_slave_set_frequency(struct sdw_slave *slave) { int scale_index; diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index d2d99555ec5a50..3d4d00188c26cc 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -273,4 +273,10 @@ int sdw_of_find_slaves(struct sdw_bus *bus) return 0; } +struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(of_sdw_find_device_by_node); + MODULE_IMPORT_NS("SND_SOC_SDCA"); diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 891729c9c5642a..e8a39e304c7eb6 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -99,6 +99,16 @@ config SPI_AMLOGIC_SPIFC_A1 This enables master mode support for the SPIFC (SPI flash controller) available in Amlogic A1 (A113L SoC). +config SPI_AMLOGIC_SPIFC_A4 + tristate "Amlogic A4 SPI Flash controller" + depends on ARCH_MESON || COMPILE_TEST + select REGMAP_MMIO + help + This enables SPI mode on the NAND Flash Controller of Amlogic + ARM SoCs. It supports SPI NOR Flash and SPI NAND Flash (optionally + using the host ECC HW engine). The controller implements the + SPI-MEM interface; it doesn't support generic SPI. + config SPI_AMLOGIC_SPISG tristate "Amlogic SPISG controller" depends on COMMON_CLK @@ -916,7 +926,8 @@ config SPI_ROCKCHIP_SFC config SPI_RB4XX tristate "Mikrotik RB4XX SPI master" - depends on SPI_MASTER && ATH79 + depends on SPI_MASTER && (ATH79 || COMPILE_TEST) + depends on OF help SPI controller driver for the Mikrotik RB4xx series boards. @@ -1224,6 +1235,17 @@ config SPI_UNIPHIER If your SoC supports SCSSI, say Y here. +config SPI_VIRTIO + tristate "Virtio SPI Controller" + depends on SPI_MASTER && VIRTIO + help + If you say yes to this option, support will be included for the virtio + SPI controller driver. The hardware can be emulated by any device-model + software that implements the virtio protocol. + + This driver can also be built as a module. If so, the module + will be called spi-virtio.
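[Editor's note, not part of the patch: the SPI_AMLOGIC_SPIFC_A4 entry above says the controller implements the SPI-MEM interface rather than generic SPI. As a minimal sketch of what that means for a consumer, assuming the standard spi-mem API from <linux/spi/spi-mem.h>, a flash access is described as one whole command/address/dummy/data operation instead of a stream of spi_transfer segments. The opcode and layout below are illustrative, and example_quad_read() is a hypothetical helper:

	#include <linux/spi/spi-mem.h>

	/*
	 * Hypothetical quad-output page read: opcode 0x6b, 3 address bytes,
	 * one dummy byte (8 cycles on a single line), data in on four lines.
	 * A SPI-MEM-only controller can execute an op like this, but cannot
	 * run arbitrary full-duplex transfers.
	 */
	static int example_quad_read(struct spi_mem *mem, u32 addr,
				     void *buf, size_t len)
	{
		struct spi_mem_op op =
			SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),
				   SPI_MEM_OP_ADDR(3, addr, 1),
				   SPI_MEM_OP_DUMMY(1, 1),
				   SPI_MEM_OP_DATA_IN(len, buf, 4));
		int ret;

		/* Let the controller clamp the data size it can handle. */
		ret = spi_mem_adjust_op_size(mem, &op);
		if (ret)
			return ret;

		return spi_mem_exec_op(mem, &op);
	}

Since spi_mem_adjust_op_size() may shrink op.data.nbytes, a real caller would loop, advancing addr and buf, until all len bytes have been read.]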
+ config SPI_XCOMM tristate "Analog Devices AD-FMCOMMS1-EBZ SPI-I2C-bridge driver" depends on I2C diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index 062c85989c8c96..8ff74a13faaa88 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SPI_ALTERA) += spi-altera-platform.o obj-$(CONFIG_SPI_ALTERA_CORE) += spi-altera-core.o obj-$(CONFIG_SPI_ALTERA_DFL) += spi-altera-dfl.o obj-$(CONFIG_SPI_AMLOGIC_SPIFC_A1) += spi-amlogic-spifc-a1.o +obj-$(CONFIG_SPI_AMLOGIC_SPIFC_A4) += spi-amlogic-spifc-a4.o obj-$(CONFIG_SPI_AMLOGIC_SPISG) += spi-amlogic-spisg.o obj-$(CONFIG_SPI_APPLE) += spi-apple.o obj-$(CONFIG_SPI_AR934X) += spi-ar934x.o @@ -158,6 +159,7 @@ spi-thunderx-objs := spi-cavium.o spi-cavium-thunderx.o obj-$(CONFIG_SPI_THUNDERX) += spi-thunderx.o obj-$(CONFIG_SPI_TOPCLIFF_PCH) += spi-topcliff-pch.o obj-$(CONFIG_SPI_UNIPHIER) += spi-uniphier.o +obj-$(CONFIG_SPI_VIRTIO) += spi-virtio.o obj-$(CONFIG_SPI_XCOMM) += spi-xcomm.o obj-$(CONFIG_SPI_XILINX) += spi-xilinx.o obj-$(CONFIG_SPI_XLP) += spi-xlp.o diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index 4e9bfd26aa80b7..d7a3d85d00c2f3 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -63,6 +63,7 @@ #define SAMA7G5_QSPI0_MAX_SPEED_HZ 200000000 #define SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ 133000000 +#define SAM9X7_QSPI_MAX_SPEED_HZ 100000000 /* Bitfields in QSPI_CR (Control Register) */ #define QSPI_CR_QSPIEN BIT(0) @@ -262,6 +263,9 @@ struct atmel_qspi_caps { bool has_ricr; bool octal; bool has_dma; + bool has_2xgclk; + bool has_padcalib; + bool has_dllon; }; struct atmel_qspi_ops; @@ -1027,13 +1031,25 @@ static int atmel_qspi_set_pad_calibration(struct atmel_qspi *aq) aq, QSPI_PCALCFG); /* DLL On + start calibration. */ - atmel_qspi_write(QSPI_CR_DLLON | QSPI_CR_STPCAL, aq, QSPI_CR); + if (aq->caps->has_dllon) + atmel_qspi_write(QSPI_CR_DLLON | QSPI_CR_STPCAL, aq, QSPI_CR); + /* If there is no DLL support only start calibration. */ + else + atmel_qspi_write(QSPI_CR_STPCAL, aq, QSPI_CR); - /* Check synchronization status before updating configuration. */ - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - (val & QSPI_SR2_DLOCK) && - !(val & QSPI_SR2_CALBSY), 40, - ATMEL_QSPI_TIMEOUT); + /* + * Check DLL clock lock and synchronization status before updating + * configuration. 
+ */ + if (aq->caps->has_dllon) + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + (val & QSPI_SR2_DLOCK) && + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); + else + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); /* Refresh analog blocks every 1 ms. */ atmel_qspi_write(FIELD_PREP(QSPI_REFRESH_DELAY_COUNTER, @@ -1049,23 +1065,28 @@ static int atmel_qspi_set_gclk(struct atmel_qspi *aq) int ret; /* Disable DLL before setting GCLK */ - status = atmel_qspi_read(aq, QSPI_SR2); - if (status & QSPI_SR2_DLOCK) { - atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + if (aq->caps->has_dllon) { + status = atmel_qspi_read(aq, QSPI_SR2); + if (status & QSPI_SR2_DLOCK) { + atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_DLOCK), 40, + ATMEL_QSPI_TIMEOUT); + if (ret) + return ret; + } - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_DLOCK), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; + if (aq->target_max_speed_hz > QSPI_DLLCFG_THRESHOLD_FREQ) + atmel_qspi_write(QSPI_DLLCFG_RANGE, aq, QSPI_DLLCFG); + else + atmel_qspi_write(0, aq, QSPI_DLLCFG); } - if (aq->target_max_speed_hz > QSPI_DLLCFG_THRESHOLD_FREQ) + atmel_qspi_write(QSPI_DLLCFG_RANGE, aq, QSPI_DLLCFG); + if (aq->caps->has_2xgclk) + ret = clk_set_rate(aq->gclk, 2 * aq->target_max_speed_hz); else - atmel_qspi_write(0, aq, QSPI_DLLCFG); + ret = clk_set_rate(aq->gclk, aq->target_max_speed_hz); - ret = clk_set_rate(aq->gclk, aq->target_max_speed_hz); if (ret) { dev_err(&aq->pdev->dev, "Failed to set generic clock rate.\n"); return ret; @@ -1088,11 +1109,16 @@ static int atmel_qspi_sama7g5_init(struct atmel_qspi *aq) if (ret) return ret; - if (aq->caps->octal) { + /* + * Check if the SoC supports pad calibration in Octal SPI mode. + * Proceed only if both capabilities are set.
+ */ + if (aq->caps->octal && aq->caps->has_padcalib) { ret = atmel_qspi_set_pad_calibration(aq); if (ret) return ret; - } else { + /* Turn the DLL on only if the SoC supports it */ + } else if (aq->caps->has_dllon) { atmel_qspi_write(QSPI_CR_DLLON, aq, QSPI_CR); ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, (val & QSPI_SR2_DLOCK), 40, @@ -1458,19 +1484,19 @@ static int atmel_qspi_sama7g5_suspend(struct atmel_qspi *aq) clk_disable_unprepare(aq->gclk); - atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_DLOCK), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; - - ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, - !(val & QSPI_SR2_CALBSY), 40, - ATMEL_QSPI_TIMEOUT); - if (ret) - return ret; + if (aq->caps->has_dllon) { + atmel_qspi_write(QSPI_CR_DLLOFF, aq, QSPI_CR); + ret = readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_DLOCK), 40, + ATMEL_QSPI_TIMEOUT); + if (ret) + return ret; + } + if (aq->caps->has_padcalib) + return readl_poll_timeout(aq->regs + QSPI_SR2, val, + !(val & QSPI_SR2_CALBSY), 40, + ATMEL_QSPI_TIMEOUT); return 0; } @@ -1602,17 +1628,48 @@ static const struct atmel_qspi_caps atmel_sam9x60_qspi_caps = { .has_ricr = true, }; +static const struct atmel_qspi_caps atmel_sam9x7_ospi_caps = { + .max_speed_hz = SAM9X7_QSPI_MAX_SPEED_HZ, + .has_gclk = true, + .octal = true, + .has_dma = true, + .has_2xgclk = true, + .has_padcalib = false, + .has_dllon = false, +}; + +static const struct atmel_qspi_caps atmel_sama7d65_ospi_caps = { + .max_speed_hz = SAMA7G5_QSPI0_MAX_SPEED_HZ, + .has_gclk = true, + .octal = true, + .has_dma = true, + .has_2xgclk = true, + .has_padcalib = true, + .has_dllon = false, +}; + +static const struct atmel_qspi_caps atmel_sama7d65_qspi_caps = { + .max_speed_hz = SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ, + .has_gclk = true, + .has_dma = true, + .has_2xgclk = true, + .has_dllon = false, +}; + static const struct atmel_qspi_caps atmel_sama7g5_ospi_caps = { .max_speed_hz = SAMA7G5_QSPI0_MAX_SPEED_HZ, .has_gclk = true, .octal = true, .has_dma = true, + .has_padcalib = true, + .has_dllon = true, }; static const struct atmel_qspi_caps atmel_sama7g5_qspi_caps = { .max_speed_hz = SAMA7G5_QSPI1_SDR_MAX_SPEED_HZ, .has_gclk = true, .has_dma = true, + .has_dllon = true, }; static const struct of_device_id atmel_qspi_dt_ids[] = { @@ -1632,6 +1689,19 @@ static const struct of_device_id atmel_qspi_dt_ids[] = { .compatible = "microchip,sama7g5-qspi", .data = &atmel_sama7g5_qspi_caps, }, + { + .compatible = "microchip,sam9x7-ospi", + .data = &atmel_sam9x7_ospi_caps, + }, + { + .compatible = "microchip,sama7d65-ospi", + .data = &atmel_sama7d65_ospi_caps, + }, + { + .compatible = "microchip,sama7d65-qspi", + .data = &atmel_sama7d65_qspi_caps, + }, + { /* sentinel */ } }; diff --git a/drivers/spi/spi-altera-platform.c b/drivers/spi/spi-altera-platform.c index 585393802e9f9f..e163774fd65b49 100644 --- a/drivers/spi/spi-altera-platform.c +++ b/drivers/spi/spi-altera-platform.c @@ -30,7 +30,6 @@ static const struct regmap_config spi_altera_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .fast_io = true, }; static int altera_spi_probe(struct platform_device *pdev) diff --git a/drivers/spi/spi-amd-pci.c b/drivers/spi/spi-amd-pci.c index e5faab414c1726..d48c3a5da303d9 100644 --- a/drivers/spi/spi-amd-pci.c +++ b/drivers/spi/spi-amd-pci.c @@ -38,7 +38,7 @@ static int amd_spi_pci_probe(struct pci_dev *pdev, /* Allocate storage for host and driver private data */ host =
devm_spi_alloc_host(dev, sizeof(struct amd_spi)); if (!host) - return dev_err_probe(dev, -ENOMEM, "Error allocating SPI host\n"); + return -ENOMEM; amd_spi = spi_controller_get_devdata(host); @@ -47,8 +47,7 @@ amd_spi->io_remap_addr = devm_ioremap(dev, io_base_addr, AMD_HID2_MEM_SIZE); if (!amd_spi->io_remap_addr) - return dev_err_probe(dev, -ENOMEM, - "ioremap of SPI registers failed\n"); + return -ENOMEM; dev_dbg(dev, "io_remap_address: %p\n", amd_spi->io_remap_addr); diff --git a/drivers/spi/spi-amd.c b/drivers/spi/spi-amd.c index 02e7fe095a0b55..4d1dce4f497406 100644 --- a/drivers/spi/spi-amd.c +++ b/drivers/spi/spi-amd.c @@ -857,7 +857,7 @@ static int amd_spi_probe(struct platform_device *pdev) /* Allocate storage for host and driver private data */ host = devm_spi_alloc_host(dev, sizeof(struct amd_spi)); if (!host) - return dev_err_probe(dev, -ENOMEM, "Error allocating SPI host\n"); + return -ENOMEM; amd_spi = spi_controller_get_devdata(host); amd_spi->io_remap_addr = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/spi/spi-amlogic-spifc-a4.c b/drivers/spi/spi-amlogic-spifc-a4.c new file mode 100644 index 00000000000000..4338d00e56a6e8 --- /dev/null +++ b/drivers/spi/spi-amlogic-spifc-a4.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR MIT) +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + * + * Driver for the SPI Mode of Amlogic Flash Controller + * Authors: + * Liang Yang + * Feng Chen + * Xianwei Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SFC_CMD 0x00 +#define SFC_CFG 0x04 +#define SFC_DADR 0x08 +#define SFC_IADR 0x0c +#define SFC_BUF 0x10 +#define SFC_INFO 0x14 +#define SFC_DC 0x18 +#define SFC_ADR 0x1c +#define SFC_DL 0x20 +#define SFC_DH 0x24 +#define SFC_CADR 0x28 +#define SFC_SADR 0x2c +#define SFC_RX_IDX 0x34 +#define SFC_RX_DAT 0x38 +#define SFC_SPI_CFG 0x40 + +/* settings in SFC_CMD */ + +/* 4 bits drive 4 chip selects (high deselects, low selects), but SPI mode supports only 2 */ +#define CHIP_SELECT_MASK GENMASK(13, 10) +#define CS_NONE 0xf +#define CS_0 0xe +#define CS_1 0xd + +#define CLE (0x5 << 14) +#define ALE (0x6 << 14) +#define DWR (0x4 << 14) +#define DRD (0x8 << 14) +#define DUMMY (0xb << 14) +#define IDLE (0xc << 14) +#define IDLE_CYCLE_MASK GENMASK(9, 0) +#define EXT_CYCLE_MASK GENMASK(9, 0) + +#define OP_M2N ((0 << 17) | (2 << 20)) +#define OP_N2M ((1 << 17) | (2 << 20)) +#define OP_STS ((3 << 17) | (2 << 20)) +#define OP_ADL ((0 << 16) | (3 << 20)) +#define OP_ADH ((1 << 16) | (3 << 20)) +#define OP_AIL ((2 << 16) | (3 << 20)) +#define OP_AIH ((3 << 16) | (3 << 20)) +#define OP_ASL ((4 << 16) | (3 << 20)) +#define OP_ASH ((5 << 16) | (3 << 20)) +#define OP_SEED ((8 << 16) | (3 << 20)) +#define SEED_MASK GENMASK(14, 0) +#define ENABLE_RANDOM BIT(19) + +#define CMD_COMMAND(cs_sel, cmd) (CLE | ((cs_sel) << 10) | (cmd)) +#define CMD_ADDR(cs_sel, addr) (ALE | ((cs_sel) << 10) | (addr)) +#define CMD_DUMMY(cs_sel, cyc) (DUMMY | ((cs_sel) << 10) | ((cyc) & EXT_CYCLE_MASK)) +#define CMD_IDLE(cs_sel, cyc) (IDLE | ((cs_sel) << 10) | ((cyc) & IDLE_CYCLE_MASK)) +#define CMD_MEM2NAND(bch, pages) (OP_M2N | ((bch) << 14) | (pages)) +#define CMD_NAND2MEM(bch, pages) (OP_N2M | ((bch) << 14) | (pages)) +#define CMD_DATA_ADDRL(addr) (OP_ADL | ((addr) & 0xffff)) +#define CMD_DATA_ADDRH(addr) (OP_ADH | (((addr) >> 16) & 0xffff)) +#define CMD_INFO_ADDRL(addr) (OP_AIL | ((addr) & 0xffff)) +#define
CMD_INFO_ADDRH(addr) (OP_AIH | (((addr) >> 16) & 0xffff)) +#define CMD_SEED(seed) (OP_SEED | ((seed) & SEED_MASK)) + +#define GET_CMD_SIZE(x) (((x) >> 22) & GENMASK(4, 0)) + +#define DEFAULT_PULLUP_CYCLE 2 +#define CS_SETUP_CYCLE 1 +#define CS_HOLD_CYCLE 2 +#define DEFAULT_BUS_CYCLE 4 + +#define RAW_SIZE GENMASK(13, 0) +#define RAW_SIZE_BW 14 + +#define DMA_ADDR_ALIGN 8 + +/* Bit fields in SFC_SPI_CFG */ +#define SPI_MODE_EN BIT(31) +#define RAW_EXT_SIZE GENMASK(29, 18) +#define ADDR_LANE GENMASK(17, 16) +#define CPOL BIT(15) +#define CPHA BIT(14) +#define EN_HOLD BIT(13) +#define EN_WP BIT(12) +#define TXADJ GENMASK(11, 8) +#define RXADJ GENMASK(7, 4) +#define CMD_LANE GENMASK(3, 2) +#define DATA_LANE GENMASK(1, 0) +#define LANE_MAX 0x3 + +/* raw ext size[25:14] + raw size[13:0] */ +#define RAW_MAX_RW_SIZE_MASK GENMASK(25, 0) + +/* Ecc fields */ +#define ECC_COMPLETE BIT(31) +#define ECC_UNCORRECTABLE 0x3f +#define ECC_ERR_CNT(x) (((x) >> 24) & 0x3f) +#define ECC_ZERO_CNT(x) (((x) >> 16) & 0x3f) + +#define ECC_BCH8_512 1 +#define ECC_BCH8_1K 2 +#define ECC_BCH8_PARITY_BYTES 14 +#define ECC_BCH8_USER_BYTES 2 +#define ECC_BCH8_INFO_BYTES (ECC_BCH8_USER_BYTES + ECC_BCH8_PARITY_BYTES) +#define ECC_BCH8_STRENGTH 8 +#define ECC_BCH8_DEFAULT_STEP 512 +#define ECC_DEFAULT_BCH_MODE ECC_BCH8_512 +#define ECC_PER_INFO_BYTE 8 +#define ECC_PATTERN 0x5a +#define ECC_BCH_MAX_SECT_SIZE 63 +/* soft flags for sfc */ +#define SFC_HWECC BIT(0) +#define SFC_DATA_RANDOM BIT(1) +#define SFC_DATA_ONLY BIT(2) +#define SFC_OOB_ONLY BIT(3) +#define SFC_DATA_OOB BIT(4) +#define SFC_AUTO_OOB BIT(5) +#define SFC_RAW_RW BIT(6) +#define SFC_XFER_MDOE_MASK GENMASK(6, 2) + +#define SFC_DATABUF_SIZE 8192 +#define SFC_INFOBUF_SIZE 256 +#define SFC_BUF_SIZE (SFC_DATABUF_SIZE + SFC_INFOBUF_SIZE) + +/* !!! 
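The frequency bounds below reflect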
PCB and SPI-NAND chip limitations */ +#define SFC_MAX_FREQUENCY (250 * 1000 * 1000) +#define SFC_MIN_FREQUENCY (4 * 1000 * 1000) +#define SFC_BUS_DEFAULT_CLK 40000000 +#define SFC_MAX_CS_NUM 2 + +/* SPI-FLASH R/W operation cmd */ +#define SPIFLASH_RD_OCTALIO 0xcb +#define SPIFLASH_RD_OCTAL 0x8b +#define SPIFLASH_RD_QUADIO 0xeb +#define SPIFLASH_RD_QUAD 0x6b +#define SPIFLASH_RD_DUALIO 0xbb +#define SPIFLASH_RD_DUAL 0x3b +#define SPIFLASH_RD_FAST 0x0b +#define SPIFLASH_RD 0x03 +#define SPIFLASH_WR_OCTALIO 0xC2 +#define SPIFLASH_WR_OCTAL 0x82 +#define SPIFLASH_WR_QUAD 0x32 +#define SPIFLASH_WR 0x02 +#define SPIFLASH_UP_QUAD 0x34 +#define SPIFLASH_UP 0x84 + +struct aml_sfc_ecc_cfg { + u32 stepsize; + u32 nsteps; + u32 strength; + u32 oobsize; + u32 bch; +}; + +struct aml_ecc_stats { + u32 corrected; + u32 bitflips; + u32 failed; +}; + +struct aml_sfc_caps { + struct aml_sfc_ecc_cfg *ecc_caps; + u32 num_ecc_caps; +}; + +struct aml_sfc { + struct device *dev; + struct clk *gate_clk; + struct clk *core_clk; + struct spi_controller *ctrl; + struct regmap *regmap_base; + const struct aml_sfc_caps *caps; + struct nand_ecc_engine ecc_eng; + struct aml_ecc_stats ecc_stats; + dma_addr_t daddr; + dma_addr_t iaddr; + u32 info_bytes; + u32 bus_rate; + u32 flags; + u32 rx_adj; + u32 cs_sel; + u8 *data_buf; + __le64 *info_buf; + u8 *priv; +}; + +#define AML_ECC_DATA(sz, s, b) { .stepsize = (sz), .strength = (s), .bch = (b) } + +static struct aml_sfc_ecc_cfg aml_a113l2_ecc_caps[] = { + AML_ECC_DATA(512, 8, ECC_BCH8_512), + AML_ECC_DATA(1024, 8, ECC_BCH8_1K), +}; + +static const struct aml_sfc_caps aml_a113l2_sfc_caps = { + .ecc_caps = aml_a113l2_ecc_caps, + .num_ecc_caps = ARRAY_SIZE(aml_a113l2_ecc_caps) +}; + +static struct aml_sfc *nand_to_aml_sfc(struct nand_device *nand) +{ + struct nand_ecc_engine *eng = nand->ecc.engine; + + return container_of(eng, struct aml_sfc, ecc_eng); +} + +static inline void *aml_sfc_to_ecc_ctx(struct aml_sfc *sfc) +{ + return sfc->priv; +} + +static int aml_sfc_wait_cmd_finish(struct aml_sfc *sfc, u64 timeout_ms) +{ + u32 cmd_size = 0; + int ret; + + /* + * The SPINAND flash controller employs a two-stage pipeline: + * 1) command prefetch; 2) command execution. + * + * All commands are stored in the FIFO, with one prefetched for execution. + * + * There are cases where the FIFO is detected as empty, yet a command may + * still be in execution and a prefetched command pending execution. + * + * So, send two idle commands to ensure all previous commands have + * been executed. + */ + regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, 0)); + regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, 0)); + + /* Wait for the FIFO to empty. 
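GET_CMD_SIZE() reads back, from bits [26:22] of SFC_CMD, the number of commands still queued, so a zero count means both pipeline stages have drained.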
*/ + ret = regmap_read_poll_timeout(sfc->regmap_base, SFC_CMD, cmd_size, + !GET_CMD_SIZE(cmd_size), + 10, timeout_ms * 1000); + if (ret) + dev_err(sfc->dev, "wait for empty CMD FIFO time out\n"); + + return ret; +} + +static int aml_sfc_pre_transfer(struct aml_sfc *sfc, u32 idle_cycle, u32 cs2clk_cycle) +{ + int ret; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(CS_NONE, idle_cycle)); + if (ret) + return ret; + + return regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, cs2clk_cycle)); +} + +static int aml_sfc_end_transfer(struct aml_sfc *sfc, u32 clk2cs_cycle) +{ + int ret; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_IDLE(sfc->cs_sel, clk2cs_cycle)); + if (ret) + return ret; + + return aml_sfc_wait_cmd_finish(sfc, 0); +} + +static int aml_sfc_set_bus_width(struct aml_sfc *sfc, u8 buswidth, u32 mask) +{ + int i; + u32 conf = 0; + + for (i = 0; i <= LANE_MAX; i++) { + if (buswidth == 1 << i) { + conf = i << __bf_shf(mask); + return regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, + mask, conf); + } + } + + return 0; +} + +static int aml_sfc_send_cmd(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + int i, ret; + u8 val; + + ret = aml_sfc_set_bus_width(sfc, op->cmd.buswidth, CMD_LANE); + if (ret) + return ret; + + for (i = 0; i < op->cmd.nbytes; i++) { + val = (op->cmd.opcode >> ((op->cmd.nbytes - i - 1) * 8)) & 0xff; + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_COMMAND(sfc->cs_sel, val)); + if (ret) + return ret; + } + + return 0; +} + +static int aml_sfc_send_addr(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + int i, ret; + u8 val; + + ret = aml_sfc_set_bus_width(sfc, op->addr.buswidth, ADDR_LANE); + if (ret) + return ret; + + for (i = 0; i < op->addr.nbytes; i++) { + val = (op->addr.val >> ((op->addr.nbytes - i - 1) * 8)) & 0xff; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, CMD_ADDR(sfc->cs_sel, val)); + if (ret) + return ret; + } + + return 0; +} + +static bool aml_sfc_is_xio_op(const struct spi_mem_op *op) +{ + switch (op->cmd.opcode) { + case SPIFLASH_RD_OCTALIO: + case SPIFLASH_RD_QUADIO: + case SPIFLASH_RD_DUALIO: + return true; + default: + break; + } + + return false; +} + +static int aml_sfc_send_cmd_addr_dummy(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + u32 dummy_cycle, cmd; + int ret; + + ret = aml_sfc_send_cmd(sfc, op); + if (ret) + return ret; + + ret = aml_sfc_send_addr(sfc, op); + if (ret) + return ret; + + if (op->dummy.nbytes) { + /* Dummy buswidth configuration is not supported */ + if (aml_sfc_is_xio_op(op)) + dummy_cycle = op->dummy.nbytes * 8 / op->data.buswidth; + else + dummy_cycle = op->dummy.nbytes * 8; + cmd = CMD_DUMMY(sfc->cs_sel, dummy_cycle - 1); + return regmap_write(sfc->regmap_base, SFC_CMD, cmd); + } + + return 0; +} + +static bool aml_sfc_is_snand_hwecc_page_op(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + switch (op->cmd.opcode) { + /* SPINAND read from cache cmd */ + case SPIFLASH_RD_QUADIO: + case SPIFLASH_RD_QUAD: + case SPIFLASH_RD_DUALIO: + case SPIFLASH_RD_DUAL: + case SPIFLASH_RD_FAST: + case SPIFLASH_RD: + /* SPINAND write to cache cmd */ + case SPIFLASH_WR_QUAD: + case SPIFLASH_WR: + case SPIFLASH_UP_QUAD: + case SPIFLASH_UP: + if (sfc->flags & SFC_HWECC) + return true; + else + return false; + default: + break; + } + + return false; +} + +static int aml_sfc_dma_buffer_setup(struct aml_sfc *sfc, void *databuf, + int datalen, void *infobuf, int infolen, + enum dma_data_direction dir) +{ + u32 cmd = 0; + int ret; + + sfc->daddr = dma_map_single(sfc->dev, databuf, 
datalen, dir); + ret = dma_mapping_error(sfc->dev, sfc->daddr); + if (ret) { + dev_err(sfc->dev, "DMA mapping error\n"); + return ret; + } + + cmd = CMD_DATA_ADDRL(sfc->daddr); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto out_map_data; + + cmd = CMD_DATA_ADDRH(sfc->daddr); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto out_map_data; + + if (infobuf) { + sfc->iaddr = dma_map_single(sfc->dev, infobuf, infolen, dir); + ret = dma_mapping_error(sfc->dev, sfc->iaddr); + if (ret) { + dev_err(sfc->dev, "DMA mapping error\n"); + goto out_map_data; + } + + sfc->info_bytes = infolen; + cmd = CMD_INFO_ADDRL(sfc->iaddr); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto out_map_info; + + cmd = CMD_INFO_ADDRH(sfc->iaddr); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto out_map_info; + } + + return 0; + +out_map_info: + dma_unmap_single(sfc->dev, sfc->iaddr, infolen, dir); +out_map_data: + dma_unmap_single(sfc->dev, sfc->daddr, datalen, dir); + + return ret; +} + +static void aml_sfc_dma_buffer_release(struct aml_sfc *sfc, + int datalen, int infolen, + enum dma_data_direction dir) +{ + dma_unmap_single(sfc->dev, sfc->daddr, datalen, dir); + if (infolen) { + dma_unmap_single(sfc->dev, sfc->iaddr, infolen, dir); + sfc->info_bytes = 0; + } +} + +static bool aml_sfc_dma_buffer_is_safe(const void *buffer) +{ + if ((uintptr_t)buffer % DMA_ADDR_ALIGN) + return false; + + if (virt_addr_valid(buffer)) + return true; + + return false; +} + +static void *aml_get_dma_safe_input_buf(const struct spi_mem_op *op) +{ + if (aml_sfc_dma_buffer_is_safe(op->data.buf.in)) + return op->data.buf.in; + + return kzalloc(op->data.nbytes, GFP_KERNEL); +} + +static void aml_sfc_put_dma_safe_input_buf(const struct spi_mem_op *op, void *buf) +{ + if (WARN_ON(op->data.dir != SPI_MEM_DATA_IN) || WARN_ON(!buf)) + return; + + if (buf == op->data.buf.in) + return; + + memcpy(op->data.buf.in, buf, op->data.nbytes); + kfree(buf); +} + +static void *aml_sfc_get_dma_safe_output_buf(const struct spi_mem_op *op) +{ + if (aml_sfc_dma_buffer_is_safe(op->data.buf.out)) + return (void *)op->data.buf.out; + + return kmemdup(op->data.buf.out, op->data.nbytes, GFP_KERNEL); +} + +static void aml_sfc_put_dma_safe_output_buf(const struct spi_mem_op *op, const void *buf) +{ + if (WARN_ON(op->data.dir != SPI_MEM_DATA_OUT) || WARN_ON(!buf)) + return; + + if (buf != op->data.buf.out) + kfree(buf); +} + +static u64 aml_sfc_cal_timeout_cycle(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + u64 ms; + + /* For each byte we wait for (8 cycles / buswidth) of the SPI clock. */ + ms = 8 * MSEC_PER_SEC * op->data.nbytes / op->data.buswidth; + do_div(ms, sfc->bus_rate / DEFAULT_BUS_CYCLE); + + /* + * Double the value and add a 200 ms tolerance to compensate for + * the impact of specific CS hold time, CS setup time sequences, + * controller burst gaps, and other related timing variations. + */ + ms += ms + 200; + + if (ms > UINT_MAX) + ms = UINT_MAX; + + return ms; +} + +static void aml_sfc_check_ecc_pages_valid(struct aml_sfc *sfc, bool raw) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + int ret; + + info = sfc->info_buf; + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + info += raw ?
0 : ecc_cfg->nsteps - 1; + + do { + usleep_range(10, 15); + /* info is updated by nfc dma engine*/ + smp_rmb(); + dma_sync_single_for_cpu(sfc->dev, sfc->iaddr, sfc->info_bytes, + DMA_FROM_DEVICE); + ret = le64_to_cpu(*info) & ECC_COMPLETE; + } while (!ret); +} + +static int aml_sfc_raw_io_op(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + void *buf = NULL; + int ret; + bool is_datain = false; + u32 cmd = 0, conf; + u64 timeout_ms; + + if (!op->data.nbytes) + goto end_xfer; + + conf = (op->data.nbytes >> RAW_SIZE_BW) << __bf_shf(RAW_EXT_SIZE); + ret = regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, RAW_EXT_SIZE, conf); + if (ret) + goto err_out; + + if (op->data.dir == SPI_MEM_DATA_IN) { + is_datain = true; + + buf = aml_get_dma_safe_input_buf(op); + if (!buf) { + ret = -ENOMEM; + goto err_out; + } + + cmd |= CMD_NAND2MEM(0, (op->data.nbytes & RAW_SIZE)); + } else if (op->data.dir == SPI_MEM_DATA_OUT) { + is_datain = false; + + buf = aml_sfc_get_dma_safe_output_buf(op); + if (!buf) { + ret = -ENOMEM; + goto err_out; + } + + cmd |= CMD_MEM2NAND(0, (op->data.nbytes & RAW_SIZE)); + } else { + goto end_xfer; + } + + ret = aml_sfc_dma_buffer_setup(sfc, buf, op->data.nbytes, + is_datain ? sfc->info_buf : NULL, + is_datain ? ECC_PER_INFO_BYTE : 0, + is_datain ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret) + goto err_out; + + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + if (is_datain) + aml_sfc_check_ecc_pages_valid(sfc, 1); + + if (op->data.dir == SPI_MEM_DATA_IN) + aml_sfc_put_dma_safe_input_buf(op, buf); + else if (op->data.dir == SPI_MEM_DATA_OUT) + aml_sfc_put_dma_safe_output_buf(op, buf); + + aml_sfc_dma_buffer_release(sfc, op->data.nbytes, + is_datain ? ECC_PER_INFO_BYTE : 0, + is_datain ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + +end_xfer: + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static void aml_sfc_set_user_byte(struct aml_sfc *sfc, __le64 *info_buf, u8 *oob_buf, bool auto_oob) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + int i, count, step_size; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + step_size = auto_oob ? 
ECC_BCH8_INFO_BYTES : ECC_BCH8_USER_BYTES; + + for (i = 0, count = 0; i < ecc_cfg->nsteps; i++, count += step_size) { + info = &info_buf[i]; + *info &= cpu_to_le64(~0xffff); + *info |= cpu_to_le64((oob_buf[count + 1] << 8) + oob_buf[count]); + } +} + +static void aml_sfc_get_user_byte(struct aml_sfc *sfc, __le64 *info_buf, u8 *oob_buf) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + int i, count; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + for (i = 0, count = 0; i < ecc_cfg->nsteps; i++, count += ECC_BCH8_INFO_BYTES) { + info = &info_buf[i]; + oob_buf[count] = le64_to_cpu(*info); + oob_buf[count + 1] = le64_to_cpu(*info) >> 8; + } +} + +static int aml_sfc_check_hwecc_status(struct aml_sfc *sfc, __le64 *info_buf) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + __le64 *info; + u32 i, max_bitflips = 0, per_sector_bitflips = 0; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + sfc->ecc_stats.failed = 0; + sfc->ecc_stats.bitflips = 0; + sfc->ecc_stats.corrected = 0; + + for (i = 0, info = info_buf; i < ecc_cfg->nsteps; i++, info++) { + if (ECC_ERR_CNT(le64_to_cpu(*info)) != ECC_UNCORRECTABLE) { + per_sector_bitflips = ECC_ERR_CNT(le64_to_cpu(*info)); + max_bitflips = max_t(u32, max_bitflips, per_sector_bitflips); + sfc->ecc_stats.corrected += per_sector_bitflips; + continue; + } + + return -EBADMSG; + } + + return max_bitflips; +} + +static int aml_sfc_read_page_hwecc(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret, data_len, info_len; + u32 page_size, cmd = 0; + u64 timeout_ms; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + page_size = ecc_cfg->stepsize * ecc_cfg->nsteps; + data_len = page_size + ecc_cfg->oobsize; + info_len = ecc_cfg->nsteps * ECC_PER_INFO_BYTE; + + ret = aml_sfc_dma_buffer_setup(sfc, sfc->data_buf, data_len, + sfc->info_buf, info_len, DMA_FROM_DEVICE); + if (ret) + goto err_out; + + cmd |= CMD_NAND2MEM(ecc_cfg->bch, ecc_cfg->nsteps); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + aml_sfc_check_ecc_pages_valid(sfc, 0); + aml_sfc_dma_buffer_release(sfc, data_len, info_len, DMA_FROM_DEVICE); + + /* check ecc status here */ + ret = aml_sfc_check_hwecc_status(sfc, sfc->info_buf); + if (ret < 0) + sfc->ecc_stats.failed++; + else + sfc->ecc_stats.bitflips = ret; + + if (sfc->flags & SFC_DATA_ONLY) { + memcpy(op->data.buf.in, sfc->data_buf, page_size); + } else if (sfc->flags & SFC_OOB_ONLY) { + aml_sfc_get_user_byte(sfc, sfc->info_buf, op->data.buf.in); + } else if (sfc->flags & SFC_DATA_OOB) { + memcpy(op->data.buf.in, sfc->data_buf, page_size); + aml_sfc_get_user_byte(sfc, sfc->info_buf, op->data.buf.in + page_size); + } + + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static int aml_sfc_write_page_hwecc(struct aml_sfc *sfc, const struct spi_mem_op *op) +{ + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret, data_len, info_len; + u32 page_size, cmd = 0; + u64 timeout_ms; + + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + page_size = ecc_cfg->stepsize * ecc_cfg->nsteps; + data_len = page_size + ecc_cfg->oobsize; + info_len = ecc_cfg->nsteps * ECC_PER_INFO_BYTE; + + memset(sfc->info_buf, ECC_PATTERN, ecc_cfg->oobsize); + memcpy(sfc->data_buf, op->data.buf.out, page_size); + + if (!(sfc->flags & SFC_DATA_ONLY)) { + if (sfc->flags & SFC_AUTO_OOB) + aml_sfc_set_user_byte(sfc, sfc->info_buf, + (u8 *)op->data.buf.out + page_size, 1); + else + 
aml_sfc_set_user_byte(sfc, sfc->info_buf, + (u8 *)op->data.buf.out + page_size, 0); + } + + ret = aml_sfc_dma_buffer_setup(sfc, sfc->data_buf, data_len, + sfc->info_buf, info_len, DMA_TO_DEVICE); + if (ret) + goto err_out; + + cmd |= CMD_MEM2NAND(ecc_cfg->bch, ecc_cfg->nsteps); + ret = regmap_write(sfc->regmap_base, SFC_CMD, cmd); + if (ret) + goto err_out; + + timeout_ms = aml_sfc_cal_timeout_cycle(sfc, op); + + ret = aml_sfc_wait_cmd_finish(sfc, timeout_ms); + if (ret) + goto err_out; + + aml_sfc_dma_buffer_release(sfc, data_len, info_len, DMA_TO_DEVICE); + + return aml_sfc_end_transfer(sfc, CS_HOLD_CYCLE); + +err_out: + return ret; +} + +static int aml_sfc_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) +{ + struct aml_sfc *sfc; + struct spi_device *spi; + struct aml_sfc_ecc_cfg *ecc_cfg; + int ret; + + sfc = spi_controller_get_devdata(mem->spi->controller); + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + spi = mem->spi; + sfc->cs_sel = spi->chip_select[0] ? CS_1 : CS_0; + + dev_dbg(sfc->dev, "cmd:0x%02x - addr:%08llX@%d:%u - dummy:%d:%u - data:%d:%u", + op->cmd.opcode, op->addr.val, op->addr.buswidth, op->addr.nbytes, + op->dummy.buswidth, op->dummy.nbytes, op->data.buswidth, op->data.nbytes); + + ret = aml_sfc_pre_transfer(sfc, DEFAULT_PULLUP_CYCLE, CS_SETUP_CYCLE); + if (ret) + return ret; + + ret = aml_sfc_send_cmd_addr_dummy(sfc, op); + if (ret) + return ret; + + ret = aml_sfc_set_bus_width(sfc, op->data.buswidth, DATA_LANE); + if (ret) + return ret; + + if (aml_sfc_is_snand_hwecc_page_op(sfc, op) && + ecc_cfg && !(sfc->flags & SFC_RAW_RW)) { + if (op->data.dir == SPI_MEM_DATA_IN) + return aml_sfc_read_page_hwecc(sfc, op); + else + return aml_sfc_write_page_hwecc(sfc, op); + } + + return aml_sfc_raw_io_op(sfc, op); +} + +static int aml_sfc_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op) +{ + struct aml_sfc *sfc; + struct aml_sfc_ecc_cfg *ecc_cfg; + + sfc = spi_controller_get_devdata(mem->spi->controller); + ecc_cfg = aml_sfc_to_ecc_ctx(sfc); + + if (aml_sfc_is_snand_hwecc_page_op(sfc, op) && ecc_cfg) { + if (op->data.nbytes > ecc_cfg->stepsize * ECC_BCH_MAX_SECT_SIZE) + return -EOPNOTSUPP; + } else if (op->data.nbytes & ~RAW_MAX_RW_SIZE_MASK) { + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct spi_controller_mem_ops aml_sfc_mem_ops = { + .adjust_op_size = aml_sfc_adjust_op_size, + .exec_op = aml_sfc_exec_op, +}; + +static int aml_sfc_layout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_device *nand = mtd_to_nanddev(mtd); + + if (section >= nand->ecc.ctx.nsteps) + return -ERANGE; + + oobregion->offset = ECC_BCH8_USER_BYTES + (section * ECC_BCH8_INFO_BYTES); + oobregion->length = ECC_BCH8_PARITY_BYTES; + + return 0; +} + +static int aml_sfc_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *oobregion) +{ + struct nand_device *nand = mtd_to_nanddev(mtd); + + if (section >= nand->ecc.ctx.nsteps) + return -ERANGE; + + oobregion->offset = section * ECC_BCH8_INFO_BYTES; + oobregion->length = ECC_BCH8_USER_BYTES; + + return 0; +} + +static const struct mtd_ooblayout_ops aml_sfc_ooblayout_ops = { + .ecc = aml_sfc_layout_ecc, + .free = aml_sfc_ooblayout_free, +}; + +static int aml_spi_settings(struct aml_sfc *sfc, struct spi_device *spi) +{ + u32 conf = 0; + + if (spi->mode & SPI_CPHA) + conf |= CPHA; + + if (spi->mode & SPI_CPOL) + conf |= CPOL; + + conf |= FIELD_PREP(RXADJ, sfc->rx_adj); + conf |= EN_HOLD | EN_WP; + return regmap_update_bits(sfc->regmap_base, SFC_SPI_CFG, + CPHA | CPOL | RXADJ 
| + EN_HOLD | EN_WP, conf); +} + +static int aml_set_spi_clk(struct aml_sfc *sfc, struct spi_device *spi) +{ + u32 speed_hz; + int ret; + + if (spi->max_speed_hz > SFC_MAX_FREQUENCY) + speed_hz = SFC_MAX_FREQUENCY; + else if (!spi->max_speed_hz) + speed_hz = SFC_BUS_DEFAULT_CLK; + else if (spi->max_speed_hz < SFC_MIN_FREQUENCY) + speed_hz = SFC_MIN_FREQUENCY; + else + speed_hz = spi->max_speed_hz; + + /* The SPI clock is generated by dividing the bus clock by four by default. */ + ret = regmap_write(sfc->regmap_base, SFC_CFG, (DEFAULT_BUS_CYCLE - 1)); + if (ret) { + dev_err(sfc->dev, "failed to set bus cycle\n"); + return ret; + } + + return clk_set_rate(sfc->core_clk, speed_hz * DEFAULT_BUS_CYCLE); +} + +static int aml_sfc_setup(struct spi_device *spi) +{ + struct aml_sfc *sfc; + int ret; + + sfc = spi_controller_get_devdata(spi->controller); + ret = aml_spi_settings(sfc, spi); + if (ret) + return ret; + + ret = aml_set_spi_clk(sfc, spi); + if (ret) + return ret; + + sfc->bus_rate = clk_get_rate(sfc->core_clk); + + return 0; +} + +static int aml_sfc_ecc_init_ctx(struct nand_device *nand) +{ + struct mtd_info *mtd = nanddev_to_mtd(nand); + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct aml_sfc_ecc_cfg *ecc_cfg; + const struct aml_sfc_caps *caps = sfc->caps; + struct aml_sfc_ecc_cfg *ecc_caps = caps->ecc_caps; + int i, ecc_strength, ecc_step_size; + + ecc_step_size = nand->ecc.user_conf.step_size; + ecc_strength = nand->ecc.user_conf.strength; + + for (i = 0; i < caps->num_ecc_caps; i++) { + if (ecc_caps[i].stepsize == ecc_step_size) { + nand->ecc.ctx.conf.step_size = ecc_step_size; + nand->ecc.ctx.conf.flags |= BIT(ecc_caps[i].bch); + } + + if (ecc_caps[i].strength == ecc_strength) + nand->ecc.ctx.conf.strength = ecc_strength; + } + + if (!nand->ecc.ctx.conf.step_size) { + nand->ecc.ctx.conf.step_size = ECC_BCH8_DEFAULT_STEP; + nand->ecc.ctx.conf.flags |= BIT(ECC_DEFAULT_BCH_MODE); + } + + if (!nand->ecc.ctx.conf.strength) + nand->ecc.ctx.conf.strength = ECC_BCH8_STRENGTH; + + nand->ecc.ctx.nsteps = nand->memorg.pagesize / nand->ecc.ctx.conf.step_size; + nand->ecc.ctx.total = nand->ecc.ctx.nsteps * ECC_BCH8_PARITY_BYTES; + + /* Verify the page size and OOB size against the SFC requirements. */ + if ((nand->memorg.pagesize % nand->ecc.ctx.conf.step_size) || + (nand->memorg.oobsize < (nand->ecc.ctx.total + + nand->ecc.ctx.nsteps * ECC_BCH8_USER_BYTES))) + return -EOPNOTSUPP; + + nand->ecc.ctx.conf.engine_type = NAND_ECC_ENGINE_TYPE_ON_HOST; + + ecc_cfg = kzalloc(sizeof(*ecc_cfg), GFP_KERNEL); + if (!ecc_cfg) + return -ENOMEM; + + ecc_cfg->stepsize = nand->ecc.ctx.conf.step_size; + ecc_cfg->nsteps = nand->ecc.ctx.nsteps; + ecc_cfg->strength = nand->ecc.ctx.conf.strength; + ecc_cfg->oobsize = nand->memorg.oobsize; + ecc_cfg->bch = nand->ecc.ctx.conf.flags & BIT(ECC_DEFAULT_BCH_MODE) ? 
1 : 2; + + nand->ecc.ctx.priv = ecc_cfg; + sfc->priv = (void *)ecc_cfg; + mtd_set_ooblayout(mtd, &aml_sfc_ooblayout_ops); + + sfc->flags |= SFC_HWECC; + + return 0; +} + +static void aml_sfc_ecc_cleanup_ctx(struct nand_device *nand) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + + sfc->flags &= ~(SFC_HWECC); + kfree(nand->ecc.ctx.priv); + sfc->priv = NULL; +} + +static int aml_sfc_ecc_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct spinand_device *spinand = nand_to_spinand(nand); + + sfc->flags &= ~SFC_XFER_MDOE_MASK; + + if (req->datalen && !req->ooblen) + sfc->flags |= SFC_DATA_ONLY; + else if (!req->datalen && req->ooblen) + sfc->flags |= SFC_OOB_ONLY; + else if (req->datalen && req->ooblen) + sfc->flags |= SFC_DATA_OOB; + + if (req->mode == MTD_OPS_RAW) + sfc->flags |= SFC_RAW_RW; + else if (req->mode == MTD_OPS_AUTO_OOB) + sfc->flags |= SFC_AUTO_OOB; + + memset(spinand->oobbuf, 0xff, nanddev_per_page_oobsize(nand)); + + return 0; +} + +static int aml_sfc_ecc_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req) +{ + struct aml_sfc *sfc = nand_to_aml_sfc(nand); + struct mtd_info *mtd = nanddev_to_mtd(nand); + + if (req->mode == MTD_OPS_RAW || req->type == NAND_PAGE_WRITE) + return 0; + + if (sfc->ecc_stats.failed) + mtd->ecc_stats.failed++; + + mtd->ecc_stats.corrected += sfc->ecc_stats.corrected; + + return sfc->ecc_stats.failed ? -EBADMSG : sfc->ecc_stats.bitflips; +} + +static const struct spi_controller_mem_caps aml_sfc_mem_caps = { + .ecc = true, +}; + +static const struct nand_ecc_engine_ops aml_sfc_ecc_engine_ops = { + .init_ctx = aml_sfc_ecc_init_ctx, + .cleanup_ctx = aml_sfc_ecc_cleanup_ctx, + .prepare_io_req = aml_sfc_ecc_prepare_io_req, + .finish_io_req = aml_sfc_ecc_finish_io_req, +}; + +static int aml_sfc_clk_init(struct aml_sfc *sfc) +{ + sfc->gate_clk = devm_clk_get_enabled(sfc->dev, "gate"); + if (IS_ERR(sfc->gate_clk)) { + dev_err(sfc->dev, "unable to enable gate clk\n"); + return PTR_ERR(sfc->gate_clk); + } + + sfc->core_clk = devm_clk_get_enabled(sfc->dev, "core"); + if (IS_ERR(sfc->core_clk)) { + dev_err(sfc->dev, "unable to enable core clk\n"); + return PTR_ERR(sfc->core_clk); + } + + return clk_set_rate(sfc->core_clk, SFC_BUS_DEFAULT_CLK); +} + +static int aml_sfc_disable_clk(struct aml_sfc *sfc) +{ + clk_disable_unprepare(sfc->core_clk); + clk_disable_unprepare(sfc->gate_clk); + + return 0; +} + +static int aml_sfc_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct spi_controller *ctrl; + struct aml_sfc *sfc; + void __iomem *reg_base; + int ret; + u32 val = 0; + + const struct regmap_config core_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .max_register = SFC_SPI_CFG, + }; + + ctrl = devm_spi_alloc_host(dev, sizeof(*sfc)); + if (!ctrl) + return -ENOMEM; + platform_set_drvdata(pdev, ctrl); + + sfc = spi_controller_get_devdata(ctrl); + sfc->dev = dev; + sfc->ctrl = ctrl; + + sfc->caps = of_device_get_match_data(dev); + if (!sfc->caps) + return dev_err_probe(dev, -ENODEV, "failed to get device data\n"); + + reg_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); + + sfc->regmap_base = devm_regmap_init_mmio(dev, reg_base, &core_config); + if (IS_ERR(sfc->regmap_base)) + return dev_err_probe(dev, PTR_ERR(sfc->regmap_base), + "failed to init sfc base regmap\n"); + + sfc->data_buf = devm_kzalloc(dev, SFC_BUF_SIZE, 
GFP_KERNEL); + if (!sfc->data_buf) + return -ENOMEM; + sfc->info_buf = (__le64 *)(sfc->data_buf + SFC_DATABUF_SIZE); + + ret = aml_sfc_clk_init(sfc); + if (ret) + return dev_err_probe(dev, ret, "failed to initialize SFC clock\n"); + + /* Enable Amlogic flash controller spi mode */ + ret = regmap_write(sfc->regmap_base, SFC_SPI_CFG, SPI_MODE_EN); + if (ret) { + dev_err(dev, "failed to enable SPI mode\n"); + goto err_out; + } + + ret = dma_set_mask(sfc->dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(sfc->dev, "failed to set dma mask\n"); + goto err_out; + } + + sfc->ecc_eng.dev = &pdev->dev; + sfc->ecc_eng.integration = NAND_ECC_ENGINE_INTEGRATION_PIPELINED; + sfc->ecc_eng.ops = &aml_sfc_ecc_engine_ops; + sfc->ecc_eng.priv = sfc; + + ret = nand_ecc_register_on_host_hw_engine(&sfc->ecc_eng); + if (ret) { + dev_err(&pdev->dev, "failed to register Aml host ecc engine.\n"); + goto err_out; + } + + ret = of_property_read_u32(np, "amlogic,rx-adj", &val); + if (!ret) + sfc->rx_adj = val; + + ctrl->dev.of_node = np; + ctrl->mem_ops = &aml_sfc_mem_ops; + ctrl->mem_caps = &aml_sfc_mem_caps; + ctrl->setup = aml_sfc_setup; + ctrl->mode_bits = SPI_TX_QUAD | SPI_TX_DUAL | SPI_RX_QUAD | + SPI_RX_DUAL | SPI_TX_OCTAL | SPI_RX_OCTAL; + ctrl->max_speed_hz = SFC_MAX_FREQUENCY; + ctrl->min_speed_hz = SFC_MIN_FREQUENCY; + ctrl->num_chipselect = SFC_MAX_CS_NUM; + + ret = devm_spi_register_controller(dev, ctrl); + if (ret) + goto err_out; + + return 0; + +err_out: + aml_sfc_disable_clk(sfc); + + return ret; +} + +static void aml_sfc_remove(struct platform_device *pdev) +{ + struct spi_controller *ctlr = platform_get_drvdata(pdev); + struct aml_sfc *sfc = spi_controller_get_devdata(ctlr); + + aml_sfc_disable_clk(sfc); +} + +static const struct of_device_id aml_sfc_of_match[] = { + { + .compatible = "amlogic,a4-spifc", + .data = &aml_a113l2_sfc_caps + }, + {}, +}; +MODULE_DEVICE_TABLE(of, aml_sfc_of_match); + +static struct platform_driver aml_sfc_driver = { + .driver = { + .name = "aml_sfc", + .of_match_table = aml_sfc_of_match, + }, + .probe = aml_sfc_probe, + .remove = aml_sfc_remove, +}; +module_platform_driver(aml_sfc_driver); + +MODULE_DESCRIPTION("Amlogic SPI Flash Controller driver"); +MODULE_AUTHOR("Feng Chen "); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/spi/spi-amlogic-spisg.c b/drivers/spi/spi-amlogic-spisg.c index 2ab8bdf2a6761e..bcd7ec291ad07c 100644 --- a/drivers/spi/spi-amlogic-spisg.c +++ b/drivers/spi/spi-amlogic-spisg.c @@ -662,7 +662,7 @@ static int aml_spisg_clk_init(struct spisg_device *spisg, void __iomem *base) clk_disable_unprepare(spisg->pclk); - tbl = devm_kzalloc(dev, sizeof(struct clk_div_table) * (DIV_NUM + 1), GFP_KERNEL); + tbl = devm_kcalloc(dev, (DIV_NUM + 1), sizeof(*tbl), GFP_KERNEL); if (!tbl) return -ENOMEM; @@ -733,7 +733,7 @@ static int aml_spisg_probe(struct platform_device *pdev) else ctlr = spi_alloc_host(dev, sizeof(*spisg)); if (!ctlr) - return dev_err_probe(dev, -ENOMEM, "controller allocation failed\n"); + return -ENOMEM; spisg = spi_controller_get_devdata(ctlr); spisg->controller = ctlr; diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c index 89a6b46cd3191a..89977bff76d270 100644 --- a/drivers/spi/spi-atmel.c +++ b/drivers/spi/spi-atmel.c @@ -256,6 +256,7 @@ struct atmel_spi { void __iomem *regs; int irq; struct clk *clk; + struct clk *gclk; struct platform_device *pdev; unsigned long spi_clk; @@ -397,20 +398,10 @@ static void cs_activate(struct atmel_spi *as, struct spi_device *spi) * on CS1,2,3 needs SPI_CSR0.BITS config as SPI_CSR1,2,3.BITS */ 
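/* With this rework, cs_activate() only touches the PCS field of MR; the invariant mode bits (MSTR, MODFDIS, WDRBT and, when GCLK is in use, FDIV) are programmed once in atmel_spi_init() below. */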
spi_writel(as, CSR0, asd->csr); - if (as->caps.has_wdrbt) { - spi_writel(as, MR, - SPI_BF(PCS, ~(0x01 << chip_select)) - | SPI_BIT(WDRBT) - | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } else { - spi_writel(as, MR, - SPI_BF(PCS, ~(0x01 << chip_select)) - | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } mr = spi_readl(as, MR); + mr = SPI_BFINS(PCS, ~(0x01 << chip_select), mr); + spi_writel(as, MR, mr); /* * Ensures the clock polarity is valid before we actually @@ -1490,6 +1481,8 @@ static void atmel_get_caps(struct atmel_spi *as) static void atmel_spi_init(struct atmel_spi *as) { + u32 mr = 0; + spi_writel(as, CR, SPI_BIT(SWRST)); spi_writel(as, CR, SPI_BIT(SWRST)); /* AT91SAM9263 Rev B workaround */ @@ -1497,12 +1490,17 @@ static void atmel_spi_init(struct atmel_spi *as) if (as->fifo_size) spi_writel(as, CR, SPI_BIT(FIFOEN)); - if (as->caps.has_wdrbt) { - spi_writel(as, MR, SPI_BIT(WDRBT) | SPI_BIT(MODFDIS) - | SPI_BIT(MSTR)); - } else { - spi_writel(as, MR, SPI_BIT(MSTR) | SPI_BIT(MODFDIS)); - } + /* + * If GCLK is selected as the source clock for bit rate generation, + * enable the BRSRCCLK/FDIV/DIV32 bit. + */ + if (as->gclk) + mr |= SPI_BIT(FDIV); + + if (as->caps.has_wdrbt) + mr |= SPI_BIT(WDRBT); + + spi_writel(as, MR, mr | SPI_BIT(MODFDIS) | SPI_BIT(MSTR)); if (as->use_pdc) spi_writel(as, PTCR, SPI_BIT(RXTDIS) | SPI_BIT(TXTDIS)); @@ -1565,6 +1563,11 @@ static int atmel_spi_probe(struct platform_device *pdev) as->phybase = regs->start; as->irq = irq; as->clk = clk; + as->gclk = devm_clk_get_optional(&pdev->dev, "spi_gclk"); + if (IS_ERR(as->gclk)) { + ret = PTR_ERR(as->gclk); + goto out_unmap_regs; + } init_completion(&as->xfer_completion); @@ -1625,7 +1628,19 @@ static int atmel_spi_probe(struct platform_device *pdev) if (ret) goto out_free_irq; - as->spi_clk = clk_get_rate(clk); + /* + * In cases where the peripheral clock is so high that FLEX_SPI_CSRx.SCBR + * would exceed its threshold (SCBR ≤ 255), GCLK is used as the source + * clock for SPCK (SPI Serial Clock) bit rate generation. + */ + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) + goto out_disable_clk; + as->spi_clk = clk_get_rate(as->gclk); + } else { + as->spi_clk = clk_get_rate(clk); + } as->fifo_size = 0; if (!of_property_read_u32(pdev->dev.of_node, "atmel,fifo-size", @@ -1660,6 +1675,8 @@ static int atmel_spi_probe(struct platform_device *pdev) spi_writel(as, CR, SPI_BIT(SWRST)); spi_writel(as, CR, SPI_BIT(SWRST)); /* AT91SAM9263 Rev B workaround */ + clk_disable_unprepare(as->gclk); +out_disable_clk: clk_disable_unprepare(clk); out_free_irq: out_unmap_regs: @@ -1695,6 +1712,8 @@ static void atmel_spi_remove(struct platform_device *pdev) spin_unlock_irq(&as->lock); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -1706,6 +1725,8 @@ static int atmel_spi_runtime_suspend(struct device *dev) struct atmel_spi *as = spi_controller_get_devdata(host); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); pinctrl_pm_select_sleep_state(dev); return 0; @@ -1715,10 +1736,20 @@ static int atmel_spi_runtime_resume(struct device *dev) { struct spi_controller *host = dev_get_drvdata(dev); struct atmel_spi *as = spi_controller_get_devdata(host); + int ret; pinctrl_pm_select_default_state(dev); - return clk_prepare_enable(as->clk); + ret = clk_prepare_enable(as->clk); + if (ret) + return ret; + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) { + clk_disable_unprepare(as->clk); + return ret; + } + } + + return 0; }
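The GCLK handling above leans on a property of the common clk API that is easy to miss: devm_clk_get_optional() returns NULL, not an error, when the clock is absent, and every clk_*() call accepts a NULL clk as a no-op. A minimal sketch of the resulting enable/unwind pattern; the function name and the "pclk"/"gclk" identifiers here are illustrative assumptions, not part of spi-atmel.c:

	/*
	 * Sketch: enable a mandatory clock plus an optional one, unwinding the
	 * first if the second fails. clk_prepare_enable(NULL) returns 0 and
	 * clk_disable_unprepare(NULL) is a no-op, so no NULL checks are needed.
	 */
	static int example_clks_enable(struct device *dev,
				       struct clk **pclk, struct clk **gclk)
	{
		int ret;

		*pclk = devm_clk_get(dev, "pclk");		/* mandatory */
		if (IS_ERR(*pclk))
			return PTR_ERR(*pclk);

		*gclk = devm_clk_get_optional(dev, "gclk");	/* NULL if absent */
		if (IS_ERR(*gclk))
			return PTR_ERR(*gclk);

		ret = clk_prepare_enable(*pclk);
		if (ret)
			return ret;

		ret = clk_prepare_enable(*gclk);
		if (ret)
			clk_disable_unprepare(*pclk);	/* unwind on failure */

		return ret;
	}

Seen this way, the if (as->gclk) guards around the clk_disable_unprepare() calls above are defensive rather than strictly required.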
static int atmel_spi_suspend(struct device *dev) @@ -1746,10 +1777,17 @@ static int atmel_spi_resume(struct device *dev) ret = clk_prepare_enable(as->clk); if (ret) return ret; + if (as->gclk) { + ret = clk_prepare_enable(as->gclk); + if (ret) { + clk_disable_unprepare(as->clk); + return ret; + } + } atmel_spi_init(as); clk_disable_unprepare(as->clk); + if (as->gclk) + clk_disable_unprepare(as->gclk); if (!pm_runtime_suspended(dev)) { ret = atmel_spi_runtime_resume(dev); diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index 512d53a8ef4d14..e06f412190fd24 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -1050,7 +1050,7 @@ static int spi_engine_probe(struct platform_device *pdev) return -ENODEV; } - if (ADI_AXI_PCORE_VER_MINOR(version) >= 1) { + if (adi_axi_pcore_ver_gteq(version, 1, 1)) { unsigned int sizes = readl(spi_engine->base + SPI_ENGINE_REG_OFFLOAD_MEM_ADDR_WIDTH); @@ -1064,7 +1064,7 @@ static int spi_engine_probe(struct platform_device *pdev) } /* IP v1.5 dropped the requirement for SYNC in offload messages. */ - spi_engine->offload_requires_sync = ADI_AXI_PCORE_VER_MINOR(version) < 5; + spi_engine->offload_requires_sync = !adi_axi_pcore_ver_gteq(version, 1, 5); writel_relaxed(0x00, spi_engine->base + SPI_ENGINE_REG_RESET); writel_relaxed(0xff, spi_engine->base + SPI_ENGINE_REG_INT_PENDING); @@ -1091,15 +1091,12 @@ static int spi_engine_probe(struct platform_device *pdev) host->put_offload = spi_engine_put_offload; host->num_chipselect = 8; - /* Some features depend of the IP core version. */ - if (ADI_AXI_PCORE_VER_MAJOR(version) >= 1) { - if (ADI_AXI_PCORE_VER_MINOR(version) >= 2) { - host->mode_bits |= SPI_CS_HIGH; - host->setup = spi_engine_setup; - } - if (ADI_AXI_PCORE_VER_MINOR(version) >= 3) - host->mode_bits |= SPI_MOSI_IDLE_LOW | SPI_MOSI_IDLE_HIGH; + if (adi_axi_pcore_ver_gteq(version, 1, 2)) { + host->mode_bits |= SPI_CS_HIGH; + host->setup = spi_engine_setup; } + if (adi_axi_pcore_ver_gteq(version, 1, 3)) + host->mode_bits |= SPI_MOSI_IDLE_LOW | SPI_MOSI_IDLE_HIGH; if (host->max_speed_hz == 0) return dev_err_probe(&pdev->dev, -EINVAL, "spi_clk rate is 0"); diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 77de5a07639afb..192cc5ef65fb7b 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -622,7 +622,7 @@ static void bcm2835_spi_dma_rx_done(void *data) /* reset fifo and HW */ bcm2835_spi_reset_hw(bs); - /* and mark as completed */; + /* and mark as completed */ spi_finalize_current_transfer(ctlr); } diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 9bf823348cd30d..8fb13df8ff8714 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -33,7 +33,7 @@ #define CQSPI_NAME "cadence-qspi" #define CQSPI_MAX_CHIPSELECT 4 -static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); +static_assert(CQSPI_MAX_CHIPSELECT <= SPI_DEVICE_CS_CNT_MAX); /* Quirks */ #define CQSPI_NEEDS_WR_DELAY BIT(0) @@ -46,6 +46,7 @@ static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); #define CQSPI_DMA_SET_MASK BIT(7) #define CQSPI_SUPPORT_DEVICE_RESET BIT(8) #define CQSPI_DISABLE_STIG_MODE BIT(9) +#define CQSPI_DISABLE_RUNTIME_PM BIT(10) /* Capabilities */ #define CQSPI_SUPPORTS_OCTAL BIT(0) @@ -335,7 +336,7 @@ static bool cqspi_is_idle(struct cqspi_st *cqspi) { u32 reg = readl(cqspi->iobase + CQSPI_REG_CONFIG); - return reg & (1UL << CQSPI_REG_CONFIG_IDLE_LSB); + return reg & BIT(CQSPI_REG_CONFIG_IDLE_LSB); } static u32 cqspi_get_rd_sram_level(struct
cqspi_st *cqspi) @@ -571,7 +572,7 @@ static int cqspi_command_read(struct cqspi_flash_pdata *f_pdata, reg |= (dummy_clk & CQSPI_REG_CMDCTRL_DUMMY_MASK) << CQSPI_REG_CMDCTRL_DUMMY_LSB; - reg |= (0x1 << CQSPI_REG_CMDCTRL_RD_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_RD_EN_LSB); /* 0 means 1 byte. */ reg |= (((n_rx - 1) & CQSPI_REG_CMDCTRL_RD_BYTES_MASK) @@ -579,7 +580,7 @@ static int cqspi_command_read(struct cqspi_flash_pdata *f_pdata, /* setup ADDR BIT field */ if (op->addr.nbytes) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_ADDR_EN_LSB); reg |= ((op->addr.nbytes - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK) << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB; @@ -646,7 +647,7 @@ static int cqspi_command_write(struct cqspi_flash_pdata *f_pdata, reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB; if (op->addr.nbytes) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_ADDR_EN_LSB); reg |= ((op->addr.nbytes - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK) << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB; @@ -655,7 +656,7 @@ static int cqspi_command_write(struct cqspi_flash_pdata *f_pdata, } if (n_tx) { - reg |= (0x1 << CQSPI_REG_CMDCTRL_WR_EN_LSB); + reg |= BIT(CQSPI_REG_CMDCTRL_WR_EN_LSB); reg |= ((n_tx - 1) & CQSPI_REG_CMDCTRL_WR_BYTES_MASK) << CQSPI_REG_CMDCTRL_WR_BYTES_LSB; data = 0; @@ -719,6 +720,7 @@ static int cqspi_read_setup(struct cqspi_flash_pdata *f_pdata, reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; reg |= (op->addr.nbytes - 1); writel(reg, reg_base + CQSPI_REG_SIZE); + readl(reg_base + CQSPI_REG_SIZE); /* Flush posted write. */ return 0; } @@ -764,6 +766,7 @@ static int cqspi_indirect_read_execute(struct cqspi_flash_pdata *f_pdata, reinit_completion(&cqspi->transfer_complete); writel(CQSPI_REG_INDIRECTRD_START_MASK, reg_base + CQSPI_REG_INDIRECTRD); + readl(reg_base + CQSPI_REG_INDIRECTRD); /* Flush posted write. */ while (remaining > 0) { if (use_irq && @@ -1062,6 +1065,7 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata, reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; reg |= (op->addr.nbytes - 1); writel(reg, reg_base + CQSPI_REG_SIZE); + readl(reg_base + CQSPI_REG_SIZE); /* Flush posted write. */ return 0; } @@ -1090,6 +1094,8 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata, reinit_completion(&cqspi->transfer_complete); writel(CQSPI_REG_INDIRECTWR_START_MASK, reg_base + CQSPI_REG_INDIRECTWR); + readl(reg_base + CQSPI_REG_INDIRECTWR); /* Flush posted write. 
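Reading the register straight back forces the write out of any intermediate posted-write buffers, guaranteeing it has reached the controller before execution continues.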
*/ + /* * As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access * Controller programming sequence, couple of cycles of @@ -1186,7 +1192,7 @@ static void cqspi_chipselect(struct cqspi_flash_pdata *f_pdata) * CS2 to 4b'1011 * CS3 to 4b'0111 */ - chip_select = 0xF & ~(1 << chip_select); + chip_select = 0xF & ~BIT(chip_select); } reg &= ~(CQSPI_REG_CONFIG_CHIPSELECT_MASK @@ -1272,9 +1278,9 @@ static void cqspi_readdata_capture(struct cqspi_st *cqspi, reg = readl(reg_base + CQSPI_REG_READCAPTURE); if (bypass) - reg |= (1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + reg |= BIT(CQSPI_REG_READCAPTURE_BYPASS_LSB); else - reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB); + reg &= ~BIT(CQSPI_REG_READCAPTURE_BYPASS_LSB); reg &= ~(CQSPI_REG_READCAPTURE_DELAY_MASK << CQSPI_REG_READCAPTURE_DELAY_LSB); @@ -1468,14 +1474,17 @@ static int cqspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) int ret; struct cqspi_st *cqspi = spi_controller_get_devdata(mem->spi->controller); struct device *dev = &cqspi->pdev->dev; + const struct cqspi_driver_platdata *ddata = of_device_get_match_data(dev); if (refcount_read(&cqspi->inflight_ops) == 0) return -ENODEV; - ret = pm_runtime_resume_and_get(dev); - if (ret) { - dev_err(&mem->spi->dev, "resume failed with %d\n", ret); - return ret; + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + ret = pm_runtime_resume_and_get(dev); + if (ret) { + dev_err(&mem->spi->dev, "resume failed with %d\n", ret); + return ret; + } } if (!refcount_read(&cqspi->refcount)) @@ -1491,7 +1500,8 @@ static int cqspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op) ret = cqspi_mem_process(mem, op); - pm_runtime_put_autosuspend(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + pm_runtime_put_autosuspend(dev); if (ret) dev_err(&mem->spi->dev, "operation failed with %d\n", ret); @@ -1717,12 +1727,10 @@ static const struct spi_controller_mem_caps cqspi_mem_caps = { static int cqspi_setup_flash(struct cqspi_st *cqspi) { - unsigned int max_cs = cqspi->num_chipselect - 1; struct platform_device *pdev = cqspi->pdev; struct device *dev = &pdev->dev; struct cqspi_flash_pdata *f_pdata; - unsigned int cs; - int ret; + int ret, cs, max_cs = -1; /* Get flash device data */ for_each_available_child_of_node_scoped(dev->of_node, np) { @@ -1735,10 +1743,10 @@ static int cqspi_setup_flash(struct cqspi_st *cqspi) if (cs >= cqspi->num_chipselect) { dev_err(dev, "Chip select %d out of range.\n", cs); return -EINVAL; - } else if (cs < max_cs) { - max_cs = cs; } + max_cs = max_t(int, cs, max_cs); + f_pdata = &cqspi->f_pdata[cs]; f_pdata->cqspi = cqspi; f_pdata->cs = cs; @@ -1748,6 +1756,11 @@ static int cqspi_setup_flash(struct cqspi_st *cqspi) return ret; } + if (max_cs < 0) { + dev_err(dev, "No flash device declared\n"); + return -ENODEV; + } + cqspi->num_chipselect = max_cs + 1; return 0; } @@ -1985,11 +1998,12 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - pm_runtime_enable(dev); - - pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); - pm_runtime_use_autosuspend(dev); - pm_runtime_get_noresume(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_enable(dev); + pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); + pm_runtime_use_autosuspend(dev); + pm_runtime_get_noresume(dev); + } ret = spi_register_controller(host); if (ret) { @@ -1997,12 +2011,17 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - 
pm_runtime_put_autosuspend(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_put_autosuspend(dev); + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); + } return 0; probe_setup_failed: cqspi_controller_enable(cqspi, 0); - pm_runtime_disable(dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + pm_runtime_disable(dev); probe_reset_failed: if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); @@ -2013,7 +2032,11 @@ static int cqspi_probe(struct platform_device *pdev) static void cqspi_remove(struct platform_device *pdev) { + const struct cqspi_driver_platdata *ddata; struct cqspi_st *cqspi = platform_get_drvdata(pdev); + struct device *dev = &pdev->dev; + + ddata = of_device_get_match_data(dev); refcount_set(&cqspi->refcount, 0); @@ -2026,14 +2049,17 @@ static void cqspi_remove(struct platform_device *pdev) if (cqspi->rx_chan) dma_release_channel(cqspi->rx_chan); - if (pm_runtime_get_sync(&pdev->dev) >= 0) - clk_disable(cqspi->clk); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) + if (pm_runtime_get_sync(&pdev->dev) >= 0) + clk_disable(cqspi->clk); if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); - pm_runtime_put_sync(&pdev->dev); - pm_runtime_disable(&pdev->dev); + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_put_sync(&pdev->dev); + pm_runtime_disable(&pdev->dev); + } } static int cqspi_runtime_suspend(struct device *dev) @@ -2112,7 +2138,8 @@ static const struct cqspi_driver_platdata socfpga_qspi = { .quirks = CQSPI_DISABLE_DAC_MODE | CQSPI_NO_SUPPORT_WR_COMPLETION | CQSPI_SLOW_SRAM - | CQSPI_DISABLE_STIG_MODE, + | CQSPI_DISABLE_STIG_MODE + | CQSPI_DISABLE_RUNTIME_PM, }; static const struct cqspi_driver_platdata versal_ospi = { diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 4bd4377551b5d2..83ea296597e946 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -331,6 +331,8 @@ struct fsl_dspi_dma { dma_addr_t rx_dma_phys; struct completion cmd_rx_complete; struct dma_async_tx_descriptor *rx_desc; + + size_t bufsize; }; struct fsl_dspi { @@ -373,6 +375,8 @@ struct fsl_dspi { void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata); }; +static void dspi_setup_accel(struct fsl_dspi *dspi); + static bool is_s32g_dspi(struct fsl_dspi *data) { return data->devtype_data == &devtype_data[S32G] || @@ -468,6 +472,27 @@ static u32 dspi_pop_tx(struct fsl_dspi *dspi) return txdata; } +/* Push one word to the RX buffer from the POPR register (RX FIFO) */ +static void dspi_push_rx(struct fsl_dspi *dspi, u32 rxdata) +{ + if (!dspi->rx) + return; + dspi->dev_to_host(dspi, rxdata); +} + +static int dspi_fifo_error(struct fsl_dspi *dspi, u32 spi_sr) +{ + if (spi_sr & (SPI_SR_TFUF | SPI_SR_RFOF)) { + dev_err_ratelimited(&dspi->pdev->dev, "FIFO errors:%s%s\n", + spi_sr & SPI_SR_TFUF ? " TX underflow," : "", + spi_sr & SPI_SR_RFOF ? 
" RX overflow," : ""); + return -EIO; + } + return 0; +} + +#if IS_ENABLED(CONFIG_DMA_ENGINE) + /* Prepare one TX FIFO entry (txdata plus cmd) */ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) { @@ -481,19 +506,37 @@ static u32 dspi_pop_tx_pushr(struct fsl_dspi *dspi) return cmd << 16 | data; } -/* Push one word to the RX buffer from the POPR register (RX FIFO) */ -static void dspi_push_rx(struct fsl_dspi *dspi, u32 rxdata) +static size_t dspi_dma_max_datawords(struct fsl_dspi *dspi) { - if (!dspi->rx) - return; - dspi->dev_to_host(dspi, rxdata); + /* + * Transfers look like one of these, so we always use a full DMA word + * regardless of SPI word size: + * + * 31 16 15 0 + * ----------------------------------------- + * | CONTROL WORD | 16-bit DATA | + * ----------------------------------------- + * or + * ----------------------------------------- + * | CONTROL WORD | UNUSED | 8-bit DATA | + * ----------------------------------------- + */ + return dspi->dma->bufsize / DMA_SLAVE_BUSWIDTH_4_BYTES; +} + +static size_t dspi_dma_transfer_size(struct fsl_dspi *dspi) +{ + return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; } static void dspi_tx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; + dma_sync_single_for_cpu(dev, dma->tx_dma_phys, + dspi_dma_transfer_size(dspi), DMA_TO_DEVICE); complete(&dma->cmd_tx_complete); } @@ -501,9 +544,13 @@ static void dspi_rx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; int i; if (dspi->rx) { + dma_sync_single_for_cpu(dev, dma->rx_dma_phys, + dspi_dma_transfer_size(dspi), + DMA_FROM_DEVICE); for (i = 0; i < dspi->words_in_flight; i++) dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]); } @@ -513,20 +560,22 @@ static void dspi_rx_dma_callback(void *arg) static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) { + size_t size = dspi_dma_transfer_size(dspi); struct device *dev = &dspi->pdev->dev; struct fsl_dspi_dma *dma = dspi->dma; int time_left; + u32 spi_sr; int i; for (i = 0; i < dspi->words_in_flight; i++) dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi); + dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE); dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx, - dma->tx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, - DMA_MEM_TO_DEV, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + dma->tx_dma_phys, size, + DMA_MEM_TO_DEV, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); if (!dma->tx_desc) { dev_err(dev, "Not able to get desc for DMA xfer\n"); return -EIO; @@ -539,12 +588,13 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) return -EINVAL; } + dma_sync_single_for_device(dev, dma->rx_dma_phys, size, + DMA_FROM_DEVICE); dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx, - dma->rx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, - DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + dma->rx_dma_phys, size, + DMA_DEV_TO_MEM, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); if (!dma->rx_desc) { dev_err(dev, "Not able to get desc for DMA xfer\n"); return -EIO; @@ -565,7 +615,8 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) if (spi_controller_is_target(dspi->ctlr)) { wait_for_completion_interruptible(&dspi->dma->cmd_rx_complete); - return 0; + regmap_read(dspi->regmap, SPI_SR, &spi_sr); + return dspi_fifo_error(dspi, spi_sr); } time_left = wait_for_completion_timeout(&dspi->dma->cmd_tx_complete, @@ -589,13 
+640,10 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) return 0; } -static void dspi_setup_accel(struct fsl_dspi *dspi); - -static int dspi_dma_xfer(struct fsl_dspi *dspi) +static void dspi_dma_xfer(struct fsl_dspi *dspi) { struct spi_message *message = dspi->cur_msg; struct device *dev = &dspi->pdev->dev; - int ret = 0; /* * dspi->len gets decremented by dspi_pop_tx_pushr in @@ -605,26 +653,22 @@ static int dspi_dma_xfer(struct fsl_dspi *dspi) /* Figure out operational bits-per-word for this chunk */ dspi_setup_accel(dspi); - dspi->words_in_flight = dspi->len / dspi->oper_word_size; - if (dspi->words_in_flight > dspi->devtype_data->fifo_size) - dspi->words_in_flight = dspi->devtype_data->fifo_size; + dspi->words_in_flight = min(dspi->len / dspi->oper_word_size, + dspi_dma_max_datawords(dspi)); message->actual_length += dspi->words_in_flight * dspi->oper_word_size; - ret = dspi_next_xfer_dma_submit(dspi); - if (ret) { + message->status = dspi_next_xfer_dma_submit(dspi); + if (message->status) { dev_err(dev, "DMA transfer failed\n"); break; } } - - return ret; } static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) { - int dma_bufsize = dspi->devtype_data->fifo_size * 2; struct device *dev = &dspi->pdev->dev; struct dma_slave_config cfg; struct fsl_dspi_dma *dma; @@ -644,17 +688,30 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) goto err_tx_channel; } - dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev, - dma_bufsize, &dma->tx_dma_phys, - GFP_KERNEL); + if (spi_controller_is_target(dspi->ctlr)) { + /* + * In target mode we have to be ready to receive the maximum + * that can possibly be transferred at once by EDMA without any + * FIFO underflows. + */ + dma->bufsize = min(dma_get_max_seg_size(dma->chan_rx->device->dev), + dma_get_max_seg_size(dma->chan_tx->device->dev)) * + DMA_SLAVE_BUSWIDTH_4_BYTES; + } else { + dma->bufsize = PAGE_SIZE; + } + + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, + dma->bufsize, &dma->tx_dma_phys, + DMA_TO_DEVICE, GFP_KERNEL); if (!dma->tx_dma_buf) { ret = -ENOMEM; goto err_tx_dma_buf; } - dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev, - dma_bufsize, &dma->rx_dma_phys, - GFP_KERNEL); + dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, + dma->bufsize, &dma->rx_dma_phys, + DMA_FROM_DEVICE, GFP_KERNEL); if (!dma->rx_dma_buf) { ret = -ENOMEM; goto err_rx_dma_buf; @@ -689,11 +746,12 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) return 0; err_slave_config: - dma_free_coherent(dma->chan_rx->device->dev, - dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma->bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); err_rx_dma_buf: - dma_free_coherent(dma->chan_tx->device->dev, - dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys); + dma_free_noncoherent(dma->chan_tx->device->dev, dma->bufsize, + dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); err_tx_dma_buf: dma_release_channel(dma->chan_tx); err_tx_channel: @@ -707,24 +765,37 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) static void dspi_release_dma(struct fsl_dspi *dspi) { - int dma_bufsize = dspi->devtype_data->fifo_size * 2; struct fsl_dspi_dma *dma = dspi->dma; if (!dma) return; if (dma->chan_tx) { - dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize, - dma->tx_dma_buf, dma->tx_dma_phys); + dma_free_noncoherent(dma->chan_tx->device->dev, dma->bufsize, + 
dma->tx_dma_buf, dma->tx_dma_phys, + DMA_TO_DEVICE); dma_release_channel(dma->chan_tx); } if (dma->chan_rx) { - dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize, - dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma->bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); dma_release_channel(dma->chan_rx); } } +#else +static void dspi_dma_xfer(struct fsl_dspi *dspi) +{ + dspi->cur_msg->status = -EINVAL; +} +static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) +{ + dev_err(&dspi->pdev->dev, "DMA support not enabled in kernel\n"); + return -EINVAL; +} +static void dspi_release_dma(struct fsl_dspi *dspi) {} +#endif static void hz_to_spi_baud(char *pbr, char *br, int speed_hz, unsigned long clkrate, bool mtf_enabled) @@ -986,41 +1057,55 @@ static void dspi_fifo_write(struct fsl_dspi *dspi) dspi->progress, !dspi->irq); } -static int dspi_rxtx(struct fsl_dspi *dspi) +/* + * Read the previous transfer from the FIFO and transmit the next one. + * + * Returns false if the buffer to be transmitted is empty, and true if there is + * still data to transmit. + */ +static bool dspi_rxtx(struct fsl_dspi *dspi) { dspi_fifo_read(dspi); if (!dspi->len) /* Success! */ - return 0; + return false; dspi_fifo_write(dspi); - return -EINPROGRESS; + return true; } -static int dspi_poll(struct fsl_dspi *dspi) +static void dspi_poll(struct fsl_dspi *dspi) { - int tries = 1000; + int tries; + int err = 0; u32 spi_sr; do { - regmap_read(dspi->regmap, SPI_SR, &spi_sr); - regmap_write(dspi->regmap, SPI_SR, spi_sr); - - if (spi_sr & SPI_SR_CMDTCF) + for (tries = 1000; tries > 0; --tries) { + regmap_read(dspi->regmap, SPI_SR, &spi_sr); + regmap_write(dspi->regmap, SPI_SR, spi_sr); + + dspi->cur_msg->status = dspi_fifo_error(dspi, spi_sr); + if (dspi->cur_msg->status) + return; + if (spi_sr & SPI_SR_CMDTCF) + break; + } + if (!tries) { + err = -ETIMEDOUT; break; - } while (--tries); - - if (!tries) - return -ETIMEDOUT; + } + } while (dspi_rxtx(dspi)); - return dspi_rxtx(dspi); + dspi->cur_msg->status = err; } static irqreturn_t dspi_interrupt(int irq, void *dev_id) { struct fsl_dspi *dspi = (struct fsl_dspi *)dev_id; + int status; u32 spi_sr; regmap_read(dspi->regmap, SPI_SR, &spi_sr); @@ -1029,8 +1114,19 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id) if (!(spi_sr & SPI_SR_CMDTCF)) return IRQ_NONE; - if (dspi_rxtx(dspi) == 0) + status = dspi_fifo_error(dspi, spi_sr); + if (status) { + if (dspi->cur_msg) + WRITE_ONCE(dspi->cur_msg->status, status); + complete(&dspi->xfer_done); + return IRQ_HANDLED; + } + + if (dspi_rxtx(dspi) == false) { + if (dspi->cur_msg) + WRITE_ONCE(dspi->cur_msg->status, 0); complete(&dspi->xfer_done); + } return IRQ_HANDLED; } @@ -1060,7 +1156,6 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, struct spi_device *spi = message->spi; struct spi_transfer *transfer; bool cs = false; - int status = 0; u32 val = 0; bool cs_change = false; @@ -1120,7 +1215,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi->progress, !dspi->irq); if (dspi->devtype_data->trans_mode == DSPI_DMA_MODE) { - status = dspi_dma_xfer(dspi); + dspi_dma_xfer(dspi); } else { /* * Reinitialize the completion before transferring data @@ -1134,15 +1229,12 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi_fifo_write(dspi); - if (dspi->irq) { + if (dspi->irq) wait_for_completion(&dspi->xfer_done); - } else { - do { - status = dspi_poll(dspi); - } while (status == -EINPROGRESS); - } + 
else + dspi_poll(dspi); } - if (status) + if (READ_ONCE(message->status)) break; spi_transfer_delay_exec(transfer); @@ -1151,7 +1243,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi_deassert_cs(spi, &cs); } - if (status || !cs_change) { + dspi->cur_msg = NULL; + if (message->status || !cs_change) { /* Put DSPI in stop mode */ regmap_update_bits(dspi->regmap, SPI_MCR, SPI_MCR_HALT, SPI_MCR_HALT); @@ -1160,10 +1253,9 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, ; } - message->status = status; spi_finalize_current_message(ctlr); - return status; + return message->status; } static int dspi_set_mtf(struct fsl_dspi *dspi) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 431439d4cdda9c..8da66e10138679 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -26,6 +26,7 @@ #include #include #include +#include #define DRIVER_NAME "fsl_lpspi" @@ -485,10 +486,9 @@ static int fsl_lpspi_setup_transfer(struct spi_controller *controller, fsl_lpspi->tx = fsl_lpspi_buf_tx_u32; } - if (t->len <= fsl_lpspi->txfifosize) - fsl_lpspi->watermark = t->len; - else - fsl_lpspi->watermark = fsl_lpspi->txfifosize; + fsl_lpspi->watermark = min_t(typeof(fsl_lpspi->watermark), + fsl_lpspi->txfifosize, + t->len); if (fsl_lpspi_can_dma(controller, spi, t)) fsl_lpspi->usedma = true; diff --git a/drivers/spi/spi-ljca.c b/drivers/spi/spi-ljca.c index 2cab79ad2b98f2..3f412cf8f1cd97 100644 --- a/drivers/spi/spi-ljca.c +++ b/drivers/spi/spi-ljca.c @@ -289,7 +289,7 @@ static struct auxiliary_driver ljca_spi_driver = { }; module_auxiliary_driver(ljca_spi_driver); -MODULE_AUTHOR("Wentong Wu "); +MODULE_AUTHOR("Wentong Wu"); MODULE_AUTHOR("Zhifeng Wang "); MODULE_AUTHOR("Lixu Zhang "); MODULE_DESCRIPTION("Intel La Jolla Cove Adapter USB-SPI driver"); diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c index 7dd92deffe3fb1..e0b131aa29b62e 100644 --- a/drivers/spi/spi-loopback-test.c +++ b/drivers/spi/spi-loopback-test.c @@ -446,7 +446,7 @@ static void spi_test_dump_message(struct spi_device *spi, int i; u8 b; - dev_info(&spi->dev, " spi_msg@%pK\n", msg); + dev_info(&spi->dev, " spi_msg@%p\n", msg); if (msg->status) dev_info(&spi->dev, " status: %i\n", msg->status); @@ -456,15 +456,15 @@ static void spi_test_dump_message(struct spi_device *spi, msg->actual_length); list_for_each_entry(xfer, &msg->transfers, transfer_list) { - dev_info(&spi->dev, " spi_transfer@%pK\n", xfer); + dev_info(&spi->dev, " spi_transfer@%p\n", xfer); dev_info(&spi->dev, " len: %i\n", xfer->len); - dev_info(&spi->dev, " tx_buf: %pK\n", xfer->tx_buf); + dev_info(&spi->dev, " tx_buf: %p\n", xfer->tx_buf); if (dump_data && xfer->tx_buf) spi_test_print_hex_dump(" TX: ", xfer->tx_buf, xfer->len); - dev_info(&spi->dev, " rx_buf: %pK\n", xfer->rx_buf); + dev_info(&spi->dev, " rx_buf: %p\n", xfer->rx_buf); if (dump_data && xfer->rx_buf) spi_test_print_hex_dump(" RX: ", xfer->rx_buf, @@ -558,7 +558,7 @@ static int spi_check_rx_ranges(struct spi_device *spi, /* if still not found then something has modified too much */ /* we could list the "closest" transfer here... 
*/ dev_err(&spi->dev, - "loopback strangeness - rx changed outside of allowed range at: %pK\n", + "loopback strangeness - rx changed outside of allowed range at: %p\n", addr); /* do not return, only set ret, * so that we list all addresses @@ -696,7 +696,7 @@ static int spi_test_translate(struct spi_device *spi, } dev_err(&spi->dev, - "PointerRange [%pK:%pK[ not in range [%pK:%pK[ or [%pK:%pK[\n", + "PointerRange [%p:%p[ not in range [%p:%p[ or [%p:%p[\n", *ptr, *ptr + len, RX(0), RX(SPI_TEST_MAX_SIZE), TX(0), TX(SPI_TEST_MAX_SIZE)); diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index 8dc98b17f77b58..aafe6cbf2aea71 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -689,8 +689,7 @@ static int mchp_coreqspi_probe(struct platform_device *pdev) ctlr = devm_spi_alloc_host(&pdev->dev, sizeof(*qspi)); if (!ctlr) - return dev_err_probe(&pdev->dev, -ENOMEM, - "unable to allocate host for QSPI controller\n"); + return -ENOMEM; qspi = spi_controller_get_devdata(ctlr); platform_set_drvdata(pdev, qspi); diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c index 62ba0bd9cbb7e7..9128b86c536603 100644 --- a/drivers/spi/spi-microchip-core.c +++ b/drivers/spi/spi-microchip-core.c @@ -534,8 +534,7 @@ static int mchp_corespi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(&pdev->dev, sizeof(*spi)); if (!host) - return dev_err_probe(&pdev->dev, -ENOMEM, - "unable to allocate host for SPI controller\n"); + return -ENOMEM; platform_set_drvdata(pdev, host); diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index a6032d44771bfd..4b40985af1eaf0 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -563,6 +563,22 @@ static void mtk_spi_setup_packet(struct spi_controller *host) writel(reg_val, mdata->base + SPI_CFG1_REG); } +inline u32 mtk_spi_set_nbit(u32 nbit) +{ + switch (nbit) { + default: + pr_warn_once("unknown nbit mode %u. 
Falling back to single mode\n", + nbit); + fallthrough; + case SPI_NBITS_SINGLE: + return 0x0; + case SPI_NBITS_DUAL: + return 0x1; + case SPI_NBITS_QUAD: + return 0x2; + } +} + static void mtk_spi_enable_transfer(struct spi_controller *host) { u32 cmd; @@ -729,10 +745,16 @@ static int mtk_spi_transfer_one(struct spi_controller *host, /* prepare xfer direction and duplex mode */ if (mdata->dev_comp->ipm_design) { - if (!xfer->tx_buf || !xfer->rx_buf) { + if (xfer->tx_buf && xfer->rx_buf) { + reg_val &= ~SPI_CFG3_IPM_HALF_DUPLEX_EN; + } else if (xfer->tx_buf) { reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_EN; - if (xfer->rx_buf) - reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_DIR; + reg_val &= ~SPI_CFG3_IPM_HALF_DUPLEX_DIR; + reg_val |= mtk_spi_set_nbit(xfer->tx_nbits); + } else { + reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_EN; + reg_val |= SPI_CFG3_IPM_HALF_DUPLEX_DIR; + reg_val |= mtk_spi_set_nbit(xfer->rx_nbits); } writel(reg_val, mdata->base + SPI_CFG3_IPM_REG); } @@ -1159,7 +1181,7 @@ static int mtk_spi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(dev, sizeof(*mdata)); if (!host) - return dev_err_probe(dev, -ENOMEM, "failed to alloc spi host\n"); + return -ENOMEM; host->auto_runtime_pm = true; host->dev.of_node = dev->of_node; diff --git a/drivers/spi/spi-mtk-snfi.c b/drivers/spi/spi-mtk-snfi.c index e82ee6dcf4986e..ae38c244e25811 100644 --- a/drivers/spi/spi-mtk-snfi.c +++ b/drivers/spi/spi-mtk-snfi.c @@ -1139,7 +1139,6 @@ static int mtk_snand_write_page_cache(struct mtk_snand *snf, // Prepare for custom write interrupt nfi_write32(snf, NFI_INTR_EN, NFI_IRQ_INTR_EN | NFI_IRQ_CUS_PG); reinit_completion(&snf->op_done); - ; // Trigger NFI into custom mode nfi_write16(snf, NFI_CMD, NFI_CMD_DUMMY_WRITE); diff --git a/drivers/spi/spi-mxs.c b/drivers/spi/spi-mxs.c index 43455305fdf40e..0ebcbdb1b1f713 100644 --- a/drivers/spi/spi-mxs.c +++ b/drivers/spi/spi-mxs.c @@ -388,7 +388,7 @@ static int mxs_spi_transfer_one(struct spi_controller *host, TXRX_DEASSERT_CS : 0; /* - * Small blocks can be transfered via PIO. + * Small blocks can be transferred via PIO. 
* Measured by empiric means: * * dd if=/dev/mtdblock0 of=/dev/null bs=1024k count=1 diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c index 67cc1d86de425e..cccd17f247754d 100644 --- a/drivers/spi/spi-npcm-fiu.c +++ b/drivers/spi/spi-npcm-fiu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -498,10 +499,7 @@ static int npcm_fiu_read(struct spi_mem *mem, const struct spi_mem_op *op) do { addr = ((u32)op->addr.val + i); - if (currlen < 16) - readlen = currlen; - else - readlen = 16; + readlen = min_t(int, currlen, 16); buf_ptr = data + i; ret = npcm_fiu_uma_read(mem, op, addr, true, buf_ptr, diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index b92bfef47371fa..f9371f98a65bdc 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -330,6 +330,8 @@ /* Access flash memory using IP bus only */ #define FSPI_QUIRK_USE_IP_ONLY BIT(0) +/* Disable DTR */ +#define FSPI_QUIRK_DISABLE_DTR BIT(1) struct nxp_fspi_devtype_data { unsigned int rxfifo; @@ -344,7 +346,7 @@ static struct nxp_fspi_devtype_data lx2160a_data = { .rxfifo = SZ_512, /* (64 * 64 bits) */ .txfifo = SZ_1K, /* (128 * 64 bits) */ .ahb_buf_size = SZ_2K, /* (256 * 64 bits) */ - .quirks = 0, + .quirks = FSPI_QUIRK_DISABLE_DTR, .lut_num = 32, .little_endian = true, /* little-endian */ }; @@ -399,7 +401,8 @@ struct nxp_fspi { struct mutex lock; struct pm_qos_request pm_qos_req; int selected; -#define FSPI_NEED_INIT (1 << 0) +#define FSPI_NEED_INIT BIT(0) +#define FSPI_DTR_MODE BIT(1) int flags; }; @@ -559,12 +562,21 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, u32 target_lut_reg; /* cmd */ - lutval[0] |= LUT_DEF(0, LUT_CMD, LUT_PAD(op->cmd.buswidth), - op->cmd.opcode); + if (op->cmd.dtr) { + lutval[0] |= LUT_DEF(0, LUT_CMD_DDR, LUT_PAD(op->cmd.buswidth), + op->cmd.opcode >> 8); + lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_CMD_DDR, + LUT_PAD(op->cmd.buswidth), + op->cmd.opcode & 0xFF); + lutidx++; + } else { + lutval[0] |= LUT_DEF(0, LUT_CMD, LUT_PAD(op->cmd.buswidth), + op->cmd.opcode); + } /* addr bytes */ if (op->addr.nbytes) { - lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_ADDR, + lutval[lutidx / 2] |= LUT_DEF(lutidx, op->addr.dtr ? LUT_ADDR_DDR : LUT_ADDR, LUT_PAD(op->addr.buswidth), op->addr.nbytes * 8); lutidx++; @@ -572,7 +584,7 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, /* dummy bytes, if needed */ if (op->dummy.nbytes) { - lutval[lutidx / 2] |= LUT_DEF(lutidx, LUT_DUMMY, + lutval[lutidx / 2] |= LUT_DEF(lutidx, op->dummy.dtr ? LUT_DUMMY_DDR : LUT_DUMMY, /* * Due to FlexSPI controller limitation number of PAD for dummy * buswidth needs to be programmed as equal to data buswidth. @@ -587,7 +599,8 @@ static void nxp_fspi_prepare_lut(struct nxp_fspi *f, if (op->data.nbytes) { lutval[lutidx / 2] |= LUT_DEF(lutidx, op->data.dir == SPI_MEM_DATA_IN ? - LUT_NXP_READ : LUT_NXP_WRITE, + (op->data.dtr ? LUT_READ_DDR : LUT_NXP_READ) : + (op->data.dtr ? 
LUT_WRITE_DDR : LUT_NXP_WRITE), LUT_PAD(op->data.buswidth), 0); lutidx++; @@ -645,6 +658,40 @@ static void nxp_fspi_clk_disable_unprep(struct nxp_fspi *f) return; } +/* + * Sample clock source selection for flash reading. + * Four modes defined by fspi: + * mode 0: Dummy Read strobe generated by FlexSPI Controller + * and loopback internally + * mode 1: Dummy Read strobe generated by FlexSPI Controller + * and loopback from DQS pad + * mode 2: Reserved + * mode 3: Flash provided Read strobe and input from DQS pad + * + * fspi defaults to mode 0 after reset. + */ +static void nxp_fspi_select_rx_sample_clk_source(struct nxp_fspi *f, + bool op_is_dtr) +{ + u32 reg; + + /* + * For 8D-8D-8D mode, mode 3 (Flash provided Read strobe and input + * from DQS pad) must be used, otherwise reads may fail. + * This mode requires the flash device's DQS pad to be connected + * on the board. + * For other modes, keep using mode 0, as before. + * spi_nor_suspend() disables 8D-8D-8D mode, so the mode must then + * be changed back to mode 0 as well. + */ + reg = fspi_readl(f, f->iobase + FSPI_MCR0); + if (op_is_dtr) + reg |= FSPI_MCR0_RXCLKSRC(3); + else /* select mode 0 */ + reg &= ~FSPI_MCR0_RXCLKSRC(3); + fspi_writel(f, reg, f->iobase + FSPI_MCR0); +} + static void nxp_fspi_dll_calibration(struct nxp_fspi *f) { int ret; @@ -674,6 +721,17 @@ static void nxp_fspi_dll_calibration(struct nxp_fspi *f) dev_warn(f->dev, "DLL lock failed, please fix it!\n"); } +/* + * Configure the DLL register to its default value, enable the target clock + * delay line delay cell override mode, and use 1 fixed delay cell in the DLL + * delay chain; this is the suggested setting when the clock rate is below + * 100MHz. + */ +static void nxp_fspi_dll_override(struct nxp_fspi *f) +{ + fspi_writel(f, FSPI_DLLACR_OVRDEN, f->iobase + FSPI_DLLACR); + fspi_writel(f, FSPI_DLLBCR_OVRDEN, f->iobase + FSPI_DLLBCR); +} + /* * In FlexSPI controller, flash access is based on value of FSPI_FLSHXXCR0 * register and start base address of the target device. @@ -715,15 +773,18 @@ static void nxp_fspi_dll_calibration(struct nxp_fspi *f) static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, const struct spi_mem_op *op) { + /* flexspi only supports one DTR mode: 8D-8D-8D */ + bool op_is_dtr = op->cmd.dtr && op->addr.dtr && op->dummy.dtr && op->data.dtr; unsigned long rate = op->max_freq; int ret; uint64_t size_kb; /* * Return, if previously selected target device is same as current - * requested target device. + * requested target device and the DTR/STR mode does not change. */ - if (f->selected == spi_get_chipselect(spi, 0)) + if ((f->selected == spi_get_chipselect(spi, 0)) && + (!!(f->flags & FSPI_DTR_MODE) == op_is_dtr)) return; /* Reset FLSHxxCR0 registers */ @@ -740,6 +801,18 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, dev_dbg(f->dev, "Target device [CS:%x] selected\n", spi_get_chipselect(spi, 0)); + nxp_fspi_select_rx_sample_clk_source(f, op_is_dtr); + + if (op_is_dtr) { + f->flags |= FSPI_DTR_MODE; + /* For DTR mode, flexspi divides the root clock by 2 by default before + * outputting it to the device, so configure the root clock to + * 2 * the device rate.
+ */ + rate = rate * 2; + } else { + f->flags &= ~FSPI_DTR_MODE; + } + nxp_fspi_clk_disable_unprep(f); ret = clk_set_rate(f->clk, rate); @@ -756,6 +829,8 @@ static void nxp_fspi_select_mem(struct nxp_fspi *f, struct spi_device *spi, */ if (rate > 100000000) nxp_fspi_dll_calibration(f); + else + nxp_fspi_dll_override(f); f->selected = spi_get_chipselect(spi, 0); } @@ -1071,13 +1146,7 @@ static int nxp_fspi_default_setup(struct nxp_fspi *f) /* Disable the module */ fspi_writel(f, FSPI_MCR0_MDIS, base + FSPI_MCR0); - /* - * Config the DLL register to default value, enable the target clock delay - * line delay cell override mode, and use 1 fixed delay cell in DLL delay - * chain, this is the suggested setting when clock rate < 100MHz. - */ - fspi_writel(f, FSPI_DLLACR_OVRDEN, base + FSPI_DLLACR); - fspi_writel(f, FSPI_DLLBCR_OVRDEN, base + FSPI_DLLBCR); + nxp_fspi_dll_override(f); /* enable module */ fspi_writel(f, FSPI_MCR0_AHB_TIMEOUT(0xFF) | @@ -1164,6 +1233,13 @@ static const struct spi_controller_mem_ops nxp_fspi_mem_ops = { }; static const struct spi_controller_mem_caps nxp_fspi_mem_caps = { + .dtr = true, + .swap16 = false, + .per_op_freq = true, +}; + +static const struct spi_controller_mem_caps nxp_fspi_mem_caps_disable_dtr = { + .dtr = false, .per_op_freq = true, }; @@ -1279,12 +1355,17 @@ static int nxp_fspi_probe(struct platform_device *pdev) ctlr->bus_num = -1; ctlr->num_chipselect = NXP_FSPI_MAX_CHIPSELECT; ctlr->mem_ops = &nxp_fspi_mem_ops; - ctlr->mem_caps = &nxp_fspi_mem_caps; + + if (f->devtype_data->quirks & FSPI_QUIRK_DISABLE_DTR) + ctlr->mem_caps = &nxp_fspi_mem_caps_disable_dtr; + else + ctlr->mem_caps = &nxp_fspi_mem_caps; + ctlr->dev.of_node = np; ret = devm_add_action_or_reset(dev, nxp_fspi_cleanup, f); if (ret) - return dev_err_probe(dev, ret, "Failed to register nxp_fspi_cleanup\n"); + return ret; return devm_spi_register_controller(&pdev->dev, ctlr); } diff --git a/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c index 035d088d4d33d6..8468c773713a3d 100644 --- a/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c +++ b/drivers/spi/spi-offload-trigger-adi-util-sigma-delta.c @@ -5,12 +5,15 @@ */ #include -#include +#include +#include #include #include #include #include #include +#include +#include static bool adi_util_sigma_delta_match(struct spi_offload_trigger *trigger, enum spi_offload_trigger_type type, diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 6dc58a30804a12..69c2e9d9be3c38 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -988,6 +988,7 @@ static int omap2_mcspi_setup_transfer(struct spi_device *spi, else l &= ~OMAP2_MCSPI_CHCONF_PHA; + mcspi_write_chconf0(spi, l | OMAP2_MCSPI_CHCONF_FORCE); mcspi_write_chconf0(spi, l); cs->mode = spi->mode; diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index dd87cf4f70dd56..9e56e87746142f 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * This macro is used to define some register default values. @@ -760,10 +761,9 @@ static void setup_dma_scatter(struct pl022 *pl022, * we just feed in this, else we stuff in as much * as we can. 
*/ - if (bytesleft < (PAGE_SIZE - offset_in_page(bufp))) - mapbytes = bytesleft; - else - mapbytes = PAGE_SIZE - offset_in_page(bufp); + mapbytes = min_t(int, bytesleft, + PAGE_SIZE - offset_in_page(bufp)); + sg_set_page(sg, virt_to_page(bufp), mapbytes, offset_in_page(bufp)); bufp += mapbytes; @@ -775,10 +775,7 @@ static void setup_dma_scatter(struct pl022 *pl022, } else { /* Map the dummy buffer on every page */ for_each_sg(sgtab->sgl, sg, sgtab->nents, i) { - if (bytesleft < PAGE_SIZE) - mapbytes = bytesleft; - else - mapbytes = PAGE_SIZE; + mapbytes = min_t(int, bytesleft, PAGE_SIZE); sg_set_page(sg, virt_to_page(pl022->dummypage), mapbytes, 0); bytesleft -= mapbytes; diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 06711a62fa3dca..ec7117a94d5f17 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -1283,7 +1283,7 @@ int pxa2xx_spi_probe(struct device *dev, struct ssp_device *ssp, else controller = devm_spi_alloc_host(dev, sizeof(*drv_data)); if (!controller) - return dev_err_probe(dev, -ENOMEM, "cannot alloc spi_controller\n"); + return -ENOMEM; drv_data = spi_controller_get_devdata(controller); drv_data->controller = controller; diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c index 780abb967822a5..58ceea1ea8fb4e 100644 --- a/drivers/spi/spi-qpic-snand.c +++ b/drivers/spi/spi-qpic-snand.c @@ -78,7 +78,6 @@ struct qcom_ecc_stats { }; struct qpic_ecc { - struct device *dev; int ecc_bytes_hw; int spare_bytes; int bbm_size; @@ -95,8 +94,6 @@ struct qpic_ecc { u32 cfg1_raw; u32 ecc_buf_cfg; u32 ecc_bch_cfg; - u32 clrflashstatus; - u32 clrreadstatus; bool bch_enabled; }; @@ -382,12 +379,12 @@ static int qcom_spi_ecc_init_ctx_pipelined(struct nand_device *nand) FIELD_PREP(ECC_PARITY_SIZE_BYTES_BCH_MASK, ecc_cfg->ecc_bytes_hw); ecc_cfg->ecc_buf_cfg = FIELD_PREP(NUM_STEPS_MASK, 0x203); - ecc_cfg->clrflashstatus = FS_READY_BSY_N; - ecc_cfg->clrreadstatus = 0xc0; conf->step_size = ecc_cfg->step_size; conf->strength = ecc_cfg->strength; + snandc->regs->clrflashstatus = cpu_to_le32(FS_READY_BSY_N); + snandc->regs->clrreadstatus = cpu_to_le32(0xc0); snandc->regs->erased_cw_detect_cfg_clr = cpu_to_le32(CLR_ERASED_PAGE_DET); snandc->regs->erased_cw_detect_cfg_set = cpu_to_le32(SET_ERASED_PAGE_DET); @@ -494,9 +491,14 @@ qcom_spi_config_cw_read(struct qcom_nand_controller *snandc, bool use_ecc, int c qcom_write_reg_dma(snandc, &snandc->regs->cmd, NAND_FLASH_CMD, 1, NAND_BAM_NEXT_SGL); qcom_write_reg_dma(snandc, &snandc->regs->exec, NAND_EXEC_CMD, 1, NAND_BAM_NEXT_SGL); - qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 2, 0); - qcom_read_reg_dma(snandc, NAND_ERASED_CW_DETECT_STATUS, 1, - NAND_BAM_NEXT_SGL); + if (use_ecc) { + qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 2, 0); + qcom_read_reg_dma(snandc, NAND_ERASED_CW_DETECT_STATUS, 1, + NAND_BAM_NEXT_SGL); + } else { + qcom_read_reg_dma(snandc, NAND_FLASH_STATUS, 1, + NAND_BAM_NEXT_SGL); + } } static int qcom_spi_block_erase(struct qcom_nand_controller *snandc) @@ -599,8 +601,6 @@ static int qcom_spi_read_last_cw(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, num_cw - 1, 0, 0, ecc_cfg->cw_size, 1); @@ -734,8 +734,6 @@ static int qcom_spi_read_cw_raw(struct 
qcom_nand_controller *snandc, u8 *data_bu snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, raw_cw, 0, 0, ecc_cfg->cw_size, 1); @@ -850,8 +848,6 @@ static int qcom_spi_read_page_ecc(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, 0, 0, 0, ecc_cfg->cw_data, 1); @@ -943,8 +939,6 @@ static int qcom_spi_read_page_oob(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_set_read_loc(snandc, 0, 0, 0, ecc_cfg->cw_data, 1); @@ -1064,8 +1058,6 @@ static int qcom_spi_program_raw(struct qcom_nand_controller *snandc, snandc->regs->cfg0 = cpu_to_le32(cfg0); snandc->regs->cfg1 = cpu_to_le32(cfg1); snandc->regs->ecc_bch_cfg = cpu_to_le32(ecc_bch_cfg); - snandc->regs->clrflashstatus = cpu_to_le32(ecc_cfg->clrflashstatus); - snandc->regs->clrreadstatus = cpu_to_le32(ecc_cfg->clrreadstatus); snandc->regs->exec = cpu_to_le32(1); qcom_spi_config_page_write(snandc); @@ -1549,17 +1541,16 @@ static int qcom_spi_probe(struct platform_device *pdev) } snandc->props = dev_data; - snandc->dev = &pdev->dev; - snandc->core_clk = devm_clk_get(dev, "core"); + snandc->core_clk = devm_clk_get_enabled(dev, "core"); if (IS_ERR(snandc->core_clk)) return PTR_ERR(snandc->core_clk); - snandc->aon_clk = devm_clk_get(dev, "aon"); + snandc->aon_clk = devm_clk_get_enabled(dev, "aon"); if (IS_ERR(snandc->aon_clk)) return PTR_ERR(snandc->aon_clk); - snandc->qspi->iomacro_clk = devm_clk_get(dev, "iom"); + snandc->qspi->iomacro_clk = devm_clk_get_enabled(dev, "iom"); if (IS_ERR(snandc->qspi->iomacro_clk)) return PTR_ERR(snandc->qspi->iomacro_clk); @@ -1573,18 +1564,6 @@ static int qcom_spi_probe(struct platform_device *pdev) if (dma_mapping_error(dev, snandc->base_dma)) return -ENXIO; - ret = clk_prepare_enable(snandc->core_clk); - if (ret) - goto err_dis_core_clk; - - ret = clk_prepare_enable(snandc->aon_clk); - if (ret) - goto err_dis_aon_clk; - - ret = clk_prepare_enable(snandc->qspi->iomacro_clk); - if (ret) - goto err_dis_iom_clk; - ret = qcom_nandc_alloc(snandc); if (ret) goto err_snand_alloc; @@ -1625,12 +1604,6 @@ static int qcom_spi_probe(struct platform_device *pdev) err_spi_init: qcom_nandc_unalloc(snandc); err_snand_alloc: - clk_disable_unprepare(snandc->qspi->iomacro_clk); -err_dis_iom_clk: - clk_disable_unprepare(snandc->aon_clk); -err_dis_aon_clk: - clk_disable_unprepare(snandc->core_clk); -err_dis_core_clk: dma_unmap_resource(dev, res->start, resource_size(res), DMA_BIDIRECTIONAL, 0); return ret; @@ -1645,11 +1618,6 @@ static void qcom_spi_remove(struct platform_device *pdev) spi_unregister_controller(ctlr); nand_ecc_unregister_on_host_hw_engine(&snandc->qspi->ecc_eng); qcom_nandc_unalloc(snandc); - - 
clk_disable_unprepare(snandc->aon_clk); - clk_disable_unprepare(snandc->core_clk); - clk_disable_unprepare(snandc->qspi->iomacro_clk); - dma_unmap_resource(&pdev->dev, snandc->base_dma, resource_size(res), DMA_BIDIRECTIONAL, 0); } diff --git a/drivers/spi/spi-rb4xx.c b/drivers/spi/spi-rb4xx.c index e71d3805b150de..22b86fc8913262 100644 --- a/drivers/spi/spi-rb4xx.c +++ b/drivers/spi/spi-rb4xx.c @@ -16,7 +16,16 @@ #include #include -#include +#define AR71XX_SPI_REG_FS 0x00 /* Function Select */ +#define AR71XX_SPI_REG_CTRL 0x04 /* SPI Control */ +#define AR71XX_SPI_REG_IOC 0x08 /* SPI I/O Control */ +#define AR71XX_SPI_REG_RDS 0x0c /* Read Data Shift */ + +#define AR71XX_SPI_FS_GPIO BIT(0) /* Enable GPIO mode */ + +#define AR71XX_SPI_IOC_DO BIT(0) /* Data Out pin */ +#define AR71XX_SPI_IOC_CLK BIT(8) /* CLK pin */ +#define AR71XX_SPI_IOC_CS(n) BIT(16 + (n)) struct rb4xx_spi { void __iomem *base; @@ -63,7 +72,7 @@ static inline void do_spi_clk_two(struct rb4xx_spi *rbspi, u32 spi_ioc, if (value & BIT(1)) regval |= AR71XX_SPI_IOC_DO; if (value & BIT(0)) - regval |= AR71XX_SPI_IOC_CS2; + regval |= AR71XX_SPI_IOC_CS(2); rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, regval); rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, regval | AR71XX_SPI_IOC_CLK); @@ -89,7 +98,7 @@ static void rb4xx_set_cs(struct spi_device *spi, bool enable) */ if (enable) rb4xx_write(rbspi, AR71XX_SPI_REG_IOC, - AR71XX_SPI_IOC_CS0 | AR71XX_SPI_IOC_CS1); + AR71XX_SPI_IOC_CS(0) | AR71XX_SPI_IOC_CS(1)); } static int rb4xx_transfer_one(struct spi_controller *host, @@ -109,10 +118,10 @@ static int rb4xx_transfer_one(struct spi_controller *host, */ if (spi_get_chipselect(spi, 0) == 2) /* MMC */ - spi_ioc = AR71XX_SPI_IOC_CS0; + spi_ioc = AR71XX_SPI_IOC_CS(0); else /* Boot flash and CPLD */ - spi_ioc = AR71XX_SPI_IOC_CS1; + spi_ioc = AR71XX_SPI_IOC_CS(1); tx_buf = t->tx_buf; rx_buf = t->rx_buf; @@ -147,7 +156,7 @@ static int rb4xx_spi_probe(struct platform_device *pdev) if (!host) return -ENOMEM; - ahb_clk = devm_clk_get(&pdev->dev, "ahb"); + ahb_clk = devm_clk_get_enabled(&pdev->dev, "ahb"); if (IS_ERR(ahb_clk)) return PTR_ERR(ahb_clk); @@ -163,7 +172,6 @@ static int rb4xx_spi_probe(struct platform_device *pdev) rbspi = spi_controller_get_devdata(host); rbspi->base = spi_base; rbspi->clk = ahb_clk; - platform_set_drvdata(pdev, rbspi); err = devm_spi_register_controller(&pdev->dev, host); if (err) { @@ -171,23 +179,12 @@ static int rb4xx_spi_probe(struct platform_device *pdev) return err; } - err = clk_prepare_enable(ahb_clk); - if (err) - return err; - /* Enable SPI */ rb4xx_write(rbspi, AR71XX_SPI_REG_FS, AR71XX_SPI_FS_GPIO); return 0; } -static void rb4xx_spi_remove(struct platform_device *pdev) -{ - struct rb4xx_spi *rbspi = platform_get_drvdata(pdev); - - clk_disable_unprepare(rbspi->clk); -} - static const struct of_device_id rb4xx_spi_dt_match[] = { { .compatible = "mikrotik,rb4xx-spi" }, { }, @@ -196,10 +193,9 @@ MODULE_DEVICE_TABLE(of, rb4xx_spi_dt_match); static struct platform_driver rb4xx_spi_drv = { .probe = rb4xx_spi_probe, - .remove = rb4xx_spi_remove, .driver = { .name = "rb4xx-spi", - .of_match_table = of_match_ptr(rb4xx_spi_dt_match), + .of_match_table = rb4xx_spi_dt_match, }, }; diff --git a/drivers/spi/spi-rpc-if.c b/drivers/spi/spi-rpc-if.c index 627cffea5d5c7e..6edc0c4db854db 100644 --- a/drivers/spi/spi-rpc-if.c +++ b/drivers/spi/spi-rpc-if.c @@ -196,21 +196,23 @@ static void rpcif_spi_remove(struct platform_device *pdev) pm_runtime_disable(rpc->dev); } -static int __maybe_unused rpcif_spi_suspend(struct device *dev) 
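Editorial aside: the spi-rpc-if hunk here drops the __maybe_unused annotations and the CONFIG_PM_SLEEP ifdef around the pm ops in favour of DEFINE_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr(), which keep the callbacks visible to the compiler and let dead-code elimination discard them when sleep support is compiled out. A minimal sketch of that idiom follows; it uses a hypothetical "foo" platform driver, not anything from this patch:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm.h>

static int foo_suspend(struct device *dev)
{
	/* Quiesce the (hypothetical) device; no __maybe_unused needed. */
	return 0;
}

static int foo_resume(struct device *dev)
{
	/* Reinitialize any hardware state lost across suspend. */
	return 0;
}

/* Always defined, so the callbacks are always referenced ("used"). */
static DEFINE_SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

static int foo_probe(struct platform_device *pdev)
{
	return 0;
}

static struct platform_driver foo_driver = {
	.probe = foo_probe,
	.driver = {
		.name = "foo",
		/* NULL when CONFIG_PM_SLEEP is off; the ops get discarded. */
		.pm = pm_sleep_ptr(&foo_pm_ops),
	},
};
module_platform_driver(foo_driver);

MODULE_DESCRIPTION("Hypothetical sketch of the pm_sleep_ptr() idiom");
MODULE_LICENSE("GPL");

DEFINE_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr() are real kernel helpers; everything named foo_* above is invented for illustration only.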
+static int rpcif_spi_suspend(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); return spi_controller_suspend(ctlr); } -static int __maybe_unused rpcif_spi_resume(struct device *dev) +static int rpcif_spi_resume(struct device *dev) { struct spi_controller *ctlr = dev_get_drvdata(dev); + rpcif_hw_init(dev, false); + return spi_controller_resume(ctlr); } -static SIMPLE_DEV_PM_OPS(rpcif_spi_pm_ops, rpcif_spi_suspend, rpcif_spi_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(rpcif_spi_pm_ops, rpcif_spi_suspend, rpcif_spi_resume); static const struct platform_device_id rpc_if_spi_id_table[] = { { .name = "rpc-if-spi" }, @@ -224,9 +226,7 @@ static struct platform_driver rpcif_spi_driver = { .id_table = rpc_if_spi_id_table, .driver = { .name = "rpc-if-spi", -#ifdef CONFIG_PM_SLEEP - .pm = &rpcif_spi_pm_ops, -#endif + .pm = pm_sleep_ptr(&rpcif_spi_pm_ops), }, }; module_platform_driver(rpcif_spi_driver); diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index b1567243ae196a..aab36c779c06a5 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -1268,8 +1268,7 @@ static int s3c64xx_spi_probe(struct platform_device *pdev) host = devm_spi_alloc_host(&pdev->dev, sizeof(*sdd)); if (!host) - return dev_err_probe(&pdev->dev, -ENOMEM, - "Unable to allocate SPI Host\n"); + return -ENOMEM; platform_set_drvdata(pdev, host); @@ -1507,16 +1506,6 @@ static const struct dev_pm_ops s3c64xx_spi_pm = { s3c64xx_spi_runtime_resume, NULL) }; -static const struct s3c64xx_spi_port_config s3c2443_spi_port_config = { - /* fifo_lvl_mask is deprecated. Use {rx, tx}_fifomask instead. */ - .fifo_lvl_mask = { 0x7f }, - /* rx_lvl_offset is deprecated. Use {rx, tx}_fifomask instead. */ - .rx_lvl_offset = 13, - .tx_st_done = 21, - .clk_div = 2, - .high_speed = true, -}; - static const struct s3c64xx_spi_port_config s3c6410_spi_port_config = { /* fifo_lvl_mask is deprecated. Use {rx, tx}_fifomask instead. 
*/ .fifo_lvl_mask = { 0x7f, 0x7F }, @@ -1628,9 +1617,6 @@ static const struct s3c64xx_spi_port_config gs101_spi_port_config = { static const struct platform_device_id s3c64xx_spi_driver_ids[] = { { - .name = "s3c2443-spi", - .driver_data = (kernel_ulong_t)&s3c2443_spi_port_config, - }, { .name = "s3c6410-spi", .driver_data = (kernel_ulong_t)&s3c6410_spi_port_config, }, @@ -1642,9 +1628,6 @@ static const struct of_device_id s3c64xx_spi_dt_match[] = { { .compatible = "google,gs101-spi", .data = &gs101_spi_port_config, }, - { .compatible = "samsung,s3c2443-spi", - .data = &s3c2443_spi_port_config, - }, { .compatible = "samsung,s3c6410-spi", .data = &s3c6410_spi_port_config, }, diff --git a/drivers/spi/spi-sunplus-sp7021.c b/drivers/spi/spi-sunplus-sp7021.c index 7fd4cc6f74c25e..256ae07db6becc 100644 --- a/drivers/spi/spi-sunplus-sp7021.c +++ b/drivers/spi/spi-sunplus-sp7021.c @@ -103,7 +103,7 @@ static irqreturn_t sp7021_spi_target_irq(int irq, void *dev) data_status = readl(pspim->s_base + SP7021_DATA_RDY_REG); data_status |= SP7021_SLAVE_CLR_INT; - writel(data_status , pspim->s_base + SP7021_DATA_RDY_REG); + writel(data_status, pspim->s_base + SP7021_DATA_RDY_REG); complete(&pspim->target_isr); return IRQ_HANDLED; } @@ -296,7 +296,7 @@ static void sp7021_spi_setup_clk(struct spi_controller *ctlr, struct spi_transfe } static int sp7021_spi_host_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, - struct spi_transfer *xfer) + struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); unsigned long timeout = msecs_to_jiffies(1000); @@ -360,7 +360,7 @@ static int sp7021_spi_host_transfer_one(struct spi_controller *ctlr, struct spi_ } static int sp7021_spi_target_transfer_one(struct spi_controller *ctlr, struct spi_device *spi, - struct spi_transfer *xfer) + struct spi_transfer *xfer) { struct sp7021_spi_ctlr *pspim = spi_controller_get_devdata(ctlr); struct device *dev = pspim->dev; diff --git a/drivers/spi/spi-virtio.c b/drivers/spi/spi-virtio.c new file mode 100644 index 00000000000000..2acb929b2c6907 --- /dev/null +++ b/drivers/spi/spi-virtio.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SPI bus driver for the Virtio SPI controller + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VIRTIO_SPI_MODE_MASK \ + (SPI_MODE_X_MASK | SPI_CS_HIGH | SPI_LSB_FIRST) + +struct virtio_spi_req { + struct completion completion; + const u8 *tx_buf; + u8 *rx_buf; + struct spi_transfer_head transfer_head ____cacheline_aligned; + struct spi_transfer_result result; +}; + +struct virtio_spi_priv { + /* The virtio device we're associated with */ + struct virtio_device *vdev; + /* Pointer to the virtqueue */ + struct virtqueue *vq; + /* Copy of config space mode_func_supported */ + u32 mode_func_supported; + /* Copy of config space max_freq_hz */ + u32 max_freq_hz; +}; + +static void virtio_spi_msg_done(struct virtqueue *vq) +{ + struct virtio_spi_req *req; + unsigned int len; + + while ((req = virtqueue_get_buf(vq, &len))) + complete(&req->completion); +} + +/* + * virtio_spi_set_delays - Set delay parameters for SPI transfer + * + * This function sets various delay parameters for SPI transfer, + * including delay after CS asserted, timing intervals between + * adjacent words within a transfer, delay before and after CS + * deasserted. 
It converts these delay parameters to nanoseconds + * using spi_delay_to_ns and stores the results in spi_transfer_head + * structure. + * If the conversion fails, the function logs a warning message and + * returns an error code. + * . . . . . . . . . . + * Delay + A + + B + + C + D + E + F + A + + * . . . . . . . . . . + * ___. . . . . . .___.___. . + * CS# |___.______.____.____.___.___| . |___._____________ + * . . . . . . . . . . + * . . . . . . . . . . + * SCLK__.___.___NNN_____NNN__.___.___.___.___.___.___NNN_______ + * + * NOTE: 1st transfer has two words, the delay between these two words are + * 'B' in the diagram. + * + * A => struct spi_device -> cs_setup + * B => max{struct spi_transfer -> word_delay, struct spi_device -> word_delay} + * Note: spi_device and spi_transfer both have word_delay, Linux + * choose the bigger one, refer to _spi_xfer_word_delay_update function + * C => struct spi_transfer -> delay + * D => struct spi_device -> cs_hold + * E => struct spi_device -> cs_inactive + * F => struct spi_transfer -> cs_change_delay + * + * So the corresponding relationship: + * A <===> cs_setup_ns (after CS asserted) + * B <===> word_delay_ns (delay between adjacent words within a transfer) + * C+D <===> cs_delay_hold_ns (before CS deasserted) + * E+F <===> cs_change_delay_inactive_ns (after CS deasserted, these two + * values are also recommended in the Linux driver to be added up) + */ +static int virtio_spi_set_delays(struct spi_transfer_head *th, + struct spi_device *spi, + struct spi_transfer *xfer) +{ + int cs_setup; + int cs_word_delay_xfer; + int cs_word_delay_spi; + int delay; + int cs_hold; + int cs_inactive; + int cs_change_delay; + + cs_setup = spi_delay_to_ns(&spi->cs_setup, xfer); + if (cs_setup < 0) { + dev_warn(&spi->dev, "Cannot convert cs_setup\n"); + return cs_setup; + } + th->cs_setup_ns = cpu_to_le32(cs_setup); + + cs_word_delay_xfer = spi_delay_to_ns(&xfer->word_delay, xfer); + if (cs_word_delay_xfer < 0) { + dev_warn(&spi->dev, "Cannot convert cs_word_delay_xfer\n"); + return cs_word_delay_xfer; + } + cs_word_delay_spi = spi_delay_to_ns(&spi->word_delay, xfer); + if (cs_word_delay_spi < 0) { + dev_warn(&spi->dev, "Cannot convert cs_word_delay_spi\n"); + return cs_word_delay_spi; + } + + th->word_delay_ns = cpu_to_le32(max(cs_word_delay_spi, cs_word_delay_xfer)); + + delay = spi_delay_to_ns(&xfer->delay, xfer); + if (delay < 0) { + dev_warn(&spi->dev, "Cannot convert delay\n"); + return delay; + } + cs_hold = spi_delay_to_ns(&spi->cs_hold, xfer); + if (cs_hold < 0) { + dev_warn(&spi->dev, "Cannot convert cs_hold\n"); + return cs_hold; + } + th->cs_delay_hold_ns = cpu_to_le32(delay + cs_hold); + + cs_inactive = spi_delay_to_ns(&spi->cs_inactive, xfer); + if (cs_inactive < 0) { + dev_warn(&spi->dev, "Cannot convert cs_inactive\n"); + return cs_inactive; + } + cs_change_delay = spi_delay_to_ns(&xfer->cs_change_delay, xfer); + if (cs_change_delay < 0) { + dev_warn(&spi->dev, "Cannot convert cs_change_delay\n"); + return cs_change_delay; + } + th->cs_change_delay_inactive_ns = + cpu_to_le32(cs_inactive + cs_change_delay); + + return 0; +} + +static int virtio_spi_transfer_one(struct spi_controller *ctrl, + struct spi_device *spi, + struct spi_transfer *xfer) +{ + struct virtio_spi_priv *priv = spi_controller_get_devdata(ctrl); + struct virtio_spi_req *spi_req __free(kfree) = NULL; + struct spi_transfer_head *th; + struct scatterlist sg_out_head, sg_out_payload; + struct scatterlist sg_in_result, sg_in_payload; + struct scatterlist *sgs[4]; + unsigned int 
outcnt = 0; + unsigned int incnt = 0; + int ret; + + spi_req = kzalloc(sizeof(*spi_req), GFP_KERNEL); + if (!spi_req) + return -ENOMEM; + + init_completion(&spi_req->completion); + + th = &spi_req->transfer_head; + + /* Fill struct spi_transfer_head */ + th->chip_select_id = spi_get_chipselect(spi, 0); + th->bits_per_word = spi->bits_per_word; + th->cs_change = xfer->cs_change; + th->tx_nbits = xfer->tx_nbits; + th->rx_nbits = xfer->rx_nbits; + th->reserved[0] = 0; + th->reserved[1] = 0; + th->reserved[2] = 0; + + static_assert(VIRTIO_SPI_CPHA == SPI_CPHA, + "VIRTIO_SPI_CPHA must match SPI_CPHA"); + static_assert(VIRTIO_SPI_CPOL == SPI_CPOL, + "VIRTIO_SPI_CPOL must match SPI_CPOL"); + static_assert(VIRTIO_SPI_CS_HIGH == SPI_CS_HIGH, + "VIRTIO_SPI_CS_HIGH must match SPI_CS_HIGH"); + static_assert(VIRTIO_SPI_MODE_LSB_FIRST == SPI_LSB_FIRST, + "VIRTIO_SPI_MODE_LSB_FIRST must match SPI_LSB_FIRST"); + + th->mode = cpu_to_le32(spi->mode & VIRTIO_SPI_MODE_MASK); + if (spi->mode & SPI_LOOP) + th->mode |= cpu_to_le32(VIRTIO_SPI_MODE_LOOP); + + th->freq = cpu_to_le32(xfer->speed_hz); + + ret = virtio_spi_set_delays(th, spi, xfer); + if (ret) + goto msg_done; + + /* Set buffers */ + spi_req->tx_buf = xfer->tx_buf; + spi_req->rx_buf = xfer->rx_buf; + + /* Prepare sending of virtio message */ + init_completion(&spi_req->completion); + + sg_init_one(&sg_out_head, th, sizeof(*th)); + sgs[outcnt] = &sg_out_head; + outcnt++; + + if (spi_req->tx_buf) { + sg_init_one(&sg_out_payload, spi_req->tx_buf, xfer->len); + sgs[outcnt] = &sg_out_payload; + outcnt++; + } + + if (spi_req->rx_buf) { + sg_init_one(&sg_in_payload, spi_req->rx_buf, xfer->len); + sgs[outcnt] = &sg_in_payload; + incnt++; + } + + sg_init_one(&sg_in_result, &spi_req->result, + sizeof(struct spi_transfer_result)); + sgs[outcnt + incnt] = &sg_in_result; + incnt++; + + ret = virtqueue_add_sgs(priv->vq, sgs, outcnt, incnt, spi_req, + GFP_KERNEL); + if (ret) + goto msg_done; + + /* Simple implementation: There can be only one transfer in flight */ + virtqueue_kick(priv->vq); + + wait_for_completion(&spi_req->completion); + + /* Read result from message and translate return code */ + switch (spi_req->result.result) { + case VIRTIO_SPI_TRANS_OK: + break; + case VIRTIO_SPI_PARAM_ERR: + ret = -EINVAL; + break; + case VIRTIO_SPI_TRANS_ERR: + ret = -EIO; + break; + default: + ret = -EIO; + break; + } + +msg_done: + if (ret) + ctrl->cur_msg->status = ret; + + return ret; +} + +static void virtio_spi_read_config(struct virtio_device *vdev) +{ + struct spi_controller *ctrl = dev_get_drvdata(&vdev->dev); + struct virtio_spi_priv *priv = vdev->priv; + u8 cs_max_number; + u8 tx_nbits_supported; + u8 rx_nbits_supported; + + cs_max_number = virtio_cread8(vdev, offsetof(struct virtio_spi_config, + cs_max_number)); + ctrl->num_chipselect = cs_max_number; + + /* Set the mode bits which are understood by this driver */ + priv->mode_func_supported = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + mode_func_supported)); + ctrl->mode_bits = priv->mode_func_supported & + (VIRTIO_SPI_CS_HIGH | VIRTIO_SPI_MODE_LSB_FIRST); + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_CPHA_1) + ctrl->mode_bits |= VIRTIO_SPI_CPHA; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_CPOL_1) + ctrl->mode_bits |= VIRTIO_SPI_CPOL; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_LSB_FIRST) + ctrl->mode_bits |= SPI_LSB_FIRST; + if (priv->mode_func_supported & VIRTIO_SPI_MF_SUPPORT_LOOPBACK) + ctrl->mode_bits |= SPI_LOOP; + tx_nbits_supported = + 
virtio_cread8(vdev, offsetof(struct virtio_spi_config, + tx_nbits_supported)); + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_DUAL) + ctrl->mode_bits |= SPI_TX_DUAL; + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_QUAD) + ctrl->mode_bits |= SPI_TX_QUAD; + if (tx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_OCTAL) + ctrl->mode_bits |= SPI_TX_OCTAL; + rx_nbits_supported = + virtio_cread8(vdev, offsetof(struct virtio_spi_config, + rx_nbits_supported)); + if (rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_DUAL) + ctrl->mode_bits |= SPI_RX_DUAL; + if (rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_QUAD) + ctrl->mode_bits |= SPI_RX_QUAD; + if (rx_nbits_supported & VIRTIO_SPI_RX_TX_SUPPORT_OCTAL) + ctrl->mode_bits |= SPI_RX_OCTAL; + + ctrl->bits_per_word_mask = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + bits_per_word_mask)); + + priv->max_freq_hz = + virtio_cread32(vdev, offsetof(struct virtio_spi_config, + max_freq_hz)); +} + +static int virtio_spi_find_vqs(struct virtio_spi_priv *priv) +{ + struct virtqueue *vq; + + vq = virtio_find_single_vq(priv->vdev, virtio_spi_msg_done, "spi-rq"); + if (IS_ERR(vq)) + return PTR_ERR(vq); + priv->vq = vq; + return 0; +} + +/* Function must not be called before virtio_spi_find_vqs() has been run */ +static void virtio_spi_del_vq(void *data) +{ + struct virtio_device *vdev = data; + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +} + +static int virtio_spi_probe(struct virtio_device *vdev) +{ + struct virtio_spi_priv *priv; + struct spi_controller *ctrl; + int ret; + + ctrl = devm_spi_alloc_host(&vdev->dev, sizeof(*priv)); + if (!ctrl) + return -ENOMEM; + + priv = spi_controller_get_devdata(ctrl); + priv->vdev = vdev; + vdev->priv = priv; + + device_set_node(&ctrl->dev, dev_fwnode(&vdev->dev)); + + dev_set_drvdata(&vdev->dev, ctrl); + + virtio_spi_read_config(vdev); + + ctrl->transfer_one = virtio_spi_transfer_one; + + ret = virtio_spi_find_vqs(priv); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot setup virtqueues\n"); + + /* Register cleanup for virtqueues using devm */ + ret = devm_add_action_or_reset(&vdev->dev, virtio_spi_del_vq, vdev); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot register virtqueue cleanup\n"); + + /* Use devm version to register controller */ + ret = devm_spi_register_controller(&vdev->dev, ctrl); + if (ret) + return dev_err_probe(&vdev->dev, ret, "Cannot register controller\n"); + + return 0; +} + +static int virtio_spi_freeze(struct device *dev) +{ + struct spi_controller *ctrl = dev_get_drvdata(dev); + struct virtio_device *vdev = dev_to_virtio(dev); + int ret; + + ret = spi_controller_suspend(ctrl); + if (ret) { + dev_warn(dev, "cannot suspend controller (%d)\n", ret); + return ret; + } + + virtio_spi_del_vq(vdev); + return 0; +} + +static int virtio_spi_restore(struct device *dev) +{ + struct spi_controller *ctrl = dev_get_drvdata(dev); + struct virtio_device *vdev = dev_to_virtio(dev); + int ret; + + ret = virtio_spi_find_vqs(vdev->priv); + if (ret) { + dev_err(dev, "problem starting vqueue (%d)\n", ret); + return ret; + } + + ret = spi_controller_resume(ctrl); + if (ret) + dev_err(dev, "problem resuming controller (%d)\n", ret); + + return ret; +} + +static struct virtio_device_id virtio_spi_id_table[] = { + { VIRTIO_ID_SPI, VIRTIO_DEV_ANY_ID }, + {} +}; +MODULE_DEVICE_TABLE(virtio, virtio_spi_id_table); + +static const struct dev_pm_ops virtio_spi_pm_ops = { + .freeze = pm_sleep_ptr(virtio_spi_freeze), + .restore = pm_sleep_ptr(virtio_spi_restore), +}; + +static struct 
virtio_driver virtio_spi_driver = { + .driver = { + .name = KBUILD_MODNAME, + .pm = &virtio_spi_pm_ops, + }, + .id_table = virtio_spi_id_table, + .probe = virtio_spi_probe, +}; +module_virtio_driver(virtio_spi_driver); + +MODULE_AUTHOR("OpenSynergy GmbH"); +MODULE_AUTHOR("Haixu Cui "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Virtio SPI bus driver"); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a388f372b27a7f..2e0647a0689029 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -427,15 +427,13 @@ static int spi_probe(struct device *dev) if (spi->irq < 0) spi->irq = 0; - ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) return ret; - if (sdrv->probe) { + if (sdrv->probe) ret = sdrv->probe(spi); - if (ret) - dev_pm_domain_detach(dev, true); - } return ret; } @@ -446,8 +444,6 @@ static void spi_remove(struct device *dev) if (sdrv->remove) sdrv->remove(to_spi_device(dev)); - - dev_pm_domain_detach(dev, true); } static void spi_shutdown(struct device *dev) @@ -590,6 +586,7 @@ struct spi_device *spi_alloc_device(struct spi_controller *ctlr) spi->dev.bus = &spi_bus_type; spi->dev.release = spidev_release; spi->mode = ctlr->buswidth_override_bits; + spi->num_chipselect = 1; device_initialize(&spi->dev); return spi; @@ -626,11 +623,6 @@ static void spi_dev_set_name(struct spi_device *spi) */ #define SPI_INVALID_CS ((s8)-1) -static inline bool is_valid_cs(s8 chip_select) -{ - return chip_select != SPI_INVALID_CS; -} - static inline int spi_dev_check_cs(struct device *dev, struct spi_device *spi, u8 idx, struct spi_device *new_spi, u8 new_idx) @@ -639,9 +631,9 @@ static inline int spi_dev_check_cs(struct device *dev, u8 idx_new; cs = spi_get_chipselect(spi, idx); - for (idx_new = new_idx; idx_new < SPI_CS_CNT_MAX; idx_new++) { + for (idx_new = new_idx; idx_new < new_spi->num_chipselect; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); - if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { + if (cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); return -EBUSY; } @@ -656,7 +648,7 @@ static int spi_dev_check(struct device *dev, void *data) int status, idx; if (spi->controller == new_spi->controller) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, new_spi, 0); if (status) return status; @@ -678,10 +670,16 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi->num_chipselect > SPI_DEVICE_CS_CNT_MAX) { + dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, + SPI_DEVICE_CS_CNT_MAX); + return -EOVERFLOW; + } + + for (idx = 0; idx < spi->num_chipselect; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); - if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { + if (cs >= ctlr->num_chipselect) { dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, idx), ctlr->num_chipselect); return -EINVAL; @@ -693,13 +691,17 @@ static int __spi_add_device(struct spi_device *spi) * For example, spi->chip_select[0] != spi->chip_select[1] and so on. 
*/ if (!spi_controller_is_target(ctlr)) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); if (status) return status; } } + /* Initialize unused logical CS as invalid */ + for (idx = spi->num_chipselect; idx < SPI_DEVICE_CS_CNT_MAX; idx++) + spi_set_chipselect(spi, idx, SPI_INVALID_CS); + /* Set the bus ID string */ spi_dev_set_name(spi); @@ -721,10 +723,9 @@ static int __spi_add_device(struct spi_device *spi) if (ctlr->cs_gpiods) { u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { cs = spi_get_chipselect(spi, idx); - if (is_valid_cs(cs)) - spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); + spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); } } @@ -777,14 +778,6 @@ int spi_add_device(struct spi_device *spi) } EXPORT_SYMBOL_GPL(spi_add_device); -static void spi_set_all_cs_unused(struct spi_device *spi) -{ - u8 idx; - - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi_set_chipselect(spi, idx, SPI_INVALID_CS); -} - /** * spi_new_device - instantiate one new SPI device * @ctlr: Controller to which device is connected @@ -820,7 +813,6 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr, WARN_ON(strlen(chip->modalias) >= sizeof(proxy->modalias)); /* Use provided chip-select for proxy device */ - spi_set_all_cs_unused(proxy); spi_set_chipselect(proxy, 0, chip->chip_select); proxy->max_speed_hz = chip->max_speed_hz; @@ -1028,7 +1020,7 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes /*-------------------------------------------------------------------------*/ #define spi_for_each_valid_cs(spi, idx) \ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) \ + for (idx = 0; idx < spi->num_chipselect; idx++) \ if (!(spi->cs_index_mask & BIT(idx))) {} else static inline bool spi_is_last_cs(struct spi_device *spi) @@ -1084,8 +1076,12 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : SPI_INVALID_CS; + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) { + if (enable && idx < spi->num_chipselect) + spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); + else + spi->controller->last_cs[idx] = SPI_INVALID_CS; + } spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; if (spi->controller->last_cs_mode_high) @@ -2358,7 +2354,7 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc, static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { - u32 value, cs[SPI_CS_CNT_MAX]; + u32 value, cs[SPI_DEVICE_CS_CNT_MAX]; int rc, idx; /* Mode (clock phase/polarity/etc.) */ @@ -2431,31 +2427,22 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, return 0; } - if (ctlr->num_chipselect > SPI_CS_CNT_MAX) { - dev_err(&ctlr->dev, "No. of CS is more than max. no. 
of supported CS\n"); - return -EINVAL; - } - - spi_set_all_cs_unused(spi); - /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); return rc; } - if (rc > ctlr->num_chipselect) { - dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n", - nc, rc); - return rc; - } + if ((of_property_present(nc, "parallel-memories")) && (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) { dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); return -EINVAL; } + + spi->num_chipselect = rc; for (idx = 0; idx < rc; idx++) spi_set_chipselect(spi, idx, cs[idx]); @@ -2580,7 +2567,6 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi, strscpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias)); /* Use provided chip-select for ancillary device */ - spi_set_all_cs_unused(ancillary); spi_set_chipselect(ancillary, 0, chip_select); /* Take over SPI mode/speed from SPI main device */ @@ -2828,7 +2814,6 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, return ERR_PTR(-ENOMEM); } - spi_set_all_cs_unused(spi); spi_set_chipselect(spi, 0, lookup.chip_select); ACPI_COMPANION_SET(&spi->dev, adev); @@ -3328,7 +3313,7 @@ int spi_register_controller(struct spi_controller *ctlr) } /* Setting last_cs to SPI_INVALID_CS means no chip selected */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) ctlr->last_cs[idx] = SPI_INVALID_CS; status = device_add(&ctlr->dev); diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c index 20d70cb0154201..49ff3bae727109 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_soc_slider.c @@ -67,7 +67,8 @@ static int slider_def_balance_set(const char *arg, const struct kernel_param *kp ret = kstrtou8(arg, 16, &slider_val); if (!ret) { - if (slider_val > SOC_SLIDER_VALUE_MAXIMUM) + if (slider_val <= slider_values[SOC_POWER_SLIDER_PERFORMANCE] || + slider_val >= slider_values[SOC_POWER_SLIDER_POWERSAVE]) return -EINVAL; slider_balanced_param = slider_val; diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index a36289e61315a2..d9ec3bf194966c 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -20,6 +20,8 @@ #include #include +#include "thermal_hwmon.h" + #define K3_VTM_DEVINFO_PWR0_OFFSET 0x4 #define K3_VTM_DEVINFO_PWR0_TEMPSENS_CT_MASK 0xf0 #define K3_VTM_TMPSENS0_CTRL_OFFSET 0x300 @@ -513,6 +515,8 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) ret = PTR_ERR(ti_thermal); goto err_free_ref_table; } + + devm_thermal_add_hwmon_sysfs(bgp->dev, ti_thermal); } platform_set_drvdata(pdev, bgp); diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index f4d1e66d7db9ef..ab55b20cda479c 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -639,7 +639,7 @@ static int lvts_sensor_init(struct device *dev, struct lvts_ctrl *lvts_ctrl, lvts_sensor[i].low_thresh = INT_MIN; lvts_sensor[i].high_thresh = INT_MIN; - }; + } lvts_ctrl->valid_sensor_mask = lvts_ctrl_data->valid_sensor_mask; diff --git a/drivers/thermal/qcom/Kconfig b/drivers/thermal/qcom/Kconfig index 
2c7f3f9a26ebbb..a6bb01082ec697 100644 --- a/drivers/thermal/qcom/Kconfig +++ b/drivers/thermal/qcom/Kconfig @@ -34,7 +34,8 @@ config QCOM_SPMI_TEMP_ALARM config QCOM_LMH tristate "Qualcomm Limits Management Hardware" - depends on ARCH_QCOM && QCOM_SCM + depends on ARCH_QCOM || COMPILE_TEST + select QCOM_SCM help This enables initialization of Qualcomm limits management hardware(LMh). LMh allows for hardware-enforced mitigation for cpus based on diff --git a/drivers/thermal/qcom/lmh.c b/drivers/thermal/qcom/lmh.c index 75eaa9a68ab8aa..ddadcfada5136c 100644 --- a/drivers/thermal/qcom/lmh.c +++ b/drivers/thermal/qcom/lmh.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #include #include #include @@ -204,7 +206,7 @@ static int lmh_probe(struct platform_device *pdev) ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_TH_LOW_THRESHOLD, temp_low, LMH_NODE_DCVS, node_id, 0); if (ret) { - dev_err(dev, "Error setting thermal ARM threshold%d\n", ret); + dev_err(dev, "Error setting thermal LOW threshold: %d\n", ret); return ret; } diff --git a/drivers/thermal/renesas/Kconfig b/drivers/thermal/renesas/Kconfig index dcf5fc5ae08e47..5735c8728a31fc 100644 --- a/drivers/thermal/renesas/Kconfig +++ b/drivers/thermal/renesas/Kconfig @@ -10,13 +10,13 @@ config RCAR_THERMAL thermal framework. config RCAR_GEN3_THERMAL - tristate "Renesas R-Car Gen3 and RZ/G2 thermal driver" + tristate "Renesas R-Car Gen3/Gen4 and RZ/G2 thermal driver" depends on ARCH_RENESAS || COMPILE_TEST depends on HAS_IOMEM depends on OF help - Enable this to plug the R-Car Gen3 or RZ/G2 thermal sensor driver into - the Linux thermal framework. + Enable this to plug the R-Car Gen3/Gen4 or RZ/G2 thermal sensor + driver into the Linux thermal framework. config RZG2L_THERMAL tristate "Renesas RZ/G2L thermal driver" @@ -26,3 +26,18 @@ config RZG2L_THERMAL help Enable this to plug the RZ/G2L thermal sensor driver into the Linux thermal framework. + +config RZG3E_THERMAL + tristate "Renesas RZ/G3E thermal driver" + depends on ARCH_RENESAS || COMPILE_TEST + help + Enable this to plug the RZ/G3E thermal sensor driver into the Linux + thermal framework. + +config RZG3S_THERMAL + tristate "Renesas RZ/G3S thermal driver" + depends on ARCH_R9A08G045 || COMPILE_TEST + depends on OF && IIO && RZG2L_ADC + help + Enable this to plug the RZ/G3S thermal sensor driver into the Linux + thermal framework.
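Editorial aside: the new RZG3S_THERMAL entry depends on IIO and RZG2L_ADC, presumably because the RZ/G3S sensor is sampled through an ADC channel consumed via the IIO consumer API rather than through a memory-mapped TSU. A sketch of how a thermal get_temp() callback can be built on that API follows; the foo_* names are illustrative and not taken from the actual rzg3s_thermal.c:

#include <linux/iio/consumer.h>
#include <linux/thermal.h>

struct foo_thermal_priv {
	struct iio_channel *channel;	/* ADC channel carrying the sensor */
};

static int foo_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
{
	struct foo_thermal_priv *priv = thermal_zone_device_priv(tz);
	int val, ret;

	/*
	 * A processed read lets the ADC driver apply its own scaling; for
	 * an IIO temperature channel the result is in millidegrees Celsius,
	 * which is also the unit the thermal core expects here.
	 */
	ret = iio_read_channel_processed(priv->channel, &val);
	if (ret < 0)
		return ret;

	*temp = val;
	return 0;
}

static const struct thermal_zone_device_ops foo_thermal_ops = {
	.get_temp = foo_thermal_get_temp,
};

In a probe path the channel would typically come from devm_iio_channel_get() and the zone from devm_thermal_of_zone_register(); both are existing kernel APIs, but whether the new driver uses exactly this shape is an assumption.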
diff --git a/drivers/thermal/renesas/Makefile b/drivers/thermal/renesas/Makefile index bf9cb3cb94d678..8f5ae9af277cab 100644 --- a/drivers/thermal/renesas/Makefile +++ b/drivers/thermal/renesas/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_RCAR_GEN3_THERMAL) += rcar_gen3_thermal.o obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o obj-$(CONFIG_RZG2L_THERMAL) += rzg2l_thermal.o +obj-$(CONFIG_RZG3E_THERMAL) += rzg3e_thermal.o +obj-$(CONFIG_RZG3S_THERMAL) += rzg3s_thermal.o diff --git a/drivers/thermal/renesas/rcar_gen3_thermal.c b/drivers/thermal/renesas/rcar_gen3_thermal.c index 24a702ee4c1fb8..3223de238d0144 100644 --- a/drivers/thermal/renesas/rcar_gen3_thermal.c +++ b/drivers/thermal/renesas/rcar_gen3_thermal.c @@ -73,11 +73,17 @@ struct rcar_gen3_thermal_fuse_info { u32 mask; }; +struct rcar_gen3_thermal_fuse_default { + u32 ptat[3]; + u32 thcodes[TSC_MAX_NUM][3]; +}; + struct rcar_thermal_info { int scale; int adj_below; int adj_above; const struct rcar_gen3_thermal_fuse_info *fuses; + const struct rcar_gen3_thermal_fuse_default *fuse_defaults; }; struct equation_set_coef { @@ -165,7 +171,7 @@ static int rcar_gen3_thermal_get_temp(struct thermal_zone_device *tz, int *temp) const struct equation_set_coef *coef; int adj, decicelsius, reg, thcode; - /* Read register and convert to mili Celsius */ + /* Read register and convert to millidegree Celsius */ reg = rcar_gen3_thermal_read(tsc, REG_GEN3_TEMP) & CTEMP_MASK; if (reg < tsc->thcode[1]) { @@ -289,6 +295,7 @@ static void rcar_gen3_thermal_fetch_fuses(struct rcar_gen3_thermal_priv *priv) static bool rcar_gen3_thermal_read_fuses(struct rcar_gen3_thermal_priv *priv) { + const struct rcar_gen3_thermal_fuse_default *fuse_defaults = priv->info->fuse_defaults; unsigned int i; u32 thscp; @@ -297,24 +304,16 @@ static bool rcar_gen3_thermal_read_fuses(struct rcar_gen3_thermal_priv *priv) if (!priv->info->fuses || (thscp & THSCP_COR_PARA_VLD) != THSCP_COR_PARA_VLD) { /* Default THCODE values in case FUSEs are not set. 
*/ - static const int thcodes[TSC_MAX_NUM][3] = { - { 3397, 2800, 2221 }, - { 3393, 2795, 2216 }, - { 3389, 2805, 2237 }, - { 3415, 2694, 2195 }, - { 3356, 2724, 2244 }, - }; - - priv->ptat[0] = 2631; - priv->ptat[1] = 1509; - priv->ptat[2] = 435; + priv->ptat[0] = fuse_defaults->ptat[0]; + priv->ptat[1] = fuse_defaults->ptat[1]; + priv->ptat[2] = fuse_defaults->ptat[2]; for (i = 0; i < priv->num_tscs; i++) { struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i]; - tsc->thcode[0] = thcodes[i][0]; - tsc->thcode[1] = thcodes[i][1]; - tsc->thcode[2] = thcodes[i][2]; + tsc->thcode[0] = fuse_defaults->thcodes[i][0]; + tsc->thcode[1] = fuse_defaults->thcodes[i][1]; + tsc->thcode[2] = fuse_defaults->thcodes[i][2]; } return false; @@ -361,11 +360,33 @@ static const struct rcar_gen3_thermal_fuse_info rcar_gen3_thermal_fuse_info_gen4 .mask = GEN4_FUSE_MASK, }; +static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_gen3 = { + .ptat = { 2631, 1509, 435 }, + .thcodes = { + { 3397, 2800, 2221 }, + { 3393, 2795, 2216 }, + { 3389, 2805, 2237 }, + { 3415, 2694, 2195 }, + { 3356, 2724, 2244 }, + }, +}; + +static const struct rcar_gen3_thermal_fuse_default rcar_gen3_thermal_fuse_default_info_gen4 = { + .ptat = { 3274, 2164, 985 }, + .thcodes = { /* All four THS units share the same trimming */ + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + { 3218, 2617, 1980 }, + } +}; + static const struct rcar_thermal_info rcar_m3w_thermal_info = { .scale = 157, .adj_below = -41, .adj_above = 116, .fuses = &rcar_gen3_thermal_fuse_info_gen3, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; static const struct rcar_thermal_info rcar_gen3_thermal_info = { @@ -373,6 +394,15 @@ static const struct rcar_thermal_info rcar_gen3_thermal_info = { .adj_below = -41, .adj_above = 126, .fuses = &rcar_gen3_thermal_fuse_info_gen3, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, +}; + +static const struct rcar_thermal_info rcar_s4_thermal_info = { + .scale = 167, + .adj_below = -41, + .adj_above = 126, + .fuses = &rcar_gen3_thermal_fuse_info_gen4, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen3, }; static const struct rcar_thermal_info rcar_gen4_thermal_info = { @@ -380,6 +410,7 @@ static const struct rcar_thermal_info rcar_gen4_thermal_info = { .adj_below = -41, .adj_above = 126, .fuses = &rcar_gen3_thermal_fuse_info_gen4, + .fuse_defaults = &rcar_gen3_thermal_fuse_default_info_gen4, }; static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { @@ -421,7 +452,7 @@ static const struct of_device_id rcar_gen3_thermal_dt_ids[] = { }, { .compatible = "renesas,r8a779f0-thermal", - .data = &rcar_gen4_thermal_info, + .data = &rcar_s4_thermal_info, }, { .compatible = "renesas,r8a779g0-thermal", diff --git a/drivers/thermal/renesas/rzg3e_thermal.c b/drivers/thermal/renesas/rzg3e_thermal.c new file mode 100644 index 00000000000000..e66d73ca675277 --- /dev/null +++ b/drivers/thermal/renesas/rzg3e_thermal.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G3E TSU Temperature Sensor Unit + * + * Copyright (C) 2025 Renesas Electronics Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../thermal_hwmon.h" + +/* TSU Register offsets and bits */ +#define TSU_SSUSR 0x00 +#define TSU_SSUSR_EN_TS BIT(0) +#define TSU_SSUSR_ADC_PD_TS BIT(1) +#define TSU_SSUSR_SOC_TS_EN BIT(2) + +#define 
TSU_STRGR 0x04 +#define TSU_STRGR_ADST BIT(0) + +#define TSU_SOSR1 0x08 +#define TSU_SOSR1_ADCT_8 0x03 +#define TSU_SOSR1_ADCS BIT(4) +#define TSU_SOSR1_OUTSEL BIT(9) + +#define TSU_SCRR 0x10 +#define TSU_SCRR_OUT12BIT_TS GENMASK(11, 0) + +#define TSU_SSR 0x14 +#define TSU_SSR_CONV BIT(0) + +#define TSU_CMSR 0x18 +#define TSU_CMSR_CMPEN BIT(0) + +#define TSU_LLSR 0x1C +#define TSU_ULSR 0x20 + +#define TSU_SISR 0x30 +#define TSU_SISR_ADF BIT(0) +#define TSU_SISR_CMPF BIT(1) + +#define TSU_SIER 0x34 +#define TSU_SIER_CMPIE BIT(1) + +#define TSU_SICR 0x38 +#define TSU_SICR_ADCLR BIT(0) +#define TSU_SICR_CMPCLR BIT(1) + +/* Temperature calculation constants from datasheet */ +#define TSU_TEMP_D (-41) +#define TSU_TEMP_E 126 +#define TSU_CODE_MAX 0xFFF + +/* Timing specifications from datasheet */ +#define TSU_POWERUP_TIME_US 120 /* 120T at 1MHz sensor clock per datasheet */ +#define TSU_CONV_TIME_US 50 /* Per sample conversion time */ +#define TSU_POLL_DELAY_US 10 /* Polling interval */ +#define TSU_MIN_CLOCK_RATE 24000000 /* TSU_PCLK minimum 24MHz */ + +/** + * struct rzg3e_thermal_priv - RZ/G3E TSU private data + * @base: TSU register base + * @dev: device pointer + * @syscon: regmap for calibration values + * @zone: thermal zone device + * @rstc: reset control + * @trmval0: calibration value 0 (b) + * @trmval1: calibration value 1 (c) + * @trim_offset: offset for trim registers in syscon + * @lock: protects hardware access during conversions + */ +struct rzg3e_thermal_priv { + void __iomem *base; + struct device *dev; + struct regmap *syscon; + struct thermal_zone_device *zone; + struct reset_control *rstc; + u16 trmval0; + u16 trmval1; + u32 trim_offset; + struct mutex lock; +}; + +static int rzg3e_thermal_power_on(struct rzg3e_thermal_priv *priv) +{ + u32 val; + int ret; + + /* Clear any pending interrupts */ + writel(TSU_SICR_ADCLR | TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Disable all interrupts during setup */ + writel(0, priv->base + TSU_SIER); + + /* + * Power-on sequence per datasheet 7.11.9.1: + * SOC_TS_EN must be set at same time or before EN_TS and ADC_PD_TS + */ + val = TSU_SSUSR_SOC_TS_EN | TSU_SSUSR_EN_TS; + writel(val, priv->base + TSU_SSUSR); + + /* Wait for sensor stabilization per datasheet 7.11.7.1 */ + usleep_range(TSU_POWERUP_TIME_US, TSU_POWERUP_TIME_US + 10); + + /* Configure for average mode with 8 samples */ + val = TSU_SOSR1_OUTSEL | TSU_SOSR1_ADCT_8; + writel(val, priv->base + TSU_SOSR1); + + /* Ensure we're in single scan mode (default) */ + val = readl(priv->base + TSU_SOSR1); + if (val & TSU_SOSR1_ADCS) { + dev_err(priv->dev, "Invalid scan mode setting\n"); + return -EINVAL; + } + + /* Wait for any ongoing conversion to complete */ + ret = readl_poll_timeout(priv->base + TSU_SSR, val, + !(val & TSU_SSR_CONV), + TSU_POLL_DELAY_US, + USEC_PER_MSEC); + if (ret) { + dev_err(priv->dev, "Timeout waiting for conversion\n"); + return ret; + } + + return 0; +} + +static void rzg3e_thermal_power_off(struct rzg3e_thermal_priv *priv) +{ + /* Disable all interrupts */ + writel(0, priv->base + TSU_SIER); + + /* Clear pending interrupts */ + writel(TSU_SICR_ADCLR | TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Power down sequence per datasheet */ + writel(TSU_SSUSR_ADC_PD_TS, priv->base + TSU_SSUSR); +} + +/* + * Convert 12-bit sensor code to temperature in millicelsius + * Formula from datasheet 7.11.7.8: + * T(°C) = ((e - d) / (c - b)) * (a - b) + d + * where: a = sensor code, b = trmval0, c = trmval1, d = -41, e = 126 + */ +static int 
rzg3e_thermal_code_to_temp(struct rzg3e_thermal_priv *priv, u16 code) +{ + int temp_e_mc = TSU_TEMP_E * MILLIDEGREE_PER_DEGREE; + int temp_d_mc = TSU_TEMP_D * MILLIDEGREE_PER_DEGREE; + s64 numerator, denominator; + int temp_mc; + + numerator = (temp_e_mc - temp_d_mc) * (s64)(code - priv->trmval0); + denominator = priv->trmval1 - priv->trmval0; + + temp_mc = div64_s64(numerator, denominator) + temp_d_mc; + + return clamp(temp_mc, temp_d_mc, temp_e_mc); +} + +/* + * Convert temperature in millicelsius to 12-bit sensor code + * Formula from datasheet 7.11.7.9 (inverse of above) + */ +static u16 rzg3e_thermal_temp_to_code(struct rzg3e_thermal_priv *priv, int temp_mc) +{ + int temp_e_mc = TSU_TEMP_E * MILLIDEGREE_PER_DEGREE; + int temp_d_mc = TSU_TEMP_D * MILLIDEGREE_PER_DEGREE; + s64 numerator, denominator; + s64 code; + + numerator = (temp_mc - temp_d_mc) * (priv->trmval1 - priv->trmval0); + denominator = temp_e_mc - temp_d_mc; + + code = div64_s64(numerator, denominator) + priv->trmval0; + + return clamp_val(code, 0, TSU_CODE_MAX); +} + +static int rzg3e_thermal_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(tz); + u32 status, code; + int ret, timeout; + + ret = pm_runtime_resume_and_get(priv->dev); + if (ret < 0) + return ret; + + guard(mutex)(&priv->lock); + + /* Clear any previous conversion status */ + writel(TSU_SICR_ADCLR, priv->base + TSU_SICR); + + /* Start single conversion */ + writel(TSU_STRGR_ADST, priv->base + TSU_STRGR); + + /* Wait for conversion completion - 8 samples at ~50us each */ + timeout = TSU_CONV_TIME_US * 8 * 2; /* Double for margin */ + ret = readl_poll_timeout(priv->base + TSU_SISR, status, + status & TSU_SISR_ADF, + TSU_POLL_DELAY_US, timeout); + if (ret) { + dev_err(priv->dev, "Conversion timeout (status=0x%08x)\n", status); + goto out; + } + + /* Read the averaged result and clear the complete flag */ + code = readl(priv->base + TSU_SCRR) & TSU_SCRR_OUT12BIT_TS; + writel(TSU_SICR_ADCLR, priv->base + TSU_SICR); + + /* Convert to temperature */ + *temp = rzg3e_thermal_code_to_temp(priv, code); + + dev_dbg(priv->dev, "temp=%d mC (%d.%03d°C), code=0x%03x\n", + *temp, *temp / 1000, abs(*temp) % 1000, code); + +out: + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_autosuspend(priv->dev); + return ret; +} + +static int rzg3e_thermal_set_trips(struct thermal_zone_device *tz, + int low, int high) +{ + struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(tz); + u16 low_code, high_code; + u32 val; + int ret; + + /* Hardware requires low < high */ + if (low >= high) + return -EINVAL; + + ret = pm_runtime_resume_and_get(priv->dev); + if (ret < 0) + return ret; + + guard(mutex)(&priv->lock); + + /* Convert temperatures to codes */ + low_code = rzg3e_thermal_temp_to_code(priv, low); + high_code = rzg3e_thermal_temp_to_code(priv, high); + + dev_dbg(priv->dev, "set_trips: low=%d high=%d (codes: 0x%03x/0x%03x)\n", + low, high, low_code, high_code); + + /* Disable comparison during reconfiguration */ + writel(0, priv->base + TSU_SIER); + writel(0, priv->base + TSU_CMSR); + + /* Clear any pending comparison interrupts */ + writel(TSU_SICR_CMPCLR, priv->base + TSU_SICR); + + /* Set trip points */ + writel(low_code, priv->base + TSU_LLSR); + writel(high_code, priv->base + TSU_ULSR); + + /* + * Ensure OUTSEL is set for comparison per datasheet 7.11.7.4 + * Comparison uses averaged data + */ + val = readl(priv->base + TSU_SOSR1); + val |= TSU_SOSR1_OUTSEL; + writel(val, priv->base + TSU_SOSR1); + + /* 
Enable comparison with "out of range" mode (CMPCOND=0) */ + writel(TSU_CMSR_CMPEN, priv->base + TSU_CMSR); + + /* Unmask compare IRQ and start a conversion to evaluate window */ + writel(TSU_SIER_CMPIE, priv->base + TSU_SIER); + writel(TSU_STRGR_ADST, priv->base + TSU_STRGR); + + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_autosuspend(priv->dev); + + return 0; +} + +static irqreturn_t rzg3e_thermal_irq_thread(int irq, void *data) +{ + struct rzg3e_thermal_priv *priv = data; + + dev_dbg(priv->dev, "Temperature threshold crossed\n"); + + /* Notify thermal framework to re-evaluate trip points */ + thermal_zone_device_update(priv->zone, THERMAL_TRIP_VIOLATED); + + return IRQ_HANDLED; +} + +static irqreturn_t rzg3e_thermal_irq(int irq, void *data) +{ + struct rzg3e_thermal_priv *priv = data; + u32 status; + + status = readl(priv->base + TSU_SISR); + + /* Check if comparison interrupt occurred */ + if (status & TSU_SISR_CMPF) { + /* Clear irq flag and disable interrupt until reconfigured */ + writel(TSU_SICR_CMPCLR, priv->base + TSU_SICR); + writel(0, priv->base + TSU_SIER); + + return IRQ_WAKE_THREAD; + } + + return IRQ_NONE; +} + +static const struct thermal_zone_device_ops rzg3e_tz_ops = { + .get_temp = rzg3e_thermal_get_temp, + .set_trips = rzg3e_thermal_set_trips, +}; + +static int rzg3e_thermal_get_calibration(struct rzg3e_thermal_priv *priv) +{ + u32 val; + int ret; + + /* Read calibration values from syscon */ + ret = regmap_read(priv->syscon, priv->trim_offset, &val); + if (ret) + return ret; + priv->trmval0 = val & GENMASK(11, 0); + + ret = regmap_read(priv->syscon, priv->trim_offset + 4, &val); + if (ret) + return ret; + priv->trmval1 = val & GENMASK(11, 0); + + /* Validate calibration data */ + if (!priv->trmval0 || !priv->trmval1 || + priv->trmval0 == priv->trmval1 || + priv->trmval0 == 0xFFF || priv->trmval1 == 0xFFF) { + dev_err(priv->dev, "Invalid calibration: b=0x%03x, c=0x%03x\n", + priv->trmval0, priv->trmval1); + return -EINVAL; + } + + dev_dbg(priv->dev, "Calibration: b=0x%03x (%u), c=0x%03x (%u)\n", + priv->trmval0, priv->trmval0, priv->trmval1, priv->trmval1); + + return 0; +} + +static int rzg3e_thermal_parse_dt(struct rzg3e_thermal_priv *priv) +{ + struct device_node *np = priv->dev->of_node; + u32 offset; + + priv->syscon = syscon_regmap_lookup_by_phandle_args(np, "renesas,tsu-trim", 1, &offset); + if (IS_ERR(priv->syscon)) + return dev_err_probe(priv->dev, PTR_ERR(priv->syscon), + "Failed to parse renesas,tsu-trim\n"); + + priv->trim_offset = offset; + return 0; +} + +static int rzg3e_thermal_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzg3e_thermal_priv *priv; + struct clk *clk; + int irq, ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + ret = devm_mutex_init(dev, &priv->lock); + if (ret) + return ret; + platform_set_drvdata(pdev, priv); + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + /* Parse device tree for trim register info */ + ret = rzg3e_thermal_parse_dt(priv); + if (ret) + return ret; + + /* Get clock to verify frequency - clock is managed by power domain */ + clk = devm_clk_get(dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(dev, PTR_ERR(clk), + "Failed to get clock\n"); + + if (clk_get_rate(clk) < TSU_MIN_CLOCK_RATE) + return dev_err_probe(dev, -EINVAL, + "Clock rate %lu Hz too low (min %u Hz)\n", + clk_get_rate(clk), TSU_MIN_CLOCK_RATE); + + priv->rstc = 
devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), + "Failed to get/deassert reset control\n"); + + /* Get calibration data */ + ret = rzg3e_thermal_get_calibration(priv); + if (ret) + return dev_err_probe(dev, ret, + "Failed to get valid calibration data\n"); + + /* Get comparison interrupt */ + irq = platform_get_irq_byname(pdev, "adcmpi"); + if (irq < 0) + return irq; + + /* Enable runtime PM */ + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_use_autosuspend(dev); + devm_pm_runtime_enable(dev); + + /* Initial hardware setup */ + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) + return dev_err_probe(dev, ret, "Runtime resume failed\n"); + + /* Register thermal zone - this will trigger DT parsing */ + priv->zone = devm_thermal_of_zone_register(dev, 0, priv, &rzg3e_tz_ops); + if (IS_ERR(priv->zone)) { + ret = PTR_ERR(priv->zone); + dev_err(dev, "Failed to register thermal zone: %d\n", ret); + goto err_pm_put; + } + + /* Request threaded IRQ for comparison interrupt */ + ret = devm_request_threaded_irq(dev, irq, rzg3e_thermal_irq, + rzg3e_thermal_irq_thread, + IRQF_ONESHOT, "rzg3e_thermal", priv); + if (ret) { + dev_err(dev, "Failed to request IRQ: %d\n", ret); + goto err_pm_put; + } + + /* Add hwmon sysfs interface */ + ret = devm_thermal_add_hwmon_sysfs(dev, priv->zone); + if (ret) + dev_warn(dev, "Failed to add hwmon sysfs attributes\n"); + + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); + + dev_info(dev, "RZ/G3E thermal sensor registered\n"); + + return 0; + +err_pm_put: + pm_runtime_put_sync(dev); + return ret; +} + +static int rzg3e_thermal_runtime_suspend(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + rzg3e_thermal_power_off(priv); + return 0; +} + +static int rzg3e_thermal_runtime_resume(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + return rzg3e_thermal_power_on(priv); +} + +static int rzg3e_thermal_suspend(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + + /* If device is active, power it off */ + if (pm_runtime_active(dev)) + rzg3e_thermal_power_off(priv); + + /* Assert reset to ensure clean state after resume */ + reset_control_assert(priv->rstc); + + return 0; +} + +static int rzg3e_thermal_resume(struct device *dev) +{ + struct rzg3e_thermal_priv *priv = dev_get_drvdata(dev); + int ret; + + /* Deassert reset */ + ret = reset_control_deassert(priv->rstc); + if (ret) { + dev_err(dev, "Failed to deassert reset: %d\n", ret); + return ret; + } + + /* If device was active before suspend, power it back on */ + if (pm_runtime_active(dev)) + return rzg3e_thermal_power_on(priv); + + return 0; +} + +static const struct dev_pm_ops rzg3e_thermal_pm_ops = { + RUNTIME_PM_OPS(rzg3e_thermal_runtime_suspend, + rzg3e_thermal_runtime_resume, NULL) + SYSTEM_SLEEP_PM_OPS(rzg3e_thermal_suspend, rzg3e_thermal_resume) +}; + +static const struct of_device_id rzg3e_thermal_dt_ids[] = { + { .compatible = "renesas,r9a09g047-tsu" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, rzg3e_thermal_dt_ids); + +static struct platform_driver rzg3e_thermal_driver = { + .driver = { + .name = "rzg3e_thermal", + .of_match_table = rzg3e_thermal_dt_ids, + .pm = pm_ptr(&rzg3e_thermal_pm_ops), + }, + .probe = rzg3e_thermal_probe, +}; +module_platform_driver(rzg3e_thermal_driver); + +MODULE_DESCRIPTION("Renesas RZ/G3E TSU Thermal Sensor Driver"); +MODULE_AUTHOR("John Madieu "); 
+MODULE_LICENSE("GPL"); diff --git a/drivers/thermal/renesas/rzg3s_thermal.c b/drivers/thermal/renesas/rzg3s_thermal.c new file mode 100644 index 00000000000000..e25e36c99a8866 --- /dev/null +++ b/drivers/thermal/renesas/rzg3s_thermal.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G3S TSU Thermal Sensor Driver + * + * Copyright (C) 2024 Renesas Electronics Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../thermal_hwmon.h" + +#define TSU_SM 0x0 +#define TSU_SM_EN BIT(0) +#define TSU_SM_OE BIT(1) +#define OTPTSUTRIM_REG(n) (0x18 + (n) * 0x4) +#define OTPTSUTRIM_EN_MASK BIT(31) +#define OTPTSUTRIM_MASK GENMASK(11, 0) + +#define TSU_READ_STEPS 8 + +/* Default calibration values, if FUSE values are missing. */ +#define SW_CALIB0_VAL 1297 +#define SW_CALIB1_VAL 751 + +#define MCELSIUS(temp) ((temp) * MILLIDEGREE_PER_DEGREE) + +/** + * struct rzg3s_thermal_priv - RZ/G3S thermal private data structure + * @base: TSU base address + * @dev: device pointer + * @tz: thermal zone pointer + * @rstc: reset control + * @channel: IIO channel to read the TSU + * @mode: current device mode + * @calib0: calibration value + * @calib1: calibration value + */ +struct rzg3s_thermal_priv { + void __iomem *base; + struct device *dev; + struct thermal_zone_device *tz; + struct reset_control *rstc; + struct iio_channel *channel; + enum thermal_device_mode mode; + u16 calib0; + u16 calib1; +}; + +static int rzg3s_thermal_get_temp(struct thermal_zone_device *tz, int *temp) +{ + struct rzg3s_thermal_priv *priv = thermal_zone_device_priv(tz); + int ts_code_ave = 0; + + if (priv->mode != THERMAL_DEVICE_ENABLED) + return -EAGAIN; + + for (u8 i = 0; i < TSU_READ_STEPS; i++) { + int ret, val; + + ret = iio_read_channel_raw(priv->channel, &val); + if (ret < 0) + return ret; + + ts_code_ave += val; + /* + * According to the HW manual (Rev.1.10, section 40.4.4 Procedure for Measuring + * the Temperature) we need to wait here at least 3us. + */ + usleep_range(5, 10); + } + + ts_code_ave = DIV_ROUND_CLOSEST(MCELSIUS(ts_code_ave), TSU_READ_STEPS); + + /* + * According to the HW manual (Rev.1.10, section 40.4.4 Procedure for Measuring the + * Temperature) the computation formula is as follows: + * + * Tj = (ts_code_ave - priv->calib1) * 165 / (priv->calib0 - priv->calib1) - 40 + * + * Convert everything to millidegrees Celsius before applying the formula to avoid + * losing precision. + */ + + *temp = div_s64((s64)(ts_code_ave - MCELSIUS(priv->calib1)) * MCELSIUS(165), + MCELSIUS(priv->calib0 - priv->calib1)) - MCELSIUS(40); + + /* Report it in millidegrees Celsius and round it up to 0.5 degrees Celsius. */ + *temp = roundup(*temp, 500); + + return 0; +} + +static void rzg3s_thermal_set_mode(struct rzg3s_thermal_priv *priv, + enum thermal_device_mode mode) +{ + struct device *dev = priv->dev; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return; + + if (mode == THERMAL_DEVICE_DISABLED) { + writel(0, priv->base + TSU_SM); + } else { + writel(TSU_SM_EN, priv->base + TSU_SM); + /* + * According to the HW manual (Rev.1.10, section 40.4.1 Procedure for + * Starting the TSU) we need to wait here 30us or more. + */ + usleep_range(30, 40); + + writel(TSU_SM_OE | TSU_SM_EN, priv->base + TSU_SM); + /* + * According to the HW manual (Rev.1.10, section 40.4.1 Procedure for + * Starting the TSU) we need to wait here 50us or more. 
+ */ + usleep_range(50, 60); + } + + pm_runtime_put_autosuspend(dev); +} + +static int rzg3s_thermal_change_mode(struct thermal_zone_device *tz, + enum thermal_device_mode mode) +{ + struct rzg3s_thermal_priv *priv = thermal_zone_device_priv(tz); + + if (priv->mode == mode) + return 0; + + rzg3s_thermal_set_mode(priv, mode); + priv->mode = mode; + + return 0; +} + +static const struct thermal_zone_device_ops rzg3s_tz_of_ops = { + .get_temp = rzg3s_thermal_get_temp, + .change_mode = rzg3s_thermal_change_mode, +}; + +static int rzg3s_thermal_read_calib(struct rzg3s_thermal_priv *priv) +{ + struct device *dev = priv->dev; + u32 val; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + val = readl(priv->base + OTPTSUTRIM_REG(0)); + if (val & OTPTSUTRIM_EN_MASK) + priv->calib0 = FIELD_GET(OTPTSUTRIM_MASK, val); + else + priv->calib0 = SW_CALIB0_VAL; + + val = readl(priv->base + OTPTSUTRIM_REG(1)); + if (val & OTPTSUTRIM_EN_MASK) + priv->calib1 = FIELD_GET(OTPTSUTRIM_MASK, val); + else + priv->calib1 = SW_CALIB1_VAL; + + pm_runtime_put_autosuspend(dev); + + return 0; +} + +static int rzg3s_thermal_probe(struct platform_device *pdev) +{ + struct rzg3s_thermal_priv *priv; + struct device *dev = &pdev->dev; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->channel = devm_iio_channel_get(dev, "tsu"); + if (IS_ERR(priv->channel)) + return dev_err_probe(dev, PTR_ERR(priv->channel), "Failed to get IIO channel!\n"); + + priv->rstc = devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), "Failed to get reset!\n"); + + priv->dev = dev; + priv->mode = THERMAL_DEVICE_DISABLED; + platform_set_drvdata(pdev, priv); + + pm_runtime_set_autosuspend_delay(dev, 300); + pm_runtime_use_autosuspend(dev); + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable runtime PM!\n"); + + ret = rzg3s_thermal_read_calib(priv); + if (ret) + return dev_err_probe(dev, ret, "Failed to read calibration data!\n"); + + priv->tz = devm_thermal_of_zone_register(dev, 0, priv, &rzg3s_tz_of_ops); + if (IS_ERR(priv->tz)) + return dev_err_probe(dev, PTR_ERR(priv->tz), "Failed to register thermal zone!\n"); + + ret = devm_thermal_add_hwmon_sysfs(dev, priv->tz); + if (ret) + return dev_err_probe(dev, ret, "Failed to add hwmon sysfs!\n"); + + return 0; +} + +static int rzg3s_thermal_suspend(struct device *dev) +{ + struct rzg3s_thermal_priv *priv = dev_get_drvdata(dev); + + rzg3s_thermal_set_mode(priv, THERMAL_DEVICE_DISABLED); + + return reset_control_assert(priv->rstc); +} + +static int rzg3s_thermal_resume(struct device *dev) +{ + struct rzg3s_thermal_priv *priv = dev_get_drvdata(dev); + int ret; + + ret = reset_control_deassert(priv->rstc); + if (ret) + return ret; + + if (priv->mode != THERMAL_DEVICE_DISABLED) + rzg3s_thermal_set_mode(priv, priv->mode); + + return 0; +} + +static const struct dev_pm_ops rzg3s_thermal_pm_ops = { + SYSTEM_SLEEP_PM_OPS(rzg3s_thermal_suspend, rzg3s_thermal_resume) +}; + +static const struct of_device_id rzg3s_thermal_dt_ids[] = { + { .compatible = "renesas,r9a08g045-tsu" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, rzg3s_thermal_dt_ids); + +static struct platform_driver rzg3s_thermal_driver = { + .driver = { + .name = "rzg3s-thermal", + .of_match_table = rzg3s_thermal_dt_ids, + .pm = 
pm_ptr(&rzg3s_thermal_pm_ops), + }, + .probe = rzg3s_thermal_probe, +}; +module_platform_driver(rzg3s_thermal_driver); + +MODULE_DESCRIPTION("Renesas RZ/G3S Thermal Sensor Unit Driver"); +MODULE_AUTHOR("Claudiu Beznea "); +MODULE_LICENSE("GPL"); diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 3beff9b6fac3ab..c49ddf70f86e7b 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -74,6 +74,7 @@ struct chip_tsadc_table { * @tshut_temp: the hardware-controlled shutdown temperature value, with no trim * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO) * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH) + * @grf_required: true, if a GRF is required for proper functionality * @initialize: SoC special initialize tsadc controller method * @irq_ack: clear the interrupt * @control: enable/disable method for the tsadc controller @@ -97,6 +98,9 @@ struct rockchip_tsadc_chip { enum tshut_mode tshut_mode; enum tshut_polarity tshut_polarity; + /* GRF availability */ + bool grf_required; + /* Chip-wide methods */ void (*initialize)(struct regmap *grf, void __iomem *reg, enum tshut_polarity p); @@ -1098,10 +1102,9 @@ static const struct rockchip_tsadc_chip px30_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* 2 channels for tsadc */ - + .grf_required = true, .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, - .initialize = rk_tsadcv4_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1109,7 +1112,6 @@ static const struct rockchip_tsadc_chip px30_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3328_code_table, .length = ARRAY_SIZE(rk3328_code_table), @@ -1122,11 +1124,10 @@ static const struct rockchip_tsadc_chip rv1108_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ - + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1134,7 +1135,6 @@ static const struct rockchip_tsadc_chip rv1108_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rv1108_table, .length = ARRAY_SIZE(rv1108_table), @@ -1147,11 +1147,10 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channel for tsadc */ - + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1159,7 +1158,6 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3228_code_table, .length = ARRAY_SIZE(rk3228_code_table), @@ -1172,11 +1170,10 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { /* cpu, gpu */ .chn_offset = 1, .chn_num = 2, /* two channels for tsadc */ - + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give 
PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv2_irq_ack, .control = rk_tsadcv2_control, @@ -1184,7 +1181,6 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3288_code_table, .length = ARRAY_SIZE(rk3288_code_table), @@ -1197,10 +1193,9 @@ static const struct rockchip_tsadc_chip rk3328_tsadc_data = { /* cpu */ .chn_offset = 0, .chn_num = 1, /* one channels for tsadc */ - + .grf_required = false, .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1208,7 +1203,6 @@ static const struct rockchip_tsadc_chip rk3328_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3328_code_table, .length = ARRAY_SIZE(rk3328_code_table), @@ -1221,11 +1215,10 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv3_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1233,7 +1226,6 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3228_code_table, .length = ARRAY_SIZE(rk3228_code_table), @@ -1246,11 +1238,10 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv2_initialize, .irq_ack = rk_tsadcv2_irq_ack, .control = rk_tsadcv2_control, @@ -1258,7 +1249,6 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3368_code_table, .length = ARRAY_SIZE(rk3368_code_table), @@ -1271,11 +1261,10 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv3_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1283,7 +1272,6 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3399_code_table, .length = ARRAY_SIZE(rk3399_code_table), @@ -1296,11 +1284,10 @@ static const struct rockchip_tsadc_chip rk3568_tsadc_data = { /* cpu, gpu */ .chn_offset = 0, .chn_num = 2, /* two channels for tsadc */ - + .grf_required = true, .tshut_mode = TSHUT_MODE_GPIO, /* 
default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, - .initialize = rk_tsadcv7_initialize, .irq_ack = rk_tsadcv3_irq_ack, .control = rk_tsadcv3_control, @@ -1308,7 +1295,6 @@ static const struct rockchip_tsadc_chip rk3568_tsadc_data = { .set_alarm_temp = rk_tsadcv2_alarm_temp, .set_tshut_temp = rk_tsadcv2_tshut_temp, .set_tshut_mode = rk_tsadcv2_tshut_mode, - .table = { .id = rk3568_code_table, .length = ARRAY_SIZE(rk3568_code_table), @@ -1321,6 +1307,7 @@ static const struct rockchip_tsadc_chip rk3576_tsadc_data = { /* top, big_core, little_core, ddr, npu, gpu */ .chn_offset = 0, .chn_num = 6, /* six channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1345,6 +1332,7 @@ static const struct rockchip_tsadc_chip rk3588_tsadc_data = { /* top, big_core0, big_core1, little_core, center, gpu, npu */ .chn_offset = 0, .chn_num = 7, /* seven channels for tsadc */ + .grf_required = false, .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */ .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ .tshut_temp = 95000, @@ -1621,12 +1609,10 @@ static int rockchip_configure_from_dt(struct device *dev, return -EINVAL; } - /* The tsadc wont to handle the error in here since some SoCs didn't - * need this property. - */ thermal->grf = syscon_regmap_lookup_by_phandle(np, "rockchip,grf"); - if (IS_ERR(thermal->grf)) - dev_warn(dev, "Missing rockchip,grf property\n"); + if (IS_ERR(thermal->grf) && thermal->chip->grf_required) + return dev_err_probe(dev, PTR_ERR(thermal->grf), + "Missing rockchip,grf property\n"); rockchip_get_trim_configuration(dev, np, thermal); diff --git a/drivers/thermal/tegra/Makefile b/drivers/thermal/tegra/Makefile index eb27d194c58358..9b3e91f7fb97bd 100644 --- a/drivers/thermal/tegra/Makefile +++ b/drivers/thermal/tegra/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TEGRA_BPMP_THERMAL) += tegra-bpmp-thermal.o obj-$(CONFIG_TEGRA30_TSENSOR) += tegra30-tsensor.o tegra-soctherm-y := soctherm.o soctherm-fuse.o +tegra-soctherm-$(CONFIG_ARCH_TEGRA_114_SOC) += tegra114-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_124_SOC) += tegra124-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_132_SOC) += tegra132-soctherm.o tegra-soctherm-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-soctherm.o diff --git a/drivers/thermal/tegra/soctherm-fuse.c b/drivers/thermal/tegra/soctherm-fuse.c index 190f95280e0b82..8d37cd8c9122b3 100644 --- a/drivers/thermal/tegra/soctherm-fuse.c +++ b/drivers/thermal/tegra/soctherm-fuse.c @@ -9,15 +9,12 @@ #include "soctherm.h" -#define NOMINAL_CALIB_FT 105 #define NOMINAL_CALIB_CP 25 #define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK 0x1fff #define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK (0x1fff << 13) #define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT 13 -#define FUSE_TSENSOR_COMMON 0x180 - /* * Tegra210: Layout of bits in FUSE_TSENSOR_COMMON: * 3 2 1 0 @@ -26,7 +23,7 @@ * | BASE_FT | BASE_CP | SHFT_FT | SHIFT_CP | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * - * Tegra12x, etc: + * Tegra124: * In chips prior to Tegra210, this fuse was incorrectly sized as 26 bits, * and didn't hold SHIFT_CP in [31:26]. Therefore these missing six bits * were obtained via the FUSE_SPARE_REALIGNMENT_REG register [5:0]. 
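The fuse layouts described above are decoded generically: tegra_calc_shared_calib() in the next hunk extracts each field with the mask/shift pairs kept in struct tegra_soctherm_fuse and sign-extends the SHIFT_* fields. A standalone sketch of that decode, with sign_extend32() re-implemented the way include/linux/bitops.h defines it; the fuse word is a made-up sample, and the field placement (SHIFT_CP in bits 15:10) follows the Tegra114 table added later in this patch:

#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel helper: "index" is the 0-based
 * position of the sign bit of the field. */
static int32_t sign_extend32(uint32_t value, int index)
{
	uint8_t shift = 31 - index;

	return (int32_t)(value << shift) >> shift;
}

int main(void)
{
	uint32_t fuse = 0x0000fc00;	/* sample word, SHIFT_CP field = 0x3f */
	uint32_t mask = 0x3f << 10;	/* Tegra114 placement, bits 15:10 */
	int32_t shifted_cp = sign_extend32((fuse & mask) >> 10, 5);

	printf("shifted_cp = %d\n", shifted_cp);	/* 0x3f sign-extends to -1 */
	/* 2 * NOMINAL_CALIB_CP + shifted_cp = 49, i.e. 24.5 degrees C,
	 * since the calibration points are kept at 0.5 degree resolution. */
	printf("actual_temp_cp = %d\n", 2 * 25 + shifted_cp);
	return 0;
}
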
@@ -44,6 +41,13 @@ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |---------------------------------------------------| SHIFT_CP | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Tegra114: Layout of bits in FUSE_TSENSOR_COMMON aka FUSE_VSENSOR_CALIB: + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHFT_FT | BASE_FT | SHIFT_CP | BASE_CP | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define CALIB_COEFFICIENT 1000000LL @@ -77,7 +81,7 @@ int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse, s32 shifted_cp, shifted_ft; int err; - err = tegra_fuse_readl(FUSE_TSENSOR_COMMON, &val); + err = tegra_fuse_readl(tfuse->fuse_common_reg, &val); if (err) return err; @@ -96,10 +100,12 @@ return err; } + shifted_cp = (val & tfuse->fuse_shift_cp_mask) >> + tfuse->fuse_shift_cp_shift; - shifted_cp = sign_extend32(val, 5); + shifted_cp = sign_extend32(shifted_cp, 5); shared->actual_temp_cp = 2 * NOMINAL_CALIB_CP + shifted_cp; - shared->actual_temp_ft = 2 * NOMINAL_CALIB_FT + shifted_ft; + shared->actual_temp_ft = 2 * tfuse->nominal_calib_ft + shifted_ft; return 0; } diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 53a5c649f4b146..5d26b52beaba7a 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -31,6 +31,7 @@ #include #include +#include #include #include "../thermal_core.h" @@ -357,6 +358,12 @@ struct soctherm_oc_irq_chip_data { static struct soctherm_oc_irq_chip_data soc_irq_cdata; +/* Ensure that TEGRA114_* and TEGRA124_* counterparts are equal */ +static_assert(TEGRA114_SOCTHERM_SENSOR_CPU == TEGRA124_SOCTHERM_SENSOR_CPU); +static_assert(TEGRA114_SOCTHERM_SENSOR_MEM == TEGRA124_SOCTHERM_SENSOR_MEM); +static_assert(TEGRA114_SOCTHERM_SENSOR_GPU == TEGRA124_SOCTHERM_SENSOR_GPU); +static_assert(TEGRA114_SOCTHERM_SENSOR_PLLX == TEGRA124_SOCTHERM_SENSOR_PLLX); + /** * ccroc_writel() - writes a value to a CCROC register * @ts: pointer to a struct tegra_soctherm @@ -2045,6 +2052,12 @@ static void soctherm_init(struct platform_device *pdev) } static const struct of_device_id tegra_soctherm_of_match[] = { +#ifdef CONFIG_ARCH_TEGRA_114_SOC + { + .compatible = "nvidia,tegra114-soctherm", + .data = &tegra114_soctherm, + }, +#endif #ifdef CONFIG_ARCH_TEGRA_124_SOC { .compatible = "nvidia,tegra124-soctherm", diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h index 70501e73d58623..aa4af9268b05d8 100644 --- a/drivers/thermal/tegra/soctherm.h +++ b/drivers/thermal/tegra/soctherm.h @@ -56,6 +56,9 @@ #define SENSOR_TEMP2_MEM_TEMP_MASK (0xffff << 16) #define SENSOR_TEMP2_PLLX_TEMP_MASK 0xffff +#define FUSE_VSENSOR_CALIB 0x08c +#define FUSE_TSENSOR_COMMON 0x180 + /** * struct tegra_tsensor_group - SOC_THERM sensor group data * @name: short name of the temperature sensor group @@ -109,9 +112,11 @@ struct tsensor_group_thermtrips { struct tegra_soctherm_fuse { u32 fuse_base_cp_mask, fuse_base_cp_shift; + u32 fuse_shift_cp_mask, fuse_shift_cp_shift; u32 fuse_base_ft_mask, fuse_base_ft_shift; u32 fuse_shift_ft_mask, fuse_shift_ft_shift; - u32 fuse_spare_realignment; + u32 fuse_common_reg, fuse_spare_realignment; + u32 nominal_calib_ft; }; struct tsensor_shared_calib { @@ -137,6 +142,10 @@ int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor, const struct tsensor_shared_calib *shared, u32 *calib); 
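Because the Tegra114 table added below is served by the shared soctherm code, which indexes sensor groups through the TEGRA124_* dt-binding constants, the static_assert() lines added to soctherm.c above pin the two constant namespaces together at build time. The same guard in self-contained form; the zero values here are illustrative stand-ins for the real dt-bindings definitions:

#include <assert.h>

/* Illustrative values; the real ones live in
 * <dt-bindings/thermal/tegra114-soctherm.h> and
 * <dt-bindings/thermal/tegra124-soctherm.h>. */
#define TEGRA114_SOCTHERM_SENSOR_CPU 0
#define TEGRA124_SOCTHERM_SENSOR_CPU 0

/* Fails the build, not the running system, if the IDs ever diverge. */
static_assert(TEGRA114_SOCTHERM_SENSOR_CPU == TEGRA124_SOCTHERM_SENSOR_CPU,
	      "shared soctherm code indexes sensor groups by these IDs");
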
+#ifdef CONFIG_ARCH_TEGRA_114_SOC +extern const struct tegra_soctherm_soc tegra114_soctherm; +#endif + #ifdef CONFIG_ARCH_TEGRA_124_SOC extern const struct tegra_soctherm_soc tegra124_soctherm; #endif diff --git a/drivers/thermal/tegra/tegra114-soctherm.c b/drivers/thermal/tegra/tegra114-soctherm.c new file mode 100644 index 00000000000000..688104f2805280 --- /dev/null +++ b/drivers/thermal/tegra/tegra114-soctherm.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, Svyatoslav Ryhel + */ + +#include +#include + +#include + +#include "soctherm.h" + +#define TEGRA114_THERMTRIP_ANY_EN_MASK (0x1 << 28) +#define TEGRA114_THERMTRIP_MEM_EN_MASK (0x1 << 27) +#define TEGRA114_THERMTRIP_GPU_EN_MASK (0x1 << 26) +#define TEGRA114_THERMTRIP_CPU_EN_MASK (0x1 << 25) +#define TEGRA114_THERMTRIP_TSENSE_EN_MASK (0x1 << 24) +#define TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK (0xff << 16) +#define TEGRA114_THERMTRIP_CPU_THRESH_MASK (0xff << 8) +#define TEGRA114_THERMTRIP_TSENSE_THRESH_MASK 0xff + +#define TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK (0xff << 17) +#define TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK (0xff << 9) + +#define TEGRA114_THRESH_GRAIN 1000 +#define TEGRA114_BPTT 8 + +static const struct tegra_tsensor_configuration tegra114_tsensor_config = { + .tall = 16300, + .tiddq_en = 1, + .ten_count = 1, + .tsample = 163, + .tsample_ate = 655, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_cpu = { + .id = TEGRA114_SOCTHERM_SENSOR_CPU, + .name = "cpu", + .sensor_temp_offset = SENSOR_TEMP1, + .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_CPU_MASK, + .pllx_hotspot_diff = 6, + .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_CPU_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_CPU_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_CPU_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_gpu = { + .id = TEGRA114_SOCTHERM_SENSOR_GPU, + .name = "gpu", + .sensor_temp_offset = SENSOR_TEMP1, + .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_GPU_MASK, + .pllx_hotspot_diff = 6, + .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_GPU_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_GPU_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_pll = { + .id = TEGRA114_SOCTHERM_SENSOR_PLLX, + .name = "pll", + .sensor_temp_offset = SENSOR_TEMP2, + .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_PLLX_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_TSENSE_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_TSENSE_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_TSENSE_MASK, 
+ .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group tegra114_tsensor_group_mem = { + .id = TEGRA114_SOCTHERM_SENSOR_MEM, + .name = "mem", + .sensor_temp_offset = SENSOR_TEMP2, + .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK, + .pdiv = 10, + .pdiv_ate = 10, + .pdiv_mask = SENSOR_PDIV_MEM_MASK, + .pllx_hotspot_diff = 0, + .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK, + .thermtrip_any_en_mask = TEGRA114_THERMTRIP_ANY_EN_MASK, + .thermtrip_enable_mask = TEGRA114_THERMTRIP_MEM_EN_MASK, + .thermtrip_threshold_mask = TEGRA114_THERMTRIP_GPUMEM_THRESH_MASK, + .thermctl_isr_mask = THERM_IRQ_MEM_MASK, + .thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM, + .thermctl_lvl0_up_thresh_mask = TEGRA114_THERMCTL_LVL0_UP_THRESH_MASK, + .thermctl_lvl0_dn_thresh_mask = TEGRA114_THERMCTL_LVL0_DN_THRESH_MASK, +}; + +static const struct tegra_tsensor_group *tegra114_tsensor_groups[] = { + &tegra114_tsensor_group_cpu, + &tegra114_tsensor_group_gpu, + &tegra114_tsensor_group_pll, + &tegra114_tsensor_group_mem, +}; + +static const struct tegra_tsensor tegra114_tsensors[] = { + { + .name = "cpu0", + .base = 0xc0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x098, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu1", + .base = 0xe0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x084, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu2", + .base = 0x100, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x088, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "cpu3", + .base = 0x120, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x12c, + .fuse_corr_alpha = 1196400, + .fuse_corr_beta = -13600000, + .group = &tegra114_tsensor_group_cpu, + }, { + .name = "mem0", + .base = 0x140, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x158, + .fuse_corr_alpha = 1000000, + .fuse_corr_beta = 0, + .group = &tegra114_tsensor_group_mem, + }, { + .name = "mem1", + .base = 0x160, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x15c, + .fuse_corr_alpha = 1000000, + .fuse_corr_beta = 0, + .group = &tegra114_tsensor_group_mem, + }, { + .name = "gpu", + .base = 0x180, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x154, + .fuse_corr_alpha = 1124500, + .fuse_corr_beta = -9793100, + .group = &tegra114_tsensor_group_gpu, + }, { + .name = "pllx", + .base = 0x1a0, + .config = &tegra114_tsensor_config, + .calib_fuse_offset = 0x160, + .fuse_corr_alpha = 1224200, + .fuse_corr_beta = -14665000, + .group = &tegra114_tsensor_group_pll, + }, +}; + +static const struct tegra_soctherm_fuse tegra114_soctherm_fuse = { + .fuse_base_cp_mask = 0x3ff, + .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f << 10, + .fuse_shift_cp_shift = 10, + .fuse_base_ft_mask = 0x7ff << 16, + .fuse_base_ft_shift = 16, + .fuse_shift_ft_mask = 0x1f << 27, + .fuse_shift_ft_shift = 27, + .fuse_common_reg = FUSE_VSENSOR_CALIB, + .fuse_spare_realignment = 0, + .nominal_calib_ft = 90, +}; + +const struct tegra_soctherm_soc tegra114_soctherm = { + .tsensors = tegra114_tsensors, + .num_tsensors = ARRAY_SIZE(tegra114_tsensors), + .ttgs = tegra114_tsensor_groups, + .num_ttgs = 
ARRAY_SIZE(tegra114_tsensor_groups), + .tfuse = &tegra114_soctherm_fuse, + .thresh_grain = TEGRA114_THRESH_GRAIN, + .bptt = TEGRA114_BPTT, + .use_ccroc = false, +}; diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c index 20ad27f4d1a161..d86acff1b234de 100644 --- a/drivers/thermal/tegra/tegra124-soctherm.c +++ b/drivers/thermal/tegra/tegra124-soctherm.c @@ -200,11 +200,15 @@ static const struct tegra_tsensor tegra124_tsensors[] = { static const struct tegra_soctherm_fuse tegra124_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff, .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 10, .fuse_base_ft_shift = 10, .fuse_shift_ft_mask = 0x1f << 21, .fuse_shift_ft_shift = 21, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0x1fc, + .nominal_calib_ft = 105, }; const struct tegra_soctherm_soc tegra124_soctherm = { diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c index b76308fdad9e26..64c0363b97171f 100644 --- a/drivers/thermal/tegra/tegra132-soctherm.c +++ b/drivers/thermal/tegra/tegra132-soctherm.c @@ -200,11 +200,15 @@ static struct tegra_tsensor tegra132_tsensors[] = { static const struct tegra_soctherm_fuse tegra132_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff, .fuse_base_cp_shift = 0, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 10, .fuse_base_ft_shift = 10, .fuse_shift_ft_mask = 0x1f << 21, .fuse_shift_ft_shift = 21, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0x1fc, + .nominal_calib_ft = 105, }; const struct tegra_soctherm_soc tegra132_soctherm = { diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c index d0ff793f18c561..f6e1493f0202ff 100644 --- a/drivers/thermal/tegra/tegra210-soctherm.c +++ b/drivers/thermal/tegra/tegra210-soctherm.c @@ -201,11 +201,15 @@ static const struct tegra_tsensor tegra210_tsensors[] = { static const struct tegra_soctherm_fuse tegra210_soctherm_fuse = { .fuse_base_cp_mask = 0x3ff << 11, .fuse_base_cp_shift = 11, + .fuse_shift_cp_mask = 0x3f, + .fuse_shift_cp_shift = 0, .fuse_base_ft_mask = 0x7ff << 21, .fuse_base_ft_shift = 21, .fuse_shift_ft_mask = 0x1f << 6, .fuse_shift_ft_shift = 6, + .fuse_common_reg = FUSE_TSENSOR_COMMON, .fuse_spare_realignment = 0, + .nominal_calib_ft = 105, }; static struct tsensor_group_thermtrips tegra210_tsensor_thermtrips[] = { diff --git a/drivers/thermal/thermal-generic-adc.c b/drivers/thermal/thermal-generic-adc.c index ee3d0aa31406cd..7c844589b153b5 100644 --- a/drivers/thermal/thermal-generic-adc.c +++ b/drivers/thermal/thermal-generic-adc.c @@ -7,6 +7,7 @@ * Author: Laxman Dewangan */ #include +#include #include #include #include @@ -73,6 +74,58 @@ static const struct thermal_zone_device_ops gadc_thermal_ops = { .get_temp = gadc_thermal_get_temp, }; +static const struct iio_chan_spec gadc_thermal_iio_channels[] = { + { + .type = IIO_TEMP, + .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED), + } +}; + +static int gadc_thermal_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct gadc_thermal_info *gtinfo = iio_priv(indio_dev); + int ret; + + switch (mask) { + case IIO_CHAN_INFO_PROCESSED: + ret = gadc_thermal_get_temp(gtinfo->tz_dev, val); + if (ret) + return ret; + + return IIO_VAL_INT; + + default: + return -EINVAL; + } +} + +static const struct iio_info gadc_thermal_iio_info = { + 
.read_raw = gadc_thermal_read_raw, +}; + +static int gadc_iio_register(struct device *dev, struct gadc_thermal_info *gti) +{ + struct gadc_thermal_info *gtinfo; + struct iio_dev *indio_dev; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*gtinfo)); + if (!indio_dev) + return -ENOMEM; + + gtinfo = iio_priv(indio_dev); + memcpy(gtinfo, gti, sizeof(*gtinfo)); + + indio_dev->name = dev_name(dev); + indio_dev->info = &gadc_thermal_iio_info; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->channels = gadc_thermal_iio_channels; + indio_dev->num_channels = ARRAY_SIZE(gadc_thermal_iio_channels); + + return devm_iio_device_register(dev, indio_dev); +} + static int gadc_thermal_read_linear_lookup_table(struct device *dev, struct gadc_thermal_info *gti) { @@ -153,7 +206,7 @@ static int gadc_thermal_probe(struct platform_device *pdev) devm_thermal_add_hwmon_sysfs(dev, gti->tz_dev); - return 0; + return gadc_iio_register(&pdev->dev, gti); } static const struct of_device_id of_adc_thermal_match[] = { diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index cd1f657f782df2..13c663a154c4e8 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -543,10 +543,10 @@ static ssize_t hvc_write(struct tty_struct *tty, const u8 *buf, size_t count) } /* - * Racy, but harmless, kick thread if there is still pending data. + * Kick thread to flush if there's still pending data + * or to wakeup the write queue. */ - if (hp->n_outbuf) - hvc_kick(); + hvc_kick(); return written; } diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 3f38fba8f6ea5d..a668e0bb26b397 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -1177,17 +1177,6 @@ static int sc16is7xx_startup(struct uart_port *port) sc16is7xx_port_write(port, SC16IS7XX_FCR_REG, SC16IS7XX_FCR_FIFO_BIT); - /* Enable EFR */ - sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, - SC16IS7XX_LCR_CONF_MODE_B); - - regcache_cache_bypass(one->regmap, true); - - /* Enable write access to enhanced features and internal clock div */ - sc16is7xx_port_update(port, SC16IS7XX_EFR_REG, - SC16IS7XX_EFR_ENABLE_BIT, - SC16IS7XX_EFR_ENABLE_BIT); - /* Enable TCR/TLR */ sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, @@ -1199,7 +1188,8 @@ static int sc16is7xx_startup(struct uart_port *port) SC16IS7XX_TCR_RX_RESUME(24) | SC16IS7XX_TCR_RX_HALT(48)); - regcache_cache_bypass(one->regmap, false); + /* Disable TCR/TLR access */ + sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, 0); /* Now, initialize the UART */ sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, SC16IS7XX_LCR_WORD_LEN_8); diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index fe457bf1e15bb4..a66b44d21fba25 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -33,7 +33,6 @@ #define CDNS_UART_MINOR 0 /* works best with devtmpfs */ #define CDNS_UART_NR_PORTS 16 #define CDNS_UART_FIFO_SIZE 64 /* FIFO size */ -#define CDNS_UART_REGISTER_SPACE 0x1000 #define TX_TIMEOUT 500000 /* Rx Trigger level */ @@ -1098,15 +1097,15 @@ static int cdns_uart_verify_port(struct uart_port *port, */ static int cdns_uart_request_port(struct uart_port *port) { - if (!request_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE, + if (!request_mem_region(port->mapbase, port->mapsize, CDNS_UART_NAME)) { return -ENOMEM; } - port->membase = ioremap(port->mapbase, CDNS_UART_REGISTER_SPACE); + port->membase = ioremap(port->mapbase, 
 					port->mapsize);
 	if (!port->membase) {
 		dev_err(port->dev, "Unable to map registers\n");
-		release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE);
+		release_mem_region(port->mapbase, port->mapsize);
 		return -ENOMEM;
 	}
 	return 0;
@@ -1121,7 +1120,7 @@ static int cdns_uart_request_port(struct uart_port *port)
  */
 static void cdns_uart_release_port(struct uart_port *port)
 {
-	release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE);
+	release_mem_region(port->mapbase, port->mapsize);
 	iounmap(port->membase);
 	port->membase = NULL;
 }
@@ -1780,6 +1779,7 @@ static int cdns_uart_probe(struct platform_device *pdev)
 	 * and triggers invocation of the config_port() entry point.
 	 */
 	port->mapbase = res->start;
+	port->mapsize = resource_size(res);
 	port->irq = irq;
 	port->dev = &pdev->dev;
 	port->uartclk = clk_get_rate(cdns_uart_data->uartclk);
diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index 1e50675772febb..cc88aaa106da30 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -243,7 +243,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
 		hwq->sqe_base_addr = dmam_alloc_coherent(hba->dev, utrdl_size,
 							 &hwq->sqe_dma_addr,
 							 GFP_KERNEL);
-		if (!hwq->sqe_dma_addr) {
+		if (!hwq->sqe_base_addr) {
 			dev_err(hba->dev, "SQE allocation failed\n");
 			return -ENOMEM;
 		}
@@ -252,7 +252,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
 		hwq->cqe_base_addr = dmam_alloc_coherent(hba->dev, cqe_size,
 							 &hwq->cqe_dma_addr,
 							 GFP_KERNEL);
-		if (!hwq->cqe_dma_addr) {
+		if (!hwq->cqe_base_addr) {
 			dev_err(hba->dev, "CQE allocation failed\n");
 			return -ENOMEM;
 		}
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index c3177034b779eb..f441958b0ef45a 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -119,11 +119,11 @@ ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf)
 	guard(mutex)(&usb_dynids_lock);
 	list_for_each_entry(dynid, &dynids->list, node)
 		if (dynid->id.bInterfaceClass != 0)
-			count += sysfs_emit_at(&buf[count], count, "%04x %04x %02x\n",
+			count += sysfs_emit_at(buf, count, "%04x %04x %02x\n",
 					       dynid->id.idVendor, dynid->id.idProduct,
 					       dynid->id.bInterfaceClass);
 		else
-			count += sysfs_emit_at(&buf[count], count, "%04x %04x\n",
+			count += sysfs_emit_at(buf, count, "%04x %04x\n",
 					       dynid->id.idVendor, dynid->id.idProduct);
 	return count;
 }
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 08a251df20c438..5246fa6af3d61c 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb,
 /* Super block */
 static const struct super_operations ffs_sb_operations = {
 	.statfs =	simple_statfs,
-	.drop_inode =	generic_delete_inode,
+	.drop_inode =	inode_just_drop,
 };
 
 struct ffs_sb_fill_data {
diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c
index 0a800ba53816a8..de16b02d857e07 100644
--- a/drivers/usb/gadget/function/f_midi2.c
+++ b/drivers/usb/gadget/function/f_midi2.c
@@ -1599,6 +1599,7 @@ static int f_midi2_create_card(struct f_midi2 *midi2)
 				strscpy(fb->info.name, ump_fb_name(b),
 					sizeof(fb->info.name));
 		}
+		snd_ump_update_group_attrs(ump);
 	}
 
 	for (i = 0; i < midi2->num_eps; i++) {
@@ -1736,9 +1737,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2,
 	case USB_SPEED_HIGH:
 		midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(512);
 		midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(512);
-		for (i = 0; i < midi2->num_eps; i++)
+		for (i = 0; i < midi2->num_eps; i++) {
 			midi2_midi2_ep_out_desc[i].wMaxPacketSize =
 				cpu_to_le16(512);
+			midi2_midi2_ep_in_desc[i].wMaxPacketSize =
+				cpu_to_le16(512);
+		}
 		fallthrough;
 	case USB_SPEED_FULL:
 		midi1_in_eps = midi2_midi1_ep_in_descs;
@@ -1747,9 +1751,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2,
 	case USB_SPEED_SUPER:
 		midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(1024);
 		midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(1024);
-		for (i = 0; i < midi2->num_eps; i++)
+		for (i = 0; i < midi2->num_eps; i++) {
 			midi2_midi2_ep_out_desc[i].wMaxPacketSize =
 				cpu_to_le16(1024);
+			midi2_midi2_ep_in_desc[i].wMaxPacketSize =
+				cpu_to_le16(1024);
+		}
 		midi1_in_eps = midi2_midi1_ep_in_ss_descs;
 		midi1_out_eps = midi2_midi1_ep_out_ss_descs;
 		break;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index b51e132b0cd2a7..13c3da49348c59 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name,
 
 static const struct super_operations gadget_fs_operations = {
 	.statfs =	simple_statfs,
-	.drop_inode =	generic_delete_inode,
+	.drop_inode =	inode_just_drop,
 };
 
 static int
diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c
index 21dbfb0b3baca8..1cefca660773c4 100644
--- a/drivers/usb/gadget/udc/dummy_hcd.c
+++ b/drivers/usb/gadget/udc/dummy_hcd.c
@@ -765,8 +765,7 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req)
 	if (!dum->driver)
 		return -ESHUTDOWN;
 
-	local_irq_save(flags);
-	spin_lock(&dum->lock);
+	spin_lock_irqsave(&dum->lock, flags);
 	list_for_each_entry(iter, &ep->queue, queue) {
 		if (&iter->req != _req)
 			continue;
@@ -776,15 +775,16 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req)
 		retval = 0;
 		break;
 	}
-	spin_unlock(&dum->lock);
 
 	if (retval == 0) {
 		dev_dbg(udc_dev(dum),
 			"dequeued req %p from %s, len %d buf %p\n",
 			req, _ep->name, _req->length, _req->buf);
+		spin_unlock(&dum->lock);
 		usb_gadget_giveback_request(_ep, _req);
+		spin_lock(&dum->lock);
 	}
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&dum->lock, flags);
 	return retval;
 }
diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c
index 06a2edb9e86ef7..63edf2d8f24501 100644
--- a/drivers/usb/host/xhci-dbgcap.c
+++ b/drivers/usb/host/xhci-dbgcap.c
@@ -101,13 +101,34 @@ static u32 xhci_dbc_populate_strings(struct dbc_str_descs *strings)
 	return string_length;
 }
 
+static void xhci_dbc_init_ep_contexts(struct xhci_dbc *dbc)
+{
+	struct xhci_ep_ctx *ep_ctx;
+	unsigned int max_burst;
+	dma_addr_t deq;
+
+	max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control));
+
+	/* Populate bulk out endpoint context: */
+	ep_ctx = dbc_bulkout_ctx(dbc);
+	deq = dbc_bulkout_enq(dbc);
+	ep_ctx->ep_info = 0;
+	ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst);
+	ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state);
+
+	/* Populate bulk in endpoint context: */
+	ep_ctx = dbc_bulkin_ctx(dbc);
+	deq = dbc_bulkin_enq(dbc);
+	ep_ctx->ep_info = 0;
+	ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst);
+	ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state);
+}
+
 static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length)
 {
 	struct dbc_info_context	*info;
-	struct xhci_ep_ctx	*ep_ctx;
 	u32			dev_info;
-	dma_addr_t		deq, dma;
-	unsigned int		max_burst;
+	dma_addr_t		dma;
 
 	if (!dbc)
 		return;
@@ -121,20 +142,8 @@ static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length)
 	info->serial = cpu_to_le64(dma + DBC_MAX_STRING_LENGTH * 3);
 	info->length = cpu_to_le32(string_length);
 
-	/* Populate bulk out endpoint context: */
-	ep_ctx = dbc_bulkout_ctx(dbc);
-	max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control));
-	deq = dbc_bulkout_enq(dbc);
-	ep_ctx->ep_info = 0;
-	ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst);
-	ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state);
-
-	/* Populate bulk in endpoint context: */
-	ep_ctx = dbc_bulkin_ctx(dbc);
-	deq = dbc_bulkin_enq(dbc);
-	ep_ctx->ep_info = 0;
-	ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst);
-	ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state);
+	/* Populate bulk in and out endpoint contexts: */
+	xhci_dbc_init_ep_contexts(dbc);
 
 	/* Set DbC context and info registers: */
 	lo_hi_writeq(dbc->ctx->dma, &dbc->regs->dccp);
@@ -436,6 +445,42 @@ dbc_alloc_ctx(struct device *dev, gfp_t flags)
 	return ctx;
 }
 
+static void xhci_dbc_ring_init(struct xhci_ring *ring)
+{
+	struct xhci_segment *seg = ring->first_seg;
+
+	/* clear all trbs on ring in case of old ring */
+	memset(seg->trbs, 0, TRB_SEGMENT_SIZE);
+
+	/* Only event ring does not use link TRB */
+	if (ring->type != TYPE_EVENT) {
+		union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1];
+
+		trb->link.segment_ptr = cpu_to_le64(ring->first_seg->dma);
+		trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK));
+	}
+	xhci_initialize_ring_info(ring);
+}
+
+static int xhci_dbc_reinit_ep_rings(struct xhci_dbc *dbc)
+{
+	struct xhci_ring *in_ring = dbc->eps[BULK_IN].ring;
+	struct xhci_ring *out_ring = dbc->eps[BULK_OUT].ring;
+
+	if (!in_ring || !out_ring || !dbc->ctx) {
+		dev_warn(dbc->dev, "Can't re-init unallocated endpoints\n");
+		return -ENODEV;
+	}
+
+	xhci_dbc_ring_init(in_ring);
+	xhci_dbc_ring_init(out_ring);
+
+	/* set ep context enqueue, dequeue, and cycle to initial values */
+	xhci_dbc_init_ep_contexts(dbc);
+
+	return 0;
+}
+
 static struct xhci_ring *
 xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags)
 {
@@ -464,15 +509,10 @@ xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags)
 
 	seg->dma = dma;
 
-	/* Only event ring does not use link TRB */
-	if (type != TYPE_EVENT) {
-		union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1];
-
-		trb->link.segment_ptr = cpu_to_le64(dma);
-		trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK));
-	}
 	INIT_LIST_HEAD(&ring->td_list);
-	xhci_initialize_ring_info(ring);
+
+	xhci_dbc_ring_init(ring);
+
 	return ring;
 dma_fail:
 	kfree(seg);
@@ -864,7 +904,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
 			dev_info(dbc->dev, "DbC cable unplugged\n");
 			dbc->state = DS_ENABLED;
 			xhci_dbc_flush_requests(dbc);
-
+			xhci_dbc_reinit_ep_rings(dbc);
 			return EVT_DISC;
 		}
 
@@ -874,7 +914,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
 			writel(portsc, &dbc->regs->portsc);
 			dbc->state = DS_ENABLED;
 			xhci_dbc_flush_requests(dbc);
-
+			xhci_dbc_reinit_ep_rings(dbc);
 			return EVT_DISC;
 		}
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 81eaad87a3d9d0..c4a6544aa10751 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -962,7 +962,7 @@ static void xhci_free_virt_devices_depth_first(struct xhci_hcd *xhci, int slot_i
 out:
 	/* we are now at a leaf device */
 	xhci_debugfs_remove_slot(xhci, slot_id);
-	xhci_free_virt_device(xhci, vdev, slot_id);
+	xhci_free_virt_device(xhci, xhci->devs[slot_id], slot_id);
 }
 
 int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id,
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index e5cd3309342364..fc869b7f803f04 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -1322,7 +1322,18 @@ static const struct usb_device_id option_ids[] = {
 	  .driver_info = NCTRL(0) | RSVD(3) },
 	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1033, 0xff),	/* Telit LE910C1-EUX (ECM) */
 	  .driver_info = NCTRL(0) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1034, 0xff),	/* Telit LE910C4-WWX (rmnet) */
+	  .driver_info = RSVD(2) },
 	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1035, 0xff) },	/* Telit LE910C4-WWX (ECM) */
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1036, 0xff) },	/* Telit LE910C4-WWX */
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1037, 0xff),	/* Telit LE910C4-WWX (rmnet) */
+	  .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1038, 0xff),	/* Telit LE910C4-WWX (rmnet) */
+	  .driver_info = NCTRL(0) | RSVD(3) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103b, 0xff),	/* Telit LE910C4-WWX */
+	  .driver_info = NCTRL(0) | NCTRL(1) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103c, 0xff),	/* Telit LE910C4-WWX */
+	  .driver_info = NCTRL(0) },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0),
 	  .driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) },
 	{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG1),
@@ -1369,6 +1380,12 @@ static const struct usb_device_id option_ids[] = {
 	  .driver_info = NCTRL(0) | RSVD(1) },
 	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff),	/* Telit FN990A (PCIe) */
 	  .driver_info = RSVD(0) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1077, 0xff),	/* Telit FN990A (rmnet + audio) */
+	  .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1078, 0xff),	/* Telit FN990A (MBIM + audio) */
+	  .driver_info = NCTRL(0) | RSVD(1) },
+	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1079, 0xff),	/* Telit FN990A (RNDIS + audio) */
+	  .driver_info = NCTRL(2) | RSVD(3) },
 	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff),	/* Telit FE990A (rmnet) */
 	  .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) },
 	{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1081, 0xff),	/* Telit FE990A (MBIM) */
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 1f6fdfaa34bf12..b2a568a5bc9b0b 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -2426,17 +2426,21 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port,
 	case ADEV_NONE:
 		break;
 	case ADEV_NOTIFY_USB_AND_QUEUE_VDM:
-		WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL));
-		typec_altmode_vdm(adev, p[0], &p[1], cnt);
+		if (rx_sop_type == TCPC_TX_SOP_PRIME) {
+			typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt);
+		} else {
+			WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL));
+			typec_altmode_vdm(adev, p[0], &p[1], cnt);
+		}
 		break;
 	case ADEV_QUEUE_VDM:
-		if (response_tx_sop_type == TCPC_TX_SOP_PRIME)
+		if (rx_sop_type == TCPC_TX_SOP_PRIME)
 			typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt);
 		else
 			typec_altmode_vdm(adev, p[0], &p[1], cnt);
 		break;
 	case ADEV_QUEUE_VDM_SEND_EXIT_MODE_ON_FAIL:
-		if (response_tx_sop_type == TCPC_TX_SOP_PRIME) {
+		if (rx_sop_type == TCPC_TX_SOP_PRIME) {
 			if (typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P,
 						    p[0], &p[1], cnt)) {
 				int svdm_version = typec_get_cable_svdm_version(
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c6508fe0d5c8e5..35ded43304319c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -765,11 +765,11 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 	int err;
 	int sent_pkts = 0;
 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
-	bool busyloop_intr;
 	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
 
 	do {
-		busyloop_intr = false;
+		bool busyloop_intr = false;
+
 		if (nvq->done_idx == VHOST_NET_BATCH)
 			vhost_tx_batch(net, nvq, sock, &msg);
 
@@ -780,10 +780,18 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 			break;
 		/* Nothing new? Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
-			/* Kicks are disabled at this point, break loop and
-			 * process any remaining batched packets. Queue will
-			 * be re-enabled afterwards.
+			/* Flush batched packets to handle pending RX
+			 * work (if busyloop_intr is set) and to avoid
+			 * unnecessary virtqueue kicks.
 			 */
+			vhost_tx_batch(net, nvq, sock, &msg);
+			if (unlikely(busyloop_intr)) {
+				vhost_poll_queue(&vq->poll);
+			} else if (unlikely(vhost_enable_notify(&net->dev,
+								vq))) {
+				vhost_disable_notify(&net->dev, vq);
+				continue;
+			}
 			break;
 		}
 
@@ -839,22 +847,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 		++nvq->done_idx;
 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
-	/* Kicks are still disabled, dispatch any remaining batched msgs. */
 	vhost_tx_batch(net, nvq, sock, &msg);
-
-	if (unlikely(busyloop_intr))
-		/* If interrupted while doing busy polling, requeue the
-		 * handler to be fair handle_rx as well as other tasks
-		 * waiting on cpu.
-		 */
-		vhost_poll_queue(&vq->poll);
-	else
-		/* All of our work has been completed; however, before
-		 * leaving the TX handler, do one last check for work,
-		 * and requeue handler if necessary. If there is no work,
-		 * queue will be reenabled.
-		 */
-		vhost_net_busy_poll_try_queue(net, vq);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -1014,7 +1007,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 }
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
-				      bool *busyloop_intr, unsigned int count)
+				      bool *busyloop_intr, unsigned int *count)
 {
 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -1024,7 +1017,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
 
 	if (!len && rvq->busyloop_timeout) {
 		/* Flush batched heads first */
-		vhost_net_signal_used(rnvq, count);
+		vhost_net_signal_used(rnvq, *count);
+		*count = 0;
 		/* Both tx vq and rx socket were polled here */
 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
 
@@ -1180,7 +1174,7 @@ static void handle_rx(struct vhost_net *net)
 
 	do {
 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
-						      &busyloop_intr, count);
+						      &busyloop_intr, &count);
 		if (!sock_len)
 			break;
 		sock_len += sock_hlen;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index abf51332a5c559..98e4f68f4e3cb6 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -2884,7 +2884,7 @@ vhost_scsi_make_tport(struct target_fabric_configfs *tf,
 check_len:
 	if (strlen(name) >= VHOST_SCSI_NAMELEN) {
 		pr_err("Emulated %s Address: %s, exceeds"
-			" max: %d\n", name, vhost_scsi_dump_proto_id(tport),
+			" max: %d\n", vhost_scsi_dump_proto_id(tport), name,
 			VHOST_SCSI_NAMELEN);
 		kfree(tport);
 		return ERR_PTR(-EINVAL);
diff --git a/drivers/video/backlight/apple_dwi_bl.c b/drivers/video/backlight/apple_dwi_bl.c
index 93bd744972d60a..ed8bf13d3f512b 100644
--- a/drivers/video/backlight/apple_dwi_bl.c
+++ b/drivers/video/backlight/apple_dwi_bl.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/drivers/video/backlight/as3711_bl.c b/drivers/video/backlight/as3711_bl.c
index 9f89eb19894e39..753160bbc3e722 100644
--- a/drivers/video/backlight/as3711_bl.c
+++ b/drivers/video/backlight/as3711_bl.c
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 9dc93c5e480b40..1e9b7e85d99a2c 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef CONFIG_PMAC_BACKLIGHT
 #include
diff --git a/drivers/video/backlight/da9052_bl.c b/drivers/video/backlight/da9052_bl.c
index f41523d78121b7..2493138febfa1d 100644
--- a/drivers/video/backlight/da9052_bl.c
+++ b/drivers/video/backlight/da9052_bl.c
@@ -9,6 +9,7 @@
 #include
 #include
+#include
 #include
 #include
diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c
index e28d2c07179894..bbb65fdaddc79e 100644
--- a/drivers/video/backlight/jornada720_bl.c
+++ b/drivers/video/backlight/jornada720_bl.c
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/drivers/video/backlight/ktd2801-backlight.c b/drivers/video/backlight/ktd2801-backlight.c
index 0489b0615cebc9..17eac1b3bce4ad 100644
--- a/drivers/video/backlight/ktd2801-backlight.c
+++ b/drivers/video/backlight/ktd2801-backlight.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/drivers/video/backlight/led_bl.c b/drivers/video/backlight/led_bl.c
index d2db157b2c290a..efc5e380669aea 100644
--- a/drivers/video/backlight/led_bl.c
+++ b/drivers/video/backlight/led_bl.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 
 struct led_bl_data {
@@ -89,7 +90,7 @@ static int led_bl_get_leds(struct device *dev,
 		return -EINVAL;
 	}
 
-	leds = devm_kzalloc(dev, sizeof(struct led_classdev *) * nb_leds,
+	leds = devm_kcalloc(dev, nb_leds, sizeof(struct led_classdev *),
 			    GFP_KERNEL);
 	if (!leds)
 		return -ENOMEM;
@@ -137,7 +138,7 @@ static int led_bl_parse_levels(struct device *dev,
 		unsigned int db;
 		u32 *levels = NULL;
 
-		levels = devm_kzalloc(dev, sizeof(u32) * num_levels,
+		levels = devm_kcalloc(dev, num_levels, sizeof(u32),
 				      GFP_KERNEL);
 		if (!levels)
 			return -ENOMEM;
diff --git a/drivers/video/backlight/lp855x_bl.c b/drivers/video/backlight/lp855x_bl.c
index 7075bfab59c4dc..d191560ce285f9 100644
--- a/drivers/video/backlight/lp855x_bl.c
+++ b/drivers/video/backlight/lp855x_bl.c
@@ -22,7 +22,7 @@
 #define LP855X_DEVICE_CTRL		0x01
 #define LP855X_EEPROM_START		0xA0
 #define LP855X_EEPROM_END		0xA7
-#define LP8556_EPROM_START		0xA0
+#define LP8556_EPROM_START		0x98
 #define LP8556_EPROM_END		0xAF
 
 /* LP8555/7 Registers */
diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c
index 372058e2612962..9337110ce6e593 100644
--- a/drivers/video/backlight/mp3309c.c
+++ b/drivers/video/backlight/mp3309c.c
@@ -222,7 +222,6 @@ static int mp3309c_parse_fwnode(struct mp3309c_chip *chip,
 		if (IS_ERR(chip->pwmd))
 			return dev_err_probe(dev, PTR_ERR(chip->pwmd),
 					     "error getting pwm data\n");
 		pdata->dimming_mode = DIMMING_PWM;
-		pwm_apply_args(chip->pwmd);
 	}
 
 	/*
@@ -353,12 +352,13 @@ static int mp3309c_probe(struct i2c_client *client)
 	chip->pdata = pdata;
 
 	/* Backlight properties */
-	memset(&props, 0, sizeof(struct backlight_properties));
-	props.brightness = pdata->default_brightness;
-	props.max_brightness = pdata->max_brightness;
-	props.scale = BACKLIGHT_SCALE_LINEAR;
-	props.type = BACKLIGHT_RAW;
-	props.power = BACKLIGHT_POWER_ON;
+	props = (typeof(props)){
+		.brightness = pdata->default_brightness,
+		.max_brightness = pdata->max_brightness,
+		.scale = BACKLIGHT_SCALE_LINEAR,
+		.type = BACKLIGHT_RAW,
+		.power = BACKLIGHT_POWER_ON,
+	};
 	chip->bl = devm_backlight_device_register(dev, "mp3309c", dev, chip,
 						  &mp3309c_bl_ops, &props);
 	if (IS_ERR(chip->bl))
diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c
index e708a060a6e46f..bfe01b9b9174c2 100644
--- a/drivers/video/backlight/rave-sp-backlight.c
+++ b/drivers/video/backlight/rave-sp-backlight.c
@@ -9,8 +9,10 @@
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 
 #define RAVE_SP_BACKLIGHT_LCD_EN	BIT(7)
diff --git a/drivers/video/backlight/rt4831-backlight.c b/drivers/video/backlight/rt4831-backlight.c
index 7ead75929a437a..26214519bfcee1 100644
--- a/drivers/video/backlight/rt4831-backlight.c
+++ b/drivers/video/backlight/rt4831-backlight.c
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 55f5731e94c318..5940e2eb92316c 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2504,7 +2504,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font,
 	unsigned charcount = font->charcount;
 	int w = font->width;
 	int h = font->height;
-	int size;
+	int size, alloc_size;
 	int i, csum;
 	u8 *new_data, *data = font->data;
 	int pitch = PITCH(font->width);
@@ -2531,9 +2531,16 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font,
 	if (fbcon_invalid_charcount(info, charcount))
 		return -EINVAL;
 
-	size = CALC_FONTSZ(h, pitch, charcount);
+	/* Check for integer overflow in font size calculation */
+	if (check_mul_overflow(h, pitch, &size) ||
+	    check_mul_overflow(size, charcount, &size))
+		return -EINVAL;
+
+	/* Check for overflow in allocation size calculation */
+	if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &alloc_size))
+		return -EINVAL;
 
-	new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, GFP_USER);
+	new_data = kmalloc(alloc_size, GFP_USER);
 	if (!new_data)
 		return -ENOMEM;
diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig
index 4404d198f3b200..94d88e5da70721 100644
--- a/drivers/virt/coco/efi_secret/Kconfig
+++ b/drivers/virt/coco/efi_secret/Kconfig
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config EFI_SECRET
 	tristate "EFI secret area securityfs support"
-	depends on EFI && X86_64
+	depends on EFI && (X86_64 || ARM64)
 	select EFI_COCO_SECRET
 	select SECURITYFS
 	help
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 0c25b2ed44eb4f..05008d937e405b 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -760,6 +760,17 @@ config MAX77620_WATCHDOG
 	  MAX77620 chips. To compile this driver as a module,
 	  choose M here: the module will be called max77620_wdt.
 
+config NCT6694_WATCHDOG
+	tristate "Nuvoton NCT6694 watchdog support"
+	depends on MFD_NCT6694
+	select WATCHDOG_CORE
+	help
+	  Say Y here to support Nuvoton NCT6694 watchdog timer
+	  functionality.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called nct6694_wdt.
+
 config IMX2_WDT
 	tristate "IMX2+ Watchdog"
 	depends on ARCH_MXC || ARCH_LAYERSCAPE || COMPILE_TEST
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index bbd4d62d2cc3bf..b680e4d3c1bc20 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -235,6 +235,7 @@ obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o
 obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o
 obj-$(CONFIG_MAX63XX_WATCHDOG) += max63xx_wdt.o
 obj-$(CONFIG_MAX77620_WATCHDOG) += max77620_wdt.o
+obj-$(CONFIG_NCT6694_WATCHDOG) += nct6694_wdt.o
 obj-$(CONFIG_ZIIRAVE_WATCHDOG) += ziirave_wdt.o
 obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o
 obj-$(CONFIG_MENF21BMC_WATCHDOG) += menf21bmc_wdt.o
diff --git a/drivers/watchdog/nct6694_wdt.c b/drivers/watchdog/nct6694_wdt.c
new file mode 100644
index 00000000000000..bc3689bd4b6bb0
--- /dev/null
+++ b/drivers/watchdog/nct6694_wdt.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Nuvoton NCT6694 WDT driver based on USB interface.
+ *
+ * Copyright (C) 2025 Nuvoton Technology Corp.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DEVICE_NAME "nct6694-wdt"
+
+#define NCT6694_DEFAULT_TIMEOUT		10
+#define NCT6694_DEFAULT_PRETIMEOUT	0
+
+#define NCT6694_WDT_MAX_DEVS		2
+
+/*
+ * USB command module type for NCT6694 WDT controller.
+ * This defines the module type used for communication with the NCT6694
+ * WDT controller over the USB interface.
+ */
+#define NCT6694_WDT_MOD			0x07
+
+/* Command 00h - WDT Setup */
+#define NCT6694_WDT_SETUP		0x00
+#define NCT6694_WDT_SETUP_SEL(idx)	(idx ? 0x01 : 0x00)
+
+/* Command 01h - WDT Command */
+#define NCT6694_WDT_COMMAND		0x01
+#define NCT6694_WDT_COMMAND_SEL(idx)	(idx ? 0x01 : 0x00)
+
+static unsigned int timeout[NCT6694_WDT_MAX_DEVS] = {
+	[0 ... (NCT6694_WDT_MAX_DEVS - 1)] = NCT6694_DEFAULT_TIMEOUT
+};
+module_param_array(timeout, int, NULL, 0644);
+MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds");
+
+static unsigned int pretimeout[NCT6694_WDT_MAX_DEVS] = {
+	[0 ... (NCT6694_WDT_MAX_DEVS - 1)] = NCT6694_DEFAULT_PRETIMEOUT
+};
+module_param_array(pretimeout, int, NULL, 0644);
+MODULE_PARM_DESC(pretimeout, "Watchdog pre-timeout in seconds");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+		 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+enum {
+	NCT6694_ACTION_NONE = 0,
+	NCT6694_ACTION_SIRQ,
+	NCT6694_ACTION_GPO,
+};
+
+struct __packed nct6694_wdt_setup {
+	__le32 pretimeout;
+	__le32 timeout;
+	u8 owner;
+	u8 scratch;
+	u8 control;
+	u8 status;
+	__le32 countdown;
+};
+
+struct __packed nct6694_wdt_cmd {
+	__le32 wdt_cmd;
+	__le32 reserved;
+};
+
+union __packed nct6694_wdt_msg {
+	struct nct6694_wdt_setup setup;
+	struct nct6694_wdt_cmd cmd;
+};
+
+struct nct6694_wdt_data {
+	struct watchdog_device wdev;
+	struct device *dev;
+	struct nct6694 *nct6694;
+	union nct6694_wdt_msg *msg;
+	unsigned char wdev_idx;
+};
+
+static int nct6694_wdt_setting(struct watchdog_device *wdev,
+			       u32 timeout_val, u8 timeout_act,
+			       u32 pretimeout_val, u8 pretimeout_act)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	struct nct6694_wdt_setup *setup = &data->msg->setup;
+	const struct nct6694_cmd_header cmd_hd = {
+		.mod = NCT6694_WDT_MOD,
+		.cmd = NCT6694_WDT_SETUP,
+		.sel = NCT6694_WDT_SETUP_SEL(data->wdev_idx),
+		.len = cpu_to_le16(sizeof(*setup))
+	};
+	unsigned int timeout_fmt, pretimeout_fmt;
+
+	if (pretimeout_val == 0)
+		pretimeout_act = NCT6694_ACTION_NONE;
+
+	timeout_fmt = (timeout_val * 1000) | (timeout_act << 24);
+	pretimeout_fmt = (pretimeout_val * 1000) | (pretimeout_act << 24);
+
+	memset(setup, 0, sizeof(*setup));
+	setup->timeout = cpu_to_le32(timeout_fmt);
+	setup->pretimeout = cpu_to_le32(pretimeout_fmt);
+
+	return nct6694_write_msg(data->nct6694, &cmd_hd, setup);
+}
+
+static int nct6694_wdt_start(struct watchdog_device *wdev)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	int ret;
+
+	ret = nct6694_wdt_setting(wdev, wdev->timeout, NCT6694_ACTION_GPO,
+				  wdev->pretimeout, NCT6694_ACTION_GPO);
+	if (ret)
+		return ret;
+
+	dev_dbg(data->dev, "Setting WDT(%d): timeout = %d, pretimeout = %d\n",
+		data->wdev_idx, wdev->timeout, wdev->pretimeout);
+
+	return ret;
+}
+
+static int nct6694_wdt_stop(struct watchdog_device *wdev)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	struct nct6694_wdt_cmd *cmd = &data->msg->cmd;
+	const struct nct6694_cmd_header cmd_hd = {
+		.mod = NCT6694_WDT_MOD,
+		.cmd = NCT6694_WDT_COMMAND,
+		.sel = NCT6694_WDT_COMMAND_SEL(data->wdev_idx),
+		.len = cpu_to_le16(sizeof(*cmd))
+	};
+
+	memcpy(&cmd->wdt_cmd, "WDTC", 4);
+	cmd->reserved = 0;
+
+	return nct6694_write_msg(data->nct6694, &cmd_hd, cmd);
+}
+
+static int nct6694_wdt_ping(struct watchdog_device *wdev)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	struct nct6694_wdt_cmd *cmd = &data->msg->cmd;
+	const struct nct6694_cmd_header cmd_hd = {
+		.mod = NCT6694_WDT_MOD,
+		.cmd = NCT6694_WDT_COMMAND,
+		.sel = NCT6694_WDT_COMMAND_SEL(data->wdev_idx),
+		.len = cpu_to_le16(sizeof(*cmd))
+	};
+
+	memcpy(&cmd->wdt_cmd, "WDTS", 4);
+	cmd->reserved = 0;
+
+	return nct6694_write_msg(data->nct6694, &cmd_hd, cmd);
+}
+
+static int nct6694_wdt_set_timeout(struct watchdog_device *wdev,
+				   unsigned int new_timeout)
+{
+	int ret;
+
+	ret = nct6694_wdt_setting(wdev, new_timeout, NCT6694_ACTION_GPO,
+				  wdev->pretimeout, NCT6694_ACTION_GPO);
+	if (ret)
+		return ret;
+
+	wdev->timeout = new_timeout;
+
+	return 0;
+}
+
+static int nct6694_wdt_set_pretimeout(struct watchdog_device *wdev,
+				      unsigned int new_pretimeout)
+{
+	int ret;
+
+	ret = nct6694_wdt_setting(wdev, wdev->timeout, NCT6694_ACTION_GPO,
+				  new_pretimeout, NCT6694_ACTION_GPO);
+	if (ret)
+		return ret;
+
+	wdev->pretimeout = new_pretimeout;
+
+	return 0;
+}
+
+static unsigned int nct6694_wdt_get_time(struct watchdog_device *wdev)
+{
+	struct nct6694_wdt_data *data = watchdog_get_drvdata(wdev);
+	struct nct6694_wdt_setup *setup = &data->msg->setup;
+	const struct nct6694_cmd_header cmd_hd = {
+		.mod = NCT6694_WDT_MOD,
+		.cmd = NCT6694_WDT_SETUP,
+		.sel = NCT6694_WDT_SETUP_SEL(data->wdev_idx),
+		.len = cpu_to_le16(sizeof(*setup))
+	};
+	unsigned int timeleft_ms;
+	int ret;
+
+	ret = nct6694_read_msg(data->nct6694, &cmd_hd, setup);
+	if (ret)
+		return 0;
+
+	timeleft_ms = le32_to_cpu(setup->countdown);
+
+	return timeleft_ms / 1000;
+}
+
+static const struct watchdog_info nct6694_wdt_info = {
+	.options = WDIOF_SETTIMEOUT |
+		   WDIOF_KEEPALIVEPING |
+		   WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
+	.identity = DEVICE_NAME,
+};
+
+static const struct watchdog_ops nct6694_wdt_ops = {
+	.owner = THIS_MODULE,
+	.start = nct6694_wdt_start,
+	.stop = nct6694_wdt_stop,
+	.set_timeout = nct6694_wdt_set_timeout,
+	.set_pretimeout = nct6694_wdt_set_pretimeout,
+	.get_timeleft = nct6694_wdt_get_time,
+	.ping = nct6694_wdt_ping,
+};
+
+static void nct6694_wdt_ida_free(void *d)
+{
+	struct nct6694_wdt_data *data = d;
+	struct nct6694 *nct6694 = data->nct6694;
+
+	ida_free(&nct6694->wdt_ida, data->wdev_idx);
+}
+
+static int nct6694_wdt_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6694 *nct6694 = dev_get_drvdata(dev->parent);
+	struct nct6694_wdt_data *data;
+	struct watchdog_device *wdev;
+	int ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->msg = devm_kzalloc(dev, sizeof(union nct6694_wdt_msg),
+				 GFP_KERNEL);
+	if (!data->msg)
+		return -ENOMEM;
+
+	data->dev = dev;
+	data->nct6694 = nct6694;
+
+	ret = ida_alloc(&nct6694->wdt_ida, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+	data->wdev_idx = ret;
+
+	ret = devm_add_action_or_reset(dev, nct6694_wdt_ida_free, data);
+	if (ret)
+		return ret;
+
+	wdev = &data->wdev;
+	wdev->info = &nct6694_wdt_info;
+	wdev->ops = &nct6694_wdt_ops;
+	wdev->timeout = timeout[data->wdev_idx];
+	wdev->pretimeout = pretimeout[data->wdev_idx];
+	if (timeout[data->wdev_idx] < pretimeout[data->wdev_idx]) {
+		dev_warn(data->dev, "pretimeout > timeout. Setting to zero\n");
+		wdev->pretimeout = 0;
+	}
+
+	wdev->min_timeout = 1;
+	wdev->max_timeout = 255;
+
+	platform_set_drvdata(pdev, data);
+
+	watchdog_set_drvdata(&data->wdev, data);
+	watchdog_set_nowayout(&data->wdev, nowayout);
+	watchdog_stop_on_reboot(&data->wdev);
+
+	return devm_watchdog_register_device(dev, &data->wdev);
+}
+
+static struct platform_driver nct6694_wdt_driver = {
+	.driver = {
+		.name = DEVICE_NAME,
+	},
+	.probe = nct6694_wdt_probe,
+};
+
+module_platform_driver(nct6694_wdt_driver);
+
+MODULE_DESCRIPTION("USB-WDT driver for NCT6694");
+MODULE_AUTHOR("Ming Yu ");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:nct6694-wdt");
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2de37dcd75566f..49c3f992639435 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -302,7 +302,7 @@ static enum bp_state reserve_additional_memory(void)
 	 * are not restored since this region is now known not to
 	 * conflict with any devices.
 	 */
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (xen_pv_domain()) {
 		unsigned long pfn, i;
 
 		pfn = PFN_DOWN(resource->start);
@@ -626,7 +626,7 @@ int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages)
 	 */
 	BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
 
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (xen_pv_domain()) {
 		ret = xen_alloc_p2m_entry(page_to_pfn(page));
 		if (ret < 0)
 			goto out_undo;
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 41309d38f78c3c..9478fae014e50f 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1314,14 +1314,17 @@ int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev,
 }
 EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
 
-static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
+static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn,
+		     bool percpu)
 {
 	struct evtchn_status status;
 	evtchn_port_t port;
-	int rc = -ENOENT;
+	bool exists = false;
 
 	memset(&status, 0, sizeof(status));
 	for (port = 0; port < xen_evtchn_max_channels(); port++) {
+		int rc;
+
 		status.dom = DOMID_SELF;
 		status.port = port;
 		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
@@ -1329,12 +1332,16 @@ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
 			continue;
 		if (status.status != EVTCHNSTAT_virq)
 			continue;
-		if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) {
+		if (status.u.virq != virq)
+			continue;
+		if (status.vcpu == xen_vcpu_nr(cpu)) {
 			*evtchn = port;
-			break;
+			return 0;
+		} else if (!percpu) {
+			exists = true;
 		}
 	}
-	return rc;
+	return exists ? -EEXIST : -ENOENT;
 }
 
 /**
@@ -1381,8 +1388,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
 		evtchn = bind_virq.port;
 	else {
 		if (ret == -EEXIST)
-			ret = find_virq(virq, cpu, &evtchn);
-		BUG_ON(ret < 0);
+			ret = find_virq(virq, cpu, &evtchn, percpu);
+		if (ret) {
+			__unbind_from_irq(info, info->irq);
+			goto out;
+		}
 	}
 
 	ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq);
@@ -1787,9 +1797,20 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
 	 * virq or IPI channel, which don't actually need to be rebound. Ignore
 	 * it, but don't do the xenlinux-level rebind in that case.
 	 */
-	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) {
+		int old_cpu = info->cpu;
+
 		bind_evtchn_to_cpu(info, tcpu, false);
 
+		if (info->type == IRQT_VIRQ) {
+			int virq = info->u.virq;
+			int irq = per_cpu(virq_to_irq, old_cpu)[virq];
+
+			per_cpu(virq_to_irq, old_cpu)[virq] = -1;
+			per_cpu(virq_to_irq, tcpu)[virq] = irq;
+		}
+	}
+
 	do_unmask(info, EVT_MASK_REASON_TEMPORARY);
 
 	return 0;
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
index 82855105ab857f..550980dd3b0bc4 100644
--- a/drivers/xen/gntdev-dmabuf.c
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -720,16 +720,15 @@ static void dmabuf_imp_release_all(struct gntdev_dmabuf_priv *priv)
 
 /* DMA buffer IOCTL support. */
 
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
 				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u)
 {
 	struct ioctl_gntdev_dmabuf_exp_from_refs op;
 	u32 *refs;
 	long ret;
 
-	if (use_ptemod) {
-		pr_debug("Cannot provide dma-buf: use_ptemode %d\n",
-			 use_ptemod);
+	if (xen_pv_domain()) {
+		pr_debug("Cannot provide dma-buf in a PV domain\n");
 		return -EINVAL;
 	}
diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h
index 3d9b9cf9d5a16a..9adf96ac74d393 100644
--- a/drivers/xen/gntdev-dmabuf.h
+++ b/drivers/xen/gntdev-dmabuf.h
@@ -18,7 +18,7 @@ struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp);
 
 void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv);
 
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
 				       struct ioctl_gntdev_dmabuf_exp_from_refs __user *u);
 
 long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1f21607656182a..91ba5078c9d956 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -73,9 +73,6 @@ module_param(limit, uint, 0644);
 MODULE_PARM_DESC(limit,
 	"Maximum number of grants that may be mapped by one mapping request");
 
-/* True in PV mode, false otherwise */
-static int use_ptemod;
-
 static void unmap_grant_pages(struct gntdev_grant_map *map,
 			      int offset, int pages);
 
@@ -163,7 +160,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
 	    NULL == add->pages     ||
 	    NULL == add->being_removed)
 		goto err;
-	if (use_ptemod) {
+	if (xen_pv_domain()) {
 		add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]),
 					       GFP_KERNEL);
 		add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]),
@@ -211,7 +208,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
 		add->grants[i].ref = INVALID_GRANT_REF;
 		add->map_ops[i].handle = INVALID_GRANT_HANDLE;
 		add->unmap_ops[i].handle = INVALID_GRANT_HANDLE;
-		if (use_ptemod) {
+		if (xen_pv_domain()) {
 			add->kmap_ops[i].handle = INVALID_GRANT_HANDLE;
 			add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE;
 		}
@@ -268,7 +265,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 	if (!refcount_dec_and_test(&map->users))
 		return;
 
-	if (map->pages && !use_ptemod) {
+	if (map->pages && !xen_pv_domain()) {
 		/*
 		 * Increment the reference count.  This ensures that the
 		 * subsequent call to unmap_grant_pages() will not wind up
@@ -298,7 +295,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
 		 */
 	}
 
-	if (use_ptemod && map->notifier_init)
+	if (xen_pv_domain() && map->notifier_init)
 		mmu_interval_notifier_remove(&map->notifier);
 
 	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
@@ -334,7 +331,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 	size_t alloced = 0;
 	int i, err = 0;
 
-	if (!use_ptemod) {
+	if (!xen_pv_domain()) {
 		/* Note: it could already be mapped */
 		if (map->map_ops[0].handle != INVALID_GRANT_HANDLE)
 			return 0;
@@ -389,7 +386,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 		if (map->flags & GNTMAP_device_map)
 			map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr;
 
-		if (use_ptemod) {
+		if (xen_pv_domain()) {
 			if (map->kmap_ops[i].status == GNTST_okay) {
 				alloced++;
 				map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
@@ -421,7 +418,7 @@ static void __unmap_grant_pages_done(int result,
 			map->unmap_ops[offset+i].handle,
 			map->unmap_ops[offset+i].status);
 		map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
-		if (use_ptemod) {
+		if (xen_pv_domain()) {
 			if (map->kunmap_ops[offset + i].status == GNTST_okay &&
 			    map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
 				successful_unmaps++;
@@ -464,7 +461,7 @@ static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
 	}
 
 	map->unmap_data.unmap_ops = map->unmap_ops + offset;
-	map->unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
+	map->unmap_data.kunmap_ops = xen_pv_domain() ? map->kunmap_ops + offset : NULL;
 	map->unmap_data.pages = map->pages + offset;
 	map->unmap_data.count = pages;
 	map->unmap_data.done = __unmap_grant_pages_done;
@@ -1039,7 +1036,7 @@ static long gntdev_ioctl(struct file *flip,
 
 #ifdef CONFIG_XEN_GNTDEV_DMABUF
 	case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS:
-		return gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr);
+		return gntdev_ioctl_dmabuf_exp_from_refs(priv, ptr);
 
 	case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED:
 		return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr);
@@ -1086,7 +1083,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 
 	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP);
 
-	if (use_ptemod)
+	if (xen_pv_domain())
 		vm_flags_set(vma, VM_DONTCOPY);
 
 	vma->vm_private_data = map;
@@ -1102,7 +1099,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 
 	map->pages_vm_start = vma->vm_start;
 
-	if (use_ptemod) {
+	if (xen_pv_domain()) {
 		err = mmu_interval_notifier_insert_locked(
 			&map->notifier, vma->vm_mm, vma->vm_start,
 			vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
@@ -1113,7 +1110,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	}
 	mutex_unlock(&priv->lock);
 
-	if (use_ptemod) {
+	if (xen_pv_domain()) {
 		/*
 		 * gntdev takes the address of the PTE in find_grant_ptes() and
 		 * passes it to the hypervisor in gntdev_map_grant_pages(). The
@@ -1139,7 +1136,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	if (err)
 		goto out_put_map;
 
-	if (!use_ptemod) {
+	if (!xen_pv_domain()) {
 		err = vm_map_pages_zero(vma, map->pages, map->count);
 		if (err)
 			goto out_put_map;
@@ -1154,7 +1151,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 out_unlock_put:
 	mutex_unlock(&priv->lock);
 out_put_map:
-	if (use_ptemod)
+	if (xen_pv_domain())
 		unmap_grant_pages(map, 0, map->count);
 	gntdev_put_map(priv, map);
 	return err;
@@ -1183,8 +1180,6 @@ static int __init gntdev_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);
-
 	err = misc_register(&gntdev_miscdev);
 	if (err != 0) {
 		pr_err("Could not register gntdev device\n");
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 04a6b470b15dfb..478d2ad725ac6b 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1449,7 +1449,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 	unsigned int nr_gframes = end_idx + 1;
 	int rc;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_pv_domain()) {
 		struct xen_add_to_physmap xatp;
 		unsigned int i = end_idx;
 		rc = 0;
@@ -1570,7 +1570,7 @@ static int gnttab_setup(void)
 	if (max_nr_gframes < nr_grant_frames)
 		return -ENOSYS;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) {
+	if (!xen_pv_domain() && gnttab_shared.addr == NULL) {
 		gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;
 		if (gnttab_shared.addr == NULL) {
 			pr_warn("gnttab share frames is not mapped!\n");
@@ -1588,7 +1588,7 @@ int gnttab_resume(void)
 
 int gnttab_suspend(void)
 {
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
+	if (xen_pv_domain())
 		gnttab_interface->unmap_frames();
 	return 0;
 }
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 841afa4933c7a6..e20c40a62e64e2 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -95,10 +96,16 @@ static void do_suspend(void)
 
 	shutting_down = SHUTDOWN_SUSPEND;
 
+	if (!mutex_trylock(&system_transition_mutex))
+	{
+		pr_err("%s: failed to take system_transition_mutex\n", __func__);
+		goto out;
+	}
+
 	err = freeze_processes();
 	if (err) {
 		pr_err("%s: freeze processes failed %d\n", __func__, err);
-		goto out;
+		goto out_unlock;
 	}
 
 	err = freeze_kernel_threads();
@@ -110,7 +117,7 @@ static void do_suspend(void)
 	err = dpm_suspend_start(PMSG_FREEZE);
 	if (err) {
 		pr_err("%s: dpm_suspend_start %d\n", __func__, err);
-		goto out_thaw;
+		goto out_resume_end;
 	}
 
 	printk(KERN_DEBUG "suspending xenstore...\n");
@@ -150,10 +157,13 @@ static void do_suspend(void)
 	else
 		xs_suspend_cancel();
 
+out_resume_end:
 	dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
 
 out_thaw:
 	thaw_processes();
+out_unlock:
+	mutex_unlock(&system_transition_mutex);
 out:
 	shutting_down = SHUTDOWN_INVALID;
 }
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 13a10f3294a80d..f52a457b302d9c 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -271,7 +271,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
 	struct mmap_gfn_state state;
 
 	/* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */
-	if (xen_feature(XENFEAT_auto_translated_physmap))
+	if (!xen_pv_domain())
 		return -ENOSYS;
 
 	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
@@ -353,7 +353,7 @@ static int mmap_batch_fn(void *data, int nr, void *state)
 	struct page **cur_pages = NULL;
 	int ret;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap))
+	if (!xen_pv_domain())
 		cur_pages = &pages[st->index];
 
 	BUG_ON(nr < 0);
@@ -535,7 +535,7 @@ static long privcmd_ioctl_mmap_batch(
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_pv_domain()) {
 		ret = alloc_empty_pages(vma, nr_pages);
 		if (ret < 0)
 			goto out_unlock;
@@ -779,8 +779,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
 		goto out;
 	}
 
-	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
-	    xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) {
 		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
 		struct page **pages;
 		unsigned int i;
@@ -811,8 +810,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
 	if (rc)
 		goto out;
 
-	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
-	    xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) {
 		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
 	} else {
 		unsigned int domid =
@@ -1591,7 +1589,7 @@ static void privcmd_close(struct vm_area_struct *vma)
 	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
 	int rc;
 
-	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
+	if (xen_pv_domain() || !numpgs || !pages)
 		return;
 
 	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c
index a39f2d36dd9cfc..d6fc2aefe2646b 100644
--- a/drivers/xen/unpopulated-alloc.c
+++ b/drivers/xen/unpopulated-alloc.c
@@ -105,7 +105,7 @@ static int fill_list(unsigned int nr_pages)
 	 * are not restored since this region is now known not to
 	 * conflict with any devices.
 	 */
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (xen_pv_domain()) {
 		xen_pfn_t pfn = PFN_DOWN(res->start);
 
 		for (i = 0; i < alloc_pages; i++) {
@@ -184,7 +184,7 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
 		pages[i] = pg;
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
-		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		if (xen_pv_domain()) {
 			ret = xen_alloc_p2m_entry(page_to_pfn(pg));
 			if (ret < 0) {
 				unsigned int j;
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index e73ec225d4a61d..2dc874fb550665 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -955,7 +955,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = {
 void __init xenbus_ring_ops_init(void)
 {
 #ifdef CONFIG_XEN_PV
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
+	if (xen_pv_domain())
 		ring_ops = &ring_ops_pv;
 	else
 #endif
diff --git a/drivers/zorro/names.c b/drivers/zorro/names.c
index 077114ccc84073..b44f90989a66c5 100644
--- a/drivers/zorro/names.c
+++ b/drivers/zorro/names.c
@@ -36,21 +36,21 @@ struct zorro_manuf_info {
  * real memory.. Parse the same file multiple times
  * to get all the info.
  */
-#define MANUF( manuf, name ) static char __manufstr_##manuf[] __initdata = name;
+#define MANUF(manuf, name) static char __manufstr_##manuf[] __initdata = name;
 #define ENDMANUF()
-#define PRODUCT( manuf, prod, name ) static char __prodstr_##manuf##prod[] __initdata = name;
+#define PRODUCT(manuf, prod, name) static char __prodstr_##manuf##prod[] __initdata = name;
 #include "devlist.h"
 
-#define MANUF( manuf, name ) static struct zorro_prod_info __prods_##manuf[] __initdata = {
+#define MANUF(manuf, name) static struct zorro_prod_info __prods_##manuf[] __initdata = {
 #define ENDMANUF() };
-#define PRODUCT( manuf, prod, name ) { 0x##prod, 0, __prodstr_##manuf##prod },
+#define PRODUCT(manuf, prod, name) { 0x##prod, 0, __prodstr_##manuf##prod },
 #include "devlist.h"
 
 static struct zorro_manuf_info __initdata zorro_manuf_list[] = {
-#define MANUF( manuf, name ) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf },
+#define MANUF(manuf, name) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf },
 #define ENDMANUF()
-#define PRODUCT( manuf, prod, name )
+#define PRODUCT(manuf, prod, name)
 #include "devlist.h"
 };
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 795c6388744cdb..1581ebac5bb423 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode)
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		return generic_drop_inode(inode);
+		return inode_generic_drop(inode);
 	/*
 	 * in case of non cached mode always drop the
 	 * inode because we want the inode attribute
diff --git a/fs/Kconfig b/fs/Kconfig
index c654a364289700..7815379032dacb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 
 endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bd2f530e574086..1949e25c7741b1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST
 	  This builds the exec KUnit tests, which tests boundary conditions
 	  of various aspects of the exec internals.
 
+config ARCH_HAS_ELF_CORE_EFLAGS
+	bool
+	depends on BINFMT_ELF && ELF_CORE
+	default n
+	help
+	  Select this option if the architecture makes use of the e_flags
+	  field in the ELF header to store ABI or other architecture-specific
+	  information that should be preserved in core dumps.
+
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 334654f9584b94..e3523ab2e58713 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
-obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 69e1dd55b16010..894d2bad6b6cec 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
 	list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
 		if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
 			afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
-			queue_work(system_unbound_wq, &vnode->cb_work);
+			queue_work(system_dfl_wq, &vnode->cb_work);
 		}
 	}
 
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 		if (reason != afs_cb_break_for_deleted &&
 		    vnode->status.type == AFS_FTYPE_FILE &&
 		    atomic_read(&vnode->cb_nr_mmap))
-			queue_work(system_unbound_wq, &vnode->cb_work);
+			queue_work(system_dfl_wq, &vnode->cb_work);
 
 		trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
 	} else {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index bfb69e0666728b..89d36e3e5c7999 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1823,7 +1823,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 
 static void afs_rename_success(struct afs_operation *op)
 {
-	struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+	struct afs_vnode *vnode = op->more_files[0].vnode;
+	struct afs_vnode *new_vnode = op->more_files[1].vnode;
 
 	_enter("op=%08x", op->debug_id);
 
@@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op)
 		op->ctime = op->file[1].scb.status.mtime_client;
 		afs_vnode_commit_status(op, &op->file[1]);
 	}
+	if (op->more_files[0].scb.have_status)
+		afs_vnode_commit_status(op, &op->more_files[0]);
+	if (op->more_files[1].scb.have_status)
+		afs_vnode_commit_status(op, &op->more_files[1]);
 
 	/* If we're moving a subdir between dirs, we need to update
 	 * its DV counter too as the ".." will be altered.
 	 */
-	if (S_ISDIR(vnode->netfs.inode.i_mode) &&
-	    op->file[0].vnode != op->file[1].vnode) {
-		u64 new_dv;
+	if (op->file[0].vnode != op->file[1].vnode) {
+		if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+			u64 new_dv;
 
-		write_seqlock(&vnode->cb_lock);
+			write_seqlock(&vnode->cb_lock);
 
-		new_dv = vnode->status.data_version + 1;
-		trace_afs_set_dv(vnode, new_dv);
-		vnode->status.data_version = new_dv;
-		inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+			new_dv = vnode->status.data_version + 1;
+			trace_afs_set_dv(vnode, new_dv);
+			vnode->status.data_version = new_dv;
+			inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
 
-		write_sequnlock(&vnode->cb_lock);
+			write_sequnlock(&vnode->cb_lock);
+		}
+
+		if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+		    S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+			u64 new_dv;
+
+			write_seqlock(&new_vnode->cb_lock);
+
+			new_dv = new_vnode->status.data_version + 1;
+			new_vnode->status.data_version = new_dv;
+			inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+			write_sequnlock(&new_vnode->cb_lock);
+		}
 	}
 }
 
@@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 	if (S_ISDIR(vnode->netfs.inode.i_mode) &&
 	    new_dvnode != orig_dvnode &&
 	    test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-		afs_edit_dir_update_dotdot(vnode, new_dvnode,
-					   afs_edit_dir_for_rename_sub);
+		afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+				    afs_edit_dir_for_rename_sub);
 
 	new_inode = d_inode(new_dentry);
 	if (new_inode) {
@@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 
 	/* Now we can update d_fsdata on the dentries to reflect their
 	 * new parent's data_version.
-	 *
-	 * Note that if we ever implement RENAME_EXCHANGE, we'll have
-	 * to update both dentries with opposing dir versions.
 	 */
 	afs_update_dentry_version(op, new_dvp, op->dentry);
 	afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 		fscache_end_operation(&new_cres);
 }
 
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+	struct afs_vnode_param *orig_dvp = &op->file[0];
+	struct afs_vnode_param *new_dvp = &op->file[1];
+	struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+	struct afs_vnode *new_dvnode = new_dvp->vnode;
+	struct afs_vnode *old_vnode = op->more_files[0].vnode;
+	struct afs_vnode *new_vnode = op->more_files[1].vnode;
+	struct dentry *old_dentry = op->dentry;
+	struct dentry *new_dentry = op->dentry_2;
+
+	_enter("op=%08x", op->debug_id);
+
+	if (new_dvnode == orig_dvnode) {
+		down_write(&orig_dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+		    orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+			afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+					    new_vnode, afs_edit_dir_for_rename_0);
+			afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+					    old_vnode, afs_edit_dir_for_rename_1);
+		}
+
+		d_exchange(old_dentry, new_dentry);
+		up_write(&orig_dvnode->validate_lock);
+	} else {
+		down_write(&orig_dvnode->validate_lock);
+		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+		    orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+			afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+					    new_vnode, afs_edit_dir_for_rename_0);
+
+		up_write(&orig_dvnode->validate_lock);
+		down_write(&new_dvnode->validate_lock);
+
+		if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+		    new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+			afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+					    old_vnode, afs_edit_dir_for_rename_1);
+
+		if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+		    test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+			afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+					    afs_edit_dir_for_rename_sub);
+
+		if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+		    test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+			afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+					    afs_edit_dir_for_rename_sub);
+
+		/* Now we can update d_fsdata on the dentries to reflect their
+		 * new parents' data_version.
+		 */
+		afs_update_dentry_version(op, new_dvp, old_dentry);
+		afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+		d_exchange(old_dentry, new_dentry);
+		up_write(&new_dvnode->validate_lock);
+	}
+}
+
 static void afs_rename_put(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
@@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
 	.put		= afs_rename_put,
 };
 
+#if 0 /* Autoswitched in yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_replace,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_edit_dir,
+	.put		= afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_noreplace,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_edit_dir,
+	.put		= afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+	.issue_afs_rpc	= NULL,
+	.issue_yfs_rpc	= yfs_fs_rename_exchange,
+	.success	= afs_rename_success,
+	.edit_dir	= afs_rename_exchange_edit_dir,
+	.put		= afs_rename_put,
+};
+
 /*
  * rename a file in an AFS filesystem and/or move it between directories
  */
@@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct afs_operation *op;
-	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
 	int ret;
 
-	if (flags)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
 	/* Don't allow silly-rename files be moved around. */
@@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	vnode = AFS_FS_I(d_inode(old_dentry));
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
+	if (d_is_positive(new_dentry))
+		new_vnode = AFS_FS_I(d_inode(new_dentry));
 
 	_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
 	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	if (ret < 0)
 		goto error;
 
+	ret = -ENOMEM;
+	op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+	if (!op->more_files)
+		goto error;
+
 	afs_op_set_vnode(op, 0, orig_dvnode);
 	afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
 	op->file[0].dv_delta = 1;
@@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	op->file[1].modification = true;
 	op->file[0].update_ctime = true;
 	op->file[1].update_ctime = true;
+	op->more_files[0].vnode = vnode;
+	op->more_files[0].speculative = true;
+	op->more_files[1].vnode = new_vnode;
+	op->more_files[1].speculative = true;
+	op->nr_files = 4;
 
 	op->dentry		= old_dentry;
 	op->dentry_2		= new_dentry;
+	op->rename.rename_flags	= flags;
 	op->rename.new_negative	= d_is_negative(new_dentry);
-	op->ops			= &afs_rename_operation;
 
-	/* For non-directories, check whether the target is busy and if so,
-	 * make a copy of the dentry and then do a silly-rename. If the
-	 * silly-rename succeeds, the copied dentry is hashed and becomes the
-	 * new target.
-	 */
-	if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
-		/* To prevent any new references to the target during the
-		 * rename, we unhash the dentry in advance.
+	if (flags & RENAME_NOREPLACE) {
+		op->ops = &afs_rename_noreplace_operation;
+	} else if (flags & RENAME_EXCHANGE) {
+		op->ops = &afs_rename_exchange_operation;
+		d_drop(new_dentry);
+	} else {
+		/* If we might displace the target, we might need to do silly
+		 * rename.
 		 */
-		if (!d_unhashed(new_dentry)) {
-			d_drop(new_dentry);
-			op->rename.rehash = new_dentry;
-		}
+		op->ops = &afs_rename_operation;
 
-		if (d_count(new_dentry) > 2) {
-			/* copy the target dentry's name */
-			op->rename.tmp = d_alloc(new_dentry->d_parent,
-						 &new_dentry->d_name);
-			if (!op->rename.tmp) {
-				afs_op_nomem(op);
-				goto error;
+		/* For non-directories, check whether the target is busy and if
+		 * so, make a copy of the dentry and then do a silly-rename.
+		 * If the silly-rename succeeds, the copied dentry is hashed
+		 * and becomes the new target.
+		 */
+		if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+			/* To prevent any new references to the target during
+			 * the rename, we unhash the dentry in advance.
+ */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - afs_op_set_error(op, ret); - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, d_drop(old_dentry); ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; out: afs_dir_unuse_cookie(orig_dvnode, ret); if (new_dvnode != orig_dvnode) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 60a549f1d9c5fd..4b1342c72089ac 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -522,11 +522,11 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } /* - * Edit a subdirectory that has been moved between directories to update the - * ".." entry. + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. */ -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why) +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; @@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; - slot = afs_dir_scan_block(block, &dotdot_name, b); + slot = afs_dir_scan_block(block, name, b); if (slot >= 0) goto found_dirent; @@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. 
*/ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; @@ -576,7 +576,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d de->u.unique = htonl(new_dvnode->fid.unique); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, - ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); kunmap_local(block); netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -589,12 +589,12 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d already_invalidated: kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0b80eb93fa40b8..014495d4b8684d 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f8484d..e1cb17b8579139 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1124ea4000cb1b..444a3ea4fdf65f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -562,6 +562,7 @@ struct afs_server { #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -891,9 +892,10 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; @@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping, extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum 
afs_edit_dir_reason); -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* @@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 02475d415d885e..e6bb8237db989a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -169,13 +169,13 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - afs_wq = alloc_workqueue("afs", 0, 0); + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; - afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!afs_lock_manager) goto error_lockmgr; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 8f2b3a17769082..c8a7f266080d92 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code) case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c4654..b2f06c1917c2e2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a1c24f589d9e13..6a4e7da10fc495 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op) afs_op_set_error(op, -EDQUOT); goto failed_but_online; + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. 
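+		 * The server aborted because it doesn't implement the RPC
+		 * (RXGEN_OPCODE) or refuses the operation
+		 * (RX_INVALID_OPERATION).  Record -ENOTSUPP and, if the op's
+		 * done handler flagged it as downgradable (see
+		 * yfs_done_fs_rename_replace()), clear the flag and go round
+		 * again so that the call can be reissued using the older
+		 * equivalent operation.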
*/ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: afs_op_accumulate_error(op, error, abort_code); failed_but_online: @@ -620,12 +630,13 @@ bool afs_select_fileserver(struct afs_operation *op) op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); - op->volsync.creation = TIME64_MIN; - op->volsync.update = TIME64_MIN; - op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; diff --git a/fs/afs/server.c b/fs/afs/server.c index a97562f831eb5a..c4428ebddb1da6 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate, void afs_put_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - unsigned int a, debug_id = server->debug_id; + unsigned int a, debug_id; bool zero; int r; if (!server) return; + debug_id = server->debug_id; a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); diff --git a/fs/afs/write.c b/fs/afs/write.c index 2e7526ea883ae2..93ad86ff33453f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work) void afs_issue_write(struct netfs_io_subrequest *subreq) { subreq->work.func = afs_issue_write_worker; - if (!queue_work(system_unbound_wq, &subreq->work)) + if (!queue_work(system_dfl_wq, &subreq->work)) WARN_ON_ONCE(1); } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 257af259c04a6b..febf13a49f0bf6 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1070,6 +1073,252 @@ void yfs_fs_rename(struct afs_operation *op) afs_make_op_call(op, call, GFP_NOFS); } +/* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. 
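+ *
+ * The reply carries, in order: the status of the source directory, the FID
+ * and status of the vnode that was moved, the status of the destination
+ * directory, the FID and status of the vnode that was displaced (or
+ * exchanged) and finally the volume sync record - matching the decode
+ * sequence below.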
+ */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. 
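+	 * (The encoded request must exactly fill the buffer sized in the
+	 * afs_alloc_flat_call() call above; yfs_check_req() verifies the
+	 * marshalled length.)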
*/ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files or directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + /* * YFS.StoreData64 operation type.
*/ diff --git a/fs/aio.c b/fs/aio.c index 7fc7b6221312c3..6002617f078c6f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref) /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); - queue_rcu_work(system_wq, &ctx->free_rwork); + queue_rcu_work(system_percpu_wq, &ctx->free_rwork); } /* diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig deleted file mode 100644 index 8cb2b9d5da96f3..00000000000000 --- a/fs/bcachefs/Kconfig +++ /dev/null @@ -1,121 +0,0 @@ - -config BCACHEFS_FS - tristate "bcachefs filesystem support (EXPERIMENTAL)" - depends on BLOCK - select EXPORTFS - select CLOSURES - select CRC32 - select CRC64 - select FS_POSIX_ACL - select LZ4_COMPRESS - select LZ4_DECOMPRESS - select LZ4HC_COMPRESS - select LZ4HC_DECOMPRESS - select ZLIB_DEFLATE - select ZLIB_INFLATE - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS - select CRYPTO_LIB_SHA256 - select CRYPTO_LIB_CHACHA - select CRYPTO_LIB_POLY1305 - select KEYS - select RAID6_PQ - select XOR_BLOCKS - select XXHASH - select SRCU - select SYMBOLIC_ERRNAME - select MIN_HEAP - select XARRAY_MULTI - help - The bcachefs filesystem - a modern, copy on write filesystem, with - support for multiple devices, compression, checksumming, etc. - -config BCACHEFS_QUOTA - bool "bcachefs quota support" - depends on BCACHEFS_FS - select QUOTACTL - -config BCACHEFS_ERASURE_CODING - bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" - depends on BCACHEFS_FS - select QUOTACTL - help - This enables the "erasure_code" filesysystem and inode option, which - organizes data into reed-solomon stripes instead of ordinary - replication. - - WARNING: this feature is still undergoing on disk format changes, and - should only be enabled for testing purposes. - -config BCACHEFS_POSIX_ACL - bool "bcachefs POSIX ACL support" - depends on BCACHEFS_FS - select FS_POSIX_ACL - -config BCACHEFS_DEBUG - bool "bcachefs debugging" - depends on BCACHEFS_FS - help - Enables many extra debugging checks and assertions. - - The resulting code will be significantly slower than normal; you - probably shouldn't select this option unless you're a developer. - -config BCACHEFS_INJECT_TRANSACTION_RESTARTS - bool "Randomly inject transaction restarts" - depends on BCACHEFS_DEBUG - help - Randomly inject transaction restarts in a few core paths - may have a - significant performance penalty - -config BCACHEFS_TESTS - bool "bcachefs unit and performance tests" - depends on BCACHEFS_FS - help - Include some unit and performance tests for the core btree code - -config BCACHEFS_LOCK_TIME_STATS - bool "bcachefs lock time statistics" - depends on BCACHEFS_FS - help - Expose statistics for how long we held a lock in debugfs - -config BCACHEFS_NO_LATENCY_ACCT - bool "disable latency accounting and time stats" - depends on BCACHEFS_FS - help - This disables device latency tracking and time stats, only for performance testing - -config BCACHEFS_SIX_OPTIMISTIC_SPIN - bool "Optimistic spinning for six locks" - depends on BCACHEFS_FS - depends on SMP - default y - help - Instead of immediately sleeping when attempting to take a six lock that - is held by another thread, spin for a short while, as long as the - thread owning the lock is running. 
- -config BCACHEFS_PATH_TRACEPOINTS - bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS && TRACING - help - Enable extra tracepoints for debugging btree_path operations; we don't - normally want these enabled because they happen at very high rates. - -config BCACHEFS_TRANS_KMALLOC_TRACE - bool "Trace bch2_trans_kmalloc() calls" - depends on BCACHEFS_FS - -config BCACHEFS_ASYNC_OBJECT_LISTS - bool "Keep async objects on fast_lists for debugfs visibility" - depends on BCACHEFS_FS && DEBUG_FS - -config MEAN_AND_VARIANCE_UNIT_TEST - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS - depends on KUNIT - depends on BCACHEFS_FS - default KUNIT_ALL_TESTS - help - This option enables the kunit tests for mean_and_variance module. - If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile deleted file mode 100644 index 93c8ee5425c8dd..00000000000000 --- a/fs/bcachefs/Makefile +++ /dev/null @@ -1,107 +0,0 @@ - -obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o - -bcachefs-y := \ - acl.o \ - alloc_background.o \ - alloc_foreground.o \ - backpointers.o \ - bkey.o \ - bkey_methods.o \ - bkey_sort.o \ - bset.o \ - btree_cache.o \ - btree_gc.o \ - btree_io.o \ - btree_iter.o \ - btree_journal_iter.o \ - btree_key_cache.o \ - btree_locking.o \ - btree_node_scan.o \ - btree_trans_commit.o \ - btree_update.o \ - btree_update_interior.o \ - btree_write_buffer.o \ - buckets.o \ - buckets_waiting_for_journal.o \ - chardev.o \ - checksum.o \ - clock.o \ - compress.o \ - darray.o \ - data_update.o \ - debug.o \ - dirent.o \ - disk_accounting.o \ - disk_groups.o \ - ec.o \ - enumerated_ref.o \ - errcode.o \ - error.o \ - extents.o \ - extent_update.o \ - eytzinger.o \ - fast_list.o \ - fs.o \ - fs-ioctl.o \ - fs-io.o \ - fs-io-buffered.o \ - fs-io-direct.o \ - fs-io-pagecache.o \ - fsck.o \ - inode.o \ - io_read.o \ - io_misc.o \ - io_write.o \ - journal.o \ - journal_io.o \ - journal_reclaim.o \ - journal_sb.o \ - journal_seq_blacklist.o \ - keylist.o \ - logged_ops.o \ - lru.o \ - mean_and_variance.o \ - migrate.o \ - move.o \ - movinggc.o \ - namei.o \ - nocow_locking.o \ - opts.o \ - printbuf.o \ - progress.o \ - quota.o \ - rebalance.o \ - rcu_pending.o \ - recovery.o \ - recovery_passes.o \ - reflink.o \ - replicas.o \ - sb-clean.o \ - sb-counters.o \ - sb-downgrade.o \ - sb-errors.o \ - sb-members.o \ - siphash.o \ - six.o \ - snapshot.o \ - str_hash.o \ - subvolume.o \ - super.o \ - super-io.o \ - sysfs.o \ - tests.o \ - time_stats.o \ - thread_with_file.o \ - trace.o \ - two_state_shared_lock.o \ - util.o \ - varint.o \ - xattr.o - -bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o - -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o - -# Silence "note: xyz changed in GCC X.X" messages -subdir-ccflags-y += $(call cc-disable-warning, psabi) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c deleted file mode 100644 index d03adc36100eb8..00000000000000 --- a/fs/bcachefs/acl.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" - -#include "acl.h" -#include "xattr.h" - -#include - -static const char * const acl_types[] = { - [ACL_USER_OBJ] = "user_obj", - [ACL_USER] = "user", - [ACL_GROUP_OBJ] = "group_obj", - [ACL_GROUP] = "group", - [ACL_MASK] = "mask", - [ACL_OTHER] = "other", - NULL, -}; - -void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) -{ - const void *p, *end = value + size; - - if (!value || - size < sizeof(bch_acl_header) || - ((bch_acl_header *)value)->a_version != 
cpu_to_le32(BCH_ACL_VERSION)) - return; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - unsigned tag = le16_to_cpu(in->e_tag); - - prt_str(out, acl_types[tag]); - - switch (tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - prt_printf(out, " %o", le16_to_cpu(in->e_perm)); - - if (p != end) - prt_char(out, ' '); - } -} - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -#include "fs.h" - -#include -#include -#include -#include - -static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -{ - return sizeof(bch_acl_header) + - sizeof(bch_acl_entry_short) * nr_short + - sizeof(bch_acl_entry) * nr_long; -} - -static inline int acl_to_xattr_type(int type) -{ - switch (type) { - case ACL_TYPE_ACCESS: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; - case ACL_TYPE_DEFAULT: - return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; - default: - BUG(); - } -} - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, - const void *value, size_t size) -{ - const void *p, *end = value + size; - struct posix_acl *acl; - struct posix_acl_entry *out; - unsigned count = 0; - int ret; - - if (!value) - return NULL; - if (size < sizeof(bch_acl_header)) - goto invalid; - if (((bch_acl_header *)value)->a_version != - cpu_to_le32(BCH_ACL_VERSION)) - goto invalid; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *entry = p; - - if (p + sizeof(bch_acl_entry_short) > end) - goto invalid; - - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - case ACL_GROUP: - p += sizeof(bch_acl_entry); - break; - default: - goto invalid; - } - - count++; - } - - if (p > end) - goto invalid; - - if (!count) - return NULL; - - acl = allocate_dropping_locks(trans, ret, - posix_acl_alloc(count, _gfp)); - if (!acl) - return ERR_PTR(-ENOMEM); - if (ret) { - kfree(acl); - return ERR_PTR(ret); - } - - out = acl->a_entries; - - p = value + sizeof(bch_acl_header); - while (p < end) { - const bch_acl_entry *in = p; - - out->e_tag = le16_to_cpu(in->e_tag); - out->e_perm = le16_to_cpu(in->e_perm); - - switch (out->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - p += sizeof(bch_acl_entry_short); - break; - case ACL_USER: - out->e_uid = make_kuid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - out->e_gid = make_kgid(&init_user_ns, - le32_to_cpu(in->e_id)); - p += sizeof(bch_acl_entry); - break; - } - - out++; - } - - BUG_ON(out != acl->a_entries + acl->a_count); - - return acl; -invalid: - pr_err("invalid acl entry"); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. 
- */ -static struct bkey_i_xattr * -bch2_acl_to_xattr(struct btree_trans *trans, - const struct posix_acl *acl, - int type) -{ - struct bkey_i_xattr *xattr; - bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e, *pe; - void *outptr; - unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - switch (acl_e->e_tag) { - case ACL_USER: - case ACL_GROUP: - nr_long++; - break; - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - nr_short++; - break; - default: - return ERR_PTR(-EINVAL); - } - } - - acl_len = bch2_acl_size(nr_short, nr_long); - u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); - - if (u64s > U8_MAX) - return ERR_PTR(-E2BIG); - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return xattr; - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = acl_to_xattr_type(type); - xattr->v.x_name_len = 0; - xattr->v.x_val_len = cpu_to_le16(acl_len); - - acl_header = xattr_val(&xattr->v); - acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); - - outptr = (void *) acl_header + sizeof(*acl_header); - - FOREACH_ACL_ENTRY(acl_e, acl, pe) { - bch_acl_entry *entry = outptr; - - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch (acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - outptr += sizeof(bch_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - outptr += sizeof(bch_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - outptr += sizeof(bch_acl_entry_short); - break; - } - } - - BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); - - return xattr; -} - -struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = {}; - struct posix_acl *acl = NULL; - - if (rcu) - return ERR_PTR(-ECHILD); - - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; - - if (!IS_ERR_OR_NULL(acl)) - set_cached_acl(&inode->v, type, acl); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return acl; -} - -int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); - int ret; - - if (type == ACL_TYPE_DEFAULT && - !S_ISDIR(inode_u->bi_mode)) - return acl ? 
-EACCES : 0; - - if (acl) { - struct bkey_i_xattr *xattr = - bch2_acl_to_xattr(trans, acl, type); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, - inum, &xattr->k_i, 0); - } else { - struct xattr_search_key search = - X_SEARCH(acl_to_xattr_type(type), "", 0); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, - inum, &search); - } - - return bch2_err_matches(ret, ENOENT) ? 0 : ret; -} - -int bch2_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - struct posix_acl *_acl, int type) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl; - umode_t mode; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - acl = _acl; - - ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - mode = inode_u.bi_mode; - - if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); - if (ret) - goto btree_err; - - inode_u.bi_ctime = bch2_current_time(c); - inode_u.bi_mode = mode; - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_CTIME|ATTR_MODE); - - set_cached_acl(&inode->v, type, acl); -err: - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - - return ret; -} - -int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); - struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); - struct btree_iter iter; - struct posix_acl *acl = NULL; - - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - ret = PTR_ERR_OR_ZERO(acl); - if (ret) - goto err; - - ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); - if (ret) - goto err; - - struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - new->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); - *new_acl = acl; - acl = NULL; -err: - bch2_trans_iter_exit(trans, &iter); - if (!IS_ERR_OR_NULL(acl)) - kfree(acl); - return ret; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h deleted file mode 100644 index fe730a6bf0c18c..00000000000000 --- a/fs/bcachefs/acl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ACL_H -#define _BCACHEFS_ACL_H - -struct bch_inode_unpacked; -struct bch_hash_info; -struct bch_inode_info; -struct posix_acl; - -#define BCH_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} bch_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} bch_acl_entry_short; - -typedef struct { - __le32 a_version; -} bch_acl_header; - -void bch2_acl_to_text(struct printbuf *, const void *, size_t); - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - -struct posix_acl *bch2_get_acl(struct inode *, int, bool); - -int bch2_set_acl_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct posix_acl *, int); -int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - umode_t, struct posix_acl **); - -#else - -static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - struct posix_acl *acl, int type) -{ - return 0; -} - -static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode, - umode_t mode, - struct posix_acl **new_acl) -{ - return 0; -} - -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ - -#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c deleted file mode 100644 index 66de463186209c..00000000000000 --- a/fs/bcachefs/alloc_background.c +++ /dev/null @@ -1,2680 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_accounting.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" -#include "varint.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); - -/* Persistent alloc info: */ - -static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bkey_alloc_unpacked { - u64 journal_seq; - u8 gen; - u8 oldest_gen; - u8 data_type; - bool need_discard:1; - bool 
need_inc_gen:1; -#define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS_V2() -#undef x -}; - -static inline u64 alloc_field_v1_get(const struct bch_alloc *a, - const void **p, unsigned field) -{ - unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; - u64 v; - - if (!(a->fields & (1 << field))) - return 0; - - switch (bytes) { - case 1: - v = *((const u8 *) *p); - break; - case 2: - v = le16_to_cpup(*p); - break; - case 4: - v = le32_to_cpup(*p); - break; - case 8: - v = le64_to_cpup(*p); - break; - default: - BUG(); - } - - *p += bytes; - return v; -} - -static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; - const void *d = in->data; - unsigned idx = 0; - - out->gen = in->gen; - -#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); - BCH_ALLOC_FIELDS_V1() -#undef x -} - -static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, - struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); - const u8 *in = a.v->data; - const u8 *end = bkey_val_end(a); - unsigned fieldnr = 0; - int ret; - u64 v; - - out->gen = a.v->gen; - out->oldest_gen = a.v->oldest_gen; - out->data_type = a.v->data_type; - out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); - out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); - out->journal_seq = le64_to_cpu(a.v->journal_seq); - -#define x(_name, _bits) \ - if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v = 0; \ - } \ - out->_name = v; \ - if (v != out->_name) \ - return -1; \ - fieldnr++; - - BCH_ALLOC_FIELDS_V2() -#undef x - return 0; -} - -static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -{ - struct bkey_alloc_unpacked ret = { .gen = 0 }; - - switch (k.k->type) { - case KEY_TYPE_alloc: - bch2_alloc_unpack_v1(&ret, k); - break; - case KEY_TYPE_alloc_v2: - bch2_alloc_unpack_v2(&ret, k); - break; - case KEY_TYPE_alloc_v3: - bch2_alloc_unpack_v3(&ret, k); - break; - } - - return ret; -} - -static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) -{ - unsigned i, bytes = offsetof(struct bch_alloc, data); - - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) - if (a->fields & (1 << i)) - bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - int ret = 0; - - /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), - c, alloc_v1_val_size_bad, - "incorrect value size (%zu < %u)", - bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); -fsck_err: - return ret; -} - -int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - 
struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), - c, alloc_v2_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_alloc_unpacked u; - int ret = 0; - - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v3_unpack_error, - "unpack error"); -fsck_err: - return ret; -} - -int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_alloc_v4 a; - int ret = 0; - - bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); - - bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), - c, alloc_v4_val_size_bad, - "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && - BCH_ALLOC_V4_NR_BACKPOINTERS(&a), - c, alloc_v4_backpointers_start_bad, - "invalid backpointers_start"); - - bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, - c, alloc_key_data_type_bad, - "invalid data type (got %u should be %u)", - a.data_type, alloc_data_type(a, a.data_type)); - - for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, - c, alloc_key_io_time_bad, - "invalid io_time[%s]: %llu, max %llu", - i == READ ? "read" : "write", - a.io_time[i], LRU_TIME_MAX); - - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > - offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.stripe_sectors - : 0; - - switch (a.data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - bkey_fsck_err_on(stripe_sectors || - a.dirty_sectors || - a.cached_sectors || - a.stripe, - c, alloc_key_empty_but_have_data, - "empty data type free but have data %u.%u.%u %u", - stripe_sectors, - a.dirty_sectors, - a.cached_sectors, - a.stripe); - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - bkey_fsck_err_on(!a.dirty_sectors && - !stripe_sectors, - c, alloc_key_dirty_sectors_0, - "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.data_type)); - break; - case BCH_DATA_cached: - bkey_fsck_err_on(!a.cached_sectors || - a.dirty_sectors || - stripe_sectors || - a.stripe, - c, alloc_key_cached_inconsistency, - "data type inconsistency"); - - bkey_fsck_err_on(!a.io_time[READ] && - !(c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), - c, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time == 0"); - break; - case BCH_DATA_stripe: - break; - } -fsck_err: - return ret; -} - -void bch2_alloc_v4_swab(struct bkey_s k) -{ - struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - - a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); - a->journal_seq_empty = swab64(a->journal_seq_empty); - a->flags = swab32(a->flags); - a->dirty_sectors = swab32(a->dirty_sectors); - a->cached_sectors = swab32(a->cached_sectors); - a->io_time[0] = swab64(a->io_time[0]); - a->io_time[1] = swab64(a->io_time[1]); - a->stripe = swab32(a->stripe); - a->nr_external_backpointers = swab32(a->nr_external_backpointers); - a->stripe_sectors = swab32(a->stripe_sectors); -} - -static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, - unsigned dev, const struct bch_alloc_v4 *a) -{ - struct bch_dev *ca = c ? 
bch2_dev_tryget_noerror(c, dev) : NULL; - - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); - bch2_prt_data_type(out, a->data_type); - prt_newline(out); - prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); - prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); - - if (ca) - prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); - prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - printbuf_indent_sub(out, 2); - - bch2_dev_put(ca); -} - -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); -} - -void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); -} - -void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -{ - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - *out = *bkey_s_c_to_alloc_v4(k).v; - - src = alloc_v4_backpointers(out); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(out); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); - } else { - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - *out = (struct bch_alloc_v4) { - .journal_seq_nonempty = u.journal_seq, - .flags = u.need_discard, - .gen = u.gen, - .oldest_gen = u.oldest_gen, - .data_type = u.data_type, - .stripe_redundancy = u.stripe_redundancy, - .dirty_sectors = u.dirty_sectors, - .cached_sectors = u.cached_sectors, - .io_time[READ] = u.read_time, - .io_time[WRITE] = u.write_time, - .stripe = u.stripe, - }; - - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - } -} - -static noinline struct bkey_i_alloc_v4 * -__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_i_alloc_v4 *ret; - - ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); - if (IS_ERR(ret)) - return ret; - - if (k.k->type == KEY_TYPE_alloc_v4) { - void *src, *dst; - - bkey_reassemble(&ret->k_i, k); - - src = alloc_v4_backpointers(&ret->v); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); - dst = alloc_v4_backpointers(&ret->v); - - if (src < dst) - memset(src, 0, dst - src); - - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); - set_alloc_v4_u64s(ret); - } else { - bkey_alloc_v4_init(&ret->k_i); - ret->k.p = k.k->p; - bch2_alloc_to_v4(k, &ret->v); - } - return ret; -} - -static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_s_c_alloc_v4 a; - - if (likely(k.k->type == KEY_TYPE_alloc_v4) && - ((a = bkey_s_c_to_alloc_v4(k), true) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) - return 
bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); - - return __bch2_alloc_to_v4_mut(trans, k); -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -{ - return bch2_alloc_to_v4_mut_inlined(trans, k); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (unlikely(ret)) - goto err; - return a; -err: - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); -} - -__flatten -struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); - int ret = bkey_err(k); - if (unlikely(ret)) - return ERR_PTR(ret); - - if ((void *) k.v >= trans->mem && - (void *) k.v < trans->mem + trans->mem_top) { - bch2_trans_iter_exit(trans, &iter); - return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); - } - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); - if (IS_ERR(a)) { - bch2_trans_iter_exit(trans, &iter); - return a; - } - - ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); - bch2_trans_iter_exit(trans, &iter); - return unlikely(ret) ? ERR_PTR(ret) : a; -} - -static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) -{ - *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; - - pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; - return pos; -} - -static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) -{ - pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; - pos.offset += offset; - return pos; -} - -static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) -{ - return k.k->type == KEY_TYPE_bucket_gens - ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] - : 0; -} - -int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), - c, bucket_gens_val_size_bad, - "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); -fsck_err: - return ret; -} - -void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { - if (i) - prt_char(out, ' '); - prt_printf(out, "%u", g.v->gens[i]); - } -} - -int bch2_bucket_gens_init(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_bucket_gens g; - bool have_bucket_gens_key = false; - int ret; - - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_bucket_exists(c, k.k->p)) - continue; - - struct bch_alloc_v4 a; - u8 gen = bch2_alloc_to_v4(k, &a)->gen; - unsigned offset; - struct bpos pos = alloc_gens_pos(iter.pos, &offset); - int ret2 = 0; - - if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { - ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret2) - goto iter_err; - have_bucket_gens_key = false; - } - - if (!have_bucket_gens_key) { - bkey_bucket_gens_init(&g.k_i); - g.k.p = pos; - have_bucket_gens_key = true; - } - - g.v.gens[offset] = gen; -iter_err: - ret2; - })); - - if (have_bucket_gens_key && !ret) - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - - bch2_trans_put(trans); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_alloc_read(struct bch_fs *c) -{ - down_read(&c->state_lock); - - struct btree_trans *trans = bch2_trans_get(c); - struct bch_dev *ca = NULL; - int ret; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, ({ - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - - if (k.k->type != KEY_TYPE_bucket_gens) - continue; - - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - - for (u64 b = max_t(u64, ca->mi.first_bucket, start); - b < min_t(u64, ca->mi.nbuckets, end); - b++) - *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - 0; - })); - } else { - ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch, k, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); - continue; - } - - if (k.k->p.offset >= 
ca->mi.nbuckets) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - - struct bch_alloc_v4 a; - *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - 0; - })); - } - - bch2_dev_put(ca); - bch2_trans_put(trans); - - up_read(&c->state_lock); - bch_err_fn(c, ret); - return ret; -} - -/* Free space/discard btree: */ - -static int __need_discard_or_freespace_err(struct btree_trans *trans, - struct bkey_s_c alloc_k, - bool set, bool discard, bool repair) -{ - struct bch_fs *c = trans->c; - enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); - enum bch_sb_error_id err_id = discard - ? BCH_FSCK_ERR_need_discard_key_wrong - : BCH_FSCK_ERR_freespace_key_wrong; - enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, alloc_k); - - int ret = __bch2_fsck_err(NULL, trans, flags, err_id, - "bucket incorrectly %sset in %s btree\n%s", - set ? "" : "un", - bch2_btree_id_str(btree), - buf.buf); - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || - bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) - ret = 0; - - printbuf_exit(&buf); - return ret; -} - -#define need_discard_or_freespace_err(...) \ - fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) - -#define need_discard_or_freespace_err_on(cond, ...) \ - (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) - -static int bch2_bucket_do_index(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c alloc_k, - const struct bch_alloc_v4 *a, - bool set) -{ - enum btree_id btree; - struct bpos pos; - - if (a->data_type != BCH_DATA_free && - a->data_type != BCH_DATA_need_discard) - return 0; - - switch (a->data_type) { - case BCH_DATA_free: - btree = BTREE_ID_freespace; - pos = alloc_freespace_pos(alloc_k.k->p, *a); - break; - case BCH_DATA_need_discard: - btree = BTREE_ID_need_discard; - pos = alloc_k.k->p; - break; - default: - return 0; - } - - struct btree_iter iter; - struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bkey_err(old); - if (ret) - return ret; - - need_discard_or_freespace_err_on(ca->mi.freespace_initialized && - !old.k->type != set, - trans, alloc_k, set, - btree == BTREE_ID_need_discard, false); - - ret = bch2_btree_bit_mod_iter(trans, &iter, set); -fsck_err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_bucket_gen_update(struct btree_trans *trans, - struct bpos bucket, u8 gen) -{ - struct btree_iter iter; - unsigned offset; - struct bpos pos = alloc_gens_pos(bucket, &offset); - struct bkey_i_bucket_gens *g; - struct bkey_s_c k; - int ret; - - g = bch2_trans_kmalloc(trans, sizeof(*g)); - ret = PTR_ERR_OR_ZERO(g); - if (ret) - return ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_bucket_gens) { - bkey_bucket_gens_init(&g->k_i); - g->k.p = iter.pos; - } else { - bkey_reassemble(&g->k_i, k); - } - - g->v.gens[offset] = gen; - - ret = bch2_trans_update(trans, &iter, &g->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, - enum bch_data_type data_type, - s64 delta_buckets, - s64 delta_sectors, - s64 delta_fragmented, unsigned flags) -{ - s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - - return 
bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - d, dev_data_type, - .dev = ca->dev_idx, - .data_type = data_type); -} - -int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, - const struct bch_alloc_v4 *old, - const struct bch_alloc_v4 *new, - unsigned flags) -{ - s64 old_sectors = bch2_bucket_sectors(*old); - s64 new_sectors = bch2_bucket_sectors(*new); - if (old->data_type != new->data_type) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, - 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: - bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, - -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); - if (ret) - return ret; - } else if (old_sectors != new_sectors) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, - 0, - new_sectors - old_sectors, - bch2_bucket_sectors_fragmented(ca, *new) - - bch2_bucket_sectors_fragmented(ca, *old), flags); - if (ret) - return ret; - } - - s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); - s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); - if (old_unstriped != new_unstriped) { - int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, - !!new_unstriped - !!old_unstriped, - new_unstriped - old_unstriped, - 0, - flags); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trigger_alloc(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); - if (!ca) - return bch_err_throw(c, trigger_alloc); - - struct bch_alloc_v4 old_a_convert; - const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - - struct bch_alloc_v4 *new_a; - if (likely(new.k->type == KEY_TYPE_alloc_v4)) { - new_a = bkey_s_to_alloc_v4(new).v; - } else { - BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); - - struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); - ret = PTR_ERR_OR_ZERO(new_ka); - if (unlikely(ret)) - goto err; - new_a = &new_ka->v; - } - - if (flags & BTREE_TRIGGER_transactional) { - alloc_data_type_set(new_a, new_a->data_type); - - int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - - (int) data_type_is_empty(old_a->data_type); - - if (is_empty_delta < 0) { - new_a->io_time[READ] = bch2_current_io_time(c, READ); - new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); - } - - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { - if (new_a->oldest_gen == new_a->gen && - !bch2_bucket_sectors_total(*new_a)) - new_a->oldest_gen++; - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - alloc_data_type_set(new_a, new_a->data_type); - } - - if (old_a->data_type != new_a->data_type || - (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: - bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); - if (ret) - goto err; - } - - if (new_a->data_type == BCH_DATA_cached && - !new_a->io_time[READ]) - new_a->io_time[READ] = bch2_current_io_time(c, READ); - - ret = bch2_lru_change(trans, 
new.k->p.inode, - bucket_to_u64(new.k->p), - alloc_lru_idx_read(*old_a), - alloc_lru_idx_read(*new_a)); - if (ret) - goto err; - - ret = bch2_lru_change(trans, - BCH_LRU_BUCKET_FRAGMENTATION, - bucket_to_u64(new.k->p), - alloc_lru_idx_fragmentation(*old_a, ca), - alloc_lru_idx_fragmentation(*new_a, ca)); - if (ret) - goto err; - - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); - if (ret) - goto err; - } - - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); - if (ret) - goto err; - } - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - u64 transaction_seq = trans->journal_res.seq; - BUG_ON(!transaction_seq); - - if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, - trans, alloc_key_journal_seq_in_future, - "bucket journal seq in future (currently at %llu)\n%s", - journal_cur_seq(&c->journal), - (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) - new_a->journal_seq_nonempty = transaction_seq; - - int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - - (int) data_type_is_empty(old_a->data_type); - - /* - * Record journal sequence number of empty -> nonempty transition: - * Note that there may be multiple empty -> nonempty - * transitions, data in a bucket may be overwritten while we're - * still writing to it - so be careful to only record the first: - */ - if (is_empty_delta < 0 && - new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { - new_a->journal_seq_nonempty = transaction_seq; - new_a->journal_seq_empty = 0; - } - - /* - * Bucket becomes empty: mark it as waiting for a journal flush, - * unless updates since empty -> nonempty transition were never - * flushed - we may need to ask the journal not to flush - * intermediate sequence numbers: - */ - if (is_empty_delta > 0) { - if (new_a->journal_seq_nonempty == transaction_seq || - bch2_journal_noflush_seq(&c->journal, - new_a->journal_seq_nonempty, - transaction_seq)) { - new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; - } else { - new_a->journal_seq_empty = transaction_seq; - - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - transaction_seq); - if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", - bch2_err_str(ret))) - goto err; - } - } - - if (new_a->gen != old_a->gen) { - guard(rcu)(); - u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) - goto invalid_bucket; - *gen = new_a->gen; - } - -#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) -#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) - - if (statechange(a->data_type == BCH_DATA_free) && - bucket_flushed(new_a)) - closure_wake_up(&c->freelist_wait); - - if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && - bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(ca, new.k->p.offset); - - if (statechange(a->data_type == BCH_DATA_cached) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_dev_do_invalidates(ca); - - if (statechange(a->data_type == BCH_DATA_need_gc_gens)) - bch2_gc_gens_async(c); - } - - if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - guard(rcu)(); - struct bucket *g =
gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) - goto invalid_bucket; - g->gen_valid = 1; - g->gen = new_a->gen; - } -err: -fsck_err: - printbuf_exit(&buf); - bch2_dev_put(ca); - return ret; -invalid_bucket: - bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", - (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = bch_err_throw(c, trigger_alloc); - goto err; -} - -/* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for - * extents style btrees, but works on non-extents btrees: - */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bkey *hole) -{ - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - - if (bkey_err(k)) - return k; - - if (k.k->type) { - return k; - } else { - struct btree_iter iter2; - struct bpos next; - - bch2_trans_copy_iter(trans, &iter2, iter); - - struct btree_path *path = btree_iter_path(trans, iter); - if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); - - end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); - - /* - * btree node min/max is a closed interval, upto takes a half - * open interval: - */ - k = bch2_btree_iter_peek_max(trans, &iter2, end); - next = iter2.pos; - bch2_trans_iter_exit(trans, &iter2); - - BUG_ON(next.offset >= iter->pos.offset + U32_MAX); - - if (bkey_err(k)) - return k; - - bkey_init(hole); - hole->p = iter->pos; - - bch2_key_resize(hole, next.offset - iter->pos.offset); - return (struct bkey_s_c) { hole, NULL }; - } -} - -static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) -{ - if (*ca) { - if (bucket->offset < (*ca)->mi.first_bucket) - bucket->offset = (*ca)->mi.first_bucket; - - if (bucket->offset < (*ca)->mi.nbuckets) - return true; - - bch2_dev_put(*ca); - *ca = NULL; - bucket->inode++; - bucket->offset = 0; - } - - guard(rcu)(); - *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (*ca) { - *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); - bch2_dev_get(*ca); - } - - return *ca != NULL; -} - -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; -again: - k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); - if (bkey_err(k)) - return k; - - *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); - - if (!k.k->type) { - struct bpos hole_start = bkey_start_pos(k.k); - - if (!*ca || !bucket_valid(*ca, hole_start.offset)) { - if (!next_bucket(c, ca, &hole_start)) - return bkey_s_c_null; - - bch2_btree_iter_set_pos(trans, iter, hole_start); - goto again; - } - - if (k.k->p.offset > (*ca)->mi.nbuckets) - bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); - } - - return k; -} - -static noinline_for_stack -int bch2_check_alloc_key(struct btree_trans *trans, - struct bkey_s_c alloc_k, - struct btree_iter *alloc_iter, - struct btree_iter *discard_iter, - struct btree_iter *freespace_iter, - struct btree_iter *bucket_gens_iter) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - unsigned gens_offset; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); - if (fsck_err_on(!ca, - trans, alloc_key_to_missing_dev_bucket, - "alloc key for invalid device:bucket %llu:%llu", - alloc_k.k->p.inode, 
alloc_k.k->p.offset)) - ret = bch2_btree_delete_at(trans, alloc_iter, 0); - if (!ca) - return ret; - - if (!ca->mi.freespace_initialized) - goto out; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(trans, discard_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_discarded = a->data_type == BCH_DATA_need_discard; - if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, - trans, alloc_k, !is_discarded, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - bool is_free = a->data_type == BCH_DATA_free; - if (need_discard_or_freespace_err_on(!!k.k->type != is_free, - trans, alloc_k, !is_free, false, true)) { - ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); - if (ret) - goto err; - } - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), - trans, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n%s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_bucket_gens *g = - bch2_trans_kmalloc(trans, sizeof(*g)); - - ret = PTR_ERR_OR_ZERO(g); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_bucket_gens) { - bkey_reassemble(&g->k_i, k); - } else { - bkey_bucket_gens_init(&g->k_i); - g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); - } - - g->v.gens[gens_offset] = a->gen; - - ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos start, - struct bpos *end, - struct btree_iter *freespace_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; - - if (!ca->mi.freespace_initialized) - return 0; - - bch2_btree_iter_set_pos(trans, freespace_iter, start); - - k = bch2_btree_iter_peek_slot(trans, freespace_iter); - ret = bkey_err(k); - if (ret) - goto err; - - *end = bkey_min(k.k->p, *end); - - if (fsck_err_on(k.k->type != KEY_TYPE_set, - trans, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - "device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset)) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = KEY_TYPE_set; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, - min_t(u64, U32_MAX, end->offset - - freespace_iter->pos.offset)); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static noinline_for_stack -int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *bucket_gens_iter) -{ - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - unsigned i, gens_offset, 
gens_end_offset; - int ret; - - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (bkey_cmp(alloc_gens_pos(start, &gens_offset), - alloc_gens_pos(*end, &gens_end_offset))) - gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; - - if (k.k->type == KEY_TYPE_bucket_gens) { - struct bkey_i_bucket_gens g; - bool need_update = false; - - bkey_reassemble(&g.k_i, k); - - for (i = gens_offset; i < gens_end_offset; i++) { - if (fsck_err_on(g.v.gens[i], trans, - bucket_gens_hole_wrong, - "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", - bucket_gens_pos_to_alloc(k.k->p, i).inode, - bucket_gens_pos_to_alloc(k.k->p, i).offset, - g.v.gens[i])) { - g.v.gens[i] = 0; - need_update = true; - } - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - memcpy(u, &g, sizeof(g)); - - ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); - if (ret) - goto err; - } - } - - *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -struct check_discard_freespace_key_async { - struct work_struct work; - struct bch_fs *c; - struct bbpos pos; -}; - -static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - u8 gen; - ret = k.k->type != KEY_TYPE_set - ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) - : 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void check_discard_freespace_key_work(struct work_struct *work) -{ - struct check_discard_freespace_key_async *w = - container_of(work, struct check_discard_freespace_key_async, work); - - bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); - kfree(w); -} - -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, - bool async_repair) -{ - struct bch_fs *c = trans->c; - enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard - ? BCH_DATA_need_discard - : BCH_DATA_free; - struct printbuf buf = PRINTBUF; - - unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| - FSCK_CAN_FIX|FSCK_CAN_IGNORE; - - struct bpos bucket = iter->pos; - bucket.offset &= ~(~0ULL << 56); - u64 genbits = iter->pos.offset & (~0ULL << 56); - - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - async_repair ? 
BTREE_ITER_cached : 0); - int ret = bkey_err(alloc_k); - if (ret) - return ret; - - if (!bch2_dev_bucket_exists(c, bucket)) { - if (__fsck_err(trans, fsck_flags, - need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistent dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) - goto delete; - ret = 1; - goto out; - } - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - if (a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a))) { - if (__fsck_err(trans, fsck_flags, - need_discard_freespace_key_bad, - "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; - ret = 1; - goto out; - } - - *gen = a->gen; -out: -fsck_err: - bch2_set_btree_iter_dontneed(trans, &alloc_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -delete: - if (!async_repair) { - ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, transaction_restart_commit); - goto out; - } else { - /* - * We can't repair here when called from the allocator path: the - * commit will recurse back into the allocator - */ - struct check_discard_freespace_key_async *w = - kzalloc(sizeof(*w), GFP_KERNEL); - if (!w) - goto out; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { - kfree(w); - goto out; - } - - INIT_WORK(&w->work, check_discard_freespace_key_work); - w->c = c; - w->pos = BBPOS(iter->btree_id, iter->pos); - queue_work(c->write_ref_wq, &w->work); - - ret = 1; /* don't allocate from this bucket */ - goto out; - } -} - -static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) -{ - u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); - return ret < 0 ? ret : 0; -} - -/* - * We've already checked that generation numbers in the bucket_gens btree are - * valid for buckets that exist; this just checks for keys for nonexistent - * buckets.
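
The helper above depends on the freespace btree's key encoding: the low 56 bits of a key's offset carry the bucket number and the top 8 bits carry generation bits, per alloc_freespace_pos() and alloc_freespace_genbits() in the header further down. The following is a freestanding sketch of that pack/unpack round trip, not part of the patch; the helper names and example values are invented:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define GENBITS_SHIFT	56

    /* Mirrors alloc_freespace_genbits(): (gen - oldest_gen) >> 4 in the top byte: */
    static uint64_t freespace_genbits(uint8_t gen, uint8_t oldest_gen)
    {
    	return (uint64_t) ((uint8_t) (gen - oldest_gen) >> 4) << GENBITS_SHIFT;
    }

    static uint64_t freespace_key_offset(uint64_t bucket, uint8_t gen, uint8_t oldest_gen)
    {
    	return bucket | freespace_genbits(gen, oldest_gen);
    }

    int main(void)
    {
    	uint64_t offset = freespace_key_offset(12345, 100, 37);

    	/* The unpacking step, as done at the top of the fsck helper: */
    	uint64_t bucket  = offset & ~(~0ULL << GENBITS_SHIFT);
    	uint64_t genbits = offset & (~0ULL << GENBITS_SHIFT);

    	assert(bucket == 12345);
    	printf("bucket %llu genbits %u\n",
    	       (unsigned long long) bucket,
    	       (unsigned) (genbits >> GENBITS_SHIFT));
    	return 0;
    }

Embedding generation bits in the key is what lets the check above treat a freespace entry whose genbits no longer match the alloc key as stale rather than trusting it.
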
- */ -static noinline_for_stack -int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_bucket_gens g; - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - u64 b; - bool need_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(k.k->type != KEY_TYPE_bucket_gens); - bkey_reassemble(&g.k_i, k); - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); - if (!ca) { - if (fsck_err(trans, bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - if (fsck_err_on(end <= ca->mi.first_bucket || - start >= ca->mi.nbuckets, - trans, bucket_gens_to_invalid_buckets, - "bucket_gens key for invalid buckets:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; - } - - for (b = start; b < ca->mi.first_bucket; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - for (b = ca->mi.nbuckets; b < end; b++) - if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], - trans, bucket_gens_nonzero_for_invalid_buckets, - "bucket_gens key has nonzero gen for invalid bucket")) { - g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; - need_update = true; - } - - if (need_update) { - struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto out; - - memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, u, 0); - } -out: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_info(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; - struct bch_dev *ca = NULL; - struct bkey hole; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch); - - while (1) { - struct bpos next; - - bch2_trans_begin(trans); - - k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (!k.k) - break; - - if (k.k->type) { - next = bpos_nosnap_successor(k.k->p); - - ret = bch2_check_alloc_key(trans, - k, &iter, - &discard_iter, - &freespace_iter, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } else { - next = k.k->p; - - ret = bch2_check_alloc_hole_freespace(trans, ca, - bkey_start_pos(k.k), - &next, - &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(trans, - bkey_start_pos(k.k), - &next, - &bucket_gens_iter); - if (ret) - goto bkey_err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, next); -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - bch2_trans_iter_exit(trans, 
&bucket_gens_iter); - bch2_trans_iter_exit(trans, &freespace_iter); - bch2_trans_iter_exit(trans, &discard_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); - ca = NULL; - - if (ret < 0) - goto err; - - ret = for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key_fsck(trans, &iter)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_prefetch); - while (1) { - bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k) ?: - bch2_check_discard_freespace_key_fsck(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - - bch_err(c, "while checking %s", buf.buf); - printbuf_exit(&buf); - break; - } - - bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_bucket_gens_key(trans, &iter, k)); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret; - - alloc_k = bch2_btree_iter_peek(trans, alloc_iter); - if (!alloc_k.k) - return 0; - - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); - if (!ca) - return 0; - - a = bch2_alloc_to_v4(alloc_k, &a_convert); - - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, - bucket_to_u64(alloc_k.k->p), - lru_idx, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (a->data_type != BCH_DATA_cached) - goto err; - - if (fsck_err_on(!a->io_time[READ], - trans, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - a = &a_mut->v; - } - - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ], - alloc_k, last_flushed); - if (ret) - goto err; -err: -fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_alloc_to_lru_refs(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: - bch2_check_stripe_to_lru_refs(c); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) 
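
discard_in_flight_add(), whose body opens just below, deduplicates discards by keeping a per-device, mutex-protected list of bucket numbers and returning an EEXIST-style error on duplicates. Here is a freestanding sketch of the same pattern, not part of the patch; pthreads and a fixed-size array stand in for the kernel mutex and darray, and all names and the size limit are invented:

    #include <assert.h>
    #include <errno.h>
    #include <pthread.h>
    #include <stdint.h>

    #define MAX_IN_FLIGHT	64

    struct in_flight_set {
    	pthread_mutex_t	lock;
    	uint64_t	buckets[MAX_IN_FLIGHT];
    	unsigned	nr;
    };

    /* Returns -EEXIST if @bucket is already being discarded: */
    static int in_flight_add(struct in_flight_set *s, uint64_t bucket)
    {
    	int ret = 0;

    	pthread_mutex_lock(&s->lock);

    	for (unsigned i = 0; i < s->nr; i++)
    		if (s->buckets[i] == bucket) {
    			ret = -EEXIST;
    			goto out;
    		}

    	if (s->nr == MAX_IN_FLIGHT) {
    		ret = -ENOSPC;
    		goto out;
    	}

    	s->buckets[s->nr++] = bucket;
    out:
    	pthread_mutex_unlock(&s->lock);
    	return ret;
    }

    static void in_flight_remove(struct in_flight_set *s, uint64_t bucket)
    {
    	pthread_mutex_lock(&s->lock);

    	/* Swap with the last element; ordering doesn't matter for a set: */
    	for (unsigned i = 0; i < s->nr; i++)
    		if (s->buckets[i] == bucket) {
    			s->buckets[i] = s->buckets[--s->nr];
    			break;
    		}

    	pthread_mutex_unlock(&s->lock);
    }

    int main(void)
    {
    	struct in_flight_set s = { .lock = PTHREAD_MUTEX_INITIALIZER };

    	assert(!in_flight_add(&s, 42));
    	assert(in_flight_add(&s, 42) == -EEXIST);
    	in_flight_remove(&s, 42);
    	assert(!in_flight_add(&s, 42));
    	return 0;
    }

The swap-with-last removal is a simplification; the point is only that add is the dedup gate and remove must run under the same lock.
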
-{ - struct bch_fs *c = ca->fs; - int ret; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - if (i) { - ret = bch_err_throw(c, EEXIST_discard_in_flight_add); - goto out; - } - - ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { - .in_progress = in_progress, - .bucket = bucket, - })); -out: - mutex_unlock(&ca->discard_buckets_in_flight_lock); - return ret; -} - -static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) -{ - mutex_lock(&ca->discard_buckets_in_flight_lock); - struct discard_in_flight *i = - darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); - BUG_ON(!i || !i->in_progress); - - darray_remove_item(&ca->discard_buckets_in_flight, i); - mutex_unlock(&ca->discard_buckets_in_flight_lock); -} - -struct discard_buckets_state { - u64 seen; - u64 open; - u64 need_journal_commit; - u64 discarded; -}; - -static int bch2_discard_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *need_discard_iter, - struct bpos *discard_pos_done, - struct discard_buckets_state *s, - bool fastpath) -{ - struct bch_fs *c = trans->c; - struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = {}; - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - struct printbuf buf = PRINTBUF; - bool discard_locked = false; - int ret = 0; - - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - s->open++; - goto out; - } - - u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - pos.inode, pos.offset); - if (seq_ready > c->journal.flushed_seq_ondisk) { - if (seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - goto out; - } - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto out; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - if (a->v.data_type != BCH_DATA_need_discard) { - if (need_discard_or_freespace_err(trans, k, true, true, true)) { - ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); - if (ret) - goto out; - goto commit; - } - - goto out; - } - - if (!fastpath) { - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; - - discard_locked = true; - } - - if (!bkey_eq(*discard_pos_done, iter.pos)) { - s->discarded++; - *discard_pos_done = iter.pos; - - if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; - } - } - - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -commit: - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto out; - - if (!fastpath) - count_event(c, bucket_discard); - else - count_event(c, bucket_discard_fast); -out: -fsck_err: - if (discard_locked) - discard_in_flight_remove(ca, iter.pos.offset); - if (!ret) - s->seen++; - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static void bch2_do_discards_work(struct work_struct *work) -{ 
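
This worker and the bch2_dev_do_discards() helper that follows it show a recurring shape: take every reference the worker needs (the filesystem write ref, then the device io ref) before queueing the work, let the worker inherit them on success, and unwind in reverse order on any failure. A freestanding sketch of that shape, not part of the patch; the toy refcounts and the queue stub are invented stand-ins for enumerated_ref and queue_work():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int writes_ref = 1;	/* > 0 while writes are still allowed */
    static atomic_int io_ref     = 1;	/* > 0 while the device is writable */

    /* Take a reference only if the count hasn't already dropped to zero: */
    static bool ref_tryget(atomic_int *ref)
    {
    	int v = atomic_load(ref);

    	while (v > 0)
    		if (atomic_compare_exchange_weak(ref, &v, v + 1))
    			return true;
    	return false;
    }

    static void ref_put(atomic_int *ref)
    {
    	atomic_fetch_sub(ref, 1);
    }

    /* Stub for queue_work(); returns true when the work was newly queued: */
    static bool try_queue_work(void)
    {
    	return true;
    }

    static void do_discards(void)
    {
    	if (!ref_tryget(&writes_ref))
    		return;

    	if (!ref_tryget(&io_ref))
    		goto put_writes;

    	if (try_queue_work())
    		return;		/* the worker now owns both references */

    	ref_put(&io_ref);
    put_writes:
    	ref_put(&writes_ref);
    }

    int main(void)
    {
    	do_discards();
    	printf("writes_ref %d io_ref %d\n",
    	       atomic_load(&writes_ref), atomic_load(&io_ref));
    	return 0;
    }

On the success path the references are released at the tail of the work function itself, matching the enumerated_ref_put() pair at the end of bch2_do_discards_work().
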
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - int ret; - - /* - * We're doing the commit in bch2_discard_one_bucket instead of using - * for_each_btree_key_commit() so that we can increment counters after - * successful commit: - */ - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_need_discard, - POS(ca->dev_idx, 0), - POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); - - if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) - bch2_journal_flush_async(&c->journal, NULL); - - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_dev_do_discards(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) - goto put_write_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); -put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); -} - -void bch2_do_discards(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_discards(ca); -} - -static int bch2_do_discards_fast_one(struct btree_trans *trans, - struct bch_dev *ca, - u64 bucket, - struct bpos *discard_pos_done, - struct discard_buckets_state *s) -{ - struct btree_iter need_discard_iter; - struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, - BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); - int ret = bkey_err(discard_k); - if (ret) - return ret; - - if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, - trans, discarding_bucket_not_in_need_discard_btree, - "attempting to discard bucket %u:%llu not in need_discard btree", - ca->dev_idx, bucket)) - goto out; - - ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); -out: -fsck_err: - bch2_trans_iter_exit(trans, &need_discard_iter); - return ret; -} - -static void bch2_do_discards_fast_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); - struct bch_fs *c = ca->fs; - struct discard_buckets_state s = {}; - struct bpos discard_pos_done = POS_MAX; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - while (1) { - bool got_bucket = false; - u64 bucket; - - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) { - if (i->in_progress) - continue; - - got_bucket = true; - bucket = i->bucket; - i->in_progress = true; - break; - } - mutex_unlock(&ca->discard_buckets_in_flight_lock); - - if (!got_bucket) - break; - - ret = lockrestart_do(trans, - bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); - bch_err_fn(c, ret); - - discard_in_flight_remove(ca, bucket); - - if (ret) - break; - } - - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - - bch2_trans_put(trans); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static void 
bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) -{ - struct bch_fs *c = ca->fs; - - if (discard_in_flight_add(ca, bucket, false)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); -} - -static int invalidate_one_bp(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) -{ - struct btree_iter extent_iter; - struct bkey_s_c extent_k = - bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); - int ret = bkey_err(extent_k); - if (ret) - return ret; - - if (!extent_k.k) - return 0; - - struct bkey_i *n = - bch2_bkey_make_mut(trans, &extent_iter, &extent_k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); -err: - bch2_trans_iter_exit(trans, &extent_iter); - return ret; -} - -static int invalidate_one_bucket_by_bps(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - u8 gen, - struct bkey_buf *last_flushed) -{ - struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); - struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); - - return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, - bp_start, bp_end, 0, k, - NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, ({ - if (k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (bp.v->bucket_gen != gen) - continue; - - /* filter out bps with gens that don't match */ - - invalidate_one_bp(trans, ca, bp, last_flushed); - })); -} - -noinline_for_stack -static int invalidate_one_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed, - s64 *nr_to_invalidate) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - struct btree_iter alloc_iter = {}; - int ret = 0; - - if (*nr_to_invalidate <= 0) - return 1; - - if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - bucket.inode, bucket.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - goto out; - } - - if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) - return 0; - - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - ret = bkey_err(alloc_k); - if (ret) - return ret; - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - - /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) - goto out; - - /* - * Impossible since alloc_lru_idx_read() only returns nonzero if the - * bucket is supposed to be on the cached bucket LRU (i.e. 
- * BCH_DATA_cached) - * - * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 - */ - BUG_ON(a->data_type != BCH_DATA_cached); - BUG_ON(a->dirty_sectors); - - if (!a->cached_sectors) { - bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, - true, last_flushed); - goto out; - } - - unsigned cached_sectors = a->cached_sectors; - u8 gen = a->gen; - - ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); - if (ret) - goto out; - - trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); - --*nr_to_invalidate; -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, - struct bch_dev *ca, bool *wrapped) -{ - struct bkey_s_c k; -again: - k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); - if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); - *wrapped = true; - goto again; - } - - return k; -} - -static void bch2_do_invalidates_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); - struct bch_fs *c = ca->fs; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (ret) - goto err; - - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - struct btree_iter iter; - bool wrapped = false; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, - ((bch2_current_io_time(c, READ) + U32_MAX) & - LRU_TIME_MAX)), 0); - - while (true) { - bch2_trans_begin(trans); - - struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); - ret = bkey_err(k); - if (ret) - goto restart_err; - if (!k.k) - break; - - ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); -restart_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_btree_iter_advance(trans, &iter); - } - bch2_trans_iter_exit(trans, &iter); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&last_flushed, c); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_dev_do_invalidates(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) - return; - - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) - goto put_ref; - - if (queue_work(c->write_ref_wq, &ca->invalidate_work)) - return; - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); -} - -void bch2_do_invalidates(struct bch_fs *c) -{ - for_each_member_device(c, ca) - bch2_dev_do_invalidates(ca); -} - -int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, - u64 bucket_start, u64 bucket_end) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bkey hole; - struct bpos end = POS(ca->dev_idx, bucket_end); - struct bch_member *m; - unsigned long last_updated = jiffies; - int ret; - - BUG_ON(bucket_start > bucket_end); - BUG_ON(bucket_end > ca->mi.nbuckets); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - 
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_prefetch); - /* - * Scan the alloc btree for every bucket on @ca, and add buckets to the - * freespace/need_discard/need_gc_gens btrees as needed: - */ - while (1) { - if (time_after(jiffies, last_updated + HZ * 10)) { - bch_info(ca, "%s: currently at %llu/%llu", - __func__, iter.pos.offset, ca->mi.nbuckets); - last_updated = jiffies; - } - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end)) { - ret = 0; - break; - } - - k = bch2_get_key_or_hole(trans, &iter, end, &hole); - ret = bkey_err(k); - if (ret) - goto bkey_err; - - if (k.k->type) { - /* - * We process live keys in the alloc btree one at a - * time: - */ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - - ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_advance(trans, &iter); - } else { - struct bkey_i *freespace; - - freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); - ret = PTR_ERR_OR_ZERO(freespace); - if (ret) - goto bkey_err; - - bkey_init(&freespace->k); - freespace->k.type = KEY_TYPE_set; - freespace->k.p = k.k->p; - freespace->k.size = k.k->size; - - ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_pos(trans, &iter, k.k->p); - } -bkey_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (ret < 0) { - bch_err_msg(ca, ret, "initializing free space"); - return ret; - } - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_fs_freespace_init(struct bch_fs *c) -{ - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) - return 0; - - - /* - * We can crash during the device add path, so we need to check this on - * every mount: - */ - - bool doing_init = false; - for_each_member_device(c, ca) { - if (ca->mi.freespace_initialized) - continue; - - if (!doing_init) { - bch_info(c, "initializing freespace"); - doing_init = true; - } - - int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - if (ret) { - bch2_dev_put(ca); - bch_err_fn(c, ret); - return ret; - } - } - - if (doing_init) { - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); - } - - return 0; -} - -/* device removal */ - -int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, 
BTREE_ID_alloc, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); - bch_err_msg(ca, ret, "removing dev alloc info"); - return ret; -} - -/* Bucket IO clocks: */ - -static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) -{ - struct bch_fs *c = trans->c; - - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - u64 now = bch2_current_io_time(c, rw); - if (a->v.io_time[rw] == now) - goto out; - - a->v.io_time[rw] = now; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) -{ - if (bch2_trans_relock(trans)) - bch2_trans_begin(trans); - - return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); -} - -/* Startup/shutdown (ro/rw): */ - -void bch2_recalc_capacity(struct bch_fs *c) -{ - u64 capacity = 0, reserved_sectors = 0, gc_reserve; - unsigned bucket_size_max = 0; - unsigned long ra_pages = 0; - - lockdep_assert_held(&c->state_lock); - - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) { - struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); - if (bdev) - ra_pages += bdev->bd_disk->bdi->ra_pages; - - if (ca->mi.state != BCH_MEMBER_STATE_rw) - continue; - - u64 dev_reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run again when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. - */ - - dev_reserve += ca->nr_btree_reserve * 2; - dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ - - dev_reserve += 1; /* btree write point */ - dev_reserve += 1; /* copygc write point */ - dev_reserve += 1; /* rebalance write point */ - - dev_reserve *= ca->mi.bucket_size; - - capacity += bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - - reserved_sectors += dev_reserve * 2; - - bucket_size_max = max_t(unsigned, bucket_size_max, - ca->mi.bucket_size); - } - - bch2_set_ra_pages(c, ra_pages); - - gc_reserve = c->opts.gc_reserve_bytes - ?
c->opts.gc_reserve_bytes >> 9 - : div64_u64(capacity * c->opts.gc_reserve_percent, 100); - - reserved_sectors = max(gc_reserve, reserved_sectors); - - reserved_sectors = min(reserved_sectors, capacity); - - c->reserved = reserved_sectors; - c->capacity = capacity - reserved_sectors; - - c->bucket_size_max = bucket_size_max; - - /* Wake up in case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -u64 bch2_min_rw_member_capacity(struct bch_fs *c) -{ - u64 ret = U64_MAX; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); - return ret; -} - -static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -{ - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - scoped_guard(spinlock, &ob->lock) { - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - return true; - } - } - - return false; -} - -void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) -{ - /* BCH_DATA_free == all rw devs */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (rw && - (i == BCH_DATA_free || - (ca->mi.data_allowed & BIT(i)))) - set_bit(ca->dev_idx, c->rw_devs[i].d); - else - clear_bit(ca->dev_idx, c->rw_devs[i].d); -} - -/* device goes ro: */ -void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - /* First, remove device from allocation groups: */ - bch2_dev_allocator_set_rw(c, ca, false); - - c->rw_devs_change_count++; - - /* - * Capacity is calculated based on devices in allocation groups: - */ - bch2_recalc_capacity(c); - - bch2_open_buckets_stop(c, ca, false); - - /* - * Wake up threads that were blocked on allocation, so they can notice - * the device can no longer be removed and the capacity has changed: - */ - closure_wake_up(&c->freelist_wait); - - /* - * journal_res_get() can block waiting for free space in the journal - - * it needs to notice there may not be devices to allocate from anymore: - */ - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - closure_wait_event(&c->open_buckets_wait, - !bch2_dev_has_open_write_point(c, ca)); -} - -/* device goes rw: */ -void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - bch2_dev_allocator_set_rw(c, ca, true); - c->rw_devs_change_count++; -} - -void bch2_dev_allocator_background_exit(struct bch_dev *ca) -{ - darray_exit(&ca->discard_buckets_in_flight); -} - -void bch2_dev_allocator_background_init(struct bch_dev *ca) -{ - mutex_init(&ca->discard_buckets_in_flight_lock); - INIT_WORK(&ca->discard_work, bch2_do_discards_work); - INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); -} - -void bch2_fs_allocator_background_init(struct bch_fs *c) -{ - spin_lock_init(&c->freelist_lock); -} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h deleted file mode 100644 index 0cc5adc55b6f1e..00000000000000 --- a/fs/bcachefs/alloc_background.h +++ /dev/null @@ -1,361 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -#define _BCACHEFS_ALLOC_BACKGROUND_H - -#include "bcachefs.h" -#include "alloc_types.h" -#include "buckets.h" -#include "debug.h" -#include "super.h" - -/* How out of date a pointer gen is allowed to be: */ -#define BUCKET_GC_GEN_MAX 96U - -static inline bool
bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - return ca && bucket_valid(ca, pos.offset); -} - -static inline u64 bucket_to_u64(struct bpos bucket) -{ - return (bucket.inode << 48) | bucket.offset; -} - -static inline struct bpos u64_to_bucket(u64 bucket) -{ - return POS(bucket >> 48, bucket & ~(~0ULL << 48)); -} - -static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) -{ - return a.gen - a.oldest_gen; -} - -static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) -{ - dst->gen = src.gen; - dst->data_type = src.data_type; - dst->stripe_sectors = src.stripe_sectors; - dst->dirty_sectors = src.dirty_sectors; - dst->cached_sectors = src.cached_sectors; - dst->stripe = src.stripe; -} - -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - struct bch_alloc_v4 ret = {}; - __bucket_m_to_alloc(&ret, b); - return ret; -} - -static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) -{ - switch (data_type) { - case BCH_DATA_cached: - case BCH_DATA_stripe: - return BCH_DATA_user; - default: - return data_type; - } -} - -static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, - enum bch_data_type ptr) -{ - return !data_type_is_empty(bucket) && - bucket_data_type(bucket) != bucket_data_type(ptr); -} - -/* - * It is my general preference to use unsigned types for unsigned quantities - - * however, these helpers are used in disk accounting calculations run by - * triggers where the output will be negated and added to an s64. unsigned is - * right out even though all these quantities will fit in 32 bits, since it - * won't be sign extended correctly; u64 will negate "correctly", but s64 is the - * simpler option here. - */ -static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; -} - -static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) -{ - return a.stripe_sectors + a.dirty_sectors; -} - -static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.cached_sectors - : bch2_bucket_sectors_dirty(a); -} - -static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) -{ - int d = bch2_bucket_sectors(a); - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) -{ - int d = a.stripe_sectors + a.dirty_sectors; - - return d ? max(0, ca->mi.bucket_size - d) : 0; -} - -static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; -} - -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) -{ - if (a.stripe) - return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (bch2_bucket_sectors_dirty(a)) - return bucket_data_type(data_type); - if (a.cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; -} - -static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) -{ - a->data_type = alloc_data_type(*a, data_type); -} - -static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) -{ - return a.data_type == BCH_DATA_cached - ? a.io_time[READ] & LRU_TIME_MAX - : 0; -} - -#define DATA_TYPES_MOVABLE \ - ((1U << BCH_DATA_btree)| \ - (1U << BCH_DATA_user)| \ - (1U << BCH_DATA_stripe)) - -static inline bool data_type_movable(enum bch_data_type type) -{ - return (1U << type) & DATA_TYPES_MOVABLE; -} - -static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, - struct bch_dev *ca) -{ - if (a.data_type >= BCH_DATA_NR) - return 0; - - if (!data_type_movable(a.data_type) || - !bch2_bucket_sectors_fragmented(ca, a)) - return 0; - - /* - * avoid overflowing LRU_TIME_BITS on a corrupted fs, when - * bucket_sectors_dirty is (much) bigger than bucket_size - */ - u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); - - return div_u64(d * (1ULL << 31), ca->mi.bucket_size); -} - -static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) -{ - return ((u64) alloc_gc_gen(a) >> 4) << 56; -} - -static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) -{ - pos.offset |= alloc_freespace_genbits(a); - return pos; -} - -static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) -{ - return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a) * - (sizeof(struct bch_backpointer) / sizeof(u64)); -} - -static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) -{ - unsigned ret = alloc_v4_u64s_noerror(a); - BUG_ON(ret > U8_MAX - BKEY_U64s); - return ret; -} - -static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) -{ - set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); -} - -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, - enum btree_iter_update_trigger_flags); - -void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); - -static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) -{ - const struct bch_alloc_v4 *ret; - - if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) - goto slowpath; - - ret = bkey_s_c_to_alloc_v4(k).v; - if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) - goto slowpath; - - return ret; -slowpath: - __bch2_alloc_to_v4(k, convert); - return convert; -} - -struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); - -int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); - -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_alloc_v4_swab(struct bkey_s); -void bch2_alloc_to_text(struct printbuf *, struct 
bch_fs *, struct bkey_s_c); -void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v1_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v2_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v3_validate, \ - .val_to_text = bch2_alloc_to_text, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_v4_to_text, \ - .swab = bch2_alloc_v4_swab, \ - .trigger = bch2_trigger_alloc, \ - .min_val_size = 48, \ -}) - -int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_validate = bch2_bucket_gens_validate, \ - .val_to_text = bch2_bucket_gens_to_text, \ -}) - -int bch2_bucket_gens_init(struct bch_fs *); - -static inline bool bkey_is_alloc(const struct bkey *k) -{ - return k->type == KEY_TYPE_alloc || - k->type == KEY_TYPE_alloc_v2 || - k->type == KEY_TYPE_alloc_v3; -} - -int bch2_alloc_read(struct bch_fs *); - -int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, - const struct bch_alloc_v4 *, - const struct bch_alloc_v4 *, unsigned); -int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); -int bch2_check_alloc_info(struct bch_fs *); -int bch2_check_alloc_to_lru_refs(struct bch_fs *); -void bch2_dev_do_discards(struct bch_dev *); -void bch2_do_discards(struct bch_fs *); - -static inline u64 should_invalidate_buckets(struct bch_dev *ca, - struct bch_dev_usage u) -{ - u64 want_free = ca->mi.nbuckets >> 7; - u64 free = max_t(s64, 0, - u.buckets[BCH_DATA_free] - + u.buckets[BCH_DATA_need_discard] - - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - - return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); -} - -void bch2_dev_do_invalidates(struct bch_dev *); -void bch2_do_invalidates(struct bch_fs *); - -static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + - (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: - BCH_ALLOC_V4_U64s_V0)); -} - -static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) -{ - return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); -} - -int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); -int bch2_fs_freespace_init(struct bch_fs *); -int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); - -void bch2_recalc_capacity(struct bch_fs *); -u64 bch2_min_rw_member_capacity(struct bch_fs *); - -void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); -void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -void bch2_dev_allocator_background_exit(struct bch_dev *); -void 
bch2_dev_allocator_background_init(struct bch_dev *); - -void bch2_fs_allocator_background_init(struct bch_fs *); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h deleted file mode 100644 index 740238369a5a21..00000000000000 --- a/fs/bcachefs/alloc_background_format.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H -#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq_nonempty; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - /* end of fields in original version of alloc_v4 */ - __u64 journal_seq_empty; - __u32 stripe_sectors; - __u32 pad; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c deleted file mode 100644 index b58525ec7b4d3c..00000000000000 --- a/fs/bcachefs/alloc_foreground.c +++ /dev/null @@ -1,1683 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2012 Google, Inc. - * - * Foreground allocator code: allocate buckets from freelist, and allocate in - * sector granularity from writepoints. - * - * bch2_bucket_alloc() allocates a single bucket from a specific device. - * - * bch2_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. 
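 * (Illustrative sketch, not part of this patch: a hypothetical caller of the
 * bch2_bucket_alloc() entry point described above. The function name
 * example_alloc_one_bucket and its error handling are assumptions for
 * illustration only; the watermark/data type constants and the ERR_PTR()
 * convention are the ones used throughout this file.
 *
 *	static int example_alloc_one_bucket(struct bch_fs *c, struct bch_dev *ca)
 *	{
 *		struct open_bucket *ob = bch2_bucket_alloc(c, ca,
 *				BCH_WATERMARK_normal, BCH_DATA_user, NULL);
 *		if (IS_ERR(ob))
 *			return PTR_ERR(ob);
 *
 *		...write into the bucket, insert the new pointer into the btree...
 *
 *		bch2_open_bucket_put(c, ob);
 *		return 0;
 *	}
 *
 * Note that the put comes only after the allocation has been made reachable,
 * per the open_bucket reference rules described below.)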
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_gc.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "clock.h" -#include "debug.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "io_write.h" -#include "journal.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "trace.h" - -#include -#include -#include - -static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, - struct mutex *lock) -{ - if (!mutex_trylock(lock)) { - bch2_trans_unlock(trans); - mutex_lock(lock); - } -} - -const char * const bch2_watermarks[] = { -#define x(t) #t, - BCH_WATERMARKS() -#undef x - NULL -}; - -/* - * Open buckets represent a bucket that's currently being allocated from. They - * serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. - */ - -void bch2_reset_alloc_cursors(struct bch_fs *c) -{ - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) - memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); -} - -static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - ob->hash = *slot; - *slot = idx; -} - -static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) -{ - open_bucket_idx_t idx = ob - c->open_buckets; - open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); - - while (*slot != idx) { - BUG_ON(!*slot); - slot = &c->open_buckets[*slot].hash; - } - - *slot = ob->hash; - ob->hash = 0; -} - -void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (ob->ec) { - ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); - return; - } - - spin_lock(&ob->lock); - ob->valid = false; - ob->data_type = 0; - spin_unlock(&ob->lock); - - spin_lock(&c->freelist_lock); - bch2_open_bucket_hash_remove(c, ob); - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - - c->open_buckets_nr_free++; - ca->nr_open_buckets--; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); -} - -void bch2_open_bucket_write_error(struct bch_fs *c, - struct open_buckets *obs, - unsigned dev, int err) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob, err); -} - -static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -{ - struct open_bucket *ob; - - BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); - - ob = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ob->freelist; - atomic_set(&ob->pin, 1); - ob->data_type = 0; - - c->open_buckets_nr_free--; - return ob; -} - -static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - if 
(c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) - return false; - - return bch2_is_superblock_bucket(ca, b); -} - -static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) -{ - BUG_ON(c->open_buckets_partial_nr >= - ARRAY_SIZE(c->open_buckets_partial)); - - spin_lock(&c->freelist_lock); - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - - ob->on_partial_list = true; - c->open_buckets_partial[c->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); -} - -static inline bool may_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - struct bpos bucket) -{ - if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - req->counters.skipped_open++; - return false; - } - - u64 journal_seq_ready = - bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, - bucket.inode, bucket.offset); - if (journal_seq_ready > c->journal.flushed_seq_ondisk) { - if (journal_seq_ready > c->journal.flushing_seq) - req->counters.need_journal_commit++; - req->counters.skipped_need_journal_commit++; - return false; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - req->counters.skipped_nocow++; - return false; - } - - return true; -} - -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, - struct alloc_request *req, - u64 bucket, u8 gen, - struct closure *cl) -{ - struct bch_dev *ca = req->ca; - - if (unlikely(is_superblock_bucket(c, ca, bucket))) - return NULL; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - req->counters.skipped_nouse++; - return NULL; - } - - spin_lock(&c->freelist_lock); - - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); - spin_unlock(&c->freelist_lock); - return ERR_PTR(bch_err_throw(c, open_buckets_empty)); - } - - /* Recheck under lock: */ - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - spin_unlock(&c->freelist_lock); - req->counters.skipped_open++; - return NULL; - } - - struct open_bucket *ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->dev = ca->dev_idx; - ob->gen = gen; - ob->bucket = bucket; - spin_unlock(&ob->lock); - - ca->nr_open_buckets++; - bch2_open_bucket_hash_add(c, ob); - - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); - track_event_change(&c->times[BCH_TIME_blocked_allocate], false); - - spin_unlock(&c->freelist_lock); - return ob; -} - -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, - struct alloc_request *req, - struct btree_iter *freespace_iter, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - - if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) - return NULL; - - u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); - if (ret < 0) - return ERR_PTR(ret); - if (ret) - return NULL; - - return __try_alloc_bucket(c, req, b, gen, cl); -} - -/* - * This path is for before the freespace btree is initialized: - */ -static noinline struct open_bucket * -bch2_bucket_alloc_early(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = 
req->ca; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; - struct open_bucket *ob = NULL; - u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max(first_bucket, *dev_alloc_cursor); - u64 alloc_cursor = alloc_start; - int ret; - - /* - * Scan with an uncached iterator to avoid polluting the key cache. An - * uncached iter will return a cached key if one exists, but if not - * there is no other underlying protection for the associated key cache - * slot. To avoid racing bucket allocations, look up the cached key slot - * of any likely allocation candidate before attempting to proceed with - * the allocation. This provides proper exclusion on the associated - * bucket. - */ -again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_slots, k, ret) { - u64 bucket = k.k->p.offset; - - if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) - break; - - if (req->btree_bitmap != BTREE_BITMAP_ANY && - req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - break; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, - 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - req->counters.buckets_seen++; - req->counters.skipped_mi_btree_bitmap++; - continue; - } - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) - continue; - - /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); - ret = bkey_err(ck); - if (ret) - break; - - a = bch2_alloc_to_v4(ck, &a_convert); - if (a->data_type != BCH_DATA_free) - goto next; - - req->counters.buckets_seen++; - - ob = may_alloc_bucket(c, req, k.k->p) - ? 
__try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) - : NULL; -next: - bch2_set_btree_iter_dontneed(trans, &citer); - bch2_trans_iter_exit(trans, &citer); - if (ob) - break; - } - bch2_trans_iter_exit(trans, &iter); - - alloc_cursor = iter.pos.offset; - - if (!ob && ret) - ob = ERR_PTR(ret); - - if (!ob && alloc_start > first_bucket) { - alloc_cursor = alloc_start = first_bucket; - goto again; - } - - *dev_alloc_cursor = alloc_cursor; - - return ob; -} - -static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_dev *ca = req->ca; - struct btree_iter iter; - struct bkey_s_c k; - struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); - u64 alloc_cursor = alloc_start; - int ret; -again: - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), - POS(ca->dev_idx, U64_MAX), - 0, k, ret) { - /* - * peek normally doesn't trim extents - they can span iter.pos, - * which is not what we want here: - */ - iter.k.size = iter.k.p.offset - iter.pos.offset; - - while (iter.k.size) { - req->counters.buckets_seen++; - - u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (req->btree_bitmap != BTREE_BITMAP_ANY && - req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, - bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (req->btree_bitmap == BTREE_BITMAP_YES && - bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) - goto fail; - - bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket + 1), - 1ULL << ca->mi.btree_bitmap_shift)); - alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); - req->counters.skipped_mi_btree_bitmap++; - goto next; - } - - ob = try_alloc_bucket(trans, req, &iter, cl); - if (ob) { - if (!IS_ERR(ob)) - *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(trans, &iter); - break; - } - - iter.k.size--; - iter.pos.offset++; - } -next: - if (ob || ret) - break; - } -fail: - bch2_trans_iter_exit(trans, &iter); - - BUG_ON(ob && ret); - - if (ret) - ob = ERR_PTR(ret); - - if (!ob && alloc_start > ca->mi.first_bucket) { - alloc_cursor = alloc_start = ca->mi.first_bucket; - goto again; - } - - return ob; -} - -static noinline void trace_bucket_alloc2(struct bch_fs *c, - struct alloc_request *req, - struct closure *cl, - struct open_bucket *ob) -{ - struct printbuf buf = PRINTBUF; - - printbuf_tabstop_push(&buf, 24); - - prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); - prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); - prt_printf(&buf, "copygc_wait\t%llu/%lli\n", - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); - prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", 
req->counters.skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); - - if (!IS_ERR(ob)) { - prt_printf(&buf, "allocated\t%llu\n", ob->bucket); - trace_bucket_alloc(c, buf.buf); - } else { - prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); - trace_bucket_alloc_fail(c, buf.buf); - } - - printbuf_exit(&buf); -} - -/** - * bch2_bucket_alloc_trans - allocate a single bucket from a specific device - * @trans: transaction object - * @req: state for the entire allocation - * @cl: if not NULL, closure to be used to wait if buckets not available - * @nowait: if true, do not wait for buckets to become available - * - * Returns: an open_bucket on success, or an ERR_PTR() on failure. - */ -static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl, - bool nowait) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = req->ca; - struct open_bucket *ob = NULL; - bool freespace = READ_ONCE(ca->mi.freespace_initialized); - u64 avail; - bool waiting = nowait; - - req->btree_bitmap = req->data_type == BCH_DATA_btree; - memset(&req->counters, 0, sizeof(req->counters)); -again: - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark); - - if (req->usage.buckets[BCH_DATA_need_discard] > - min(avail, ca->mi.nbuckets >> 7)) - bch2_dev_do_discards(ca); - - if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) - bch2_gc_gens_async(c); - - if (should_invalidate_buckets(ca, req->usage)) - bch2_dev_do_invalidates(ca); - - if (!avail) { - if (req->watermark > BCH_WATERMARK_normal && - c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - goto alloc; - - if (cl && !waiting) { - closure_wait(&c->freelist_wait, cl); - waiting = true; - goto again; - } - - track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - - ob = ERR_PTR(bch_err_throw(c, freelist_empty)); - goto err; - } - - if (waiting) - closure_wake_up(&c->freelist_wait); -alloc: - ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, req, cl) - : bch2_bucket_alloc_early(trans, req, cl); - - if (req->counters.need_journal_commit * 2 > avail) - bch2_journal_flush_async(&c->journal, NULL); - - if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { - req->btree_bitmap = BTREE_BITMAP_ANY; - goto alloc; - } - - if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { - freespace = false; - goto alloc; - } -err: - if (!ob) - ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); - - if (!IS_ERR(ob)) - ob->data_type = req->data_type; - - if (!IS_ERR(ob)) - count_event(c, bucket_alloc); - else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - count_event(c, bucket_alloc_fail); - - if (!IS_ERR(ob) - ? 
trace_bucket_alloc_enabled() - : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, req, cl, ob); - - return ob; -} - -struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl) -{ - struct open_bucket *ob; - struct alloc_request req = { - .watermark = watermark, - .data_type = data_type, - .ca = ca, - }; - - bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); - return ob; -} - -static int __dev_stripe_cmp(struct dev_stripe_state *stripe, - unsigned l, unsigned r) -{ - return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); -} - -#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) - -void bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs, - struct dev_alloc_list *ret) -{ - ret->nr = 0; - - unsigned i; - for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret->data[ret->nr++] = i; - - bubble_sort(ret->data, ret->nr, dev_stripe_cmp); -} - -static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ -static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ -static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ - -static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) -{ - /* - * Avoid underflowing clock hands if at all possible: if clock hands go - * to 0 then we lose information - clock hands can be in a wide range if - * we have devices we rarely try to allocate from, if we generally - * allocate from a specified target but only sometimes have to fall back - * to the whole filesystem. - */ - u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ - u64 scale_min = 0; /* minimum we must subtract to avoid overflow */ - - for (u64 *v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { - if (*v) - scale_max = min(scale_max, *v); - if (*v > stripe_clock_hand_max) - scale_min = max(scale_min, *v - stripe_clock_hand_max); - } - - u64 scale = max(scale_min, scale_max); - - for (u64 *v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; -} - -static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, - struct dev_stripe_state *stripe, - struct bch_dev_usage *usage) -{ - /* - * Stripe state has a per device clock hand: we allocate from the device - * with the smallest clock hand. - * - * When we allocate, we don't do a simple increment; we add the inverse - * of the device's free space. This results in round robin behavior that - * biases in favor of the device(s) with more free space. - */ - - u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); - u64 free_space_inv = free_space - ? div64_u64(stripe_clock_hand_inv, free_space) - : stripe_clock_hand_inv; - - /* Saturating add, avoid overflow: */ - u64 sum = *v + free_space_inv; - *v = sum >= *v ? 
sum : U64_MAX; - - if (unlikely(*v > stripe_clock_hand_rescale)) - bch2_stripe_state_rescale(stripe); -} - -void bch2_dev_stripe_increment(struct bch_dev *ca, - struct dev_stripe_state *stripe) -{ - struct bch_dev_usage usage; - - bch2_dev_usage_read_fast(ca, &usage); - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -} - -static int add_new_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - unsigned durability = ob_dev(c, ob)->mi.durability; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - __clear_bit(ob->dev, req->devs_may_alloc.d); - req->nr_effective += durability; - req->have_cache |= !durability; - - ob_push(c, &req->ptrs, ob); - - if (req->nr_effective >= req->nr_replicas) - return 1; - if (ob->ec) - return 1; - return 0; -} - -inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct alloc_request *req, - struct dev_stripe_state *stripe, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - BUG_ON(req->nr_effective >= req->nr_replicas); - - bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) { - req->ca = bch2_dev_tryget_noerror(c, *i); - if (!req->ca) - continue; - - if (!req->ca->mi.durability && req->have_cache) { - bch2_dev_put(req->ca); - continue; - } - - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, - req->flags & BCH_WRITE_alloc_nowait); - if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); - bch2_dev_put(req->ca); - - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) - break; - continue; - } - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - - if (ret == 1) - return 0; - if (ret) - return ret; - return bch_err_throw(c, insufficient_devices); -} - -/* Allocate from stripes: */ - -/* - * if we can't allocate a new stripe because there are already too many - * partially filled stripes, force allocating from an existing stripe even when - * it's to a device we don't want: - */ - -static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (req->nr_replicas < 2) - return 0; - - if (ec_open_bucket(c, &req->ptrs)) - return 0; - - struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, req, 0, cl); - if (IS_ERR(h)) - return PTR_ERR(h); - if (!h) - return 0; - - bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); - - darray_for_each(req->devs_sorted, i) - for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { - if (!h->s->blocks[ec_idx]) - continue; - - struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = add_new_bucket(c, req, ob); - goto out; - } - } -out: - bch2_ec_stripe_head_put(c, h); - return ret; -} - -/* Sector allocator */ - -static bool want_bucket(struct bch_fs *c, - struct alloc_request *req, - struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - if (!test_bit(ob->dev, req->devs_may_alloc.d)) - return false; - - if (ob->data_type != req->wp->data_type) - return false; - - if (!ca->mi.durability && - (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) - return false; - - if (req->ec != (ob->ec != NULL)) - return false; - - return true; -} - -static int 
bucket_alloc_set_writepoint(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned i; - int ret = 0; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - if (!ret && want_bucket(c, req, ob)) - ret = add_new_bucket(c, req, ob); - else - ob_push(c, &req->scratch_ptrs, ob); - } - req->wp->ptrs = req->scratch_ptrs; - - return ret; -} - -static int bucket_alloc_set_partial(struct bch_fs *c, - struct alloc_request *req) -{ - int i, ret = 0; - - if (!c->open_buckets_partial_nr) - return 0; - - spin_lock(&c->freelist_lock); - - if (!c->open_buckets_partial_nr) - goto unlock; - - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { - struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - - if (want_bucket(c, req, ob)) { - struct bch_dev *ca = ob_dev(c, ob); - u64 avail; - - bch2_dev_usage_read_fast(ca, &req->usage); - avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; - if (!avail) - continue; - - array_remove_item(c->open_buckets_partial, - c->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - - ret = add_new_bucket(c, req, ob); - if (ret) - break; - } - } -unlock: - spin_unlock(&c->freelist_lock); - return ret; -} - -static int __open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *_cl) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - struct closure *cl = NULL; - unsigned i; - int ret; - - req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); - - /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*req->devs_have, i) - __clear_bit(*i, req->devs_may_alloc.d); - - open_bucket_for_each(c, &req->ptrs, ob, i) - __clear_bit(ob->dev, req->devs_may_alloc.d); - - ret = bucket_alloc_set_writepoint(c, req); - if (ret) - return ret; - - ret = bucket_alloc_set_partial(c, req); - if (ret) - return ret; - - if (req->ec) { - ret = bucket_alloc_from_stripe(trans, req, _cl); - } else { -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; - } - } - - return ret; -} - -static int open_bucket_add_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct closure *cl) -{ - int ret; - - if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { - ret = __open_bucket_add_buckets(trans, req, cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_operation_blocked) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (req->nr_effective >= req->nr_replicas) - return 0; - } - - bool ec = false; - swap(ec, req->ec); - ret = __open_bucket_add_buckets(trans, req, cl); - swap(ec, req->ec); - - return ret < 0 ? 
ret : 0; -} - -/** - * should_drop_bucket - check if this open_bucket should go away - * @ob: open_bucket to predicate on - * @c: filesystem handle - * @ca: if set, we're killing buckets for a particular device - * @ec: if true, we're shutting down erasure coding and killing all ec - * open_buckets; if neither @ca nor @ec is set, every open_bucket - * should be dropped - * Returns: true if we should kill this open_bucket - * - * We're killing open_buckets because we're shutting down a device, erasure - * coding, or the entire filesystem - check if this open_bucket matches: - */ -static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, - struct bch_dev *ca, bool ec) -{ - if (ec) { - return ob->ec != NULL; - } else if (ca) { - bool drop = ob->dev == ca->dev_idx; - struct open_bucket *ob2; - unsigned i; - - if (!drop && ob->ec) { - unsigned nr_blocks; - - mutex_lock(&ob->ec->lock); - nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; - - for (i = 0; i < nr_blocks; i++) { - if (!ob->ec->blocks[i]) - continue; - - ob2 = c->open_buckets + ob->ec->blocks[i]; - drop |= ob2->dev == ca->dev_idx; - } - mutex_unlock(&ob->ec->lock); - } - - return drop; - } else { - return true; - } -} - -static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - bool ec, struct write_point *wp) -{ - struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (should_drop_bucket(ob, c, ca, ec)) - bch2_open_bucket_put(c, ob); - else - ob_push(c, &ptrs, ob); - wp->ptrs = ptrs; - mutex_unlock(&wp->lock); -} - -void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, - bool ec) -{ - unsigned i; - - /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - spin_lock(&c->freelist_lock); - i = 0; - while (i < c->open_buckets_partial_nr) { - struct open_bucket *ob = - c->open_buckets + c->open_buckets_partial[i]; - - if (should_drop_bucket(ob, c, ca, ec)) { - --c->open_buckets_partial_nr; - swap(c->open_buckets_partial[i], - c->open_buckets_partial[c->open_buckets_partial_nr]); - - ob->on_partial_list = false; - - scoped_guard(rcu) - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - - spin_unlock(&c->freelist_lock); - bch2_open_bucket_put(c, ob); - spin_lock(&c->freelist_lock); - } else { - i++; - } - } - spin_unlock(&c->freelist_lock); - - bch2_ec_stop_dev(c, ca); -} - -static inline struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - -static struct write_point *__writepoint_find(struct hlist_head *head, - unsigned long write_point) -{ - struct write_point *wp; - - guard(rcu)(); - hlist_for_each_entry_rcu(wp, head, node) - if (wp->write_point == write_point) - return wp; - return NULL; -} - -static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -{ - u64 stranded = c->write_points_nr * 
c->bucket_size_max; - u64 free = bch2_fs_usage_read_short(c).free; - - return stranded * factor > free; -} - -static noinline bool try_increase_writepoints(struct bch_fs *c) -{ - struct write_point *wp; - - if (c->write_points_nr == ARRAY_SIZE(c->write_points) || - too_many_writepoints(c, 32)) - return false; - - wp = c->write_points + c->write_points_nr++; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - return true; -} - -static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&c->write_points_hash_lock); - if (c->write_points_nr < old_nr) { - mutex_unlock(&c->write_points_hash_lock); - return true; - } - - if (c->write_points_nr == 1 || - !too_many_writepoints(c, 8)) { - mutex_unlock(&c->write_points_hash_lock); - return false; - } - - wp = c->write_points + --c->write_points_nr; - - hlist_del_rcu(&wp->node); - mutex_unlock(&c->write_points_hash_lock); - - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - wp->ptrs.nr = 0; - mutex_unlock(&wp->lock); - return true; -} - -static struct write_point *writepoint_find(struct btree_trans *trans, - unsigned long write_point) -{ - struct bch_fs *c = trans->c; - struct write_point *wp, *oldest; - struct hlist_head *head; - - if (!(write_point & 1UL)) { - wp = (struct write_point *) write_point; - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - return wp; - } - - head = writepoint_hash(c, write_point); -restart_find: - wp = __writepoint_find(head, write_point); - if (wp) { -lock_wp: - bch2_trans_mutex_lock_norelock(trans, &wp->lock); - if (wp->write_point == write_point) - goto out; - mutex_unlock(&wp->lock); - goto restart_find; - } -restart_find_oldest: - oldest = NULL; - for (wp = c->write_points; - wp < c->write_points + c->write_points_nr; wp++) - if (!oldest || time_before64(wp->last_used, oldest->last_used)) - oldest = wp; - - bch2_trans_mutex_lock_norelock(trans, &oldest->lock); - bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); - if (oldest >= c->write_points + c->write_points_nr || - try_increase_writepoints(c)) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto restart_find_oldest; - } - - wp = __writepoint_find(head, write_point); - if (wp && wp != oldest) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto lock_wp; - } - - wp = oldest; - hlist_del_rcu(&wp->node); - wp->write_point = write_point; - hlist_add_head_rcu(&wp->node, head); - mutex_unlock(&c->write_points_hash_lock); -out: - wp->last_used = local_clock(); - return wp; -} - -static noinline void -deallocate_extra_replicas(struct bch_fs *c, - struct alloc_request *req) -{ - struct open_bucket *ob; - unsigned extra_replicas = req->nr_effective - req->nr_replicas; - unsigned i; - - req->scratch_ptrs.nr = 0; - - open_bucket_for_each(c, &req->ptrs, ob, i) { - unsigned d = ob_dev(c, ob)->mi.durability; - - if (d && d <= extra_replicas) { - extra_replicas -= d; - ob_push(c, &req->wp->ptrs, ob); - } else { - ob_push(c, &req->scratch_ptrs, ob); - } - } - - req->ptrs = req->scratch_ptrs; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, 
- unsigned nr_replicas, - unsigned nr_replicas_required, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl, - struct write_point **wp_ret) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - unsigned write_points_nr; - int i; - - struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); - int ret = PTR_ERR_OR_ZERO(req); - if (unlikely(ret)) - return ret; - - if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) - erasure_code = false; - - req->nr_replicas = nr_replicas; - req->target = target; - req->ec = erasure_code; - req->watermark = watermark; - req->flags = flags; - req->devs_have = devs_have; - - BUG_ON(!nr_replicas || !nr_replicas_required); -retry: - req->ptrs.nr = 0; - req->nr_effective = 0; - req->have_cache = false; - write_points_nr = c->write_points_nr; - - *wp_ret = req->wp = writepoint_find(trans, write_point.v); - - req->data_type = req->wp->data_type; - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - /* metadata may not allocate on cache devices: */ - if (req->data_type != BCH_DATA_user) - req->have_cache = true; - - if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, req, NULL); - if (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto alloc_done; - - /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, req, cl); - if (!ret2 || - bch2_err_matches(ret2, BCH_ERR_transaction_restart) || - bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { - ret = ret2; - goto alloc_done; - } - } - - /* - * Only try to allocate cache (durability = 0 devices) from the - * specified target: - */ - req->have_cache = true; - req->target = 0; - - ret = open_bucket_add_buckets(trans, req, cl); - } else { - ret = open_bucket_add_buckets(trans, req, cl); - } -alloc_done: - BUG_ON(!ret && req->nr_effective < req->nr_replicas); - - if (erasure_code && !ec_open_bucket(c, &req->ptrs)) - pr_debug("failed to get ec bucket: ret %u", ret); - - if (ret == -BCH_ERR_insufficient_devices && - req->nr_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - - if (req->nr_effective > req->nr_replicas) - deallocate_extra_replicas(c, req); - - /* Free buckets we didn't use: */ - open_bucket_for_each(c, &req->wp->ptrs, ob, i) - open_bucket_free_unused(c, ob); - - req->wp->ptrs = req->ptrs; - - req->wp->sectors_free = UINT_MAX; - - open_bucket_for_each(c, &req->wp->ptrs, ob, i) { - /* - * Ensure proper write alignment - either due to misaligned - * bucket sizes (from buggy bcachefs-tools), or writes that mix - * logical/physical alignment: - */ - struct bch_dev *ca = ob_dev(c, ob); - u64 offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free; - unsigned align = round_up(offset, block_sectors(c)) - offset; - - ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - - req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); - } - - req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); - - /* Did alignment use up space in an open_bucket? 
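 * (Illustrative aside, not part of this patch: a worked instance of the
 * alignment arithmetic above, with assumed numbers. Take a bucket starting
 * at sector 2048 with bucket_size 1024, sectors_free 100, and
 * block_sectors(c) == 8: then offset = 2048 + 1024 - 100 = 2972,
 * round_up(2972, 8) = 2976, so align = 4 and sectors_free drops to 96,
 * which is block aligned again. If alignment consumed everything,
 * wp->sectors_free would round down to 0 - hence the retry just below.)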
*/ - if (unlikely(!req->wp->sectors_free)) { - bch2_alloc_sectors_done(c, req->wp); - goto retry; - } - - BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); - - return 0; -err: - open_bucket_for_each(c, &req->wp->ptrs, ob, i) - if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) - ob_push(c, &req->ptrs, ob); - else - open_bucket_free_unused(c, ob); - req->wp->ptrs = req->ptrs; - - mutex_unlock(&req->wp->lock); - - if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && - try_decrease_writepoints(trans, write_points_nr)) - goto retry; - - if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = bch_err_throw(c, bucket_alloc_blocked); - - if (cl && !(flags & BCH_WRITE_alloc_nowait) && - bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = bch_err_throw(c, bucket_alloc_blocked); - - return ret; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -{ - bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); -} - -/* - * Finished allocating from @wp: release any open_buckets too full to be - * useful, and unlock the write point - */ -void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -{ - bch2_alloc_sectors_done_inlined(c, wp); -} - -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->data_type = type; - - INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); - INIT_LIST_HEAD(&wp->writes); - spin_lock_init(&wp->writes_lock); -} - -void bch2_fs_allocator_foreground_init(struct bch_fs *c) -{ - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); - c->write_points_nr = ARRAY_SIZE(c->write_points); - - /* open bucket 0 is a sentinel NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - writepoint_init(&c->btree_write_point, BCH_DATA_btree); - writepoint_init(&c->rebalance_write_point, BCH_DATA_user); - writepoint_init(&c->copygc_write_point, BCH_DATA_user); - - for (wp = c->write_points; - wp < c->write_points + c->write_points_nr; wp++) { - writepoint_init(wp, BCH_DATA_user); - - wp->last_used = local_clock(); - wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, - writepoint_hash(c, wp->write_point)); - } -} - -void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - unsigned data_type = ob->data_type; - barrier(); /* READ_ONCE() doesn't work on bitfields */ - - prt_printf(out, "%zu ref %u ", - ob - c->open_buckets, - atomic_read(&ob->pin)); - bch2_prt_data_type(out, data_type); - prt_printf(out, " %u:%llu gen %u allocated %u/%u", - ob->dev, ob->bucket, ob->gen, - ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); - if (ob->ec) - prt_printf(out, " ec idx %llu", ob->ec->idx); - if (ob->on_partial_list) - prt_str(out, " partial"); - prt_newline(out); -} - -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_dev *ca) -{ - struct open_bucket *ob; - - out->atomic++; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && (!ca || ob->dev == ca->dev_idx)) - 
bch2_open_bucket_to_text(out, c, ob); - spin_unlock(&ob->lock); - } - - --out->atomic; -} - -void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned i; - - out->atomic++; - spin_lock(&c->freelist_lock); - - for (i = 0; i < c->open_buckets_partial_nr; i++) - bch2_open_bucket_to_text(out, c, - c->open_buckets + c->open_buckets_partial[i]); - - spin_unlock(&c->freelist_lock); - --out->atomic; -} - -static const char * const bch2_write_point_states[] = { -#define x(n) #n, - WRITE_POINT_STATES() -#undef x - NULL -}; - -static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, - struct write_point *wp) -{ - struct open_bucket *ob; - unsigned i; - - mutex_lock(&wp->lock); - - prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated << 9); - - prt_printf(out, " last wrote: "); - bch2_pr_time_units(out, sched_clock() - wp->last_used); - - for (i = 0; i < WRITE_POINT_STATE_NR; i++) { - prt_printf(out, " %s: ", bch2_write_point_states[i]); - bch2_pr_time_units(out, wp->time[i]); - } - - prt_newline(out); - - printbuf_indent_add(out, 2); - open_bucket_for_each(c, &wp->ptrs, ob, i) - bch2_open_bucket_to_text(out, c, ob); - printbuf_indent_sub(out, 2); - - mutex_unlock(&wp->lock); -} - -void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct write_point *wp; - - prt_str(out, "Foreground write points\n"); - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) - bch2_write_point_to_text(out, c, wp); - - prt_str(out, "Copygc write point\n"); - bch2_write_point_to_text(out, c, &c->copygc_write_point); - - prt_str(out, "Rebalance write point\n"); - bch2_write_point_to_text(out, c, &c->rebalance_write_point); - - prt_str(out, "Btree write point\n"); - bch2_write_point_to_text(out, c, &c->btree_write_point); -} - -void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -{ - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_printf(out, "capacity\t%llu\n", c->capacity); - prt_printf(out, "reserved\t%llu\n", c->reserved); - prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); - prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); - prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data)); - prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached)); - prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved)); - prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); - prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes)); - - prt_newline(out); - prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); - prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); - prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? 
"waiting" : "empty"); - prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); - prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); - prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); -} - -void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); - unsigned nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - bch2_dev_usage_to_text(out, ca, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:\n"); - for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) - prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - - prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); -} - -static noinline void bch2_print_allocator_stuck(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n", - c->opts.allocator_stuck_timeout); - - prt_printf(&buf, "Allocator debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_fs_alloc_debug_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - bch2_printbuf_make_room(&buf, 4096); - - buf.atomic++; - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - --buf.atomic; - - prt_printf(&buf, "Copygc debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_copygc_wait_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - - prt_printf(&buf, "Journal debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_journal_debug_to_text(&buf, &c->journal); - printbuf_indent_sub(&buf, 2); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -static inline unsigned allocator_wait_timeout(struct bch_fs *c) -{ - if (c->allocator_last_stuck && - time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) - return 0; - - return c->opts.allocator_stuck_timeout * HZ; -} - -void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - unsigned t = allocator_wait_timeout(c); - - if (t && closure_sync_timeout(cl, t)) { - c->allocator_last_stuck = jiffies; - bch2_print_allocator_stuck(c); - } - - closure_sync(cl); -} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h deleted file mode 100644 index 1b3fc84600963c..00000000000000 --- a/fs/bcachefs/alloc_foreground.h +++ /dev/null @@ -1,318 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -#define _BCACHEFS_ALLOC_FOREGROUND_H - -#include "bcachefs.h" -#include "buckets.h" -#include "alloc_types.h" -#include "extents.h" -#include "io_write_types.h" -#include "sb-members.h" - -#include - -struct bkey; -struct bch_dev; -struct bch_fs; -struct bch_devs_List; - -extern const char * const bch2_watermarks[]; - -void bch2_reset_alloc_cursors(struct bch_fs *); - -struct dev_alloc_list { - unsigned nr; - u8 data[BCH_SB_MEMBERS_MAX]; -}; - -struct alloc_request { - unsigned nr_replicas; - unsigned target; - bool ec; - enum bch_watermark watermark; - enum bch_write_flags flags; - 
enum bch_data_type data_type; - struct bch_devs_list *devs_have; - struct write_point *wp; - - /* These fields are used primarily by open_bucket_add_buckets */ - struct open_buckets ptrs; - unsigned nr_effective; /* sum of @ptrs durability */ - bool have_cache; /* have we allocated from a 0 durability dev */ - struct bch_devs_mask devs_may_alloc; - - /* bch2_bucket_alloc_set_trans(): */ - struct dev_alloc_list devs_sorted; - struct bch_dev_usage usage; - - /* bch2_bucket_alloc_trans(): */ - struct bch_dev *ca; - - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - struct { - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; - } counters; - - unsigned scratch_nr_replicas; - unsigned scratch_nr_effective; - bool scratch_have_cache; - enum bch_data_type scratch_data_type; - struct open_buckets scratch_ptrs; - struct bch_devs_mask scratch_devs_may_alloc; -}; - -void bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *, - struct dev_alloc_list *); -void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); - -static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) -{ - return bch2_dev_have_ref(c, ob->dev); -} - -static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - -struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, enum bch_data_type, - struct closure *); - -static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, - struct open_bucket *ob) -{ - BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); - - obs->v[obs->nr++] = ob - c->open_buckets; -} - -#define open_bucket_for_each(_c, _obs, _ob, _i) \ - for ((_i) = 0; \ - (_i) < (_obs)->nr && \ - ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ - (_i)++) - -static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, - struct open_buckets *obs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) - if (ob->ec) - return ob; - - return NULL; -} - -void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned, int); - -void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - if (atomic_dec_and_test(&ob->pin)) - __bch2_open_bucket_put(c, ob); -} - -static inline void bch2_open_buckets_put(struct bch_fs *c, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, ptrs, ob, i) - bch2_open_bucket_put(c, ob); - ptrs->nr = 0; -} - -static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) -{ - struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, ob->sectors_free < block_sectors(c) - ? 
&ptrs - : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); - - bch2_open_buckets_put(c, &ptrs); -} - -static inline void bch2_open_bucket_get(struct bch_fs *c, - struct write_point *wp, - struct open_buckets *ptrs) -{ - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->data_type = wp->data_type; - atomic_inc(&ob->pin); - ob_push(c, ptrs, ob); - } -} - -static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, - unsigned dev, u64 bucket) -{ - return c->open_buckets_hash + - (jhash_3words(dev, bucket, bucket >> 32, 0) & - (OPEN_BUCKETS_COUNT - 1)); -} - -static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) -{ - open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); - - while (slot) { - struct open_bucket *ob = &c->open_buckets[slot]; - - if (ob->dev == dev && ob->bucket == bucket) - return true; - - slot = ob->hash; - } - - return false; -} - -static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) -{ - bool ret; - - if (bch2_bucket_is_open(c, dev, bucket)) - return true; - - spin_lock(&c->freelist_lock); - ret = bch2_bucket_is_open(c, dev, bucket); - spin_unlock(&c->freelist_lock); - - return ret; -} - -enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, - struct dev_stripe_state *, struct closure *); - -int bch2_alloc_sectors_start_trans(struct btree_trans *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum bch_watermark, - enum bch_write_flags, - struct closure *, - struct write_point **); - -static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -static inline void -bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors, - bool cached) -{ - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - wp->sectors_allocated += sectors; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = ob_dev(c, ob); - struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - - ptr.cached = cached || - (!ca->mi.durability && - wp->data_type == BCH_DATA_user); - - bch2_bkey_append_ptr(k, ptr); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } -} - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned, bool); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); - -void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); - -static inline struct write_point_specifier writepoint_hashed(unsigned long v) -{ - return (struct write_point_specifier) { .v = v | 1 }; -} - -static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -{ - return (struct write_point_specifier) { .v = (unsigned long) wp }; -} - -void bch2_fs_allocator_foreground_init(struct bch_fs *); - -void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct 
bch_dev *); -void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); - -void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); -void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); - -void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); -static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) -{ - if (cl->closure_get_happened) - __bch2_wait_on_allocator(c, cl); -} - -#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h deleted file mode 100644 index e7becdf22cbafd..00000000000000 --- a/fs/bcachefs/alloc_types.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ALLOC_TYPES_H -#define _BCACHEFS_ALLOC_TYPES_H - -#include -#include - -#include "clock_types.h" -#include "fifo.h" - -#define BCH_WATERMARKS() \ - x(stripe) \ - x(normal) \ - x(copygc) \ - x(btree) \ - x(btree_copygc) \ - x(reclaim) \ - x(interior_updates) - -enum bch_watermark { -#define x(name) BCH_WATERMARK_##name, - BCH_WATERMARKS() -#undef x - BCH_WATERMARK_NR, -}; - -#define BCH_WATERMARK_BITS 3 -#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) - -#define OPEN_BUCKETS_COUNT 1024 - -#define WRITE_POINT_HASH_NR 32 -#define WRITE_POINT_MAX 32 - -/* - * 0 is never a valid open_bucket_idx_t: - */ -typedef u16 open_bucket_idx_t; - -struct open_bucket { - spinlock_t lock; - atomic_t pin; - open_bucket_idx_t freelist; - open_bucket_idx_t hash; - - /* - * When an open bucket has an ec_stripe attached, this is the index of - * the block in the stripe this open_bucket corresponds to: - */ - u8 ec_idx; - enum bch_data_type data_type:6; - unsigned valid:1; - unsigned on_partial_list:1; - - u8 dev; - u8 gen; - u32 sectors_free; - u64 bucket; - struct ec_stripe_new *ec; -}; - -#define OPEN_BUCKET_LIST_MAX 15 - -struct open_buckets { - open_bucket_idx_t nr; - open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -}; - -struct dev_stripe_state { - u64 next_alloc[BCH_SB_MEMBERS_MAX]; -}; - -#define WRITE_POINT_STATES() \ - x(stopped) \ - x(waiting_io) \ - x(waiting_work) \ - x(runnable) \ - x(running) - -enum write_point_state { -#define x(n) WRITE_POINT_##n, - WRITE_POINT_STATES() -#undef x - WRITE_POINT_STATE_NR -}; - -struct write_point { - struct { - struct hlist_node node; - struct mutex lock; - u64 last_used; - unsigned long write_point; - enum bch_data_type data_type; - - /* calculated based on how many pointers we're actually going to use: */ - unsigned sectors_free; - - struct open_buckets ptrs; - struct dev_stripe_state stripe; - - u64 sectors_allocated; - } __aligned(SMP_CACHE_BYTES); - - struct { - struct work_struct index_update_work; - - struct list_head writes; - spinlock_t writes_lock; - - enum write_point_state state; - u64 last_state_change; - u64 time[WRITE_POINT_STATE_NR]; - u64 last_runtime; - } __aligned(SMP_CACHE_BYTES); -}; - -struct write_point_specifier { - unsigned long v; -}; - -#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c deleted file mode 100644 index a7cd1f0f09647a..00000000000000 --- a/fs/bcachefs/async_objs.c +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Async obj debugging: keep asynchronous objects on (very fast) lists, make - * them visible in debugfs: - */ - -#include "bcachefs.h" -#include "async_objs.h" -#include "btree_io.h" -#include "debug.h" -#include "io_read.h" 
-#include "io_write.h" - -#include - -static void promote_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_promote_op_to_text(out, obj); -} - -static void rbio_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_read_bio_to_text(out, obj); -} - -static void write_op_obj_to_text(struct printbuf *out, void *obj) -{ - bch2_write_op_to_text(out, obj); -} - -static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_read_bio *rbio = obj; - bch2_btree_read_bio_to_text(out, rbio); -} - -static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) -{ - struct btree_write_bio *wbio = obj; - bch2_bio_to_text(out, &wbio->wbio.bio); -} - -static int bch2_async_obj_list_open(struct inode *inode, struct file *file) -{ - struct async_obj_list *list = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(list, struct bch_fs, async_objs[list->idx]); - i->list = list; - i->buf = PRINTBUF; - return 0; -} - -static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct async_obj_list *list = i->list; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - struct genradix_iter iter; - void *obj; - fast_list_for_each_from(&list->list, iter, obj, i->iter) { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (!i->size) - break; - - list->obj_to_text(&i->buf, obj); - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - else - i->iter = iter.pos; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations async_obj_ops = { - .owner = THIS_MODULE, - .open = bch2_async_obj_list_open, - .release = bch2_dump_release, - .read = bch2_async_obj_list_read, -}; - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) -{ - c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); - -#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ - &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); - BCH_ASYNC_OBJ_LISTS() -#undef x -} - -void bch2_fs_async_obj_exit(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) - fast_list_exit(&c->async_objs[i].list); -} - -int bch2_fs_async_obj_init(struct bch_fs *c) -{ - for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { - if (fast_list_init(&c->async_objs[i].list)) - return -BCH_ERR_ENOMEM_async_obj_init; - c->async_objs[i].idx = i; - } - -#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; - BCH_ASYNC_OBJ_LISTS() -#undef x - - return 0; -} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h deleted file mode 100644 index cd6489b8cf7645..00000000000000 --- a/fs/bcachefs/async_objs.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_H -#define _BCACHEFS_ASYNC_OBJS_H - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS -static inline void __async_object_list_del(struct fast_list *head, unsigned idx) -{ - fast_list_remove(head, idx); -} - -static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) -{ - int ret = fast_list_add(head, obj); - *idx = ret > 0 ? ret : 0; - return ret < 0 ? 
ret : 0; -} - -#define async_object_list_del(_c, _list, idx) \ - __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) - -#define async_object_list_add(_c, _list, obj, idx) \ - __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) - -void bch2_fs_async_obj_debugfs_init(struct bch_fs *); -void bch2_fs_async_obj_exit(struct bch_fs *); -int bch2_fs_async_obj_init(struct bch_fs *); - -#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#define async_object_list_del(_c, _n, idx) do {} while (0) - -static inline int __async_object_list_add(void) -{ - return 0; -} -#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() - -static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} -static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} -static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } - -#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ - -#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h deleted file mode 100644 index 8d713c0f5841d7..00000000000000 --- a/fs/bcachefs/async_objs_types.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H -#define _BCACHEFS_ASYNC_OBJS_TYPES_H - -#define BCH_ASYNC_OBJ_LISTS() \ - x(promote) \ - x(rbio) \ - x(write_op) \ - x(btree_read_bio) \ - x(btree_write_bio) - -enum bch_async_obj_lists { -#define x(n) BCH_ASYNC_OBJ_LIST_##n, - BCH_ASYNC_OBJ_LISTS() -#undef x - BCH_ASYNC_OBJ_NR -}; - -struct async_obj_list { - struct fast_list list; - void (*obj_to_text)(struct printbuf *, void *); - unsigned idx; -}; - -#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c deleted file mode 100644 index 77d93beb3c8f50..00000000000000 --- a/fs/bcachefs/backpointers.c +++ /dev/null @@ -1,1391 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "error.h" -#include "progress.h" -#include "recovery_passes.h" - -#include - -static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); - -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - -int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - int ret = 0; - - bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, - c, backpointer_level_bad, - "backpointer level bad: %u >= %u", - bp.v->level, BTREE_MAX_DEPTH); - - bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, - c, backpointer_dev_bad, - "backpointer for BCH_SB_MEMBER_INVALID"); -fsck_err: - return ret; -} - -void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - struct bch_dev *ca; - u32 bucket_offset; - struct bpos bucket; - scoped_guard(rcu) { - ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) - bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - } - - if (ca) - prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, 
bucket_offset); - else - prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - - bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_str(out, " data_type="); - bch2_prt_data_type(out, bp.v->data_type); - prt_printf(out, " suboffset=%u len=%u gen=%u pos=", - (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len, - bp.v->bucket_gen); - bch2_bpos_to_text(out, bp.v->pos); -} - -void bch2_backpointer_swab(struct bkey_s k) -{ - struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - - bp.v->bucket_len = swab32(bp.v->bucket_len); - bch2_bpos_swab(&bp.v->pos); -} - -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bkey_s_c_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); - - if (bpos_eq(bp.k->p, bp2.k.p) && - !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) - return true; - } - - return false; -} - -static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *new_bp, - struct bkey_s_c found_bp, - bool insert) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool will_check = c->recovery.passes_to_run & - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - int ret = 0; - - if (insert) { - prt_printf(&buf, "existing backpointer found when inserting "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } else if (!will_check) { - prt_printf(&buf, "backpointer not found when deleting\n"); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "searching for "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); - prt_newline(&buf); - - prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, found_bp); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - } - - if (!will_check && __bch2_inconsistent_error(c, &buf)) - ret = bch_err_throw(c, erofs_unfixed_errors); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return ret; -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - struct btree_iter bp_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp->k.p, - BTREE_ITER_intent| - BTREE_ITER_slots| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (insert - ? k.k->type - : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { - ret = backpointer_mod_err(trans, orig_k, bp, k, insert); - if (ret) - goto err; - } - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); -err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; -} - -static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) -{ - return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? 
bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, - struct bkey_s_c visiting_k, - struct bkey_buf *last_flushed) -{ - return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) - ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) - : 0; -} - -static int backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * If we're using the btree write buffer, the backpointer we were - * looking at may have already been deleted - failure to find what it - * pointed to is not an error: - */ - ret = last_flushed - ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) - : 0; - if (ret) - return ret; - - prt_printf(&buf, "backpointer doesn't match %s it points to:\n", - bp.v->level ? "btree node" : "extent"); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, target_k); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) - if (p.ptr.dev == bp.k->p.inode) { - prt_newline(&buf); - struct bkey_i_backpointer bp2; - bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); - } - - if (fsck_err(trans, backpointer_to_missing_ptr, - "%s", buf.buf)) { - ret = bch2_backpointer_del(trans, bp.k->p); - if (ret || !commit) - goto out; - - /* - * Normally, on transaction commit from inside a transaction, - * we'll return -BCH_ERR_transaction_restart_nested, since a - * transaction commit invalidates pointers given out by peek(). - * - * However, since we're updating a write buffer btree, if we - * return a transaction restart and loop we won't see that the - * backpointer has been deleted without an additional write - * buffer flush - and those are expensive. - * - * So we're relying on the caller immediately advancing to the - * next backpointer and starting a new transaction immediately - * after backpointer_get_key() returns NULL: - */ - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - BUG_ON(!bp.v->level); - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level - 1, - 0); - struct btree *b = bch2_btree_iter_peek_node(trans, iter); - if (IS_ERR_OR_NULL(b)) - goto err; - - BUG_ON(b->c.level != bp.v->level - 1); - - if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), bp)) - return b; - - if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); - } else { - int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), - last_flushed, commit); - b = ret ? 
ERR_PTR(ret) : NULL; - } -err: - bch2_trans_iter_exit(trans, iter); - return b; -} - -static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed, - bool commit) -{ - struct bch_fs *c = trans->c; - - if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) - return bkey_s_c_null; - - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, - bp.v->level, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - /* - * peek_slot() doesn't normally return NULL - except when we ask for a - * key at a btree level that doesn't exist. - * - * We may want to revisit this and change peek_slot(): - */ - if (!k.k) { - bkey_init(&iter->k); - iter->k.p = bp.v->pos; - k.k = &iter->k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); - - if (!bp.v->level) { - int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } else { - struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) - return bkey_s_c_null; - if (IS_ERR_OR_NULL(b)) - return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - - return bkey_i_to_s_c(&b->key); - } -} - -struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); -} - -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed) -{ - return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); -} - -static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) -{ - if (k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = {}; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - goto out; - } - - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); - ret = bkey_err(alloc_k); - if (ret) - goto out; - - if (alloc_k.k->type != KEY_TYPE_alloc_v4) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, k.k->p); - } -out: -fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); - return ret; -} - -/* verify that every backpointer has a corresponding alloc key */ -int bch2_check_btree_backpointers(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - 
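	/*
	 * The backpointers btree is updated via the btree write buffer, so the
	 * backpointer we're looking at may already have been deleted by an
	 * update still sitting in the buffer: last_flushed tracks where we
	 * last forced a flush, so that each key triggers at most one flush
	 * before any inconsistency is reported.
	 */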
bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} - -struct extents_to_bp_state { - struct bpos bp_start; - struct bpos bp_end; - struct bkey_buf last_flushed; -}; - -static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, - struct bkey_s_c extent, unsigned dev) -{ - struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bch2_bkey_drop_device(bkey_i_to_s(n), dev); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -static int check_extent_checksum(struct btree_trans *trans, - enum btree_id btree, struct bkey_s_c extent, - enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct printbuf buf = PRINTBUF; - void *data_buf = NULL; - struct bio *bio = NULL; - size_t bytes; - int ret = 0; - - if (bkey_is_btree_ptr(extent.k)) - return false; - - bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) - if (p.ptr.dev == dev) - goto found; - BUG(); -found: - if (!p.crc.csum_type) - return false; - - bytes = p.crc.compressed_size << 9; - - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, - BCH_DEV_READ_REF_check_extent_checksums); - if (!ca) - return false; - - data_buf = kvmalloc(bytes, GFP_KERNEL); - if (!data_buf) { - ret = -ENOMEM; - goto err; - } - - bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = p.ptr.offset; - bch2_bio_map(bio, data_buf, bytes); - ret = submit_bio_wait(bio); - if (ret) - goto err; - - prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent); - prt_newline(&buf); - bch2_btree_id_to_text(&buf, o_btree); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, extent2); - - struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); - struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); - if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), - trans, dup_backpointer_to_bad_csum_extent, - "%s", buf.buf)) - ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; -fsck_err: -err: - if (bio) - bio_put(bio); - kvfree(data_buf); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_check_extent_checksums); - printbuf_exit(&buf); - return ret; -} - -static int check_bp_exists(struct btree_trans *trans, - struct extents_to_bp_state *s, - struct bkey_i_backpointer *bp, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter other_extent_iter = {}; - struct printbuf buf = PRINTBUF; - - if (bpos_lt(bp->k.p, s->bp_start) || - bpos_gt(bp->k.p, s->bp_end)) - return 0; - - struct btree_iter bp_iter; - struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - int ret = bkey_err(bp_k); - if (ret) - goto err; - - if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { - ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); - if (ret) - goto err; - - 
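	/*
	 * With the write buffer settled for this key and the mismatch still
	 * present, check whether this slot holds a backpointer for a
	 * different extent before concluding that ours is genuinely missing.
	 */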
goto check_existing_bp; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &other_extent_iter); - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - return ret; -check_existing_bp: - /* Do we have a backpointer for a different extent? */ - if (bp_k.k->type != KEY_TYPE_backpointer) - goto missing; - - struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); - - struct bkey_s_c other_extent = - __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); - ret = bkey_err(other_extent); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - ret = 0; - if (ret) - goto err; - - if (!other_extent.k) - goto missing; - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); - if (ca) { - struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); - bkey_for_each_ptr(other_extent_ptrs, ptr) - if (ptr->dev == bp->k.p.inode && - dev_ptr_stale_rcu(ca, ptr)) { - rcu_read_unlock(); - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } - } - rcu_read_unlock(); - - if (bch2_extents_match(orig_k, other_extent)) { - printbuf_reset(&buf); - prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - - if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, - other_extent, bp->k.p.inode); - if (ret) - goto err; - goto out; - } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); - if (ret) - goto err; - goto missing; - } - } - - ret = check_extent_checksum(trans, - other_bp.v->btree_id, other_extent, - bp->v.btree_id, orig_k, - bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto missing; - } - - ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bp->k.p.inode); - if (ret < 0) - goto err; - if (ret) { - ret = 0; - goto out; - } - - printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, other_extent); - bch_err(c, "%s", buf.buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; -missing: - printbuf_reset(&buf); - prt_str(&buf, "missing backpointer\nfor: "); - bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nwant: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_printf(&buf, "\ngot: "); - bch2_bkey_val_to_text(&buf, c, bp_k); - - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); - - goto out; -} - -static int check_extent_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree, unsigned level, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) - continue; - - bool empty; - { - /* scoped_guard() is a loop, so it breaks continue */ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (!ca) - continue; - - if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) - continue; - - u64 b = 
PTR_BUCKET_NR(ca, &p.ptr); - if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) - continue; - - empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); - } - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - - int ret = !empty - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; - } - - return 0; -} - -static int check_btree_root_to_backpointers(struct btree_trans *trans, - struct extents_to_bp_state *s, - enum btree_id btree_id, - int *level) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - struct bkey_s_c k; - int ret; -retry: - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, - 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry; - } - - *level = b->c.level; - - k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static u64 mem_may_pin_bytes(struct bch_fs *c) -{ - struct sysinfo i; - si_meminfo(&i); - - u64 mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); -} - -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) -{ - return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); -} - -static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - u64 btree_leaf_mask, - u64 btree_interior_mask, - struct bbpos start, struct bbpos *end) -{ - struct bch_fs *c = trans->c; - s64 mem_may_pin = mem_may_pin_bytes(c); - int ret = 0; - - bch2_btree_cache_unpin(c); - - btree_interior_mask |= btree_leaf_mask; - - c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; - c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; - c->btree_cache.pinned_nodes_start = start; - c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; - - for (enum btree_id btree = start.btree; - btree < BTREE_ID_NR && !ret; - btree++) { - unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - - if (!(BIT_ULL(btree) & btree_leaf_mask) && - !(BIT_ULL(btree) & btree_interior_mask)) - continue; - - ret = __for_each_btree_node(trans, iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ({ - mem_may_pin -= btree_buf_bytes(b); - if (mem_may_pin <= 0) { - c->btree_cache.pinned_nodes_end = *end = - BBPOS(btree, b->key.k.p); - break; - } - bch2_node_pin(c, b); - 0; - })); - } - - return ret; -} - -static inline int bch2_fs_going_ro(struct bch_fs *c) -{ - return test_bit(BCH_FS_going_ro, &c->flags) - ? -EROFS - : 0; -} - -static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct extents_to_bp_state *s) -{ - struct bch_fs *c = trans->c; - struct progress_indicator_state progress; - int ret = 0; - - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); - - for (enum btree_id btree_id = 0; - btree_id < btree_id_nr_alive(c); - btree_id++) { - int level, depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, s, btree_id, &level)); - if (ret) - return ret; - - while (level >= depth) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); - bch2_fs_going_ro(c) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - })); - if (ret) - return ret; - - --level; - } - } - - return 0; -} - -enum alloc_sector_counter { - ALLOC_dirty, - ALLOC_cached, - ALLOC_stripe, - ALLOC_SECTORS_NR -}; - -static int data_type_to_alloc_counter(enum bch_data_type t) -{ - switch (t) { - case BCH_DATA_btree: - case BCH_DATA_user: - return ALLOC_dirty; - case BCH_DATA_cached: - return ALLOC_cached; - case BCH_DATA_stripe: - case BCH_DATA_parity: - return ALLOC_stripe; - default: - return -1; - } -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); - -static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - bool need_commit = false; - - *had_mismatch = false; - - if (a->data_type == BCH_DATA_sb || - a->data_type == BCH_DATA_journal || - a->data_type == BCH_DATA_parity) - return 0; - - u32 sectors[ALLOC_SECTORS_NR]; - memset(sectors, 0, sizeof(sectors)); - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); - if (!ca) - return 0; - - struct btree_iter iter; - struct bkey_s_c bp_k; - int ret = 0; - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, alloc_k.k->p), - bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && - (bp.v->bucket_gen != a->gen || - bp.v->pad)) { - ret = bch2_backpointer_del(trans, bp_k.k->p); - if (ret) - break; - - need_commit = true; - continue; - } - - if (bp.v->bucket_gen != a->gen) - continue; - - int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); - if (alloc_counter < 0) - continue; - - sectors[alloc_counter] += bp.v->bucket_len; - }; - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (need_commit) { - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] != a->dirty_sectors || - sectors[ALLOC_cached] != a->cached_sectors || - sectors[ALLOC_stripe] != a->stripe_sectors) { - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { - ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); - if (ret) - goto err; - } - - if (sectors[ALLOC_dirty] > a->dirty_sectors || - sectors[ALLOC_cached] > a->cached_sectors || - sectors[ALLOC_stripe] > a->stripe_sectors) { - ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - bch_err_throw(c, transaction_restart_nested); - goto err; - } - - bool empty = (sectors[ALLOC_dirty] + - sectors[ALLOC_stripe] + - sectors[ALLOC_cached]) 
== 0; - - ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, - alloc_k.k->p.offset) ?: - (empty - ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, - alloc_k.k->p.offset) - : 0); - - *had_mismatch = true; - } -err: - bch2_dev_put(ca); - return ret; -} - -static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr_v2: { - bool ret = false; - - guard(rcu)(); - struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; - while (pos.inode <= k.k->p.inode) { - if (pos.inode >= c->sb.nr_devices) - break; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - if (!ca) - goto next; - - struct bpos bucket = bp_pos_to_bucket(ca, pos); - u64 next = ca->mi.nbuckets; - - unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); - if (bitmap) - next = min_t(u64, next, - find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); - - bucket.offset = next; - if (bucket.offset == ca->mi.nbuckets) - goto next; - - ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); - if (ret) - break; -next: - pos = SPOS(pos.inode + 1, 0, 0); - } - - return ret; - } - case KEY_TYPE_btree_ptr: - return true; - default: - return false; - } -} - -static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, - enum btree_id btree, unsigned level) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err; - - if (b) - bch2_node_pin(trans->c, b); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bkey_buf tmp; - bch2_bkey_buf_init(&tmp); - - bch2_btree_cache_unpin(c); - - *end = SPOS_MAX; - - s64 mem_may_pin = mem_may_pin_bytes(c); - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); - })); - if (ret) - return ret; - - struct bpos pinned = SPOS_MAX; - mem_may_pin = mem_may_pin_bytes(c); - bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, - 0, 1, BTREE_ITER_prefetch); - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - if (!backpointer_node_has_missing(c, k)) - continue; - - mem_may_pin -= c->opts.btree_node_size; - if (mem_may_pin <= 0) { - *end = pinned; - break; - } - - bch2_bkey_buf_reassemble(&tmp, c, k); - struct btree_path *path = btree_iter_path(trans, &iter); - - BUG_ON(path->level != 1); - - int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); - - if (!ret2) - pinned = tmp.k->k.p; - - ret; - })); - if (ret) - return ret; - - return ret; -} - -int bch2_check_extents_to_backpointers(struct bch_fs *c) -{ - int ret = 0; - - struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bp_start = POS_MIN }; - - bch2_bkey_buf_init(&s.last_flushed); - bkey_init(&s.last_flushed.k->k); - - ret = 
for_each_btree_key(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bool had_mismatch; - bch2_fs_going_ro(c) ?: - check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); - })); - if (ret) - goto err; - - u64 nr_buckets = 0, nr_mismatches = 0; - for_each_member_device(c, ca) { - nr_buckets += ca->mi.nbuckets; - nr_mismatches += ca->bucket_backpointer_mismatch.nr; - } - - if (!nr_mismatches) - goto err; - - bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches, nr_buckets); - - while (1) { - ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); - if (ret) - break; - - if ( bpos_eq(s.bp_start, POS_MIN) && - !bpos_eq(s.bp_end, SPOS_MAX)) - bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (!bpos_eq(s.bp_start, POS_MIN) || - !bpos_eq(s.bp_end, SPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bp_start); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bp_end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bp_end, SPOS_MAX)) - break; - - s.bp_start = bpos_successor(s.bp_end); - } - - for_each_member_device(c, ca) { - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, - struct bpos bucket, - bool *had_mismatch, - struct bkey_buf *last_flushed) -{ - struct btree_iter alloc_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, - struct bch_dev *ca, u64 bucket, - bool copygc, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - bool had_mismatch; - int ret = lockrestart_do(trans, - check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), - &had_mismatch, last_flushed)); - if (ret || !had_mismatch) - return ret; - - u64 nr = ca->bucket_backpointer_mismatch.nr; - u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", - bucket, nr, ca->mi.nbuckets); - - bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_extents_to_backpointers, - nr < allowed ? 
RUN_RECOVERY_PASS_ratelimit : 0); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return 0; -} - -/* backpointers -> extents */ - -static int check_one_backpointer(struct btree_trans *trans, - struct bbpos start, - struct bbpos end, - struct bkey_s_c bp_k, - struct bkey_buf *last_flushed) -{ - if (bp_k.k->type != KEY_TYPE_backpointer) - return 0; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bbpos pos = bp_to_bbpos(*bp.v); - - if (bbpos_cmp(pos, start) < 0 || - bbpos_cmp(pos, end) > 0) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); - int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_bucket_backpointers_to_extents(struct btree_trans *trans, - struct bch_dev *ca, struct bpos bucket) -{ - u32 restart_count = trans->restart_count; - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), - bucket_pos_to_bp_end(ca, bucket), - 0, k, - check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) - ); - - bch2_bkey_buf_exit(&last_flushed, trans->c); - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, - struct bbpos start, - struct bbpos end) -{ - struct bch_fs *c = trans->c; - struct bkey_buf last_flushed; - struct progress_indicator_state progress; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - - int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, ({ - bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, k, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); - return ret; -} - -int bch2_check_backpointers_to_extents(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; - int ret; - - while (1) { - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_extents)| - BIT_ULL(BTREE_ID_reflink), - ~0, - start, &end); - if (ret) - break; - - if (!bbpos_cmp(start, BBPOS_MIN) && - bbpos_cmp(end, BBPOS_MAX)) - bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); - - if (bbpos_cmp(start, BBPOS_MIN) || - bbpos_cmp(end, BBPOS_MAX)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "check_backpointers_to_extents(): "); - bch2_bbpos_to_text(&buf, start); - prt_str(&buf, "-"); - bch2_bbpos_to_text(&buf, end); - - bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_check_backpointers_to_extents_pass(trans, start, end); - if (ret || !bbpos_cmp(end, BBPOS_MAX)) - break; - - start = bbpos_successor(end); - } - bch2_trans_put(trans); - - bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) { - b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), GFP_KERNEL); - if (!b->buckets) - 
return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - } - - b->nr += !__test_and_set_bit(bit, b->buckets); - } - - return 0; -} - -int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, - u64 old_size, u64 new_size) -{ - scoped_guard(mutex, &b->lock) { - if (!b->buckets) - return 0; - - unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), - sizeof(unsigned long), GFP_KERNEL); - if (!n) - return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); - - memcpy(n, b->buckets, - BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); - kvfree(b->buckets); - b->buckets = n; - } - - return 0; -} - -void bch2_bucket_bitmap_free(struct bucket_bitmap *b) -{ - mutex_lock(&b->lock); - kvfree(b->buckets); - b->buckets = NULL; - b->nr = 0; - mutex_unlock(&b->lock); -} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h deleted file mode 100644 index 7e71afee1ac053..00000000000000 --- a/fs/bcachefs/backpointers.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_H -#define _BCACHEFS_BACKPOINTERS_H - -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "buckets.h" -#include "error.h" -#include "super.h" - -static inline u64 swab40(u64 x) -{ - return (((x & 0x00000000ffULL) << 32)| - ((x & 0x000000ff00ULL) << 16)| - ((x & 0x0000ff0000ULL) >> 0)| - ((x & 0x00ff000000ULL) >> 16)| - ((x & 0xff00000000ULL) >> 32)); -} - -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, - struct bkey_validate_context); -void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_backpointer_swab(struct bkey_s); - -#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_to_text, \ - .swab = bch2_backpointer_swab, \ - .min_val_size = 32, \ -}) - -#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 - -/* - * Convert from pos in backpointer btree to pos of corresponding bucket in alloc - * btree: - */ -static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); -} - -static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, - u32 *bucket_offset) -{ - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); -} - -static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); - if (ca) - *bucket = bp_pos_to_bucket(ca, bp_pos); - return ca != NULL; -} - -static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - return POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -} - -/* - * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: - */ -static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, - struct bpos bucket, - u64 bucket_offset) -{ - struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); - return ret; -} - -static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev 
*ca, struct bpos bucket) -{ - return bucket_pos_to_bp(ca, bucket, 0); -} - -static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) -{ - return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); -} - -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, - struct bkey_s_c, - struct bkey_i_backpointer *, - bool); - -static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bkey_s_c orig_k, - struct bkey_i_backpointer *bp, - bool insert) -{ - if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); - - if (!insert) { - bp->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp->k, 0); - } - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); -} - -static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - if (p.has_ec) - return BCH_DATA_stripe; - if (p.ptr.cached) - return BCH_DATA_cached; - else - return BCH_DATA_user; - case KEY_TYPE_stripe: { - const struct bch_extent_ptr *ptr = &entry->ptr; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp) -{ - bkey_backpointer_init(&bp->k_i); - bp->k.p.inode = p.ptr.dev; - - if (k.k->type != KEY_TYPE_stripe) - bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; - else { - /* - * Put stripe backpointers where they won't collide with the - * extent backpointers within the stripe: - */ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; - } - - bp->v = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_gen = p.ptr.gen, - .bucket_len = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p), - .pos = k.k->p, - }; -} - -struct bkey_buf; -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned, struct bkey_buf *); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, struct bkey_buf *); - -int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, - bool, struct bkey_buf *); - -int bch2_check_btree_backpointers(struct bch_fs *); -int bch2_check_extents_to_backpointers(struct bch_fs *); -int bch2_check_backpointers_to_extents(struct bch_fs *); - -static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) -{ - unsigned long *bitmap = READ_ONCE(b->buckets); - return bitmap && test_bit(i, bitmap); -} - -int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); -void bch2_bucket_bitmap_free(struct bucket_bitmap *); - -#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h deleted file mode 100644 index 63abe17f35eaa9..00000000000000 --- a/fs/bcachefs/bbpos.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_H -#define _BCACHEFS_BBPOS_H - -#include "bbpos_types.h" -#include "bkey_methods.h" -#include "btree_cache.h" - -static inline int bbpos_cmp(struct bbpos l, struct bbpos r) -{ - return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); -} - -static inline struct bbpos bbpos_successor(struct bbpos pos) -{ - if (bpos_cmp(pos.pos, SPOS_MAX)) { - pos.pos = bpos_successor(pos.pos); - return pos; - } - - if (pos.btree != BTREE_ID_NR) { - pos.btree++; - pos.pos = POS_MIN; - return pos; - } - - BUG(); -} - -static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) -{ - bch2_btree_id_to_text(out, pos.btree); - prt_char(out, ':'); - bch2_bpos_to_text(out, pos.pos); -} - -#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h deleted file mode 100644 index f63893344f80aa..00000000000000 --- a/fs/bcachefs/bbpos_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BBPOS_TYPES_H -#define _BCACHEFS_BBPOS_TYPES_H - -struct bbpos { - enum btree_id btree; - struct bpos pos; -}; - -static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -{ - return (struct bbpos) { btree, pos }; -} - -#define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) - -#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h deleted file mode 100644 index ddfacad0f70cf4..00000000000000 --- a/fs/bcachefs/bcachefs.h +++ /dev/null @@ -1,1295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_H -#define _BCACHEFS_H - -/* - * SOME HIGH LEVEL CODE DOCUMENTATION: - * - * Bcache mostly works with cache sets, cache devices, and backing devices. - * - * Support for multiple cache devices hasn't quite been finished off yet, but - * it's about 95% plumbed through. A cache set and its cache devices is sort of - * like a md raid array and its component devices. Most of the code doesn't care - * about individual cache devices, the main abstraction is the cache set. - * - * Multiple cache devices is intended to give us the ability to mirror dirty - * cached data and metadata, without mirroring clean cached data. - * - * Backing devices are different, in that they have a lifetime independent of a - * cache set. 
When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into. Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree: insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
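 *
 * (A minimal sketch of the invalidation rule described above - illustrative
 * only, not the exact helper this code uses, since the real checks must also
 * cope with generation wraparound:
 *
 *	static inline bool ptr_stale(u8 bucket_gen, u8 ptr_gen)
 *	{
 *		return bucket_gen != ptr_gen;
 *	}
 *
 * Bumping a bucket's gen makes every existing pointer into it stale at once,
 * which is what lets GC hand such buckets back to the allocator.)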
- * - * THE JOURNAL: - * - * Bcache's journal is not necessary for consistency; we always strictly - * order metadata writes so that the btree and everything else is consistent on - * disk in the event of an unclean shutdown, and in fact bcache had writeback - * caching (with recovery from unclean shutdown) before journalling was - * implemented. - * - * Rather, the journal is purely a performance optimization; we can't complete a - * write until we've updated the index on disk, otherwise the cache would be - * inconsistent in the event of an unclean shutdown. This means that without the - * journal, on random write workloads we constantly have to update all the leaf - * nodes in the btree, and those writes will be mostly empty (appending at most - * a few keys each) - highly inefficient in terms of amount of metadata writes, - * and it puts more strain on the various btree resorting/compacting code. - * - * The journal is just a log of keys we've inserted; on startup we just reinsert - * all the keys in the open journal entries. That means that when we're updating - * a node in the btree, we can wait until a 4k block of keys fills up before - * writing them out. - * - * For simplicity, we only journal updates to leaf nodes; updates to parent - * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth - * the complexity to deal with journalling them (in particular, journal replay) - * - updates to non leaf nodes just happen synchronously (see btree_split()). - */ - -#undef pr_fmt -#ifdef __KERNEL__ -#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -#else -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define ENUMERATED_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif - -#define race_fault(...) dynamic_fault("bcachefs:race") - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bcachefs_format.h" -#include "btree_journal_iter_types.h" -#include "disk_accounting_types.h" -#include "errcode.h" -#include "fast_list.h" -#include "fifo.h" -#include "nocow_locking_types.h" -#include "opts.h" -#include "sb-errors_types.h" -#include "seqmutex.h" -#include "snapshot_types.h" -#include "time_stats.h" -#include "util.h" - -#include "alloc_types.h" -#include "async_objs_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "enumerated_ref_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "recovery_passes_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - -#include "trace.h" - -#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) - -#define trace_and_count(_c, _name, ...) 
\ -do { \ - count_event(_c, _name); \ - trace_##_name(__VA_ARGS__); \ -} while (0) - -#define bch2_fs_init_fault(name) \ - dynamic_fault("bcachefs:bch_fs_init:" name) -#define bch2_meta_read_fault(name) \ - dynamic_fault("bcachefs:meta:read:" name) -#define bch2_meta_write_fault(name) \ - dynamic_fault("bcachefs:meta:write:" name) - -#ifdef __KERNEL__ -#define BCACHEFS_LOG_PREFIX -#endif - -#ifdef BCACHEFS_LOG_PREFIX - -#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) -#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) - -#else - -#define bch2_log_msg(_c, fmt) fmt -#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) -#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) -#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ - "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) - -#endif - -#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") - -void bch2_print_str(struct bch_fs *, const char *, const char *); - -__printf(2, 3) -void bch2_print_opts(struct bch_opts *, const char *, ...); - -__printf(2, 3) -void __bch2_print(struct bch_fs *c, const char *fmt, ...); - -#define maybe_dev_to_fs(_c) _Generic((_c), \ - struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ - struct bch_fs *: (_c)) - -#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) - -#define bch2_print_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print(_c, __VA_ARGS__); \ -} while (0) - -#define bch2_print_str_ratelimited(_c, ...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - \ - if (__ratelimit(&_rs)) \ - bch2_print_str(_c, __VA_ARGS__); \ -} while (0) - -#define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_info_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_notice(c, fmt, ...) \ - bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn(c, fmt, ...) \ - bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) - -#define bch_err(c, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev(ca, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset(ca, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum(c, _inum, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -#define bch_err_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_err_dev_ratelimited(ca, fmt, ...) 
\ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) - -static inline bool should_print_err(int err) -{ - return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); -} - -#define bch_err_fn(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_fn_ratelimited(_c, _ret) \ -do { \ - if (should_print_err(_ret)) \ - bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ -} while (0) - -#define bch_err_msg(_c, _ret, _msg, ...) \ -do { \ - if (should_print_err(_ret)) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, \ - ##__VA_ARGS__, bch2_err_str(_ret)); \ -} while (0) - -#define bch_verbose(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define bch_verbose_ratelimited(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define pr_verbose_init(opts, fmt, ...) \ -do { \ - if (opt_get(opts, verbose)) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) - -static inline int __bch2_err_trace(struct bch_fs *c, int err) -{ - trace_error_throw(c, err, _THIS_IP_); - return err; -} - -#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) - -/* Parameters that are useful for debugging, but should always be compiled in: */ -#define BCH_DEBUG_PARAMS_ALWAYS() \ - BCH_DEBUG_PARAM(key_merging_disabled, \ - "Disables merging of extents") \ - BCH_DEBUG_PARAM(btree_node_merging_disabled, \ - "Disables merging of btree nodes") \ - BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ - "Causes mark and sweep to compact and rewrite every " \ - "btree node it traverses") \ - BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ - "Disables rewriting of btree nodes during mark and sweep")\ - BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache")\ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ - BCH_DEBUG_PARAM(verify_all_btree_replicas, \ - "When reading btree nodes, read all replicas and " \ - "compare them") \ - BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ - "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") \ - BCH_DEBUG_PARAM(debug_check_btree_locking, \ - "Enable additional asserts for btree locking") \ - BCH_DEBUG_PARAM(debug_check_iterators, \ - "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bset_lookups, \ - "Enables extra verification for bset lookups") \ - BCH_DEBUG_PARAM(debug_check_btree_accounting, \ - "Verify btree accounting for keys within a node") \ - BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ - "Enables extra verification for bkey unpack") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(journal_seq_verify, \ - "Store the journal sequence number in the version " \ - 
"number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(inject_invalid_keys, \ - "Store the journal sequence number in the version " \ - "number of every btree key, and verify that btree " \ - "update ordering is preserved during recovery") \ - BCH_DEBUG_PARAM(test_alloc_startup, \ - "Force allocator startup to use the slowpath where it" \ - "can't find enough free buckets without invalidating" \ - "cached data") \ - BCH_DEBUG_PARAM(force_reconstruct_read, \ - "Force reads to use the reconstruct path, when reading" \ - "from erasure coded extents") \ - BCH_DEBUG_PARAM(test_restart_gc, \ - "Test restarting mark and sweep gc when bucket gens change") - -#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -#else -#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -#endif - -#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -#define BCH_TIME_STATS() \ - x(btree_node_mem_alloc) \ - x(btree_node_split) \ - x(btree_node_compact) \ - x(btree_node_merge) \ - x(btree_node_sort) \ - x(btree_node_get) \ - x(btree_node_read) \ - x(btree_node_read_done) \ - x(btree_node_write) \ - x(btree_interior_update_foreground) \ - x(btree_interior_update_total) \ - x(btree_gc) \ - x(data_write) \ - x(data_write_to_submit) \ - x(data_write_to_queue) \ - x(data_write_to_btree_update) \ - x(data_write_btree_update) \ - x(data_read) \ - x(data_promote) \ - x(journal_flush_write) \ - x(journal_noflush_write) \ - x(journal_flush_seq) \ - x(blocked_journal_low_on_space) \ - x(blocked_journal_low_on_pin) \ - x(blocked_journal_max_in_flight) \ - x(blocked_journal_max_open) \ - x(blocked_key_cache_flush) \ - x(blocked_allocate) \ - x(blocked_allocate_open_bucket) \ - x(blocked_write_buffer_full) \ - x(nocow_lock_contended) - -enum bch_time_stats { -#define x(name) BCH_TIME_##name, - BCH_TIME_STATS() -#undef x - BCH_TIME_STAT_NR -}; - -/* Number of nodes btree coalesce will try to coalesce at once */ -#define GC_MERGE_NODES 4U - -/* Maximum number of nodes we might need to allocate atomically: */ -#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) - -/* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) - -#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) - -struct btree; - -struct io_count { - u64 sectors[2][BCH_DATA_NR]; -}; - -struct discard_in_flight { - bool in_progress:1; - u64 bucket:63; -}; - -#define BCH_DEV_READ_REFS() \ - x(bch2_online_devs) \ - x(trans_mark_dev_sbs) \ - x(read_fua_test) \ - x(sb_field_resize) \ - x(write_super) \ - x(journal_read) \ - x(fs_journal_alloc) \ - x(fs_resize_on_mount) \ - x(btree_node_read) \ - x(btree_node_read_all_replicas) \ - x(btree_node_scrub) \ - x(btree_node_write) \ - x(btree_node_scan) \ - x(btree_verify_replicas) \ - x(btree_node_ondisk_to_text) \ - x(io_read) \ - x(check_extent_checksums) \ - x(ec_block) - -enum bch_dev_read_ref { -#define x(n) BCH_DEV_READ_REF_##n, - BCH_DEV_READ_REFS() -#undef x - BCH_DEV_READ_REF_NR, -}; - -#define BCH_DEV_WRITE_REFS() \ - x(journal_write) \ - x(journal_do_discards) \ - x(dev_do_discards) \ - x(discard_one_bucket_fast) \ - x(do_invalidates) \ - x(nocow_flush) \ - x(io_write) \ - x(ec_block) \ - x(ec_bucket_zero) - -enum bch_dev_write_ref { -#define x(n) 
BCH_DEV_WRITE_REF_##n, - BCH_DEV_WRITE_REFS() -#undef x - BCH_DEV_WRITE_REF_NR, -}; - -struct bucket_bitmap { - unsigned long *buckets; - u64 nr; - struct mutex lock; -}; - -struct bch_dev { - struct kobject kobj; -#ifdef CONFIG_BCACHEFS_DEBUG - atomic_long_t ref; - bool dying; - unsigned long last_put; -#else - struct percpu_ref ref; -#endif - struct completion ref_completion; - struct enumerated_ref io_ref[2]; - - struct bch_fs *fs; - - u8 dev_idx; - /* - * Cached version of this device's member info from superblock - * Committed by bch2_write_super() -> bch_fs_mi_update() - */ - struct bch_member_cpu mi; - atomic64_t errors[BCH_MEMBER_ERROR_NR]; - unsigned long write_errors_start; - - __uuid_t uuid; - char name[BDEVNAME_SIZE]; - - struct bch_sb_handle disk_sb; - struct bch_sb *sb_read_scratch; - int sb_write_error; - dev_t dev; - atomic_t flush_seq; - - struct bch_devs_mask self; - - /* - * Buckets: - * Per-bucket arrays are protected by either rcu_read_lock or - * state_lock, for device resize. - */ - GENRADIX(struct bucket) buckets_gc; - struct bucket_gens __rcu *bucket_gens; - u8 *oldest_gen; - unsigned long *buckets_nouse; - - struct bucket_bitmap bucket_backpointer_mismatch; - struct bucket_bitmap bucket_backpointer_empty; - - struct bch_dev_usage_full __percpu - *usage; - - /* Allocator: */ - u64 alloc_cursor[3]; - - unsigned nr_open_buckets; - unsigned nr_partial_buckets; - unsigned nr_btree_reserve; - - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct discard_in_flight) discard_buckets_in_flight; - struct work_struct discard_fast_work; - - atomic64_t rebalance_work; - - struct journal_device journal; - u64 prev_journal_sector; - - struct work_struct io_error_work; - - /* The rest of this all shows up in sysfs */ - atomic64_t cur_latency[2]; - struct bch2_time_stats_quantiles io_latency[2]; - -#define CONGESTED_MAX 1024 - atomic_t congested; - u64 congested_last; - - struct io_count __percpu *io_done; -}; - -/* - * initial_gc_unfixed - * error - * topology error - */ - -#define BCH_FS_FLAGS() \ - x(new_fs) \ - x(started) \ - x(clean_recovery) \ - x(btree_running) \ - x(accounting_replay_done) \ - x(may_go_rw) \ - x(rw) \ - x(rw_init_done) \ - x(was_rw) \ - x(stopping) \ - x(emergency_ro) \ - x(going_ro) \ - x(write_disable_complete) \ - x(clean_shutdown) \ - x(in_recovery) \ - x(in_fsck) \ - x(initial_gc_unfixed) \ - x(need_delete_dead_snapshots) \ - x(error) \ - x(topology_error) \ - x(errors_fixed) \ - x(errors_not_fixed) \ - x(no_invalid_checks) \ - x(discard_mount_opt_set) \ - -enum bch_fs_flags { -#define x(n) BCH_FS_##n, - BCH_FS_FLAGS() -#undef x -}; - -struct btree_debug { - unsigned id; -}; - -#define BCH_TRANSACTIONS_NR 128 - -struct btree_transaction_stats { - struct bch2_time_stats duration; - struct bch2_time_stats lock_hold_times; - struct mutex lock; - unsigned nr_max_paths; - unsigned max_mem; -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_trans_kmalloc_trace trans_kmalloc_trace; -#endif - char *max_paths_text; -}; - -struct bch_fs_pcpu { - u64 sectors_available; -}; - -struct journal_seq_blacklist_table { - size_t nr; - struct journal_seq_blacklist_table_entry { - u64 start; - u64 end; - bool dirty; - } entries[]; -}; - -struct btree_trans_buf { - struct btree_trans *trans; -}; - -#define BCH_WRITE_REFS() \ - x(journal) \ - x(trans) \ - x(write) \ - x(promote) \ - x(node_rewrite) \ - x(stripe_create) \ - x(stripe_delete) \ - x(reflink) \ - x(fallocate) \ - x(fsync) \ - 
x(dio_write) \
- x(discard) \
- x(discard_fast) \
- x(check_discard_freespace_key) \
- x(invalidate) \
- x(delete_dead_snapshots) \
- x(gc_gens) \
- x(snapshot_delete_pagecache) \
- x(sysfs) \
- x(btree_write_buffer) \
- x(btree_node_scrub) \
- x(async_recovery_passes) \
- x(ioctl_data)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
- BCH_WRITE_REFS()
-#undef x
- BCH_WRITE_REF_NR,
-};
-
-#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
-
-struct bch_fs {
- struct closure cl;
-
- struct list_head list;
- struct kobject kobj;
- struct kobject counters_kobj;
- struct kobject internal;
- struct kobject opts_dir;
- struct kobject time_stats;
- unsigned long flags;
-
- int minor;
- struct device *chardev;
- struct super_block *vfs_sb;
- dev_t dev;
- char name[40];
- struct stdio_redirect *stdio;
- struct task_struct *stdio_filter;
-
- /* ro/rw, add/remove/resize devices: */
- struct rw_semaphore state_lock;
-
- /* Counts outstanding writes, for clean transition to read-only */
- struct enumerated_ref writes;
- /*
- * Certain operations are only allowed in single threaded mode, during
- * recovery, and we want to assert that this is the case:
- */
- struct task_struct *recovery_task;
-
- /*
- * Analogous to c->writes, for asynchronous ops that don't necessarily
- * need fs to be read-write
- */
- refcount_t ro_ref;
- wait_queue_head_t ro_ref_wait;
-
- struct work_struct read_only_work;
-
- struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
-
- struct bch_accounting_mem accounting;
-
- struct bch_replicas_cpu replicas;
- struct bch_replicas_cpu replicas_gc;
- struct mutex replicas_gc_lock;
-
- struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res clock_journal_res;
-
- struct bch_disk_groups_cpu __rcu *disk_groups;
-
- struct bch_opts opts;
-
- /* Updated by bch2_sb_update():*/
- struct {
- __uuid_t uuid;
- __uuid_t user_uuid;
-
- u16 version;
- u16 version_incompat;
- u16 version_incompat_allowed;
- u16 version_min;
- u16 version_upgrade_complete;
-
- u8 nr_devices;
- u8 clean;
- bool multi_device; /* true if we've ever had more than one device */
-
- u8 encryption_type;
-
- u64 time_base_lo;
- u32 time_base_hi;
- unsigned time_units_per_sec;
- unsigned nsec_per_time_unit;
- u64 features;
- u64 compat;
- u64 recovery_passes_required;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
- u64 btrees_lost_data;
- } sb;
- DARRAY(enum bcachefs_metadata_version)
- incompat_versions_requested;
-
- struct unicode_map *cf_encoding;
-
- struct bch_sb_handle disk_sb;
-
- unsigned short block_bits; /* ilog2(block_size) */
-
- u16 btree_foreground_merge_threshold;
-
- struct closure sb_write;
- struct mutex sb_lock;
-
- /* snapshot.c: */
- struct snapshot_table __rcu *snapshots;
- struct mutex snapshot_table_lock;
- struct rw_semaphore snapshot_create_lock;
-
- struct snapshot_delete snapshot_delete;
- struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- snapshot_id_list snapshots_unlinked;
- struct mutex snapshots_unlinked_lock;
-
- /* BTREE CACHE */
- struct bio_set btree_bio;
- struct workqueue_struct *btree_read_complete_wq;
- struct workqueue_struct *btree_write_submit_wq;
-
- struct btree_root btree_roots_known[BTREE_ID_NR];
- DARRAY(struct btree_root) btree_roots_extra;
- struct mutex btree_root_lock;
-
- struct btree_cache btree_cache;
-
- /*
- * Cache of allocated btree nodes - if we allocate a btree node and
- * don't use it, if we free it that space can't be reused until going
- * _all_ the way through the allocator (which exposes us to a livelock
- * when allocating btree reserves fails halfway through) - instead, we
- * can stick them here:
- */
- struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
- unsigned btree_reserve_cache_nr;
- struct mutex btree_reserve_cache_lock;
-
- mempool_t btree_interior_update_pool;
- struct list_head btree_interior_update_list;
- struct list_head btree_interior_updates_unwritten;
- struct mutex btree_interior_update_lock;
- struct closure_waitlist btree_interior_update_wait;
-
- struct workqueue_struct *btree_interior_update_worker;
- struct work_struct btree_interior_update_work;
-
- struct workqueue_struct *btree_node_rewrite_worker;
- struct list_head btree_node_rewrites;
- struct list_head btree_node_rewrites_pending;
- spinlock_t btree_node_rewrites_lock;
- struct closure_waitlist btree_node_rewrites_wait;
-
- /* btree_io.c: */
- spinlock_t btree_write_error_lock;
- struct btree_write_stats {
- atomic64_t nr;
- atomic64_t bytes;
- } btree_write_stats[BTREE_WRITE_TYPE_NR];
-
- /* btree_iter.c: */
- struct seqmutex btree_trans_lock;
- struct list_head btree_trans_list;
- mempool_t btree_trans_pool;
- mempool_t btree_trans_mem_pool;
- struct btree_trans_buf __percpu *btree_trans_bufs;
-
- struct srcu_struct btree_trans_barrier;
- bool btree_trans_barrier_initialized;
-
- struct btree_key_cache btree_key_cache;
- unsigned btree_key_cache_btrees;
-
- struct btree_write_buffer btree_write_buffer;
-
- struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_write_complete_wq;
- /* copygc needs its own workqueue for index updates */
- struct workqueue_struct *copygc_wq;
- /*
- * Use a dedicated wq for write ref holder tasks. Required to avoid
- * dependency problems with other wq tasks that can block on ref
- * draining, such as read-only transition.
- */ - struct workqueue_struct *write_ref_wq; - - /* ALLOCATION */ - struct bch_devs_mask online_devs; - struct bch_devs_mask rw_devs[BCH_DATA_NR]; - unsigned long rw_devs_change_count; - - u64 capacity; /* sectors */ - u64 reserved; /* sectors */ - - /* - * When capacity _decreases_ (due to a disk being removed), we - * increment capacity_gen - this invalidates outstanding reservations - * and forces them to be revalidated - */ - u32 capacity_gen; - unsigned bucket_size_max; - - atomic64_t sectors_available; - struct mutex sectors_available_lock; - - struct bch_fs_pcpu __percpu *pcpu; - - struct percpu_rw_semaphore mark_lock; - - seqcount_t usage_lock; - struct bch_fs_usage_base __percpu *usage; - u64 __percpu *online_reserved; - - unsigned long allocator_last_stuck; - - struct io_clock io_clock[2]; - - /* JOURNAL SEQ BLACKLIST */ - struct journal_seq_blacklist_table * - journal_seq_blacklist_table; - - /* ALLOCATOR */ - spinlock_t freelist_lock; - struct closure_waitlist freelist_wait; - - open_bucket_idx_t open_buckets_freelist; - open_bucket_idx_t open_buckets_nr_free; - struct closure_waitlist open_buckets_wait; - struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; - - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - - struct write_point btree_write_point; - struct write_point rebalance_write_point; - - struct write_point write_points[WRITE_POINT_MAX]; - struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; - struct mutex write_points_hash_lock; - unsigned write_points_nr; - - struct buckets_waiting_for_journal buckets_waiting_for_journal; - - /* GARBAGE COLLECTION */ - struct work_struct gc_gens_work; - unsigned long gc_count; - - enum btree_id gc_gens_btree; - struct bpos gc_gens_pos; - - /* - * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] - * has been marked by GC. - * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) - * - * Protected by gc_pos_lock. Only written to by GC thread, so GC thread - * can read without a lock. - */ - seqcount_t gc_pos_lock; - struct gc_pos gc_pos; - - /* - * The allocation code needs gc_mark in struct bucket to be correct, but - * it's not while a gc is in progress. 
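As a sketch of how the seqcount documented above would be used, a thread other than GC can snapshot gc_pos with the standard retry loop; this illustrates the locking rule stated in the comment and is not necessarily the exact helper bcachefs provides:

static struct gc_pos example_read_gc_pos(struct bch_fs *c)
{
        struct gc_pos pos;
        unsigned seq;

        do {
                seq = read_seqcount_begin(&c->gc_pos_lock);
                pos = c->gc_pos;        /* the GC thread itself may read without this */
        } while (read_seqcount_retry(&c->gc_pos_lock, seq));

        return pos;
}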
- */ - struct rw_semaphore gc_lock; - struct mutex gc_gens_lock; - - /* IO PATH */ - struct semaphore io_in_flight; - struct bio_set bio_read; - struct bio_set bio_read_split; - struct bio_set bio_write; - struct bio_set replica_set; - struct mutex bio_bounce_pages_lock; - mempool_t bio_bounce_pages; - struct bucket_nocow_lock_table - nocow_locks; - struct rhashtable promote_table; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; -#endif - - mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - size_t zstd_workspace_size; - - struct bch_key chacha20_key; - bool chacha20_key_set; - - atomic64_t key_version; - - mempool_t large_bkey_pool; - - /* MOVE.C */ - struct list_head moving_context_list; - struct mutex moving_context_lock; - - /* REBALANCE */ - struct bch_fs_rebalance rebalance; - - /* COPYGC */ - struct task_struct *copygc_thread; - struct write_point copygc_write_point; - s64 copygc_wait_at; - s64 copygc_wait; - bool copygc_running; - wait_queue_head_t copygc_running_wq; - - /* STRIPES: */ - GENRADIX(struct gc_stripe) gc_stripes; - - struct hlist_head ec_stripes_new[32]; - spinlock_t ec_stripes_new_lock; - - /* ERASURE CODING */ - struct list_head ec_stripe_head_list; - struct mutex ec_stripe_head_lock; - - struct list_head ec_stripe_new_list; - struct mutex ec_stripe_new_lock; - wait_queue_head_t ec_stripe_new_wait; - - struct work_struct ec_stripe_create_work; - u64 ec_stripe_hint; - - struct work_struct ec_stripe_delete_work; - - struct bio_set ec_bioset; - - /* REFLINK */ - reflink_gc_table reflink_gc_table; - size_t reflink_gc_nr; - - /* fs.c */ - struct list_head vfs_inodes_list; - struct mutex vfs_inodes_lock; - struct rhashtable vfs_inodes_table; - struct rhltable vfs_inodes_by_inum_table; - - /* VFS IO PATH - fs-io.c */ - struct bio_set writepage_bioset; - struct bio_set dio_write_bioset; - struct bio_set dio_read_bioset; - struct bio_set nocow_flush_bioset; - - /* QUOTAS */ - struct bch_memquota_type quotas[QTYP_NR]; - - /* RECOVERY */ - u64 journal_replay_seq_start; - u64 journal_replay_seq_end; - struct bch_fs_recovery recovery; - - /* DEBUG JUNK */ - struct dentry *fs_debug_dir; - struct dentry *btree_debug_dir; - struct dentry *async_obj_dir; - struct btree_debug btree_debug[BTREE_ID_NR]; - struct btree *verify_data; - struct btree_node *verify_ondisk; - struct mutex verify_lock; - - /* - * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them - */ - mempool_t fill_iter; - - mempool_t btree_bounce_pool; - - struct journal journal; - GENRADIX(struct journal_replay *) journal_entries; - u64 journal_entries_base_seq; - struct journal_keys journal_keys; - struct list_head journal_iters; - - struct find_btree_nodes found_btree_nodes; - - u64 last_bucket_seq_cleanup; - - u64 counters_on_mount[BCH_COUNTER_NR]; - u64 __percpu *counters; - - struct bch2_time_stats times[BCH_TIME_STAT_NR]; - - struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; - - /* ERRORS */ - struct list_head fsck_error_msgs; - struct mutex fsck_error_msgs_lock; - bool fsck_alloc_msgs_err; - - bch_sb_errors_cpu fsck_error_counts; - struct mutex fsck_error_counts_lock; -}; - -extern struct wait_queue_head bch2_read_only_wait; - -static inline bool bch2_ro_ref_tryget(struct bch_fs *c) -{ - if (test_bit(BCH_FS_stopping, &c->flags)) - return false; - - return refcount_inc_not_zero(&c->ro_ref); -} - -static inline void 
bch2_ro_ref_put(struct bch_fs *c) -{ - if (refcount_dec_and_test(&c->ro_ref)) - wake_up(&c->ro_ref_wait); -} - -static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -{ -#ifndef NO_BCACHEFS_FS - if (c->vfs_sb) - c->vfs_sb->s_bdi->ra_pages = ra_pages; -#endif -} - -static inline unsigned bucket_bytes(const struct bch_dev *ca) -{ - return ca->mi.bucket_size << 9; -} - -static inline unsigned block_bytes(const struct bch_fs *c) -{ - return c->opts.block_size; -} - -static inline unsigned block_sectors(const struct bch_fs *c) -{ - return c->opts.block_size >> 9; -} - -static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) -{ - return c->btree_key_cache_btrees & (1U << btree); -} - -static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) -{ - struct timespec64 t; - s64 sec; - s32 rem; - - time += c->sb.time_base_lo; - - sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); - - set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit); - - return t; -} - -static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) -{ - return (ts.tv_sec * c->sb.time_units_per_sec + - (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; -} - -static inline s64 bch2_current_time(const struct bch_fs *c) -{ - struct timespec64 now; - - ktime_get_coarse_real_ts64(&now); - return timespec_to_bch2_time(c, now); -} - -static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) -{ - return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); -} - -static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - return stdio; -} - -static inline unsigned metadata_replicas_required(struct bch_fs *c) -{ - return min(c->opts.metadata_replicas, - c->opts.metadata_replicas_required); -} - -static inline unsigned data_replicas_required(struct bch_fs *c) -{ - return min(c->opts.data_replicas, - c->opts.data_replicas_required); -} - -#define BKEY_PADDED_ONSTACK(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } - -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? c->opts.discard - : ca->mi.discard; -} - -static inline bool bch2_fs_casefold_enabled(struct bch_fs *c) -{ -#ifdef CONFIG_UNICODE - return !c->opts.casefold_disabled; -#else - return false; -#endif -} - -#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h deleted file mode 100644 index b4a04df5ea9555..00000000000000 --- a/fs/bcachefs/bcachefs_format.h +++ /dev/null @@ -1,1545 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FORMAT_H -#define _BCACHEFS_FORMAT_H - -/* - * bcachefs on disk data structures - * - * OVERVIEW: - * - * There are three main types of on disk data structures in bcachefs (this is - * reduced from 5 in bcache) - * - * - superblock - * - journal - * - btree - * - * The btree is the primary structure; most metadata exists as keys in the - * various btrees. 
There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem, prior to reading the journal/btree
- * roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
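As a rough sketch of the packing just described, assume each field of a packed bkey stores (value - field_offset) in bits_per_field bits, so keys clustered in a narrow range need very few bits; the names below are invented for the example:

#include <stdbool.h>
#include <stdint.h>

struct example_field_format {
        uint8_t  bits;          /* bits_per_field[i] */
        uint64_t field_offset;  /* field_offset[i]   */
};

/* Returns false if the value doesn't fit; the key then stays unpacked. */
static bool example_pack_field(const struct example_field_format *f,
                               uint64_t v, uint64_t *packed)
{
        if (v < f->field_offset)
                return false;
        v -= f->field_offset;
        if (f->bits < 64 && v >> f->bits)
                return false;
        *packed = v;
        return true;
}

static uint64_t example_unpack_field(const struct example_field_format *f,
                                     uint64_t packed)
{
        return packed + f->field_offset;
}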
- */ - -#include -#include -#include -#include -#include -#include "vstructs.h" - -#ifdef __KERNEL__ -typedef uuid_t __uuid_t; -#endif - -#define BITMASK(name, type, field, offset, end) \ -static const __maybe_unused unsigned name##_OFFSET = offset; \ -static const __maybe_unused unsigned name##_BITS = (end - offset); \ - \ -static inline __u64 name(const type *k) \ -{ \ - return (k->field >> offset) & ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - k->field &= ~(~(~0ULL << (end - offset)) << offset); \ - k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ -} - -#define LE_BITMASK(_bits, name, type, field, offset, end) \ -static const __maybe_unused unsigned name##_OFFSET = offset; \ -static const __maybe_unused unsigned name##_BITS = (end - offset); \ -static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ - \ -static inline __u64 name(const type *k) \ -{ \ - return (__le##_bits##_to_cpu(k->field) >> offset) & \ - ~(~0ULL << (end - offset)); \ -} \ - \ -static inline void SET_##name(type *k, __u64 v) \ -{ \ - __u##_bits new = __le##_bits##_to_cpu(k->field); \ - \ - new &= ~(~(~0ULL << (end - offset)) << offset); \ - new |= (v & ~(~0ULL << (end - offset))) << offset; \ - k->field = __cpu_to_le##_bits(new); \ -} - -#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) - -struct bkey_format { - __u8 key_u64s; - __u8 nr_fields; - /* One unused slot for now: */ - __u8 bits_per_field[6]; - __le64 field_offset[6]; -}; - -/* Btree keys - all units are in sectors */ - -struct bpos { - /* - * Word order matches machine byte order - btree code treats a bpos as a - * single large integer, for search/comparison purposes - * - * Note that wherever a bpos is embedded in another on disk data - * structure, it has to be byte swabbed when reading in metadata that - * wasn't written in native endian order: - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - __u32 snapshot; - __u64 offset; - __u64 inode; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - __u64 inode; - __u64 offset; /* Points to end of extent - sectors */ - __u32 snapshot; -#else -#error edit for your odd byteorder. 
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX ((__u64)~0ULL)
-#define KEY_OFFSET_MAX ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX ((__u32)~0U)
-#define KEY_SIZE_MAX ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
- return (struct bpos) {
- .inode = inode,
- .offset = offset,
- .snapshot = snapshot,
- };
-}
-
-#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
- __u64 __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u64 lo;
- __u32 hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u32 hi;
- __u64 lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
- /* Type of the value */
- __u8 type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u8 pad[1];
-
- struct bversion bversion;
- __u32 size; /* extent size, in sectors */
- struct bpos p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- struct bpos p;
- __u32 size; /* extent size, in sectors */
- struct bversion bversion;
-
- __u8 pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently often extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
- */ -__aligned(8) -#endif -; - -struct bkey_packed { - __u64 _data[0]; - - /* Size of combined key and value, in u64s */ - __u8 u64s; - - /* Format of key (0 for format local to btree node) */ - - /* - * XXX: next incompat on disk format change, switch format and - * needs_whiteout - bkey_packed() will be cheaper if format is the high - * bits of the bitfield - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 format:7, - needs_whiteout:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u8 needs_whiteout:1, - format:7; -#endif - - /* Type of the value */ - __u8 type; - __u8 key_start[0]; - - /* - * We copy bkeys with struct assignment in various places, and while - * that shouldn't be done with packed bkeys we can't disallow it in C, - * and it's legal to cast a bkey to a bkey_packed - so padding it out - * to the same size as struct bkey should hopefully be safest. - */ - __u8 pad[sizeof(struct bkey) - 3]; -} __packed __aligned(8); - -typedef struct { - __le64 lo; - __le64 hi; -} bch_le128; - -#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -#define BKEY_U64s_MAX U8_MAX -#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) - -#define KEY_PACKED_BITS_START 24 - -#define KEY_FORMAT_LOCAL_BTREE 0 -#define KEY_FORMAT_CURRENT 1 - -enum bch_bkey_fields { - BKEY_FIELD_INODE, - BKEY_FIELD_OFFSET, - BKEY_FIELD_SNAPSHOT, - BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION_HI, - BKEY_FIELD_VERSION_LO, - BKEY_NR_FIELDS, -}; - -#define bkey_format_field(name, field) \ - [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) - -#define BKEY_FORMAT_CURRENT \ -((struct bkey_format) { \ - .key_u64s = BKEY_U64s, \ - .nr_fields = BKEY_NR_FIELDS, \ - .bits_per_field = { \ - bkey_format_field(INODE, p.inode), \ - bkey_format_field(OFFSET, p.offset), \ - bkey_format_field(SNAPSHOT, p.snapshot), \ - bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, bversion.hi), \ - bkey_format_field(VERSION_LO, bversion.lo), \ - }, \ -}) - -/* bkey with inline value */ -struct bkey_i { - __u64 _data[0]; - - struct bkey k; - struct bch_val v; -}; - -#define POS_KEY(_pos) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = _pos, \ -}) - -#define KEY(_inode, _offset, _size) \ -((struct bkey) { \ - .u64s = BKEY_U64s, \ - .format = KEY_FORMAT_CURRENT, \ - .p = POS(_inode, _offset), \ - .size = _size, \ -}) - -static inline void bkey_init(struct bkey *k) -{ - *k = KEY(0, 0, 0); -} - -#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) - -#define __BKEY_PADDED(key, pad) \ - struct bkey_i key; __u64 key ## _pad[pad] - -enum bch_bkey_type_flags { - BKEY_TYPE_strict_btree_checks = BIT(0), -}; - -/* - * - DELETED keys are used internally to mark keys that should be ignored but - * override keys in composition order. Their version number is ignored. - * - * - DISCARDED keys indicate that the data is all 0s because it has been - * discarded. DISCARDs may have a version; if the version is nonzero the key - * will be persistent, otherwise the key will be dropped whenever the btree - * node is rewritten (like DELETED keys). - * - * - ERROR: any read of the data returns a read error, as the data was lost due - * to a failing device. Like DISCARDED keys, they can be removed (overridden) - * by new writes or cluster-wide GC. Node repair can also overwrite them with - * the same or a more recent version number, but not with an older version - * number. 
- * - * - WHITEOUT: for hash table btrees - */ -#define BCH_BKEY_TYPES() \ - x(deleted, 0, 0) \ - x(whiteout, 1, 0) \ - x(error, 2, 0) \ - x(cookie, 3, 0) \ - x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ - x(extent, 6, BKEY_TYPE_strict_btree_checks) \ - x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ - x(inode, 8, BKEY_TYPE_strict_btree_checks) \ - x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ - x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ - x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ - x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ - x(quota, 13, BKEY_TYPE_strict_btree_checks) \ - x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ - x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ - x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ - x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ - x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ - x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ - x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ - x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ - x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ - x(set, 25, 0) \ - x(lru, 26, BKEY_TYPE_strict_btree_checks) \ - x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ - x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ - x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ - x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ - x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ - x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ - x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ - x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) - -enum bch_bkey_type { -#define x(name, nr, ...) 
KEY_TYPE_##name = nr,
- BCH_BKEY_TYPES()
-#undef x
- KEY_TYPE_MAX,
-};
-
-struct bch_deleted {
- struct bch_val v;
-};
-
-struct bch_whiteout {
- struct bch_val v;
-};
-
-struct bch_error {
- struct bch_val v;
-};
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-
-struct bch_hash_whiteout {
- struct bch_val v;
-};
-
-struct bch_set {
- struct bch_val v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
- struct bch_val v;
- __u8 btree_id;
- __u8 level;
- __u8 data_type;
- __u8 bucket_gen;
- __u32 pad;
- __u32 bucket_len;
- struct bpos pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
- __u64 _data[0];
- __le32 u64s;
- __le32 type;
-};
-
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members_v1, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8) \
- x(journal_v2, 9) \
- x(counters, 10) \
- x(members_v2, 11) \
- x(errors, 12) \
- x(ext, 13) \
- x(downgrade, 14) \
- x(recovery_passes, 15)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "recovery_passes_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr) BCH_SB_FIELD_##f = nr,
- BCH_SB_FIELDS()
-#undef x
- BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all devices' superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS \
- ((1U << BCH_SB_FIELD_journal)| \
- (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
- struct bch_sb_field field;
- __le64 buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
- struct bch_sb_field field;
-
- struct bch_sb_field_journal_v2_entry {
- __le64 start;
- __le64 nr;
- } d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
- __le32 d[4];
-};
-
-struct bch_key {
- __le64 key[4];
-};
-
-#define BCH_KEY_MAGIC \
- (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
- ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
- ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
- ((__u64) 'e' << 48)|((__u64) 'y' << 56))
-
-struct bch_encrypted_key {
- __le64 magic;
- struct bch_key key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
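A wrapped-key scheme like the one above is typically probed by checking whether the magic decrypts (or already reads back) as BCH_KEY_MAGIC; the helper below is a sketch against the structures above, not a quote of bcachefs's own code:

/* If the magic is readable in the clear, no passphrase is needed. */
static inline bool example_sb_key_is_encrypted(const struct bch_encrypted_key *k)
{
        return __le64_to_cpu(k->magic) != BCH_KEY_MAGIC;
}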
-struct bch_sb_field_crypt {
- struct bch_sb_field field;
-
- __le64 flags;
- __le64 kdf_flags;
- struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
- BCH_KDF_SCRYPT = 0,
- BCH_KDF_NR = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- struct bkey_i start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_clean {
- struct bch_sb_field field;
-
- __le32 flags;
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
- __le64 journal_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_ext {
- struct bch_sb_field field;
- __le64 recovery_passes_required[2];
- __le64 errors_silent[8];
- __le64 btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-
-/*
- * field 1: version name
- * field 2: BCH_VERSION(major, minor)
- * field 3: recovery passes required on upgrade
- */
-#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10)) \
- x(inode_btree_change, BCH_VERSION(0, 11)) \
- x(snapshot, BCH_VERSION(0, 12)) \
- x(inode_backpointers, BCH_VERSION(0, 13)) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
- x(snapshot_2, BCH_VERSION(0, 15)) \
- x(reflink_p_fix, BCH_VERSION(0, 16)) \
- x(subvol_dirent, BCH_VERSION(0, 17)) \
- x(inode_v2, BCH_VERSION(0, 18)) \
- x(freespace, BCH_VERSION(0, 19)) \
- x(alloc_v4, BCH_VERSION(0, 20)) \
- x(new_data_types, BCH_VERSION(0, 21)) \
- x(backpointers, BCH_VERSION(0, 22)) \
- x(inode_v3, BCH_VERSION(0, 23)) \
- x(unwritten_extents, BCH_VERSION(0, 24)) \
- x(bucket_gens, BCH_VERSION(0, 25)) \
- x(lru_v2, BCH_VERSION(0, 26)) \
- x(fragmentation_lru, BCH_VERSION(0, 27)) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
- x(snapshot_trees, BCH_VERSION(0, 29)) \
- x(major_minor, BCH_VERSION(1, 0)) \
- x(snapshot_skiplists, BCH_VERSION(1, 1)) \
- x(deleted_inodes, BCH_VERSION(1, 2)) \
- x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4)) \
- x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
- x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9)) \
- x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
- x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
- x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
- x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
- x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20)) \
- x(cached_backpointers, BCH_VERSION(1, 21)) \
- x(stripe_backpointers, BCH_VERSION(1, 22)) \
- x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25)) \
- x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
- x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
-
-enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
-#define x(t, n) bcachefs_metadata_version_##t = n,
- BCH_METADATA_VERSIONS()
-#undef x
- bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR 8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
-
-struct bch_sb_layout {
- __uuid_t magic; /* bcachefs superblock UUID */
- __u8 layout_type;
- __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
- __u8 nr_superblocks;
- __u8 pad[5];
- __le64 sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR 7
-
-/*
- * @offset - sector where this sb was written
- * @version - on disk format version
- * @version_min - Oldest metadata version this filesystem contains; so we can
- * safely drop compatibility code and refuse to mount filesystems
- * we'd need it for
- * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq - identifies most recent superblock, incremented each time
- * superblock is written
- * @uuid - used for generating various magic numbers and identifying
- * member devices, never changes
- * @user_uuid - user visible UUID, may be changed
- * @label - filesystem label
- * @features - enabled incompatible features
- */
-struct bch_sb {
- struct bch_csum csum;
- __le16 version;
- __le16 version_min;
- __le16 pad[2];
- __uuid_t magic;
- __uuid_t uuid;
- __uuid_t user_uuid;
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 offset;
- __le64 seq;
-
- __le16 block_size;
- __u8 dev_idx;
- __u8 nr_devices;
- __le32 u64s;
-
- __le64 time_base_lo;
- __le32 time_base_hi;
- __le32 time_precision;
-
- __le64 flags[7];
- __le64 write_time;
- __le64 features[2];
- __le64 compat[2];
-
- struct bch_sb_layout layout;
-
- struct bch_sb_field start[0];
- __le64 _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED - set on first mount
- * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
- * behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- * DATA/META_CSUM_TYPE. Also indicates encryption
Also indicates encryption - * algorithm in use, if/when we get more than one - */ - -LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); - -LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); - -LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); - -LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); - -LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); - -LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); - -LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); - -LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); - -LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); -LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, - struct bch_sb, flags[0], 63, 64); - -LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); -LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); - -LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); - -/* - * Max size of an extent that may require bouncing to read or write - * (checksummed, compressed): 64k - */ -LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, - struct bch_sb, flags[1], 14, 20); - -LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); - -LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); - -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, - struct bch_sb, flags[2], 0, 4); -LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); - -LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); -LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); -LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); -LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); -LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); -LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); -LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); -LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); - -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); 
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, - struct bch_sb, flags[4], 60, 64); - -LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, - struct bch_sb, flags[5], 0, 16); -LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, - struct bch_sb, flags[5], 16, 32); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); -LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, - struct bch_sb, flags[5], 48, 64); -LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); -LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); -LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); -LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); -LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); -LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); - -static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) -{ - return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | - (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); -} - -static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -{ - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); - SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); -} - -/* - * Features: - * - * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist - * reflink: gates KEY_TYPE_reflink - * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_siphash - * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE - */ -#define BCH_SB_FEATURES() \ - x(lz4, 0) \ - x(gzip, 1) \ - x(zstd, 2) \ - x(atomic_nlink, 3) \ - x(ec, 4) \ - x(journal_seq_blacklist_v3, 5) \ - x(reflink, 6) \ - x(new_siphash, 7) \ - x(inline_data, 8) \ - x(new_extent_overwrite, 9) \ - x(incompressible, 10) \ - x(btree_ptr_v2, 11) \ - x(extents_above_btree_updates, 12) \ - x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) \ - x(new_varint, 15) \ - x(journal_no_flush, 16) \ - x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) \ - x(incompat_version_field, 19) \ - x(casefolding, 20) \ - x(no_alloc_info, 21) \ - x(small_image, 22) - -#define BCH_SB_FEATURES_ALWAYS \ - (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ - BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ - BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ - BIT_ULL(BCH_FEATURE_alloc_v2)|\ - BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) - -#define BCH_SB_FEATURES_ALL \ - (BCH_SB_FEATURES_ALWAYS| \ - BIT_ULL(BCH_FEATURE_new_siphash)| \ - BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ - BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)| \ - BIT_ULL(BCH_FEATURE_incompat_version_field)) - -enum bch_sb_feature { -#define x(f, n) BCH_FEATURE_##f, - BCH_SB_FEATURES() -#undef x - BCH_FEATURE_NR, -}; - -#define BCH_SB_COMPAT() \ - x(alloc_info, 0) \ - x(alloc_metadata, 1) \ - x(extents_above_btree_updates_done, 2) \ - x(bformat_overflow_done, 3) - -enum bch_sb_compat { -#define x(f, n) BCH_COMPAT_##f, - BCH_SB_COMPAT() -#undef x - BCH_COMPAT_NR, -}; - -/* options: */ - -#define BCH_VERSION_UPGRADE_OPTS() \ - x(compatible, 0) \ - x(incompatible, 1) \ - x(none, 2) - -enum bch_version_upgrade_opts { -#define x(t, n) 
BCH_VERSION_UPGRADE_##t = n, - BCH_VERSION_UPGRADE_OPTS() -#undef x -}; - -#define BCH_REPLICAS_MAX 4U - -#define BCH_BKEY_PTRS_MAX 16U - -#define BCH_ERROR_ACTIONS() \ - x(continue, 0) \ - x(fix_safe, 1) \ - x(panic, 2) \ - x(ro, 3) - -enum bch_error_actions { -#define x(t, n) BCH_ON_ERROR_##t = n, - BCH_ERROR_ACTIONS() -#undef x - BCH_ON_ERROR_NR -}; - -#define BCH_DEGRADED_ACTIONS() \ - x(ask, 0) \ - x(yes, 1) \ - x(very, 2) \ - x(no, 3) - -enum bch_degraded_actions { -#define x(t, n) BCH_DEGRADED_##t = n, - BCH_DEGRADED_ACTIONS() -#undef x - BCH_DEGRADED_ACTIONS_NR -}; - -#define BCH_STR_HASH_TYPES() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash_old, 2) \ - x(siphash, 3) - -enum bch_str_hash_type { -#define x(t, n) BCH_STR_HASH_##t = n, - BCH_STR_HASH_TYPES() -#undef x - BCH_STR_HASH_NR -}; - -#define BCH_STR_HASH_OPTS() \ - x(crc32c, 0) \ - x(crc64, 1) \ - x(siphash, 2) - -enum bch_str_hash_opts { -#define x(t, n) BCH_STR_HASH_OPT_##t = n, - BCH_STR_HASH_OPTS() -#undef x - BCH_STR_HASH_OPT_NR -}; - -#define BCH_CSUM_TYPES() \ - x(none, 0) \ - x(crc32c_nonzero, 1) \ - x(crc64_nonzero, 2) \ - x(chacha20_poly1305_80, 3) \ - x(chacha20_poly1305_128, 4) \ - x(crc32c, 5) \ - x(crc64, 6) \ - x(xxhash, 7) - -enum bch_csum_type { -#define x(t, n) BCH_CSUM_##t = n, - BCH_CSUM_TYPES() -#undef x - BCH_CSUM_NR -}; - -static const __maybe_unused unsigned bch_crc_bytes[] = { - [BCH_CSUM_none] = 0, - [BCH_CSUM_crc32c_nonzero] = 4, - [BCH_CSUM_crc32c] = 4, - [BCH_CSUM_crc64_nonzero] = 8, - [BCH_CSUM_crc64] = 8, - [BCH_CSUM_xxhash] = 8, - [BCH_CSUM_chacha20_poly1305_80] = 10, - [BCH_CSUM_chacha20_poly1305_128] = 16, -}; - -static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -{ - switch (type) { - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: - return true; - default: - return false; - } -} - -#define BCH_CSUM_OPTS() \ - x(none, 0) \ - x(crc32c, 1) \ - x(crc64, 2) \ - x(xxhash, 3) - -enum bch_csum_opt { -#define x(t, n) BCH_CSUM_OPT_##t = n, - BCH_CSUM_OPTS() -#undef x - BCH_CSUM_OPT_NR -}; - -#define BCH_COMPRESSION_TYPES() \ - x(none, 0) \ - x(lz4_old, 1) \ - x(gzip, 2) \ - x(lz4, 3) \ - x(zstd, 4) \ - x(incompressible, 5) - -enum bch_compression_type { -#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, - BCH_COMPRESSION_TYPES() -#undef x - BCH_COMPRESSION_TYPE_NR -}; - -#define BCH_COMPRESSION_OPTS() \ - x(none, 0) \ - x(lz4, 1) \ - x(gzip, 2) \ - x(zstd, 3) - -enum bch_compression_opts { -#define x(t, n) BCH_COMPRESSION_OPT_##t = n, - BCH_COMPRESSION_OPTS() -#undef x - BCH_COMPRESSION_OPT_NR -}; - -/* - * Magic numbers - * - * The various other data structures have their own magic numbers, which are - * xored with the first part of the cache set's UUID - */ - -#define BCACHE_MAGIC \ - UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -#define BCHFS_MAGIC \ - UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC - -#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) - -static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -{ - __le64 ret; - - memcpy(&ret, &sb->uuid, sizeof(ret)); - return ret; -} - -static inline __u64 __jset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -} - -static inline __u64 __bset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -} - -/* Journal */ - -#define 
JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) - -#define BCH_JSET_ENTRY_TYPES() \ - x(btree_keys, 0) \ - x(btree_root, 1) \ - x(prio_ptrs, 2) \ - x(blacklist, 3) \ - x(blacklist_v2, 4) \ - x(usage, 5) \ - x(data_usage, 6) \ - x(clock, 7) \ - x(dev_usage, 8) \ - x(log, 9) \ - x(overwrite, 10) \ - x(write_buffer_keys, 11) \ - x(datetime, 12) \ - x(log_bkey, 13) - -enum bch_jset_entry_type { -#define x(f, nr) BCH_JSET_ENTRY_##f = nr, - BCH_JSET_ENTRY_TYPES() -#undef x - BCH_JSET_ENTRY_NR -}; - -static inline bool jset_entry_is_key(struct jset_entry *e) -{ - switch (e->type) { - case BCH_JSET_ENTRY_btree_keys: - case BCH_JSET_ENTRY_btree_root: - case BCH_JSET_ENTRY_write_buffer_keys: - return true; - } - - return false; -} - -/* - * Journal sequence numbers can be blacklisted: bsets record the max sequence - * number of all the journal entries they contain updates for, so that on - * recovery we can ignore those bsets that contain index updates newer than what - * made it into the journal. - * - * This means that we can't reuse that journal_seq - we have to skip it, and - * then record that we skipped it so that the next time we crash and recover we - * don't think there was a missing journal entry. - */ -struct jset_entry_blacklist { - struct jset_entry entry; - __le64 seq; -}; - -struct jset_entry_blacklist_v2 { - struct jset_entry entry; - __le64 start; - __le64 end; -}; - -#define BCH_FS_USAGE_TYPES() \ - x(reserved, 0) \ - x(inodes, 1) \ - x(key_version, 2) - -enum bch_fs_usage_type { -#define x(f, nr) BCH_FS_USAGE_##f = nr, - BCH_FS_USAGE_TYPES() -#undef x - BCH_FS_USAGE_NR -}; - -struct jset_entry_usage { - struct jset_entry entry; - __le64 v; -} __packed; - -struct jset_entry_data_usage { - struct jset_entry entry; - __le64 v; - struct bch_replicas_entry_v1 r; -} __packed; - -struct jset_entry_clock { - struct jset_entry entry; - __u8 rw; - __u8 pad[7]; - __le64 time; -} __packed; - -struct jset_entry_dev_usage_type { - __le64 buckets; - __le64 sectors; - __le64 fragmented; -} __packed; - -struct jset_entry_dev_usage { - struct jset_entry entry; - __le32 dev; - __u32 pad; - - __le64 _buckets_ec; /* No longer used */ - __le64 _buckets_unavailable; /* No longer used */ - - struct jset_entry_dev_usage_type d[]; -}; - -static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) -{ - return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / - sizeof(struct jset_entry_dev_usage_type); -} - -struct jset_entry_log { - struct jset_entry entry; - u8 d[]; -} __packed __aligned(8); - -static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) -{ - unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); - - while (b && !l->d[b - 1]) - --b; - return b; -} - -struct jset_entry_datetime { - struct jset_entry entry; - __le64 seconds; -} __packed __aligned(8); - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes.
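- *
- * As an illustrative sketch (not from the original source), a reader
- * deciding which journal entries are still needed for recovery could
- * compare against last_seq:
- *
- *	// entries with seq < last_seq have been fully flushed to the
- *	// btree and may be discarded; everything else is still dirty
- *	static bool jset_still_dirty(const struct jset *j, u64 seq)
- *	{
- *		return seq >= le64_to_cpu(j->last_seq);
- *	}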
- */ -struct jset { - struct bch_csum csum; - - __le64 magic; - __le64 seq; - __le32 version; - __le32 flags; - - __le32 u64s; /* size of d[] in u64s */ - - __u8 encrypted_start[0]; - - __le16 _read_clock; /* no longer used */ - __le16 _write_clock; - - /* Sequence number of oldest dirty journal entry */ - __le64 last_seq; - - - struct jset_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); - -#define BCH_JOURNAL_BUCKETS_MIN 8 - -/* Btree: */ - -enum btree_id_flags { - BTREE_IS_extents = BIT(0), - BTREE_IS_snapshots = BIT(1), - BTREE_IS_snapshot_field = BIT(2), - BTREE_IS_data = BIT(3), - BTREE_IS_write_buffer = BIT(4), -}; - -#define BCH_BTREE_IDS() \ - x(extents, 0, \ - BTREE_IS_extents| \ - BTREE_IS_snapshots| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_error)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_extent)| \ - BIT_ULL(KEY_TYPE_reservation)| \ - BIT_ULL(KEY_TYPE_reflink_p)| \ - BIT_ULL(KEY_TYPE_inline_data)) \ - x(inodes, 1, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_inode)| \ - BIT_ULL(KEY_TYPE_inode_v2)| \ - BIT_ULL(KEY_TYPE_inode_v3)| \ - BIT_ULL(KEY_TYPE_inode_generation)) \ - x(dirents, 2, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_dirent)) \ - x(xattrs, 3, \ - BTREE_IS_snapshots, \ - BIT_ULL(KEY_TYPE_whiteout)| \ - BIT_ULL(KEY_TYPE_cookie)| \ - BIT_ULL(KEY_TYPE_hash_whiteout)| \ - BIT_ULL(KEY_TYPE_xattr)) \ - x(alloc, 4, 0, \ - BIT_ULL(KEY_TYPE_alloc)| \ - BIT_ULL(KEY_TYPE_alloc_v2)| \ - BIT_ULL(KEY_TYPE_alloc_v3)| \ - BIT_ULL(KEY_TYPE_alloc_v4)) \ - x(quotas, 5, 0, \ - BIT_ULL(KEY_TYPE_quota)) \ - x(stripes, 6, 0, \ - BIT_ULL(KEY_TYPE_stripe)) \ - x(reflink, 7, \ - BTREE_IS_extents| \ - BTREE_IS_data, \ - BIT_ULL(KEY_TYPE_reflink_v)| \ - BIT_ULL(KEY_TYPE_indirect_inline_data)| \ - BIT_ULL(KEY_TYPE_error)) \ - x(subvolumes, 8, 0, \ - BIT_ULL(KEY_TYPE_subvolume)) \ - x(snapshots, 9, 0, \ - BIT_ULL(KEY_TYPE_snapshot)) \ - x(lru, 10, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(freespace, 11, \ - BTREE_IS_extents, \ - BIT_ULL(KEY_TYPE_set)) \ - x(need_discard, 12, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(backpointers, 13, \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_backpointer)) \ - x(bucket_gens, 14, 0, \ - BIT_ULL(KEY_TYPE_bucket_gens)) \ - x(snapshot_trees, 15, 0, \ - BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)) \ - x(logged_ops, 17, 0, \ - BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)| \ - BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ - x(rebalance_work, 18, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ - x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) \ - x(accounting, 20, \ - BTREE_IS_snapshot_field| \ - BTREE_IS_write_buffer, \ - BIT_ULL(KEY_TYPE_accounting)) \ - -enum btree_id { -#define x(name, nr, ...) 
BTREE_ID_##name = nr, - BCH_BTREE_IDS() -#undef x - BTREE_ID_NR -}; - -/* - * Maximum number of btrees that we will _ever_ have under the current scheme, - * where we refer to them with 64 bit bitfields - and we also need a bit for - * the interior btree node type: - */ -#define BTREE_ID_NR_MAX 63 - -static inline bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - case BTREE_ID_lru: - case BTREE_ID_accounting: - return true; - default: - return false; - } -} - -#define BTREE_MAX_DEPTH 4U - -/* Btree nodes */ - -/* - * Btree nodes - * - * On disk a btree node is a list/log of these; within each set the keys are - * sorted - */ -struct bset { - __le64 seq; - - /* - * Highest journal entry this bset contains keys for. - * If on recovery we don't see that journal entry, this bset is ignored: - * this allows us to preserve the order of all index updates after a - * crash, since the journal records a total order of all index updates - * and anything that didn't make it to the journal doesn't get used. - */ - __le64 journal_seq; - - __le32 flags; - __le16 version; - __le16 u64s; /* count of d[] in u64s */ - - struct bkey_packed start[0]; - __u64 _data[]; -} __packed __aligned(8); - -LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 5, 6); - -/* Sector offset within the btree node: */ -LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); - -struct btree_node { - struct bch_csum csum; - __le64 magic; - - /* this flags field is encrypted, unlike bset->flags: */ - __le64 flags; - - /* Closed interval: */ - struct bpos min_key; - struct bpos max_key; - struct bch_extent_ptr _ptr; /* not used anymore */ - struct bkey_format format; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - - }; - }; -} __packed __aligned(8); - -LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); -LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, - struct btree_node, flags, 8, 9); -LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); -/* 25-32 unused */ -LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); - -static inline __u64 BTREE_NODE_ID(struct btree_node *n) -{ - return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); -} - -static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) -{ - SET_BTREE_NODE_ID_LO(n, v); - SET_BTREE_NODE_ID_HI(n, v >> 4); -} - -struct btree_node_entry { - struct bch_csum csum; - - union { - struct bset keys; - struct { - __u8 pad[22]; - __le16 u64s; - __u64 _data[0]; - }; - }; -} __packed __aligned(8); - -#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h deleted file mode 100644 index 52594e925eb7ea..00000000000000 --- a/fs/bcachefs/bcachefs_ioctl.h +++ /dev/null @@ -1,473 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IOCTL_H -#define _BCACHEFS_IOCTL_H - -#include -#include -#include "bcachefs_format.h" -#include "bkey_types.h" - -/* - * Flags common to multiple ioctls: - */ -#define BCH_FORCE_IF_DATA_LOST (1 << 0) -#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) - -#define BCH_FORCE_IF_LOST \ - 
(BCH_FORCE_IF_DATA_LOST| \ - BCH_FORCE_IF_METADATA_LOST) -#define BCH_FORCE_IF_DEGRADED \ - (BCH_FORCE_IF_DATA_DEGRADED| \ - BCH_FORCE_IF_METADATA_DEGRADED) - -/* - * If cleared, ioctls that refer to a device pass it as a pointer to a pathname - * (e.g. /dev/sda1); if set, the dev field is the device's index within the - * filesystem: - */ -#define BCH_BY_INDEX (1 << 4) - -/* - * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem - * wide superblock: - */ -#define BCH_READ_DEV (1 << 5) - -/* global control dev: */ - -/* These are currently broken, and probably unnecessary: */ -#if 0 -#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) - -struct bch_ioctl_assemble { - __u32 flags; - __u32 nr_devs; - __u64 pad; - __u64 devs[]; -}; - -struct bch_ioctl_incremental { - __u32 flags; - __u64 pad; - __u64 dev; -}; -#endif - -/* filesystem ioctls: */ - -#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) - -/* These only make sense when we also have incremental assembly */ -#if 0 -#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -#define BCH_IOCTL_STOP _IO(0xbc, 3) -#endif - -#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) -#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) -#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) -#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) -#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal) - -#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) -#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) - -#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) - -#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) -#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) -#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) -#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) - -/* ioctls below act on a particular file, not the filesystem as a whole: */ - -#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) - -/* - * BCH_IOCTL_QUERY_UUID: get filesystem UUID - * - * Returns user visible UUID, not internal UUID (which may not ever be changed); - * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with - * this UUID. - */ -struct bch_ioctl_query_uuid { - __uuid_t uuid; -}; - -#if 0 -struct bch_ioctl_start { - __u32 flags; - __u32 pad; -}; -#endif - -/* - * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem - * - * The specified device must not be open or in use. On success, the new device - * will be an online member of the filesystem just like any other member.
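- *
- * A hypothetical call from userspace (sketch only; fs_fd is an assumed
- * open fd on the filesystem, and error handling is elided). With
- * BCH_BY_INDEX clear, @dev carries a pointer to the pathname:
- *
- *	struct bch_ioctl_disk i = {
- *		.flags	= 0,
- *		.dev	= (__u64)(unsigned long) "/dev/sdb",
- *	};
- *
- *	ioctl(fs_fd, BCH_IOCTL_DISK_ADD, &i);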
- * - * The device must first be prepared by userspace by formatting with a bcachefs - * superblock, which is only used for passing in superblock options/parameters - * for that device (in struct bch_member). The new device's superblock should - * not claim to be a member of any existing filesystem - UUIDs on it will be - * ignored. - */ - -/* - * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem - * - * Any data present on @dev will be permanently deleted, and @dev will be - * removed from its slot in the filesystem's list of member devices. The device - * may be either online or offline. - * - * Will fail if removing @dev would leave us with insufficient read/write devices - * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are - * set. - */ - -/* - * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem - * but is not open (e.g. because we started in degraded mode), bring it online - * - * all existing data on @dev will be available once the device is online, - * exactly as if @dev was present when the filesystem was first mounted - */ - -/* - * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that - * block device, without removing it from the filesystem (so it can be brought - * back online later) - * - * Data present on @dev will be unavailable while @dev is offline (unless - * replicated), but will still be intact and untouched if @dev is brought back - * online - * - * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would - * leave us with insufficient read/write devices or degraded/unavailable data, - * unless the appropriate BCH_FORCE_IF_* flags are set. - */ - -struct bch_ioctl_disk { - __u32 flags; - __u32 pad; - __u64 dev; -}; - -/* - * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem - * - * @new_state - one of the bch_member_state states (rw, ro, failed, - * spare) - * - * Will refuse to change member state if we would then have insufficient devices - * to write to, or if it would result in degraded data (when @new_state is - * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. - */ -struct bch_ioctl_disk_set_state { - __u32 flags; - __u8 new_state; - __u8 pad[3]; - __u64 dev; -}; - -#define BCH_DATA_OPS() \ - x(scrub, 0) \ - x(rereplicate, 1) \ - x(migrate, 2) \ - x(rewrite_old_nodes, 3) \ - x(drop_extra_replicas, 4) - -enum bch_data_ops { -#define x(t, n) BCH_DATA_OP_##t = n, - BCH_DATA_OPS() -#undef x - BCH_DATA_OP_NR -}; - -/* - * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. - * scrub, rereplicate, migrate). - * - * This ioctl kicks off a job in the background, and returns a file descriptor. - * Reading from the file descriptor returns a struct bch_ioctl_data_event, - * indicating current progress, and closing the file descriptor will stop the - * job. The file descriptor is O_CLOEXEC.
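- *
- * A sketch of the expected usage (assumptions: fs_fd is an open fd on the
- * filesystem, POS_MIN/POS_MAX bound the whole keyspace, and error handling
- * is elided):
- *
- *	struct bch_ioctl_data op = {
- *		.op		= BCH_DATA_OP_rereplicate,
- *		.start_btree	= 0,
- *		.end_btree	= BTREE_ID_NR,
- *		.start_pos	= POS_MIN,
- *		.end_pos	= POS_MAX,
- *	};
- *	struct bch_ioctl_data_event e;
- *	int job = ioctl(fs_fd, BCH_IOCTL_DATA, &op);
- *
- *	while (read(job, &e, sizeof(e)) == sizeof(e) &&
- *	       e.ret != BCH_IOCTL_DATA_EVENT_RET_done)
- *		;	// e.p describes the job's current progress
- *	close(job);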
- */ -struct bch_ioctl_data { - __u16 op; - __u8 start_btree; - __u8 end_btree; - __u32 flags; - - struct bpos start_pos; - struct bpos end_pos; - - union { - struct { - __u32 dev; - __u32 data_types; - } scrub; - struct { - __u32 dev; - __u32 pad; - } migrate; - struct { - __u64 pad[8]; - }; - }; -} __packed __aligned(8); - -enum bch_data_event { - BCH_DATA_EVENT_PROGRESS = 0, - /* XXX: add an event for reporting errors */ - BCH_DATA_EVENT_NR = 1, -}; - -enum data_progress_data_type_special { - DATA_PROGRESS_DATA_TYPE_phys = 254, - DATA_PROGRESS_DATA_TYPE_done = 255, -}; - -struct bch_ioctl_data_progress { - __u8 data_type; - __u8 btree_id; - __u8 pad[2]; - struct bpos pos; - - __u64 sectors_done; - __u64 sectors_total; - __u64 sectors_error_corrected; - __u64 sectors_error_uncorrected; -} __packed __aligned(8); - -enum bch_ioctl_data_event_ret { - BCH_IOCTL_DATA_EVENT_RET_done = 1, - BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, -}; - -struct bch_ioctl_data_event { - __u8 type; - __u8 ret; - __u8 pad[6]; - union { - struct bch_ioctl_data_progress p; - __u64 pad2[15]; - }; -} __packed __aligned(8); - -struct bch_replicas_usage { - __u64 sectors; - struct bch_replicas_entry_v1 r; -} __packed; - -static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u) -{ - return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r); -} - -static inline struct bch_replicas_usage * -replicas_usage_next(struct bch_replicas_usage *u) -{ - return (void *) u + replicas_usage_bytes(u); -} - -/* Obsolete */ -/* - * BCH_IOCTL_FS_USAGE: query filesystem disk space usage - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device - * - * @replica_entries_bytes - size, in bytes, allocated for replica usage entries - * - * On success, @replica_entries_bytes will be changed to indicate the number of - * bytes actually used. - * - * Returns -ERANGE if @replica_entries_bytes was too small - */ -struct bch_ioctl_fs_usage { - __u64 capacity; - __u64 used; - __u64 online_reserved; - __u64 persistent_reserved[BCH_REPLICAS_MAX]; - - __u32 replica_entries_bytes; - __u32 pad; - - struct bch_replicas_usage replicas[]; -}; - -/* Obsolete */ -/* - * BCH_IOCTL_DEV_USAGE: query device disk space usage - * - * Returns disk space usage broken out by data type - both by buckets and - * sectors. 
- */ -struct bch_ioctl_dev_usage { - __u64 dev; - __u32 flags; - __u8 state; - __u8 pad[7]; - - __u32 bucket_size; - __u64 nr_buckets; - - __u64 buckets_ec; - - struct bch_ioctl_dev_usage_type { - __u64 buckets; - __u64 sectors; - __u64 fragmented; - } d[10]; -}; - -/* Obsolete */ -struct bch_ioctl_dev_usage_v2 { - __u64 dev; - __u32 flags; - __u8 state; - __u8 nr_data_types; - __u8 pad[6]; - - __u32 bucket_size; - __u64 nr_buckets; - - struct bch_ioctl_dev_usage_type d[]; -}; - -/* - * BCH_IOCTL_READ_SUPER: read filesystem superblock - * - * Equivalent to reading the superblock directly from the block device, except - * avoids racing with the kernel writing the superblock or having to figure out - * which block device to read - * - * @sb - buffer to read into - * @size - size of userspace allocated buffer - * @dev - device to read superblock for, if BCH_READ_DEV flag is - * specified - * - * Returns -ERANGE if buffer provided is too small - */ -struct bch_ioctl_read_super { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 size; - __u64 sb; -}; - -/* - * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the filesystem - * to determine if the disk is an (online) member - if so, returns the device's - * index - * - * Returns -ENOENT if not found - */ -struct bch_ioctl_disk_get_idx { - __u64 dev; -}; - -/* - * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device - * - * @dev - member to resize - * @nbuckets - new number of buckets - */ -struct bch_ioctl_disk_resize { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 nbuckets; -}; - -/* - * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device - * - * @dev - member to resize - * @nbuckets - new number of buckets - */ -struct bch_ioctl_disk_resize_journal { - __u32 flags; - __u32 pad; - __u64 dev; - __u64 nbuckets; -}; - -struct bch_ioctl_subvolume { - __u32 flags; - __u32 dirfd; - __u16 mode; - __u16 pad[3]; - __u64 dst_ptr; - __u64 src_ptr; -}; - -#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) -#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) - -/* - * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, - * but with the kernel's implementation of fsck: - */ -struct bch_ioctl_fsck_offline { - __u64 flags; - __u64 opts; /* string */ - __u64 nr_devs; - __u64 devs[] __counted_by(nr_devs); -}; - -/* - * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, - * but with the kernel's implementation of fsck: - */ -struct bch_ioctl_fsck_online { - __u64 flags; - __u64 opts; /* string */ -}; - -/* - * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device - * - * @accounting_u64s - size, in u64s, allocated for accounting entries - * - * On success, @accounting_u64s will be changed to indicate the number of - * u64s actually used.
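- *
- * A sketch of the grow-and-retry loop this implies (hypothetical caller;
- * fs_fd is assumed, error handling elided; see the -ERANGE note below):
- *
- *	__u32 u64s = 512;
- *	struct bch_ioctl_query_accounting *a = NULL;
- *	int ret;
- *
- *	do {
- *		free(a);
- *		a = malloc(sizeof(*a) + u64s * sizeof(__u64));
- *		a->accounting_u64s = u64s;
- *		a->accounting_types_mask = ~0U;
- *		ret = ioctl(fs_fd, BCH_IOCTL_QUERY_ACCOUNTING, a);
- *		u64s *= 2;
- *	} while (ret < 0 && errno == ERANGE);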
- * - * Returns -ERANGE if @accounting_u64s was too small - */ -struct bch_ioctl_query_accounting { - __u64 capacity; - __u64 used; - __u64 online_reserved; - - __u32 accounting_u64s; /* input parameter */ - __u32 accounting_types_mask; /* input parameter */ - - struct bkey_i_accounting accounting[]; -}; - -#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) - -struct bch_ioctl_query_counters { - __u16 nr; - __u16 flags; - __u32 pad; - __u64 d[]; -}; - -#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c deleted file mode 100644 index ee823c640642b4..00000000000000 --- a/fs/bcachefs/bkey.c +++ /dev/null @@ -1,1112 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "bkey_cmp.h" -#include "bkey_methods.h" -#include "bset.h" -#include "util.h" - -const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; - -void bch2_bkey_packed_to_binary_text(struct printbuf *out, - const struct bkey_format *f, - const struct bkey_packed *k) -{ - const u64 *p = high_word(f, k); - unsigned word_bits = 64 - high_bit_offset; - unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; - u64 v = *p & (~0ULL >> high_bit_offset); - - if (!nr_key_bits) { - prt_str(out, "(empty)"); - return; - } - - while (1) { - unsigned next_key_bits = nr_key_bits; - - if (nr_key_bits < 64) { - v >>= 64 - nr_key_bits; - next_key_bits = 0; - } else { - next_key_bits -= 64; - } - - bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); - - if (!next_key_bits) - break; - - prt_char(out, ' '); - - p = next_word(p); - v = *p; - word_bits = 64; - nr_key_bits = next_key_bits; - } -} - -static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - struct bkey tmp; - - BUG_ON(bkeyp_val_u64s(format, packed) != - bkey_val_u64s(unpacked)); - - BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); - - tmp = __bch2_bkey_unpack_key(format, packed); - - if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", - format->key_u64s, - format->bits_per_field[0], - format->bits_per_field[1], - format->bits_per_field[2], - format->bits_per_field[3], - format->bits_per_field[4]); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_to_text(&buf, unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_to_text(&buf, &tmp); - prt_newline(&buf); - - prt_printf(&buf, "compiled unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) unpacked); - prt_newline(&buf); - - prt_printf(&buf, "c unpack: "); - bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, - (struct bkey_packed *) &tmp); - prt_newline(&buf); - - panic("%s", buf.buf); - } -} - -static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) -{ - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) - __bch2_bkey_pack_verify(packed, unpacked, format); -} - -struct pack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct pack_state pack_state_init(const struct bkey_format *format, - struct bkey_packed *k) -{ - u64 *p = high_word(format, k); - - return (struct pack_state) { - .format = format, -
.bits = 64 - high_bit_offset, - .w = 0, - .p = p, - }; -} - -__always_inline -static void pack_state_finish(struct pack_state *state, - struct bkey_packed *k) -{ - EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); - - *state->p = state->w; -} - -struct unpack_state { - const struct bkey_format *format; - unsigned bits; /* bits remaining in current word */ - u64 w; /* current word */ - const u64 *p; /* pointer to next word */ -}; - -__always_inline -static struct unpack_state unpack_state_init(const struct bkey_format *format, - const struct bkey_packed *k) -{ - const u64 *p = high_word(format, k); - - return (struct unpack_state) { - .format = format, - .bits = 64 - high_bit_offset, - .w = *p << high_bit_offset, - .p = p, - }; -} - -__always_inline -static u64 get_inc_field(struct unpack_state *state, unsigned field) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); - - if (bits >= state->bits) { - v = state->w >> (64 - bits); - bits -= state->bits; - - state->p = next_word(state->p); - state->w = *state->p; - state->bits = 64; - } - - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - v |= (state->w >> 1) >> (63 - bits); - state->w <<= bits; - state->bits -= bits; - - return v + offset; -} - -__always_inline -static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - - if (bits) { - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - } -} - -__always_inline -static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - - if (v < offset) - return false; - - v -= offset; - - if (fls64(v) > bits) - return false; - - __set_inc_field(state, field, v); - return true; -} - -/* - * Note: does NOT set out->format (we don't know what it should be here!) 
- * - * Also: doesn't work on extents - it doesn't preserve the invariant that - * if k is packed bkey_start_pos(k) will successfully pack - */ -static bool bch2_bkey_transform_key(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - struct pack_state out_s = pack_state_init(out_f, out); - struct unpack_state in_s = unpack_state_init(in_f, in); - u64 *w = out->_data; - unsigned i; - - *w = 0; - - for (i = 0; i < BKEY_NR_FIELDS; i++) - if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) - return false; - - /* Can't happen because the val would be too big to unpack: */ - EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); - - pack_state_finish(&out_s, out); - out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - return true; -} - -bool bch2_bkey_transform(const struct bkey_format *out_f, - struct bkey_packed *out, - const struct bkey_format *in_f, - const struct bkey_packed *in) -{ - if (!bch2_bkey_transform_key(out_f, out, in_f, in)) - return false; - - memcpy_u64s((u64 *) out + out_f->key_u64s, - (u64 *) in + in_f->key_u64s, - (in->u64s - in_f->key_u64s)); - return true; -} - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bkey out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); - - out.u64s = BKEY_U64s + in->u64s - format->key_u64s; - out.format = KEY_FORMAT_CURRENT; - out.needs_whiteout = in->needs_whiteout; - out.type = in->type; - out.pad[0] = 0; - -#define x(id, field) out.field = get_inc_field(&state, id); - bkey_fields() -#undef x - - return out; -} - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct unpack_state state = unpack_state_init(format, in); - struct bpos out; - - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - - out.inode = get_inc_field(&state, BKEY_FIELD_INODE); - out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); - out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); - - return out; -} -#endif - -/** - * bch2_bkey_pack_key -- pack just the key, not the value - * @out: packed result - * @in: key to pack - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) -{ - struct pack_state state = pack_state_init(format, out); - u64 *w = out->_data; - - EBUG_ON((void *) in == (void *) out); - EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); - EBUG_ON(in->format != KEY_FORMAT_CURRENT); - - *w = 0; - -#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; - bkey_fields() -#undef x - pack_state_finish(&state, out); - out->u64s = format->key_u64s + in->u64s - BKEY_U64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->needs_whiteout = in->needs_whiteout; - out->type = in->type; - - bch2_bkey_pack_verify(out, in, format); - return true; -} - -/** - * bch2_bkey_unpack -- unpack the key and the value - * @b: btree node of @src key (for packed format) - * @dst: unpacked result - * @src: packed 
input - */ -void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) -{ - __bkey_unpack_key(b, &dst->k, src); - - memcpy_u64s(&dst->v, - bkeyp_val(&b->format, src), - bkeyp_val_u64s(&b->format, src)); -} - -/** - * bch2_bkey_pack -- pack the key and the value - * @dst: packed result - * @src: unpacked input - * @format: format of packed result - * - * Returns: true on success, false on failure - */ -bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, - const struct bkey_format *format) -{ - struct bkey_packed tmp; - - if (!bch2_bkey_pack_key(&tmp, &src->k, format)) - return false; - - memmove_u64s((u64 *) dst + format->key_u64s, - &src->v, - bkey_val_u64s(&src->k)); - memcpy_u64s_small(dst, &tmp, format->key_u64s); - - return true; -} - -__always_inline -static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -{ - unsigned bits = state->format->bits_per_field[field]; - u64 offset = le64_to_cpu(state->format->field_offset[field]); - bool ret = true; - - EBUG_ON(v < offset); - v -= offset; - - if (fls64(v) > bits) { - v = ~(~0ULL << bits); - ret = false; - } - - __set_inc_field(state, field, v); - return ret; -} - -static bool bkey_packed_successor(struct bkey_packed *out, - const struct btree *b, - struct bkey_packed k) -{ - const struct bkey_format *f = &b->format; - unsigned nr_key_bits = b->nr_key_bits; - unsigned first_bit, offset; - u64 *p; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - if (!nr_key_bits) - return false; - - *out = k; - - first_bit = high_bit_offset + nr_key_bits - 1; - p = nth_word(high_word(f, out), first_bit >> 6); - offset = 63 - (first_bit & 63); - - while (nr_key_bits) { - unsigned bits = min(64 - offset, nr_key_bits); - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if ((*p & mask) != mask) { - *p += 1ULL << offset; - EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); - return true; - } - - *p &= ~mask; - p = prev_word(p); - nr_key_bits -= bits; - offset = 0; - } - - return false; -} - -static bool bkey_format_has_too_big_fields(const struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return true; - } - - return false; -} - -/* - * Returns a packed key that compares <= in - * - * This is used in bset_search_tree(), where we need a packed pos in order to be - * able to compare against the keys in the auxiliary search tree - and it's - * legal to use a packed pos that isn't equivalent to the original pos, - * _provided_ it compares <= to the original pos. 
- */ -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, - struct bpos in, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - struct pack_state state = pack_state_init(f, out); - u64 *w = out->_data; - struct bpos orig = in; - bool exact = true; - unsigned i; - - /* - * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 - * byte header, but pack_pos() won't if the len/version fields are big - * enough - we need to make sure to zero them out: - */ - for (i = 0; i < f->key_u64s; i++) - w[i] = 0; - - if (unlikely(in.snapshot < - le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { - if (!in.offset-- && - !in.inode--) - return BKEY_PACK_POS_FAIL; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.offset < - le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { - if (!in.inode--) - return BKEY_PACK_POS_FAIL; - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(in.inode < - le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) - return BKEY_PACK_POS_FAIL; - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { - in.offset = KEY_OFFSET_MAX; - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { - in.snapshot = KEY_SNAPSHOT_MAX; - exact = false; - } - - if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) - exact = false; - - pack_state_finish(&state, out); - out->u64s = f->key_u64s; - out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_deleted; - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; - - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); - } - } - - return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -} - -void bch2_bkey_format_init(struct bkey_format_state *s) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) - s->field_min[i] = U64_MAX; - - for (i = 0; i < ARRAY_SIZE(s->field_max); i++) - s->field_max[i] = 0; - - /* Make sure we can store a size of 0: */ - s->field_min[BKEY_FIELD_SIZE] = 0; -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -{ - unsigned field = 0; - - __bkey_format_add(s, field++, p.inode); - __bkey_format_add(s, field++, p.offset); - __bkey_format_add(s, field++, p.snapshot); -} - -/* - * We don't want it to be possible for the packed format to represent fields - * bigger than a u64... that will cause confusion and issues (like with - * bkey_packed_successor()) - */ -static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, - unsigned bits, u64 offset) -{ - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - - bits = min(bits, unpacked_bits); - - offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); - - f->bits_per_field[i] = bits; - f->field_offset[i] = cpu_to_le64(offset); -} - -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -{ - unsigned i, bits = KEY_PACKED_BITS_START; - struct bkey_format ret = { - .nr_fields = BKEY_NR_FIELDS, - }; - - for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { - s->field_min[i] = min(s->field_min[i], s->field_max[i]); - - set_format_field(&ret, i, - fls64(s->field_max[i] - s->field_min[i]), - s->field_min[i]); - - bits += ret.bits_per_field[i]; - } - - /* allow for extent merging: */ - if (ret.bits_per_field[BKEY_FIELD_SIZE]) { - unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); - - ret.bits_per_field[BKEY_FIELD_SIZE] += b; - bits += b; - } - - ret.key_u64s = DIV_ROUND_UP(bits, 64); - - /* if we have enough spare bits, round fields up to nearest byte */ - bits = ret.key_u64s * 64 - bits; - - for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { - unsigned r = round_up(ret.bits_per_field[i], 8) - - ret.bits_per_field[i]; - - if (r <= bits) { - set_format_field(&ret, i, - ret.bits_per_field[i] + r, - le64_to_cpu(ret.field_offset[i])); - bits -= r; - } - } - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct printbuf buf = PRINTBUF; - - BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); - printbuf_exit(&buf); - } - - return ret; -} - -int bch2_bkey_format_invalid(struct bch_fs *c, - struct bkey_format *f, - enum bch_validate_flags flags, - struct printbuf *err) -{ - unsigned bits = KEY_PACKED_BITS_START; - - if (f->nr_fields != BKEY_NR_FIELDS) { - prt_printf(err, "incorrect number of fields: got %u, should be %u", - f->nr_fields, BKEY_NR_FIELDS); - return -BCH_ERR_invalid; - } - - /* - * Verify that the packed format can't represent fields larger than the - * unpacked format: - */ - for (unsigned i = 0; i < f->nr_fields; i++) { - if (bch2_bkey_format_field_overflows(f, i)) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - unsigned packed_bits = min(64, f->bits_per_field[i]); - u64 packed_max = packed_bits - ? 
~((~0ULL << 1) << (packed_bits - 1)) - : 0; - - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); - return -BCH_ERR_invalid; - } - - bits += f->bits_per_field[i]; - } - - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { - prt_printf(err, "incorrect key_u64s: got %u, should be %u", - f->key_u64s, DIV_ROUND_UP(bits, 64)); - return -BCH_ERR_invalid; - } - - return 0; -} - -void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) -{ - prt_printf(out, "u64s %u fields ", f->key_u64s); - - for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%u:%llu", - f->bits_per_field[i], - le64_to_cpu(f->field_offset[i])); - } -} - -/* - * Most significant differing bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, - const struct bkey_packed *l_k, - const struct bkey_packed *r_k) -{ - const u64 *l = high_word(&b->format, l_k); - const u64 *r = high_word(&b->format, r_k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned word_bits = 64 - high_bit_offset; - u64 l_v, r_v; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - /* for big endian, skip past header */ - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (nr_key_bits) { - if (nr_key_bits < word_bits) { - l_v >>= word_bits - nr_key_bits; - r_v >>= word_bits - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= word_bits; - } - - if (l_v != r_v) - return fls64(l_v ^ r_v) - 1 + nr_key_bits; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - word_bits = 64; - } - - return 0; -} - -/* - * First set bit - * Bits are indexed from 0 - return is [0, nr_key_bits) - */ -__pure -unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -{ - const u64 *p = high_word(&b->format, k); - unsigned nr_key_bits = b->nr_key_bits; - unsigned ret = 0, offset; - - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); - - offset = nr_key_bits; - while (offset > 64) { - p = next_word(p); - offset -= 64; - } - - offset = 64 - offset; - - while (nr_key_bits) { - unsigned bits = nr_key_bits + offset < 64 - ? 
nr_key_bits - : 64 - offset; - - u64 mask = (~0ULL >> (64 - bits)) << offset; - - if (*p & mask) - return ret + __ffs64(*p & mask) - offset; - - p = prev_word(p); - nr_key_bits -= bits; - ret += bits; - offset = 0; - } - - return 0; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -#define I(_x) (*(out)++ = (_x)) -#define I1(i0) I(i0) -#define I2(i0, i1) (I1(i0), I(i1)) -#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) - -static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, - enum bch_bkey_fields field, - unsigned dst_offset, unsigned dst_size, - bool *eax_zeroed) -{ - unsigned bits = format->bits_per_field[field]; - u64 offset = le64_to_cpu(format->field_offset[field]); - unsigned i, byte, bit_offset, align, shl, shr; - - if (!bits && !offset) { - if (!*eax_zeroed) { - /* xor eax, eax */ - I2(0x31, 0xc0); - } - - *eax_zeroed = true; - goto set_field; - } - - if (!bits) { - /* just return offset: */ - - switch (dst_size) { - case 8: - if (offset > S32_MAX) { - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - - I3(0xc7, 0x47, dst_offset + 4); - memcpy(out, (void *) &offset + 4, 4); - out += 4; - } else { - /* mov [rdi + dst_offset], offset */ - /* sign extended */ - I4(0x48, 0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - } - break; - case 4: - /* mov [rdi + dst_offset], offset */ - I3(0xc7, 0x47, dst_offset); - memcpy(out, &offset, 4); - out += 4; - break; - default: - BUG(); - } - - return out; - } - - bit_offset = format->key_u64s * 64; - for (i = 0; i <= field; i++) - bit_offset -= format->bits_per_field[i]; - - byte = bit_offset / 8; - bit_offset -= byte * 8; - - *eax_zeroed = false; - - if (bit_offset == 0 && bits == 8) { - /* movzx eax, BYTE PTR [rsi + imm8] */ - I4(0x0f, 0xb6, 0x46, byte); - } else if (bit_offset == 0 && bits == 16) { - /* movzx eax, WORD PTR [rsi + imm8] */ - I4(0x0f, 0xb7, 0x46, byte); - } else if (bit_offset + bits <= 32) { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 32); - - /* mov eax, [rsi + imm8] */ - I3(0x8b, 0x46, byte); - - if (bit_offset) { - /* shr eax, imm8 */ - I3(0xc1, 0xe8, bit_offset); - } - - if (bit_offset + bits < 32) { - unsigned mask = ~0U >> (32 - bits); - - /* and eax, imm32 */ - I1(0x25); - memcpy(out, &mask, 4); - out += 4; - } - } else if (bit_offset + bits <= 64) { - align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 64); - - /* mov rax, [rsi + imm8] */ - I4(0x48, 0x8b, 0x46, byte); - - shl = 64 - bit_offset - bits; - shr = bit_offset + shl; - - if (shl) { - /* shl rax, imm8 */ - I4(0x48, 0xc1, 0xe0, shl); - } - - if (shr) { - /* shr rax, imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } else { - align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); - byte -= align; - bit_offset += align * 8; - - BUG_ON(bit_offset + bits > 96); - - /* mov rax, [rsi + byte] */ - I4(0x48, 0x8b, 0x46, byte); - - /* mov edx, [rsi + byte + 8] */ - I3(0x8b, 0x56, byte + 8); - - /* bits from next word: */ - shr = bit_offset + bits - 64; - BUG_ON(shr > bit_offset); - - /* shr rax, bit_offset */ - I4(0x48, 0xc1, 0xe8, shr); - - /* shl rdx, imm8 */ - I4(0x48, 0xc1, 0xe2, 64 - shr); - - /* or rax, rdx */ - I3(0x48, 0x09, 0xd0); - - shr = bit_offset - shr; - - if (shr) { - /* shr rax, 
imm8 */ - I4(0x48, 0xc1, 0xe8, shr); - } - } - - /* rax += offset: */ - if (offset > S32_MAX) { - /* mov rdx, imm64 */ - I2(0x48, 0xba); - memcpy(out, &offset, 8); - out += 8; - /* add %rdx, %rax */ - I3(0x48, 0x01, 0xd0); - } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { - /* add rax, imm32 */ - I2(0x48, 0x05); - memcpy(out, &offset, 4); - out += 4; - } else if (offset) { - /* add eax, imm32 */ - I1(0x05); - memcpy(out, &offset, 4); - out += 4; - } -set_field: - switch (dst_size) { - case 8: - /* mov [rdi + dst_offset], rax */ - I4(0x48, 0x89, 0x47, dst_offset); - break; - case 4: - /* mov [rdi + dst_offset], eax */ - I3(0x89, 0x47, dst_offset); - break; - default: - BUG(); - } - - return out; -} - -int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -{ - bool eax_zeroed = false; - u8 *out = _out; - - /* - * rdi: dst - unpacked key - * rsi: src - packed key - */ - - /* k->u64s, k->format, k->type */ - - /* mov eax, [rsi] */ - I2(0x8b, 0x06); - - /* add eax, BKEY_U64s - format->key_u64s */ - I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); - - /* and eax, imm32: mask out k->pad: */ - I5(0x25, 0xff, 0xff, 0xff, 0); - - /* mov [rdi], eax */ - I2(0x89, 0x07); - -#define x(id, field) \ - out = compile_bkey_field(format, out, id, \ - offsetof(struct bkey, field), \ - sizeof(((struct bkey *) NULL)->field), \ - &eax_zeroed); - bkey_fields() -#undef x - - /* retq */ - I1(0xc3); - - return (void *) out - _out; -} - -#else -#endif - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); -} - -__pure __flatten -int bch2_bkey_cmp_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r); -} - -__pure __flatten -int __bch2_bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - const struct bkey *l_unpacked; - - return unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bpos_cmp(l_unpacked->p, *r) - : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -void bch2_bpos_swab(struct bpos *p) -{ - u8 *l = (u8 *) p; - u8 *h = ((u8 *) &p[1]) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -{ - const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; - u8 *l = k->key_start; - u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; - - while (l < h) { - swap(*l, *h); - l++; - --h; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void) -{ - struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); - struct bkey_packed p; - - struct bkey_format test_format = { - .key_u64s = 3, - .nr_fields = BKEY_NR_FIELDS, - .bits_per_field = { - 13, - 64, - 32, - }, - }; - - struct unpack_state in_s = - unpack_state_init(&bch2_bkey_format_current, (void *) &t); - struct pack_state out_s = pack_state_init(&test_format, &p); - unsigned i; - - for (i = 0; i < out_s.format->nr_fields; i++) { - u64 a, v = get_inc_field(&in_s, i); - - switch (i) { -#define x(id, field) case id: a = t.field; break; - bkey_fields() -#undef x - default: - BUG(); - } - - if (a != v) - panic("got %llu actual %llu i %u\n", v, a, i); - - if (!set_inc_field(&out_s, i, v)) - panic("failed at %u\n", i); - } - - BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -} -#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h deleted file mode 100644 index 3ccd521c190ac7..00000000000000 --- a/fs/bcachefs/bkey.h +++ /dev/null @@ -1,605 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_H -#define _BCACHEFS_BKEY_H - -#include -#include "bcachefs_format.h" -#include "bkey_types.h" -#include "btree_types.h" -#include "util.h" -#include "vstructs.h" - -#if 0 - -/* - * compiled unpack functions are disabled, pending a new interface for - * dynamically allocating executable memory: - */ - -#ifdef CONFIG_X86_64 -#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -#endif -#endif - -void bch2_bkey_packed_to_binary_text(struct printbuf *, - const struct bkey_format *, - const struct bkey_packed *); - -enum bkey_lr_packed { - BKEY_PACKED_BOTH, - BKEY_PACKED_RIGHT, - BKEY_PACKED_LEFT, - BKEY_PACKED_NONE, -}; - -#define bkey_lr_packed(_l, _r) \ - ((_l)->format + ((_r)->format << 1)) - -static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) -{ - memcpy_u64s_small(dst, src, src->u64s); -} - -static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) -{ - memcpy_u64s_small(dst, src, src->k.u64s); -} - -struct btree; - -__pure -unsigned bch2_bkey_greatest_differing_bit(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -__pure -unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); - -__pure -int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -__pure -int bch2_bkey_cmp_packed(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -__pure -int __bch2_bkey_cmp_left_packed(const struct btree *, - const struct bkey_packed *, - const struct bpos *); - -static inline __pure -int bkey_cmp_left_packed(const struct btree *b, - const struct bkey_packed *l, const struct bpos *r) -{ - return __bch2_bkey_cmp_left_packed(b, l, r); -} - -/* - * The compiler generates better code when we pass bpos by ref, but it's often - * enough terribly convenient to pass it by val... 
as much as I hate c++, const - * ref would be nice here: - */ -__pure __flatten -static inline int bkey_cmp_left_packed_byval(const struct btree *b, - const struct bkey_packed *l, - struct bpos r) -{ - return bkey_cmp_left_packed(b, l, &r); -} - -static __always_inline bool bpos_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset) | - (l.snapshot ^ r.snapshot)); -} - -static __always_inline bool bpos_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; -} - -static __always_inline bool bpos_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode ? l.inode < r.inode : - l.offset != r.offset ? l.offset < r.offset : - l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; -} - -static __always_inline bool bpos_gt(struct bpos l, struct bpos r) -{ - return bpos_lt(r, l); -} - -static __always_inline bool bpos_ge(struct bpos l, struct bpos r) -{ - return bpos_le(r, l); -} - -static __always_inline int bpos_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset) ?: - cmp_int(l.snapshot, r.snapshot); -} - -static inline struct bpos bpos_min(struct bpos l, struct bpos r) -{ - return bpos_lt(l, r) ? l : r; -} - -static inline struct bpos bpos_max(struct bpos l, struct bpos r) -{ - return bpos_gt(l, r) ? l : r; -} - -static __always_inline bool bkey_eq(struct bpos l, struct bpos r) -{ - return !((l.inode ^ r.inode) | - (l.offset ^ r.offset)); -} - -static __always_inline bool bkey_lt(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset < r.offset; -} - -static __always_inline bool bkey_le(struct bpos l, struct bpos r) -{ - return l.inode != r.inode - ? l.inode < r.inode - : l.offset <= r.offset; -} - -static __always_inline bool bkey_gt(struct bpos l, struct bpos r) -{ - return bkey_lt(r, l); -} - -static __always_inline bool bkey_ge(struct bpos l, struct bpos r) -{ - return bkey_le(r, l); -} - -static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -{ - return cmp_int(l.inode, r.inode) ?: - cmp_int(l.offset, r.offset); -} - -static inline struct bpos bkey_min(struct bpos l, struct bpos r) -{ - return bkey_lt(l, r) ? l : r; -} - -static inline struct bpos bkey_max(struct bpos l, struct bpos r) -{ - return bkey_gt(l, r) ? l : r; -} - -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - l.k->size == r.k->size && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - -void bch2_bpos_swab(struct bpos *); -void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); - -static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -{ - return cmp_int(l.hi, r.hi) ?: - cmp_int(l.lo, r.lo); -} - -#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) - -static __always_inline bool bversion_zero(struct bversion v) -{ - return bversion_cmp(v, ZERO_VERSION) == 0; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -/* statement expressions confusing unlikely()? 
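For readers unfamiliar with the idioms in the comparison helpers above, here is a minimal standalone sketch of the same two patterns: branch-free equality via OR-ed XORs, and lexicographic ordering via a chained three-way compare. struct pos and cmp_u64() are illustrative stand-ins, not bcachefs API.

	/* Sketch only: mirrors the bpos_eq()/bpos_cmp() idioms above. */
	#include <stdbool.h>
	#include <stdint.h>

	struct pos { uint64_t inode, offset; uint32_t snapshot; };

	static inline int cmp_u64(uint64_t l, uint64_t r)
	{
		return (l > r) - (l < r);	/* same contract as the kernel's cmp_int() */
	}

	static inline bool pos_eq(struct pos l, struct pos r)
	{
		/* OR the XORs so equality needs no short-circuit branches */
		return !((l.inode ^ r.inode) |
			 (l.offset ^ r.offset) |
			 ((uint64_t) l.snapshot ^ r.snapshot));
	}

	static inline int pos_cmp(struct pos l, struct pos r)
	{
		/* lexicographic: inode, then offset, then snapshot */
		int c = cmp_u64(l.inode, r.inode);
		if (!c) c = cmp_u64(l.offset, r.offset);
		if (!c) c = cmp_u64(l.snapshot, r.snapshot);
		return c;
	}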
*/ -#define bkey_packed(_k) \ - ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ - (_k)->format != KEY_FORMAT_CURRENT; }) -#else -#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -#endif - -/* - * It's safe to treat an unpacked bkey as a packed one, but not the reverse - */ -static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -{ - return (struct bkey_packed *) k; -} - -static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -{ - return (const struct bkey_packed *) k; -} - -static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (struct bkey_i *) k; -} - -static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -{ - return bkey_packed(k) ? NULL : (const struct bkey *) k; -} - -static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -{ - return format->bits_per_field[BKEY_FIELD_INODE] + - format->bits_per_field[BKEY_FIELD_OFFSET] + - format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -} - -static inline struct bpos bpos_successor(struct bpos p) -{ - if (!++p.snapshot && - !++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_predecessor(struct bpos p) -{ - if (!p.snapshot-- && - !p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_successor(struct bpos p) -{ - p.snapshot = 0; - - if (!++p.offset && - !++p.inode) - BUG(); - - return p; -} - -static inline struct bpos bpos_nosnap_predecessor(struct bpos p) -{ - p.snapshot = 0; - - if (!p.offset-- && - !p.inode--) - BUG(); - - return p; -} - -static inline u64 bkey_start_offset(const struct bkey *k) -{ - return k->p.offset - k->size; -} - -static inline struct bpos bkey_start_pos(const struct bkey *k) -{ - return (struct bpos) { - .inode = k->p.inode, - .offset = bkey_start_offset(k), - .snapshot = k->p.snapshot, - }; -} - -/* Packed helpers */ - -static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -} - -static inline bool bkeyp_u64s_valid(const struct bkey_format *f, - const struct bkey_packed *k) -{ - return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); -} - -static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_key_u64s(format, k) * sizeof(u64); -} - -static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return k->u64s - bkeyp_key_u64s(format, k); -} - -static inline size_t bkeyp_val_bytes(const struct bkey_format *format, - const struct bkey_packed *k) -{ - return bkeyp_val_u64s(format, k) * sizeof(u64); -} - -static inline void set_bkeyp_val_u64s(const struct bkey_format *format, - struct bkey_packed *k, unsigned val_u64s) -{ - k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -} - -#define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) - -extern const struct bkey_format bch2_bkey_format_current; - -bool bch2_bkey_transform(const struct bkey_format *, - struct bkey_packed *, - const struct bkey_format *, - const struct bkey_packed *); - -struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, - const struct bkey_packed *); - -#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -struct bpos __bkey_unpack_pos(const struct bkey_format *, - const struct bkey_packed *); -#endif - -bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, - const struct bkey_format *); - -enum bkey_pack_pos_ret { - BKEY_PACK_POS_EXACT, - BKEY_PACK_POS_SMALLER, - BKEY_PACK_POS_FAIL, -}; - -enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, - const struct btree *); - -static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, - const struct btree *b) -{ - return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -} - -void bch2_bkey_unpack(const struct btree *, struct bkey_i *, - const struct bkey_packed *); -bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, - const struct bkey_format *); - -typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); - -static inline void -__bkey_unpack_key_format_checked(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { - compiled_unpack_fn unpack_fn = b->aux_data; - unpack_fn(dst, src); - - if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { - struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); - } - } else { - *dst = __bch2_bkey_unpack_key(&b->format, src); - } -} - -static inline struct bkey -bkey_unpack_key_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ - struct bkey dst; - - __bkey_unpack_key_format_checked(b, &dst, src); - return dst; -} - -static inline void __bkey_unpack_key(const struct btree *b, - struct bkey *dst, - const struct bkey_packed *src) -{ - if (likely(bkey_packed(src))) - __bkey_unpack_key_format_checked(b, dst, src); - else - *dst = *packed_to_bkey_c(src); -} - -/** - * bkey_unpack_key -- unpack just the key, not the value - */ -static inline struct bkey bkey_unpack_key(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? 
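As background for the pack/unpack interface above: each key field is stored biased by a per-format field_offset, in bits_per_field bits, and packing fails when a value falls outside that window. The sketch below shows the arithmetic for a single field under those assumptions; field_pack()/field_unpack() are illustrative names, not the bcachefs helpers.

	/* Sketch of per-field biased packing; not bcachefs code. */
	#include <stdbool.h>
	#include <stdint.h>

	static inline bool field_pack(uint64_t v, uint64_t field_offset,
				      unsigned bits, uint64_t *packed)
	{
		if (v < field_offset)
			return false;		/* below the representable range */
		v -= field_offset;
		if (bits < 64 && (v >> bits))
			return false;		/* doesn't fit in bits_per_field */
		*packed = v;
		return true;
	}

	static inline uint64_t field_unpack(uint64_t packed, uint64_t field_offset)
	{
		return field_offset + packed;
	}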
bkey_unpack_key_format_checked(b, src) - : *packed_to_bkey_c(src); -} - -static inline struct bpos -bkey_unpack_pos_format_checked(const struct btree *b, - const struct bkey_packed *src) -{ -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - return bkey_unpack_key_format_checked(b, src).p; -#else - return __bkey_unpack_pos(&b->format, src); -#endif -} - -static inline struct bpos bkey_unpack_pos(const struct btree *b, - const struct bkey_packed *src) -{ - return likely(bkey_packed(src)) - ? bkey_unpack_pos_format_checked(b, src) - : packed_to_bkey_c(src)->p; -} - -/* Disassembled bkeys */ - -static inline struct bkey_s_c bkey_disassemble(const struct btree *b, - const struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -} - -/* non const version: */ -static inline struct bkey_s __bkey_disassemble(const struct btree *b, - struct bkey_packed *k, - struct bkey *u) -{ - __bkey_unpack_key(b, u, k); - - return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -} - -static inline u64 bkey_field_max(const struct bkey_format *f, - enum bch_bkey_fields nr) -{ - return f->bits_per_field[nr] < 64 - ? (le64_to_cpu(f->field_offset[nr]) + - ~(~0ULL << f->bits_per_field[nr])) - : U64_MAX; -} - -#ifdef HAVE_BCACHEFS_COMPILED_UNPACK - -int bch2_compile_bkey_format(const struct bkey_format *, void *); - -#else - -static inline int bch2_compile_bkey_format(const struct bkey_format *format, - void *out) { return 0; } - -#endif - -static inline void bkey_reassemble(struct bkey_i *dst, - struct bkey_s_c src) -{ - dst->k = *src.k; - memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -} - -/* byte order helpers */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return f->key_u64s - 1; -} - -#define high_bit_offset 0 -#define nth_word(p, n) ((p) - (n)) - -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - -static inline unsigned high_word_offset(const struct bkey_format *f) -{ - return 0; -} - -#define high_bit_offset KEY_PACKED_BITS_START -#define nth_word(p, n) ((p) + (n)) - -#else -#error edit for your odd byteorder. 
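The bkey_field_max() helper above computes field_offset plus a low mask of bits_per_field ones, special-casing 64-bit fields. A standalone sketch of that calculation follows; the bits == 64 guard matters because ~0ULL << 64 is undefined behaviour in C, and the saturation here is an illustrative simplification.

	/* Sketch of the max-representable-field-value calculation. */
	#include <stdint.h>

	static inline uint64_t low_mask(unsigned bits)
	{
		return bits < 64 ? ~(~0ULL << bits) : ~0ULL;
	}

	static inline uint64_t field_max(uint64_t field_offset, unsigned bits)
	{
		uint64_t m = low_mask(bits);

		/* saturate rather than wrap if offset + mask overflows */
		return field_offset <= UINT64_MAX - m
			? field_offset + m
			: UINT64_MAX;
	}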
-#endif - -#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) -#define next_word(p) nth_word(p, 1) -#define prev_word(p) nth_word(p, -1) - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_bkey_pack_test(void); -#else -static inline void bch2_bkey_pack_test(void) {} -#endif - -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, bversion.hi) \ - x(BKEY_FIELD_VERSION_LO, bversion.lo) - -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch2_bkey_format_init(struct bkey_format_state *); - -static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x -} - -void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); - -static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) -{ - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) - return true; - - if ((f_bits == unpacked_bits) && field_offset) - return true; - - u64 f_mask = f_bits - ? ~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - return false; -} - -int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bch_validate_flags, struct printbuf *); -void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); - -#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h deleted file mode 100644 index a30c4ae8eb369d..00000000000000 --- a/fs/bcachefs/bkey_buf.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_BUF_H -#define _BCACHEFS_BKEY_BUF_H - -#include "bcachefs.h" -#include "bkey.h" - -struct bkey_buf { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_buf_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bch2_bkey_buf_copy(struct bkey_buf *s, - struct bch_fs *c, - struct bkey_i *src) -{ - bch2_bkey_buf_realloc(s, c, src->k.u64s); - bkey_copy(s->k, src); -} - -static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, - struct bch_fs *c, - struct btree *b, - struct bkey_packed *src) -{ - bch2_bkey_buf_realloc(s, c, BKEY_U64s + - bkeyp_val_u64s(&b->format, src)); - bch2_bkey_unpack(b, s->k, src); -} - -static inline void bch2_bkey_buf_init(struct bkey_buf *s) -{ - s->k = (void *) s->onstack; -} - -static inline void 
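The bkey_format_state above accumulates per-field minima and maxima; a format can then be derived by using the observed minimum as field_offset and the width of (max - min) as bits_per_field. The sketch below shows that derivation for one field under those assumptions only; the real bch2_bkey_format_done() does additional sizing and validation not shown here.

	/* Sketch of deriving one field's format from tracked min/max. */
	#include <stdint.h>

	static inline unsigned bits_for(uint64_t v)	/* like fls64() */
	{
		unsigned b = 0;

		while (v) {
			b++;
			v >>= 1;
		}
		return b;
	}

	struct field_fmt { uint64_t field_offset; unsigned bits_per_field; };

	static inline struct field_fmt field_fmt_done(uint64_t min, uint64_t max)
	{
		return (struct field_fmt) {
			.field_offset	= min,
			.bits_per_field	= bits_for(max - min),
		};
	}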
bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h deleted file mode 100644 index 5f42a6e693606b..00000000000000 --- a/fs/bcachefs/bkey_cmp.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_CMP_H -#define _BCACHEFS_BKEY_CMP_H - -#include "bkey.h" - -#ifdef CONFIG_X86_64 -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - long d0, d1, d2, d3; - int cmp; - - /* we shouldn't need asm for this, but gcc is being retarded: */ - - asm(".intel_syntax noprefix;" - "xor eax, eax;" - "xor edx, edx;" - "1:;" - "mov r8, [rdi];" - "mov r9, [rsi];" - "sub ecx, 64;" - "jl 2f;" - - "cmp r8, r9;" - "jnz 3f;" - - "lea rdi, [rdi - 8];" - "lea rsi, [rsi - 8];" - "jmp 1b;" - - "2:;" - "not ecx;" - "shr r8, 1;" - "shr r9, 1;" - "shr r8, cl;" - "shr r9, cl;" - "cmp r8, r9;" - - "3:\n" - "seta al;" - "setb dl;" - "sub eax, edx;" - ".att_syntax prefix;" - : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) - : "0" (l), "1" (r), "3" (nr_key_bits) - : "r8", "r9", "cc", "memory"); - - return cmp; -} -#else -static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, - unsigned nr_key_bits) -{ - u64 l_v, r_v; - - if (!nr_key_bits) - return 0; - - /* for big endian, skip past header */ - nr_key_bits += high_bit_offset; - l_v = *l & (~0ULL >> high_bit_offset); - r_v = *r & (~0ULL >> high_bit_offset); - - while (1) { - if (nr_key_bits < 64) { - l_v >>= 64 - nr_key_bits; - r_v >>= 64 - nr_key_bits; - nr_key_bits = 0; - } else { - nr_key_bits -= 64; - } - - if (!nr_key_bits || l_v != r_v) - break; - - l = next_word(l); - r = next_word(r); - - l_v = *l; - r_v = *r; - } - - return cmp_int(l_v, r_v); -} -#endif - -static inline __pure __flatten -int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) -{ - const struct bkey_format *f = &b->format; - int ret; - - EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); - EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); - - ret = __bkey_cmp_bits(high_word(f, l), - high_word(f, r), - b->nr_key_bits); - - EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), - bkey_unpack_pos(b, r))); - return ret; -} - -static inline __pure __flatten -int bch2_bkey_cmp_packed_inlined(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - struct bkey unpacked; - - if (likely(bkey_packed(l) && bkey_packed(r))) - return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); - - if (bkey_packed(l)) { - __bkey_unpack_key_format_checked(b, &unpacked, l); - l = (void *) &unpacked; - } else if (bkey_packed(r)) { - __bkey_unpack_key_format_checked(b, &unpacked, r); - r = (void *) &unpacked; - } - - return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); -} - -#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c deleted file mode 100644 index fcd8c82cba4f6f..00000000000000 --- a/fs/bcachefs/bkey_methods.c +++ /dev/null @@ -1,497 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_types.h" -#include "alloc_background.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include 
"inode.h" -#include "io_misc.h" -#include "lru.h" -#include "quota.h" -#include "reflink.h" -#include "snapshot.h" -#include "subvolume.h" -#include "xattr.h" - -const char * const bch2_bkey_types[] = { -#define x(name, nr, ...) #name, - BCH_BKEY_TYPES() -#undef x - NULL -}; - -static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_validate = deleted_key_validate, \ -}) - -static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k), - c, bkey_val_size_nonzero, - "incorrect value size (%zu != 0)", - bkey_val_bytes(k.k)); -fsck_err: - return ret; -} - -#define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); - - prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); -} - -#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_validate = key_type_cookie_validate, \ - .val_to_text = key_type_cookie_to_text, \ - .min_val_size = 8, \ -}) - -#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_validate = empty_val_key_validate, \ -}) - -static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "datalen %u: %*phN", - datalen, min(datalen, 32U), d.v->data); -} - -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_validate = key_type_inline_data_validate, \ - .val_to_text = key_type_inline_data_to_text, \ -}) - -static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -#define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_validate = empty_val_key_validate, \ - .key_merge = key_type_set_merge, \ -}) - -const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr, ...) 
[KEY_TYPE_##name] = bch2_bkey_ops_##name, - BCH_BKEY_TYPES() -#undef x -}; - -const struct bkey_ops bch2_bkey_null_ops = { -}; - -int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - int ret = 0; - - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, - c, bkey_val_size_too_small, - "bad val size (%zu < %u)", - bkey_val_bytes(k.k), ops->min_val_size); - - if (!ops->key_validate) - return 0; - - ret = ops->key_validate(c, k, from); -fsck_err: - return ret; -} - -static u64 bch2_key_types_allowed[] = { - [BKEY_TYPE_btree] = - BIT_ULL(KEY_TYPE_deleted)| - BIT_ULL(KEY_TYPE_btree_ptr)| - BIT_ULL(KEY_TYPE_btree_ptr_v2), -#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, - BCH_BTREE_IDS() -#undef x -}; - -static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { -#define x(name, nr, flags) [KEY_TYPE_##name] = flags, - BCH_BKEY_TYPES() -#undef x -}; - -const char *bch2_btree_node_type_str(enum btree_node_type type) -{ - return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); -} - -int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - enum btree_node_type type = __btree_node_type(from.level, from.btree); - - if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) - return 0; - - int ret = 0; - - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, - c, bkey_u64s_too_small, - "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - - if (type >= BKEY_TYPE_NR) - return 0; - - enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX - ? bch2_bkey_type_flags[k.k->type] - : 0; - - bool strict_key_type_allowed = - (from.flags & BCH_VALIDATE_commit) || - type == BKEY_TYPE_btree || - (from.btree < BTREE_ID_NR && - (bkey_flags & BKEY_TYPE_strict_btree_checks)); - - bkey_fsck_err_on(strict_key_type_allowed && - k.k->type < KEY_TYPE_MAX && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), - c, bkey_invalid_type_for_btree, - "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), - k.k->type < KEY_TYPE_MAX - ? 
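Both bch2_bkey_types[] and bch2_bkey_ops[] above are generated from a single X-macro list, which keeps the enum, the name table, and the dispatch table in sync by construction. A minimal self-contained example of the pattern, with a made-up MY_TYPES() list:

	/* Standalone X-macro demo; MY_TYPES() is hypothetical. */
	#include <stdio.h>

	#define MY_TYPES()	\
		x(deleted)	\
		x(cookie)	\
		x(extent)

	enum my_type {
	#define x(name) TYPE_##name,
		MY_TYPES()
	#undef x
		TYPE_MAX
	};

	static const char * const my_type_names[] = {
	#define x(name) #name,
		MY_TYPES()
	#undef x
	};

	int main(void)
	{
		for (int i = 0; i < TYPE_MAX; i++)
			printf("%d: %s\n", i, my_type_names[i]);
		return 0;
	}

Adding a type then means editing one list; the compiler regenerates every table that consumes it.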
bch2_bkey_types[k.k->type] - : "(unknown)"); - - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, - c, bkey_extent_size_zero, - "size == 0"); - - bkey_fsck_err_on(k.k->size > k.k->p.offset, - c, bkey_extent_size_greater_than_offset, - "size greater than offset (%u > %llu)", - k.k->size, k.k->p.offset); - } else { - bkey_fsck_err_on(k.k->size, - c, bkey_size_nonzero, - "size != 0"); - } - - if (type != BKEY_TYPE_btree) { - enum btree_id btree = type - 1; - - if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, - c, bkey_snapshot_zero, - "snapshot == 0"); - } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, - c, bkey_snapshot_nonzero, - "nonzero snapshot"); - } else { - /* - * btree uses snapshot field but it's not required to be - * nonzero - */ - } - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), - c, bkey_at_pos_max, - "key at POS_MAX"); - } -fsck_err: - return ret; -} - -int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return __bch2_bkey_validate(c, k, from) ?: - bch2_bkey_val_validate(c, k, from); -} - -int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), - c, bkey_before_start_of_btree_node, - "key before start of btree node"); - - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), - c, bkey_after_end_of_btree_node, - "key past end of btree node"); -fsck_err: - return ret; -} - -void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -{ - if (bpos_eq(pos, POS_MIN)) - prt_printf(out, "POS_MIN"); - else if (bpos_eq(pos, POS_MAX)) - prt_printf(out, "POS_MAX"); - else if (bpos_eq(pos, SPOS_MAX)) - prt_printf(out, "SPOS_MAX"); - else { - if (pos.inode == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.inode); - prt_printf(out, ":"); - if (pos.offset == U64_MAX) - prt_printf(out, "U64_MAX"); - else - prt_printf(out, "%llu", pos.offset); - prt_printf(out, ":"); - if (pos.snapshot == U32_MAX) - prt_printf(out, "U32_MAX"); - else - prt_printf(out, "%u", pos.snapshot); - } -} - -void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -{ - if (k) { - prt_printf(out, "u64s %u type ", k->u64s); - - if (k->type < KEY_TYPE_MAX) - prt_printf(out, "%s ", bch2_bkey_types[k->type]); - else - prt_printf(out, "%u ", k->type); - - bch2_bpos_to_text(out, k->p); - - prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); - } else { - prt_printf(out, "(null)"); - } -} - -void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); -} - -void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_to_text(out, k.k); - - if (bkey_val_bytes(k.k)) { - prt_printf(out, ": "); - bch2_val_to_text(out, c, k); - } -} - -void bch2_bkey_swab_val(struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - if (ops->swab) - ops->swab(k); -} - -bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - - return ops->key_normalize - ? 
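The bch2_key_types_allowed[] check above is a 64-bit set membership test: one u64 bitmask per btree, one bit per key type, which inherently caps the scheme at 64 key types. A small sketch with hypothetical btree indices:

	/* Sketch of per-btree allowed-type bitmasks; values are made up. */
	#include <stdbool.h>
	#include <stdint.h>

	#define BIT_U64(n)	(1ULL << (n))

	static const uint64_t allowed[2] = {
		BIT_U64(1) | BIT_U64(2),	/* hypothetical "extents" btree */
		BIT_U64(3),			/* hypothetical "inodes" btree */
	};

	static inline bool type_allowed(unsigned btree, unsigned type)
	{
		return btree < 2 && type < 64 &&
			(allowed[btree] & BIT_U64(type));
	}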
ops->key_normalize(c, k) - : false; -} - -bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); - - return ops->key_merge && - bch2_bkey_maybe_mergable(l.k, r.k) && - (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !static_branch_unlikely(&bch2_key_merging_disabled) && - ops->key_merge(c, l, r); -} - -static const struct old_bkey_type { - u8 btree_node_type; - u8 old; - u8 new; -} bkey_renumber_table[] = { - {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, - {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, - {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, - {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, - {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, - {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, - {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, - {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, -}; - -void bch2_bkey_renumber(enum btree_node_type btree_node_type, - struct bkey_packed *k, - int write) -{ - const struct old_bkey_type *i; - - for (i = bkey_renumber_table; - i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); - i++) - if (btree_node_type == i->btree_node_type && - k->type == (write ? i->new : i->old)) { - k->type = write ? i->old : i->new; - break; - } -} - -void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - const struct bkey_ops *ops; - struct bkey uk; - unsigned nr_compat = 5; - int i; - - /* - * Do these operations in reverse order in the write path: - */ - - for (i = 0; i < nr_compat; i++) - switch (!write ? i : nr_compat - 1 - i) { - case 0: - if (big_endian != CPU_BIG_ENDIAN) { - bch2_bkey_swab_key(f, k); - } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - bch2_bkey_swab_key(f, k); - bch2_bkey_swab_key(f, k); - } - break; - case 1: - if (version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); - break; - case 2: - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) { - if (!bkey_packed(k)) { - struct bkey_i *u = packed_to_bkey(k); - - swap(u->k.p.inode, u->k.p.offset); - } else if (f->bits_per_field[BKEY_FIELD_INODE] && - f->bits_per_field[BKEY_FIELD_OFFSET]) { - struct bkey_format tmp = *f, *in = f, *out = &tmp; - - swap(tmp.bits_per_field[BKEY_FIELD_INODE], - tmp.bits_per_field[BKEY_FIELD_OFFSET]); - swap(tmp.field_offset[BKEY_FIELD_INODE], - tmp.field_offset[BKEY_FIELD_OFFSET]); - - if (!write) - swap(in, out); - - uk = __bch2_bkey_unpack_key(in, k); - swap(uk.p.inode, uk.p.offset); - BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); - } - } - break; - case 3: - if (version < bcachefs_metadata_version_snapshot && - (level || btree_type_has_snapshots(btree_id))) { - struct bkey_i *u = packed_to_bkey(k); - - if (u) { - u->k.p.snapshot = write - ? 0 : U32_MAX; - } else { - u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); - u64 max_packed = min_packed + - ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); - - uk = __bch2_bkey_unpack_key(f, k); - uk.p.snapshot = write - ? 
min_packed : min_t(u64, U32_MAX, max_packed); - - BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); - } - } - - break; - case 4: { - struct bkey_s u; - - if (!bkey_packed(k)) { - u = bkey_i_to_s(packed_to_bkey(k)); - } else { - uk = __bch2_bkey_unpack_key(f, k); - u.k = &uk; - u.v = bkeyp_val(f, k); - } - - if (big_endian != CPU_BIG_ENDIAN) - bch2_bkey_swab_val(u); - - ops = bch2_bkey_type_ops(k->type); - - if (ops->compat) - ops->compat(btree_id, version, big_endian, write, u); - break; - } - default: - BUG(); - } -} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h deleted file mode 100644 index bf34111cdf008d..00000000000000 --- a/fs/bcachefs/bkey_methods.h +++ /dev/null @@ -1,139 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_METHODS_H -#define _BCACHEFS_BKEY_METHODS_H - -#include "bkey.h" - -struct bch_fs; -struct btree; -struct btree_trans; -struct bkey; -enum btree_node_type; - -extern const char * const bch2_bkey_types[]; -extern const struct bkey_ops bch2_bkey_null_ops; - -/* - * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If - * invalid, entire key will be deleted. - * - * When invalid, error string is returned via @err. @rw indicates whether key is - * being read or written; more aggressive checks can be enabled when rw == WRITE. - */ -struct bkey_ops { - int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from); - void (*val_to_text)(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - void (*swab)(struct bkey_s); - bool (*key_normalize)(struct bch_fs *, struct bkey_s); - bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - void (*compat)(enum btree_id id, unsigned version, - unsigned big_endian, int write, - struct bkey_s); - - /* Size of value type when first created: */ - unsigned min_val_size; -}; - -extern const struct bkey_ops bch2_bkey_ops[]; - -static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) -{ - return likely(type < KEY_TYPE_MAX) - ? 
&bch2_bkey_ops[type] - : &bch2_bkey_null_ops; -} - -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - struct bkey_validate_context from); - -void bch2_bpos_to_text(struct printbuf *, struct bpos); -void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -void bch2_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -void bch2_bkey_swab_val(struct bkey_s); - -bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); - -static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) -{ - return l->type == r->type && - !bversion_cmp(l->bversion, r->bversion) && - bpos_eq(l->p, bkey_start_pos(r)); -} - -bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -static inline int bch2_key_trigger(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->trigger - ? ops->trigger(trans, btree, level, old, new, flags) - : 0; -} - -static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = old.k->p; - - return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_overwrite|flags); -} - -static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = new.k->p; - - return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_insert|flags); -} - -void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); - -void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, - int, struct bkey_format *, struct bkey_packed *); - -static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct bkey_format *f, - struct bkey_packed *k) -{ - if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN || - IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - __bch2_bkey_compat(level, btree_id, version, - big_endian, write, f, k); - -} - -#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c deleted file mode 100644 index 4536eb50fc4064..00000000000000 --- a/fs/bcachefs/bkey_sort.c +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_cmp.h" -#include "bkey_sort.h" -#include "bset.h" -#include "extents.h" - -typedef int (*sort_cmp_fn)(const struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -static inline bool sort_iter_end(struct sort_iter *iter) -{ - return !iter->used; -} - -static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = 
from; - i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - sort_iter_sift(iter, i, cmp); -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return !sort_iter_end(iter) ? iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - struct sort_iter_set *i = iter->data; - - BUG_ON(!iter->used); - - i->k = bkey_p_next(i->k); - - BUG_ON(i->k > i->end); - - if (i->k == i->end) - array_remove_item(iter->data, iter->used, 0); - else - sort_iter_sift(iter, 0, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -/* - * If keys compare equal, compare by pointer order: - */ -static inline int key_sort_fix_overlapping_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) ?: - cmp_int((unsigned long) l, (unsigned long) r); -} - -static inline bool should_drop_next_key(struct sort_iter *iter) -{ - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older - * and should be dropped. - */ - return iter->used >= 2 && - !bch2_bkey_cmp_packed(iter->b, - iter->data[0].k, - iter->data[1].k); -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct bkey_packed *out = dst->start; - struct bkey_packed *k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - sort_iter_sort(iter, key_sort_fix_overlapping_cmp); - - while ((k = sort_iter_peek(iter))) { - if (!bkey_deleted(k) && - !should_drop_next_key(iter)) { - bkey_p_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - sort_iter_advance(iter, key_sort_fix_overlapping_cmp); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort + repack in a new format: */ -struct btree_nr_keys -bch2_sort_repack(struct bset *dst, struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_deleted(in)) - continue; - - if (!transform) - bkey_p_copy(out, in); - else if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? 
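The sort_iter machinery above is a k-way merge that keeps its sources ordered by their current key and re-sifts the front source after each advance, exactly one insertion-sort pass per step. The sketch below reproduces that strategy over plain int runs; struct src and merge() are illustrative stand-ins for the bcachefs types.

	/* Sketch of the sift-based k-way merge used by sort_iter above. */
	#include <stddef.h>

	struct src { const int *k, *end; };

	static void sift(struct src *s, size_t n, size_t i)
	{
		for (; i + 1 < n && *s[i].k > *s[i + 1].k; i++) {
			struct src tmp = s[i];

			s[i] = s[i + 1];
			s[i + 1] = tmp;
		}
	}

	/* merge n ascending runs into out[]; returns keys written */
	static size_t merge(struct src *s, size_t n, int *out)
	{
		size_t used = n, written = 0;

		for (size_t i = used; i--;)	/* as in sort_iter_sort() */
			sift(s, used, i);

		while (used) {
			out[written++] = *s[0].k++;
			if (s[0].k == s[0].end)
				s[0] = s[--used];	/* drop exhausted run... */
			sift(s, used, 0);		/* ...and restore order */
		}
		return written;
	}

The real code additionally drops older duplicates (should_drop_next_key()) by exploiting the pointer-order tiebreak in the comparison; that refinement is omitted here.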
in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - out->needs_whiteout = false; - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_p_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed_inlined(b, l, r) ?: - (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (long) l - (long) r; -} - -#include "btree_update_interior.h" - -/* - * For sorting in the btree node write path: whiteouts not in the unwritten - * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are - * dropped if overwritten by real keys: - */ -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - - while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { - if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) - continue; - - if ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -/* - * Main sort routine for compacting a btree node in memory: we always drop - * whiteouts because any whiteouts that need to be written are in the unwritten - * whiteouts area: - */ -unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); - - while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { - if (bkey_deleted(in)) - continue; - - bkey_p_copy(out, in); - out = bkey_p_next(out); - } - - return (u64 *) out - (u64 *) dst; -} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h deleted file mode 100644 index 9be969d4689066..00000000000000 --- a/fs/bcachefs/bkey_sort.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_SORT_H -#define _BCACHEFS_BKEY_SORT_H - -struct sort_iter { - struct btree *b; - unsigned used; - unsigned size; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[]; -}; - -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) -{ - iter->b = b; - iter->used = 0; - iter->size = size; -} - -struct sort_iter_stack { - struct sort_iter iter; - struct sort_iter_set sets[MAX_BSETS + 1]; -}; - -static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) -{ - sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); -} - -static inline void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= iter->size); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct sort_iter *); - -struct btree_nr_keys -bch2_sort_repack(struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); - -unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); -unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); - -#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h deleted file mode 100644 
index b4f328f9853c60..00000000000000 --- a/fs/bcachefs/bkey_types.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_TYPES_H -#define _BCACHEFS_BKEY_TYPES_H - -#include "bcachefs_format.h" - -/* - * bkey_i - bkey with inline value - * bkey_s - bkey with split value - * bkey_s_c - bkey with split value, const - */ - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) 
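The bkey_s/bkey_s_c pair above relies on C11 anonymous union members so a mutable split view can be passed where a const view is expected without a cast. A minimal standalone sketch of that layout trick, with illustrative stand-in types:

	/* Sketch of the const/non-const split-view pattern (needs C11). */
	#include <stdio.h>

	struct key { unsigned u64s; };
	struct val { int x; };

	struct view_c {				/* const view */
		const struct key *k;
		const struct val *v;
	};

	struct view {				/* mutable view */
		union {
			struct {
				struct key *k;
				struct val *v;
			};
			struct view_c c;	/* safe upcast: drops mutability */
		};
	};

	static void show(struct view_c v)
	{
		printf("u64s %u x %d\n", v.k->u64s, v.v->x);
	}

	static void example(struct view v)
	{
		v.v->x = 1;	/* write through the mutable view */
		show(v.c);	/* hand the const view on, no cast */
	}

Going the other direction, const to mutable, is deliberately not provided; the union only ever widens to the safer type.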
\ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_silent = BIT(2), -}; - -#define BKEY_VALIDATE_CONTEXTS() \ - x(unknown) \ - x(superblock) \ - x(journal) \ - x(btree_root) \ - x(btree_node) \ - x(commit) - -struct bkey_validate_context { - enum { -#define x(n) BKEY_VALIDATE_##n, - BKEY_VALIDATE_CONTEXTS() -#undef x - } from:8; - enum bch_validate_flags flags:8; - u8 level; - enum btree_id btree; - bool root:1; - unsigned journal_offset; - u64 journal_seq; -}; - -#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c deleted file mode 100644 index 32841f762eb2e0..00000000000000 --- a/fs/bcachefs/bset.c +++ /dev/null @@ -1,1576 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for working with individual keys, and sorted sets of keys with in a - * btree 
node - * - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "btree_cache.h" -#include "bset.h" -#include "eytzinger.h" -#include "trace.h" -#include "util.h" - -#include -#include -#include -#include - -static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, - struct btree *); - -static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -{ - unsigned n = ARRAY_SIZE(iter->data); - - while (n && __btree_node_iter_set_end(iter, n - 1)) - --n; - - return n; -} - -struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -{ - return bch2_bkey_to_bset_inlined(b, k); -} - -/* - * There are never duplicate live keys in the btree - but including keys that - * have been flagged as deleted (and will be cleaned up later) we _will_ see - * duplicates. - * - * Thus the sort order is: usual key comparison first, but for keys that compare - * equal the deleted key(s) come first, and the (at most one) live version comes - * last. - * - * The main reason for this is insertion: to handle overwrites, we first iterate - * over keys that compare equal to our insert key, and then insert immediately - * prior to the first key greater than the key we're inserting - our insert - * position will be after all keys that compare equal to our insert key, which - * by the time we actually do the insert will all be deleted. - */ - -void bch2_dump_bset(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned set) -{ - struct bkey_packed *_k, *_n; - struct bkey uk, n; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - - if (!i->u64s) - return; - - for (_k = i->start; - _k < vstruct_last(i); - _k = _n) { - _n = bkey_p_next(_k); - - if (!_k->u64s) { - printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, - _k->_data - i->_data); - break; - } - - k = bkey_disassemble(b, _k, &uk); - - printbuf_reset(&buf); - if (c) - bch2_bkey_val_to_text(&buf, c, k); - else - bch2_bkey_to_text(&buf, k.k); - printk(KERN_ERR "block %u key %5zu: %s\n", set, - _k->_data - i->_data, buf.buf); - - if (_n == vstruct_last(i)) - continue; - - n = bkey_unpack_key(b, _n); - - if (bpos_lt(n.p, k.k->p)) { - printk(KERN_ERR "Key skipped backwards\n"); - continue; - } - - if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) - printk(KERN_ERR "Duplicate keys\n"); - } - - printbuf_exit(&buf); -} - -void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -{ - console_lock(); - for_each_bset(b, t) - bch2_dump_bset(c, b, bset(b, t), t - b->set); - console_unlock(); -} - -void bch2_dump_btree_node_iter(struct btree *b, - struct btree_node_iter *iter) -{ - struct btree_node_iter_set *set; - struct printbuf buf = PRINTBUF; - - printk(KERN_ERR "btree node iter with %u/%u sets:\n", - __btree_node_iter_used(iter), b->nsets); - - btree_node_iter_for_each(iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey uk = bkey_unpack_key(b, k); - - printbuf_reset(&buf); - bch2_bkey_to_text(&buf, &uk); - printk(KERN_ERR "set %zu key %u: %s\n", - t - b->set, set->k, buf.buf); - } - - printbuf_exit(&buf); -} - -struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) -{ - struct bkey_packed *k; - struct btree_nr_keys nr = {}; - - for_each_bset(b, t) - bset_tree_for_each_key(b, t, k) - if (!bkey_deleted(k)) - btree_keys_account_key_add(&nr, t - b->set, k); - return nr; -} - -void __bch2_verify_btree_nr_keys(struct btree *b) -{ - struct btree_nr_keys nr = bch2_btree_node_count_keys(b); - - 
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -} - -static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, - struct btree *b) -{ - struct btree_node_iter iter = *_iter; - const struct bkey_packed *k, *n; - - k = bch2_btree_node_iter_peek_all(&iter, b); - __bch2_btree_node_iter_advance(&iter, b); - n = bch2_btree_node_iter_peek_all(&iter, b); - - bkey_unpack_key(b, k); - - if (n && - bkey_iter_cmp(b, k, n) > 0) { - struct btree_node_iter_set *set; - struct bkey ku = bkey_unpack_key(b, k); - struct bkey nu = bkey_unpack_key(b, n); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &ku); - bch2_bkey_to_text(&buf2, &nu); - printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", - buf1.buf, buf2.buf); - printk(KERN_ERR "iter was:"); - - btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k2); - printk(" [%zi %zi]", t - b->set, - k2->_data - bset(b, t)->_data); - } - panic("\n"); - } -} - -void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - struct btree_node_iter_set *set, *s2; - struct bkey_packed *k, *p; - - if (bch2_btree_node_iter_end(iter)) - return; - - /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) { - BUG_ON(set->k > set->end); - btree_node_iter_for_each(iter, s2) - BUG_ON(set != s2 && set->end == s2->end); - } - - /* Verify that set->end is correct: */ - btree_node_iter_for_each(iter, set) { - for_each_bset(b, t) - if (set->end == t->end_offset) { - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); - goto found; - } - BUG(); -found: - do {} while (0); - } - - /* Verify iterator is sorted: */ - btree_node_iter_for_each(iter, set) - BUG_ON(set != iter->data && - btree_node_iter_cmp(b, set[-1], set[0]) > 0); - - k = bch2_btree_node_iter_peek_all(iter, b); - - for_each_bset(b, t) { - if (iter->data[0].end == t->end_offset) - continue; - - p = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - - BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); - } -} - -static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; -#if 0 - BUG_ON(prev && - bkey_iter_cmp(b, prev, insert) > 0); -#else - if (prev && - bkey_iter_cmp(b, prev, insert) > 0) { - struct bkey k1 = bkey_unpack_key(b, prev); - struct bkey k2 = bkey_unpack_key(b, insert); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("prev > insert:\n" - "prev key %s\n" - "insert key %s\n", - buf1.buf, buf2.buf); - } -#endif -#if 0 - BUG_ON(next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0); -#else - if (next != btree_bkey_last(b, t) && - bkey_iter_cmp(b, insert, next) > 0) { - struct bkey k1 = bkey_unpack_key(b, insert); - struct bkey k2 = bkey_unpack_key(b, next); - - bch2_dump_btree_node(NULL, b); - bch2_bkey_to_text(&buf1, &k1); - bch2_bkey_to_text(&buf2, &k2); - - panic("insert > next:\n" - "insert key %s\n" - "next key %s\n", - buf1.buf, buf2.buf); - } -#endif -} - -static inline void bch2_verify_insert_pos(struct btree *b, - struct bkey_packed 
*where, - struct bkey_packed *insert, - unsigned clobber_u64s) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_verify_insert_pos(b, where, insert, clobber_u64s); -} - - -/* Auxiliary search trees */ - -#define BFLOAT_FAILED_UNPACKED U8_MAX -#define BFLOAT_FAILED U8_MAX - -struct bkey_float { - u8 exponent; - u8 key_offset; - u16 mantissa; -}; -#define BKEY_MANTISSA_BITS 16 - -struct ro_aux_tree { - u8 nothing[0]; - struct bkey_float f[]; -}; - -struct rw_aux_tree { - u16 offset; - struct bpos k; -}; - -static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -{ - BUG_ON(t->aux_data_offset == U16_MAX); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return t->aux_data_offset; - case BSET_RO_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); - case BSET_RW_AUX_TREE: - return t->aux_data_offset + - DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); - default: - BUG(); - } -} - -static unsigned bset_aux_tree_buf_start(const struct btree *b, - const struct bset_tree *t) -{ - return t == b->set - ? DIV_ROUND_UP(b->unpack_fn_len, 8) - : bset_aux_tree_buf_end(t - 1); -} - -static void *__aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - return b->aux_data + t->aux_data_offset * 8; -} - -static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t); -} - -static struct bkey_float *bkey_float(const struct btree *b, - const struct bset_tree *t, - unsigned idx) -{ - return ro_aux_tree_base(b, t)->f + idx; -} - -static void __bset_aux_tree_verify(struct btree *b) -{ - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; - - BUG_ON(t != b->set && - t[-1].aux_data_offset == U16_MAX); - - BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } -} - -static inline void bset_aux_tree_verify(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bset_aux_tree_verify(b); -} - -void bch2_btree_keys_init(struct btree *b) -{ - unsigned i; - - b->nsets = 0; - memset(&b->nr, 0, sizeof(b->nr)); - - for (i = 0; i < MAX_BSETS; i++) - b->set[i].data_offset = U16_MAX; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -/* Binary tree stuff for auxiliary search trees */ - -/* - * Cacheline/offset <-> bkey pointer arithmetic: - * - * t->tree is a binary search tree in an array; each node corresponds to a key - * in one cacheline in t->set (BSET_CACHELINE bytes). - * - * This means we don't have to store the full index of the key that a node in - * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and - * then bkey_float->m gives us the offset within that cacheline, in units of 8 - * bytes. - * - * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to - * make this work. - * - * To construct the bfloat for an arbitrary key we need to know what the key - * immediately preceding it is: we have to check if the two keys differ in the - * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size - * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
- */ - -static inline void *bset_cacheline(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline) -{ - return (void *) round_down((unsigned long) btree_bkey_first(b, t), - L1_CACHE_BYTES) + - cacheline * BSET_CACHELINE; -} - -static struct bkey_packed *cacheline_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - unsigned offset) -{ - return bset_cacheline(b, t, cacheline) + offset * 8; -} - -static unsigned bkey_to_cacheline(const struct btree *b, - const struct bset_tree *t, - const struct bkey_packed *k) -{ - return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -} - -static ssize_t __bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -} - -static unsigned bkey_to_cacheline_offset(const struct btree *b, - const struct bset_tree *t, - unsigned cacheline, - const struct bkey_packed *k) -{ - size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); - - EBUG_ON(m > U8_MAX); - return m; -} - -static inline struct bkey_packed *tree_to_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size - 1, t->extra), - bkey_float(b, t, j)->key_offset); -} - -static struct rw_aux_tree *rw_aux_tree(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - - return __aux_tree_base(b, t); -} - -/* - * For the write set - the one we're currently inserting keys into - we don't - * maintain a full search tree, we just keep a simple lookup table in t->prev. - */ -static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, - struct bset_tree *t, - unsigned j) -{ - return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -} - -static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, - unsigned j, struct bkey_packed *k) -{ - EBUG_ON(k >= btree_bkey_last(b, t)); - - rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { - .offset = __btree_node_key_to_offset(b, k), - .k = bkey_unpack_pos(b, k), - }; -} - -static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - unsigned j = 0; - - BUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - BUG_ON(t->size < 1); - BUG_ON(rw_aux_to_bkey(b, t, j) != k); - - goto start; - while (1) { - if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, - bkey_unpack_pos(b, k))); -start: - if (++j == t->size) - break; - - BUG_ON(rw_aux_tree(b, t)[j].offset <= - rw_aux_tree(b, t)[j - 1].offset); - } - - k = bkey_p_next(k); - BUG_ON(k >= btree_bkey_last(b, t)); - } -} - -static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_bset_verify_rw_aux_tree(b, t); -} - -/* returns idx of first entry >= offset: */ -static unsigned rw_aux_tree_bsearch(struct btree *b, - struct bset_tree *t, - unsigned offset) -{ - unsigned bset_offs = offset - btree_bkey_first_offset(t); - unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); - unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); - EBUG_ON(!t->size); - EBUG_ON(idx > t->size); - - while (idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset) - idx++; - - while (idx && - rw_aux_tree(b, t)[idx - 1].offset >= offset) - idx--; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset < offset); - EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); - EBUG_ON(idx + 1 < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx + 1].offset); - - return idx; -} - -static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f) -{ - u64 v; - - EBUG_ON(!bkey_packed(k)); - - v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); - - /* - * In little endian, we're shifting off low bits (and then the bits we - * want are at the low end), in big endian we're shifting off high bits - * (and then the bits we want are at the high end, so we shift them - * back down): - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - v >>= f->exponent & 7; -#else - v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -#endif - return (u16) v; -} - -static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_float *f = bkey_float(b, t, j); - struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l = is_power_of_2(j) - ? min_key - : tree_to_bkey(b, t, j >> ffs(j)); - struct bkey_packed *r = is_power_of_2(j + 1) - ? max_key - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - unsigned mantissa; - int shift, exponent, high_bit; - - /* - * for failed bfloats, the lookup code falls back to comparing against - * the original key. - */ - - if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || - !b->nr_key_bits) { - f->exponent = BFLOAT_FAILED_UNPACKED; - return; - } - - /* - * The greatest differing bit of l and r is the first bit we must - * include in the bfloat mantissa we're creating in order to do - * comparisons - that bit always becomes the high bit of - * bfloat->mantissa, and thus the exponent we're calculating here is - * the position of what will become the low bit in bfloat->mantissa: - * - * Note that this may be negative - we may be running off the low end - * of the key: we handle this later: - */ - high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); - exponent = high_bit - (BKEY_MANTISSA_BITS - 1); - - /* - * Then we calculate the actual shift value, from the start of the key - * (k->_data), to get the key bits starting at exponent: - */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - - EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -#else - shift = high_bit_offset + - b->nr_key_bits - - exponent - - BKEY_MANTISSA_BITS; - - EBUG_ON(shift < KEY_PACKED_BITS_START); -#endif - EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); - - f->exponent = shift; - mantissa = bkey_mantissa(m, f); - - /* - * If we've got garbage bits, set them to all 1s - it's legal for the - * bfloat to compare larger than the original key, but not smaller: - */ - if (exponent < 0) - mantissa |= ~(~0U << -exponent); - - f->mantissa = mantissa; -} - -/* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - bset_aux_tree_verify(b); - - return btree_aux_data_bytes(b) 
- t->aux_data_offset * sizeof(u64); -} - -static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); -} - -static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) -{ - return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -} - -static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k; - - t->size = 1; - t->extra = BSET_RW_AUX_TREE_VAL; - rw_aux_tree(b, t)[0].offset = - __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - - bset_tree_for_each_key(b, t, k) { - if (t->size == bset_rw_tree_capacity(b, t)) - break; - - if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > - L1_CACHE_BYTES) - rw_aux_tree_set(b, t, t->size++, k); - } -} - -static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -{ - struct bkey_packed *k = btree_bkey_first(b, t); - struct bkey_i min_key, max_key; - unsigned cacheline = 1; - - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), - bset_ro_tree_capacity(b, t)); -retry: - if (t->size < 2) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - return; - } - - t->extra = eytzinger1_extra(t->size - 1); - - /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size - 1) { - while (bkey_to_cacheline(b, t, k) < cacheline) - k = bkey_p_next(k); - - if (k >= btree_bkey_last(b, t)) { - /* XXX: this path sucks */ - t->size--; - goto retry; - } - - bkey_float(b, t, j)->key_offset = - bkey_to_cacheline_offset(b, t, cacheline++, k); - - EBUG_ON(tree_to_bkey(b, t, j) != k); - } - - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { - bkey_init(&min_key.k); - min_key.k.p = b->data->min_key; - } - - if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { - bkey_init(&max_key.k); - max_key.k.p = b->data->max_key; - } - - /* Then we build the tree */ - eytzinger1_for_each(j, t->size - 1) - make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); -} - -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -{ - struct bset_tree *i; - - for (i = b->set; i != t; i++) - BUG_ON(bset_has_rw_aux_tree(i)); - - bch2_bset_set_no_aux_tree(b, t); - - /* round up to next cacheline: */ - t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), - SMP_CACHE_BYTES / sizeof(u64)); - - bset_aux_tree_verify(b); -} - -void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, - bool writeable) -{ - if (writeable - ? 
bset_has_rw_aux_tree(t) - : bset_has_ro_aux_tree(t)) - return; - - bset_alloc_tree(b, t); - - if (!__bset_tree_capacity(b, t)) - return; - - if (writeable) - __build_rw_aux_tree(b, t); - else - __build_ro_aux_tree(b, t); - - bset_aux_tree_verify(b); -} - -void bch2_bset_init_first(struct btree *b, struct bset *i) -{ - struct bset_tree *t; - - BUG_ON(b->nsets); - - memset(i, 0, sizeof(*i)); - get_random_bytes(&i->seq, sizeof(i->seq)); - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) -{ - struct bset *i = &bne->keys; - struct bset_tree *t; - - BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); - BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); - BUG_ON(b->nsets >= MAX_BSETS); - - memset(i, 0, sizeof(*i)); - i->seq = btree_bset_first(b)->seq; - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - t = &b->set[b->nsets++]; - set_btree_bset(b, t, i); -} - -/* - * find _some_ key in the same bset as @k that precedes @k - not necessarily the - * immediate predecessor: - */ -static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed *p; - unsigned offset; - int j; - - EBUG_ON(k < btree_bkey_first(b, t) || - k > btree_bkey_last(b, t)); - - if (k == btree_bkey_first(b, t)) - return NULL; - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - p = btree_bkey_first(b, t); - break; - case BSET_RO_AUX_TREE: - j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); - - do { - p = j ? tree_to_bkey(b, t, - __inorder_to_eytzinger1(j--, - t->size - 1, t->extra)) - : btree_bkey_first(b, t); - } while (p >= k); - break; - case BSET_RW_AUX_TREE: - offset = __btree_node_key_to_offset(b, k); - j = rw_aux_tree_bsearch(b, t, offset); - p = j ? rw_aux_to_bkey(b, t, j - 1) - : btree_bkey_first(b, t); - break; - } - - return p; -} - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k, - unsigned min_key_type) -{ - struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; - - while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_p_next(i)) - if (i->type >= min_key_type) - ret = i; - - k = p; - } - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - BUG_ON(ret >= orig_k); - - for (i = ret - ? bkey_p_next(ret) - : btree_bkey_first(b, t); - i != orig_k; - i = bkey_p_next(i)) - BUG_ON(i->type >= min_key_type); - } - - return ret; -} - -/* Insert */ - -static void rw_aux_tree_insert_entry(struct btree *b, - struct bset_tree *t, - unsigned idx) -{ - EBUG_ON(!idx || idx > t->size); - struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); - struct bkey_packed *end = idx < t->size - ? 
rw_aux_to_bkey(b, t, idx) - : btree_bkey_last(b, t); - - if (t->size < bset_rw_tree_capacity(b, t) && - (void *) end - (void *) start > L1_CACHE_BYTES) { - struct bkey_packed *k = start; - - while (1) { - k = bkey_p_next(k); - if (k == end) - break; - - if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[idx + 1], - &rw_aux_tree(b, t)[idx], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx]); - t->size++; - rw_aux_tree_set(b, t, idx, k); - break; - } - } - } -} - -static void bch2_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - int shift = new_u64s - clobber_u64s; - unsigned idx, j, where = __btree_node_key_to_offset(b, _where); - - EBUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - if (where > rw_aux_tree(b, t)[t->size - 1].offset) { - rw_aux_tree_insert_entry(b, t, t->size); - goto verify; - } - - /* returns first entry >= where */ - idx = rw_aux_tree_bsearch(b, t, where); - - if (rw_aux_tree(b, t)[idx].offset == where) { - if (!idx) { /* never delete first entry */ - idx++; - } else if (where < t->end_offset) { - rw_aux_tree_set(b, t, idx++, _where); - } else { - EBUG_ON(where != t->end_offset); - rw_aux_tree_insert_entry(b, t, --t->size); - goto verify; - } - } - - EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); - if (idx < t->size && - rw_aux_tree(b, t)[idx].offset + shift == - rw_aux_tree(b, t)[idx - 1].offset) { - memmove(&rw_aux_tree(b, t)[idx], - &rw_aux_tree(b, t)[idx + 1], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[idx + 1]); - t->size -= 1; - } - - for (j = idx; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - EBUG_ON(idx < t->size && - rw_aux_tree(b, t)[idx].offset == - rw_aux_tree(b, t)[idx - 1].offset); - - rw_aux_tree_insert_entry(b, t, idx); - -verify: - bch2_bset_verify_rw_aux_tree(b, t); - bset_aux_tree_verify(b); -} - -void bch2_bset_insert(struct btree *b, - struct bkey_packed *where, - struct bkey_i *insert, - unsigned clobber_u64s) -{ - struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed packed, *src = bkey_to_packed(insert); - - bch2_bset_verify_rw_aux_tree(b, t); - bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); - - if (bch2_bkey_pack_key(&packed, &insert->k, f)) - src = &packed; - - if (!bkey_deleted(&insert->k)) - btree_keys_account_key_add(&b->nr, t - b->set, src); - - if (src->u64s != clobber_u64s) { - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = (u64 *) where->_data + src->u64s; - - EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < - (int) clobber_u64s - src->u64s); - - memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); - set_btree_bset_end(b, t); - } - - memcpy_u64s_small(where, src, - bkeyp_key_u64s(f, src)); - memcpy_u64s(bkeyp_val(f, where), &insert->v, - bkeyp_val_u64s(f, src)); - - if (src->u64s != clobber_u64s) - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_bset_delete(struct btree *b, - struct bkey_packed *where, - unsigned clobber_u64s) -{ - struct bset_tree *t = bset_tree_last(b); - u64 *src_p = (u64 *) where->_data + clobber_u64s; - u64 *dst_p = where->_data; - - bch2_bset_verify_rw_aux_tree(b, t); - - EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); - - 
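	/*
	 * Shift the keys after the deleted range down over it, then shrink this
	 * bset's u64s count and fix up the lookup table to match:
	 */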
memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); - le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); - set_btree_bset_end(b, t); - - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -} - -/* Lookup */ - -__flatten -static struct bkey_packed *bset_search_write_set(const struct btree *b, - struct bset_tree *t, - struct bpos *search) -{ - unsigned l = 0, r = t->size; - - while (l + 1 != r) { - unsigned m = (l + r) >> 1; - - if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) - l = m; - else - r = m; - } - - return rw_aux_to_bkey(b, t, l); -} - -static inline void prefetch_four_cachelines(void *p) -{ -#ifdef CONFIG_X86_64 - asm("prefetcht0 (-127 + 64 * 0)(%0);" - "prefetcht0 (-127 + 64 * 1)(%0);" - "prefetcht0 (-127 + 64 * 2)(%0);" - "prefetcht0 (-127 + 64 * 3)(%0);" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif -} - -static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; - - return f->exponent > key_bits_start; -#else - unsigned key_bits_end = high_bit_offset + b->nr_key_bits; - - return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -#endif -} - -__flatten -static struct bkey_packed *bset_search_tree(const struct btree *b, - const struct bset_tree *t, - const struct bpos *search, - const struct bkey_packed *packed_search) -{ - struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f; - struct bkey_packed *k; - unsigned inorder, n = 1, l, r; - int cmp; - - do { - if (likely(n << 4 < t->size)) - prefetch(&base->f[n << 4]); - - f = &base->f[n]; - if (unlikely(f->exponent >= BFLOAT_FAILED)) - goto slowpath; - - l = f->mantissa; - r = bkey_mantissa(packed_search, f); - - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) - goto slowpath; - - n = n * 2 + (l < r); - continue; -slowpath: - k = tree_to_bkey(b, t, n); - cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); - if (!cmp) - return k; - - n = n * 2 + (cmp < 0); - } while (n < t->size); - - inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); - - /* - * n would have been the node we recursed to - the low bit tells us if - * we recursed left or recursed right. - */ - if (likely(!(n & 1))) { - --inorder; - if (unlikely(!inorder)) - return btree_bkey_first(b, t); - - f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; - } - - return cacheline_to_bkey(b, t, inorder, f->key_offset); -} - -static __always_inline __flatten -struct bkey_packed *__bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *lossy_packed_search) -{ - - /* - * First we search for a cacheline, then we do a linear search - * within that cacheline. - * - * To search for the cacheline, there are three different possibilities: - * * The set is too small to have a search tree, so we just do a linear - * search over the whole set. - * * The set is the one we're currently inserting into; keeping a full - * auxiliary search tree up to date would be too expensive, so we - * use a much simpler lookup table to do a binary search - - * bset_search_write_set(). 
- * * Or we use the auxiliary search tree we constructed earlier - - * bset_search_tree() - */ - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - return btree_bkey_first(b, t); - case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search); - case BSET_RO_AUX_TREE: - return bset_search_tree(b, t, search, lossy_packed_search); - default: - BUG(); - } -} - -static __always_inline __flatten -struct bkey_packed *bch2_bset_search_linear(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - struct bkey_packed *m) -{ - if (lossy_packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_cmp_p_or_unp(b, m, - lossy_packed_search, search) < 0) - m = bkey_p_next(m); - - if (!packed_search) - while (m != btree_bkey_last(b, t) && - bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_p_next(m); - - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - BUG_ON(prev && - bkey_iter_cmp_p_or_unp(b, prev, - packed_search, search) >= 0); - } - - return m; -} - -/* Btree node iterator */ - -static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set *pos; - - btree_node_iter_for_each(iter, pos) - ; - - BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); - *pos = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; - } -} - -void bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - __bch2_btree_node_iter_push(iter, b, k, end); - bch2_btree_node_iter_sort(iter, b); -} - -noinline __flatten __cold -static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed *k; - - trace_bkey_pack_pos_fail(search); - - bch2_btree_node_iter_init_from_start(iter, b); - - while ((k = bch2_btree_node_iter_peek(iter, b)) && - bkey_iter_pos_cmp(b, k, search) < 0) - bch2_btree_node_iter_advance(iter, b); -} - -/** - * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a - * given position - * - * @iter: iterator to initialize - * @b: btree node to search - * @search: search key - * - * Main entry point to the lookup code for individual btree nodes: - * - * NOTE: - * - * When you don't filter out deleted keys, btree nodes _do_ contain duplicate - * keys. This doesn't matter for most code, but it does matter for lookups. - * - * Some adjacent keys with a string of equal keys: - * i j k k k k l m - * - * If you search for k, the lookup code isn't guaranteed to return you any - * specific k. The lookup code is conceptually doing a binary search and - * iterating backwards is very expensive so if the pivot happens to land at the - * last k that's what you'll get. - * - * This works out ok, but it's something to be aware of: - * - * - For non extents, we guarantee that the live key comes last - see - * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't - * see will only be deleted keys you don't care about. - * - * - For extents, deleted keys sort last (see the comment at the top of this - * file). 
But when you're searching for extents, you actually want the first - * key strictly greater than your search key - an extent that compares equal - * to the search key is going to have 0 sectors after the search key. - * - * But this does mean that we can't just search for - * bpos_successor(start_of_range) to get the first extent that overlaps with - * the range we want - if we're unlucky and there's an extent that ends - * exactly where we searched, then there could be a deleted key at the same - * position and we'd get that when we search instead of the preceding extent - * we needed. - * - * So we've got to search for start_of_range, then after the lookup iterate - * past any extents that compare equal to the position we searched for. - */ -__flatten -void bch2_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos *search) -{ - struct bkey_packed p, *packed_search = NULL; - struct btree_node_iter_set *pos = iter->data; - struct bkey_packed *k[MAX_BSETS]; - unsigned i; - - EBUG_ON(bpos_lt(*search, b->data->min_key)); - EBUG_ON(bpos_gt(*search, b->data->max_key)); - bset_aux_tree_verify(b); - - memset(iter, 0, sizeof(*iter)); - - switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { - case BKEY_PACK_POS_EXACT: - packed_search = &p; - break; - case BKEY_PACK_POS_SMALLER: - packed_search = NULL; - break; - case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search); - return; - } - - for (i = 0; i < b->nsets; i++) { - k[i] = __bch2_bset_search(b, b->set + i, search, &p); - prefetch_four_cachelines(k[i]); - } - - for (i = 0; i < b->nsets; i++) { - struct bset_tree *t = b->set + i; - struct bkey_packed *end = btree_bkey_last(b, t); - - k[i] = bch2_bset_search_linear(b, t, search, - packed_search, &p, k[i]); - if (k[i] != end) - *pos++ = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k[i]), - __btree_node_key_to_offset(b, end) - }; - } - - bch2_btree_node_iter_sort(iter, b); -} - -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - bch2_btree_node_iter_sort(iter, b); -} - -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t) -{ - struct btree_node_iter_set *set; - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) - return __btree_node_offset_to_key(b, set->k); - - return btree_bkey_last(b, t); -} - -static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, - struct btree *b, - unsigned first) -{ - bool ret; - - if ((ret = (btree_node_iter_cmp(b, - iter->data[first], - iter->data[first + 1]) > 0))) - swap(iter->data[first], iter->data[first + 1]); - return ret; -} - -void bch2_btree_node_iter_sort(struct btree_node_iter *iter, - struct btree *b) -{ - /* unrolled bubble sort: */ - - if (!__btree_node_iter_set_end(iter, 2)) { - btree_node_iter_sort_two(iter, b, 0); - btree_node_iter_sort_two(iter, b, 1); - } - - if (!__btree_node_iter_set_end(iter, 1)) - btree_node_iter_sort_two(iter, b, 0); -} - -void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, - struct btree_node_iter_set *set) -{ - struct btree_node_iter_set *last = - iter->data + ARRAY_SIZE(iter->data) - 1; - - memmove(&set[0], &set[1], (void *) last - (void *) set); - *last = (struct btree_node_iter_set) { 0, 0 }; -} - -static inline void 
__bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; - - EBUG_ON(iter->data->k > iter->data->end); - - if (unlikely(__btree_node_iter_set_end(iter, 0))) { - /* avoid an expensive memmove call: */ - iter->data[0] = iter->data[1]; - iter->data[1] = iter->data[2]; - iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; - return; - } - - if (__btree_node_iter_set_end(iter, 1)) - return; - - if (!btree_node_iter_sort_two(iter, b, 0)) - return; - - if (__btree_node_iter_set_end(iter, 2)) - return; - - btree_node_iter_sort_two(iter, b, 1); -} - -void bch2_btree_node_iter_advance(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { - __bch2_btree_node_iter_verify(iter, b); - __bch2_btree_node_iter_next_check(iter, b); - } - - __bch2_btree_node_iter_advance(iter, b); -} - -/* - * Expensive: - */ -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k, *prev = NULL; - struct btree_node_iter_set *set; - unsigned end = 0; - - bch2_btree_node_iter_verify(iter, b); - - for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); - if (k && - (!prev || bkey_iter_cmp(b, k, prev) > 0)) { - prev = k; - end = t->end_offset; - } - } - - if (!prev) - return NULL; - - /* - * We're manually memmoving instead of just calling sort() to ensure the - * prev we picked ends up in slot 0 - sort won't necessarily put it - * there because of duplicate deleted keys: - */ - btree_node_iter_for_each(iter, set) - if (set->end == end) - goto found; - - BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -found: - BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); - - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - - iter->data[0].k = __btree_node_key_to_offset(b, prev); - iter->data[0].end = end; - - bch2_btree_node_iter_verify(iter, b); - return prev; -} - -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *prev; - - do { - prev = bch2_btree_node_iter_prev_all(iter, b); - } while (prev && bkey_deleted(prev)); - - return prev; -} - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, - struct btree *b, - struct bkey *u) -{ - struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); - - return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -} - -/* Mergesort */ - -void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) -{ - for_each_bset_c(b, t) { - enum bset_aux_tree_type type = bset_aux_tree_type(t); - size_t j; - - stats->sets[type].nr++; - stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * - sizeof(u64); - - if (bset_has_ro_aux_tree(t)) { - stats->floats += t->size - 1; - - for (j = 1; j < t->size; j++) - stats->failed += - bkey_float(b, t, j)->exponent == - BFLOAT_FAILED; - } - } -} - -void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, - struct bkey_packed *k) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey uk; - unsigned j, inorder; - - if (!bset_has_ro_aux_tree(t)) - return; - - inorder = bkey_to_cacheline(b, t, k); - if (!inorder || inorder >= t->size) - return; - - j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); - if (k != tree_to_bkey(b, t, j)) - return; - - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED: - uk = bkey_unpack_key(b, k); - prt_printf(out, - " failed unpacked at depth %u\n" - "\t", - ilog2(j)); - bch2_bpos_to_text(out, uk.p); - prt_printf(out, "\n"); - break; - } -} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h deleted file mode 100644 index a15ecf9d006e05..00000000000000 --- a/fs/bcachefs/bset.h +++ /dev/null @@ -1,536 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BSET_H -#define _BCACHEFS_BSET_H - -#include <linux/kernel.h> -#include <linux/types.h> - -#include "bcachefs.h" -#include "bkey.h" -#include "bkey_methods.h" -#include "btree_types.h" -#include "util.h" /* for time_stats */ -#include "vstructs.h" - -/* - * BKEYS: - * - * A bkey contains a key, a size field, a variable number of pointers, and some - * ancillary flag bits. - * - * We use two different functions for validating bkeys, bkey_invalid and - * bkey_deleted(). - * - * The one exception to the rule that ptr_invalid() filters out invalid keys is - * that it also filters out keys of size 0 - these are keys that have been - * completely overwritten. It'd be safe to delete these in memory while leaving - * them on disk, just unnecessary work - so we filter them out when resorting - * instead. - * - * We can't filter out stale keys when we're resorting, because garbage - * collection needs to find them to ensure bucket gens don't wrap around - - * unless we're rewriting the btree node those stale keys still exist on disk. - * - * We also implement functions here for removing some number of sectors from the - * front or the back of a bkey - this is mainly used for fixing overlapping - * extents, by removing the overlapping sectors from the older key. - * - * BSETS: - * - * A bset is an array of bkeys laid out contiguously in memory in sorted order, - * along with a header. A btree node is made up of a number of these, written at - * different times. - * - * There could be many of them on disk, but we never allow there to be more than - * 4 in memory - we lazily resort as needed. - * - * We implement code here for creating and maintaining auxiliary search trees - * (described below) for searching an individual bset, and on top of that we - * implement a btree iterator. - * - * BTREE ITERATOR: - * - * Most of the code in bcache doesn't care about an individual bset - it needs - * to search entire btree nodes and iterate over them in sorted order. 
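As a rough sketch of what such an iterator provides, under the simplifying assumptions of plain u64 keys in place of bkeys and invented names throughout: one cursor per sorted set, peek returns the smallest current element, advance bumps that cursor. The real iterator keeps its sets sorted by their current key instead of rescanning on every peek::

    #include <stdint.h>

    #define SKETCH_MAX_SETS 4	/* mirrors MAX_BSETS */

    struct sketch_set  { const uint64_t *k, *end; };
    struct sketch_iter { struct sketch_set data[SKETCH_MAX_SETS]; unsigned used; };

    /* peek: smallest current element across all sets, NULL when exhausted */
    static const uint64_t *sketch_iter_peek(struct sketch_iter *iter)
    {
    	const uint64_t *best = NULL;

    	for (unsigned i = 0; i < iter->used; i++) {
    		struct sketch_set *s = &iter->data[i];

    		if (s->k != s->end && (!best || *s->k < *best))
    			best = s->k;
    	}
    	return best;
    }

    /* advance: bump the cursor that peek() would have returned */
    static void sketch_iter_advance(struct sketch_iter *iter)
    {
    	const uint64_t *best = sketch_iter_peek(iter);

    	if (!best)
    		return;

    	for (unsigned i = 0; i < iter->used; i++)
    		if (iter->data[i].k == best) {
    			iter->data[i].k++;
    			return;
    		}
    }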
- * - * The btree iterator code serves both functions; it iterates through the keys - * in a btree node in sorted order, starting from either keys after a specific - * point (if you pass it a search key) or the start of the btree node. - * - * AUXILIARY SEARCH TREES: - * - * Since keys are variable length, we can't use a binary search on a bset - we - * wouldn't be able to find the start of the next key. But binary searches are - * slow anyways, due to terrible cache behaviour; bcache originally used binary - * searches and that code topped out at under 50k lookups/second. - * - * So we need to construct some sort of lookup table. Since we only insert keys - * into the last (unwritten) set, most of the keys within a given btree node are - * usually in sets that are mostly constant. We use two different types of - * lookup tables to take advantage of this. - * - * Both lookup tables share in common that they don't index every key in the - * set; they index one key every BSET_CACHELINE bytes, and then a linear search - * is used for the rest. - * - * For sets that have been written to disk and are no longer being inserted - * into, we construct a binary search tree in an array - traversing a binary - * search tree in an array gives excellent locality of reference and is very - * fast, since both children of any node are adjacent to each other in memory - * (and their grandchildren, and great grandchildren...) - this means - * prefetching can be used to great effect. - * - * It's quite useful performance wise to keep these nodes small - not just - * because they're more likely to be in L2, but also because we can prefetch - * more nodes on a single cacheline and thus prefetch more iterations in advance - * when traversing this tree. - * - * Nodes in the auxiliary search tree must contain both a key to compare against - * (we don't want to fetch the key from the set, that would defeat the purpose), - * and a pointer to the key. We use a few tricks to compress both of these. - * - * To compress the pointer, we take advantage of the fact that one node in the - * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have - * a function (to_inorder()) that takes the index of a node in a binary tree and - * returns what its index would be in an inorder traversal, so we only have to - * store the low bits of the offset. - * - * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To - * compress that, we take advantage of the fact that when we're traversing the - * search tree at every iteration we know that both our search key and the key - * we're looking for lie within some range - bounded by our previous - * comparisons. (We special case the start of a search so that this is true even - * at the root of the tree). - * - * So we know the key we're looking for is between a and b, and a and b don't - * differ higher than bit 50, we don't need to check anything higher than bit - * 50. - * - * We don't usually need the rest of the bits, either; we only need enough bits - * to partition the key range we're currently checking. Consider key n - the - * key our auxiliary search tree node corresponds to, and key p, the key - * immediately preceding n. The lowest bit we need to store in the auxiliary - * search tree is the highest bit that differs between n and p. - * - * Note that this could be bit 0 - we might sometimes need all 80 bits to do the - * comparison. But we'd really like our nodes in the auxiliary search tree to be - * of fixed size. 
- * - * The solution is to make them fixed size, and when we're constructing a node - * check if p and n differ in the bits we need them to. If they don't, we - * flag that node, and when doing lookups we fall back to comparing against the - * real key. As long as this doesn't happen too often (and it seems to reliably - * happen a bit less than 1% of the time), we win - even on failures, that key - * is then more likely to be in cache than if we were doing binary searches all - * the way, since we're touching so much less memory. - * - * The keys in the auxiliary search tree are stored in (software) floating - * point, with an exponent and a mantissa. The exponent needs to be big enough - * to address all the bits in the original key, but the number of bits in the - * mantissa is somewhat arbitrary; more bits just gets us fewer failures. - * - * We use 8 bits for the exponent and 8 bits for the key's offset within its - * cacheline (in units of 8 bytes, since keys are 8 byte aligned); with 16 bits - * of mantissa a node is 4 bytes. We need one node per BSET_CACHELINE (256) - * bytes in the btree node, which means the auxiliary search trees take up - * under 2% as much memory as the btree itself. - * - * Constructing these auxiliary search trees is moderately expensive, and we - * don't want to be constantly rebuilding the search tree for the last set - * whenever we insert another key into it. For the unwritten set, we use a much - * simpler lookup table - it's just a flat array, so index i in the lookup table - * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing - * within each byte range works the same as with the auxiliary search trees. - * - * These are much easier to keep up to date when we insert a key - we do it - * somewhat lazily; when we shift a key up we usually just increment the pointer - * to it, only when it would overflow do we go to the trouble of finding the - * first key in that range of bytes again. - */ - -enum bset_aux_tree_type { - BSET_NO_AUX_TREE, - BSET_RO_AUX_TREE, - BSET_RW_AUX_TREE, -}; - -#define BSET_TREE_NR_TYPES 3 - -#define BSET_NO_AUX_TREE_VAL (U16_MAX) -#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) - -static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -{ - switch (t->extra) { - case BSET_NO_AUX_TREE_VAL: - EBUG_ON(t->size); - return BSET_NO_AUX_TREE; - case BSET_RW_AUX_TREE_VAL: - EBUG_ON(!t->size); - return BSET_RW_AUX_TREE; - default: - EBUG_ON(!t->size); - return BSET_RO_AUX_TREE; - } -} - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but the lookup code turned out to touch slightly less - * memory with larger values, and it's now 256. - * - * It defines the number of bytes (in struct bset) per struct bkey_float in - * the auxiliary search tree - when we're done searching the bkey_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. 
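A minimal sketch of this scheme, under the simplifying assumption of bare 64-bit keys and invented names (the real code extracts the window from packed, variable-length keys, and stores the exponent per tree node): compare 16-bit windows of the keys, and treat equal windows as "can't tell, fall back to the full key"::

    #include <stdint.h>

    #define SKETCH_MANTISSA_BITS 16	/* mirrors BKEY_MANTISSA_BITS */

    /* the stored window of the key, starting at bit `exponent` */
    static uint16_t sketch_mantissa(uint64_t key, unsigned exponent)
    {
    	return (uint16_t) (key >> exponent);
    }

    /*
     * <0 / >0 as in memcmp; 0 means the windows match and the caller must
     * fall back to comparing the original keys:
     */
    static int sketch_mantissa_cmp(uint64_t l, uint64_t r, unsigned exponent)
    {
    	uint16_t lm = sketch_mantissa(l, exponent);
    	uint16_t rm = sketch_mantissa(r, exponent);

    	return lm < rm ? -1 : lm > rm;
    }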
- */ - -#define BSET_CACHELINE 256 - -static inline size_t btree_keys_cachelines(const struct btree *b) -{ - return (1U << b->byte_order) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(const struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(const struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - -#define for_each_bset(_b, _t) \ - for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define for_each_bset_c(_b, _t) \ - for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) - -#define bset_tree_for_each_key(_b, _t, _k) \ - for (_k = btree_bkey_first(_b, _t); \ - _k != btree_bkey_last(_b, _t); \ - _k = bkey_p_next(_k)) - -static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -} - -static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -{ - return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -} - -static inline void bch2_bset_set_no_aux_tree(struct btree *b, - struct bset_tree *t) -{ - BUG_ON(t < b->set); - - for (; t < b->set + ARRAY_SIZE(b->set); t++) { - t->size = 0; - t->extra = BSET_NO_AUX_TREE_VAL; - t->aux_data_offset = U16_MAX; - } -} - -static inline void btree_node_set_format(struct btree *b, - struct bkey_format f) -{ - int len; - - b->format = f; - b->nr_key_bits = bkey_format_key_bits(&f); - - len = bch2_compile_bkey_format(&b->format, b->aux_data); - BUG_ON(len < 0 || len > U8_MAX); - - b->unpack_fn_len = len; - - bch2_bset_set_no_aux_tree(b, b->set); -} - -static inline struct bset *bset_next_set(struct btree *b, - unsigned block_bytes) -{ - struct bset *i = btree_bset_last(b); - - EBUG_ON(!is_power_of_2(block_bytes)); - - return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -} - -void bch2_btree_keys_init(struct btree *); - -void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct btree *, struct btree_node_entry *); -void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); - -void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, - unsigned); -void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); - -/* Bkey utility code */ - -/* packed or unpacked */ -static inline int bkey_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - EBUG_ON(r_packed && !bkey_packed(r_packed)); - - if (unlikely(!bkey_packed(l))) - return bpos_cmp(packed_to_bkey_c(l)->p, *r); - - if (likely(r_packed)) - return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); - - return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -} - -static inline struct bset_tree * -bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); -} - -struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); - -struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, - struct bkey_packed *, unsigned); - -static inline struct bkey_packed * -bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return bch2_bkey_prev_filter(b, t, k, 0); -} - -static inline struct bkey_packed * -bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -{ - return 
bch2_bkey_prev_filter(b, t, k, 1); -} - -/* Btree key iteration */ - -void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); -void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos *); -void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, - struct btree *, - struct bset_tree *); - -void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -void bch2_btree_node_iter_set_drop(struct btree_node_iter *, - struct btree_node_iter_set *); -void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); - -#define btree_node_iter_for_each(_iter, _set) \ - for (_set = (_iter)->data; \ - _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ - (_set)->k != (_set)->end; \ - _set++) - -static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, - unsigned i) -{ - return iter->data[i].k == iter->data[i].end; -} - -static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -{ - return __btree_node_iter_set_end(iter, 0); -} - -/* - * When keys compare equal, deleted keys compare first: - * - * XXX: only need to compare pointers for keys that are both within a - * btree_node_iterator - we need to break ties for prev() to work correctly - */ -static inline int bkey_iter_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: cmp_int(l, r); -} - -static inline int btree_node_iter_cmp(const struct btree *b, - struct btree_node_iter_set l, - struct btree_node_iter_set r) -{ - return bkey_iter_cmp(b, - __btree_node_offset_to_key(b, l.k), - __btree_node_offset_to_key(b, r.k)); -} - -/* These assume r (the search key) is not a deleted key: */ -static inline int bkey_iter_pos_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bpos *r) -{ - return bkey_cmp_left_packed(b, l, r) - ?: -((int) bkey_deleted(l)); -} - -static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r_packed, - const struct bpos *r) -{ - return bkey_cmp_p_or_unp(b, l, r_packed, r) - ?: -((int) bkey_deleted(l)); -} - -static inline struct bkey_packed * -__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) -{ - return !bch2_btree_node_iter_end(iter) - ? 
__btree_node_offset_to_key(b, iter->data->k) - : NULL; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *k; - - while ((k = bch2_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(k)) - bch2_btree_node_iter_advance(iter, b); - - return k; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_advance(iter, b); - - return ret; -} - -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); - -struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, - struct btree *, - struct bkey *); - -#define for_each_btree_node_key(b, k, iter) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek((iter), (b))); \ - bch2_btree_node_iter_advance(iter, b)) - -#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ - for (bch2_btree_node_iter_init_from_start((iter), (b)); \ - (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ - bch2_btree_node_iter_advance(iter, b)) - -/* Accounting: */ - -struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); - -static inline void btree_keys_account_key(struct btree_nr_keys *n, - unsigned bset, - struct bkey_packed *k, - int sign) -{ - n->live_u64s += k->u64s * sign; - n->bset_u64s[bset] += k->u64s * sign; - - if (bkey_packed(k)) - n->packed_keys += sign; - else - n->unpacked_keys += sign; -} - -static inline void btree_keys_account_val_delta(struct btree *b, - struct bkey_packed *k, - int delta) -{ - struct bset_tree *t = bch2_bkey_to_bset(b, k); - - b->nr.live_u64s += delta; - b->nr.bset_u64s[t - b->set] += delta; -} - -#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, 1) -#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ - btree_keys_account_key(_nr, _bset_idx, _k, -1) - -#define btree_account_key_add(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -#define btree_account_key_drop(_b, _k) \ - btree_keys_account_key(&(_b)->nr, \ - bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) - -struct bset_stats { - struct { - size_t nr, bytes; - } sets[BSET_TREE_NR_TYPES]; - - size_t floats; - size_t failed; -}; - -void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); -void bch2_bfloat_to_text(struct printbuf *, struct btree *, - struct bkey_packed *); - -/* Debug stuff */ - -void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -void bch2_dump_btree_node(struct bch_fs *, struct btree *); -void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); - -void __bch2_verify_btree_nr_keys(struct btree *); -void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); - -static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) - __bch2_btree_node_iter_verify(iter, b); -} - -static inline void bch2_verify_btree_nr_keys(struct btree *b) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) - __bch2_verify_btree_nr_keys(b); -} - -#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c 
deleted file mode 100644 index 83c9860e6b82cb..00000000000000 --- a/fs/bcachefs/btree_cache.c +++ /dev/null @@ -1,1516 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bbpos.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "trace.h" - -#include <linux/prefetch.h> -#include <linux/sched/mm.h> -#include <linux/swap.h> - -const char * const bch2_btree_node_flags[] = { - "typebit", - "typebit", - "typebit", -#define x(f) [BTREE_NODE_##f] = #f, - BTREE_FLAGS() -#undef x - NULL -}; - -void bch2_recalc_btree_reserve(struct bch_fs *c) -{ - unsigned reserve = 16; - - if (!c->btree_roots_known[0].b) - reserve += 8; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - reserve += min_t(unsigned, 1, r->b->c.level) * 8; - } - - c->btree_cache.nr_reserve = reserve; -} - -static inline size_t btree_cache_can_free(struct btree_cache_list *list) -{ - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - - size_t can_free = list->nr; - if (!list->idx) - can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); - return can_free; -} - -static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - - if (b->c.lock.readers) - list_add(&b->list, &bc->freed_pcpu); - else - list_add(&b->list, &bc->freed_nonpcpu); -} - -static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(!b->data); - - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); -} - -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -void __btree_node_data_free(struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - - /* - * This should really be done in slub/vmalloc, but we're using the - * kmalloc_large() path, so we're working around a slub bug by doing - * this here: - */ - if (b->data) - mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); - if (b->aux_data) - mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); - - EBUG_ON(btree_node_write_in_flight(b)); - - clear_btree_node_just_written(b); - - kvfree(b->data); - b->data = NULL; -#ifdef __KERNEL__ - kvfree(b->aux_data); -#else - munmap(b->aux_data, btree_aux_data_bytes(b)); -#endif - b->aux_data = NULL; -} - -static void btree_node_data_free(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(list_empty(&b->list)); - list_del_init(&b->list); - - __btree_node_data_free(b); - - --bc->nr_freeable; - btree_node_to_freedlist(bc, b); -} - -static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct btree *b = obj; - const u64 *v = arg->key; - - return b->hash_val == *v ? 
0 : 1; -} - -static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, - .automatic_shrinking = true, -}; - -static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -{ - BUG_ON(b->data || b->aux_data); - - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - b->data = kvmalloc(btree_buf_bytes(b), gfp); - if (!b->data) - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); -#ifdef __KERNEL__ - b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); -#else - b->aux_data = mmap(NULL, btree_aux_data_bytes(b), - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (b->aux_data == MAP_FAILED) - b->aux_data = NULL; -#endif - if (!b->aux_data) { - kvfree(b->data); - b->data = NULL; - return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); - } - - return 0; -} - -static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b; - - b = kzalloc(sizeof(struct btree), gfp); - if (!b) - return NULL; - - bkey_btree_ptr_init(&b->key); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(c->opts.btree_node_size); - return b; -} - -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) -{ - struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - return NULL; - - if (btree_node_data_alloc(c, b, GFP_KERNEL)) { - kfree(b); - return NULL; - } - - bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - return b; -} - -static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) -{ - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = bc->pinned_nodes_mask[!!b->c.level]; - - return ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); -} - -void bch2_node_pin(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; - - mutex_lock(&bc->lock); - if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { - set_btree_node_pinned(b); - list_move(&b->list, &bc->live[1].list); - bc->live[0].nr--; - bc->live[1].nr++; - } - mutex_unlock(&bc->lock); -} - -void bch2_btree_cache_unpin(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *n; - - mutex_lock(&bc->lock); - c->btree_cache.pinned_nodes_mask[0] = 0; - c->btree_cache.pinned_nodes_mask[1] = 0; - - list_for_each_entry_safe(b, n, &bc->live[1].list, list) { - clear_btree_node_pinned(b); - list_move(&b->list, &bc->live[0].list); - bc->live[0].nr++; - bc->live[1].nr--; - } - - mutex_unlock(&bc->lock); -} - -/* Btree in memory cache - hash table */ - -void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - lockdep_assert_held(&bc->lock); - - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); - BUG_ON(ret); - - /* Cause future lookups for this node to fail: */ - b->hash_val = 0; - - if (b->c.btree_id < BTREE_ID_NR) - --bc->nr_by_btree[b->c.btree_id]; - --bc->live[btree_node_pinned(b)].nr; - list_del_init(&b->list); -} - -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -{ - __bch2_btree_node_hash_remove(bc, b); - __bch2_btree_node_to_freelist(bc, b); -} - -int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -{ - BUG_ON(!list_empty(&b->list)); - BUG_ON(b->hash_val); - - b->hash_val = btree_ptr_hash_val(&b->key); - int ret = 
rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); - if (ret) - return ret; - - if (b->c.btree_id < BTREE_ID_NR) - bc->nr_by_btree[b->c.btree_id]++; - - bool p = __btree_node_pinned(bc, b); - mod_bit(BTREE_NODE_pinned, &b->flags, p); - - list_add_tail(&b->list, &bc->live[p].list); - bc->live[p].nr++; - return 0; -} - -int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, - unsigned level, enum btree_id id) -{ - b->c.level = level; - b->c.btree_id = id; - - mutex_lock(&bc->lock); - int ret = __bch2_btree_node_hash_insert(bc, b); - mutex_unlock(&bc->lock); - - return ret; -} - -void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -__flatten -static inline struct btree *btree_cache_find(struct btree_cache *bc, - const struct bkey_i *k) -{ - u64 v = btree_ptr_hash_val(k); - - return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -} - -static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, - bool flush, bool locked) -{ - struct btree_cache *bc = &c->btree_cache; - - lockdep_assert_held(&bc->lock); - - if (btree_node_noevict(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_write_blocked(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - if (btree_node_will_make_reachable(b)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (btree_node_dirty(b)) { - if (!flush) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) { - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - } - } - - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; - else if (btree_node_write_in_flight(b)) - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (locked) - return -EINTR; - - /* XXX: waiting on IO with btree cache lock held */ - bch2_btree_node_wait_on_read(b); - bch2_btree_node_wait_on_write(b); - } - - return 0; -} - -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs 
*c, struct btree *b, bool flush) -{ - struct btree_cache *bc = &c->btree_cache; - int ret = 0; - - lockdep_assert_held(&bc->lock); -retry_unlocked: - ret = __btree_node_reclaim_checks(c, b, flush, false); - if (ret) - return ret; - - if (!six_trylock_intent(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - if (!six_trylock_write(&b->c.lock)) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; - six_unlock_intent(&b->c.lock); - return bch_err_throw(c, ENOMEM_btree_node_reclaim); - } - - /* recheck under lock */ - ret = __btree_node_reclaim_checks(c, b, flush, true); - if (ret) { - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - if (ret == -EINTR) - goto retry_unlocked; - return ret; - } - - if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); - return 0; -} - -static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, false); -} - -static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -{ - return __btree_node_reclaim(c, b, true); -} - -static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - struct btree *b, *t; - unsigned long nr = sc->nr_to_scan; - unsigned long can_free = 0; - unsigned long freed = 0; - unsigned long touched = 0; - unsigned i, flags; - unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return SHRINK_STOP; - - mutex_lock(&bc->lock); - flags = memalloc_nofs_save(); - - /* - * It's _really_ critical that we don't free too many btree nodes - we - * have to always leave ourselves a reserve. 
The reserve is how we - * guarantee that allocating memory for a new btree node can always - * succeed, so that inserting keys into the btree can always succeed and - * IO can always make forward progress: - */ - can_free = btree_cache_can_free(list); - if (nr > can_free) { - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; - nr = can_free; - } - - i = 0; - list_for_each_entry_safe(b, t, &bc->freeable, list) { - /* - * Leave a few nodes on the freeable list, so that a btree split - * won't have to hit the system allocator: - */ - if (++i <= 3) - continue; - - touched++; - - if (touched >= nr) - goto out; - - if (!btree_node_reclaim(c, b)) { - btree_node_data_free(bc, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - freed++; - bc->nr_freed++; - } - } -restart: - list_for_each_entry_safe(b, t, &list->list, list) { - touched++; - - if (btree_node_accessed(b)) { - clear_btree_node_accessed(b); - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; - --touched; - } else if (!btree_node_reclaim(c, b)) { - __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(b); - btree_node_to_freedlist(bc, b); - - freed++; - bc->nr_freed++; - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - if (freed == nr) - goto out_rotate; - } else if (trigger_writes && - btree_node_dirty(b) && - !btree_node_will_make_reachable(b) && - !btree_node_write_blocked(b) && - six_trylock_read(&b->c.lock)) { - list_move(&list->list, &b->list); - mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_read(&b->c.lock); - if (touched >= nr) - goto out_nounlock; - mutex_lock(&bc->lock); - goto restart; - } - - if (touched >= nr) - break; - } -out_rotate: - if (&t->list != &list->list) - list_move_tail(&list->list, &t->list); -out: - mutex_unlock(&bc->lock); -out_nounlock: - ret = freed; - memalloc_nofs_restore(flags); - trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); - return ret; -} - -static unsigned long bch2_btree_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct btree_cache_list *list = shrink->private_data; - - if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) - return 0; - - return btree_cache_can_free(list); -} - -void bch2_fs_btree_cache_exit(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b, *t; - unsigned long flags; - - shrinker_free(bc->live[1].shrink); - shrinker_free(bc->live[0].shrink); - - /* vfree() can allocate memory: */ - flags = memalloc_nofs_save(); - mutex_lock(&bc->lock); - - if (c->verify_data) - list_move(&c->verify_data->list, &bc->live[0].list); - - kvfree(c->verify_ondisk); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (r->b) - list_add(&r->b->list, &bc->live[0].list); - } - - list_for_each_entry_safe(b, t, &bc->live[1].list, list) - bch2_btree_node_hash_remove(bc, b); - list_for_each_entry_safe(b, t, &bc->live[0].list, list) - bch2_btree_node_hash_remove(bc, b); - - list_for_each_entry_safe(b, t, &bc->freeable, list) { - BUG_ON(btree_node_read_in_flight(b) || - btree_node_write_in_flight(b)); - - btree_node_data_free(bc, b); - cond_resched(); - } - - BUG_ON(!bch2_journal_error(&c->journal) && - atomic_long_read(&c->btree_cache.nr_dirty)); - - list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - - list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { - list_del(&b->list); - six_lock_exit(&b->c.lock); - kfree(b); - } - -
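The shrinker scan above implements a second-chance (clock-style) eviction policy: a node found with its accessed bit set is spared once, with the bit cleared and the node left in place, and only becomes reclaimable on a later pass. The following userspace C sketch models just that policy; struct obj, scan() and the list layout are illustrative stand-ins, not bcachefs code.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
    struct obj *next;
    bool accessed;  /* set on lookup, cleared by the scan */
};

/* Walk up to nr_to_scan objects; recently-used ones get a second chance. */
static unsigned scan(struct obj **head, unsigned nr_to_scan)
{
    unsigned touched = 0, freed = 0;
    struct obj **p = head;

    while (*p) {
        struct obj *o = *p;

        touched++;
        if (o->accessed) {
            o->accessed = false;  /* second chance */
            touched--;            /* accessed nodes don't count, as above */
            p = &o->next;
        } else {
            *p = o->next;         /* unlink and reclaim */
            free(o);
            freed++;
        }
        if (touched >= nr_to_scan)
            break;
    }
    return freed;
}

int main(void)
{
    struct obj *head = NULL, **p = &head;

    for (int i = 0; i < 4; i++) {
        *p = calloc(1, sizeof(**p));
        (*p)->accessed = i & 1;   /* odd nodes were recently used */
        p = &(*p)->next;
    }
    printf("freed %u nodes\n", scan(&head, 8)); /* frees the two cold ones */
    return 0;
}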
mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - BUG_ON(bc->nr_by_btree[i]); - BUG_ON(bc->live[0].nr); - BUG_ON(bc->live[1].nr); - BUG_ON(bc->nr_freeable); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); -} - -int bch2_fs_btree_cache_init(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct shrinker *shrink; - unsigned i; - int ret = 0; - - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); - if (ret) - goto err; - - bc->table_init_done = true; - - bch2_recalc_btree_reserve(c); - - for (i = 0; i < bc->nr_reserve; i++) { - struct btree *b = __bch2_btree_node_mem_alloc(c); - if (!b) - goto err; - __bch2_btree_node_to_freelist(bc, b); - } - - list_splice_init(&bc->live[0].list, &bc->freeable); - - mutex_init(&c->verify_lock); - - shrink = shrinker_alloc(0, "%s-btree_cache", c->name); - if (!shrink) - goto err; - bc->live[0].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 2; - shrink->private_data = &bc->live[0]; - shrinker_register(shrink); - - shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); - if (!shrink) - goto err; - bc->live[1].shrink = shrink; - shrink->count_objects = bch2_btree_cache_count; - shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 8; - shrink->private_data = &bc->live[1]; - shrinker_register(shrink); - - return 0; -err: - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); -} - -void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -{ - mutex_init(&bc->lock); - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { - bc->live[i].idx = i; - INIT_LIST_HEAD(&bc->live[i].list); - } - INIT_LIST_HEAD(&bc->freeable); - INIT_LIST_HEAD(&bc->freed_pcpu); - INIT_LIST_HEAD(&bc->freed_nonpcpu); -} - -/* - * We can only have one thread cannibalizing other cached btree nodes at a time, - * or we'll deadlock. We use an open coded mutex to ensure that, which a - * cannibalize_bucket() will take. This means every time we unlock the root of - * the btree, we need to release this lock if we have it held. 
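As the comment above describes, cannibalization is serialized with an open-coded mutex: ownership is a task pointer claimed with a compare-and-swap, and a contender retries the claim once more after joining the waitlist so an unlock that races with the enqueue cannot strand it. Below is a hedged userspace model of that handshake using C11 atomics; struct task is a toy stand-in for current, and the closure waitlist is elided to comments.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct task { int id; };

static _Atomic(struct task *) alloc_lock;  /* NULL when unheld */

/* Fast path, mirroring try_cmpxchg(&bc->alloc_lock, &old, current) */
static bool cannibalize_trylock(struct task *cur)
{
    struct task *old = NULL;

    return atomic_compare_exchange_strong(&alloc_lock, &old, cur) ||
           old == cur;  /* already ours: treat as success */
}

static bool cannibalize_lock(struct task *cur)
{
    if (cannibalize_trylock(cur))
        return true;
    /*
     * The real code does closure_wait(&bc->alloc_wait, cl) here, then
     * retries: the second attempt catches an unlock that raced with us
     * joining the waitlist, so the wakeup can't be lost.
     */
    return cannibalize_trylock(cur);
}

static void cannibalize_unlock(struct task *cur)
{
    struct task *old = cur;

    if (atomic_compare_exchange_strong(&alloc_lock, &old, NULL)) {
        /* real code: closure_wake_up(&bc->alloc_wait) */
    }
}

int main(void)
{
    struct task a = { .id = 1 };

    printf("locked: %d\n", cannibalize_lock(&a));
    cannibalize_unlock(&a);
    return 0;
}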
- */ -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, trans); - bc->alloc_lock = NULL; - closure_wake_up(&bc->alloc_wait); - } -} - -int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct task_struct *old; - - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) - goto success; - - if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); - } - - closure_wait(&bc->alloc_wait, cl); - - /* Try again, after adding ourselves to waitlist */ - old = NULL; - if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { - /* We raced */ - closure_wake_up(&bc->alloc_wait); - goto success; - } - - trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); - -success: - trace_and_count(c, btree_cache_cannibalize_lock, trans); - return 0; -} - -static struct btree *btree_node_cannibalize(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b)) - return b; - - while (1) { - for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) - list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; - - /* - * Rare case: all nodes were intent-locked. - * Just busy-wait. - */ - WARN_ONCE(1, "btree cache cannibalize failed\n"); - cond_resched(); - } -} - -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct list_head *freed = pcpu_read_locks - ? &bc->freed_pcpu - : &bc->freed_nonpcpu; - struct btree *b, *b2; - u64 start_time = local_clock(); - - mutex_lock(&bc->lock); - - /* - * We never free struct btree itself, just the memory that holds the on - * disk node. Check the freed list before allocating a new one: - */ - list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { - list_del_init(&b->list); - goto got_node; - } - - b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (b) { - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); - } else { - mutex_unlock(&bc->lock); - bch2_trans_unlock(trans); - b = __btree_node_mem_alloc(c, GFP_KERNEL); - if (!b) - goto err; - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - mutex_lock(&bc->lock); - } - - BUG_ON(!six_trylock_intent(&b->c.lock)); - BUG_ON(!six_trylock_write(&b->c.lock)); - -got_node: - /* - * btree_free() doesn't free memory; it sticks the node on the end of - * the list. 
Check if there's any freed nodes there: - */ - list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - - list_del_init(&b2->list); - --bc->nr_freeable; - btree_node_to_freedlist(bc, b2); - mutex_unlock(&bc->lock); - - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - goto got_mem; - } - - mutex_unlock(&bc->lock); - - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { - bch2_trans_unlock(trans); - if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) - goto err; - } - -got_mem: - BUG_ON(!list_empty(&b->list)); - BUG_ON(btree_node_hashed(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_write_in_flight(b)); -out: - b->flags = 0; - b->written = 0; - b->nsets = 0; - b->sib_u64s[0] = 0; - b->sib_u64s[1] = 0; - b->whiteout_u64s = 0; - bch2_btree_keys_init(b); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], - start_time); - - int ret = bch2_trans_relock(trans); - if (unlikely(ret)) { - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); - } - - return b; -err: - mutex_lock(&bc->lock); - - /* Try to cannibalize another cached btree node: */ - if (bc->alloc_lock == current) { - b2 = btree_node_cannibalize(c); - clear_btree_node_just_written(b2); - __bch2_btree_node_hash_remove(bc, b2); - - if (b) { - swap(b->data, b2->data); - swap(b->aux_data, b2->aux_data); - btree_node_to_freedlist(bc, b2); - six_unlock_write(&b2->c.lock); - six_unlock_intent(&b2->c.lock); - } else { - b = b2; - } - - BUG_ON(!list_empty(&b->list)); - mutex_unlock(&bc->lock); - - trace_and_count(c, btree_cache_cannibalize, trans); - goto out; - } - - mutex_unlock(&bc->lock); - return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); -} - -/* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - enum six_lock_type lock_type, - bool sync) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - if (unlikely(level >= BTREE_MAX_DEPTH)) { - int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", - level, BTREE_MAX_DEPTH); - return ERR_PTR(ret); - } - - if (unlikely(!bkey_is_btree_ptr(&k->k))) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); - printbuf_exit(&buf); - return ERR_PTR(ret); - } - - /* - * Parent node must be locked, else we could read in a btree node that's - * been freed: - */ - if (path && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); - } - - b = bch2_btree_node_mem_alloc(trans, level != 0); - - if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { - if (!path) - return b; - - trans->memory_allocation_failure = true; - trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); - 
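The allocation strategy in bch2_btree_node_mem_alloc() above follows a common kernel pattern: attempt a non-sleeping (GFP_NOWAIT) allocation while locks are held, and only if that fails drop every lock before retrying with a blocking allocation. A small userspace analogue, assuming try_alloc()/slow_alloc() as illustrative stand-ins for the two GFP modes:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *try_alloc(size_t n)  { return malloc(n); } /* "GFP_NOWAIT" */
static void *slow_alloc(size_t n) { return malloc(n); } /* "GFP_KERNEL" */

static void *alloc_node(size_t n)
{
    pthread_mutex_lock(&cache_lock);
    void *p = try_alloc(n);
    if (!p) {
        /*
         * Can't sleep while holding locks: drop them first, as the code
         * above drops bc->lock and the btree transaction locks before
         * the GFP_KERNEL retry.
         */
        pthread_mutex_unlock(&cache_lock);
        p = slow_alloc(n);
        if (!p)
            return NULL;
        pthread_mutex_lock(&cache_lock);
    }
    /* ... update freelists under the lock ... */
    pthread_mutex_unlock(&cache_lock);
    return p;
}

int main(void)
{
    void *p = alloc_node(4096);

    printf("allocated: %s\n", p ? "yes" : "no");
    free(p);
    return 0;
}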
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); - } - - if (IS_ERR(b)) - return b; - - bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... */ - b->hash_val = 0; - - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - return NULL; - } - - set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - - if (path) { - u32 seq = six_lock_seq(&b->c.lock); - - /* Unlock before doing IO: */ - six_unlock_intent(&b->c.lock); - bch2_trans_unlock(trans); - - bch2_btree_node_read(trans, b, sync); - - int ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - if (!sync) - return NULL; - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - b = NULL; - } else { - bch2_btree_node_read(trans, b, sync); - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } - - return b; -} - -static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) -{ - struct printbuf buf = PRINTBUF; - - if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) - return; - - prt_printf(&buf, - "btree node header doesn't match ptr: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, "\nptr: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nheader: "); - bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); - prt_str(&buf, "\nmin "); - bch2_bpos_to_text(&buf, b->data->min_key); - - prt_printf(&buf, "\nmax "); - bch2_bpos_to_text(&buf, b->data->max_key); - - bch2_fs_topology_error(c, "%s", buf.buf); - - printbuf_exit(&buf); -} - -static inline void btree_check_header(struct bch_fs *c, struct btree *b) -{ - if (b->c.btree_id != BTREE_NODE_ID(b->data) || - b->c.level != BTREE_NODE_LEVEL(b->data) || - !bpos_eq(b->data->max_key, b->key.k.p) || - (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) - btree_bad_header(c, b); -} - -static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - bool need_relock = false; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - /* - * We must have the parent locked to call bch2_btree_node_fill(), - * else we could read in a btree node from disk that's been - * freed: - */ - b = bch2_btree_node_fill(trans, path, k, path->btree_id, - level, lock_type, true); - need_relock = true; - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b)) - return b; - } else { - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - goto retry; - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return 
ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - need_relock = true; - - bch2_btree_node_wait_on_read(b); - - ret = bch2_trans_relock(trans); - if (ret) - return ERR_PTR(ret); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (!six_relock_type(&b->c.lock, lock_type, seq)) - goto retry; - } - - if (unlikely(need_relock)) { - ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(ret); - } - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -/** - * bch2_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * @trans: btree transaction object - * @path: btree_path being traversed - * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) - * @level: level of btree node being looked up (0 == leaf node) - * @lock_type: SIX_LOCK_read or SIX_LOCK_intent - * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) - * - * The btree node will have either a read or a write lock held, depending on - * the @lock_type parameter.
- * - * Returns: btree node or ERR_PTR() - */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unnecessary transaction restart: - */ - if (unlikely(!c->opts.btree_node_mem_ptr_optimization || - !b || - b->hash_val != btree_ptr_hash_val(k))) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.level != level || - race_fault())) { - six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(trans, path, level + 1)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - - trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); - } - - if (unlikely(btree_node_read_in_flight(b))) { - six_unlock_type(&b->c.lock, lock_type); - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); - } - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - - if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - } - - EBUG_ON(b->c.btree_id != path->btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); - - return b; -} - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, - const struct bkey_i *k, - enum btree_id btree_id, - unsigned level, - bool nofill) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - int ret; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; - } -retry: - b = btree_cache_find(bc, k); - if (unlikely(!b)) { - if (nofill) - goto out; - - b = bch2_btree_node_fill(trans, NULL, k, btree_id, - level, SIX_LOCK_read, true); - - /* We raced and found the btree node in the cache */ - if (!b) - goto retry; - - if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(trans, NULL)) - goto retry; - - if (IS_ERR(b)) - goto out; - } else { -lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - - BUG_ON(ret); - - if (unlikely(b->hash_val != btree_ptr_hash_val(k) || - b->c.btree_id != btree_id || - b->c.level != level)) { - six_unlock_read(&b->c.lock); - goto retry; - } - - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - } - - /* XXX: waiting on IO with
btree locks held: */ - __bch2_btree_node_wait_on_read(b); - - prefetch(b->aux_data); - - for_each_bset(b, t) { - void *p = (u64 *) b->aux_data + t->aux_data_offset; - - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - } - - if (unlikely(btree_node_read_error(b))) { - six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); - goto out; - } - - EBUG_ON(b->c.btree_id != btree_id); - EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - btree_check_header(c, b); -out: - bch2_btree_cache_cannibalize_unlock(trans); - return b; -} - -int bch2_btree_node_prefetch(struct btree_trans *trans, - struct btree_path *path, - const struct bkey_i *k, - enum btree_id btree_id, unsigned level) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - - BUG_ON(path && !btree_node_locked(path, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); - - struct btree *b = btree_cache_find(bc, k); - if (b) - return 0; - - b = bch2_btree_node_fill(trans, path, k, btree_id, - level, SIX_LOCK_read, false); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret; - if (b) - six_unlock_read(&b->c.lock); - return 0; -} - -void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -{ - struct bch_fs *c = trans->c; - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = btree_cache_find(bc, k); - if (!b) - return; - - BUG_ON(b == btree_node_root(trans->c, b)); -wait_on_io: - /* not allowed to wait on io with btree locks held: */ - - /* XXX we're called from btree_gc which will be holding other btree - * nodes locked - */ - __bch2_btree_node_wait_on_read(b); - __bch2_btree_node_wait_on_write(b); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - if (unlikely(b->hash_val != btree_ptr_hash_val(k))) - goto out; - - if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - BUG_ON(btree_node_dirty(b)); - - mutex_lock(&bc->lock); - bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(bc, b); - mutex_unlock(&bc->lock); -out: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - -const char *bch2_btree_id_str(enum btree_id btree) -{ - return btree < BTREE_ID_NR ? 
__bch2_btree_ids[btree] : "(unknown)"; -} - -void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) -{ - if (btree < BTREE_ID_NR) - prt_str(out, __bch2_btree_ids[btree]); - else - prt_printf(out, "(unknown btree %u)", btree); -} - -void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) -{ - prt_str(out, "btree="); - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level=%u", level); -} - -void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - enum btree_id btree, unsigned level, struct bkey_s_c k) -{ - bch2_btree_id_to_text(out, btree); - prt_printf(out, " level %u/", level); - struct btree_root *r = bch2_btree_id_root(c, btree); - if (r) - prt_printf(out, "%u", r->level); - else - prt_printf(out, "(unknown)"); - prt_newline(out); - - bch2_bkey_val_to_text(out, c, k); -} - -void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); -} - -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) -{ - struct bset_stats stats; - - memset(&stats, 0, sizeof(stats)); - - bch2_btree_keys_stats(b, &stats); - - prt_printf(out, "l %u ", b->c.level); - bch2_bpos_to_text(out, b->data->min_key); - prt_printf(out, " - "); - bch2_bpos_to_text(out, b->data->max_key); - prt_printf(out, ":\n" - " ptrs: "); - bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, - " format: "); - bch2_bkey_format_to_text(out, &b->format); - - prt_printf(out, - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %u)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n", - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_buf_bytes(b) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - c->btree_foreground_merge_threshold, - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed); -} - -static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, - const char *label, size_t nr) -{ - prt_printf(out, "%s\t", label); - prt_human_readable_u64(out, nr * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", nr); -} - -static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { -#define x(n) #n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - NULL -}; - -void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_btree_cache_line(out, c, "live:", bc->live[0].nr); - prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); - prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); - prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); - prt_newline(out); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { - bch2_btree_id_to_text(out, i); - prt_printf(out, "\t"); - prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); - prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); - } - - prt_newline(out); - prt_printf(out, "counters since mount:\n"); - prt_printf(out, "freed:\t%zu\n", bc->nr_freed); - prt_printf(out, "not freed:\n"); - - for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) - prt_printf(out, " %s\t%llu\n", - bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); -} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h deleted file mode 100644 index be275f87a60e04..00000000000000 --- a/fs/bcachefs/btree_cache.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_CACHE_H -#define _BCACHEFS_BTREE_CACHE_H - -#include "bcachefs.h" -#include "btree_types.h" -#include "bkey_methods.h" - -extern const char * const bch2_btree_node_flags[]; - -struct btree_iter; - -void bch2_recalc_btree_reserve(struct bch_fs *); - -void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); - -void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); - -int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, - unsigned, enum btree_id); - -void bch2_node_pin(struct bch_fs *, struct btree *); -void bch2_btree_cache_unpin(struct bch_fs *); - -void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *); - -void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); -int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); - -void __btree_node_data_free(struct btree *); -struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); - -struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, - const struct bkey_i *, unsigned, - enum six_lock_type, unsigned long); - -struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, - enum btree_id, unsigned, bool); - -int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, - const struct bkey_i *, enum btree_id, unsigned); - -void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); - -void bch2_fs_btree_cache_exit(struct bch_fs *); -int bch2_fs_btree_cache_init(struct bch_fs *); -void bch2_fs_btree_cache_init_early(struct btree_cache *); - -static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -{ - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); - case KEY_TYPE_btree_ptr_v2: - /* - * The cast/deref is only necessary to avoid sparse endianness - * warnings: - */ - return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); - default: - return 0; - } -} - -static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) -{ - return k->k.type == KEY_TYPE_btree_ptr_v2 - ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr - : NULL; -} - -/* is btree node in hash table? 
*/ -static inline bool btree_node_hashed(struct btree *b) -{ - return b->hash_val != 0; -} - -#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ - for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ - &(_c)->btree_cache.table), \ - _iter = 0; _iter < (_tbl)->size; _iter++) \ - rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) - -static inline size_t btree_buf_bytes(const struct btree *b) -{ - return 1UL << b->byte_order; -} - -static inline size_t btree_buf_max_u64s(const struct btree *b) -{ - return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_max_u64s(const struct bch_fs *c) -{ - return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); -} - -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> SECTOR_SHIFT; -} - -static inline unsigned btree_blocks(const struct bch_fs *c) -{ - return btree_sectors(c) >> c->block_bits; -} - -#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) - -#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) - -static inline unsigned btree_id_nr_alive(struct bch_fs *c) -{ - return BTREE_ID_NR + c->btree_roots_extra.nr; -} - -static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) -{ - if (likely(id < BTREE_ID_NR)) { - return &c->btree_roots_known[id]; - } else { - unsigned idx = id - BTREE_ID_NR; - - /* This can happen when we're called from btree_node_scan */ - if (idx >= c->btree_roots_extra.nr) - return NULL; - - return &c->btree_roots_extra.data[idx]; - } -} - -static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) -{ - struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); - - return r ? r->b : NULL; -} - -const char *bch2_btree_id_str(enum btree_id); /* avoid */ -void bch2_btree_id_to_text(struct printbuf *, enum btree_id); -void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); - -void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, - enum btree_id, unsigned, struct bkey_s_c); -void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); - -#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c deleted file mode 100644 index bac108e93823c2..00000000000000 --- a/fs/bcachefs/btree_gc.c +++ /dev/null @@ -1,1308 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet - * Copyright (C) 2014 Datera Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_methods.h" -#include "bkey_buf.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "debug.h" -#include "disk_accounting.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "progress.h" -#include "recovery_passes.h" -#include "reflink.h" -#include "recovery.h" -#include "replicas.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include - -#define DROP_THIS_NODE 10 -#define DROP_PREV_NODE 11 -#define DID_FILL_FROM_SCAN 12 - -/* - * Returns true if it's a btree we can easily reconstruct, or otherwise won't - * cause data loss if it's missing: - */ -static bool btree_id_important(enum btree_id btree) -{ - if (btree_id_is_alloc(btree)) - return false; - - switch (btree) { - case BTREE_ID_quotas: - case BTREE_ID_snapshot_trees: - case BTREE_ID_logged_ops: - case BTREE_ID_rebalance_work: - case BTREE_ID_subvolume_children: - return false; - default: - return true; - } -} - -static const char * const bch2_gc_phase_strs[] = { -#define x(n) #n, - GC_PHASES() -#undef x - NULL -}; - -void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) -{ - prt_str(out, bch2_gc_phase_strs[p->phase]); - prt_char(out, ' '); - bch2_btree_id_level_to_text(out, p->btree, p->level); - prt_char(out, ' '); - bch2_bpos_to_text(out, p->pos); -} - -static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) -{ - return (struct bkey_s) {{{ - (struct bkey *) k.k, - (struct bch_val *) k.v - }}}; -} - -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - preempt_disable(); - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); - preempt_enable(); -} - -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); - __gc_pos_set(c, new_pos); -} - -static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) -{ - switch (b->key.k.type) { - case KEY_TYPE_btree_ptr: { - struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); - - dst->k.p = src->k.p; - dst->v.mem_ptr = 0; - dst->v.seq = b->data->keys.seq; - dst->v.sectors_written = 0; - dst->v.flags = 0; - dst->v.min_key = b->data->min_key; - set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); - memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); - break; - } - case KEY_TYPE_btree_ptr_v2: - bkey_copy(&dst->k_i, &b->key); - break; - default: - BUG(); - } -} - -static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, " -> "); - bch2_bpos_to_text(&buf, new_min); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); - if (!new) - return bch_err_throw(c, ENOMEM_gc_repair_key); - - btree_ptr_to_v2(b, new); - b->data->min_key = new_min; - new->v.min_key = new_min; - 
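btree_check_node_boundaries(), just below, classifies each pair of adjacent children as contiguous, separated by a gap (fill from scanned nodes when available, otherwise clamp the current node's min_key), or overlapping (clamp or drop one of the nodes). The following toy model restates that decision tree over plain integer keys; the names and the have_scan_data flag are illustrative, and the real code additionally consults BTREE_NODE_SEQ to choose which overlapping node to drop, which is elided here.

#include <stdio.h>

enum repair { OK, FILL_GAP, SET_CUR_MIN, SET_PREV_MAX };

static enum repair check_boundary(unsigned long expected_start,
                                  unsigned long cur_min,
                                  int have_scan_data)
{
    if (expected_start == cur_min)
        return OK;                 /* children are contiguous */
    if (expected_start < cur_min)  /* gap between children */
        return have_scan_data ? FILL_GAP : SET_CUR_MIN;
    /* overlap: clamp a boundary (or drop a node, per BTREE_NODE_SEQ) */
    return SET_PREV_MAX;
}

int main(void)
{
    printf("%d\n", check_boundary(10, 10, 0)); /* OK */
    printf("%d\n", check_boundary(10, 20, 1)); /* FILL_GAP */
    printf("%d\n", check_boundary(20, 10, 0)); /* SET_PREV_MAX */
    return 0;
}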
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - bkey_copy(&b->key, &new->k_i); - return 0; -} - -static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) -{ - struct bkey_i_btree_ptr_v2 *new; - int ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, " -> "); - bch2_bpos_to_text(&buf, new_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); - if (ret) - return ret; - - new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); - if (!new) - return bch_err_throw(c, ENOMEM_gc_repair_key); - - btree_ptr_to_v2(b, new); - b->data->max_key = new_max; - new->k.p = new_max; - SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - - ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); - if (ret) { - kfree(new); - return ret; - } - - bch2_btree_node_drop_keys_outside_node(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, &new->k_i); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - return 0; -} - -static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b, - struct btree *prev, struct btree *cur, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct bpos expected_start = !prev - ? b->data->min_key - : bpos_successor(prev->key.k.p); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - if (bpos_eq(expected_start, cur->data->min_key)) - return 0; - - prt_printf(&buf, " at "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, ":\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - if (prev) { - prt_printf(&buf, "\nprev: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); - } - - prt_str(&buf, "\nnext: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - - if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, cur->data->min_key)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - expected_start, - bpos_predecessor(cur->data->min_key)); - if (ret) - goto err; - - *pulled_from_scan = cur->data->min_key; - ret = DID_FILL_FROM_SCAN; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } else { /* overlap */ - if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ - if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node%s", buf.buf)) - ret = DROP_PREV_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } - } else { - if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? 
*/ - if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node%s", buf.buf)) - ret = DROP_THIS_NODE; - } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) - ret = set_node_min(c, cur, expected_start); - } - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, - struct btree *child, struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (bpos_eq(child->key.k.p, b->key.k.p)) - return 0; - - prt_printf(&buf, "\nat: "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, "\nparent: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - prt_str(&buf, "\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); - - if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, - "btree node with incorrect max_key%s", buf.buf)) { - if (b->c.level == 1 && - bpos_lt(*pulled_from_scan, b->key.k.p)) { - ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, - bpos_successor(child->key.k.p), b->key.k.p); - if (ret) - goto err; - - *pulled_from_scan = b->key.k.p; - ret = DID_FILL_FROM_SCAN; - } else { - ret = set_node_max(c, child, b->key.k.p); - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, - struct bpos *pulled_from_scan) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf prev_k, cur_k; - struct btree *prev = NULL, *cur = NULL; - bool have_child, new_pass = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev_k); - bch2_bkey_buf_init(&cur_k); -again: - cur = prev = NULL; - have_child = new_pass = false; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - - bch2_btree_and_journal_iter_advance(&iter); - bch2_bkey_buf_reassemble(&cur_k, c, k); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - - if (bch2_err_matches(ret, EIO)) { - bch2_btree_node_evict(trans, cur_k.k); - cur = NULL; - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - if (ret) - break; - continue; - } - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - break; - - if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - ret = lockrestart_do(trans, - btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); - if (ret < 0) - goto err; - - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - - if (ret == DROP_THIS_NODE) { - six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(trans, cur_k.k); - ret = 
bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - cur = NULL; - if (ret) - break; - continue; - } - - if (prev) - six_unlock_read(&prev->c.lock); - prev = NULL; - - if (ret == DROP_PREV_NODE) { - bch_info(c, "dropped prev node"); - bch2_btree_node_evict(trans, prev_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, prev_k.k->k.p); - if (ret) - break; - - bch2_btree_and_journal_iter_exit(&iter); - goto again; - } else if (ret) - break; - - prev = cur; - cur = NULL; - bch2_bkey_buf_copy(&prev_k, c, cur_k.k); - } - - if (!ret && !IS_ERR_OR_NULL(prev)) { - BUG_ON(cur); - ret = lockrestart_do(trans, - btree_repair_node_end(trans, b, prev, pulled_from_scan)); - if (ret == DID_FILL_FROM_SCAN) { - new_pass = true; - ret = 0; - } - } - - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - prev = NULL; - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret) - goto err; - - bch2_btree_and_journal_iter_exit(&iter); - - if (new_pass) - goto again; - - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&cur_k, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - cur = bch2_btree_node_get_noiter(trans, cur_k.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(cur); - - bch_err_msg(c, ret, "getting btree node"); - if (ret) - goto err; - - ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); - six_unlock_read(&cur->c.lock); - cur = NULL; - - if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); - new_pass = true; - } - - if (ret) - goto err; - - have_child = true; - } - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - /* - * XXX: we're not passing the trans object here because we're not set up - * to handle a transaction restart - this code needs to be rewritten - * when we start doing online topology repair - */ - bch2_trans_unlock_long(trans); - if (mustfix_fsck_err_on(!have_child, - c, btree_node_topology_interior_node_empty, - "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; -err: -fsck_err: - if (!IS_ERR_OR_NULL(prev)) - six_unlock_read(&prev->c.lock); - if (!IS_ERR_OR_NULL(cur)) - six_unlock_read(&cur->c.lock); - - bch2_btree_and_journal_iter_exit(&iter); - - if (!ret && new_pass) - goto again; - - BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); - - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, - bool *reconstructed_root) -{ - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree); - struct printbuf buf = PRINTBUF; - int ret = 0; - - bch2_btree_id_to_text(&buf, btree); - - if (r->error) { - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - - ret = bch2_btree_has_scanned_nodes(c, btree); - if (ret < 0) - goto err; - - if (!ret) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(btree) ? 
FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 0); - } else { - r->alive = false; - r->error = 0; - bch2_btree_root_alloc_fake_trans(trans, btree, 1); - - bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); - if (ret) - goto err; - } - - *reconstructed_root = true; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_topology(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; - int ret = 0; - - bch2_trans_srcu_unlock(trans); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - bool reconstructed_root = false; -recover: - ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); - if (ret) - break; - - struct btree_root *r = bch2_btree_id_root(c, i); - struct btree *b = r->b; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); - six_unlock_read(&b->c.lock); - - if (ret == DROP_THIS_NODE) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - r->b = NULL; - - if (!reconstructed_root) { - r->error = -EIO; - goto recover; - } - - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, i); - bch_err(c, "empty btree root %s", buf.buf); - printbuf_exit(&buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - r->alive = false; - ret = 0; - } - } - - bch2_trans_put(trans); - return ret; -} - -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, struct btree **prev, - struct btree_iter *iter, struct bkey_s_c k, - bool initial) -{ - struct bch_fs *c = trans->c; - - if (iter) { - struct btree_path *path = btree_iter_path(trans, iter); - struct btree *b = path_l(path)->b; - - if (*prev != b) { - int ret = bch2_btree_node_check_topology(trans, b); - if (ret) - return ret; - } - *prev = b; - } - - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - struct printbuf buf = PRINTBUF; - int ret = 0; - - deleted.p = k.k->p; - - if (initial) { - BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && - k.k->bversion.lo > atomic64_read(&c->journal.seq)); - - if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->bversion.lo > atomic64_read(&c->key_version), - trans, bkey_version_in_future, - "key version number higher than recorded %llu\n%s", - atomic64_read(&c->key_version), - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->bversion.lo); - } - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), - trans, btree_bitmap_not_marked, - "btree ptr not marked in member info btree allocated bitmap\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, k); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - /* - * We require a commit before key_trigger() because - * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the - * wrong result if we run it multiple times. - */ - unsigned flags = !iter ?
BTREE_TRIGGER_is_root : 0; - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_check_repair|flags); - if (ret) - goto out; - - if (trans->nr_updates) { - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), - BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); -out: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_btree(struct btree_trans *trans, - struct progress_indicator_state *progress, - enum btree_id btree, bool initial) -{ - struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; - int ret = 0; - - /* We need to make sure every leaf node is readable before going RW */ - if (initial) - target_depth = 0; - - for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { - struct btree *prev = NULL; - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, - BTREE_ITER_prefetch); - - ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); - gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); - bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); - })); - if (ret) - goto err; - } - - /* root */ - do { -retry_root: - bch2_trans_begin(trans); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, - 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto err_root; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); -err_root: - bch2_trans_iter_exit(trans, &iter); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); -err: - bch_err_fn(c, ret); - return ret; -} - -static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -{ - return cmp_int(gc_btree_order(l), gc_btree_order(r)); -} - -static int bch2_gc_btrees(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct progress_indicator_state progress; - bch2_progress_init(&progress, c, ~0ULL); - - enum btree_id ids[BTREE_ID_NR]; - for (unsigned i = 0; i < BTREE_ID_NR; i++) - ids[i] = i; - bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - unsigned btree = i < BTREE_ID_NR ? 
ids[i] : i; - - if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) - continue; - - ret = bch2_gc_btree(trans, &progress, btree, true); - } - - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_mark_superblocks(struct bch_fs *c) -{ - gc_pos_set(c, gc_phase(GC_PHASE_sb)); - - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); -} - -static void bch2_gc_free(struct bch_fs *c) -{ - bch2_accounting_gc_free(c); - - genradix_free(&c->reflink_gc_table); - genradix_free(&c->gc_stripes); - - for_each_member_device(c, ca) - genradix_free(&ca->buckets_gc); -} - -static int bch2_gc_start(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - int ret = bch2_dev_usage_init(ca, true); - if (ret) { - bch2_dev_put(ca); - return ret; - } - } - - return 0; -} - -/* returns true if not equal */ -static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, - struct bch_alloc_v4 r) -{ - return l.gen != r.gen || - l.oldest_gen != r.oldest_gen || - l.data_type != r.data_type || - l.dirty_sectors != r.dirty_sectors || - l.stripe_sectors != r.stripe_sectors || - l.cached_sectors != r.cached_sectors || - l.stripe_redundancy != r.stripe_redundancy || - l.stripe != r.stripe; -} - -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev *ca, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_gc, gc, old_convert, new; - const struct bch_alloc_v4 *old; - int ret; - - if (!bucket_valid(ca, k.k->p.offset)) - return 0; - - old = bch2_alloc_to_v4(k, &old_convert); - gc = new = *old; - - __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); - - old_gc = gc; - - if ((old->data_type == BCH_DATA_sb || - old->data_type == BCH_DATA_journal) && - !bch2_dev_is_online(ca)) { - gc.data_type = old->data_type; - gc.dirty_sectors = old->dirty_sectors; - } - - /* - * gc.data_type doesn't yet include need_discard & need_gc_gen states - - * fix that here: - */ - alloc_data_type_set(&gc, gc.data_type); - if (gc.data_type != old_gc.data_type || - gc.dirty_sectors != old_gc.dirty_sectors) { - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); - if (ret) - return ret; - - /* - * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not - * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so - * we don't run it twice: - */ - struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); - gc_m->data_type = gc.data_type; - gc_m->dirty_sectors = gc.dirty_sectors; - } - - if (fsck_err_on(new.data_type != gc.data_type, - trans, alloc_key_data_type_wrong, - "bucket %llu:%llu gen %u has wrong data_type" - ": got %s, should be %s", - iter->pos.inode, iter->pos.offset, - gc.gen, - bch2_data_type_str(new.data_type), - bch2_data_type_str(gc.data_type))) - new.data_type = gc.data_type; - -#define copy_bucket_field(_errtype, _f) \ - if (fsck_err_on(new._f != gc._f, \ - trans, _errtype, \ - "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %llu, should be %llu", \ - iter->pos.inode, iter->pos.offset, \ - gc.gen, \ - bch2_data_type_str(gc.data_type), \ - (u64) new._f, (u64) gc._f)) \ - new._f = gc._f; \ - - copy_bucket_field(alloc_key_gen_wrong, gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); - copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); -#undef copy_bucket_field - - if (!bch2_alloc_v4_cmp(*old, new)) - return 0; - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - a->v = new; - - /* - * The trigger normally makes sure these are set, but we're not running - * triggers: - */ - if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) - a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); -fsck_err: - return ret; -} - -static int bch2_gc_alloc_done(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_key(trans, &iter, ca, k))); - if (ret) { - bch2_dev_put(ca); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_alloc_start(struct bch_fs *c) -{ - int ret = 0; - - for_each_member_device(c, ca) { - ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); - if (ret) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_alloc_start); - break; - } - } - - bch_err_fn(c, ret); - return ret; -} - -static int bch2_gc_write_stripes_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - const struct bch_stripe *s; - struct gc_stripe *m; - bool bad = false; - unsigned i; - int ret = 0; - - if (k.k->type != KEY_TYPE_stripe) - return 0; - - s = bkey_s_c_to_stripe(k).v; - m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - - for (i = 0; i < s->nr_blocks; i++) { - u32 old = stripe_blockcount_get(s, i); - u32 new = (m ? 
m->block_sectors[i] : 0); - - if (old != new) { - prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", - i, old, new); - bad = true; - } - } - - if (bad) - bch2_bkey_val_to_text(&buf, c, k); - - if (fsck_err_on(bad, - trans, stripe_sector_count_wrong, - "%s", buf.buf)) { - struct bkey_i_stripe *new; - - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_reassemble(&new->k_i, k); - - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); - - ret = bch2_trans_update(trans, iter, &new->k_i, 0); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_stripes_done(struct bch_fs *c) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_stripes_key(trans, &iter, k))); -} - -/** - * bch2_check_allocations - walk all references to buckets, and recompute them: - * - * @c: filesystem object - * - * Returns: 0 on success, or standard errcode on failure - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. - * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ -int bch2_check_allocations(struct bch_fs *c) -{ - int ret; - - down_read(&c->state_lock); - down_write(&c->gc_lock); - - bch2_btree_interior_updates_flush(c); - - ret = bch2_gc_accounting_start(c) ?: - bch2_gc_start(c) ?: - bch2_gc_alloc_start(c) ?: - bch2_gc_reflink_start(c); - if (ret) - goto out; - - gc_pos_set(c, gc_phase(GC_PHASE_start)); - - ret = bch2_mark_superblocks(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto out; - - ret = bch2_gc_btrees(c); - if (ret) - goto out; - - c->gc_count++; - - ret = bch2_gc_alloc_done(c) ?: - bch2_gc_accounting_done(c) ?: - bch2_gc_stripes_done(c) ?: - bch2_gc_reflink_done(c); -out: - percpu_down_write(&c->mark_lock); - /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); - - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - - up_write(&c->gc_lock); - up_read(&c->state_lock); - - /* - * At startup, allocations can happen directly instead of via the - * allocator thread - issue wakeup in case they blocked on gc_lock: - */ - closure_wake_up(&c->freelist_wait); - - if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) - bch2_sb_members_clean_deleted(c); - - bch_err_fn(c, ret); - return ret; -} - -static int gc_btree_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) - return -EROFS; - - bool too_stale = false; - scoped_guard(rcu) { - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - too_stale |= dev_ptr_stale(ca, ptr) > 16; - } - - if (!too_stale) - 
bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; - } - } - - if (too_stale) { - struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bch2_extent_normalize(c, bkey_i_to_s(u)); - } - - return 0; -} - -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, - struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - struct bkey_i_alloc_v4 *a_mut; - int ret; - - if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) - return 0; - - a_mut = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - return ret; - - a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - - return bch2_trans_update(trans, iter, &a_mut->k_i, 0); -} - -int bch2_gc_gens(struct bch_fs *c) -{ - u64 b, start_time = local_clock(); - int ret; - - if (!mutex_trylock(&c->gc_gens_lock)) - return 0; - - trace_and_count(c, gc_gens_start, c); - - /* - * We have to use trylock here. Otherwise, we would - * introduce a deadlock in the RO path - we take the - * state lock at the start of going RO. - */ - if (!down_read_trylock(&c->state_lock)) { - mutex_unlock(&c->gc_gens_lock); - return 0; - } - - for_each_member_device(c, ca) { - struct bucket_gens *gens = bucket_gens(ca); - - BUG_ON(ca->oldest_gen); - - ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); - if (!ca->oldest_gen) { - bch2_dev_put(ca); - ret = bch_err_throw(c, ENOMEM_gc_gens); - goto err; - } - - for (b = gens->first_bucket; - b < gens->nbuckets; b++) - ca->oldest_gen[b] = gens->b[b]; - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - if (btree_type_has_ptrs(i)) { - c->gc_gens_btree = i; - c->gc_gens_pos = POS_MIN; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - gc_btree_gens_key(trans, &iter, k))); - if (ret) - goto err; - } - - struct bch_dev *ca = NULL; - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_prefetch, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ - ca = bch2_dev_iterate(c, ca, k.k->p.inode); - if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); - continue; - } - bch2_alloc_write_oldest_gen(trans, ca, &iter, k); - }))); - bch2_dev_put(ca); - - if (ret) - goto err; - - c->gc_gens_btree = 0; - c->gc_gens_pos = POS_MIN; - - c->gc_count++; - - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - trace_and_count(c, gc_gens_end, c); -err: - for_each_member_device(c, ca) { - kvfree(ca->oldest_gen); - ca->oldest_gen = NULL; - } - - up_read(&c->state_lock); - mutex_unlock(&c->gc_gens_lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -static void bch2_gc_gens_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); - bch2_gc_gens(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_gc_gens_async(struct bch_fs *c) -{ - if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && - !queue_work(c->write_ref_wq, &c->gc_gens_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); -} - -void bch2_fs_btree_gc_init_early(struct bch_fs *c) -{ - 
seqcount_init(&c->gc_pos_lock); - INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); - - init_rwsem(&c->gc_lock); - mutex_init(&c->gc_gens_lock); -} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h deleted file mode 100644 index ec77662369a2f5..00000000000000 --- a/fs/bcachefs/btree_gc.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_H -#define _BCACHEFS_BTREE_GC_H - -#include "bkey.h" -#include "btree_gc_types.h" -#include "btree_types.h" - -int bch2_check_topology(struct bch_fs *); -int bch2_check_allocations(struct bch_fs *); - -/* - * For concurrent mark and sweep (with other index updates), we define a total - * ordering of _all_ references GC walks: - * - * Note that some references will have the same GC position as others - e.g. - * everything within the same btree node; in those cases we're relying on - * whatever locking exists for where those references live, i.e. the write lock - * on a btree node. - * - * That locking is also required to ensure GC doesn't pass the updater in - * between the updater adding/removing the reference and updating the GC marks; - * without that, we would at best double count sometimes. - * - * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ - * be held that prevents GC from passing the position the updater is at. - * - * (What about the start of gc, when we're clearing all the marks? GC clears the - * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc - * position inside its cmpxchg loop, so crap magically works). - */ - -/* Position of (the start of) a gc phase: */ -static inline struct gc_pos gc_phase(enum gc_phase phase) -{ - return (struct gc_pos) { .phase = phase, }; -} - -static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, - struct bpos pos) -{ - return (struct gc_pos) { - .phase = GC_PHASE_btree, - .btree = btree, - .level = level, - .pos = pos, - }; -} - -static inline int gc_btree_order(enum btree_id btree) -{ - if (btree == BTREE_ID_alloc) - return -2; - if (btree == BTREE_ID_stripes) - return -1; - return btree; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - cmp_int(gc_btree_order(l.btree), - gc_btree_order(r.btree)) ?: - cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -{ - unsigned seq; - bool ret; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(pos, c->gc_pos) <= 0; - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - return ret; -} - -void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); - -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_gens_async(struct bch_fs *); - -void bch2_fs_btree_gc_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h deleted file mode 100644 index c24dd6edf3773a..00000000000000 --- a/fs/bcachefs/btree_gc_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_GC_TYPES_H -#define _BCACHEFS_BTREE_GC_TYPES_H - -#include - -#define GC_PHASES() \ - x(not_running) \ - x(start) \ - x(sb) \ - x(btree) - -enum gc_phase { -#define x(n) GC_PHASE_##n, - GC_PHASES() -#undef x -}; - -struct gc_pos { - enum gc_phase phase:8; - enum btree_id btree:8; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef 
GENRADIX(struct reflink_gc) reflink_gc_table; - -#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c deleted file mode 100644 index 590cd29f3e86cb..00000000000000 --- a/fs/bcachefs/btree_io.c +++ /dev/null @@ -1,2742 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "checksum.h" -#include "debug.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "recovery.h" -#include "super-io.h" -#include "trace.h" - -#include - -static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) -{ - bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); - prt_str(out, "min: "); - bch2_bpos_to_text(out, bn->min_key); - prt_newline(out); - prt_str(out, "max: "); - bch2_bpos_to_text(out, bn->max_key); -} - -void bch2_btree_node_io_unlock(struct btree *b) -{ - EBUG_ON(!btree_node_write_in_flight(b)); - - clear_btree_node_write_in_flight_inner(b); - clear_btree_node_write_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -} - -void bch2_btree_node_io_lock(struct btree *b) -{ - wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void __bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_read(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); -} - -void bch2_btree_node_wait_on_write(struct btree *b) -{ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); -} - -static void verify_no_dups(struct btree *b, - struct bkey_packed *start, - struct bkey_packed *end) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k, *p; - - if (start == end) - return; - - for (p = start, k = bkey_p_next(start); - k != end; - p = k, k = bkey_p_next(k)) { - struct bkey l = bkey_unpack_key(b, p); - struct bkey r = bkey_unpack_key(b, k); - - BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); - } -#endif -} - -static void set_needs_whiteout(struct bset *i, int v) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - k->needs_whiteout = v; -} - -static void btree_bounce_free(struct bch_fs *c, size_t size, - bool used_mempool, void *p) -{ - if (used_mempool) - mempool_free(p, &c->btree_bounce_pool); - else - kvfree(p); -} - -static void *btree_bounce_alloc(struct bch_fs *c, size_t size, - bool *used_mempool) -{ - unsigned flags = memalloc_nofs_save(); - void *p; - - BUG_ON(size > c->opts.btree_node_size); - - *used_mempool = false; - p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); - if (!p) { - *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - } - memalloc_nofs_restore(flags); - return p; -} - -static void sort_bkey_ptrs(const struct btree *bt, - struct bkey_packed **ptrs, unsigned 
nr) -{ - unsigned n = nr, a = nr / 2, b, c, d; - - if (!a) - return; - - /* Heap sort: see lib/sort.c: */ - while (1) { - if (a) - a--; - else if (--n) - swap(ptrs[0], ptrs[n]); - else - break; - - for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bch2_bkey_cmp_packed(bt, - ptrs[c], - ptrs[d]) >= 0 ? c : d; - if (d == n) - b = c; - - while (b != a && - bch2_bkey_cmp_packed(bt, - ptrs[a], - ptrs[b]) >= 0) - b = (b - 1) / 2; - c = b; - while (b != a) { - b = (b - 1) / 2; - swap(ptrs[b], ptrs[c]); - } - } -} - -static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -{ - struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; - bool used_mempool = false; - size_t bytes = b->whiteout_u64s * sizeof(u64); - - if (!b->whiteout_u64s) - return; - - new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - - ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - - for (k = unwritten_whiteouts_start(b); - k != unwritten_whiteouts_end(b); - k = bkey_p_next(k)) - *--ptrs = k; - - sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); - - k = new_whiteouts; - - while (ptrs != ptrs_end) { - bkey_p_copy(k, *ptrs); - k = bkey_p_next(k); - ptrs++; - } - - verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - - memcpy_u64s(unwritten_whiteouts_start(b), - new_whiteouts, b->whiteout_u64s); - - btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -} - -static bool should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, enum compact_mode mode) -{ - if (!bset_dead_u64s(b, t)) - return false; - - switch (mode) { - case COMPACT_LAZY: - return should_compact_bset_lazy(b, t) || - (compacting && !bset_written(b, bset(b, t))); - case COMPACT_ALL: - return true; - default: - BUG(); - } -} - -static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -{ - bool ret = false; - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && !bset_written(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t - 1)); - } - - if (src != dst) - ret = true; - - if (!should_compact_bset(b, t, ret, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - start = btree_bkey_first(b, t); - end = btree_bkey_last(b, t); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_p_next(k); - - if (!bkey_deleted(k)) { - bkey_p_copy(out, k); - out = bkey_p_next(out); - } else { - BUG_ON(k->needs_whiteout); - } - } - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - ret = true; - } - - bch2_verify_btree_nr_keys(b); - - bch2_btree_build_aux_trees(b); - - return ret; -} - -bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) -{ - return bch2_drop_whiteouts(b, mode); -} - -static void btree_node_sort(struct bch_fs *c, struct btree *b, - unsigned start_idx, - unsigned end_idx) -{ - struct btree_node *out; - struct sort_iter_stack sort_iter; - struct bset_tree *t; - struct bset *start_bset = bset(b, &b->set[start_idx]); - bool used_mempool = false; - u64 start_time, seq = 0; - unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; - bool 
sorting_entire_node = start_idx == 0 && - end_idx == b->nsets; - - sort_iter_stack_init(&sort_iter, b); - - for (t = b->set + start_idx; - t < b->set + end_idx; - t++) { - u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - } - - bytes = sorting_entire_node - ? btree_buf_bytes(b) - : __vstruct_bytes(struct btree_node, u64s); - - out = btree_bounce_alloc(c, bytes, &used_mempool); - - start_time = local_clock(); - - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); - - out->keys.u64s = cpu_to_le16(u64s); - - BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); - - if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - /* Make sure we preserve bset journal_seq: */ - for (t = b->set + start_idx; t < b->set + end_idx; t++) - seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); - start_bset->journal_seq = cpu_to_le64(seq); - - if (sorting_entire_node) { - u64s = le16_to_cpu(out->keys.u64s); - - BUG_ON(bytes != btree_buf_bytes(b)); - - /* - * Our temporary buffer is the same size as the btree node's - * buffer, we can just swap buffers instead of doing a big - * memcpy() - */ - *out = *b->data; - out->keys.u64s = cpu_to_le16(u64s); - swap(out, b->data); - set_btree_bset(b, b->set, &b->data->keys); - } else { - start_bset->u64s = out->keys.u64s; - memcpy_u64s(start_bset->start, - out->keys.start, - le16_to_cpu(out->keys.u64s)); - } - - for (i = start_idx + 1; i < end_idx; i++) - b->nr.bset_u64s[start_idx] += - b->nr.bset_u64s[i]; - - b->nsets -= shift; - - for (i = start_idx + 1; i < b->nsets; i++) { - b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; - b->set[i] = b->set[i + shift]; - } - - for (i = b->nsets; i < MAX_BSETS; i++) - b->nr.bset_u64s[i] = 0; - - set_btree_bset_end(b, &b->set[start_idx]); - bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - - btree_bounce_free(c, bytes, used_mempool, out); - - bch2_verify_btree_nr_keys(b); -} - -void bch2_btree_sort_into(struct bch_fs *c, - struct btree *dst, - struct btree *src) -{ - struct btree_nr_keys nr; - struct btree_node_iter src_iter; - u64 start_time = local_clock(); - - BUG_ON(dst->nsets != 1); - - bch2_bset_set_no_aux_tree(dst, dst->set); - - bch2_btree_node_iter_init_from_start(&src_iter, src); - - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], - start_time); - - set_btree_bset_end(dst, dst->set); - - dst->nr.live_u64s += nr.live_u64s; - dst->nr.bset_u64s[0] += nr.bset_u64s[0]; - dst->nr.packed_keys += nr.packed_keys; - dst->nr.unpacked_keys += nr.unpacked_keys; - - bch2_verify_btree_nr_keys(dst); -} - -/* - * We're about to add another bset to the btree node, so if there's currently - * too many bsets - sort some of them together: - */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b) -{ - unsigned unwritten_idx; - bool ret = false; - - for (unwritten_idx = 0; - unwritten_idx < b->nsets; - unwritten_idx++) - if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) - break; - - if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, b->nsets); - ret = true; - } - - if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx); - ret = true; - } - - return ret; -} - -void bch2_btree_build_aux_trees(struct btree *b) -{ - for_each_bset(b, t) - bch2_bset_build_aux_tree(b, t, - !bset_written(b, bset(b, t)) && - t == bset_tree_last(b)); -} - -/* - * If we have MAX_BSETS 
(3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
-	unsigned mid_u64s_bits =
-		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
-	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
-
-/*
- * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * Returns true if we sorted (i.e. invalidated iterators)
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_node_entry *bne;
-	bool reinit_iter = false;
-
-	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
-	BUG_ON(bset_written(b, bset(b, &b->set[1])));
-	BUG_ON(btree_node_just_written(b));
-
-	if (b->nsets == MAX_BSETS &&
-	    !btree_node_write_in_flight(b) &&
-	    should_compact_all(c, b)) {
-		bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
-					    BTREE_WRITE_init_next_bset);
-		reinit_iter = true;
-	}
-
-	if (b->nsets == MAX_BSETS &&
-	    btree_node_compact(c, b))
-		reinit_iter = true;
-
-	BUG_ON(b->nsets >= MAX_BSETS);
-
-	bne = want_new_bset(c, b);
-	if (bne)
-		bch2_bset_init_next(b, bne);
-
-	bch2_btree_build_aux_trees(b);
-
-	if (reinit_iter)
-		bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
-			  struct bch_dev *ca,
-			  bool print_pos,
-			  struct btree *b, struct bset *i, struct bkey_packed *k,
-			  unsigned offset, int rw)
-{
-	if (print_pos) {
-		prt_str(out, rw == READ
-			? "error validating btree node "
-			: "corrupt btree node before write ");
-		prt_printf(out, "at btree ");
-		bch2_btree_pos_to_text(out, c, b);
-		prt_newline(out);
-	}
-
-	if (ca)
-		prt_printf(out, "%s ", ca->name);
-
-	prt_printf(out, "node offset %u/%u",
-		   b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
-	if (i)
-		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
-	if (k)
-		prt_printf(out, " bset byte offset %lu",
-			   (unsigned long)(void *)k -
-			   ((unsigned long)(void *)i & ~511UL));
-	prt_str(out, ": ");
-}
-
-__printf(11, 12)
-static int __btree_err(int ret,
-		       struct bch_fs *c,
-		       struct bch_dev *ca,
-		       struct btree *b,
-		       struct bset *i,
-		       struct bkey_packed *k,
-		       int rw,
-		       enum bch_sb_error_id err_type,
-		       struct bch_io_failures *failed,
-		       struct printbuf *err_msg,
-		       const char *fmt, ...)
-{
-	if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
-		return ret == -BCH_ERR_btree_node_read_err_fixable
-			?
bch_err_throw(c, fsck_fix) - : ret; - - bool have_retry = false; - int ret2; - - if (ca) { - bch2_mark_btree_validate_failure(failed, ca->dev_idx); - - struct extent_ptr_decoded pick; - have_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - failed, &pick, -1) == 1; - } - - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = bch_err_throw(c, btree_node_read_err_fixable); - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = bch_err_throw(c, btree_node_read_err_bad_node); - - bch2_sb_error_count(c, err_type); - - bool print_deferred = err_msg && - rw == READ && - !(test_bit(BCH_FS_in_fsck, &c->flags) && - c->opts.fix_errors == FSCK_FIX_ask); - - struct printbuf out = PRINTBUF; - bch2_log_msg_start(c, &out); - - if (!print_deferred) - err_msg = &out; - - btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); - - va_list args; - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); - - if (print_deferred) { - prt_newline(err_msg); - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } - - goto out; - } - - if (rw == WRITE) { - prt_str(&out, ", "); - ret = __bch2_inconsistent_error(c, &out) - ? -BCH_ERR_fsck_errors_not_fixed - : 0; - goto print; - } - - switch (ret) { - case -BCH_ERR_btree_node_read_err_fixable: - ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); - if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { - ret = ret2; - goto fsck_err; - } - - if (!have_retry) - ret = bch_err_throw(c, fsck_fix); - goto out; - case -BCH_ERR_btree_node_read_err_bad_node: - prt_str(&out, ", "); - break; - } -print: - bch2_print_str(c, KERN_ERR, out.buf); -out: -fsck_err: - printbuf_exit(&out); - return ret; -} - -#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ -({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, \ - BCH_FSCK_ERR_##_err_type, \ - failed, err_msg, \ - msg, ##__VA_ARGS__); \ - \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - true; \ -}) - -#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) - -/* - * When btree topology repair changes the start or end of a node, that might - * mean we have to drop keys that are no longer inside the node: - */ -__cold -void bch2_btree_node_drop_keys_outside_node(struct btree *b) -{ - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) - break; - - if (k != i->start) { - unsigned shift = (u64 *) k - (u64 *) i->start; - - memmove_u64s_down(i->start, k, - (u64 *) vstruct_end(i) - (u64 *) k); - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); - set_btree_bset_end(b, t); - } - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) - if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) - break; - - if (k != vstruct_last(i)) { - i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); - set_btree_bset_end(b, t); - } - } - - /* - * Always rebuild search trees: eytzinger search tree nodes directly - * depend on the values of min/max key: - */ - bch2_bset_set_no_aux_tree(b, b->set); - bch2_btree_build_aux_trees(b); - b->nr = bch2_btree_node_count_keys(b); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - } -} - -static int validate_bset(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, - unsigned offset, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - btree_err_on(!bch2_version_compatible(version), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "unsupported bset version %u.%u", - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version)); - - if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && - btree_err_on(version < c->sb.version_min, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_older_than_sb_min, - "bset version %u older than superblock version_min %u", - version, c->sb.version_min)) { - if (bch2_version_compatible(version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } else { - /* We have no idea what's going on: */ - i->version = cpu_to_le16(c->sb.version); - } - } - - if (btree_err_on(BCH_VERSION_MAJOR(version) > - BCH_VERSION_MAJOR(c->sb.version), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, NULL, - btree_node_bset_newer_than_sb, - "bset version %u newer than superblock version %u", - version, c->sb.version)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, NULL, - btree_node_unsupported_version, - "BSET_SEPARATE_WHITEOUTS no longer supported"); - - btree_err_on(offset && !i->u64s, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_empty, - "empty bset"); - - btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_wrong_sector_offset, - "bset at wrong sector offset"); - - if (!offset) { - struct btree_node *bn = - 
container_of(i, struct btree_node, keys); - /* These indicate that we read the wrong btree node: */ - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - /* XXX endianness */ - btree_err_on(bp->seq != bn->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - bset_bad_seq, - "incorrect sequence number (wrong btree node)"); - } - - btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_btree, - "incorrect btree id"); - - btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_level, - "incorrect level"); - - if (!write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - if (BTREE_PTR_RANGE_UPDATED(bp)) { - b->data->min_key = bp->min_key; - b->data->max_key = b->key.k.p; - } - - btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_min_key, - "incorrect min_key: got %s should be %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), - (printbuf_reset(&buf2), - bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); - } - - btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, NULL, - btree_node_bad_max_key, - "incorrect max key %s", - (printbuf_reset(&buf1), - bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); - - if (write) - compat_btree_node(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, bn); - - btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), - -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, NULL, - btree_node_bad_format, - "invalid bkey format: %s\n%s", buf1.buf, - (printbuf_reset(&buf2), - bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); - printbuf_reset(&buf1); - - compat_bformat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &bn->format); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags - }); -} - -static int bset_key_validate(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = flags, - }; - return __bch2_bkey_validate(c, k, from) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: - (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); -} - -static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, - struct bset *i, struct bkey_packed *k) -{ - if (bkey_p_next(k) > vstruct_last(i)) - return false; - - if (k->format > KEY_FORMAT_CURRENT) - return false; - - if (!bkeyp_u64s_valid(&b->format, k)) - return false; - - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_silent - }); -} - -static inline int btree_node_read_bkey_cmp(const struct btree *b, - const struct bkey_packed *l, - const struct bkey_packed *r) -{ - return bch2_bkey_cmp_packed(b, l, r) - ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); -} - -static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - unsigned version = le16_to_cpu(i->version); - struct bkey_packed *k, *prev = NULL; - struct printbuf buf = PRINTBUF; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - int ret = 0; - - for (k = i->start; - k != vstruct_last(i);) { - struct bkey_s u; - struct bkey tmp; - unsigned next_good_key; - - if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_past_bset_end, - "key extends past end of bset")) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - - if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) - goto drop_this_key; - - if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_bad_u64s, - "bad k->u64s %u (min %u max %zu)", k->u64s, - bkeyp_key_u64s(&b->format, k), - U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) - goto drop_this_key; - - if (!write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - u = __bkey_disassemble(b, k, &tmp); - - ret = bset_key_validate(c, b, u.s_c, updated_range, write); - if (ret == -BCH_ERR_fsck_delete_bkey) - goto drop_this_key; - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(b->c.level, b->c.btree_id, version, - BSET_BIG_ENDIAN(i), write, - &b->format, k); - - if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { - struct bkey up = bkey_unpack_key(b, prev); - - printbuf_reset(&buf); - prt_printf(&buf, "keys out of order: "); - bch2_bkey_to_text(&buf, &up); - prt_printf(&buf, " > "); - bch2_bkey_to_text(&buf, u.k); - - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bkey_out_of_order, - "%s", buf.buf)) - goto drop_this_key; - } - - prev = k; - k = bkey_p_next(k); - continue; -drop_this_key: - next_good_key = k->u64s; - - if (!next_good_key || - (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && - version >= bcachefs_metadata_version_snapshot)) { - /* - * only do scanning if bch2_bkey_compat() has nothing to - * do - */ - - if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { - for (next_good_key = 1; - next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; - next_good_key++) - if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) - goto got_good_key; - } - - /* - * didn't 
find a good key, have to truncate the rest of - * the bset - */ - next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; - } -got_good_key: - le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_error(b); - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, - struct bch_io_failures *failed, - struct printbuf *err_msg) -{ - struct btree_node_entry *bne; - struct sort_iter *iter; - struct btree_node *sorted; - struct bkey_packed *k; - struct bset *i; - bool used_mempool, blacklisted; - bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && - BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - u64 max_journal_seq = 0; - struct printbuf buf = PRINTBUF; - int ret = 0, write = READ; - u64 start_time = local_clock(); - - b->version_ondisk = U16_MAX; - /* We might get called multiple times on read retry: */ - b->written = 0; - - iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); - - if (bch2_meta_read_fault("btree")) - btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_fault_injected, - "dynamic fault"); - - btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_magic, - "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(b->data->magic)); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *bp = - &bkey_i_to_btree_ptr_v2(&b->key)->v; - - bch2_bpos_to_text(&buf, b->data->min_key); - prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, b->data->max_key); - - btree_err_on(b->data->keys.seq != bp->seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "got wrong btree node: got\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } else { - btree_err_on(!b->data->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, NULL, - btree_node_bad_seq, - "bad btree header: seq 0\n%s", - (printbuf_reset(&buf), - bch2_btree_node_header_to_text(&buf, b->data), - buf.buf)); - } - - while (b->written < (ptr_written ?: btree_sectors(c))) { - unsigned sectors; - bool first = !b->written; - - if (first) { - bne = NULL; - i = &b->data->keys; - } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, b->written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - btree_err_on(!good_csum_type, - bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) - ? 
-BCH_ERR_btree_node_read_err_must_retry - : -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - if (first) { - sectors = vstruct_sectors(b->data, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - bool csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - - btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, NULL, - btree_node_unsupported_version, - "btree node does not have NEW_EXTENT_OVERWRITE set"); - } else { - sectors = vstruct_sectors(bne, c->block_bits); - if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - b->written, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - bool csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; - } - } - - b->version_ondisk = min(b->version_ondisk, - le16_to_cpu(i->version)); - - ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); - if (ret) - goto fsck_err; - - if (!b->written) - btree_node_set_format(b, b->data->format); - - ret = validate_bset_keys(c, b, i, READ, failed, err_msg); - if (ret) - goto fsck_err; - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - blacklisted = bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(i->journal_seq), - true); - - btree_err_on(blacklisted && first, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_blacklisted_journal_seq, - "first btree node bset has blacklisted journal seq (%llu)", - le64_to_cpu(i->journal_seq)); - - btree_err_on(blacklisted && ptr_written, - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - first_bset_blacklisted_journal_seq, - "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", - le64_to_cpu(i->journal_seq), - b->written, b->written + sectors, ptr_written); - - b->written = min(b->written + sectors, btree_sectors(c)); - - if (blacklisted && !first) - continue; - - sort_iter_add(iter, - vstruct_idx(i, 0), - vstruct_last(i)); 
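The loop above walks each on-disk bset in turn, validates it, and feeds the valid ones into a sort iterator; the iterator is later drained into a single sorted key set, with the newest bset winning on overlap and the highest journal sequence number retained. A minimal standalone sketch of that merge idea, with invented names and fabricated data, not the kernel code:

#include <stdio.h>
#include <stdint.h>

struct mini_bset {
	const uint64_t	*keys;		/* sorted keys of one bset */
	unsigned	nr;
	unsigned	pos;
	uint64_t	journal_seq;
};

/* Pick the iterator with the smallest current key; on a tie, prefer the
 * bset with the higher index (written later), so "newer wins": */
static int pick_min(struct mini_bset *b, unsigned nr)
{
	int best = -1;

	for (unsigned i = 0; i < nr; i++) {
		if (b[i].pos >= b[i].nr)
			continue;
		if (best < 0 || b[i].keys[b[i].pos] <= b[best].keys[b[best].pos])
			best = i;
	}
	return best;
}

int main(void)
{
	const uint64_t b0[] = { 1, 4, 9 };	/* oldest bset */
	const uint64_t b1[] = { 4, 7 };		/* newer: its key 4 wins */
	struct mini_bset bsets[] = {
		{ b0, 3, 0, 100 },
		{ b1, 2, 0, 104 },
	};
	uint64_t max_journal_seq = 0, prev = UINT64_MAX;

	for (unsigned i = 0; i < 2; i++)
		if (bsets[i].journal_seq > max_journal_seq)
			max_journal_seq = bsets[i].journal_seq;

	int i;
	while ((i = pick_min(bsets, 2)) >= 0) {
		uint64_t k = bsets[i].keys[bsets[i].pos++];

		if (k != prev)	/* older duplicate of a newer key: drop it */
			printf("key %llu from bset %d\n",
			       (unsigned long long) k, i);
		prev = k;
	}
	printf("max journal_seq %llu\n", (unsigned long long) max_journal_seq);
	return 0;
}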
- - max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq)); - } - - if (ptr_written) { - btree_err_on(b->written < ptr_written, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, NULL, - btree_node_data_missing, - "btree node data missing: expected %u sectors, found %u", - ptr_written, b->written); - } else { - for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_buf_bytes(b); - bne = (void *) bne + block_bytes(c)) - btree_err_on(bne->keys.seq == b->data->keys.seq && - !bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), - true), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, NULL, - btree_node_bset_after_end, - "found bset signature after last bset"); - } - - sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); - sorted->keys.u64s = 0; - - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); - memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, - btree_buf_bytes(b) - - sizeof(struct btree_node) - - b->nr.live_u64s * sizeof(u64)); - - b->data->keys.u64s = sorted->keys.u64s; - *sorted = *b->data; - swap(sorted, b->data); - set_btree_bset(b, b->set, &b->data->keys); - b->nsets = 1; - b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - - BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); - - btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - - i = &b->data->keys; - for (k = i->start; k != vstruct_last(i);) { - struct bkey tmp; - struct bkey_s u = __bkey_disassemble(b, k, &tmp); - - ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); - if (ret == -BCH_ERR_fsck_delete_bkey || - (static_branch_unlikely(&bch2_inject_invalid_keys) && - !bversion_cmp(u.k->bversion, MAX_VERSION))) { - btree_keys_account_key_drop(&b->nr, 0, k); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - set_btree_bset_end(b, b->set); - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_error(b); - continue; - } - if (ret) - goto fsck_err; - - if (u.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); - - bp.v->mem_ptr = 0; - } - - k = bkey_p_next(k); - } - - bch2_bset_build_aux_tree(b, b->set, false); - - set_needs_whiteout(btree_bset_first(b), true); - - btree_node_reset_sib_u64s(b); - - if (updated_range) - bch2_btree_node_drop_keys_outside_node(b); - - /* - * XXX: - * - * We deadlock if too many btree updates require node rewrites while - * we're still in journal replay. - * - * This is because btree node rewrites generate more updates for the - * interior updates (alloc, backpointers), and if those updates touch - * new nodes and generate more rewrites - well, you see the problem. - * - * The biggest cause is that we don't use the btree write buffer (for - * the backpointer updates - this needs some real thought on locking in - * order to fix. - * - * The problem with this workaround (not doing the rewrite for degraded - * nodes in journal replay) is that those degraded nodes persist, and we - * don't want that (this is a real bug when a btree node write completes - * with fewer replicas than we wanted and leaves a degraded node due to - * device _removal_, i.e. the device went away mid write). 
- * It's less of a bug here, but still a problem because we don't yet
- * have a way of tracking degraded data - we need another index (all
- * extents/btree nodes, by replicas entry) in order to fix properly
- * (re-replicate degraded data at the earliest possible time).
- */
-	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
-		scoped_guard(rcu)
-			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-					set_btree_node_need_rewrite(b);
-					set_btree_node_need_rewrite_degraded(b);
-				}
-			}
-	}
-
-	if (!ptr_written) {
-		set_btree_node_need_rewrite(b);
-		set_btree_node_need_rewrite_ptr_written_zero(b);
-	}
-fsck_err:
-	mempool_free(iter, &c->fill_iter);
-	printbuf_exit(&buf);
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
-	return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
-	struct btree_read_bio *rb =
-		container_of(work, struct btree_read_bio, work);
-	struct bch_fs *c = rb->c;
-	struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-	struct btree *b = rb->b;
-	struct bio *bio = &rb->bio;
-	struct bch_io_failures failed = { .nr = 0 };
-	int ret = 0;
-
-	struct printbuf buf = PRINTBUF;
-	bch2_log_msg_start(c, &buf);
-
-	prt_printf(&buf, "btree node read error at btree ");
-	bch2_btree_pos_to_text(&buf, c, b);
-	prt_newline(&buf);
-
-	goto start;
-	while (1) {
-		ret = bch2_bkey_pick_read_device(c,
-					bkey_i_to_s_c(&b->key),
-					&failed, &rb->pick, -1);
-		if (ret <= 0) {
-			set_btree_node_read_error(b);
-			break;
-		}
-
-		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref = ca != NULL;
-		rb->start_time = local_clock();
-		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
-		bio->bi_iter.bi_sector = rb->pick.ptr.offset;
-		bio->bi_iter.bi_size = btree_buf_bytes(b);
-
-		if (rb->have_ioref) {
-			bio_set_dev(bio, ca->disk_sb.bdev);
-			submit_bio_wait(bio);
-		} else {
-			bio->bi_status = BLK_STS_REMOVED;
-		}
-
-		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
-					   rb->start_time, !bio->bi_status);
-start:
-		if (rb->have_ioref)
-			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
-		rb->have_ioref = false;
-
-		if (bio->bi_status) {
-			bch2_mark_io_failure(&failed, &rb->pick, false);
-			continue;
-		}
-
-		ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
-		if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-		    ret == -BCH_ERR_btree_node_read_err_must_retry)
-			continue;
-
-		if (ret)
-			set_btree_node_read_error(b);
-
-		break;
-	}
-
-	bch2_io_failures_to_text(&buf, c, &failed);
-
-	if (btree_node_read_error(b))
-		bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
-	/*
-	 * only print retry success if we read from a replica with no errors
-	 */
-	if (btree_node_read_error(b))
-		prt_printf(&buf, "ret %s", bch2_err_str(ret));
-	else if (failed.nr) {
-		if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
-			prt_printf(&buf, "retry success");
-		else
-			prt_printf(&buf, "repair success");
-	}
-
-	if ((failed.nr ||
-	     btree_node_need_rewrite(b)) &&
-	    !btree_node_read_error(b) &&
-	    c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
-		prt_printf(&buf, " (rewriting node)");
-		bch2_btree_node_rewrite_async(c, b);
-	}
-	prt_newline(&buf);
-
-	if (failed.nr)
-		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
-	async_object_list_del(c, btree_read_bio, rb->list_idx);
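The read-work function above is a retry loop over replicas: pick a readable device, issue the read, and on a failure record the device in the failure set so the next pick skips it, until a read succeeds or no devices remain. A standalone sketch of that control flow, with invented names and a made-up failure pattern, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

#define NR_REPLICAS 3

static bool read_replica(int idx)
{
	/* pretend replicas 0 and 1 return bad data: */
	return idx == 2;
}

static int pick_replica(const bool *failed)
{
	for (int i = 0; i < NR_REPLICAS; i++)
		if (!failed[i])
			return i;
	return -1;		/* no device to read from */
}

int main(void)
{
	bool failed[NR_REPLICAS] = { false };
	int idx, ret = -1;

	while ((idx = pick_replica(failed)) >= 0) {
		if (read_replica(idx)) {
			printf("read succeeded from replica %d\n", idx);
			ret = 0;
			break;
		}
		printf("replica %d failed, retrying\n", idx);
		failed[idx] = true;	/* like marking an io failure */
	}

	if (ret)
		printf("all replicas failed\n");
	return ret ? 1 : 0;
}

Tracking failures in a separate set, rather than mutating the key's pointer list, keeps the retry loop read-only with respect to the on-disk metadata while still guaranteeing termination.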
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], - rb->start_time); - bio_put(&rb->bio); - printbuf_exit(&buf); - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -} - -static void btree_node_read_endio(struct bio *bio) -{ - struct btree_read_bio *rb = - container_of(bio, struct btree_read_bio, bio); - struct bch_fs *c = rb->c; - struct bch_dev *ca = rb->have_ioref - ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - rb->start_time, !bio->bi_status); - - queue_work(c->btree_read_complete_wq, &rb->work); -} - -void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) -{ - bch2_bio_to_text(out, &rbio->bio); -} - -struct btree_node_read_all { - struct closure cl; - struct bch_fs *c; - struct btree *b; - unsigned nr; - void *buf[BCH_REPLICAS_MAX]; - struct bio *bio[BCH_REPLICAS_MAX]; - blk_status_t err[BCH_REPLICAS_MAX]; -}; - -static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) -{ - struct btree_node *bn = data; - struct btree_node_entry *bne; - unsigned offset = 0; - - if (le64_to_cpu(bn->magic) != bset_magic(c)) - return 0; - - while (offset < btree_sectors(c)) { - if (!offset) { - offset += vstruct_sectors(bn, c->block_bits); - } else { - bne = data + (offset << 9); - if (bne->keys.seq != bn->keys.seq) - break; - offset += vstruct_sectors(bne, c->block_bits); - } - } - - return offset; -} - -static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) -{ - struct btree_node *bn = data; - struct btree_node_entry *bne; - - if (!offset) - return false; - - while (offset < btree_sectors(c)) { - bne = data + (offset << 9); - if (bne->keys.seq == bn->keys.seq) - return true; - offset++; - } - - return false; - return offset; -} - -static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) -{ - closure_type(ra, struct btree_node_read_all, cl); - struct bch_fs *c = ra->c; - struct btree *b = ra->b; - struct printbuf buf = PRINTBUF; - bool dump_bset_maps = false; - int ret = 0, best = -1, write = READ; - unsigned i, written = 0, written2 = 0; - __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; - bool _saw_error = false, *saw_error = &_saw_error; - struct printbuf *err_msg = NULL; - struct bch_io_failures *failed = NULL; - - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - - if (ra->err[i]) - continue; - - if (le64_to_cpu(bn->magic) != bset_magic(c) || - (seq && seq != bn->keys.seq)) - continue; - - if (best < 0) { - best = i; - written = btree_node_sectors_written(c, bn); - continue; - } - - written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_sectors_written_mismatch, - "btree node sectors written mismatch: %u != %u", - written, written2) || - btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_bset_after_end, - "found bset signature after last bset") || - btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, NULL, - btree_node_replicas_data_mismatch, - "btree node replicas content mismatch")) - dump_bset_maps = true; - - if (written2 > written) { - written = written2; - best = i; - } - } -fsck_err: - if (dump_bset_maps) { - for (i = 0; i < ra->nr; i++) { - struct btree_node *bn = ra->buf[i]; - struct btree_node_entry *bne = NULL; - unsigned offset = 0, sectors; - bool gap = false; - - if (ra->err[i]) - continue; - - printbuf_reset(&buf); - - while (offset < btree_sectors(c)) { - if (!offset) { - sectors = vstruct_sectors(bn, c->block_bits); - } else { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq != bn->keys.seq) - break; - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bne && bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - offset += sectors; - } - - while (offset < btree_sectors(c)) { - bne = ra->buf[i] + (offset << 9); - if (bne->keys.seq == bn->keys.seq) { - if (!gap) - prt_printf(&buf, " GAP"); - gap = true; - - sectors = vstruct_sectors(bne, c->block_bits); - prt_printf(&buf, " %u-%u", offset, offset + sectors); - if (bch2_journal_seq_is_blacklisted(c, - le64_to_cpu(bne->keys.journal_seq), false)) - prt_printf(&buf, "*"); - } - offset++; - } - - bch_err(c, "replica %u:%s", i, buf.buf); - } - } - - if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); - } else { - ret = -1; - } - - if (ret) { - set_btree_node_read_error(b); - - struct printbuf buf = PRINTBUF; - bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (buf.pos) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } else if (*saw_error) - bch2_btree_node_rewrite_async(c, b); - - for (i = 0; i < ra->nr; i++) { - mempool_free(ra->buf[i], &c->btree_bounce_pool); - bio_put(ra->bio[i]); - } - - closure_debug_destroy(&ra->cl); - kfree(ra); - printbuf_exit(&buf); - - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -} - -static void btree_node_read_all_replicas_endio(struct bio *bio) -{ - struct btree_read_bio *rb = - container_of(bio, struct btree_read_bio, bio); - struct bch_fs *c = rb->c; - struct btree_node_read_all *ra = rb->ra; - - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - 
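The all-replicas completion path above compares every successfully read copy, flags mismatches in how much of the node each copy contains, and keeps the copy with the most sectors written. A standalone sketch of that selection logic, with invented names and fabricated per-replica data, not the kernel code:

#include <stdio.h>

struct replica {
	int	 err;			/* nonzero: read failed */
	unsigned sectors_written;
};

int main(void)
{
	struct replica r[] = {
		{ .err = 0, .sectors_written = 24 },
		{ .err = 1, .sectors_written = 0 },	/* I/O error, skipped */
		{ .err = 0, .sectors_written = 32 },	/* most complete copy */
	};
	int best = -1;

	for (int i = 0; i < 3; i++) {
		if (r[i].err)
			continue;

		/* differing write counts mean the replicas diverged: */
		if (best >= 0 && r[i].sectors_written != r[best].sectors_written)
			printf("sectors written mismatch: %u != %u\n",
			       r[best].sectors_written, r[i].sectors_written);

		if (best < 0 || r[i].sectors_written > r[best].sectors_written)
			best = i;
	}

	if (best < 0) {
		printf("no good replica\n");
		return 1;
	}
	printf("using replica %d\n", best);
	return 0;
}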
enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_read_all_replicas); - } - - ra->err[rb->idx] = bio->bi_status; - closure_put(&ra->cl); -} - -/* - * XXX This allocates multiple times from the same mempools, and can deadlock - * under sufficient memory pressure (but is only a debug path) - */ -static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) -{ - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded pick; - struct btree_node_read_all *ra; - unsigned i; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); - - closure_init(&ra->cl, NULL); - ra->c = c; - ra->b = b; - ra->nr = bch2_bkey_nr_ptrs(k); - - for (i = 0; i < ra->nr; i++) { - ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); - ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - } - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_read_all_replicas); - struct btree_read_bio *rb = - container_of(ra->bio[i], struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = ra; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->idx = i; - rb->pick = pick; - rb->bio.bi_iter.bi_sector = pick.ptr.offset; - rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(&rb->bio)); - bio_set_dev(&rb->bio, ca->disk_sb.bdev); - - closure_get(&ra->cl); - submit_bio(&rb->bio); - } else { - ra->err[i] = BLK_STS_REMOVED; - } - - i++; - } - - if (sync) { - closure_sync(&ra->cl); - btree_node_read_all_replicas_done(&ra->cl.work); - } else { - continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->btree_read_complete_wq); - } - - return 0; -} - -void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, - bool sync) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct btree_read_bio *rb; - struct bch_dev *ca; - struct bio *bio; - int ret; - - trace_and_count(c, btree_node_read, trans, b); - - if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && - !btree_node_read_all_replicas(c, b, sync)) - return; - - ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick, -1); - - if (ret <= 0) { - bool ratelimit = true; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "btree node read error: no device to read from\n at "); - bch2_btree_pos_to_text(&buf, c, b); - prt_newline(&buf); - bch2_btree_lost_data(c, &buf, b->c.btree_id); - - if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - bch2_fs_emergency_read_only2(c, &buf)) - ratelimit = false; - - static DEFINE_RATELIMIT_STATE(rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - if (!ratelimit || __ratelimit(&rs)) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - set_btree_node_read_error(b); - clear_btree_node_read_in_flight(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); - - bio = bio_alloc_bioset(NULL, - buf_pages(b->data, 
btree_buf_bytes(b)), - REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOFS, - &c->btree_bio); - rb = container_of(bio, struct btree_read_bio, bio); - rb->c = c; - rb->b = b; - rb->ra = NULL; - rb->start_time = local_clock(); - rb->have_ioref = ca != NULL; - rb->pick = pick; - INIT_WORK(&rb->work, btree_node_read_work); - bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_buf_bytes(b)); - - async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); - - if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], - bio_sectors(bio)); - bio_set_dev(bio, ca->disk_sb.bdev); - - if (sync) { - submit_bio_wait(bio); - bch2_latency_acct(ca, rb->start_time, READ); - btree_node_read_work(&rb->work); - } else { - submit_bio(bio); - } - } else { - bio->bi_status = BLK_STS_REMOVED; - - if (sync) - btree_node_read_work(&rb->work); - else - queue_work(c->btree_read_complete_wq, &rb->work); - } -} - -static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - struct bch_fs *c = trans->c; - struct closure cl; - struct btree *b; - int ret; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - - b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(trans); - - BUG_ON(IS_ERR(b)); - - bkey_copy(&b->key, k); - BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); - - set_btree_node_read_in_flight(b); - - /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ - bch2_trans_unlock(trans); - bch2_btree_node_read(trans, b, true); - - if (btree_node_read_error(b)) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - ret = bch_err_throw(c, btree_node_read_error); - goto err; - } - - bch2_btree_set_root_for_read(c, b); -err: - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return ret; -} - -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) -{ - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); -} - -struct btree_node_scrub { - struct bch_fs *c; - struct bch_dev *ca; - void *buf; - bool used_mempool; - unsigned written; - - enum btree_id btree; - unsigned level; - struct bkey_buf key; - __le64 seq; - - struct work_struct work; - struct bio bio; -}; - -static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, - struct printbuf *err) -{ - unsigned written = 0; - - if (le64_to_cpu(data->magic) != bset_magic(c)) { - prt_printf(err, "bad magic: want %llx, got %llx", - bset_magic(c), le64_to_cpu(data->magic)); - return false; - } - - while (written < (ptr_written ?: btree_sectors(c))) { - struct btree_node_entry *bne; - struct bset *i; - bool first = !written; - - if (first) { - bne = NULL; - i = &data->keys; - } else { - bne = (void *) data + (written << 9); - i = &bne->keys; - - if (!ptr_written && i->seq != data->keys.seq) - break; - } - - struct nonce nonce = btree_nonce(i, written << 9); - bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - - if (first) { - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); - if (bch2_crc_cmp(data->csum, csum)) { - bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); - return false; - } - } - - written += 
vstruct_sectors(data, c->block_bits); - } else { - if (good_csum_type) { - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - if (bch2_crc_cmp(bne->csum, csum)) { - bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); - return false; - } - } - - written += vstruct_sectors(bne, c->block_bits); - } - } - - return true; -} - -static void btree_node_scrub_work(struct work_struct *work) -{ - struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); - struct bch_fs *c = scrub->c; - struct printbuf err = PRINTBUF; - - __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, - bkey_i_to_s_c(scrub->key.k)); - prt_newline(&err); - - if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { - int ret = bch2_trans_do(c, - bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1, - scrub->key.k, 0)); - if (!bch2_err_matches(ret, ENOENT) && - !bch2_err_matches(ret, EROFS)) - bch_err_fn_ratelimited(c, ret); - } - - printbuf_exit(&err); - bch2_bkey_buf_exit(&scrub->key, c); - btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); - kfree(scrub); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); -} - -static void btree_node_scrub_endio(struct bio *bio) -{ - struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); - - queue_work(scrub->c->btree_read_complete_wq, &scrub->work); -} - -int bch2_btree_node_scrub(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c k, unsigned dev) -{ - if (k.k->type != KEY_TYPE_btree_ptr_v2) - return 0; - - struct bch_fs *c = trans->c; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) - return bch_err_throw(c, erofs_no_writes); - - struct extent_ptr_decoded pick; - int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); - if (ret <= 0) - goto err; - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_scrub); - if (!ca) { - ret = bch_err_throw(c, device_offline); - goto err; - } - - bool used_mempool = false; - void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); - - unsigned vecs = buf_pages(buf, c->opts.btree_node_size); - - struct btree_node_scrub *scrub = - kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); - if (!scrub) { - ret = -ENOMEM; - goto err_free; - } - - scrub->c = c; - scrub->ca = ca; - scrub->buf = buf; - scrub->used_mempool = used_mempool; - scrub->written = btree_ptr_sectors_written(k); - - scrub->btree = btree; - scrub->level = level; - bch2_bkey_buf_init(&scrub->key); - bch2_bkey_buf_reassemble(&scrub->key, c, k); - scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; - - INIT_WORK(&scrub->work, btree_node_scrub_work); - - bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); - bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); - scrub->bio.bi_iter.bi_sector = pick.ptr.offset; - scrub->bio.bi_end_io = btree_node_scrub_endio; - submit_bio(&scrub->bio); - return 0; -err_free: - btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); -err: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); - return ret; -} - -static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) -{ - unsigned long old, new; - - old =
READ_ONCE(b->will_make_reachable); - do { - new = old; - if (!(old & 1)) - break; - - new &= ~1UL; - } while (!try_cmpxchg(&b->will_make_reachable, &old, new)); - - if (old & 1) - closure_put(&((struct btree_update *) new)->cl); - - bch2_journal_pin_drop(&c->journal, &w->journal); -} - -static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_write *w = btree_prev_write(b); - unsigned long old, new; - unsigned type = 0; - - bch2_btree_complete_write(c, b, w); - - if (start_time) - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); - - old = READ_ONCE(b->flags); - do { - new = old; - - if ((old & (1U << BTREE_NODE_dirty)) && - (old & (1U << BTREE_NODE_need_write)) && - !(old & (1U << BTREE_NODE_never_write)) && - !(old & (1U << BTREE_NODE_write_blocked)) && - !(old & (1U << BTREE_NODE_will_make_reachable))) { - new &= ~(1U << BTREE_NODE_dirty); - new &= ~(1U << BTREE_NODE_need_write); - new |= (1U << BTREE_NODE_write_in_flight); - new |= (1U << BTREE_NODE_write_in_flight_inner); - new |= (1U << BTREE_NODE_just_written); - new ^= (1U << BTREE_NODE_write_idx); - - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - } else { - new &= ~(1U << BTREE_NODE_write_in_flight); - new &= ~(1U << BTREE_NODE_write_in_flight_inner); - } - } while (!try_cmpxchg(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else { - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); - } -} - -static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) -{ - struct btree_trans *trans = bch2_trans_get(c); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - /* we don't need transaction context anymore after we got the lock. */ - bch2_trans_put(trans); - __btree_node_write_done(c, b, start_time); - six_unlock_read(&b->c.lock); -} - -static void btree_node_write_work(struct work_struct *work) -{ - struct btree_write_bio *wbio = - container_of(work, struct btree_write_bio, work); - struct bch_fs *c = wbio->wbio.c; - struct btree *b = wbio->wbio.bio.bi_private; - u64 start_time = wbio->start_time; - int ret = 0; - - btree_bounce_free(c, - wbio->data_bytes, - wbio->wbio.used_mempool, - wbio->data); - - bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, - bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = bch_err_throw(c, btree_node_write_all_failed); - goto err; - } - - if (wbio->wbio.first_btree_write) { - if (wbio->wbio.failed.nr) { - - } - } else { - ret = bch2_trans_do(c, - bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw, - !wbio->wbio.failed.nr)); - if (ret) - goto err; - } -out: - async_object_list_del(c, btree_write_bio, wbio->list_idx); - bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b, start_time); - return; -err: - set_btree_node_noevict(b); - - if (!bch2_err_matches(ret, EROFS)) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); - bch2_btree_pos_to_text(&buf, c, b); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - } - goto out; -} - -static void btree_node_write_endio(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_write_bio *orig = parent ?: wbio; - struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); - struct bch_fs *c = wbio->c; - struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (ca && bio->bi_status) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "btree write error: %s\n ", - bch2_blk_status_to_str(bio->bi_status)); - bch2_btree_pos_to_text(&buf, c, b); - bch_err_dev_ratelimited(ca, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bio->bi_status) { - unsigned long flags; - spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, wbio->dev); - spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - } - - /* - * XXX: we should be using io_ref[WRITE], but we aren't retrying failed - * btree writes yet (due to device removal/ro): - */ - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_write); - - if (parent) { - bio_put(bio); - bio_endio(&parent->bio); - return; - } - - clear_btree_node_write_in_flight_inner(b); - smp_mb__after_atomic(); - wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); - INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_write_complete_wq, &wb->work); -} - -static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i) -{ - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = b->c.level + 1, - .btree = b->c.btree_id, - .flags = BCH_VALIDATE_write, - }); - if (ret) { - bch2_fs_inconsistent(c, "invalid btree node key before write"); - return ret; - } - - ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: - validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); - if (ret) { - bch2_inconsistent_error(c); - dump_stack(); - } - - return ret; -} - -static void btree_write_submit(struct work_struct *work) -{ - struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - - bkey_copy(&tmp.k, &wbio->key); - - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) - ptr->offset += wbio->sector_offset; - - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, - &tmp.k, false); -} - -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) -{ - struct btree_write_bio *wbio; - struct bset *i; - struct btree_node *bn = NULL; - struct btree_node_entry *bne = NULL; - struct sort_iter_stack sort_iter; - struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, bytes, u64s; - u64 seq = 0; - bool used_mempool; - unsigned long old, new; - bool validate_before_checksum = false; - enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; - void *data; - u64 start_time = local_clock(); - int ret; - - if (flags & BTREE_WRITE_ALREADY_STARTED) - goto do_write; - - /* - * We may only have a read lock on the btree node - the dirty bit is our - * "lock" against racing with other threads that may be trying to start - * a write, we do a write iff we clear the dirty bit. 
Since setting the - * dirty bit requires a write lock, we can't race with other threads - * redirtying it: - */ - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - if ((flags & BTREE_WRITE_ONLY_IF_NEED) && - !(old & (1 << BTREE_NODE_need_write))) - return; - - if (old & - ((1 << BTREE_NODE_never_write)| - (1 << BTREE_NODE_write_blocked))) - return; - - if (b->written && - (old & (1 << BTREE_NODE_will_make_reachable))) - return; - - if (old & (1 << BTREE_NODE_write_in_flight)) - return; - - if (flags & BTREE_WRITE_ONLY_IF_NEED) - type = new & BTREE_WRITE_TYPE_MASK; - new &= ~BTREE_WRITE_TYPE_MASK; - - new &= ~(1 << BTREE_NODE_dirty); - new &= ~(1 << BTREE_NODE_need_write); - new |= (1 << BTREE_NODE_write_in_flight); - new |= (1 << BTREE_NODE_write_in_flight_inner); - new |= (1 << BTREE_NODE_just_written); - new ^= (1 << BTREE_NODE_write_idx); - } while (!try_cmpxchg_acquire(&b->flags, &old, new)); - - if (new & (1U << BTREE_NODE_need_write)) - return; -do_write: - BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - - atomic_long_dec(&c->btree_cache.nr_dirty); - - BUG_ON(btree_node_fake(b)); - BUG_ON((b->will_make_reachable != 0) != !b->written); - - BUG_ON(b->written >= btree_sectors(c)); - BUG_ON(b->written & (block_sectors(c) - 1)); - BUG_ON(bset_written(b, btree_bset_last(b))); - BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); - BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - - bch2_sort_whiteouts(c, b); - - sort_iter_stack_init(&sort_iter, b); - - bytes = !b->written - ? sizeof(struct btree_node) - : sizeof(struct btree_node_entry); - - bytes += b->whiteout_u64s * sizeof(u64); - - for_each_bset(b, t) { - i = bset(b, t); - - if (bset_written(b, i)) - continue; - - bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter.iter, - btree_bkey_first(b, t), - btree_bkey_last(b, t)); - seq = max(seq, le64_to_cpu(i->journal_seq)); - } - - BUG_ON(b->written && !seq); - - /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ - bytes += 8; - - /* buffer must be a multiple of the block size */ - bytes = round_up(bytes, block_bytes(c)); - - data = btree_bounce_alloc(c, bytes, &used_mempool); - - if (!b->written) { - bn = data; - *bn = *b->data; - i = &bn->keys; - } else { - bne = data; - bne->keys = b->data->keys; - i = &bne->keys; - } - - i->journal_seq = cpu_to_le64(seq); - i->u64s = 0; - - sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(b), - unwritten_whiteouts_end(b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - - u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); - le16_add_cpu(&i->u64s, u64s); - - b->whiteout_u64s = 0; - - BUG_ON(!b->written && i->u64s != b->data->keys.u64s); - - set_needs_whiteout(i, false); - - /* do we have data to write? 
*/ - if (b->written && !i->u64s) - goto nowrite; - - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; - - if (!b->written && - b->key.k.type == KEY_TYPE_btree_ptr_v2) - BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write); - - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); - - BUG_ON(b->written + sectors_to_write > btree_sectors(c)); - BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); - BUG_ON(i->seq != b->data->keys.seq); - - i->version = cpu_to_le16(c->sb.version); - SET_BSET_OFFSET(i, b->written); - SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) - validate_before_checksum = true; - - /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - /* if we're going to be encrypting, check metadata validity first: */ - if (validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "encrypting btree node: %s", bch2_err_str(ret))) - goto err; - - nonce = btree_nonce(i, b->written << 9); - - if (bn) - bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); - else - bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - /* if we're not encrypting, check metadata after checksumming: */ - if (!validate_before_checksum && - validate_bset_for_write(c, b, i)) - goto err; - - /* - * We handle btree write errors by immediately halting the journal - - * after we've done that, we can't issue any subsequent btree writes - * because they might have pointers to new nodes that failed to write. - * - * Furthermore, there's no point in doing any more btree writes because - * with the journal stopped, we're never going to update the journal to - * reflect that those writes were done and the data flushed from the - * journal: - * - * Also on journal error, the pending write may have updates that were - * never journalled (interior nodes, see btree_update_nodes_written()) - - * it's critical that we don't do the write in that case otherwise we - * will have updates visible that weren't in the journal: - * - * Make sure to update b->written so bch2_btree_init_next() doesn't - * break: - */ - if (bch2_journal_error(&c->journal) || - c->opts.nochanges) - goto err; - - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); - - wbio = container_of(bio_alloc_bioset(NULL, - buf_pages(data, sectors_to_write << 9), - REQ_OP_WRITE|REQ_META, - GFP_NOFS, - &c->btree_bio), - struct btree_write_bio, wbio.bio); - wbio_init(&wbio->wbio.bio); - wbio->data = data; - wbio->data_bytes = bytes; - wbio->sector_offset = b->written; - wbio->start_time = start_time; - wbio->wbio.c = c; - wbio->wbio.used_mempool = used_mempool; - wbio->wbio.first_btree_write = !b->written; - wbio->wbio.bio.bi_end_io = btree_node_write_endio; - wbio->wbio.bio.bi_private = b; - - bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); - - bkey_copy(&wbio->key, &b->key); - - b->written += sectors_to_write; - - if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = - cpu_to_le16(b->written); - - atomic64_inc(&c->btree_write_stats[type].nr); - atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); - - async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); - - INIT_WORK(&wbio->work, btree_write_submit); - 
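/*
 * Editor's sketch: the flag transitions at the top of __bch2_btree_node_write()
 * (and in __btree_node_write_done()) are compare-exchange loops - compute the
 * new flag word from a snapshot of the old one, and retry if another thread
 * raced. As the comment above puts it, the dirty bit is the "lock": whoever
 * clears it owns the write. A stand-alone version of that pattern, with
 * hypothetical flag bits rather than the real BTREE_NODE_* flags:
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NODE_DIRTY		(1UL << 0)
#define NODE_WRITE_IN_FLIGHT	(1UL << 1)

/* Returns true iff this caller cleared the dirty bit and so owns the write. */
static bool claim_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load_explicit(flags, memory_order_relaxed);
	unsigned long new;

	do {
		if (!(old & NODE_DIRTY) || (old & NODE_WRITE_IN_FLIGHT))
			return false;	/* clean, or a write was already started */

		new = (old & ~NODE_DIRTY) | NODE_WRITE_IN_FLIGHT;
	} while (!atomic_compare_exchange_weak_explicit(flags, &old, new,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

int main(void)
{
	_Atomic unsigned long flags;

	atomic_init(&flags, NODE_DIRTY);
	return claim_write(&flags) ? 0 : 1;	/* exactly one claimant can win */
}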
queue_work(c->btree_write_submit_wq, &wbio->work); - return; -err: - set_btree_node_noevict(b); - b->written += sectors_to_write; -nowrite: - btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b, 0); -} - -/* - * Work that must be done with write lock held: - */ -bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -{ - bool invalidated_iter = false; - struct btree_node_entry *bne; - - if (!btree_node_just_written(b)) - return false; - - BUG_ON(b->whiteout_u64s); - - clear_btree_node_just_written(b); - - /* - * Note: immediately after write, bset_written() doesn't work - the - * amount of data we had to write after compaction might have been - * smaller than the offset of the last bset. - * - * However, we know that all bsets have been written here, as long as - * we're still holding the write lock: - */ - - /* - * XXX: decide if we really want to unconditionally sort down to a - * single bset: - */ - if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets); - invalidated_iter = true; - } else { - invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); - } - - for_each_bset(b, t) - set_needs_whiteout(bset(b, t), true); - - bch2_btree_verify(c, b); - - /* - * If later we don't unconditionally sort down to a single bset, we have - * to ensure this is still true: - */ - BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); - - bne = want_new_bset(c, b); - if (bne) - bch2_bset_init_next(b, bne); - - bch2_btree_build_aux_trees(b); - - return invalidated_iter; -} - -/* - * Use this one if the node is intent locked: - */ -void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->c.lock); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_type_held, - unsigned flags) -{ - struct bch_fs *c = trans->c; - - if (lock_type_held == SIX_LOCK_intent || - (lock_type_held == SIX_LOCK_read && - six_lock_tryupgrade(&b->c.lock))) { - __bch2_btree_node_write(c, b, flags); - - /* don't cycle lock unnecessarily: */ - if (btree_node_just_written(b) && - six_trylock_write(&b->c.lock)) { - bch2_btree_post_write_cleanup(c, b); - __bch2_btree_node_unlock_write(trans, b); - } - - if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); - } else { - __bch2_btree_node_write(c, b, flags); - if (lock_type_held == SIX_LOCK_write && - btree_node_just_written(b)) - bch2_btree_post_write_cleanup(c, b); - } -} - -static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - bool ret = false; -restart: - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (test_bit(flag, &b->flags)) { - rcu_read_unlock(); - wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); - ret = true; - goto restart; - } - rcu_read_unlock(); - - return ret; -} - -bool 
bch2_btree_flush_all_reads(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -} - -bool bch2_btree_flush_all_writes(struct bch_fs *c) -{ - return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -} - -static const char * const bch2_btree_write_types[] = { -#define x(t, n) [n] = #t, - BCH_BTREE_WRITE_TYPES() - NULL -}; - -void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 20); - printbuf_tabstop_push(out, 10); - - prt_printf(out, "\tnr\tsize\n"); - - for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { - u64 nr = atomic64_read(&c->btree_write_stats[i].nr); - u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - - prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); - prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); - prt_newline(out); - } -} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h deleted file mode 100644 index 30a5180532c8d8..00000000000000 --- a/fs/bcachefs/btree_io.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_IO_H -#define _BCACHEFS_BTREE_IO_H - -#include "bkey_methods.h" -#include "bset.h" -#include "btree_locking.h" -#include "checksum.h" -#include "extents.h" -#include "io_write_types.h" - -struct bch_fs; -struct btree_write; -struct btree; -struct btree_iter; -struct btree_node_read_all; - -static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_inc(&c->btree_cache.nr_dirty); -} - -static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -{ - if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) - atomic_long_dec(&c->btree_cache.nr_dirty); -} - -static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_btree_ptr_v2 - ? 
le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) - : 0; -} - -struct btree_read_bio { - struct bch_fs *c; - struct btree *b; - struct btree_node_read_all *ra; - u64 start_time; - unsigned have_ioref:1; - unsigned idx:7; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct extent_ptr_decoded pick; - struct work_struct work; - struct bio bio; -}; - -struct btree_write_bio { - struct work_struct work; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - void *data; - unsigned data_bytes; - unsigned sector_offset; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - struct bch_write_bio wbio; -}; - -void bch2_btree_node_io_unlock(struct btree *); -void bch2_btree_node_io_lock(struct btree *); -void __bch2_btree_node_wait_on_read(struct btree *); -void __bch2_btree_node_wait_on_write(struct btree *); -void bch2_btree_node_wait_on_read(struct btree *); -void bch2_btree_node_wait_on_write(struct btree *); - -enum compact_mode { - COMPACT_LAZY, - COMPACT_ALL, -}; - -bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, - enum compact_mode); - -static inline bool should_compact_bset_lazy(struct btree *b, - struct bset_tree *t) -{ - unsigned total_u64s = bset_u64s(t); - unsigned dead_u64s = bset_dead_u64s(b, t); - - return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -} - -static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -{ - for_each_bset(b, t) - if (should_compact_bset_lazy(b, t)) - return bch2_compact_whiteouts(c, b, COMPACT_LAZY); - - return false; -} - -static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -{ - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; -} - -static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -{ - struct nonce nonce = btree_nonce(i, offset); - int ret; - - if (!offset) { - struct btree_node *bn = container_of(i, struct btree_node, keys); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &bn->flags, bytes); - if (ret) - return ret; - - nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); - } - - return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); -} - -void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); - -void bch2_btree_node_drop_keys_outside_node(struct btree *); - -void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct btree_trans *, struct btree *); - -int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, - struct bch_io_failures *, - struct printbuf *); -void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); -int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); - -void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); - -int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, unsigned); - -bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); - -enum btree_write_flags { - __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, - __BTREE_WRITE_ALREADY_STARTED, -}; -#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) -#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) - -void __bch2_btree_node_write(struct bch_fs *, 
struct btree *, unsigned); -void bch2_btree_node_write(struct bch_fs *, struct btree *, - enum six_lock_type, unsigned); -void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, - enum six_lock_type, unsigned); - -static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, - enum six_lock_type lock_held) -{ - bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); -} - -bool bch2_btree_flush_all_reads(struct bch_fs *); -bool bch2_btree_flush_all_writes(struct bch_fs *); - -static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) -{ - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) { - swap(f->bits_per_field[BKEY_FIELD_INODE], - f->bits_per_field[BKEY_FIELD_OFFSET]); - swap(f->field_offset[BKEY_FIELD_INODE], - f->field_offset[BKEY_FIELD_OFFSET]); - } - - if (version < bcachefs_metadata_version_snapshot && - (level || btree_type_has_snapshots(btree_id))) { - u64 max_packed = - ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); - - f->field_offset[BKEY_FIELD_SNAPSHOT] = write - ? 0 - : cpu_to_le64(U32_MAX - max_packed); - } -} - -static inline void compat_bpos(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bpos *p) -{ - if (big_endian != CPU_BIG_ENDIAN) - bch2_bpos_swab(p); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_inodes) - swap(p->inode, p->offset); -} - -static inline void compat_btree_node(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, - struct btree_node *bn) -{ - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bpos_eq(bn->min_key, POS_MIN) && - write) - bn->min_key = bpos_nosnap_predecessor(bn->min_key); - - if (version < bcachefs_metadata_version_snapshot && - write) - bn->max_key.snapshot = 0; - - compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); - compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); - - if (version < bcachefs_metadata_version_snapshot && - !write) - bn->max_key.snapshot = U32_MAX; - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bpos_eq(bn->min_key, POS_MIN) && - !write) - bn->min_key = bpos_nosnap_successor(bn->min_key); -} - -void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c deleted file mode 100644 index f8829b667ad35e..00000000000000 --- a/fs/bcachefs/btree_iter.c +++ /dev/null @@ -1,3804 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "journal.h" -#include "journal_io.h" -#include "replicas.h" -#include "snapshot.h" -#include "super.h" -#include "trace.h" - -#include -#include - -static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, - btree_path_idx_t, btree_path_idx_t); - -static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) 
-{ -#ifdef TRACK_PATH_ALLOCATED - return iter->ip_allocated; -#else - return 0; -#endif -} - -static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); -static void bch2_trans_srcu_lock(struct btree_trans *); - -static inline int __btree_path_cmp(const struct btree_path *l, - enum btree_id r_btree_id, - bool r_cached, - struct bpos r_pos, - unsigned r_level) -{ - /* - * Must match lock ordering as defined by __bch2_btree_node_lock: - */ - return cmp_int(l->btree_id, r_btree_id) ?: - cmp_int((int) l->cached, (int) r_cached) ?: - bpos_cmp(l->pos, r_pos) ?: - -cmp_int(l->level, r_level); -} - -static inline int btree_path_cmp(const struct btree_path *l, - const struct btree_path *r) -{ - return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); -} - -static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) -{ - /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_all_snapshots) { - p = bpos_successor(p); - } else { - p = bpos_nosnap_successor(p); - p.snapshot = iter->snapshot; - } - - return p; -} - -static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) -{ - /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_all_snapshots) { - p = bpos_predecessor(p); - } else { - p = bpos_nosnap_predecessor(p); - p.snapshot = iter->snapshot; - } - - return p; -} - -static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -{ - struct bpos pos = iter->pos; - - if ((iter->flags & BTREE_ITER_is_extents) && - !bkey_eq(pos, POS_MAX)) - pos = bkey_successor(iter, pos); - return pos; -} - -static inline bool btree_path_pos_before_node(struct btree_path *path, - struct btree *b) -{ - return bpos_lt(path->pos, b->data->min_key); -} - -static inline bool btree_path_pos_after_node(struct btree_path *path, - struct btree *b) -{ - return bpos_gt(path->pos, b->key.k.p); -} - -static inline bool btree_path_pos_in_node(struct btree_path *path, - struct btree *b) -{ - return path->btree_id == b->c.btree_id && - !btree_path_pos_before_node(path, b) && - !btree_path_pos_after_node(path, b); -} - -/* Debug: */ - -static void __bch2_btree_path_verify_cached(struct btree_trans *trans, - struct btree_path *path) -{ - struct bkey_cached *ck; - bool locked = btree_node_locked(path, 0); - - if (!bch2_btree_node_relock(trans, path, 0)) - return; - - ck = (void *) path->l[0].b; - BUG_ON(ck->key.btree_id != path->btree_id || - !bkey_eq(ck->key.pos, path->pos)); - - if (!locked) - btree_node_unlock(trans, path, 0); -} - -static void __bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree_path_level *l; - struct btree_node_iter tmp; - bool locked; - struct bkey_packed *p, *k; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - const char *msg; - - l = &path->l[level]; - tmp = l->iter; - locked = btree_node_locked(path, level); - - if (path->cached) { - if (!level) - __bch2_btree_path_verify_cached(trans, path); - return; - } - - if (!btree_path_node(path, level)) - return; - - if (!bch2_btree_node_relock_notrace(trans, path, level)) - return; - - BUG_ON(!btree_path_pos_in_node(path, l->b)); - - bch2_btree_node_iter_verify(&l->iter, l->b); - - /* - * For interior nodes, the iterator will have skipped past deleted keys: - */ - p = level - ? 
bch2_btree_node_iter_prev(&tmp, l->b) - : bch2_btree_node_iter_prev_all(&tmp, l->b); - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - - if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { - msg = "before"; - goto err; - } - - if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { - msg = "after"; - goto err; - } - - if (!locked) - btree_node_unlock(trans, path, level); - return; -err: - bch2_bpos_to_text(&buf1, path->pos); - - if (p) { - struct bkey uk = bkey_unpack_key(l->b, p); - - bch2_bkey_to_text(&buf2, &uk); - } else { - prt_printf(&buf2, "(none)"); - } - - if (k) { - struct bkey uk = bkey_unpack_key(l->b, k); - - bch2_bkey_to_text(&buf3, &uk); - } else { - prt_printf(&buf3, "(none)"); - } - - panic("path should be %s key at level %u:\n" - "path pos %s\n" - "prev key %s\n" - "cur key %s\n", - msg, level, buf1.buf, buf2.buf, buf3.buf); -} - -static void __bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - - for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { - if (!path->l[i].b) { - BUG_ON(!path->cached && - bch2_btree_id_root(c, path->btree_id)->b->c.level > i); - break; - } - - __bch2_btree_path_verify_level(trans, path, i); - } - - bch2_btree_path_verify_locks(trans, path); -} - -void __bch2_trans_verify_paths(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned iter; - - trans_for_each_path(trans, path, iter) - __bch2_btree_path_verify(trans, path); -} - -static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) -{ - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - - BUG_ON((iter->flags & BTREE_ITER_is_extents) && - (iter->flags & BTREE_ITER_all_snapshots)); - - BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && - (iter->flags & BTREE_ITER_all_snapshots) && - !btree_type_has_snapshot_field(iter->btree_id)); - - if (iter->update_path) - __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); -} - -static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -{ - BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && - !iter->pos.snapshot); - - BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && - iter->pos.snapshot != iter->snapshot); - - BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : - !(iter->flags & BTREE_ITER_is_extents) ? 
!bkey_eq(iter->pos, iter->k.p) : - (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || - bkey_gt(iter->pos, iter->k.p))); -} - -static int __bch2_btree_iter_verify_ret(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k) -{ - struct btree_iter copy; - struct bkey_s_c prev; - int ret = 0; - - if (!(iter->flags & BTREE_ITER_filter_snapshots)) - return 0; - - if (bkey_err(k) || !k.k) - return 0; - - BUG_ON(!bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)); - - bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, - BTREE_ITER_nopreserve| - BTREE_ITER_all_snapshots); - prev = bch2_btree_iter_prev(trans, ©); - if (!prev.k) - goto out; - - ret = bkey_err(prev); - if (ret) - goto out; - - if (bkey_eq(prev.k->p, k.k->p) && - bch2_snapshot_is_ancestor(trans->c, iter->snapshot, - prev.k->p.snapshot) > 0) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - bch2_bkey_to_text(&buf1, k.k); - bch2_bkey_to_text(&buf2, prev.k); - - panic("iter snap %u\n" - "k %s\n" - "prev %s\n", - iter->snapshot, - buf1.buf, buf2.buf); - } -out: - bch2_trans_iter_exit(trans, ©); - return ret; -} - -void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - struct btree_path *path; - struct trans_for_each_path_inorder_iter iter; - struct printbuf buf = PRINTBUF; - - btree_trans_sort_paths(trans); - - trans_for_each_path_inorder(trans, path, iter) { - if (path->btree_id != id || - !btree_node_locked(path, 0) || - !path->should_be_locked) - continue; - - if (!path->cached) { - if (bkey_ge(pos, path->l[0].b->data->min_key) && - bkey_le(pos, path->l[0].b->key.k.p)) - return; - } else { - if (bkey_eq(pos, path->pos)) - return; - } - } - - bch2_dump_trans_paths_updates(trans); - bch2_bpos_to_text(&buf, pos); - - panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); -} - -static inline void bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned l) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_btree_path_verify_level(trans, path, l); -} - -static inline void bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_btree_path_verify(trans, path); -} - -static inline void bch2_btree_iter_verify(struct btree_trans *trans, - struct btree_iter *iter) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_btree_iter_verify(trans, iter); -} - -static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_btree_iter_verify_entry_exit(iter); -} - -static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - return static_branch_unlikely(&bch2_debug_check_iterators) - ? 
__bch2_btree_iter_verify_ret(trans, iter, k) - : 0; -} - -/* Btree path: fixups after btree updates */ - -static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, - struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct btree_node_iter_set *set; - - btree_node_iter_for_each(iter, set) - if (set->end == t->end_offset) { - set->k = __btree_node_key_to_offset(b, k); - bch2_btree_node_iter_sort(iter, b); - return; - } - - bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -} - -static void __bch2_btree_path_fix_key_modified(struct btree_path *path, - struct btree *b, - struct bkey_packed *where) -{ - struct btree_path_level *l = &path->l[b->c.level]; - - if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) - return; - - if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) - bch2_btree_node_iter_advance(&l->iter, l->b); -} - -void bch2_btree_path_fix_key_modified(struct btree_trans *trans, - struct btree *b, - struct bkey_packed *where) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_with_node(trans, b, path, i) { - __bch2_btree_path_fix_key_modified(path, b, where); - bch2_btree_path_verify_level(trans, path, b->c.level); - } -} - -static void __bch2_btree_node_iter_fix(struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - const struct bkey_packed *end = btree_bkey_last(b, t); - struct btree_node_iter_set *set; - unsigned offset = __btree_node_key_to_offset(b, where); - int shift = new_u64s - clobber_u64s; - unsigned old_end = t->end_offset - shift; - unsigned orig_iter_pos = node_iter->data[0].k; - bool iter_current_key_modified = - orig_iter_pos >= offset && - orig_iter_pos <= offset + clobber_u64s; - - btree_node_iter_for_each(node_iter, set) - if (set->end == old_end) - goto found; - - /* didn't find the bset in the iterator - might have to readd it: */ - if (new_u64s && - bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { - bch2_btree_node_iter_push(node_iter, b, where, end); - goto fixup_done; - } else { - /* Iterator is after key that changed */ - return; - } -found: - set->end = t->end_offset; - - /* Iterator hasn't gotten to the key that changed yet: */ - if (set->k < offset) - return; - - if (new_u64s && - bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { - set->k = offset; - } else if (set->k < offset + clobber_u64s) { - set->k = offset + new_u64s; - if (set->k == set->end) - bch2_btree_node_iter_set_drop(node_iter, set); - } else { - /* Iterator is after key that changed */ - set->k = (int) set->k + shift; - return; - } - - bch2_btree_node_iter_sort(node_iter, b); -fixup_done: - if (node_iter->data[0].k != orig_iter_pos) - iter_current_key_modified = true; - - /* - * When a new key is added, and the node iterator now points to that - * key, the iterator might have skipped past deleted keys that should - * come after the key the iterator now points to. 
We have to rewind to - * before those deleted keys - otherwise - * bch2_btree_node_iter_prev_all() breaks: - */ - if (!bch2_btree_node_iter_end(node_iter) && - iter_current_key_modified && - b->c.level) { - struct bkey_packed *k, *k2, *p; - - k = bch2_btree_node_iter_peek_all(node_iter, b); - - for_each_bset(b, t) { - bool set_pos = false; - - if (node_iter->data[0].end == t->end_offset) - continue; - - k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); - - while ((p = bch2_bkey_prev_all(b, t, k2)) && - bkey_iter_cmp(b, k, p) < 0) { - k2 = p; - set_pos = true; - } - - if (set_pos) - btree_node_iter_set_set_pos(node_iter, - b, t, k2); - } - } -} - -void bch2_btree_node_iter_fix(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) -{ - struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); - struct btree_path *linked; - unsigned i; - - if (node_iter != &path->l[b->c.level].iter) { - __bch2_btree_node_iter_fix(path, b, node_iter, t, - where, clobber_u64s, new_u64s); - - if (static_branch_unlikely(&bch2_debug_check_iterators)) - bch2_btree_node_iter_verify(node_iter, b); - } - - trans_for_each_path_with_node(trans, b, linked, i) { - __bch2_btree_node_iter_fix(linked, b, - &linked->l[b->c.level].iter, t, - where, clobber_u64s, new_u64s); - bch2_btree_path_verify_level(trans, linked, b->c.level); - } -} - -/* Btree path level: pointer to a particular btree node and node iter */ - -static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, - struct btree_path_level *l, - struct bkey *u, - struct bkey_packed *k) -{ - if (unlikely(!k)) { - /* - * signal to bch2_btree_iter_peek_slot() that we're currently at - * a hole - */ - u->type = KEY_TYPE_deleted; - return bkey_s_c_null; - } - - return bkey_disassemble(l->b, k, u); -} - -static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, - struct btree_path_level *l, - struct bkey *u) -{ - return __btree_iter_unpack(c, l, u, - bch2_btree_node_iter_peek_all(&l->iter, l->b)); -} - -static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_prev(&l->iter, l->b)); - - path->pos = k.k ? 
k.k->p : l->b->data->min_key; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - -static inline bool btree_path_advance_to_pos(struct btree_path *path, - struct btree_path_level *l, - int max_advance) -{ - struct bkey_packed *k; - int nr_advanced = 0; - - while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { - if (max_advance > 0 && nr_advanced >= max_advance) - return false; - - bch2_btree_node_iter_advance(&l->iter, l->b); - nr_advanced++; - } - - return true; -} - -static inline void __btree_path_level_init(struct btree_path *path, - unsigned level) -{ - struct btree_path_level *l = &path->l[level]; - - bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); - - /* - * Iterators to interior nodes should always be pointed at the first non - * whiteout: - */ - if (level) - bch2_btree_node_iter_peek(&l->iter, l->b); -} - -void bch2_btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - BUG_ON(path->cached); - - EBUG_ON(!btree_path_pos_in_node(path, b)); - - path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); - path->l[b->c.level].b = b; - __btree_path_level_init(path, b->c.level); -} - -/* Btree path: fixups after btree node updates: */ - -static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - - trans_for_each_update(trans, i) - if (!i->cached && - i->level == b->c.level && - i->btree_id == b->c.btree_id && - bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && - bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, - i->k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } -} - -/* - * A btree node is being replaced - update the iterator to point to the new - * node: - */ -void bch2_trans_node_add(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct btree_path *prev; - - BUG_ON(!btree_path_pos_in_node(path, b)); - - while ((prev = prev_btree_path(trans, path)) && - btree_path_pos_in_node(prev, b)) - path = prev; - - for (; - path && btree_path_pos_in_node(path, b); - path = next_btree_path(trans, path)) - if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { - enum btree_node_locked_type t = - btree_lock_want(path, b->c.level); - - if (t != BTREE_NODE_UNLOCKED) { - btree_node_unlock(trans, path, b->c.level); - six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, t); - } - - bch2_btree_path_level_init(trans, path, b); - } - - bch2_trans_revalidate_updates_in_node(trans, b); -} - -void bch2_trans_node_drop(struct btree_trans *trans, - struct btree *b) -{ - struct btree_path *path; - unsigned i, level = b->c.level; - - trans_for_each_path(trans, path, i) - if (path->l[level].b == b) { - btree_node_unlock(trans, path, level); - path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); - } -} - -/* - * A btree node has been modified in such a way as to invalidate iterators - fix - * them: - */ -void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_with_node(trans, b, path, i) - __btree_path_level_init(path, b->c.level); - - 
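/*
 * Editor's sketch: bch2_trans_node_add() above first walks backwards to the
 * lowest path positioned inside the new node, then forward over every path
 * in that range, repointing the relevant level at the replacement node. The
 * same scan over a plain sorted array (types and names illustrative only):
 */
#include <stddef.h>

struct path_ent {
	int		pos;
	const void	*node;		/* what this path points at, at one level */
};

/* paths[] is sorted by pos; repoint every path inside [node_min, node_max] */
static void repoint_paths(struct path_ent *paths, size_t nr,
			  int node_min, int node_max, const void *new_node)
{
	size_t i = 0;

	while (i < nr && paths[i].pos < node_min)	/* skip paths before the node */
		i++;

	for (; i < nr && paths[i].pos <= node_max; i++)
		paths[i].node = new_node;
}

int main(void)
{
	struct path_ent paths[] = { { 5, 0 }, { 12, 0 }, { 20, 0 } };

	repoint_paths(paths, 3, 10, 15, "new node");	/* only pos 12 is inside */
	return paths[1].node && !paths[0].node && !paths[2].node ? 0 : 1;
}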
bch2_trans_revalidate_updates_in_node(trans, b); -} - -/* Btree path: traverse, set_pos: */ - -static inline int btree_path_lock_root(struct btree_trans *trans, - struct btree_path *path, - unsigned depth_want, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, path->btree_id); - enum six_lock_type lock_type; - unsigned i; - int ret; - - EBUG_ON(path->nodes_locked); - - while (1) { - struct btree *b = READ_ONCE(r->b); - if (unlikely(!b)) { - BUG_ON(!r->error); - return r->error; - } - - path->level = READ_ONCE(b->c.level); - - if (unlikely(path->level < depth_want)) { - /* - * the root is at a lower depth than the depth we want: - * got to the end of the btree, or we're walking nodes - * greater than some depth and there are no nodes >= - * that depth - */ - path->level = depth_want; - for (i = path->level; i < BTREE_MAX_DEPTH; i++) - path->l[i].b = NULL; - return 1; - } - - lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, &b->c, - path->level, lock_type, trace_ip); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - BUG(); - } - - if (likely(b == READ_ONCE(r->b) && - b->c.level == path->level && - !race_fault())) { - for (i = 0; i < path->level; i++) - path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); - path->l[path->level].b = b; - for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) - path->l[i].b = NULL; - - mark_btree_node_locked(trans, path, path->level, - (enum btree_node_locked_type) lock_type); - bch2_btree_path_level_init(trans, path, b); - return 0; - } - - six_unlock_type(&b->c.lock, lock_type); - } -} - -noinline -static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *k; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (path->level > 1 ? 0 : 2) - : (path->level > 1 ? 1 : 16); - bool was_locked = btree_node_locked(path, path->level); - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - while (nr-- && !ret) { - if (!bch2_btree_node_relock(trans, path, path->level)) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); - k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!k) - break; - - bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, - path->level - 1); - } - - if (!was_locked) - btree_node_unlock(trans, path, path->level); - - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, - struct btree_and_journal_iter *jiter) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (path->level > 1 ? 0 : 2) - : (path->level > 1 ? 
1 : 16); - bool was_locked = btree_node_locked(path, path->level); - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - jiter->fail_if_too_many_whiteouts = true; - - while (nr-- && !ret) { - if (!bch2_btree_node_relock(trans, path, path->level)) - break; - - bch2_btree_and_journal_iter_advance(jiter); - k = bch2_btree_and_journal_iter_peek(jiter); - if (!k.k) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, - path->level - 1); - } - - if (!was_locked) - btree_node_unlock(trans, path, path->level); - - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, - struct btree_path *path, - unsigned plevel, struct btree *b) -{ - struct btree_path_level *l = &path->l[plevel]; - bool locked = btree_node_locked(path, plevel); - struct bkey_packed *k; - struct bch_btree_ptr_v2 *bp; - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); - - bp = (void *) bkeyp_val(&l->b->format, k); - bp->mem_ptr = (unsigned long)b; - - if (!locked) - btree_node_unlock(trans, path, plevel); -} - -static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, - struct btree_path *path, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree_and_journal_iter jiter; - struct bkey_s_c k; - int ret = 0; - - __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); - - k = bch2_btree_and_journal_iter_peek(&jiter); - if (!k.k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " at btree "); - bch2_btree_pos_to_text(&buf, c, l->b); - - ret = bch2_fs_topology_error(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - bkey_reassemble(&trans->btree_path_down, k); - - if ((flags & BTREE_ITER_prefetch) && - c->opts.btree_node_prefetch) - ret = btree_path_prefetch_j(trans, path, &jiter); - -err: - bch2_btree_and_journal_iter_exit(&jiter); - return ret; -} - -static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); - - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, btree_need_topology_repair); -} - -static __always_inline int btree_path_down(struct btree_trans *trans, - struct btree_path *path, - unsigned flags, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l = path_l(path); - struct btree *b; - unsigned level = path->level - 1; - enum six_lock_type lock_type = __btree_lock_want(path, level); - int ret; - - EBUG_ON(!btree_node_locked(path, path->level)); - - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags); - if (ret) - return ret; - } else { - struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (unlikely(!k)) - return btree_node_missing_err(trans, path); - - bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - - if (unlikely((flags & BTREE_ITER_prefetch)) && - c->opts.btree_node_prefetch) { - ret = btree_path_prefetch(trans, path); 
- if (ret) - return ret; - } - } - - b = bch2_btree_node_get(trans, path, &trans->btree_path_down, - level, lock_type, trace_ip); - ret = PTR_ERR_OR_ZERO(b); - if (unlikely(ret)) - return ret; - - if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && - likely(!trans->journal_replay_not_finished && - trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) - btree_node_mem_ptr_set(trans, path, level + 1, b); - - if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(trans, path, level + 1); - - mark_btree_node_locked(trans, path, level, - (enum btree_node_locked_type) lock_type); - path->level = level; - bch2_btree_path_level_init(trans, path, b); - - bch2_btree_path_verify_locks(trans, path); - return 0; -} - -static int bch2_btree_path_traverse_all(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_path *path; - unsigned long trace_ip = _RET_IP_; - unsigned i; - int ret = 0; - - if (trans->in_traverse_all) - return bch_err_throw(c, transaction_restart_in_traverse_all); - - trans->in_traverse_all = true; -retry_all: - trans->restarted = 0; - trans->last_restarted_ip = 0; - - trans_for_each_path(trans, path, i) - path->should_be_locked = false; - - btree_trans_sort_paths(trans); - - bch2_trans_unlock(trans); - cond_resched(); - trans_set_locked(trans, false); - - if (unlikely(trans->memory_allocation_failure)) { - struct closure cl; - - closure_init_stack(&cl); - - do { - ret = bch2_btree_cache_cannibalize_lock(trans, &cl); - closure_sync(&cl); - } while (ret); - } - - /* Now, redo traversals in correct order: */ - i = 0; - while (i < trans->nr_sorted) { - btree_path_idx_t idx = trans->sorted[i]; - - /* - * Traversing a path can cause another path to be added at about - * the same position: - */ - if (trans->paths[idx].uptodate) { - __btree_path_get(trans, &trans->paths[idx], false); - ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); - __btree_path_put(trans, &trans->paths[idx], false); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, ENOMEM)) - goto retry_all; - if (ret) - goto err; - } else { - i++; - } - } - - /* - * We used to assert that all paths had been traversed here - * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since - * path->should_be_locked is not set yet, we might have unlocked and - * then failed to relock a path - that's fine. 
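-	 * Such paths are simply left !uptodate; they'll be re-traversed the
-	 * next time they're used.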
- */ -err: - bch2_btree_cache_cannibalize_unlock(trans); - - trans->in_traverse_all = false; - - trace_and_count(c, trans_traverse_all, trans, trace_ip); - return ret; -} - -static inline bool btree_path_check_pos_in_node(struct btree_path *path, - unsigned l, int check_pos) -{ - if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) - return false; - if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) - return false; - return true; -} - -static inline bool btree_path_good_node(struct btree_trans *trans, - struct btree_path *path, - unsigned l, int check_pos) -{ - return is_btree_node(path, l) && - bch2_btree_node_relock(trans, path, l) && - btree_path_check_pos_in_node(path, l, check_pos); -} - -static void btree_path_set_level_down(struct btree_trans *trans, - struct btree_path *path, - unsigned new_level) -{ - unsigned l; - - path->level = new_level; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(trans, path, l); - - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_path_verify(trans, path); -} - -static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) -{ - unsigned i, l = path->level; -again: - while (btree_path_node(path, l) && - !btree_path_good_node(trans, path, l, check_pos)) - __btree_path_set_level_up(trans, path, l++); - - /* If we need intent locks, take them too: */ - for (i = l + 1; - i < path->locks_want && btree_path_node(path, i); - i++) - if (!bch2_btree_node_relock(trans, path, i)) { - while (l <= i) - __btree_path_set_level_up(trans, path, l++); - goto again; - } - - return l; -} - -static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, - struct btree_path *path, - int check_pos) -{ - return likely(btree_node_locked(path, path->level) && - btree_path_check_pos_in_node(path, path->level, check_pos)) - ? path->level - : __btree_path_up_until_good_node(trans, path, check_pos); -} - -/* - * This is the main state machine for walking down the btree - walks down to a - * specified depth - * - * Returns 0 on success, -EIO on error (error reading in a btree node). - * - * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch2_trans_exit(). 
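- *
- * A return of 1 from btree_path_lock_root() - the btree being shallower than
- * the depth asked for - is handled internally and converted to 0.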
- */ -int bch2_btree_path_traverse_one(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned flags, - unsigned long trace_ip) -{ - struct btree_path *path = &trans->paths[path_idx]; - unsigned depth_want = path->level; - int ret = -((int) trans->restarted); - - if (unlikely(ret)) - goto out; - - if (unlikely(!trans->srcu_held)) - bch2_trans_srcu_lock(trans); - - trace_btree_path_traverse_start(trans, path); - - /* - * Ensure we obey path->should_be_locked: if it's set, we can't unlock - * and re-traverse the path without a transaction restart: - */ - if (path->should_be_locked) { - ret = bch2_btree_path_relock(trans, path, trace_ip); - goto out; - } - - if (path->cached) { - ret = bch2_btree_path_traverse_cached(trans, path_idx, flags); - goto out; - } - - path = &trans->paths[path_idx]; - - if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out_uptodate; - - path->level = btree_path_up_until_good_node(trans, path, 0); - unsigned max_level = path->level; - - EBUG_ON(btree_path_node(path, path->level) && - !btree_node_locked(path, path->level)); - - /* - * Note: path->nodes[path->level] may be temporarily NULL here - that - * would indicate to other code that we got to the end of the btree, - * here it indicates that relocking the root failed - it's critical that - * btree_path_lock_root() comes next and that it can't fail - */ - while (path->level > depth_want) { - ret = btree_path_node(path, path->level) - ? btree_path_down(trans, path, flags, trace_ip) - : btree_path_lock_root(trans, path, depth_want, trace_ip); - if (unlikely(ret)) { - if (ret == 1) { - /* - * No nodes at this level - got to the end of - * the btree: - */ - ret = 0; - goto out; - } - - __bch2_btree_path_unlock(trans, path); - path->level = depth_want; - path->l[path->level].b = ERR_PTR(ret); - goto out; - } - } - - if (unlikely(max_level > path->level)) { - struct btree_path *linked; - unsigned iter; - - trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter) - for (unsigned j = path->level + 1; j < max_level; j++) - linked->l[j] = path->l[j]; - } - -out_uptodate: - path->uptodate = BTREE_ITER_UPTODATE; - trace_btree_path_traverse_end(trans, path); -out: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) - panic("ret %s (%i) trans->restarted %s (%i)\n", - bch2_err_str(ret), ret, - bch2_err_str(trans->restarted), trans->restarted); - bch2_btree_path_verify(trans, path); - return ret; -} - -static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, - struct btree_path *src) -{ - unsigned i, offset = offsetof(struct btree_path, pos); - - memcpy((void *) dst + offset, - (void *) src + offset, - sizeof(struct btree_path) - offset); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) { - unsigned t = btree_node_locked_type(dst, i); - - if (t != BTREE_NODE_UNLOCKED) - six_lock_increment(&dst->l[i].b->c.lock, t); - } -} - -static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent, unsigned long ip) -{ - btree_path_idx_t new = btree_path_alloc(trans, src); - btree_path_copy(trans, trans->paths + new, trans->paths + src); - __btree_path_get(trans, trans->paths + new, intent); -#ifdef TRACK_PATH_ALLOCATED - trans->paths[new].ip_allocated = ip; -#endif - return new; -} - -__flatten -btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, - btree_path_idx_t path, bool intent, unsigned long ip) -{ - struct btree_path *old = trans->paths + path; - __btree_path_put(trans, trans->paths + path, intent); - 
path = btree_path_clone(trans, path, intent, ip); - trace_btree_path_clone(trans, old, trans->paths + path); - trans->paths[path].preserve = false; - return path; -} - -btree_path_idx_t __must_check -__bch2_btree_path_set_pos(struct btree_trans *trans, - btree_path_idx_t path_idx, struct bpos new_pos, - bool intent, unsigned long ip) -{ - int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - EBUG_ON(!trans->paths[path_idx].ref); - - trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); - - path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); - - struct btree_path *path = trans->paths + path_idx; - path->pos = new_pos; - trans->paths_sorted = false; - - if (unlikely(path->cached)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - goto out; - } - - unsigned level = btree_path_up_until_good_node(trans, path, cmp); - - if (btree_path_node(path, level)) { - struct btree_path_level *l = &path->l[level]; - - BUG_ON(!btree_node_locked(path, level)); - /* - * We might have to skip over many keys, or just a few: try - * advancing the node iterator, and if we have to skip over too - * many keys just reinit it (or if we're rewinding, since that - * is expensive). - */ - if (cmp < 0 || - !btree_path_advance_to_pos(path, l, 8)) - bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); - - /* - * Iterators to interior nodes should always be pointed at the first non - * whiteout: - */ - if (unlikely(level)) - bch2_btree_node_iter_peek(&l->iter, l->b); - } - - if (unlikely(level != path->level)) { - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - __bch2_btree_path_unlock(trans, path); - } -out: - bch2_btree_path_verify(trans, path); - return path_idx; -} - -/* Btree path: main interface: */ - -static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) -{ - struct btree_path *sib; - - sib = prev_btree_path(trans, path); - if (sib && !btree_path_cmp(sib, path)) - return sib; - - sib = next_btree_path(trans, path); - if (sib && !btree_path_cmp(sib, path)) - return sib; - - return NULL; -} - -static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) -{ - struct btree_path *sib; - - sib = prev_btree_path(trans, path); - if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) - return sib; - - sib = next_btree_path(trans, path); - if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) - return sib; - - return NULL; -} - -static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) -{ - __bch2_btree_path_unlock(trans, trans->paths + path); - btree_path_list_remove(trans, trans->paths + path); - __clear_bit(path, trans->paths_allocated); -} - -static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) -{ - unsigned l = path->level; - - do { - if (!btree_path_node(path, l)) - break; - - if (!is_btree_node(path, l)) - return false; - - if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) - return false; - - l++; - } while (l < path->locks_want); - - return true; -} - -void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) -{ - struct btree_path *path = trans->paths + path_idx, *dup = NULL; - - if (!__btree_path_put(trans, path, intent)) - return; - - if (!path->preserve && !path->should_be_locked) - goto free; - - 
dup = path->preserve
-		? have_path_at_pos(trans, path)
-		: have_node_at_pos(trans, path);
-	if (!dup)
-		return;
-
-	/*
-	 * If we need this path locked, the duplicate also has to be locked
-	 * before we free this one:
-	 */
-	if (path->should_be_locked &&
-	    !dup->should_be_locked &&
-	    !trans->restarted) {
-		if (!(trans->locked
-		      ? bch2_btree_path_relock_norestart(trans, dup)
-		      : bch2_btree_path_can_relock(trans, dup)))
-			return;
-
-		dup->should_be_locked = true;
-	}
-
-	BUG_ON(path->should_be_locked &&
-	       !trans->restarted &&
-	       trans->locked &&
-	       !btree_node_locked(dup, dup->level));
-
-	path->should_be_locked = false;
-	dup->preserve |= path->preserve;
-free:
-	trace_btree_path_free(trans, path_idx, dup);
-	__bch2_path_free(trans, path_idx);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
-	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
-	      trans->restart_count, restart_count,
-	      (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct printbuf buf = PRINTBUF;
-	bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
-	panic("in transaction restart: %s, last restarted by\n%s",
-	      bch2_err_str(trans->restarted),
-	      buf.buf);
-#else
-	panic("in transaction restart: %s, last restarted by %pS\n",
-	      bch2_err_str(trans->restarted),
-	      (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
-	if (trans->restarted)
-		bch2_trans_in_restart_error(trans);
-
-	if (!trans->locked)
-		panic("trans should be locked, unlocked by %pS\n",
-		      (void *) trans->last_unlock_ip);
-
-	BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
-	prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
-		   trans->nr_updates, trans->fn, trans->journal_res.seq);
-	printbuf_indent_add(buf, 2);
-
-	trans_for_each_update(trans, i) {
-		struct bkey_s_c old = { &i->old_k, i->old_v };
-
-		prt_str(buf, "update: btree=");
-		bch2_btree_id_to_text(buf, i->btree_id);
-		prt_printf(buf, " cached=%u %pS\n",
-			   i->cached,
-			   (void *) i->ip_allocated);
-
-		prt_printf(buf, "  old ");
-		bch2_bkey_val_to_text(buf, trans->c, old);
-		prt_newline(buf);
-
-		prt_printf(buf, "  new ");
-		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
-		prt_newline(buf);
-	}
-
-	for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
-	     e != btree_trans_journal_entries_top(trans);
-	     e = vstruct_next(e)) {
-		bch2_journal_entry_to_text(buf, trans->c, e);
-		prt_newline(buf);
-	}
-
-	printbuf_indent_sub(buf, 2);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
-		   path_idx, path->ref, path->intent_ref,
-		   path->preserve ? 'P' : ' ',
-		   path->should_be_locked ? 'S' : ' ',
-		   path->cached ?
'C' : 'B'); - bch2_btree_id_level_to_text(out, path->btree_id, path->level); - prt_str(out, " pos "); - bch2_bpos_to_text(out, path->pos); - - if (!path->cached && btree_node_locked(path, path->level)) { - prt_char(out, ' '); - struct btree *b = path_l(path)->b; - bch2_bpos_to_text(out, b->data->min_key); - prt_char(out, '-'); - bch2_bpos_to_text(out, b->key.k.p); - } - -#ifdef TRACK_PATH_ALLOCATED - prt_printf(out, " %pS", (void *) path->ip_allocated); -#endif -} - -static const char *btree_node_locked_str(enum btree_node_locked_type t) -{ - switch (t) { - case BTREE_NODE_UNLOCKED: - return "unlocked"; - case BTREE_NODE_READ_LOCKED: - return "read"; - case BTREE_NODE_INTENT_LOCKED: - return "intent"; - case BTREE_NODE_WRITE_LOCKED: - return "write"; - default: - return NULL; - } -} - -void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) -{ - bch2_btree_path_to_text_short(out, trans, path_idx); - - struct btree_path *path = trans->paths + path_idx; - - prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { - prt_printf(out, "l=%u locks %s seq %u node ", l, - btree_node_locked_str(btree_node_locked_type(path, l)), - path->l[l].lock_seq); - - int ret = PTR_ERR_OR_ZERO(path->l[l].b); - if (ret) - prt_str(out, bch2_err_str(ret)); - else - prt_printf(out, "%px", path->l[l].b); - prt_newline(out); - } - printbuf_indent_sub(out, 2); -} - -static noinline __cold -void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, - bool nosort) -{ - struct trans_for_each_path_inorder_iter iter; - - if (!nosort) - btree_trans_sort_paths(trans); - - trans_for_each_path_idx_inorder(trans, iter) { - bch2_btree_path_to_text_short(out, trans, iter.path_idx); - prt_newline(out); - } -} - -noinline __cold -void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) -{ - __bch2_trans_paths_to_text(out, trans, false); -} - -static noinline __cold -void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) -{ - struct printbuf buf = PRINTBUF; - - __bch2_trans_paths_to_text(&buf, trans, nosort); - bch2_trans_updates_to_text(&buf, trans); - - bch2_print_str(trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) -{ - __bch2_dump_trans_paths_updates(trans, false); -} - -noinline __cold -static void bch2_trans_update_max_paths(struct btree_trans *trans) -{ - struct btree_transaction_stats *s = btree_trans_stats(trans); - struct printbuf buf = PRINTBUF; - size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); - - bch2_trans_paths_to_text(&buf, trans); - - if (!buf.allocation_failure) { - mutex_lock(&s->lock); - if (nr > s->nr_max_paths) { - s->nr_max_paths = nr; - swap(s->max_paths_text, buf.buf); - } - mutex_unlock(&s->lock); - } - - printbuf_exit(&buf); - - trans->nr_paths_max = nr; -} - -noinline __cold -int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) -{ - if (trace_trans_restart_too_many_iters_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_trans_paths_to_text(&buf, trans); - trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); - printbuf_exit(&buf); - } - - count_event(trans->c, trans_restart_too_many_iters); - - return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); -} - -static noinline void btree_path_overflow(struct 
btree_trans *trans) -{ - bch2_dump_trans_paths_updates(trans); - bch_err(trans->c, "trans path overflow"); -} - -static noinline void btree_paths_realloc(struct btree_trans *trans) -{ - unsigned nr = trans->nr_paths * 2; - - void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + - sizeof(struct btree_trans_paths) + - nr * sizeof(struct btree_path) + - nr * sizeof(btree_path_idx_t) + 8 + - nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); - - unsigned long *paths_allocated = p; - memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); - p += BITS_TO_LONGS(nr) * sizeof(unsigned long); - - p += sizeof(struct btree_trans_paths); - struct btree_path *paths = p; - *trans_paths_nr(paths) = nr; - memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); - p += nr * sizeof(struct btree_path); - - btree_path_idx_t *sorted = p; - memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); - p += nr * sizeof(btree_path_idx_t) + 8; - - struct btree_insert_entry *updates = p; - memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); - - unsigned long *old = trans->paths_allocated; - - rcu_assign_pointer(trans->paths_allocated, paths_allocated); - rcu_assign_pointer(trans->paths, paths); - rcu_assign_pointer(trans->sorted, sorted); - rcu_assign_pointer(trans->updates, updates); - - trans->nr_paths = nr; - - if (old != trans->_paths_allocated) - kfree_rcu_mightsleep(old); -} - -static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, - btree_path_idx_t pos) -{ - btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); - - if (unlikely(idx == trans->nr_paths)) { - if (trans->nr_paths == BTREE_ITER_MAX) { - btree_path_overflow(trans); - return 0; - } - - btree_paths_realloc(trans); - } - - /* - * Do this before marking the new path as allocated, since it won't be - * initialized yet: - */ - if (unlikely(idx > trans->nr_paths_max)) - bch2_trans_update_max_paths(trans); - - __set_bit(idx, trans->paths_allocated); - - struct btree_path *path = &trans->paths[idx]; - path->ref = 0; - path->intent_ref = 0; - path->nodes_locked = 0; - - btree_path_list_add(trans, pos, idx); - trans->paths_sorted = false; - return idx; -} - -btree_path_idx_t bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) -{ - struct btree_path *path; - bool cached = flags & BTREE_ITER_cached; - bool intent = flags & BTREE_ITER_intent; - struct trans_for_each_path_inorder_iter iter; - btree_path_idx_t path_pos = 0, path_idx; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_trans_verify_locks(trans); - - btree_trans_sort_paths(trans); - - if (intent) - locks_want = max(locks_want, level + 1); - locks_want = min(locks_want, BTREE_MAX_DEPTH); - - trans_for_each_path_inorder(trans, path, iter) { - if (__btree_path_cmp(path, - btree_id, - cached, - pos, - level) > 0) - break; - - path_pos = iter.path_idx; - } - - if (path_pos && - trans->paths[path_pos].cached == cached && - trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level && - bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { - trace_btree_path_get(trans, trans->paths + path_pos, &pos); - - __btree_path_get(trans, trans->paths + path_pos, intent); - path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); - path = 
trans->paths + path_idx; - } else { - path_idx = btree_path_alloc(trans, path_pos); - path = trans->paths + path_idx; - - __btree_path_get(trans, path, intent); - path->pos = pos; - path->btree_id = btree_id; - path->cached = cached; - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - path->should_be_locked = false; - path->level = level; - path->locks_want = locks_want; - path->nodes_locked = 0; - for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) - path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -#ifdef TRACK_PATH_ALLOCATED - path->ip_allocated = ip; -#endif - trans->paths_sorted = false; - - trace_btree_path_alloc(trans, path); - } - - if (!(flags & BTREE_ITER_nopreserve)) - path->preserve = true; - - /* - * If the path has locks_want greater than requested, we don't downgrade - * it here - on transaction restart because btree node split needs to - * upgrade locks, we might be putting/getting the iterator again. - * Downgrading iterators only happens via bch2_trans_downgrade(), after - * a successful transaction commit. - */ - - return path_idx; -} - -btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_nopreserve| - BTREE_ITER_intent, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - -struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) -{ - - struct btree_path_level *l = path_l(path); - struct bkey_packed *_k; - struct bkey_s_c k; - - if (unlikely(!l->b)) - return bkey_s_c_null; - - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - EBUG_ON(!btree_node_locked(path, path->level)); - - if (!path->cached) { - _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; - - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); - - if (!k.k || !bpos_eq(path->pos, k.k->p)) - goto hole; - } else { - struct bkey_cached *ck = (void *) path->l[0].b; - if (!ck) - return bkey_s_c_null; - - EBUG_ON(path->btree_id != ck->key.btree_id || - !bkey_eq(path->pos, ck->key.pos)); - - *u = ck->k->k; - k = (struct bkey_s_c) { u, &ck->k->v }; - } - - return k; -hole: - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!iter->path || trans->restarted) - return; - - struct btree_path *path = btree_iter_path(trans, iter); - path->preserve = false; - if (path->ref == 1) - path->should_be_locked = false; -} -/* Btree iterators: */ - -int __must_check -__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) -{ - return bch2_btree_path_traverse(trans, iter->path, iter->flags); -} - -int __must_check -bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - iter->path = bch2_btree_path_set_pos(trans, iter->path, - btree_iter_search_key(iter), - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - return ret; - - struct btree_path *path = btree_iter_path(trans, iter); - if (btree_path_node(path, path->level)) - btree_path_set_should_be_locked(trans, path); - return 0; -} - -/* Iterate across nodes (leaf and interior nodes) */ - -struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - struct btree_path *path = btree_iter_path(trans, iter); - b = btree_path_node(path, path->level); - if (!b) - goto out; - - BUG_ON(bpos_lt(b->key.k.p, iter->pos)); - - bkey_init(&iter->k); - iter->k.p = iter->pos = b->key.k.p; - - iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - - return b; -err: - b = ERR_PTR(ret); - goto out; -} - -/* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree *b; - - while (b = bch2_btree_iter_peek_node(trans, iter), - bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(trans); - - return b; -} - -struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree *b = NULL; - int ret; - - EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(trans, iter); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - - struct btree_path *path = btree_iter_path(trans, iter); - - /* already at end? */ - if (!btree_path_node(path, path->level)) - return NULL; - - /* got to end? 
*/ - if (!btree_path_node(path, path->level + 1)) { - path->should_be_locked = false; - btree_path_set_level_up(trans, path); - return NULL; - } - - /* - * We don't correctly handle nodes with extra intent locks here: - * downgrade so we don't violate locking invariants - */ - bch2_btree_path_downgrade(trans, path); - - if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - __bch2_btree_path_unlock(trans, path); - path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - goto err; - } - - b = btree_path_node(path, path->level + 1); - - if (bpos_eq(iter->pos, b->key.k.p)) { - __btree_path_set_level_up(trans, path, path->level++); - } else { - if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(trans, path, path->level + 1); - - /* - * Haven't gotten to the end of the parent node: go back down to - * the next child node - */ - iter->path = bch2_btree_path_set_pos(trans, iter->path, - bpos_successor(iter->pos), - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - path = btree_iter_path(trans, iter); - btree_path_set_level_down(trans, path, iter->min_depth); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (ret) - goto err; - - path = btree_iter_path(trans, iter); - b = path->l[path->level].b; - } - - bkey_init(&iter->k); - iter->k.p = iter->pos = b->key.k.p; - - iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); - EBUG_ON(btree_iter_path(trans, iter)->uptodate); -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - - return b; -err: - b = ERR_PTR(ret); - goto out; -} - -/* Iterate across keys (in leaf nodes only) */ - -inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_all_snapshots - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); - return ret; -} - -inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_all_snapshots - ? bpos_eq(pos, POS_MIN) - : bkey_eq(pos, POS_MIN)); - - if (ret && !(iter->flags & BTREE_ITER_is_extents)) - pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); - return ret; -} - -static noinline -void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key, struct bkey_s_c *k) -{ - struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; - - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_le(i->k->k.p, search_key) && - bpos_ge(i->k->k.p, k->k ? 
k->k->p : end)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static noinline -void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bpos end = path_l(path)->b->key.k.p; - - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_ge(i->k->k.p, search_key) && - bpos_le(i->k->k.p, k->k ? k->k->p : end)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static noinline -void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k) -{ - trans_for_each_update(trans, i) - if (!i->key_cache_already_flushed && - i->btree_id == iter->btree_id && - bpos_eq(i->k->k.p, iter->pos)) { - iter->k = i->k->k; - *k = bkey_i_to_s_c(i->k); - } -} - -static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_pos, - struct bpos end_pos) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - return bch2_journal_keys_peek_max(trans->c, iter->btree_id, - path->level, - search_pos, - end_pos, - &iter->journal_idx); -} - -static noinline -struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, - struct btree_iter *iter) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); - - if (k) { - iter->k = k->k; - return bkey_i_to_s_c(k); - } else { - return bkey_s_c_null; - } -} - -static noinline -void btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, search_key, - k->k ? k->k->p : path_l(path)->b->key.k.p); - if (next_journal) { - iter->k = next_journal->k; - *k = bkey_i_to_s_c(next_journal); - } -} - -static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bpos end_pos) -{ - struct btree_path *path = btree_iter_path(trans, iter); - - return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, - path->level, - search_key, - end_pos, - &iter->journal_idx); -} - -static noinline -void btree_trans_peek_prev_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos search_key, - struct bkey_s_c *k) -{ - struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *next_journal = - bch2_btree_journal_peek_prev(trans, iter, search_key, - k->k ? 
k->k->p : path_l(path)->b->data->min_key); - - if (next_journal) { - iter->k = next_journal->k; - *k = bkey_i_to_s_c(next_journal); - } -} - -/* - * Checks btree key cache for key at iter->pos and returns it if present, or - * bkey_s_c_null: - */ -static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if ((iter->flags & BTREE_ITER_key_cache_fill) && - bpos_eq(iter->pos, pos)) - return bkey_s_c_null; - - if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) - return bkey_s_c_null; - - if (!iter->key_cache_path) - iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_intent, 0, - iter->flags|BTREE_ITER_cached| - BTREE_ITER_cached_nofill, - _THIS_IP_); - - iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_cached) ?: - bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (!k.k) - return k; - - if ((iter->flags & BTREE_ITER_all_snapshots) && - !bpos_eq(pos, k.k->p)) - return bkey_s_c_null; - - iter->k = u; - k.k = &iter->k; - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - return k; -} - -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) -{ - struct bkey_s_c k, k2; - int ret; - - EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } - - struct btree_path *path = btree_iter_path(trans, iter); - struct btree_path_level *l = path_l(path); - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - - btree_path_set_should_be_locked(trans, path); - - k = btree_path_level_peek_all(trans->c, l, &iter->k); - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_journal(trans, iter, search_key, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_updates(trans, iter, search_key, &k); - - if (k.k && bkey_deleted(k.k)) { - /* - * If we've got a whiteout, and it's after the search - * key, advance the search key to the whiteout instead - * of just after the whiteout - it might be a btree - * whiteout, with a real key at the same position, since - * in the btree deleted keys sort before non deleted. - */ - search_key = !bpos_eq(search_key, k.k->p) - ? 
k.k->p
-				: bpos_successor(k.k->p);
-			continue;
-		}
-
-		if (likely(k.k)) {
-			break;
-		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
-			/* Advance to next leaf node: */
-			search_key = bpos_successor(l->b->key.k.p);
-		} else {
-			/* End of btree: */
-			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
-			k = bkey_s_c_null;
-			break;
-		}
-	}
-
-	bch2_btree_iter_verify(trans, iter);
-
-	if (trace___btree_iter_peek_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace___btree_iter_peek(trans->c, buf.buf);
-	}
-
-	return k;
-}
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- * @end:	search limit: returns keys less than or equal to @end
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
-					 struct bpos end)
-{
-	struct bpos search_key = btree_iter_search_key(iter);
-	struct bkey_s_c k;
-	struct bpos iter_pos = iter->pos;
-	int ret;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out_no_locked;
-	}
-
-	if (iter->update_path) {
-		bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
-		iter->update_path = 0;
-	}
-
-	while (1) {
-		k = __bch2_btree_iter_peek(trans, iter, search_key);
-		if (unlikely(!k.k))
-			goto end;
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		if (iter->flags & BTREE_ITER_filter_snapshots) {
-			/*
-			 * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than requested we might be
-			 * seeing keys for a different snapshot tree that will all be
-			 * filtered out.
-			 *
-			 * But we can't do the full check here, because bkey_start_pos()
-			 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
-			 * that's what we check against in extents mode:
-			 */
-			if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-				     ? bkey_gt(k.k->p, end)
-				     : k.k->p.inode > end.inode))
-				goto end;
-
-			if (iter->update_path &&
-			    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
-				bch2_path_put(trans, iter->update_path,
-					      iter->flags & BTREE_ITER_intent);
-				iter->update_path = 0;
-			}
-
-			if ((iter->flags & BTREE_ITER_intent) &&
-			    !(iter->flags & BTREE_ITER_is_extents) &&
-			    !iter->update_path) {
-				struct bpos pos = k.k->p;
-
-				if (pos.snapshot < iter->snapshot) {
-					search_key = bpos_successor(k.k->p);
-					continue;
-				}
-
-				pos.snapshot = iter->snapshot;
-
-				/*
-				 * advance, same as on exit for iter->path, but only up
-				 * to snapshot
-				 */
-				__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
-				iter->update_path = iter->path;
-
-				iter->update_path = bch2_btree_path_set_pos(trans,
-							iter->update_path, pos,
-							iter->flags & BTREE_ITER_intent,
-							_THIS_IP_);
-				ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
-				if (unlikely(ret)) {
-					k = bkey_s_c_err(ret);
-					goto out_no_locked;
-				}
-			}
-
-			/*
-			 * We can never have a key in a leaf node at POS_MAX, so
-			 * we don't have to check these successor() calls:
-			 */
-			if (!bch2_snapshot_is_ancestor(trans->c,
-						       iter->snapshot,
-						       k.k->p.snapshot)) {
-				search_key = bpos_successor(k.k->p);
-				continue;
-			}
-
-			if (bkey_whiteout(k.k) &&
-			    !(iter->flags & BTREE_ITER_key_cache_fill)) {
-				search_key = bkey_successor(iter, k.k->p);
-				continue;
-			}
-		}
-
-		/*
-		 * iter->pos should be monotonically increasing, and always be
-		 * equal to the key we just returned - except extents can
-		 * straddle iter->pos:
-		 */
-		if (!(iter->flags & BTREE_ITER_is_extents))
-			iter_pos = k.k->p;
-		else
-			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
-		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(iter_pos, end) :
-			     iter->flags & BTREE_ITER_is_extents	? bkey_ge(iter_pos, end) :
-								  bkey_gt(iter_pos, end)))
-			goto end;
-
-		break;
-	}
-
-	iter->pos = iter_pos;
-
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
-				iter->flags & BTREE_ITER_intent,
-				btree_iter_ip_allocated(iter));
-
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
-	if (iter->update_path) {
-		ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
-		if (unlikely(ret))
-			k = bkey_s_c_err(ret);
-		else
-			btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
-	}
-
-	if (!(iter->flags & BTREE_ITER_all_snapshots))
-		iter->pos.snapshot = iter->snapshot;
-
-	ret = bch2_btree_iter_verify_ret(trans, iter, k);
-	if (unlikely(ret)) {
-		bch2_btree_iter_set_pos(trans, iter, iter->pos);
-		k = bkey_s_c_err(ret);
-	}
-
-	bch2_btree_iter_verify_entry_exit(iter);
-
-	if (trace_btree_iter_peek_max_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace_btree_iter_peek_max(trans->c, buf.buf);
-	}
-
-	return k;
-end:
-	bch2_btree_iter_set_pos(trans, iter, end);
-	k = bkey_s_c_null;
-	goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
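- * bkey_s_c_null is returned if the iterator cannot be advanced any further.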
- */ -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek(trans, iter); -} - -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) -{ - struct bkey_s_c k, k2; - - bch2_btree_iter_verify(trans, iter); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); - - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); - k = bkey_s_c_err(ret); - break; - } - - struct btree_path *path = btree_iter_path(trans, iter); - struct btree_path_level *l = path_l(path); - - if (unlikely(!l->b)) { - /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); - k = bkey_s_c_null; - break; - } - - btree_path_set_should_be_locked(trans, path); - - k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (!k.k || bpos_gt(k.k->p, search_key)) { - k = btree_path_level_prev(trans, path, l, &iter->k); - - BUG_ON(k.k && bpos_gt(k.k->p, search_key)); - } - - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { - k = k2; - if (bkey_err(k2)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); - break; - } - } - - if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_prev_journal(trans, iter, search_key, &k); - - if (unlikely((iter->flags & BTREE_ITER_with_updates) && - trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); - - if (likely(k.k && !bkey_deleted(k.k))) { - break; - } else if (k.k) { - search_key = bpos_predecessor(k.k->p); - } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ - bch2_btree_iter_set_pos(trans, iter, POS_MIN); - k = bkey_s_c_null; - break; - } - } - - bch2_btree_iter_verify(trans, iter); - return k; -} - -/** - * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to - * iterator's current position - * @trans: btree transaction object - * @iter: iterator to peek from - * @end: search limit: returns keys greater than or equal to @end - * - * Returns: key if found, or an error extractable with bkey_err(). 
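- * bkey_s_c_null is returned if no key is found before @end.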
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
-					      struct bpos end)
-{
-	if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
-	    !bkey_eq(iter->pos, POS_MAX) &&
-	    !((iter->flags & BTREE_ITER_is_extents) &&
-	      iter->pos.offset == U64_MAX)) {
-
-		/*
-		 * bkey_start_pos(), for extents, is not monotonically
-		 * increasing until after filtering for snapshots:
-		 *
-		 * Thus, for extents we need to search forward until we find a
-		 * real visible extent - easiest to just use peek_slot() (which
-		 * internally uses peek() for extents)
-		 */
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-		if (bkey_err(k))
-			return k;
-
-		if (!bkey_deleted(k.k) &&
-		    (!(iter->flags & BTREE_ITER_is_extents) ||
-		     bkey_lt(bkey_start_pos(k.k), iter->pos)))
-			return k;
-	}
-
-	struct bpos search_key = iter->pos;
-	struct bkey_s_c k;
-	btree_path_idx_t saved_path = 0;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
-
-	int ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out_no_locked;
-	}
-
-	while (1) {
-		k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
-		if (unlikely(!k.k))
-			goto end;
-		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
-
-		if (iter->flags & BTREE_ITER_filter_snapshots) {
-			struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
-			if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
-				/*
-				 * If we have a saved candidate, and we're past
-				 * the last possible snapshot overwrite, return
-				 * it:
-				 */
-				bch2_path_put(trans, iter->path,
-					      iter->flags & BTREE_ITER_intent);
-				iter->path = saved_path;
-				saved_path = 0;
-				k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-				break;
-			}
-
-			/*
-			 * We need to check against @end before FILTER_SNAPSHOTS because
-			 * if we get to a different inode than requested we might be
-			 * seeing keys for a different snapshot tree that will all be
-			 * filtered out.
-			 */
-			if (unlikely(bkey_lt(k.k->p, end)))
-				goto end;
-
-			if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
-				search_key = bpos_predecessor(k.k->p);
-				continue;
-			}
-
-			if (k.k->p.snapshot != iter->snapshot) {
-				/*
-				 * Have a key visible in iter->snapshot, but
-				 * might have overwrites: - save it and keep
-				 * searching. Unless it's a whiteout - then drop
-				 * our previous saved candidate:
-				 */
-				if (saved_path) {
-					bch2_path_put(trans, saved_path,
-						      iter->flags & BTREE_ITER_intent);
-					saved_path = 0;
-				}
-
-				if (!bkey_whiteout(k.k)) {
-					saved_path = btree_path_clone(trans, iter->path,
-								      iter->flags & BTREE_ITER_intent,
-								      _THIS_IP_);
-					trace_btree_path_save_pos(trans,
-								  trans->paths + iter->path,
-								  trans->paths + saved_path);
-				}
-
-				search_key = bpos_predecessor(k.k->p);
-				continue;
-			}
-
-			if (bkey_whiteout(k.k)) {
-				search_key = bkey_predecessor(iter, k.k->p);
-				search_key.snapshot = U32_MAX;
-				continue;
-			}
-		}
-
-		EBUG_ON(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(k.k->p, iter->pos) :
-			iter->flags & BTREE_ITER_is_extents	? bkey_ge(bkey_start_pos(k.k), iter->pos) :
-							  bkey_gt(k.k->p, iter->pos));
-
-		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_lt(k.k->p, end) :
-			     iter->flags & BTREE_ITER_is_extents	? bkey_le(k.k->p, end) :
-								  bkey_lt(k.k->p, end)))
-			goto end;
-
-		break;
-	}
-
-	/* Extents can straddle iter->pos: */
-	iter->pos = bpos_min(iter->pos, k.k->p);
-
-	if (iter->flags & BTREE_ITER_filter_snapshots)
-		iter->pos.snapshot = iter->snapshot;
-out_no_locked:
-	if (saved_path)
-		bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
-	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(trans, iter);
-
-	if (trace_btree_iter_peek_prev_min_enabled()) {
-		CLASS(printbuf, buf)();
-
-		int ret = bkey_err(k);
-		if (ret)
-			prt_str(&buf, bch2_err_str(ret));
-		else if (k.k)
-			bch2_bkey_val_to_text(&buf, trans->c, k);
-		else
-			prt_str(&buf, "(null)");
-		trace_btree_iter_peek_prev_min(trans->c, buf.buf);
-	}
-	return k;
-end:
-	bch2_btree_iter_set_pos(trans, iter, end);
-	k = bkey_s_c_null;
-	goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @trans:	btree transaction object
- * @iter:	iterator to peek from
- *
- * Returns:	key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
-	if (!bch2_btree_iter_rewind(trans, iter))
-		return bkey_s_c_null;
-
-	return bch2_btree_iter_peek_prev(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
-	struct bpos search_key;
-	struct bkey_s_c k;
-	int ret;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	bch2_btree_iter_verify(trans, iter);
-	bch2_btree_iter_verify_entry_exit(iter);
-	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out;
-	}
-
-	/* extents can't span inode numbers: */
-	if ((iter->flags & BTREE_ITER_is_extents) &&
-	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
-		if (iter->pos.inode == KEY_INODE_MAX) {
-			k = bkey_s_c_null;
-			goto out2;
-		}
-
-		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
-	}
-
-	search_key = btree_iter_search_key(iter);
-	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-					iter->flags & BTREE_ITER_intent,
-					btree_iter_ip_allocated(iter));
-
-	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-	if (unlikely(ret)) {
-		k = bkey_s_c_err(ret);
-		goto out;
-	}
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	if (unlikely(!btree_path_node(path, path->level))) {
-		k = bkey_s_c_null;
-		goto out2;
-	}
-
-	btree_path_set_should_be_locked(trans, path);
-
-	if ((iter->flags & BTREE_ITER_cached) ||
-	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
-		k = bkey_s_c_null;
-
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
-			     trans->nr_updates)) {
-			bch2_btree_trans_peek_slot_updates(trans, iter, &k);
-			if (k.k)
-				goto out;
-		}
-
-		if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
-		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
-			goto out;
-
-		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
-			if (!bkey_err(k))
-				iter->k = *k.k;
-			/* We're not returning a key from iter->path: */
-			goto out;
-		}
-
-		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
-		if (unlikely(!k.k))
-			goto out;
-
-		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
-			     (iter->flags & BTREE_ITER_filter_snapshots) &&
-			     !(iter->flags & BTREE_ITER_key_cache_fill)))
- iter->k.type = KEY_TYPE_deleted; - } else { - struct bpos next; - struct bpos end = iter->pos; - - if (iter->flags & BTREE_ITER_is_extents) - end.offset = U64_MAX; - - EBUG_ON(btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_intent) { - struct btree_iter iter2; - - bch2_trans_copy_iter(trans, &iter2, iter); - k = bch2_btree_iter_peek_max(trans, &iter2, end); - - if (k.k && !bkey_err(k)) { - swap(iter->key_cache_path, iter2.key_cache_path); - iter->k = iter2.k; - k.k = &iter->k; - } - bch2_trans_iter_exit(trans, &iter2); - } else { - struct bpos pos = iter->pos; - - k = bch2_btree_iter_peek_max(trans, iter, end); - if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(trans, iter, pos); - else - iter->pos = pos; - } - - if (unlikely(bkey_err(k))) - goto out; - - next = k.k ? bkey_start_pos(k.k) : POS_MAX; - - if (bkey_lt(iter->pos, next)) { - bkey_init(&iter->k); - iter->k.p = iter->pos; - - if (iter->flags & BTREE_ITER_is_extents) { - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next.inode == iter->pos.inode - ? next.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); - EBUG_ON(!iter->k.size); - } - - k = (struct bkey_s_c) { &iter->k, NULL }; - } - } -out: - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - ret = bch2_btree_iter_verify_ret(trans, iter, k); - if (unlikely(ret)) - k = bkey_s_c_err(ret); -out2: - if (trace_btree_iter_peek_slot_enabled()) { - CLASS(printbuf, buf)(); - - int ret = bkey_err(k); - if (ret) - prt_str(&buf, bch2_err_str(ret)); - else if (k.k) - bch2_bkey_val_to_text(&buf, trans->c, k); - else - prt_str(&buf, "(null)"); - trace_btree_iter_peek_slot(trans->c, buf.buf); - } - - return k; -} - -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) -{ - if (!bch2_btree_iter_rewind(trans, iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -/* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -/* new transactional stuff: */ - -#ifdef CONFIG_BCACHEFS_DEBUG -static void btree_trans_verify_sorted_refs(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - - trans_for_each_path(trans, path, i) { - BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != i); - } - - for (i = 0; i < trans->nr_sorted; i++) { - unsigned idx = trans->sorted[i]; - - BUG_ON(!test_bit(idx, trans->paths_allocated)); - BUG_ON(trans->paths[idx].sorted_idx != i); - } -} - -static void btree_trans_verify_sorted(struct btree_trans *trans) -{ - struct btree_path *path, *prev = NULL; - struct trans_for_each_path_inorder_iter iter; - - if (!static_branch_unlikely(&bch2_debug_check_iterators)) - return; - - trans_for_each_path_inorder(trans, path, iter) { - if (prev && btree_path_cmp(prev, path) > 0) { - __bch2_dump_trans_paths_updates(trans, true); - panic("trans 
paths out of order!\n"); - } - prev = path; - } -} -#else -static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} -static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} -#endif - -void __bch2_btree_trans_sort_paths(struct btree_trans *trans) -{ - int i, l = 0, r = trans->nr_sorted, inc = 1; - bool swapped; - - btree_trans_verify_sorted_refs(trans); - - if (trans->paths_sorted) - goto out; - - /* - * Cocktail shaker sort: this is efficient because iterators will be - * mostly sorted. - */ - do { - swapped = false; - - for (i = inc > 0 ? l : r - 2; - i + 1 < r && i >= l; - i += inc) { - if (btree_path_cmp(trans->paths + trans->sorted[i], - trans->paths + trans->sorted[i + 1]) > 0) { - swap(trans->sorted[i], trans->sorted[i + 1]); - trans->paths[trans->sorted[i]].sorted_idx = i; - trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; - swapped = true; - } - } - - if (inc > 0) - --r; - else - l++; - inc = -inc; - } while (swapped); - - trans->paths_sorted = true; -out: - btree_trans_verify_sorted(trans); -} - -static inline void btree_path_list_remove(struct btree_trans *trans, - struct btree_path *path) -{ - EBUG_ON(path->sorted_idx >= trans->nr_sorted); -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - trans->nr_sorted--; - memmove_u64s_down_small(trans->sorted + path->sorted_idx, - trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, - sizeof(u64) / sizeof(btree_path_idx_t))); -#else - array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); -#endif - for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) - trans->paths[trans->sorted[i]].sorted_idx = i; -} - -static inline void btree_path_list_add(struct btree_trans *trans, - btree_path_idx_t pos, - btree_path_idx_t path_idx) -{ - struct btree_path *path = trans->paths + path_idx; - - path->sorted_idx = pos ? 
trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; - -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, - trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, - sizeof(u64) / sizeof(btree_path_idx_t))); - trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path_idx; -#else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); -#endif - - for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) - trans->paths[trans->sorted[i]].sorted_idx = i; - - btree_trans_verify_sorted_refs(trans); -} - -void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) -{ - if (iter->update_path) - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - if (iter->path) - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_intent); - if (iter->key_cache_path) - bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_intent); - iter->path = 0; - iter->update_path = 0; - iter->key_cache_path = 0; -} - -void bch2_trans_iter_init_outlined(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, struct bpos pos, - unsigned flags) -{ - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, 0, flags), - _RET_IP_); -} - -void bch2_trans_node_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_id btree_id, - struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) -{ - flags |= BTREE_ITER_not_extents; - flags |= BTREE_ITER_snapshot_field; - flags |= BTREE_ITER_all_snapshots; - - if (!depth && btree_id_cached(trans->c, btree_id)) - flags |= BTREE_ITER_with_key_cache; - - bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - bch2_btree_iter_flags(trans, btree_id, depth, flags), - _RET_IP_); - - iter->min_depth = depth; - - struct btree_path *path = btree_iter_path(trans, iter); - BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(path->level != depth); - BUG_ON(iter->min_depth != depth); -} - -void bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *dst, struct btree_iter *src) -{ - *dst = *src; -#ifdef TRACK_PATH_ALLOCATED - dst->ip_allocated = _RET_IP_; -#endif - if (src->path) - __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent); - if (src->update_path) - __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent); - dst->key_cache_path = 0; -} - -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, - darray_trans_kmalloc_trace *trace) -{ - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 60); - - darray_for_each(*trace, i) - prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); -} -#endif - -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) -{ - struct bch_fs *c = trans->c; - unsigned new_top = trans->mem_top + size; - unsigned old_bytes = trans->mem_bytes; - unsigned new_bytes = roundup_pow_of_two(new_top); - int ret; - void *new_mem; - void *p; - - if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", - BTREE_TRANS_MEM_MAX); - - bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); - bch2_print_str(c, 
KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-#endif
-	}
-
-	ret = trans_maybe_inject_restart(trans, _RET_IP_);
-	if (ret)
-		return ERR_PTR(ret);
-
-	struct btree_transaction_stats *s = btree_trans_stats(trans);
-	if (new_bytes > s->max_mem) {
-		mutex_lock(&s->lock);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-		darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
-		s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
-						trans->trans_kmalloc_trace.nr);
-
-		memcpy(s->trans_kmalloc_trace.data,
-		       trans->trans_kmalloc_trace.data,
-		       sizeof(s->trans_kmalloc_trace.data[0]) *
-		       s->trans_kmalloc_trace.nr);
-#endif
-		s->max_mem = new_bytes;
-		mutex_unlock(&s->lock);
-	}
-
-	if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
-		EBUG_ON(trans->mem_bytes >= new_bytes);
-		return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
-	}
-
-	if (old_bytes) {
-		trans->realloc_bytes_required = new_bytes;
-		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
-		return ERR_PTR(btree_trans_restart_ip(trans,
-					BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
-	}
-
-	EBUG_ON(trans->mem);
-
-	new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-	if (unlikely(!new_mem)) {
-		bch2_trans_unlock(trans);
-
-		new_mem = kmalloc(new_bytes, GFP_KERNEL);
-		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
-			new_bytes = BTREE_TRANS_MEM_MAX;
-			trans->used_mempool = true;
-		}
-
-		EBUG_ON(!new_mem);
-
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
-
-		ret = bch2_trans_relock(trans);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	trans->mem = new_mem;
-	trans->mem_bytes = new_bytes;
-
-	p = trans->mem + trans->mem_top;
-	trans->mem_top += size;
-	memset(p, 0, size);
-	return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
-	WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-	     "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
-	     (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
-	if (trans->srcu_held) {
-		struct bch_fs *c = trans->c;
-		struct btree_path *path;
-		unsigned i;
-
-		trans_for_each_path(trans, path, i)
-			if (path->cached && !btree_node_locked(path, 0))
-				path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
-		check_srcu_held_too_long(trans);
-		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-		trans->srcu_held = false;
-	}
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
-	if (!trans->srcu_held) {
-		trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
-		trans->srcu_lock_time = jiffies;
-		trans->srcu_held = true;
-	}
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns: current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over nodes or updating nodes an attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs bch2_trans_begin() should be called and the transaction retried.
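 *
 * A minimal retry-loop sketch (hedged illustration, not part of this patch;
 * do_btree_work() is a hypothetical callback returning 0 or a bch2 error):
 *
 *	struct btree_trans *trans = bch2_trans_get(c);
 *	int ret;
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = do_btree_work(trans);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *
 *	bch2_trans_put(trans);
 *
 * In-tree callers normally use the lockrestart_do()/bch2_trans_do() wrappers
 * instead of open coding this loop.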
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
-	struct btree_path *path;
-	unsigned i;
-	u64 now;
-
-	bch2_trans_reset_updates(trans);
-
-	trans->restart_count++;
-	trans->mem_top = 0;
-
-	if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
-		EBUG_ON(!trans->mem || !trans->mem_bytes);
-		unsigned new_bytes = trans->realloc_bytes_required;
-		void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
-		if (unlikely(!new_mem)) {
-			bch2_trans_unlock(trans);
-			new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-
-			EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
-			if (!new_mem) {
-				new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-				new_bytes = BTREE_TRANS_MEM_MAX;
-				trans->used_mempool = true;
-				kfree(trans->mem);
-			}
-		}
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
-	}
-
-	trans_for_each_path(trans, path, i) {
-		path->should_be_locked = false;
-
-		/*
-		 * If the transaction wasn't restarted, we're presuming to be
-		 * doing something new: don't keep iterators except the ones
-		 * that are in use - except for the subvolumes btree:
-		 */
-		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
-			path->preserve = false;
-
-		/*
-		 * XXX: we probably shouldn't be doing this if the transaction
-		 * was restarted, but currently we still overflow transaction
-		 * iterators if we do that
-		 */
-		if (!path->ref && !path->preserve)
-			__bch2_path_free(trans, i);
-		else
-			path->preserve = false;
-	}
-
-	now = local_clock();
-
-	if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
-	    time_after64(now, trans->last_begin_time + 10))
-		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
-					 trans->last_begin_time, now);
-
-	if (!trans->restarted &&
-	    (need_resched() ||
-	     time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
-		bch2_trans_unlock(trans);
-		cond_resched();
-		now = local_clock();
-	}
-	trans->last_begin_time = now;
-
-	if (unlikely(trans->srcu_held &&
-		     time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-		bch2_trans_srcu_unlock(trans);
-
-	trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
-	if (trans->restarted) {
-		trans->restart_count_this_trans++;
-	} else {
-		trans->restart_count_this_trans = 0;
-	}
-#endif
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-	trans->trans_kmalloc_trace.nr = 0;
-#endif
-
-	trans_set_locked(trans, false);
-
-	if (trans->restarted) {
-		bch2_btree_path_traverse_all(trans);
-		trans->notrace_relock_fail = false;
-	}
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	return trans->restart_count;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
-	for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
-		if (!bch2_btree_transaction_fns[i] ||
-		    bch2_btree_transaction_fns[i] == fn) {
-			bch2_btree_transaction_fns[i] = fn;
-			return i;
-		}
-
-	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
-	return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
-	__acquires(&c->btree_trans_barrier)
-{
-	struct btree_trans *trans;
-
-	if (IS_ENABLED(__KERNEL__)) {
-		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
-		if (trans) {
-			memset(trans, 0, offsetof(struct btree_trans, list));
-			goto got_trans;
-		}
-	}
-
-	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
-	memset(trans, 0, sizeof(*trans));
-
-	seqmutex_lock(&c->btree_trans_lock);
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
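	/*
	 * Debug-only sanity check: scan the global transaction list and
	 * assert that this thread does not already own another *locked*
	 * btree_trans; an unlocked one may coexist, as the comment inside
	 * the loop below explains.
	 */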
- struct btree_trans *pos; - pid_t pid = current->pid; - - trans->locking_wait.task = current; - - list_for_each_entry(pos, &c->btree_trans_list, list) { - struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); - /* - * We'd much prefer to be stricter here and completely - * disallow multiple btree_trans in the same thread - - * but the data move path calls bch2_write when we - * already have a btree_trans initialized. - */ - BUG_ON(pos_task && - pid == pos_task->pid && - pos->locked); - } - } - - list_add(&trans->list, &c->btree_trans_list); - seqmutex_unlock(&c->btree_trans_lock); -got_trans: - trans->c = c; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - trans->nr_paths = ARRAY_SIZE(trans->_paths); - trans->paths_allocated = trans->_paths_allocated; - trans->sorted = trans->_sorted; - trans->paths = trans->_paths; - trans->updates = trans->_updates; - - *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; - - trans->paths_allocated[0] = 1; - - static struct lock_class_key lockdep_key; - lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0); - - if (fn_idx < BCH_TRANSACTIONS_NR) { - trans->fn = bch2_btree_transaction_fns[fn_idx]; - - struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; - - if (s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - if (likely(trans->mem)) - trans->mem_bytes = expected_mem_bytes; - } - - trans->nr_paths_max = s->nr_max_paths; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; - trans_set_locked(trans, false); - - closure_init_stack_release(&trans->ref); - return trans; -} - -#ifdef CONFIG_BCACHEFS_DEBUG - -static bool btree_paths_leaked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->ref) - return true; - return false; -} - -static void check_btree_paths_leaked(struct btree_trans *trans) -{ - if (btree_paths_leaked(trans)) { - struct bch_fs *c = trans->c; - struct btree_path *path; - unsigned i; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - prt_printf(&buf, "btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - - bch2_fs_emergency_read_only2(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -} -#else -static inline void check_btree_paths_leaked(struct btree_trans *trans) {} -#endif - -void bch2_trans_put(struct btree_trans *trans) - __releases(&c->btree_trans_barrier) -{ - struct bch_fs *c = trans->c; - - if (trans->restarted) - bch2_trans_in_restart_error(trans); - - bch2_trans_unlock(trans); - - trans_for_each_update(trans, i) - __btree_path_put(trans, trans->paths + i->path, true); - trans->nr_updates = 0; - - check_btree_paths_leaked(trans); - - if (trans->srcu_held) { - check_srcu_held_too_long(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - } - - if (unlikely(trans->journal_replay_not_finished)) - bch2_journal_keys_put(c); - - /* - * trans->ref protects trans->locking_wait.task, btree_paths array; used - * by cycle detector - */ - 
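	/*
	 * Wait for any outstanding gets on trans->ref (e.g. the lock cycle
	 * detector walking our paths from another thread) to drain before
	 * locking_wait.task is cleared and the paths array is torn down.
	 */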
closure_return_sync(&trans->ref); - trans->locking_wait.task = NULL; - -#ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); -#endif -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_exit(&trans->trans_kmalloc_trace); -#endif - - unsigned long *paths_allocated = trans->paths_allocated; - trans->paths_allocated = NULL; - trans->paths = NULL; - - if (paths_allocated != trans->_paths_allocated) - kvfree_rcu_mightsleep(paths_allocated); - - if (trans->used_mempool) - mempool_free(trans->mem, &c->btree_trans_mem_pool); - else - kfree(trans->mem); - - /* Userspace doesn't have a real percpu implementation: */ - if (IS_ENABLED(__KERNEL__)) - trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - - if (trans) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - - mempool_free(trans, &c->btree_trans_pool); - } -} - -bool bch2_current_has_btree_trans(struct bch_fs *c) -{ - seqmutex_lock(&c->btree_trans_lock); - struct btree_trans *trans; - bool ret = false; - list_for_each_entry(trans, &c->btree_trans_list, list) - if (trans->locking_wait.task == current && - trans->locked) { - ret = true; - break; - } - seqmutex_unlock(&c->btree_trans_lock); - return ret; -} - -static void __maybe_unused -bch2_btree_bkey_cached_common_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b) -{ - struct six_lock_count c = six_lock_counts(&b->lock); - pid_t pid; - - scoped_guard(rcu) { - struct task_struct *owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - } - - prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); - bch2_btree_id_to_text(out, b->btree_id); - prt_printf(out, " l=%u:", b->level); - bch2_bpos_to_text(out, btree_node_pos(b)); - - prt_printf(out, "\t locks %u:%u:%u held by pid %u", - c.n[0], c.n[1], c.n[2], pid); -} - -void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) -{ - struct btree_bkey_cached_common *b; - static char lock_types[] = { 'r', 'i', 'w' }; - struct task_struct *task = READ_ONCE(trans->locking_wait.task); - unsigned l, idx; - - /* before rcu_read_lock(): */ - bch2_printbuf_make_room(out, 4096); - - if (!out->nr_tabstops) { - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 32); - } - - prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); - - /* trans->paths is rcu protected vs. freeing */ - guard(rcu)(); - out->atomic++; - - struct btree_path *paths = rcu_dereference(trans->paths); - if (!paths) - goto out; - - unsigned long *paths_allocated = trans_paths_allocated(paths); - - trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { - struct btree_path *path = paths + idx; - if (!path->nodes_locked) - continue; - - prt_printf(out, " path %u %c ", - idx, - path->cached ? 
'c' : 'b'); - bch2_btree_id_to_text(out, path->btree_id); - prt_printf(out, " l=%u:", path->level); - bch2_bpos_to_text(out, path->pos); - prt_newline(out); - - for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(path, l) && - !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { - prt_printf(out, " %c l=%u ", - lock_types[btree_node_locked_type(path, l)], l); - bch2_btree_bkey_cached_common_to_text(out, b); - prt_newline(out); - } - } - } - - b = READ_ONCE(trans->locking); - if (b) { - prt_printf(out, " blocked for %lluus on\n", - div_u64(local_clock() - trans->locking_wait.start_time, 1000)); - prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); - bch2_btree_bkey_cached_common_to_text(out, b); - prt_newline(out); - } -out: - --out->atomic; -} - -void bch2_fs_btree_iter_exit(struct bch_fs *c) -{ - struct btree_transaction_stats *s; - struct btree_trans *trans; - int cpu; - - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) { - struct btree_trans *trans = - per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; - - if (trans) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - kfree(trans); - } - free_percpu(c->btree_trans_bufs); - - trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); - if (trans) - panic("%s leaked btree_trans\n", trans->fn); - - for (s = c->btree_transaction_stats; - s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) { -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_exit(&s->trans_kmalloc_trace); -#endif - kfree(s->max_paths_text); - bch2_time_stats_exit(&s->lock_hold_times); - } - - if (c->btree_trans_barrier_initialized) { - synchronize_srcu_expedited(&c->btree_trans_barrier); - cleanup_srcu_struct(&c->btree_trans_barrier); - } - mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_trans_pool); -} - -void bch2_fs_btree_iter_init_early(struct bch_fs *c) -{ - struct btree_transaction_stats *s; - - for (s = c->btree_transaction_stats; - s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) { - bch2_time_stats_init(&s->duration); - bch2_time_stats_init(&s->lock_hold_times); - mutex_init(&s->lock); - } - - INIT_LIST_HEAD(&c->btree_trans_list); - seqmutex_init(&c->btree_trans_lock); -} - -int bch2_fs_btree_iter_init(struct bch_fs *c) -{ - int ret; - - c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); - if (!c->btree_trans_bufs) - return -ENOMEM; - - ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, - sizeof(struct btree_trans)) ?: - mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, - BTREE_TRANS_MEM_MAX) ?: - init_srcu_struct(&c->btree_trans_barrier); - if (ret) - return ret; - - /* - * static annotation (hackily done) for lock ordering of reclaim vs. 
- * btree node locks: - */ -#ifdef CONFIG_LOCKDEP - fs_reclaim_acquire(GFP_KERNEL); - struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans, false); - bch2_trans_put(trans); - fs_reclaim_release(GFP_KERNEL); -#endif - - c->btree_trans_barrier_initialized = true; - return 0; - -} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h deleted file mode 100644 index 09dd3e52622e48..00000000000000 --- a/fs/bcachefs/btree_iter.h +++ /dev/null @@ -1,1010 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_ITER_H -#define _BCACHEFS_BTREE_ITER_H - -#include "bset.h" -#include "btree_types.h" -#include "trace.h" - -void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); -void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_paths_updates(struct btree_trans *); - -static inline int __bkey_err(const struct bkey *k) -{ - return PTR_ERR_OR_ZERO(k); -} - -#define bkey_err(_k) __bkey_err((_k).k) - -static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent) -{ - unsigned idx = path - trans->paths; - - EBUG_ON(idx >= trans->nr_paths); - EBUG_ON(!test_bit(idx, trans->paths_allocated)); - if (unlikely(path->ref == U8_MAX)) { - bch2_dump_trans_paths_updates(trans); - panic("path %u refcount overflow\n", idx); - } - - path->ref++; - path->intent_ref += intent; - trace_btree_path_get_ll(trans, path); -} - -static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) -{ - EBUG_ON(path - trans->paths >= trans->nr_paths); - EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); - EBUG_ON(!path->ref); - EBUG_ON(!path->intent_ref && intent); - - trace_btree_path_put_ll(trans, path); - path->intent_ref -= intent; - return --path->ref == 0; -} - -static inline void btree_path_set_dirty(struct btree_trans *trans, - struct btree_path *path, - enum btree_path_uptodate u) -{ - BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); - path->uptodate = max_t(unsigned, path->uptodate, u); -} - -static inline struct btree *btree_path_node(struct btree_path *path, - unsigned level) -{ - return level < BTREE_MAX_DEPTH ? 
path->l[level].b : NULL; -} - -static inline bool btree_node_lock_seq_matches(const struct btree_path *path, - const struct btree *b, unsigned level) -{ - return path->l[level].lock_seq == six_lock_seq(&b->c.lock); -} - -static inline struct btree *btree_node_parent(struct btree_path *path, - struct btree *b) -{ - return btree_path_node(path, b->c.level + 1); -} - -/* Iterate over paths within a transaction: */ - -void __bch2_btree_trans_sort_paths(struct btree_trans *); - -static inline void btree_trans_sort_paths(struct btree_trans *trans) -{ - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - trans->paths_sorted) - return; - __bch2_btree_trans_sort_paths(trans); -} - -static inline unsigned long *trans_paths_nr(struct btree_path *paths) -{ - return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; -} - -static inline unsigned long *trans_paths_allocated(struct btree_path *paths) -{ - unsigned long *v = trans_paths_nr(paths); - return v - BITS_TO_LONGS(*v); -} - -#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ - for (_idx = _start; \ - (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ - _idx++) - -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned *idx) -{ - unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; - /* - * Open coded find_next_bit(), because - * - this is fast path, we can't afford the function call - * - and we know that nr_paths is a multiple of BITS_PER_LONG, - */ - while (*idx < trans->nr_paths) { - unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); - if (v) { - *idx += __ffs(v); - return trans->paths + *idx; - } - - *idx += BITS_PER_LONG; - *idx &= ~(BITS_PER_LONG - 1); - w++; - } - - return NULL; -} - -/* - * This version is intended to be safe for use on a btree_trans that is owned by - * another thread, for bch2_btree_trans_to_text(); - */ -#define trans_for_each_path_from(_trans, _path, _idx, _start) \ - for (_idx = _start; \ - (_path = __trans_next_path((_trans), &_idx)); \ - _idx++) - -#define trans_for_each_path(_trans, _path, _idx) \ - trans_for_each_path_from(_trans, _path, _idx, 1) - -static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) -{ - unsigned idx = path ? path->sorted_idx + 1 : 0; - - EBUG_ON(idx > trans->nr_sorted); - - return idx < trans->nr_sorted - ? trans->paths + trans->sorted[idx] - : NULL; -} - -static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) -{ - unsigned idx = path ? path->sorted_idx : trans->nr_sorted; - - return idx - ? 
trans->paths + trans->sorted[idx - 1] - : NULL; -} - -#define trans_for_each_path_idx_inorder(_trans, _iter) \ - for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ - (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ - _iter.sorted_idx < (_trans)->nr_sorted); \ - _iter.sorted_idx++) - -struct trans_for_each_path_inorder_iter { - btree_path_idx_t sorted_idx; - btree_path_idx_t path_idx; -}; - -#define trans_for_each_path_inorder(_trans, _path, _iter) \ - for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ - (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ - _path = (_trans)->paths + _iter.path_idx, \ - _iter.sorted_idx < (_trans)->nr_sorted); \ - _iter.sorted_idx++) - -#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ - for (_i = trans->nr_sorted - 1; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ - --_i) - -static inline bool __path_has_node(const struct btree_path *path, - const struct btree *b) -{ - return path->l[b->c.level].b == b && - btree_node_lock_seq_matches(path, b, b->c.level); -} - -static inline struct btree_path * -__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, - unsigned *idx) -{ - struct btree_path *path; - - while ((path = __trans_next_path(trans, idx)) && - !__path_has_node(path, b)) - (*idx)++; - - return path; -} - -#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ - for (_iter = 1; \ - (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ - _iter++) - -btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, - bool, unsigned long); - -static inline btree_path_idx_t __must_check -bch2_btree_path_make_mut(struct btree_trans *trans, - btree_path_idx_t path, bool intent, - unsigned long ip) -{ - if (trans->paths[path].ref > 1 || - trans->paths[path].preserve) - path = __bch2_btree_path_make_mut(trans, path, intent, ip); - trans->paths[path].should_be_locked = false; - return path; -} - -btree_path_idx_t __must_check -__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, - struct bpos, bool, unsigned long); - -static inline btree_path_idx_t __must_check -bch2_btree_path_set_pos(struct btree_trans *trans, - btree_path_idx_t path, struct bpos new_pos, - bool intent, unsigned long ip) -{ - return !bpos_eq(new_pos, trans->paths[path].pos) - ? 
__bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) - : path; -} - -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, - btree_path_idx_t, - unsigned, unsigned long); - -static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); - -static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - btree_path_idx_t path, unsigned flags) -{ - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); -} - -btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, - unsigned, unsigned, unsigned, unsigned long); -btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, - unsigned, struct bpos); - -struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); - -/* - * bch2_btree_path_peek_slot() for a cached iterator might return a key in a - * different snapshot: - */ -static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -{ - struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); - - if (k.k && bpos_eq(path->pos, k.k->p)) - return k; - - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, - struct btree_iter *, struct bpos); - -void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); - -int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); - -static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) -{ - return mutex_trylock(lock) - ? 0 - : __bch2_trans_mutex_lock(trans, lock); -} - -/* Debug: */ - -void __bch2_trans_verify_paths(struct btree_trans *); -void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); - -static inline void bch2_trans_verify_paths(struct btree_trans *trans) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_trans_verify_paths(trans); -} - -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, - struct bpos pos) -{ - if (static_branch_unlikely(&bch2_debug_check_iterators)) - __bch2_assert_pos_locked(trans, btree, pos); -} - -void bch2_btree_path_fix_key_modified(struct btree_trans *trans, - struct btree *, struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, - struct btree *, struct btree_node_iter *, - struct bkey_packed *, unsigned, unsigned); - -int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); - -void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); - -int bch2_trans_relock(struct btree_trans *); -int bch2_trans_relock_notrace(struct btree_trans *); -void bch2_trans_unlock(struct btree_trans *); -void bch2_trans_unlock_long(struct btree_trans *); - -static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) -{ - return restart_count != trans->restart_count - ? 
-BCH_ERR_transaction_restart_nested - : 0; -} - -void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); - -static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, - u32 restart_count) -{ - if (trans_was_restarted(trans, restart_count)) - bch2_trans_restart_error(trans, restart_count); -} - -void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); - -static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) -{ - if (trans->restarted || !trans->locked) - bch2_trans_unlocked_or_in_restart_error(trans); -} - -__always_inline -static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) -{ - BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); - - trans->restarted = err; - trans->last_restarted_ip = ip; - return -err; -} - -__always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) -{ - btree_trans_restart_foreign_task(trans, err, ip); -#ifdef CONFIG_BCACHEFS_DEBUG - darray_exit(&trans->last_restarted_trace); - bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); -#endif - return -err; -} - -__always_inline -static int btree_trans_restart(struct btree_trans *trans, int err) -{ - return btree_trans_restart_ip(trans, err, _THIS_IP_); -} - -static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) -{ -#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS - if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { - trace_and_count(trans->c, trans_restart_injected, trans, ip); - return btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_fault_inject, ip); - } -#endif - return 0; -} - -bool bch2_btree_node_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); - -static inline void bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned new_locks_want = path->level + !!path->intent_ref; - - if (path->locks_want > new_locks_want) - __bch2_btree_path_downgrade(trans, path, new_locks_want); -} - -void bch2_trans_downgrade(struct btree_trans *); - -void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); -void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); -void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); - -int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); - -struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); - -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); - -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, - struct btree_iter *iter) -{ - return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); -} - -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); - -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) -{ - 
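	/* Unbounded reverse peek: scans back as far as POS_MIN */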
return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); -} - -struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); - -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); - -bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); - -static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -{ - iter->k.type = KEY_TYPE_deleted; - iter->k.p.inode = iter->pos.inode = new_pos.inode; - iter->k.p.offset = iter->pos.offset = new_pos.offset; - iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; - iter->k.size = 0; -} - -static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, - struct btree_iter *iter, struct bpos new_pos) -{ - if (unlikely(iter->update_path)) - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - - if (!(iter->flags & BTREE_ITER_all_snapshots)) - new_pos.snapshot = iter->snapshot; - - __bch2_btree_iter_set_pos(iter, new_pos); -} - -static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) -{ - BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); - iter->pos = bkey_start_pos(&iter->k); -} - -static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, - struct btree_iter *iter, u32 snapshot) -{ - struct bpos pos = iter->pos; - - iter->snapshot = snapshot; - pos.snapshot = snapshot; - bch2_btree_iter_set_pos(trans, iter, pos); -} - -void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); - -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned level, - unsigned flags) -{ - if (level || !btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && - btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_is_extents; - - if (!(flags & BTREE_ITER_snapshot_field) && - !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_all_snapshots; - - if (!(flags & BTREE_ITER_all_snapshots) && - btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_filter_snapshots; - - if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_with_journal; - - return flags; -} - -static inline void bch2_trans_iter_init_common(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) -{ - iter->update_path = 0; - iter->key_cache_path = 0; - iter->btree_id = btree_id; - iter->min_depth = 0; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k = POS_KEY(pos); - iter->journal_idx = 0; -#ifdef CONFIG_BCACHEFS_DEBUG - iter->ip_allocated = ip; -#endif - iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags, ip); -} - -void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, unsigned); - -static inline void bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) -{ - if 
(__builtin_constant_p(btree_id) && - __builtin_constant_p(flags)) - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, 0, flags), - _THIS_IP_); - else - bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); -} - -void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); - -void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); - -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE -void bch2_trans_kmalloc_trace_to_text(struct printbuf *, - darray_trans_kmalloc_trace *); -#endif - -void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); - -static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, - unsigned long ip) -{ -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_push(&trans->trans_kmalloc_trace, - ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); -#endif -} - -static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, - unsigned long ip) -{ - size = roundup(size, 8); - - bch2_trans_kmalloc_trace(trans, size, ip); - - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - return p; - } else { - return __bch2_trans_kmalloc(trans, size, ip); - } -} - -static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, - unsigned long ip) -{ - size = roundup(size, 8); - - bch2_trans_kmalloc_trace(trans, size, ip); - - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - memset(p, 0, size); - return p; - } else { - return __bch2_trans_kmalloc(trans, size, ip); - } -} - -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -{ - return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); -} - -static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) -{ - return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); -} - -static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type) -{ - struct bkey_s_c k; - - bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(trans, iter); - - if (!bkey_err(k) && type && k.k->type != type) - k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); - if (unlikely(bkey_err(k))) - bch2_trans_iter_exit(trans, iter); - return k; -} - -static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) -{ - return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); -} - -#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ - _btree_id, _pos, _flags, KEY_TYPE_##_type)) - -static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) -{ - unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); - 
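	/*
	 * Copy however much of the value fits in the destination, then zero
	 * any remaining bytes so that short on-disk values read back
	 * deterministically.
	 */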
memcpy(dst_v, src_k.v, b); - if (unlikely(b < dst_size)) - memset(dst_v + b, 0, dst_size - b); -} - -#define bkey_val_copy(_dst_v, _src_k) \ -do { \ - BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ - __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ -} while (0) - -static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, - unsigned val_size, void *val) -{ - struct btree_iter iter; - struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - int ret = bkey_err(k); - if (!ret) { - __bkey_val_copy(val, val_size, k); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ - __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(*_val), _val) - -void bch2_trans_srcu_unlock(struct btree_trans *); - -u32 bch2_trans_begin(struct btree_trans *); - -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _do) \ -({ \ - bch2_trans_begin((_trans)); \ - \ - struct btree_iter _iter; \ - bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - int _ret3 = 0; \ - do { \ - _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ - if (!_b) \ - break; \ - \ - PTR_ERR_OR_ZERO(_b) ?: (_do); \ - })) ?: \ - lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ - } while (!_ret3); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _do) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _do) - -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek_prev(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek(trans, iter); -} - -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(trans, iter, end); - - if (bkey_gt(iter->pos, end)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_slot(trans, iter); -} - -int __bch2_btree_trans_too_many_iters(struct btree_trans *); - -static inline int btree_trans_too_many_iters(struct btree_trans *trans) -{ - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) - return __bch2_btree_trans_too_many_iters(trans); - - return 0; -} - -/* - * goto instead of loop, so that when used inside for_each_btree_key2() - * break/continue work correctly - */ -#define lockrestart_do(_trans, _do) \ -({ \ - __label__ transaction_restart; \ - u32 _restart_count; \ - int _ret2; \ -transaction_restart: \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - \ - if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ - goto transaction_restart; \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - _ret2; \ -}) - -/* - * nested_lockrestart_do(), nested_commit_do(): - * - * These are like lockrestart_do() and commit_do(), with two differences: - * - * - We don't call bch2_trans_begin() unless we had a transaction restart - * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a - * transaction restart - */ -#define nested_lockrestart_do(_trans, _do) \ -({ \ - u32 _restart_count, _orig_restart_count; \ - int _ret2; \ - \ - _restart_count = _orig_restart_count = (_trans)->restart_count; \ - \ - while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ - _restart_count = bch2_trans_begin(_trans); \ - \ - if (!_ret2) \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ - _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ -}) - -#define for_each_btree_key_max_continue(_trans, _iter, \ - _end, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ - _end, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_max(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - bch2_trans_begin(trans); \ - \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ -}) - -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ - SPOS_MAX, _flags, _k, _do) - -#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ - (_flags)); \ - if 
(!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ - _start, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ - _start, _end, _iter_flags, _k, \ - _disk_res, _journal_seq, _commit_flags,\ - _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ - (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_commit_flags))) - -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, - struct btree_iter *); - -#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ - for (; \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ - SPOS_MAX, _flags, _k, _ret) - -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(_trans, &(_iter))) - -#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) - -/* - * This should not be used in a fastpath, without first trying _do in - * nonblocking mode - it will cause excessive transaction restarts and - * potentially livelocking: - */ -#define drop_locks_do(_trans, _do) \ -({ \ - bch2_trans_unlock(_trans); \ - (_do) ?: bch2_trans_relock(_trans); \ -}) - -#define allocate_dropping_locks_errcode(_trans, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - int _ret = _do; \ - \ - if (bch2_err_matches(_ret, ENOMEM)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, _do); \ - } \ - _ret; \ -}) - -#define allocate_dropping_locks(_trans, _ret, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - typeof(_do) _p = _do; \ - \ - _ret = 0; \ - if (unlikely(!_p)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ - } \ - _p; \ -}) - -struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); -void bch2_trans_put(struct btree_trans *); - -bool 
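/*
 * A hedged usage sketch for allocate_dropping_locks() above ("orig" is a
 * hypothetical key being duplicated; error paths elided). The macro runs
 * _do with _gfp = GFP_NOWAIT|__GFP_NOWARN first, and only on failure drops
 * btree locks, retries with GFP_KERNEL and relocks:
 *
 *	struct bkey_i *n;
 *	int ret = 0;
 *
 *	n = allocate_dropping_locks(trans, ret,
 *			kmalloc(bkey_bytes(&orig->k), _gfp));
 *	if (!ret && !n)
 *		ret = -ENOMEM;
 */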
bch2_current_has_btree_trans(struct bch_fs *); - -extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; -unsigned bch2_trans_get_fn_idx(const char *); - -#define bch2_trans_get(_c) \ -({ \ - static unsigned trans_fn_idx; \ - \ - if (unlikely(!trans_fn_idx)) \ - trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - __bch2_trans_get(_c, trans_fn_idx); \ -}) - -/* - * We don't use DEFINE_CLASS() because using a function for the constructor - * breaks bch2_trans_get()'s use of __func__ - */ -typedef struct btree_trans * class_btree_trans_t; -static inline void class_btree_trans_destructor(struct btree_trans **p) -{ - struct btree_trans *trans = *p; - bch2_trans_put(trans); -} - -#define class_btree_trans_constructor(_c) bch2_trans_get(_c) - -#define bch2_trans_run(_c, _do) \ -({ \ - CLASS(btree_trans, trans)(_c); \ - (_do); \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - -void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); - -void bch2_fs_btree_iter_exit(struct bch_fs *); -void bch2_fs_btree_iter_init_early(struct bch_fs *); -int bch2_fs_btree_iter_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c deleted file mode 100644 index ea839560a13639..00000000000000 --- a/fs/bcachefs/btree_journal_iter.c +++ /dev/null @@ -1,830 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_cache.h" -#include "btree_journal_iter.h" -#include "journal_io.h" - -#include - -/* - * For managing keys we read from the journal: until journal replay works normal - * btree lookups need to be able to find and return keys from the journal where - * they overwrite what's in the btree, so we have a special iterator and - * operations for the regular btree iter code to use: - */ - -static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) -{ - size_t gap_size = keys->size - keys->nr; - - BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); - - if (pos >= keys->gap) - pos -= gap_size; - return pos; -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->data + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -/* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); -search: - if (!*idx) - *idx = 
__bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->end; - else - *idx += 1; - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { - ret = k->k; - break; - } - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - rcu_read_unlock(); - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; - - BUG_ON(*idx > keys->nr); - - if (!keys->nr) - return NULL; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while (*idx < keys->nr && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - if (*idx == keys->nr) - --(*idx); - - struct bkey_i *ret = NULL; - rcu_read_lock(); /* for overwritten_ranges */ - - while (true) { - k = idx_to_key(keys, *idx); - if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - break; - - if (k->overwritten) { - if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start; - if (!*idx) - break; - --(*idx); - continue; - } - - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { - ret = k->k; - break; - } - - if (!*idx) - break; - --(*idx); - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - rcu_read_unlock(); - return ret; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iter_verify(struct journal_iter *iter) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct journal_keys *keys = iter->keys; - size_t gap_size = keys->size - keys->nr; - - BUG_ON(iter->idx >= keys->gap && - iter->idx < keys->gap + gap_size); - - if (iter->idx < keys->size) { - struct journal_key *k = keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - BUG_ON(cmp > 0); - } -#endif -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct journal_key *new_key = &keys->data[keys->gap - 1]; - struct journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, list) { - journal_iter_verify(iter); - if (iter->idx == gap_end && - new_key->btree_id == iter->btree_id && - new_key->level == iter->level) - iter->idx = keys->gap - 1; - journal_iter_verify(iter); - } -} - -static void journal_iters_move_gap(struct 
bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U64_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->data[idx]) == 0) { - if (keys->data[idx].allocated) - kfree(keys->data[idx].k); - keys->data[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - size_t old_gap = keys->gap; - - if (keys->nr == keys->size) { - journal_iters_move_gap(c, old_gap, keys->size); - old_gap = keys->size; - - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL); - if (!new_keys.data) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return bch_err_throw(c, ENOMEM_journal_key_insert); - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); - kvfree(keys->data); - keys->data = new_keys.data; - keys->nr = new_keys.nr; - keys->size = new_keys.size; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, old_gap, idx); - - move_gap(keys, idx); - - keys->nr++; - keys->data[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_key_insert); - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, n); - if (ret) - kfree(n); - return ret; -} - -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) -{ - struct bkey_i whiteout; - - bkey_init(&whiteout.k); - whiteout.k.p = pos; - - return bch2_journal_key_insert(c, id, level, &whiteout); -} - -bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &trans->c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (!trans->journal_replay_not_finished) - return false; - - return (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - bkey_deleted(&keys->data[idx].k->k)); -} - -static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) -{ - struct journal_key *k = keys->data + pos; - size_t idx = pos_to_idx(keys, pos); - - k->overwritten = true; - - struct journal_key *prev = idx > 0 ? 
keys->data + idx_to_pos(keys, idx - 1) : NULL; - struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; - - bool prev_overwritten = prev && prev->overwritten; - bool next_overwritten = next && next->overwritten; - - struct journal_key_range_overwritten *prev_range = - prev_overwritten ? prev->overwritten_range : NULL; - struct journal_key_range_overwritten *next_range = - next_overwritten ? next->overwritten_range : NULL; - - BUG_ON(prev_range && prev_range->end != idx); - BUG_ON(next_range && next_range->start != idx + 1); - - if (prev_range && next_range) { - prev_range->end = next_range->end; - - keys->data[pos].overwritten_range = prev_range; - for (size_t i = next_range->start; i < next_range->end; i++) { - struct journal_key *ip = keys->data + idx_to_pos(keys, i); - BUG_ON(ip->overwritten_range != next_range); - ip->overwritten_range = prev_range; - } - - kfree_rcu_mightsleep(next_range); - } else if (prev_range) { - prev_range->end++; - k->overwritten_range = prev_range; - if (next_overwritten) { - prev_range->end++; - next->overwritten_range = prev_range; - } - } else if (next_range) { - next_range->start--; - k->overwritten_range = next_range; - if (prev_overwritten) { - next_range->start--; - prev->overwritten_range = next_range; - } - } else if (prev_overwritten || next_overwritten) { - struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return; - - r->start = idx - (size_t) prev_overwritten; - r->end = idx + 1 + (size_t) next_overwritten; - - rcu_assign_pointer(k->overwritten_range, r); - if (prev_overwritten) - prev->overwritten_range = r; - if (next_overwritten) - next->overwritten_range = r; - } -} - -void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos pos) -{ - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, btree, level, pos); - - if (idx < keys->size && - keys->data[idx].btree_id == btree && - keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos) && - !keys->data[idx].overwritten) { - mutex_lock(&keys->overwrite_lock); - __bch2_journal_key_overwritten(keys, idx); - mutex_unlock(&keys->overwrite_lock); - } -} - -static void bch2_journal_iter_advance(struct journal_iter *iter) -{ - if (iter->idx < iter->keys->size) { - iter->idx++; - if (iter->idx == iter->keys->gap) - iter->idx += iter->keys->size - iter->keys->nr; - } -} - -static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -{ - journal_iter_verify(iter); - - guard(rcu)(); - while (iter->idx < iter->keys->size) { - struct journal_key *k = iter->keys->data + iter->idx; - - int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); - if (cmp < 0) - break; - BUG_ON(cmp); - - if (!k->overwritten) - return bkey_i_to_s_c(k->k); - - if (k->overwritten_range) - iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); - else - bch2_journal_iter_advance(iter); - } - - return bkey_s_c_null; -} - -static void bch2_journal_iter_exit(struct journal_iter *iter) -{ - list_del(&iter->list); -} - -static void bch2_journal_iter_init(struct bch_fs *c, - struct journal_iter *iter, - enum btree_id id, unsigned level, - struct bpos pos) -{ - iter->btree_id = id; - iter->level = level; - iter->keys = &c->journal_keys; - iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); - - journal_iter_verify(iter); -} - -static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter 
*iter) -{ - return bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); -} - -static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -{ - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -} - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -{ - if (bpos_eq(iter->pos, SPOS_MAX)) - iter->at_end = true; - else - iter->pos = bpos_successor(iter->pos); -} - -static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) -{ - struct btree_and_journal_iter iter = *_iter; - struct bch_fs *c = iter.trans->c; - unsigned level = iter.journal.level; - struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_started, &c->flags) - ? (level > 1 ? 0 : 2) - : (level > 1 ? 1 : 16); - - iter.prefetch = false; - iter.fail_if_too_many_whiteouts = true; - bch2_bkey_buf_init(&tmp); - - while (nr--) { - bch2_btree_and_journal_iter_advance(&iter); - struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); - if (!k.k) - break; - - bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); - } - - bch2_bkey_buf_exit(&tmp, c); -} - -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -{ - struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; - size_t iters = 0; - - if (iter->prefetch && iter->journal.level) - btree_and_journal_iter_prefetch(iter); -again: - if (iter->at_end) - return bkey_s_c_null; - - iters++; - - if (iters > 20 && iter->fail_if_too_many_whiteouts) - return bkey_s_c_null; - - while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_lt(btree_k.k->p, iter->pos)) - bch2_journal_iter_advance_btree(iter); - - if (iter->trans->journal_replay_not_finished) - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); - - ret = journal_k.k && - (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) - ? 
journal_k - : btree_k; - - if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) - ret = bkey_s_c_null; - - if (ret.k) { - iter->pos = ret.k->p; - if (bkey_deleted(ret.k)) { - bch2_btree_and_journal_iter_advance(iter); - goto again; - } - } else { - iter->pos = SPOS_MAX; - iter->at_end = true; - } - - return ret; -} - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -{ - bch2_journal_iter_exit(&iter->journal); -} - -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, - struct btree_and_journal_iter *iter, - struct btree *b, - struct btree_node_iter node_iter, - struct bpos pos) -{ - memset(iter, 0, sizeof(*iter)); - - iter->trans = trans; - iter->b = b; - iter->node_iter = node_iter; - iter->pos = b->data->min_key; - iter->at_end = false; - INIT_LIST_HEAD(&iter->journal.list); - - if (trans->journal_replay_not_finished) { - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); - } -} - -/* - * this version is used by btree_gc before filesystem has gone RW and - * multithreaded, so uses the journal_iters list: - */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, - struct btree_and_journal_iter *iter, - struct btree *b) -{ - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); -} - -/* sort and dedup all keys in the journal: */ - -/* - * When keys compare equal, oldest compares first: - */ -static int journal_sort_key_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - int rewind = l->rewind && r->rewind ? 
-1 : 1; - - return journal_key_cmp(l, r) ?: - ((cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset)) * rewind); -} - -void bch2_journal_keys_put(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - BUG_ON(atomic_read(&keys->ref) <= 0); - - if (!atomic_dec_and_test(&keys->ref)) - return; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->overwritten_range && - (i == &darray_last(*keys) || - i->overwritten_range != i[1].overwritten_range)) - kfree(i->overwritten_range); - - if (i->allocated) - kfree(i->k); - } - - kvfree(keys->data); - keys->data = NULL; - keys->nr = keys->gap = keys->size = 0; - - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), - journal_sort_key_cmp, NULL); - - cond_resched(); - - struct journal_key *dst = keys->data; - - darray_for_each(*keys, src) { - /* - * We don't accumulate accounting keys here because we have to - * compare each individual accounting key against the version in - * the btree during replay: - */ - if (src->k->k.type != KEY_TYPE_accounting && - src + 1 < &darray_top(*keys) && - !journal_key_cmp(src, src + 1)) - continue; - - *dst++ = *src; - } - - keys->nr = dst - keys->data; -} - -int bch2_journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct journal_keys *keys = &c->journal_keys; - size_t nr_read = 0; - - u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - cond_resched(); - - vstruct_for_each(&i->j, entry) { - bool rewind = !entry->level && - !btree_id_is_alloc(entry->btree_id) && - le64_to_cpu(i->j.seq) >= rewind_seq; - - if (entry->type != (rewind - ? 
BCH_JSET_ENTRY_overwrite - : BCH_JSET_ENTRY_btree_keys)) - continue; - - if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) - continue; - - jset_entry_for_each_key(entry, k) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .rewind = rewind, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - if (darray_push(keys, n)) { - __journal_keys_sort(keys); - - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return bch_err_throw(c, ENOMEM_journal_keys_sort); - } - - BUG_ON(darray_push(keys, n)); - } - - nr_read++; - } - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); - return 0; -} - -void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, - unsigned level_min, unsigned level_max, - struct bpos start, struct bpos end) -{ - struct journal_keys *keys = &c->journal_keys; - size_t dst = 0; - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) - if (!(i->btree_id == btree && - i->level >= level_min && - i->level <= level_max && - bpos_ge(i->k->k.p, start) && - bpos_le(i->k->k.p, end))) - keys->data[dst++] = *i; - keys->nr = keys->gap = dst; -} - -void bch2_journal_keys_dump(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - struct printbuf buf = PRINTBUF; - - pr_info("%zu keys:", keys->nr); - - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - printbuf_reset(&buf); - prt_printf(&buf, "btree="); - bch2_btree_id_to_text(&buf, i->btree_id); - prt_printf(&buf, " l=%u ", i->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s", buf.buf); - } - printbuf_exit(&buf); -} - -void bch2_fs_journal_keys_init(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - atomic_set(&keys->ref, 1); - keys->initial_ref_held = true; - mutex_init(&keys->overwrite_lock); -} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h deleted file mode 100644 index 2a3082919b8d3e..00000000000000 --- a/fs/bcachefs/btree_journal_iter.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H -#define _BCACHEFS_BTREE_JOURNAL_ITER_H - -#include "bkey.h" - -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; - -/* - * Iterate over keys in the btree, with keys from the journal overlaid on top: - */ - -struct btree_and_journal_iter { - struct btree_trans *trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; - bool prefetch; - bool fail_if_too_many_whiteouts; -}; - -static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, - unsigned l_level, - const struct journal_key *r) -{ - return -cmp_int(l_level, r->level) ?: - cmp_int(l_btree_id, r->btree_id); -} - -static inline int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: - bpos_cmp(l_pos, r->k->k.p); -} - -static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return 
__journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - -struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, - struct btree_and_journal_iter *); - -int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_insert(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_delete(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, - struct btree_and_journal_iter *, struct btree *, - struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, - struct btree_and_journal_iter *, struct btree *); - -void bch2_journal_keys_put(struct bch_fs *); - -static inline void bch2_journal_keys_put_initial(struct bch_fs *c) -{ - if (c->journal_keys.initial_ref_held) - bch2_journal_keys_put(c); - c->journal_keys.initial_ref_held = false; -} - -int bch2_journal_keys_sort(struct bch_fs *); - -void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, - unsigned, unsigned, - struct bpos, struct bpos); - -void bch2_journal_keys_dump(struct bch_fs *); - -void bch2_fs_journal_keys_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h deleted file mode 100644 index 86aacb254fb2dd..00000000000000 --- a/fs/bcachefs/btree_journal_iter_types.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H -#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H - -struct journal_key_range_overwritten { - size_t start, end; -}; - -struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated:1; - bool overwritten:1; - bool rewind:1; - struct journal_key_range_overwritten __rcu * - overwritten_range; - struct bkey_i *k; -}; - -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). 
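An illustrative, self-contained sketch of the gap-buffer technique the comment above describes; the names (gap_buf, gap_buf_move_gap, gap_buf_insert) are hypothetical and this is not code from the patch:

#include <stddef.h>
#include <string.h>

/* Empty slots live at [gap, gap + (size - nr)) instead of at the end: */
struct gap_buf {
	int	*data;
	size_t	nr;	/* live elements */
	size_t	size;	/* allocated capacity */
	size_t	gap;	/* logical index where the empty region sits */
};

/* Slide the empty region so it starts at @idx: */
static void gap_buf_move_gap(struct gap_buf *b, size_t idx)
{
	size_t gap_size = b->size - b->nr;

	if (idx < b->gap)
		memmove(b->data + idx + gap_size, b->data + idx,
			(b->gap - idx) * sizeof(*b->data));
	else if (idx > b->gap)
		memmove(b->data + b->gap, b->data + b->gap + gap_size,
			(idx - b->gap) * sizeof(*b->data));
	b->gap = idx;
}

/* Insert @v at logical index @idx; caller ensures nr < size: */
static void gap_buf_insert(struct gap_buf *b, size_t idx, int v)
{
	gap_buf_move_gap(b, idx);
	b->data[b->gap++] = v;
	b->nr++;
}

Ascending insertions keep landing where the gap already is: after the first gap_buf_move_gap() call each insert is a plain store, O(1). With the free space fixed at the end of the array, every insert into a sorted position would shift O(n) elements, O(n^2) total, which is the difference the deleted comment is pointing at.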
- */ - size_t gap; - atomic_t ref; - bool initial_ref_held; - struct mutex overwrite_lock; -}; - -#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c deleted file mode 100644 index d96188b92db236..00000000000000 --- a/fs/bcachefs/btree_key_cache.c +++ /dev/null @@ -1,880 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "trace.h" - -#include <linux/sched/mm.h> - -static inline bool btree_uses_pcpu_readers(enum btree_id id) -{ - return id == BTREE_ID_subvolumes; -} - -static struct kmem_cache *bch2_key_cache; - -static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct bkey_cached *ck = obj; - const struct bkey_cached_key *key = arg->key; - - return ck->key.btree_id != key->btree_id || - !bpos_eq(ck->key.pos, key->pos); -} - -static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, - .automatic_shrinking = true, -}; - -static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, - struct bkey_cached *ck, - enum btree_node_locked_type lock_held) -{ - path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - path->l[0].b = (void *) ck; - mark_btree_node_locked(trans, path, 0, lock_held); -} - -__flatten -inline struct bkey_cached * -bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -{ - struct bkey_cached_key key = { - .btree_id = btree_id, - .pos = pos, - }; - - return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, - bch2_btree_key_cache_params); -} - -static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -{ - if (!six_trylock_intent(&ck->c.lock)) - return false; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_intent(&ck->c.lock); - return false; - } - - if (!six_trylock_write(&ck->c.lock)) { - six_unlock_intent(&ck->c.lock); - return false; - } - - return true; -} - -static bool bkey_cached_evict(struct btree_key_cache *c, - struct bkey_cached *ck) -{ - bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params); - if (ret) { - memset(&ck->key, ~0, sizeof(ck->key)); - atomic_long_dec(&c->nr_keys); - } - - return ret; -} - -static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) -{ - struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); - struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); - - this_cpu_dec(*c->btree_key_cache.nr_pending); - kmem_cache_free(bch2_key_cache, ck); -} - -static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - - bool pcpu_readers = ck->c.lock.readers != NULL; - rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); - this_cpu_inc(*bc->nr_pending); -} - -static void bkey_cached_free(struct btree_trans *trans, - struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - /* - * we'll hit strange issues in the SRCU code if we aren't holding an
SRCU read lock... - */ - EBUG_ON(!trans->srcu_held); - - bkey_cached_free_noassert(bc, ck); -} - -static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) -{ - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; - - struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); - if (unlikely(!ck)) - return NULL; - ck->k = kmalloc(key_u64s * sizeof(u64), gfp); - if (unlikely(!ck->k)) { - kmem_cache_free(bch2_key_cache, ck); - return NULL; - } - ck->u64s = key_u64s; - return ck; -} - -static struct bkey_cached * -bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); - int ret; - - struct bkey_cached *ck = container_of_or_null( - rcu_pending_dequeue(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; - - ck = allocate_dropping_locks(trans, ret, - __bkey_cached_alloc(key_u64s, _gfp)); - if (ret) { - if (ck) - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - return ERR_PTR(ret); - } - - if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); - ck->c.cached = true; - goto lock; - } - - ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), - struct bkey_cached, rcu); - if (ck) - goto lock; -lock: - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, NULL); - return ck; -} - -static struct bkey_cached * -bkey_cached_reuse(struct btree_key_cache *c) -{ - - guard(rcu)(); - struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); - struct rhash_head *pos; - struct bkey_cached *ck; - - for (unsigned i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bkey_cached_lock_for_evict(ck)) { - if (bkey_cached_evict(c, ck)) - return ck; - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - } - return NULL; -} - -static int btree_key_cache_create(struct btree_trans *trans, - struct btree_path *path, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - - /* - * bch2_varint_decode can read past the end of the buffer by at - * most 7 bytes (it won't be used): - */ - unsigned key_u64s = k.k->u64s + 1; - - /* - * Allocate some extra space so that the transaction commit path is less - * likely to have to reallocate, since that requires a transaction - * restart: - */ - key_u64s = min(256U, (key_u64s * 3) / 2); - key_u64s = roundup_pow_of_two(key_u64s); - - struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); - int ret = PTR_ERR_OR_ZERO(ck); - if (ret) - return ret; - - if (unlikely(!ck)) { - ck = bkey_cached_reuse(bc); - if (unlikely(!ck)) { - bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(ck_path->btree_id)); - return bch_err_throw(c, ENOMEM_btree_key_cache_create); - } - } - - ck->c.level = 0; - ck->c.btree_id = ck_path->btree_id; - ck->key.btree_id = ck_path->btree_id; - ck->key.pos = ck_path->pos; - ck->flags = 1U << BKEY_CACHED_ACCESSED; - - if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); - if (unlikely(!new_k)) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s 
u64s %u", - bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); - } else if (ret) { - kfree(new_k); - goto err; - } - - kfree(ck->k); - ck->k = new_k; - ck->u64s = key_u64s; - } - - bkey_reassemble(ck->k, k); - - ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); - if (unlikely(ret)) - goto err; - - ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); - - bch2_btree_node_unlock_write(trans, path, path_l(path)->b); - - if (unlikely(ret)) /* raced with another fill? */ - goto err; - - atomic_long_inc(&bc->nr_keys); - six_unlock_write(&ck->c.lock); - - enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); - if (lock_want == SIX_LOCK_read) - six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); - ck_path->uptodate = BTREE_ITER_UPTODATE; - return 0; -err: - bkey_cached_free(trans, bc, ck); - mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); - - return ret; -} - -static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); -} - -static noinline int btree_key_cache_fill(struct btree_trans *trans, - btree_path_idx_t ck_path_idx, - unsigned flags) -{ - struct btree_path *ck_path = trans->paths + ck_path_idx; - - if (flags & BTREE_ITER_cached_nofill) { - ck_path->l[0].b = NULL; - return 0; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, - BTREE_ITER_intent| - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); - iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* Recheck after btree lookup, before allocating: */ - ck_path = trans->paths + ck_path_idx; - ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? 
-EEXIST : 0; - if (unlikely(ret)) - goto out; - - ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); - if (ret) - goto err; - - if (trace_key_cache_fill_enabled()) - do_trace_key_cache_fill(trans, ck_path, k); -out: - /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, - btree_path_idx_t path_idx) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck; - struct btree_path *path = trans->paths + path_idx; -retry: - ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); - if (!ck) - return -ENOENT; - - enum six_lock_type lock_want = __btree_lock_want(path, 0); - - int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); - if (ret) - return ret; - - if (ck->key.btree_id != path->btree_id || - !bpos_eq(ck->key.pos, path->pos)) { - six_unlock_type(&ck->c.lock, lock_want); - goto retry; - } - - if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) - set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; - return 0; -} - -int bch2_btree_path_traverse_cached(struct btree_trans *trans, - btree_path_idx_t path_idx, - unsigned flags) -{ - EBUG_ON(trans->paths[path_idx].level); - - int ret; - do { - ret = btree_path_traverse_cached_fast(trans, path_idx); - if (unlikely(ret == -ENOENT)) - ret = btree_key_cache_fill(trans, path_idx, flags); - } while (ret == -EEXIST); - - struct btree_path *path = trans->paths + path_idx; - - if (unlikely(ret)) { - path->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - btree_node_unlock(trans, path, 0); - path->l[0].b = ERR_PTR(ret); - } - } else { - BUG_ON(path->uptodate); - BUG_ON(!path->nodes_locked); - } - - return ret; -} - -static int btree_key_cache_flush_pos(struct btree_trans *trans, - struct bkey_cached_key key, - u64 journal_seq, - unsigned commit_flags, - bool evict) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_iter c_iter, b_iter; - struct bkey_cached *ck = NULL; - int ret; - - bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_slots| - BTREE_ITER_intent| - BTREE_ITER_all_snapshots); - bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - b_iter.flags &= ~BTREE_ITER_with_key_cache; - - ret = bch2_btree_iter_traverse(trans, &c_iter); - if (ret) - goto out; - - ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; - if (!ck) - goto out; - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (evict) - goto evict; - goto out; - } - - if (journal_seq && ck->journal.seq != journal_seq) - goto out; - - trans->journal_res.seq = ck->journal.seq; - - /* - * If we're at the end of the journal, we really want to free up space - * in the journal right away - we don't want to pin that old journal - * sequence number with a new btree node write, we want to re-journal - * the update - */ - if (ck->journal.seq == journal_last_seq(j)) - commit_flags |= BCH_WATERMARK_reclaim; - - if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_space_low, &c->journal.flags)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); - ret = bkey_err(btree_k); - if (ret) - goto err; - - /* * Check 
that we're not violating cache coherency rules: */ - BUG_ON(bkey_deleted(btree_k.k)); - - ret = bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_key_cache_reclaim| - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - commit_flags); -err: - bch2_fs_fatal_err_on(ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && - !bch2_journal_error(j), c, - "flushing key cache: %s", bch2_err_str(ret)); - if (ret) - goto out; - - bch2_journal_pin_drop(j, &ck->journal); - - struct btree_path *path = btree_iter_path(trans, &c_iter); - BUG_ON(!btree_node_locked(path, 0)); - - if (!evict) { - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - } - } else { - struct btree_path *path2; - unsigned i; -evict: - trans_for_each_path(trans, path2, i) - if (path2 != path) - __bch2_btree_path_unlock(trans, path2); - - bch2_btree_node_lock_write_nofail(trans, path, &ck->c); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - } - - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(trans, &c->btree_key_cache, ck); - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - } -out: - bch2_trans_iter_exit(trans, &b_iter); - bch2_trans_iter_exit(trans, &c_iter); - return ret; -} - -int bch2_btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_cached *ck = - container_of(pin, struct bkey_cached, journal); - struct bkey_cached_key key; - struct btree_trans *trans = bch2_trans_get(c); - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - int ret = 0; - - btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); - key = ck->key; - - if (ck->journal.seq != seq || - !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_read(&ck->c.lock); - goto unlock; - } - - if (ck->seq != seq) { - bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, - bch2_btree_key_cache_journal_flush); - six_unlock_read(&ck->c.lock); - goto unlock; - } - six_unlock_read(&ck->c.lock); - - ret = lockrestart_do(trans, - btree_key_cache_flush_pos(trans, key, seq, - BCH_TRANS_COMMIT_journal_reclaim, false)); -unlock: - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - bch2_trans_put(trans); - return ret; -} - -bool bch2_btree_insert_key_cached(struct btree_trans *trans, - unsigned flags, - struct btree_insert_entry *insert_entry) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; - struct bkey_i *insert = insert_entry->k; - bool kick_reclaim = false; - - BUG_ON(insert->k.u64s > ck->u64s); - - bkey_copy(ck->k, insert); - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_inc(&c->btree_key_cache.nr_dirty); - - if (bch2_nr_btree_keys_need_flush(c)) - kick_reclaim = true; - } - - /* - * To minimize lock contention, we only add the journal pin here and - * defer pin updates to the flush callback via ->seq. 
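A minimal model of the pin-versus-seq protocol described above, under stated assumptions: struct entry, pin_seq, seq, commit_update() and maybe_flush() are hypothetical stand-ins for bkey_cached, ck->journal.seq and ck->seq, not the actual bcachefs API, and locking is omitted:

#include <stdbool.h>
#include <stdint.h>

struct entry {
	uint64_t pin_seq;	/* journal seq the pin is currently held at */
	uint64_t seq;		/* seq of the newest journalled update */
	bool	 dirty;
};

/* Commit path: pin once when first dirtied, bump ->seq only on journalled commits: */
static void commit_update(struct entry *e, uint64_t journal_seq, bool journalled)
{
	if (journalled || !e->dirty)
		e->seq = journal_seq;
	if (!e->dirty) {
		e->dirty = true;
		e->pin_seq = journal_seq;
	}
}

/*
 * Reclaim callback for @seq: if the entry was re-dirtied at a newer seq,
 * slide the pin forward (like bch2_journal_pin_update()) instead of
 * flushing; only flush once the pin already covers the newest update:
 */
static bool maybe_flush(struct entry *e, uint64_t seq)
{
	if (!e->dirty || e->pin_seq != seq)
		return false;		/* stale callback, nothing to do */

	if (e->seq != seq) {
		e->pin_seq = e->seq;	/* re-arm, don't flush yet */
		return false;
	}

	e->dirty = false;		/* write back to the btree */
	return true;
}

The point of the split is that the commit path touches only its own entry, while the reconciliation between the pin and the latest seq happens lazily in the flush callback.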
Be careful not to - * update ->seq on nojournal commits because we don't want to update the - * pin to a seq that doesn't include journal updates on disk. Otherwise - * we risk losing the update after a crash. - * - * The only exception is if the pin is not active in the first place. We - * have to add the pin because journal reclaim drives key cache - * flushing. The flush callback will not proceed unless ->seq matches - * the latest pin, so make sure it starts with a consistent value. - */ - if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || - !journal_pin_active(&ck->journal)) { - ck->seq = trans->journal_res.seq; - } - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); - - if (kick_reclaim) - journal_reclaim_kick(&c->journal); - return true; -} - -void bch2_btree_key_cache_drop(struct btree_trans *trans, - struct btree_path *path) -{ - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = (void *) path->l[0].b; - - /* - * We just did an update to the btree, bypassing the key cache: the key - * cache key is now stale and must be dropped, even if dirty: - */ - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); - atomic_long_dec(&c->btree_key_cache.nr_dirty); - bch2_journal_pin_drop(&c->journal, &ck->journal); - } - - bkey_cached_evict(bc, ck); - bkey_cached_free(trans, bc, ck); - - mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); - - struct btree_path *path2; - unsigned i; - trans_for_each_path(trans, path2, i) - if (path2->l[0].b == (void *) ck) { - /* - * It's safe to clear should_be_locked here because - * we're evicting from the key cache, and we still have - * the underlying btree locked: filling into the key - * cache would require taking a write lock on the btree - * node - */ - path2->should_be_locked = false; - __bch2_btree_path_unlock(trans, path2); - path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); - } - - bch2_trans_verify_locks(trans); -} - -static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bucket_table *tbl; - struct bkey_cached *ck; - size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned iter, start; - int srcu_idx; - - srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - rcu_read_lock(); - - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - - /* - * Scanning is expensive while a rehash is in progress - most elements - * will be on the new hashtable, if it's in progress - * - * A rehash could still start while we're scanning - that's ok, we'll - * still see most elements. 
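The resumable scan described above behaves like a clock-hand sweep. This standalone sketch (hypothetical names, no rhashtable, SRCU or locking) mirrors the skip-dirty / second-chance / evict policy of the scan loop below:

#include <stdbool.h>
#include <stddef.h>

struct cache_entry {
	bool accessed;	/* set on lookup, cleared by the scan */
	bool dirty;	/* must be written back before eviction */
	bool live;
};

struct cache {
	struct cache_entry *slots;
	size_t		    nr_slots;	/* assumed > 0 */
	size_t		    scan_hand;	/* persists across scans, like shrink_iter */
};

static size_t cache_scan(struct cache *c, size_t nr_to_scan)
{
	size_t scanned = 0, freed = 0;
	size_t iter = c->scan_hand % c->nr_slots;
	size_t start = iter;

	do {
		struct cache_entry *e = &c->slots[iter];

		if (e->live) {
			if (e->dirty) {
				/* skip: writeback, not the shrinker, must handle it */
			} else if (e->accessed) {
				e->accessed = false;	/* second chance */
			} else {
				e->live = false;	/* evict */
				freed++;
			}
			scanned++;
		}

		iter = (iter + 1) % c->nr_slots;
	} while (scanned < nr_to_scan && iter != start);

	c->scan_hand = iter;	/* resume here on the next scan */
	return freed;
}

Persisting the hand is what makes repeated small scans eventually cover the whole table instead of hammering the same buckets, and the accessed bit keeps the policy roughly LRU without maintaining a list.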
- */ - if (unlikely(tbl->nest)) { - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - return SHRINK_STOP; - } - - iter = bc->shrink_iter; - if (iter >= tbl->size) - iter = 0; - start = iter; - - do { - struct rhash_head *pos, *next; - - pos = rht_ptr_rcu(&tbl->buckets[iter]); - - while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, iter); - ck = container_of(pos, struct bkey_cached, hash); - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - bc->skipped_dirty++; - } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { - clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - bc->skipped_accessed++; - } else if (!bkey_cached_lock_for_evict(ck)) { - bc->skipped_lock_fail++; - } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free_noassert(bc, ck); - bc->freed++; - freed++; - } else { - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); - } - - scanned++; - if (scanned >= nr) - goto out; - - pos = next; - } - - iter++; - if (iter >= tbl->size) - iter = 0; - } while (scanned < nr && iter != start); -out: - bc->shrink_iter = iter; - - rcu_read_unlock(); - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - return freed; -} - -static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - long nr = atomic_long_read(&bc->nr_keys) - - atomic_long_read(&bc->nr_dirty); - - /* - * Avoid hammering our shrinker too much if it's nearly empty - the - * shrinker code doesn't take into account how big our cache is, if it's - * mostly empty but the system is under memory pressure it causes nasty - * lock contention: - */ - nr -= 128; - - return max(0L, nr); -} - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - LIST_HEAD(items); - unsigned i; - - shrinker_free(bc->shrink); - - /* - * The loop is needed to guard against racing with rehash: - */ - while (atomic_long_read(&bc->nr_keys)) { - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) { - if (tbl->nest) { - /* wait for in progress rehash */ - rcu_read_unlock(); - mutex_lock(&bc->table.mutex); - mutex_unlock(&bc->table.mutex); - continue; - } - for (i = 0; i < tbl->size; i++) - while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { - ck = container_of(pos, struct bkey_cached, hash); - BUG_ON(!bkey_cached_evict(bc, ck)); - kfree(ck->k); - kmem_cache_free(bch2_key_cache, ck); - } - } - rcu_read_unlock(); - } - - if (atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_was_rw, &c->flags)) - panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", - atomic_long_read(&bc->nr_dirty)); - - if (atomic_long_read(&bc->nr_keys)) - panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", - atomic_long_read(&bc->nr_keys)); - - if (bc->table_init_done) - rhashtable_destroy(&bc->table); - - rcu_pending_exit(&bc->pending[0]); - rcu_pending_exit(&bc->pending[1]); - - free_percpu(bc->nr_pending); -} - -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -{ -} - -int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct shrinker *shrink; - - bc->nr_pending = alloc_percpu(size_t); - if (!bc->nr_pending) 
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || - rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - - bc->table_init_done = true; - - shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); - if (!shrink) - return bch_err_throw(c, ENOMEM_fs_btree_cache_init); - bc->shrink = shrink; - shrink->count_objects = bch2_btree_key_cache_count; - shrink->scan_objects = bch2_btree_key_cache_scan; - shrink->batch = 1 << 14; - shrink->seeks = 0; - shrink->private_data = c; - shrinker_register(shrink); - return 0; -} - -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) -{ - printbuf_tabstop_push(out, 24); - printbuf_tabstop_push(out, 12); - - prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); - prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); - prt_newline(out); - prt_printf(out, "shrinker:\n"); - prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); - prt_printf(out, "freed:\t%lu\r\n", bc->freed); - prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); - prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); - prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); - prt_newline(out); - prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); -} - -void bch2_btree_key_cache_exit(void) -{ - kmem_cache_destroy(bch2_key_cache); -} - -int __init bch2_btree_key_cache_init(void) -{ - bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); - if (!bch2_key_cache) - return -ENOMEM; - - return 0; -} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h deleted file mode 100644 index 82d8c72512a93a..00000000000000 --- a/fs/bcachefs/btree_key_cache.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -#define _BCACHEFS_BTREE_KEY_CACHE_H - -static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 1024 + nr_keys / 2; - - return max_t(ssize_t, 0, nr_dirty - max_dirty); -} - -static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 4096 + (nr_keys * 3) / 4; - - return nr_dirty - max_dirty; -} - -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) -{ - return __bch2_btree_key_cache_must_wait(c) > 0; -} - -static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = 2048 + (nr_keys * 5) / 8; - - return nr_dirty <= max_dirty; -} - -int bch2_btree_key_cache_journal_flush(struct journal *, - struct journal_entry_pin *, u64); - -struct bkey_cached * -bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); - -int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned); - -bool 
bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_insert_entry *); -void bch2_btree_key_cache_drop(struct btree_trans *, - struct btree_path *); - -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -int bch2_fs_btree_key_cache_init(struct btree_key_cache *); - -void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); - -void bch2_btree_key_cache_exit(void); -int __init bch2_btree_key_cache_init(void); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h deleted file mode 100644 index 722f1ed1055152..00000000000000 --- a/fs/bcachefs/btree_key_cache_types.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H - -#include "rcu_pending.h" - -struct btree_key_cache { - struct rhashtable table; - bool table_init_done; - - struct shrinker *shrink; - unsigned shrink_iter; - - /* 0: non pcpu reader locks, 1: pcpu reader locks */ - struct rcu_pending pending[2]; - size_t __percpu *nr_pending; - - atomic_long_t nr_keys; - atomic_long_t nr_dirty; - - /* shrinker stats */ - unsigned long requested_to_free; - unsigned long freed; - unsigned long skipped_dirty; - unsigned long skipped_accessed; - unsigned long skipped_lock_fail; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - -#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c deleted file mode 100644 index bed2b4b6ffb9e0..00000000000000 --- a/fs/bcachefs/btree_locking.c +++ /dev/null @@ -1,936 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_locking.h" -#include "btree_types.h" - -static struct lock_class_key bch2_btree_node_lock_key; - -void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags, - gfp_t gfp) -{ - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); - lockdep_set_notrack_class(&b->lock); -} - -/* Btree node locking: */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, - struct btree_path *skip, - struct btree_bkey_cached_common *b, - unsigned level) -{ - struct btree_path *path; - struct six_lock_count ret; - unsigned i; - - memset(&ret, 0, sizeof(ret)); - - if (IS_ERR_OR_NULL(b)) - return ret; - - trans_for_each_path(trans, path, i) - if (path != skip && &path->l[level].b->c == b) { - int t = btree_node_locked_type(path, level); - - if (t != BTREE_NODE_UNLOCKED) - ret.n[t]++; - } - - return ret; -} - -/* unlock */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -/* lock */ - -/* - * @trans wants to lock @b with type @type - */ -struct trans_waiting_for_lock { - struct btree_trans *trans; - struct btree_bkey_cached_common *node_want; - enum six_lock_type lock_want; - - /* for iterating over held locks: */ - u8 path_idx; - u8 level; - u64 lock_start_time; -}; - -struct lock_graph { - struct trans_waiting_for_lock g[8]; - unsigned nr; -}; - -static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); - - for (i = g->g; i < g->g +
g->nr; i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (!task) - continue; - - bch2_btree_trans_to_text(out, i->trans); - bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); - } -} - -static noinline void print_chain(struct printbuf *out, struct lock_graph *g) -{ - struct trans_waiting_for_lock *i; - - for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); - if (i != g->g) - prt_str(out, "<- "); - prt_printf(out, "%u ", task ? task->pid : 0); - } - prt_newline(out); -} - -static void lock_graph_up(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - -static noinline void lock_graph_pop_all(struct lock_graph *g) -{ - while (g->nr) - lock_graph_up(g); -} - -static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - while (g->g + g->nr > i) - lock_graph_up(g); -} - -static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - g->g[g->nr++] = (struct trans_waiting_for_lock) { - .trans = trans, - .node_want = trans->locking, - .lock_want = trans->locking_wait.lock_want, - }; -} - -static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -{ - closure_get(&trans->ref); - __lock_graph_down(g, trans); -} - -static bool lock_graph_remove_non_waiters(struct lock_graph *g, - struct trans_waiting_for_lock *from) -{ - struct trans_waiting_for_lock *i; - - if (from->trans->locking != from->node_want) { - lock_graph_pop_from(g, from); - return true; - } - - for (i = from + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { - lock_graph_pop_from(g, i); - return true; - } - - return false; -} - -static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - count_event(c, trans_restart_would_deadlock); - - if (trace_trans_restart_would_deadlock_enabled()) { - struct printbuf buf = PRINTBUF; - - buf.atomic++; - print_cycle(&buf, g); - - trace_trans_restart_would_deadlock(trans, buf.buf); - printbuf_exit(&buf); - } -} - -static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -{ - if (i == g->g) { - trace_would_deadlock(g, i->trans); - return btree_trans_restart_foreign_task(i->trans, - BCH_ERR_transaction_restart_would_deadlock, - _THIS_IP_); - } else { - i->trans->lock_must_abort = true; - wake_up_process(i->trans->locking_wait.task); - return 0; - } -} - -static int btree_trans_abort_preference(struct btree_trans *trans) -{ - if (trans->lock_may_not_fail) - return 0; - if (trans->locking_wait.lock_want == SIX_LOCK_write) - return 1; - if (!trans->in_traverse_all) - return 2; - return 3; -} - -static noinline __noreturn void break_cycle_fail(struct lock_graph *g) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); -} - -static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, - struct trans_waiting_for_lock *from) 
-{ - struct trans_waiting_for_lock *i, *abort = NULL; - unsigned best = 0, pref; - int ret; - - if (lock_graph_remove_non_waiters(g, from)) - return 0; - - /* Only checking, for debugfs: */ - if (cycle) { - print_cycle(cycle, g); - ret = -1; - goto out; - } - - for (i = from; i < g->g + g->nr; i++) { - pref = btree_trans_abort_preference(i->trans); - if (pref > best) { - abort = i; - best = pref; - } - } - - if (unlikely(!best)) - break_cycle_fail(g); - - ret = abort_lock(g, abort); -out: - if (ret) - lock_graph_pop_all(g); - else - lock_graph_pop_from(g, abort); - return ret; -} - -static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, - struct printbuf *cycle) -{ - struct btree_trans *orig_trans = g->g->trans; - - for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) { - closure_put(&trans->ref); - return break_cycle(g, cycle, i); - } - - if (unlikely(g->nr == ARRAY_SIZE(g->g))) { - closure_put(&trans->ref); - - if (orig_trans->lock_may_not_fail) - return 0; - - lock_graph_pop_all(g); - - if (cycle) - return 0; - - trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); - return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); - } - - __lock_graph_down(g, trans); - return 0; -} - -static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) -{ - return t1 + t2 > 1; -} - -int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) -{ - struct lock_graph g; - struct trans_waiting_for_lock *top; - struct btree_bkey_cached_common *b; - btree_path_idx_t path_idx; - int ret = 0; - - g.nr = 0; - - if (trans->lock_must_abort && !trans->lock_may_not_fail) { - if (cycle) - return -1; - - trace_would_deadlock(&g, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); - } - - lock_graph_down(&g, trans); - - /* trans->paths is rcu protected vs. freeing */ - guard(rcu)(); - if (cycle) - cycle->atomic++; -next: - if (!g.nr) - goto out; - - top = &g.g[g.nr - 1]; - - struct btree_path *paths = rcu_dereference(top->trans->paths); - if (!paths) - goto up; - - unsigned long *paths_allocated = trans_paths_allocated(paths); - - trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), - path_idx, top->path_idx) { - struct btree_path *path = paths + path_idx; - if (!path->nodes_locked) - continue; - - if (path_idx != top->path_idx) { - top->path_idx = path_idx; - top->level = 0; - top->lock_start_time = 0; - } - - for (; - top->level < BTREE_MAX_DEPTH; - top->level++, top->lock_start_time = 0) { - int lock_held = btree_node_locked_type(path, top->level); - - if (lock_held == BTREE_NODE_UNLOCKED) - continue; - - b = &READ_ONCE(path->l[top->level].b)->c; - - if (IS_ERR_OR_NULL(b)) { - /* - * If we get here, it means we raced with the - * other thread updating its btree_path - * structures - which means it can't be blocked - * waiting on a lock: - */ - if (!lock_graph_remove_non_waiters(&g, g.g)) { - /* - * If lock_graph_remove_non_waiters() - * didn't do anything, it must be - * because we're being called by debugfs - * checking for lock cycles, which - * invokes us on btree_transactions that - * aren't actually waiting on anything. 
- * Just bail out: - */ - lock_graph_pop_all(&g); - } - - goto next; - } - - if (list_empty_careful(&b->lock.wait_list)) - continue; - - raw_spin_lock(&b->lock.wait_lock); - list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { - BUG_ON(b != trans->locking); - - if (top->lock_start_time && - time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) - continue; - - top->lock_start_time = trans->locking_wait.start_time; - - /* Don't check for self deadlock: */ - if (trans == top->trans || - !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) - continue; - - closure_get(&trans->ref); - raw_spin_unlock(&b->lock.wait_lock); - - ret = lock_graph_descend(&g, trans, cycle); - if (ret) - goto out; - goto next; - - } - raw_spin_unlock(&b->lock.wait_lock); - } - } -up: - if (g.nr > 1 && cycle) - print_chain(cycle, &g); - lock_graph_up(&g); - goto next; -out: - if (cycle) - --cycle->atomic; - return ret; -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) -{ - struct btree_trans *trans = p; - - return bch2_check_for_deadlock(trans, NULL); -} - -int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; - int ret; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, - lock_may_not_fail, _RET_IP_); - six_lock_readers_add(&b->lock, readers); - - if (ret) - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); - - return ret; -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - BUG_ON(ret); -} - -/* relock */ - -static int btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f, - int restart_err) -{ - unsigned l = path->level; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - goto err; - - l++; - } while (l < path->locks_want); - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; -err: - if (f) { - f->l = l; - f->b = path->l[l].b; - } - - /* - * Do transaction restart before unlocking, so we don't pop - * should_be_locked asserts - */ - if (restart_err) { - btree_trans_restart(trans, restart_err); - } else if (path->should_be_locked && !trans->restarted) { - if (upgrade) - path->locks_want = l; - return -1; - } - - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - do { - path->l[l].b = upgrade - ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - } while (l--); - - return -restart_err ?: -1; -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level, - bool trace) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - if (trace && !trans->notrace_relock_fail) - trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); - return false; -} - -/* upgrade */ - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - case BTREE_NODE_WRITE_LOCKED: - BUG(); - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - return true; -} - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -int bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); - } - } - - return 0; -} - -__flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) -{ - bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); - bch2_trans_verify_locks(trans); - return ret; -} - -int __bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path)) { - trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - path->locks_want = new_locks_want; - - /* - * If we need it locked, we can't touch it. 
Otherwise, we can return - * success - bch2_path_get() will use this path, and it'll just be - * retraversed: - */ - bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || - !path->should_be_locked; - - bch2_btree_path_verify_locks(trans, path); - return ret; -} - -int __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned old_locks = path->nodes_locked; - unsigned old_locks_want = path->locks_want; - - path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); - - struct get_locks_fail f = {}; - int ret = btree_path_get_locks(trans, path, true, &f, - BCH_ERR_transaction_restart_upgrade); - if (!ret) - goto out; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). - */ - if (!path->cached && !trans->in_traverse_all) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path(trans, linked, i) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL, 0); - } - } - - count_event(trans->c, trans_restart_upgrade); - if (trace_trans_restart_upgrade_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); - prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, "locks want %u -> %u level %u\n", - old_locks_want, new_locks_want, f.l); - prt_printf(&buf, "nodes_locked %x -> %x\n", - old_locks, path->nodes_locked); - prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : - !f.b ? "(null)" : "(node)"); - prt_printf(&buf, "path seq %u node seq %u\n", - IS_ERR_OR_NULL(f.b) ? 
0 : f.b->c.lock.seq, - path->l[f.l].lock_seq); - - trace_trans_restart_upgrade(trans->c, buf.buf); - printbuf_exit(&buf); - } -out: - bch2_trans_verify_locks(trans); - return ret; -} - -void __bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l, old_locks_want = path->locks_want; - - if (trans->restarted) - return; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = btree_path_highest_level_locked(path)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(trans, path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); - } - break; - } - } - - bch2_btree_path_verify_locks(trans, path); - - trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); -} - -/* Btree transaction locking: */ - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - if (trans->restarted) - return; - - trans_for_each_path(trans, path, i) - if (path->ref) - bch2_btree_path_downgrade(trans, path); -} - -static inline void __bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); -} - -static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace, ulong ip) -{ - if (!trace) - goto out; - - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " %s l=%u seq=%u node seq=", - bch2_btree_id_str(path->btree_id), - f->l, path->l[f->l].lock_seq); - if (IS_ERR_OR_NULL(f->b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); - } else { - prt_printf(&buf, "%u", f->b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f->b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } - - trace_trans_restart_relock(trans, ip, buf.buf); - printbuf_exit(&buf); - } - - count_event(trans->c, trans_restart_relock); -out: - __bch2_trans_unlock(trans); - bch2_trans_verify_locks(trans); -} - -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) -{ - bch2_trans_verify_locks(trans); - - if (unlikely(trans->restarted)) - return -((int) trans->restarted); - if (unlikely(trans->locked)) - goto out; - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; - int ret; - - if (path->should_be_locked && - (ret = btree_path_get_locks(trans, path, false, &f, - BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace, ip); - return ret; - } - } - - trans_set_locked(trans, true); -out: - bch2_trans_verify_locks(trans); - return 0; -} - -int bch2_trans_relock(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, true, _RET_IP_); -} - -int bch2_trans_relock_notrace(struct btree_trans *trans) -{ - return __bch2_trans_relock(trans, false, _RET_IP_); -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - trans_set_unlocked(trans); - - __bch2_trans_unlock(trans); -} - -void bch2_trans_unlock_long(struct btree_trans *trans) -{ - bch2_trans_unlock(trans); - 
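/*
 * Worked example for __bch2_btree_path_downgrade() above, assuming a
 * path at level 1 whose locks_want drops from 2 to 1:
 *
 *	before:	l1 intent, l2 intent	(locks_want == 2)
 *	after:	l1 read			(locks_want == 1)
 *
 * l2 sits above the new locks_want and is unlocked outright; l1 is the
 * path's own level, so its intent lock is downgraded to a read lock via
 * six_lock_downgrade() rather than dropped.
 */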
bch2_trans_srcu_unlock(trans); -} - -void bch2_trans_unlock_write(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_write_locked(path, l)) - bch2_btree_node_unlock_write(trans, path, path->l[l].b); -} - -int __bch2_trans_mutex_lock(struct btree_trans *trans, - struct mutex *lock) -{ - int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); - - if (ret) - mutex_unlock(lock); - return ret; -} - -/* Debug */ - -void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) -{ - if (!path->nodes_locked && btree_path_node(path, path->level)) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); - BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); - } - - if (!path->nodes_locked) - return; - - for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { - int want = btree_lock_want(path, l); - int have = btree_node_locked_type_nowrite(path, l); - - BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - - BUG_ON(is_btree_node(path, l) && want != have); - - BUG_ON(btree_node_locked(path, l) && - path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); - } -} - -static bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - -void __bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (!trans->locked) { - BUG_ON(bch2_trans_locked(trans)); - return; - } - - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - __bch2_btree_path_verify_locks(trans, path); -} diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h deleted file mode 100644 index f2173a3316f4a0..00000000000000 --- a/fs/bcachefs/btree_locking.h +++ /dev/null @@ -1,466 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_LOCKING_H -#define _BCACHEFS_BTREE_LOCKING_H - -/* - * Only for internal btree use: - * - * The btree iterator tracks what locks it wants to take, and what locks it - * currently has - here we have wrappers for locking/unlocking btree nodes and - * updating the iterator state - */ - -#include "btree_iter.h" -#include "six.h" - -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); - -void bch2_trans_unlock_write(struct btree_trans *); - -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -} - -static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -{ - return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) - ? 
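/*
 * Usage sketch for __bch2_trans_mutex_lock() above, assuming
 * drop_locks_do() unlocks the transaction, evaluates its expression and
 * then relocks: blocking on a mutex while holding btree node locks
 * could deadlock against another transaction, so the locks are dropped
 * first and a relock failure surfaces as a transaction restart:
 *
 *	ret = __bch2_trans_mutex_lock(trans, &some_mutex);	// hypothetical mutex
 *	if (ret)		// restart: the mutex has already been unlocked
 *		return ret;	// caller restarts the transaction
 */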
&trans->c->btree_transaction_stats[trans->fn_idx] - : NULL; -} - -/* matches six lock types */ -enum btree_node_locked_type { - BTREE_NODE_UNLOCKED = -1, - BTREE_NODE_READ_LOCKED = SIX_LOCK_read, - BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, - BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, -}; - -static inline int btree_node_locked_type(struct btree_path *path, - unsigned level) -{ - return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); -} - -static inline int btree_node_locked_type_nowrite(struct btree_path *path, - unsigned level) -{ - int have = btree_node_locked_type(path, level); - return have == BTREE_NODE_WRITE_LOCKED - ? BTREE_NODE_INTENT_LOCKED - : have; -} - -static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; -} - -static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; -} - -static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) -{ - return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; -} - -static inline bool btree_node_locked(struct btree_path *path, unsigned level) -{ - return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; -} - -static inline void mark_btree_node_locked_noreset(struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - /* relying on this to avoid a branch */ - BUILD_BUG_ON(SIX_LOCK_read != 0); - BUILD_BUG_ON(SIX_LOCK_intent != 1); - - path->nodes_locked &= ~(3U << (level << 1)); - path->nodes_locked |= (type + 1) << (level << 1); -} - -static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level, - enum btree_node_locked_type type) -{ - mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[level].lock_taken_time = local_clock(); -#endif -} - -static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) -{ - return level < path->locks_want - ? 
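/*
 * Worked example of the nodes_locked packing above: two bits per level,
 * stored value == locked type + 1, so 0 means unlocked:
 *
 *	path->nodes_locked = 0;
 *	mark_btree_node_locked_noreset(path, 0, BTREE_NODE_INTENT_LOCKED);
 *	// nodes_locked == 0x2: level 0 intent locked
 *	mark_btree_node_locked_noreset(path, 1, BTREE_NODE_READ_LOCKED);
 *	// nodes_locked == 0x6: level 1 read locked, level 0 intent locked
 *	// btree_node_locked_type(path, 1) == BTREE_NODE_READ_LOCKED
 */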
SIX_LOCK_intent - : SIX_LOCK_read; -} - -static inline enum btree_node_locked_type -btree_lock_want(struct btree_path *path, int level) -{ - if (level < path->level) - return BTREE_NODE_UNLOCKED; - if (level < path->locks_want) - return BTREE_NODE_INTENT_LOCKED; - if (level == path->level) - return BTREE_NODE_READ_LOCKED; - return BTREE_NODE_UNLOCKED; -} - -static void btree_trans_lock_hold_time_update(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); -#endif -} - -/* unlock: */ - -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); - -static inline void btree_node_unlock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - int lock_type = btree_node_locked_type(path, level); - - EBUG_ON(level >= BTREE_MAX_DEPTH); - - if (lock_type != BTREE_NODE_UNLOCKED) { - if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { - bch2_btree_node_unlock_write(trans, path, path->l[level].b); - lock_type = BTREE_NODE_INTENT_LOCKED; - } - six_unlock_type(&path->l[level].b->c.lock, lock_type); - btree_trans_lock_hold_time_update(trans, path, level); - mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); - } -} - -static inline int btree_path_lowest_level_locked(struct btree_path *path) -{ - return __ffs(path->nodes_locked) >> 1; -} - -static inline int btree_path_highest_level_locked(struct btree_path *path) -{ - return __fls(path->nodes_locked) >> 1; -} - -static inline void __bch2_btree_path_unlock(struct btree_trans *trans, - struct btree_path *path) -{ - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); - - while (path->nodes_locked) - btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); -} - -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) -{ - if (!b->c.lock.write_lock_recurse) { - struct btree_path *linked; - unsigned i; - - trans_for_each_path_with_node(trans, b, linked, i) - linked->l[b->c.level].lock_seq++; - } - - six_unlock_write(&b->c.lock); -} - -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) -{ - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); - EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - __bch2_btree_node_unlock_write(trans, b); -} - -int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); - -/* lock: */ - -static inline void trans_set_locked(struct btree_trans *trans, bool try) -{ - if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); - trans->locked = true; - trans->last_unlock_ip = 0; - - trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; - current->flags |= PF_MEMALLOC_NOFS; - } -} - -static inline void trans_set_unlocked(struct btree_trans *trans) -{ - if (trans->locked) { - lock_release(&trans->dep_map, _THIS_IP_); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; - - if (!trans->pf_memalloc_nofs) - current->flags &= ~PF_MEMALLOC_NOFS; - } -} - -static inline int __btree_node_lock_nopath(struct btree_trans *trans, - struct 
btree_bkey_cached_common *b, - enum six_lock_type type, - bool lock_may_not_fail, - unsigned long ip) -{ - trans->lock_may_not_fail = lock_may_not_fail; - trans->lock_must_abort = false; - trans->locking = b; - - int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); - WRITE_ONCE(trans->locking, NULL); - WRITE_ONCE(trans->locking_wait.start_time, 0); - - if (!ret) - trace_btree_path_lock(trans, _THIS_IP_, b); - return ret; -} - -static inline int __must_check -btree_node_lock_nopath(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type, - unsigned long ip) -{ - return __btree_node_lock_nopath(trans, b, type, false, ip); -} - -static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - enum six_lock_type type) -{ - int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); - - BUG_ON(ret); -} - -/* - * Lock a btree node if we already have it locked on one of our linked - * iterators: - */ -static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree_bkey_cached_common *b, - unsigned level, - enum btree_node_locked_type want) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (&path->l[level].b->c == b && - btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->lock, (enum six_lock_type) want); - return true; - } - - return false; -} - -static inline int btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - unsigned level, - enum six_lock_type type, - unsigned long ip) -{ - int ret = 0; - - EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || - !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->level].lock_taken_time = local_clock(); -#endif - } - - return ret; -} - -int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, - struct btree_bkey_cached_common *b, bool); - -static inline int __btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - bool lock_may_not_fail) -{ - EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); - EBUG_ON(!btree_node_intent_locked(path, b->level)); - - /* - * six locks are unfair, and read locks block while a thread wants a - * write lock: thus, we need to tell the cycle detector we have a write - * lock _before_ taking the lock: - */ - mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); - - return likely(six_trylock_write(&b->lock)) - ? 
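/*
 * Ordering sketch for the comment above: bch2_check_for_deadlock() runs
 * in other threads and reads this path's nodes_locked, so the write
 * lock must be published before this thread can block:
 *
 *	mark_btree_node_locked_noreset(path, level, BTREE_NODE_WRITE_LOCKED);
 *	// the cycle detector now sees this trans as a write-lock holder
 *	// and can abort one side of a read-vs-write cycle
 *	six_lock_write(&b->lock);	// may block until readers drain
 */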
0 - : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); -} - -static inline int __must_check -bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - return __btree_node_lock_write(trans, path, b, false); -} - -void bch2_btree_node_lock_write_nofail(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *); - -/* relock: */ - -bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); -int __bch2_btree_path_relock(struct btree_trans *, - struct btree_path *, unsigned long); - -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_node_locked(path, path->level) - ? 0 - : __bch2_btree_path_relock(trans, path, trace_ip); -} - -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); - -static inline bool bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, true)); -} - -static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type_nowrite(path, level) != - __btree_lock_want(path, level)); - - return likely(btree_node_locked(path, level)) || - (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level, false)); -} - -/* upgrade */ - -bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); - -static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - return new_locks_want > path->locks_want - ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) - : true; -} - -int __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -static inline int bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - return likely(path->locks_want >= new_locks_want && path->nodes_locked) - ? 
0 - : __bch2_btree_path_upgrade(trans, path, new_locks_want); -} - -/* misc: */ - -static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) -{ - EBUG_ON(!btree_node_locked(path, path->level)); - EBUG_ON(path->uptodate); - - if (!path->should_be_locked) { - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); - } -} - -static inline void __btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path, - unsigned l) -{ - btree_node_unlock(trans, path, l); - path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); -} - -static inline void btree_path_set_level_up(struct btree_trans *trans, - struct btree_path *path) -{ - __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); -} - -/* debug */ - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - struct btree_path *, - struct btree_bkey_cached_common *b, - unsigned); - -int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); - -void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); -void __bch2_trans_verify_locks(struct btree_trans *); - -static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, - struct btree_path *path) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_btree_path_verify_locks(trans, path); -} - -static inline void bch2_trans_verify_locks(struct btree_trans *trans) -{ - if (static_branch_unlikely(&bch2_debug_check_btree_locking)) - __bch2_trans_verify_locks(trans); -} - -#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c deleted file mode 100644 index a3fb07c60e25f0..00000000000000 --- a/fs/bcachefs/btree_node_scan.c +++ /dev/null @@ -1,611 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "recovery_passes.h" - -#include -#include -#include -#include - -struct find_btree_nodes_worker { - struct closure *cl; - struct find_btree_nodes *f; - struct bch_dev *ca; -}; - -static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) -{ - bch2_btree_id_level_to_text(out, n->btree_id, n->level); - prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", - n->seq, n->journal_seq, n->cookie); - bch2_bpos_to_text(out, n->min_key); - prt_str(out, "-"); - bch2_bpos_to_text(out, n->max_key); - - if (n->range_updated) - prt_str(out, " range updated"); - - for (unsigned i = 0; i < n->nr_ptrs; i++) { - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, n->ptrs + i); - } -} - -static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) -{ - printbuf_indent_add(out, 2); - darray_for_each(nodes, i) { - found_btree_node_to_text(out, c, i); - prt_newline(out); - } - printbuf_indent_sub(out, 2); -} - -static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) -{ - struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); - - set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); - bp->k.p = f->max_key; - bp->v.seq = cpu_to_le64(f->cookie); - bp->v.sectors_written = 0; - bp->v.flags = 0; - bp->v.sectors_written = cpu_to_le16(f->sectors_written); - 
bp->v.min_key = f->min_key; - SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); - memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); -} - -static inline u64 bkey_journal_seq(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); - default: - return 0; - } -} - -static int found_btree_node_cmp_cookie(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - cmp_int(l->cookie, r->cookie); -} - -/* - * Given two found btree nodes, if their sequence numbers are equal, take the - * one that's readable: - */ -static int found_btree_node_cmp_time(const struct found_btree_node *l, - const struct found_btree_node *r) -{ - return cmp_int(l->seq, r->seq) ?: - cmp_int(l->journal_seq, r->journal_seq); -} - -static int found_btree_node_cmp_pos(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->min_key, r->min_key) ?: - -found_btree_node_cmp_time(l, r); -} - -static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) -{ - return found_btree_node_cmp_pos(l, r) < 0; -} - -static inline void found_btree_node_swap(void *_l, void *_r, void *arg) -{ - struct found_btree_node *l = _l; - struct found_btree_node *r = _r; - - swap(*l, *r); -} - -static const struct min_heap_callbacks found_btree_node_heap_cbs = { - .less = found_btree_node_cmp_pos_less, - .swp = found_btree_node_swap, -}; - -static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct btree *b, struct bio *bio, u64 offset) -{ - struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); - struct btree_node *bn = b->data; - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.block_size); - - u64 submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status)); - return; - } - - if (le64_to_cpu(bn->magic) != bset_magic(c)) - return; - - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { - if (!c->chacha20_key_set) - return; - - struct nonce nonce = btree_nonce(&bn->keys, 0); - unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; - - bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); - } - - if (btree_id_is_alloc(BTREE_NODE_ID(bn))) - return; - - if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) - return; - - if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) - return; - - rcu_read_lock(); - struct found_btree_node n = { - .btree_id = BTREE_NODE_ID(bn), - .level = BTREE_NODE_LEVEL(bn), - .seq = BTREE_NODE_SEQ(bn), - .cookie = le64_to_cpu(bn->keys.seq), - .min_key = bn->min_key, - .max_key = bn->max_key, - .nr_ptrs = 1, - .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, - .ptrs[0].offset = offset, - .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), - }; - rcu_read_unlock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, b->data, c->opts.btree_node_size); - - 
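/*
 * Worked example of found_btree_node_cmp_pos() above: with equal
 * btree_id, level and min_key, the negated time comparison makes the
 * newer node sort first, so it wins when overlaps are resolved:
 *
 *	l = (struct found_btree_node) { .seq = 10 };
 *	r = (struct found_btree_node) { .seq = 12 };
 *	// found_btree_node_cmp_pos(&l, &r) > 0: r (newer) sorts before l
 */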
submit_time = local_clock(); - submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); - - found_btree_node_to_key(&b->key, &n); - - CLASS(printbuf, buf)(); - if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { - /* read_done will swap out b->data for another buffer */ - bn = b->data; - /* - * Grab journal_seq here because we want the max journal_seq of - * any bset; read_done sorts down to a single set and picks the - * max journal_seq - */ - n.journal_seq = le64_to_cpu(bn->keys.journal_seq), - n.sectors_written = b->written; - - mutex_lock(&f->lock); - if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { - bch_err(c, "try_read_btree_node() can't handle endian conversion"); - f->ret = -EINVAL; - goto unlock; - } - - if (darray_push(&f->nodes, n)) - f->ret = -ENOMEM; -unlock: - mutex_unlock(&f->lock); - } -} - -static int read_btree_nodes_worker(void *p) -{ - struct find_btree_nodes_worker *w = p; - struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); - struct bch_dev *ca = w->ca; - unsigned long last_print = jiffies; - struct btree *b = NULL; - struct bio *bio = NULL; - - b = __bch2_btree_node_mem_alloc(c); - if (!b) { - bch_err(c, "read_btree_nodes_worker: error allocating buf"); - w->f->ret = -ENOMEM; - goto err; - } - - bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); - if (!bio) { - bch_err(c, "read_btree_nodes_worker: error allocating bio"); - w->f->ret = -ENOMEM; - goto err; - } - - for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) - for (unsigned bucket_offset = 0; - bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; - bucket_offset += btree_sectors(c)) { - if (time_after(jiffies, last_print + HZ * 30)) { - u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; - u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; - - bch_info(ca, "%s: %2u%% done", __func__, - (unsigned) div64_u64(cur_sector * 100, end_sector)); - last_print = jiffies; - } - - u64 sector = bucket * ca->mi.bucket_size + bucket_offset; - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && - !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) - continue; - - try_read_btree_node(w->f, ca, b, bio, sector); - } -err: - if (b) - __btree_node_data_free(b); - kfree(b); - bio_put(bio); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - closure_put(w->cl); - kfree(w); - return 0; -} - -static int read_btree_nodes(struct find_btree_nodes *f) -{ - struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); - struct closure cl; - int ret = 0; - - closure_init_stack(&cl); - - for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { - if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) - continue; - - struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - if (!w) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - ret = -ENOMEM; - goto err; - } - - w->cl = &cl; - w->f = f; - w->ca = ca; - - struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); - ret = PTR_ERR_OR_ZERO(t); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - kfree(w); - bch_err_msg(c, ret, "starting kthread"); - break; - } - - closure_get(&cl); - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); - wake_up_process(t); - } -err: - while (closure_sync_timeout(&cl, 
sysctl_hung_task_timeout_secs * HZ / 2)) - ; - return f->ret ?: ret; -} - -static bool nodes_overlap(const struct found_btree_node *l, - const struct found_btree_node *r) -{ - return (l->btree_id == r->btree_id && - l->level == r->level && - bpos_gt(l->max_key, r->min_key)); -} - -static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *l, - found_btree_nodes *nodes_heap) -{ - struct found_btree_node *r; - - while ((r = min_heap_peek(nodes_heap)) && - nodes_overlap(l, r)) { - int cmp = found_btree_node_cmp_time(l, r); - - if (cmp > 0) { - if (bpos_cmp(l->max_key, r->max_key) >= 0) - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { - r->range_updated = true; - r->min_key = bpos_successor(l->max_key); - r->range_updated = true; - min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } else if (cmp < 0) { - BUG_ON(bpos_eq(l->min_key, r->min_key)); - - l->max_key = bpos_predecessor(r->min_key); - l->range_updated = true; - } else if (r->level) { - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - } else { - if (bpos_cmp(l->max_key, r->max_key) >= 0) - min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); - else { - r->range_updated = true; - r->min_key = bpos_successor(l->max_key); - r->range_updated = true; - min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); - } - } - - cond_resched(); - } - - return 0; -} - -int bch2_scan_for_btree_nodes(struct bch_fs *c) -{ - struct find_btree_nodes *f = &c->found_btree_nodes; - struct printbuf buf = PRINTBUF; - found_btree_nodes nodes_heap = {}; - size_t dst; - int ret = 0; - - if (f->nodes.nr) - return 0; - - mutex_init(&f->lock); - - ret = read_btree_nodes(f); - if (ret) - return ret; - - if (!f->nodes.nr) { - bch_err(c, "%s: no btree nodes found", __func__); - ret = -EINVAL; - goto err; - } - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } - - sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); - - dst = 0; - darray_for_each(f->nodes, i) { - struct found_btree_node *prev = dst ? 
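/*
 * Worked example for handle_overwrites() above, two nodes at the same
 * btree and level with l the newer:
 *
 *	l: [10, 50] seq 12	r: [30, 70] seq 10
 *
 * r loses the contested range: r->min_key becomes bpos_successor(50),
 * leaving l [10, 50] and r [51, 70], and r is sifted back down the
 * heap. Had l covered all of r (say l were [10, 80]), r would have been
 * popped from the heap entirely.
 */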
f->nodes.data + dst - 1 : NULL; - - if (prev && - prev->cookie == i->cookie) { - if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { - bch_err(c, "%s: found too many replicas for btree node", __func__); - ret = -EINVAL; - goto err; - } - prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; - } else { - f->nodes.data[dst++] = *i; - } - } - f->nodes.nr = dst; - - sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } - - swap(nodes_heap, f->nodes); - - { - /* darray must have same layout as a heap */ - min_heap_char real_heap; - BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); - BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); - BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); - BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); - } - - min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); - - if (nodes_heap.nr) { - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - while (true) { - ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); - if (ret) - goto err; - - if (!nodes_heap.nr) - break; - - ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); - if (ret) - goto err; - - min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); - } - - for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) - BUG_ON(nodes_overlap(n, n + 1)); - - if (0 && c->opts.verbose) { - printbuf_reset(&buf); - prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); - found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_str(c, KERN_INFO, buf.buf); - } else { - bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); - } - - eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); -err: - darray_exit(&nodes_heap); - printbuf_exit(&buf); - return ret; -} - -static int found_btree_node_range_start_cmp(const void *_l, const void *_r) -{ - const struct found_btree_node *l = _l; - const struct found_btree_node *r = _r; - - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->max_key, r->min_key); -} - -#define for_each_found_btree_node_in_range(_f, _search, _idx) \ - for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ - sizeof((_f)->nodes.data[0]), \ - found_btree_node_range_start_cmp, &search); \ - _idx < (_f)->nodes.nr && \ - (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ - (_f)->nodes.data[_idx].level == _search.level && \ - bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ - _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) - -bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) -{ - struct find_btree_nodes *f = &c->found_btree_nodes; - - struct found_btree_node search = { - .btree_id = b->c.btree_id, - .level = b->c.level, - .min_key = b->data->min_key, - .max_key = b->key.k.p, - }; - - for_each_found_btree_node_in_range(f, search, idx) - if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) - return true; - return false; -} - -int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) -{ - int ret = bch2_run_print_explicit_recovery_pass(c, 
BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - struct found_btree_node search = { - .btree_id = btree, - .level = 0, - .min_key = POS_MIN, - .max_key = SPOS_MAX, - }; - - for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) - return true; - return false; -} - -int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, - unsigned level, struct bpos node_min, struct bpos node_max) -{ - if (btree_id_is_alloc(btree)) - return 0; - - struct find_btree_nodes *f = &c->found_btree_nodes; - - int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - return ret; - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "recovery "); - bch2_btree_id_level_to_text(&buf, btree, level); - prt_str(&buf, " "); - bch2_bpos_to_text(&buf, node_min); - prt_str(&buf, " - "); - bch2_bpos_to_text(&buf, node_max); - - bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - struct found_btree_node search = { - .btree_id = btree, - .level = level, - .min_key = node_min, - .max_key = node_max, - }; - - for_each_found_btree_node_in_range(f, search, idx) { - struct found_btree_node n = f->nodes.data[idx]; - - n.range_updated |= bpos_lt(n.min_key, node_min); - n.min_key = bpos_max(n.min_key, node_min); - - n.range_updated |= bpos_gt(n.max_key, node_max); - n.max_key = bpos_min(n.max_key, node_max); - - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, &n); - - if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); - } - - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_btree_node, - .level = level + 1, - .btree = btree, - })); - - ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); - if (ret) - return ret; - } - - return 0; -} - -void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) -{ - darray_exit(&f->nodes); -} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h deleted file mode 100644 index 66e6f9ed19d04b..00000000000000 --- a/fs/bcachefs/btree_node_scan.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_H -#define _BCACHEFS_BTREE_NODE_SCAN_H - -int bch2_scan_for_btree_nodes(struct bch_fs *); -bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); -int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); -void bch2_find_btree_nodes_exit(struct find_btree_nodes *); - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h deleted file mode 100644 index 2811b6857c970d..00000000000000 --- a/fs/bcachefs/btree_node_scan_types.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H -#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H - -#include "darray.h" - -struct found_btree_node { - bool range_updated:1; - u8 btree_id; - u8 level; - unsigned sectors_written; - u32 seq; - u64 journal_seq; - u64 cookie; - - struct bpos min_key; - struct bpos max_key; - - unsigned nr_ptrs; - struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; -}; - -typedef DARRAY(struct found_btree_node) found_btree_nodes; - -struct find_btree_nodes 
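/*
 * Worked example of the clamping in bch2_get_scanned_nodes() above:
 * recovering range [20, 60] from a found node covering [10, 80] emits a
 * key for [20, 60] with RANGE_UPDATED set, telling the read path that
 * the node's on-disk min/max no longer match the key:
 *
 *	n.min_key = bpos_max(10, 20);	// -> 20, range_updated = true
 *	n.max_key = bpos_min(80, 60);	// -> 60, range_updated = true
 */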
{ - int ret; - struct mutex lock; - found_btree_nodes nodes; -}; - -#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c deleted file mode 100644 index 639ef75b3dbd04..00000000000000 --- a/fs/bcachefs/btree_trans_commit.c +++ /dev/null @@ -1,1121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "snapshot.h" - -#include -#include - -static const char * const trans_commit_flags_strs[] = { -#define x(n, ...) #n, - BCH_TRANS_COMMIT_FLAGS() -#undef x - NULL -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) -{ - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - - prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); - - flags >>= BCH_WATERMARK_BITS; - if (flags) { - prt_char(out, ' '); - bch2_prt_bitflags(out, trans_commit_flags_strs, flags); - } -} - -static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) -{ - return (trans->paths + i->path)->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int bch2_trans_lock_write(struct btree_trans *trans) -{ - 
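/*
 * Sketch of why same_leaf_as_prev() matters in the loop below:
 * trans->updates is sorted, so consecutive entries landing in the same
 * leaf share a single write-lock acquisition:
 *
 *	updates: [leaf A, leaf A, leaf B]
 *	// entry 0: lock A; entry 1: skip (same leaf); entry 2: lock B
 */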
EBUG_ON(trans->write_locked); - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); - } - - trans->write_locked = true; - return 0; -} - -static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) -{ - if (likely(trans->write_locked)) { - trans_for_each_update(trans, i) - if (btree_node_locked_type(trans->paths + i->path, i->level) == - BTREE_NODE_WRITE_LOCKED) - bch2_btree_node_unlock_write_inlined(trans, - trans->paths + i->path, insert_l(trans, i)->b); - trans->write_locked = false; - } -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); - EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); - kmsan_check_memory(insert, bkey_bytes(&insert->k)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? 
nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans *trans = bch2_trans_get(c); - unsigned long old, new; - unsigned idx = w - b->writes; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - - old = READ_ONCE(b->flags); - do { - new = old; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while (!try_cmpxchg(&b->flags, &old, new)); - - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? 
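/*
 * Background sketch for the flush0/flush1 pair chosen here: btree node
 * writes are double buffered through b->writes[0] and b->writes[1],
 * each with its own journal pin, so the pin's flush callback has to
 * encode which of the two writes it belongs to:
 *
 *	idx = btree_node_write_idx(b);	// 0 or 1
 *	// writes[0] pins flush via bch2_btree_node_flush0(),
 *	// writes[1] pins via bch2_btree_node_flush1()
 */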
bch2_btree_node_flush0 - : bch2_btree_node_flush1); -} - -/** - * bch2_btree_insert_key_leaf() - insert a key into a leaf node - * @trans: btree transaction object - * @path: path pointing to @insert's pos - * @insert: key to insert - * @journal_seq: sequence number of journal reservation - */ -inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert, - u64 journal_seq) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(path)->b; - struct bset_tree *t = bset_tree_last(b); - struct bset *i = bset(b, t); - int old_u64s = bset_u64s(t); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, - &path_l(path)->iter, insert))) - return; - - i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - - if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - set_btree_node_dirty_acct(c, b); - } - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) bset_u64s(t) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); -} - -/* Cached btree updates: */ - -/* Normal update interface: */ - -static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - struct btree_path *path = trans->paths + i->path; - - BUG_ON(!bpos_eq(i->k->k.p, path->pos)); - BUG_ON(i->cached != path->cached); - BUG_ON(i->level != path->level); - BUG_ON(i->btree_id != path->btree_id); - BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); - EBUG_ON(!i->level && - btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_internal_snapshot_node) && - test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && - i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); -} - -static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) -{ - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags, trans); -} - -#define JSET_ENTRY_LOG_U64s 4 - -static noinline void journal_transaction_name(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct jset_entry *entry = - bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_log, 0, 0, - JSET_ENTRY_LOG_U64s); - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - - memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), - trans->fn, strlen(trans->fn), 0); -} - -static inline int btree_key_can_insert(struct btree_trans *trans, - struct btree *b, unsigned u64s) -{ - if (!bch2_btree_node_insert_fits(b, u64s)) - return bch_err_throw(trans->c, btree_insert_btree_node_full); - - return 0; -} - -noinline static int -btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned new_u64s) -{ - struct bkey_cached *ck = (void *) path->l[0].b; - struct bkey_i *new_k; - int ret; - - bch2_trans_unlock_updates_write(trans); - bch2_trans_unlock(trans); - - new_k = kmalloc(new_u64s *
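/*
 * Growth sketch for the key cache buffer (see btree_key_can_insert_cached()
 * below): one u64 of slack is reserved because bch2_varint_decode() may
 * read up to 7 bytes past the end, then sizes round up to powers of two:
 *
 *	need 5 u64s -> roundup_pow_of_two(5 + 1) == 8
 *	need 9 u64s -> roundup_pow_of_two(9 + 1) == 16
 */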
sizeof(u64), GFP_KERNEL); - if (!new_k) { - struct bch_fs *c = trans->c; - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return bch_err_throw(c, ENOMEM_btree_key_cache_insert); - } - - ret = bch2_trans_relock(trans) ?: - bch2_trans_lock_write(trans); - if (unlikely(ret)) { - kfree(new_k); - return ret; - } - - memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - kfree(ck->k); - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - unsigned new_u64s; - struct bkey_i *new_k; - unsigned watermark = flags & BCH_WATERMARK_MASK; - - EBUG_ON(path->level); - - if (watermark < BCH_WATERMARK_reclaim && - !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c)) - return bch_err_throw(c, btree_insert_need_journal_reclaim); - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_k)) - return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_norun)) - return 0; - - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - if (old_ops->trigger == new_ops->trigger) - return bch2_key_trigger(trans, i->btree_id, i->level, - old, bkey_i_to_s(new), - BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); - else - return bch2_key_trigger_new(trans, i->btree_id, i->level, - bkey_i_to_s(new), flags) ?: - bch2_key_trigger_old(trans, i->btree_id, i->level, - old, flags); -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) -{ - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_norun) || - !btree_node_type_has_trans_triggers(i->bkey_type)) - return 0; - - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_transactional; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trigger == new_ops->trigger) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_insert| - BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (!i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_key_trigger_old(trans, 
i->btree_id, i->level, old, flags) ?: 1; - } else if (!i->insert_trigger_run) { - i->insert_trigger_run = true; - return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; - } else { - return 0; - } -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned sort_id_start = 0; - - while (sort_id_start < trans->nr_updates) { - unsigned i, sort_id = trans->updates[sort_id_start].sort_order; - bool trans_trigger_run; - - /* - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being - * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop - * references before they are re-added. - * - * Running triggers will append more updates to the list of - * updates as we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = sort_id_start; - i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; - i++) { - if (trans->updates[i].sort_order < sort_id) { - sort_id_start = i; - continue; - } - - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - sort_id_start = i; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - if (btree_node_type_has_triggers(i->bkey_type) && - gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); - if (ret) - return ret; - } - - return 0; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); -#if 0 - /* todo: bring back dynamic fault injection */ - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } -#endif - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = !i->cached - ? 
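bch2_trans_commit_run_triggers() above is a fixed-point loop: running a trigger may append new updates to the very list being walked, so passes repeat until a full pass does no work. The shape, reduced to a sketch with illustrative names:

	struct update { int trigger_ran; };

	/* returns 1 if it did work (and possibly appended new updates) */
	static int run_trigger(struct update *u)
	{
		if (u->trigger_ran)
			return 0;
		u->trigger_ran = 1;
		return 1;
	}

	static void run_all_triggers(struct update *updates, unsigned *nr)
	{
		int again;

		do {
			again = 0;
			for (unsigned i = 0; i < *nr; i++)	/* *nr may grow */
				again |= run_trigger(&updates[i]);
		} while (again);
	}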
btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - i->k->k.needs_whiteout = false; - } - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (static_branch_unlikely(&bch2_journal_seq_verify)) - trans_for_each_update(trans, i) - i->k->k.bversion.lo = trans->journal_res.seq; - else if (static_branch_unlikely(&bch2_inject_invalid_keys)) - trans_for_each_update(trans, i) - i->k->k.bversion = MAX_VERSION; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - - struct bkey_i *accounting; - - percpu_down_read(&c->mark_lock); - for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); - accounting != btree_trans_subbuf_top(trans, &trans->accounting); - accounting = bkey_next(accounting)) { - ret = bch2_accounting_trans_commit_hook(trans, - bkey_i_to_accounting(accounting), flags); - if (ret) - goto revert_fs_usage; - } - percpu_up_read(&c->mark_lock); - - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - - trans_for_each_update(trans, i) - if (btree_node_type_has_atomic_triggers(i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, validate_context); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - goto fatal_err; - } - } - - trans_for_each_update(trans, i) { - validate_context.level = i->level; - validate_context.btree = i->btree_id; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - goto fatal_err; - } - btree_insert_entry_checks(trans, i); - } - - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble((struct bkey_i *) entry->start, - (struct 
bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy((struct bkey_i *) entry->start, i->k); - } - - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - btree_trans_journal_entries_start(trans), - trans->journal_entries.u64s); - - EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); - - trans->journal_res.offset += trans->journal_entries.u64s; - trans->journal_res.u64s -= trans->journal_entries.u64s; - - memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_write_buffer_keys, - BTREE_ID_accounting, 0, - trans->accounting.u64s)->_data, - btree_trans_subbuf_base(trans, &trans->accounting), - trans->accounting.u64s); - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - if (!i->cached) - bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); - else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else - bch2_btree_key_cache_drop(trans, path); - } - - return 0; -fatal_err: - bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - percpu_down_read(&c->mark_lock); -revert_fs_usage: - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != accounting; - i = bkey_next(i)) - bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); - percpu_up_read(&c->mark_lock); - return ret; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - /* - * Accounting keys aren't deduped in the journal: we have to compare - * each individual update against what's in the btree to see if it has - * been applied yet, and accounting updates also don't overwrite, - * they're deltas that accumulate. - */ - trans_for_each_update(trans, i) - if (i->k->k.type != KEY_TYPE_accounting) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -} - -static int bch2_trans_commit_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - int ret = 0, u64s_delta = 0; - - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
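The journal reservation above is carved up sequentially: entry sizes were computed up front, one reservation is taken, and each entry then bumps the offset and shrinks the remaining u64s. A sketch of that carving, names illustrative:

	#include <stddef.h>
	#include <stdint.h>

	struct res {
		uint64_t	*buf;
		unsigned	offset;		/* in u64s */
		unsigned	u64s;		/* remaining, in u64s */
	};

	static void *res_add_entry(struct res *r, unsigned u64s)
	{
		if (u64s > r->u64s)
			return NULL;	/* reservation undersized: a bug */

		void *p = r->buf + r->offset;

		r->offset += u64s;
		r->u64s   -= u64s;
		return p;
	}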
i->k->k.u64s : 0; - u64s_delta -= i->old_btree_u64s; - - if (!same_leaf_as_next(trans, i)) { - if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, flags); - if (unlikely(ret)) - return ret; - } - - u64s_delta = 0; - } - } - - ret = bch2_trans_lock_write(trans); - if (unlikely(ret)) - return ret; - - ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - - bch2_trans_unlock_updates_write(trans); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, - bch2_trans_commit_journal_pin_flush); - - /* - * Drop journal reservation after dropping write locks, since dropping - * the journal reservation may kick off a journal write: - */ - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - bch2_journal_res_put(&c->journal, &trans->journal_res); - - return ret; -} - -static int journal_reclaim_wait_done(struct bch_fs *c) -{ - int ret = bch2_journal_error(&c->journal) ?: - bch2_btree_key_cache_wait_done(c); - - if (!ret) - journal_reclaim_kick(&c->journal); - return ret; -} - -static noinline -int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - int ret, unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - - if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - goto out; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - goto out; - } - - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); - - trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); - track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); - - wait_event_freezable(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); - - track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); - - if (ret < 0) - break; - - ret = bch2_trans_relock(trans); - break; - default: - BUG_ON(ret >= 0); - break; - } -out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - (flags & BCH_TRANS_COMMIT_no_enospc), c, - "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); - - return ret; -} - -/* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to - * do. 
- */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - BUG_ON(current != c->recovery_task); - - trans_for_each_update(trans, i) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - return ret; - } - - for (struct jset_entry *i = btree_trans_journal_entries_start(trans); - i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) { - if (i->type == BCH_JSET_ENTRY_btree_keys || - i->type == BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(i, k) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); - if (ret) - return ret; - } - } - - if (i->type == BCH_JSET_ENTRY_btree_root) { - guard(mutex)(&c->btree_root_lock); - - struct btree_root *r = bch2_btree_id_root(c, i->btree_id); - - bkey_copy(&r->key, i->start); - r->level = i->level; - r->alive = true; - } - } - - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); - if (ret) - return ret; - } - - return 0; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct btree_insert_entry *errored_at = NULL; - struct bch_fs *c = trans->c; - unsigned journal_u64s = 0; - int ret = 0; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - - ret = trans_maybe_inject_restart(trans, _RET_IP_); - if (unlikely(ret)) - goto out_reset; - - if (!trans->nr_updates && - !trans->journal_entries.u64s && - !trans->accounting.u64s) - goto out_reset; - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) - ret = do_bch2_trans_commit_to_journal_replay(trans); - else - ret = bch_err_throw(c, erofs_trans_commit); - goto out_reset; - } - - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - - journal_u64s = jset_u64s(trans->accounting.u64s); - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - struct btree_path *path = trans->paths + i->path; - - EBUG_ON(!path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_nojournal) - continue; - - /* we're going to journal the key being updated: */ - journal_u64s += jset_u64s(i->k->k.u64s); - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - journal_u64s += jset_u64s(i->old_k.u64s); - } - - if (trans->extra_disk_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_disk_res, - (flags & BCH_TRANS_COMMIT_no_enospc) - ? 
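__bch2_trans_commit() above sizes the reservation before taking it: every journalled key costs its payload plus a jset_entry header, and with journal_transaction_names the overwritten key is logged as well. A sketch of that accounting; the one-u64 header size here is an assumption for illustration only:

	#define HDR_U64S 1	/* assumed jset_entry header size, in u64s */

	static unsigned jset_u64s_est(unsigned payload_u64s)
	{
		return payload_u64s + HDR_U64S;
	}

	struct upd {
		unsigned	new_u64s, old_u64s;
		int		log_overwrite;
	};

	static unsigned journal_space(const struct upd *u, unsigned nr)
	{
		unsigned total = 0;

		for (unsigned i = 0; i < nr; i++) {
			total += jset_u64s_est(u[i].new_u64s);
			if (u[i].log_overwrite)
				total += jset_u64s_est(u[i].old_u64s);
		}
		return total;
	}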
BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - errored_at = NULL; - bch2_trans_verify_not_unlocked_or_in_restart(trans); - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - - trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; - - ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); -out_reset: - if (!ret) - bch2_trans_downgrade(trans); - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); - if (ret) - goto out; - - /* - * We might have done another transaction commit in the error path - - * i.e. btree write buffer flush - which will have made use of - * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is - * how the journal sequence number to pin is passed in - so we must - * restart: - */ - if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = bch_err_throw(c, transaction_restart_nested); - goto out; - } - - goto retry; -} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h deleted file mode 100644 index 112170fd9c8fb5..00000000000000 --- a/fs/bcachefs/btree_types.h +++ /dev/null @@ -1,937 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_TYPES_H -#define _BCACHEFS_BTREE_TYPES_H - -#include -#include - -#include "bbpos_types.h" -#include "btree_key_cache_types.h" -#include "buckets_types.h" -#include "darray.h" -#include "errcode.h" -#include "journal_types.h" -#include "replicas_types.h" -#include "six.h" - -struct open_bucket; -struct btree_update; -struct btree_trans; - -#define MAX_BSETS 3U - -struct btree_nr_keys { - - /* - * Amount of live metadata (i.e. size of node after a compaction) in - * units of u64s - */ - u16 live_u64s; - u16 bset_u64s[MAX_BSETS]; - - /* live keys only: */ - u16 packed_keys; - u16 unpacked_keys; -}; - -struct bset_tree { - /* - * We construct a binary tree in an array as if the array - * started at 1, so that things line up on the same cachelines - * better: see comments in bset.c at cacheline_to_bkey() for - * details - */ - - /* size of the binary tree and prev array */ - u16 size; - - /* function of size - precalculated for to_inorder() */ - u16 extra; - - u16 data_offset; - u16 aux_data_offset; - u16 end_offset; -}; - -struct btree_write { - struct journal_entry_pin journal; -}; - -struct btree_alloc { - struct open_buckets ob; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; - -struct btree_bkey_cached_common { - struct six_lock lock; - u8 level; - u8 btree_id; - bool cached; -}; - -struct btree { - struct btree_bkey_cached_common c; - - struct rhash_head hash; - u64 hash_val; - - unsigned long flags; - u16 written; - u8 nsets; - u8 nr_key_bits; - u16 version_ondisk; - - struct bkey_format format; - - struct btree_node *data; - void *aux_data; - - /* - * Sets of sorted keys - the real btree node - plus a binary search tree - * - * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. 
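The retry/err labels above implement the usual commit loop: recoverable errors are repaired (split a node, flush the key cache, wait on journal reclaim) and the commit is redone from scratch, while transaction restarts propagate to the caller. Reduced to a skeleton with stubbed helpers:

	enum { OK = 0, ERR_FATAL = -1, ERR_RESTART = -2 };

	static int try_commit(void)      { return OK; }	/* stub */
	static int handle_error(int err) { return err; }	/* stub: repair or give up */

	static int commit(void)
	{
		int ret;
	retry:
		ret = try_commit();
		if (!ret)
			return 0;

		ret = handle_error(ret);
		if (ret)		/* fatal, or a restart for the caller */
			return ret;
		goto retry;
	}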
- */ - struct bset_tree set[MAX_BSETS]; - - struct btree_nr_keys nr; - u16 sib_u64s[2]; - u16 whiteout_u64s; - u8 byte_order; - u8 unpack_fn_len; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - - /* - * XXX: add a delete sequence number, so when bch2_btree_node_relock() - * fails because the lock sequence number has changed - i.e. the - * contents were modified - we can still relock the node if it's still - * the one we want, without redoing the traversal - */ - - /* - * For asynchronous splits/interior node updates: - * When we do a split, we allocate new child nodes and update the parent - * node to point to them: we update the parent in memory immediately, - * but then we must wait until the children have been written out before - * the update to the parent can be written - this is a list of the - * btree_updates that are blocking this node from being - * written: - */ - struct list_head write_blocked; - - /* - * Also for asynchronous splits/interior node updates: - * If a btree node isn't reachable yet, we don't want to kick off - * another write - because that write also won't yet be reachable and - * marking it as completed before it's reachable would be incorrect: - */ - unsigned long will_make_reachable; - - struct open_buckets ob; - - /* lru list */ - struct list_head list; -}; - -#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ - x(cache_reserve) \ - x(lock_intent) \ - x(lock_write) \ - x(dirty) \ - x(read_in_flight) \ - x(write_in_flight) \ - x(noevict) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(access_bit) - -enum bch_btree_cache_not_freed_reasons { -#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, - BCH_BTREE_CACHE_NOT_FREED_REASONS() -#undef x - BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, -}; - -struct btree_cache_list { - unsigned idx; - struct shrinker *shrink; - struct list_head list; - size_t nr; -}; - -struct btree_cache { - struct rhashtable table; - bool table_init_done; - /* - * We never free a struct btree, except on shutdown - we just put it on - * the btree_cache_freed list and reuse it later. This simplifies the - * code, and it doesn't cost us much memory as the memory usage is - * dominated by buffers that hold the actual btree node data and those - * can be freed - and the number of struct btrees allocated is - * effectively bounded. - * - * btree_cache_freeable effectively is a small cache - we use it because - * high order page allocations can be rather expensive, and it's quite - * common to delete and allocate btree nodes in quick succession. It - * should never grow past ~2-3 nodes in practice. 
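The BCH_BTREE_CACHE_NOT_FREED_REASONS() list above is an x-macro: a single list expands once into the enum (with a trailing _NR count) and can expand again into a matching string table for stats output. The pattern in miniature:

	#define REASONS()	\
		x(lock_intent)	\
		x(dirty)	\
		x(noevict)

	enum reason {
	#define x(n) REASON_##n,
		REASONS()
	#undef x
		REASON_NR,
	};

	static const char * const reason_strs[] = {
	#define x(n) #n,
		REASONS()
	#undef x
	};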
- */ - struct mutex lock; - struct list_head freeable; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct btree_cache_list live[2]; - - size_t nr_freeable; - size_t nr_reserve; - size_t nr_by_btree[BTREE_ID_NR]; - atomic_long_t nr_dirty; - - /* shrinker stats */ - size_t nr_freed; - u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; - - /* - * If we need to allocate memory for a new btree node and that - * allocation fails, we can cannibalize another node in the btree cache - * to satisfy the allocation - lock to guarantee only one thread does - * this at a time: - */ - struct task_struct *alloc_lock; - struct closure_waitlist alloc_wait; - - struct bbpos pinned_nodes_start; - struct bbpos pinned_nodes_end; - /* btree id mask: 0 for leaves, 1 for interior */ - u64 pinned_nodes_mask[2]; -}; - -struct btree_node_iter { - struct btree_node_iter_set { - u16 k, end; - } data[MAX_BSETS]; -}; - -#define BTREE_ITER_FLAGS() \ - x(slots) \ - x(intent) \ - x(prefetch) \ - x(is_extents) \ - x(not_extents) \ - x(cached) \ - x(with_key_cache) \ - x(with_updates) \ - x(with_journal) \ - x(snapshot_field) \ - x(all_snapshots) \ - x(filter_snapshots) \ - x(nopreserve) \ - x(cached_nofill) \ - x(key_cache_fill) \ - -#define STR_HASH_FLAGS() \ - x(must_create) \ - x(must_replace) - -#define BTREE_UPDATE_FLAGS() \ - x(internal_snapshot_node) \ - x(nojournal) \ - x(key_cache_reclaim) - - -/* - * BTREE_TRIGGER_norun - don't run triggers at all - * - * BTREE_TRIGGER_transactional - we're running transactional triggers as part of - * a transaction commit: triggers may generate new updates - * - * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction - * commit: we have our journal reservation, we're holding btree node write - * locks, and we know the transaction is going to commit (returning an error - * here is a fatal error, causing us to go emergency read-only) - * - * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. 
disk usage - * - * BTREE_TRIGGER_insert - @new is entering the btree - * BTREE_TRIGGER_overwrite - @old is leaving the btree - */ -#define BTREE_TRIGGER_FLAGS() \ - x(norun) \ - x(transactional) \ - x(atomic) \ - x(check_repair) \ - x(gc) \ - x(insert) \ - x(overwrite) \ - x(is_root) - -enum { -#define x(n) BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() - STR_HASH_FLAGS() - BTREE_UPDATE_FLAGS() - BTREE_TRIGGER_FLAGS() -#undef x -}; - -/* iter flags must fit in a u16: */ -//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); - -enum btree_iter_update_trigger_flags { -#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_ITER_FLAGS() -#undef x -#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - STR_HASH_FLAGS() -#undef x -#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_UPDATE_FLAGS() -#undef x -#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, - BTREE_TRIGGER_FLAGS() -#undef x -}; - -enum btree_path_uptodate { - BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_RELOCK = 1, - BTREE_ITER_NEED_TRAVERSE = 2, -}; - -#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) -#define TRACK_PATH_ALLOCATED -#endif - -typedef u16 btree_path_idx_t; - -struct btree_path { - btree_path_idx_t sorted_idx; - u8 ref; - u8 intent_ref; - - /* btree_iter_copy starts here: */ - struct bpos pos; - - enum btree_id btree_id:5; - bool cached:1; - bool preserve:1; - enum btree_path_uptodate uptodate:2; - /* - * When true, failing to relock this path will cause the transaction to - * restart: - */ - bool should_be_locked:1; - unsigned level:3, - locks_want:3; - u8 nodes_locked; - - struct btree_path_level { - struct btree *b; - struct btree_node_iter iter; - u32 lock_seq; -#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - u64 lock_taken_time; -#endif - } l[BTREE_MAX_DEPTH]; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -static inline struct btree_path_level *path_l(struct btree_path *path) -{ - return path->l + path->level; -} - -static inline unsigned long btree_path_ip_allocated(struct btree_path *path) -{ -#ifdef TRACK_PATH_ALLOCATED - return path->ip_allocated; -#else - return _THIS_IP_; -#endif -} - -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - btree_path_idx_t path; - btree_path_idx_t update_path; - btree_path_idx_t key_cache_path; - - enum btree_id btree_id:8; - u8 min_depth; - - /* btree_iter_copy starts here: */ - u16 flags; - - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - - struct bpos pos; - /* - * Current unpacked key - so that bch2_btree_iter_next()/ - * bch2_btree_iter_next_slot() can correctly advance pos. - */ - struct bkey k; - - /* BTREE_ITER_with_journal: */ - size_t journal_idx; -#ifdef TRACK_PATH_ALLOCATED - unsigned long ip_allocated; -#endif -}; - -#define BKEY_CACHED_ACCESSED 0 -#define BKEY_CACHED_DIRTY 1 - -struct bkey_cached { - struct btree_bkey_cached_common c; - - unsigned long flags; - u16 u64s; - struct bkey_cached_key key; - - struct rhash_head hash; - - struct journal_entry_pin journal; - u64 seq; - - struct bkey_i *k; - struct rcu_head rcu; -}; - -static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) -{ - return !b->cached - ? 
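The flag lists above are expanded twice: once into bit positions (so several namespaces, iter, str_hash, update and trigger, pack into one bit space without collisions) and once into the 1U << bit mask values. In miniature:

	#define FLAGS_A() x(slots) x(intent)
	#define FLAGS_B() x(norun) x(gc)

	enum {
	#define x(n) FLAG_BIT_##n,
		FLAGS_A()
		FLAGS_B()
	#undef x
		FLAG_BIT_NR,
	};

	enum flag {
	#define x(n) FLAG_##n = 1U << FLAG_BIT_##n,
		FLAGS_A()
		FLAGS_B()
	#undef x
	};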
container_of(b, struct btree, c)->key.k.p - : container_of(b, struct bkey_cached, c)->key.pos; -} - -struct btree_insert_entry { - unsigned flags; - u8 sort_order; - u8 bkey_type; - enum btree_id btree_id:8; - u8 level:4; - bool cached:1; - bool insert_trigger_run:1; - bool overwrite_trigger_run:1; - bool key_cache_already_flushed:1; - /* - * @old_k may be a key from the journal; @old_btree_u64s always refers - * to the size of the key being overwritten in the btree: - */ - u8 old_btree_u64s; - btree_path_idx_t path; - struct bkey_i *k; - /* key being overwritten: */ - struct bkey old_k; - const struct bch_val *old_v; - unsigned long ip_allocated; -}; - -/* Number of btree paths we preallocate, usually enough */ -#define BTREE_ITER_INITIAL 64 -/* - * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code - * paths should run inside this limit, and if they don't it usually indicates a - * bug (leaking/duplicated btree paths). - * - * exception: some fsck paths - * - * bugs with excessive path usage seem to have possibly been eliminated now, so - * we might consider eliminating this (and btree_trans_too_many_iter()) at some - * point. - */ -#define BTREE_ITER_NORMAL_LIMIT 256 -/* never exceed limit */ -#define BTREE_ITER_MAX (1U << 10) - -struct btree_trans_commit_hook; -typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); - -struct btree_trans_commit_hook { - btree_trans_commit_hook_fn *fn; - struct btree_trans_commit_hook *next; -}; - -#define BTREE_TRANS_MEM_MAX (1U << 16) - -#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 - -struct btree_trans_paths { - unsigned long nr_paths; - struct btree_path paths[]; -}; - -struct trans_kmalloc_trace { - unsigned long ip; - size_t bytes; -}; -typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; - -struct btree_trans_subbuf { - u16 base; - u16 u64s; - u16 size;; -}; - -struct btree_trans { - struct bch_fs *c; - - unsigned long *paths_allocated; - struct btree_path *paths; - btree_path_idx_t *sorted; - struct btree_insert_entry *updates; - - void *mem; - unsigned mem_top; - unsigned mem_bytes; - unsigned realloc_bytes_required; -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - darray_trans_kmalloc_trace trans_kmalloc_trace; -#endif - - btree_path_idx_t nr_sorted; - btree_path_idx_t nr_paths; - btree_path_idx_t nr_paths_max; - btree_path_idx_t nr_updates; - u8 fn_idx; - u8 lock_must_abort; - bool lock_may_not_fail:1; - bool srcu_held:1; - bool locked:1; - bool pf_memalloc_nofs:1; - bool write_locked:1; - bool used_mempool:1; - bool in_traverse_all:1; - bool paths_sorted:1; - bool memory_allocation_failure:1; - bool journal_transaction_names:1; - bool journal_replay_not_finished:1; - bool notrace_relock_fail:1; - enum bch_errcode restarted:16; - u32 restart_count; -#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS - u32 restart_count_this_trans; -#endif - - u64 last_begin_time; - unsigned long last_begin_ip; - unsigned long last_restarted_ip; -#ifdef CONFIG_BCACHEFS_DEBUG - bch_stacktrace last_restarted_trace; -#endif - unsigned long last_unlock_ip; - unsigned long srcu_lock_time; - - const char *fn; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; - int srcu_idx; - - /* update path: */ - struct btree_trans_subbuf journal_entries; - struct btree_trans_subbuf accounting; - - struct btree_trans_commit_hook *hooks; - struct journal_entry_pin *journal_pin; - - struct journal_res journal_res; - u64 *journal_seq; - struct disk_reservation 
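btree_node_pos() above relies on both struct btree and struct bkey_cached embedding the same btree_bkey_cached_common as their first member, so code holding the common pointer recovers the outer type with container_of() keyed on ->cached. The idiom, self-contained with illustrative types:

	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct common { int cached; };

	struct node_obj   { struct common c; int node_pos; };
	struct cached_obj { struct common c; int key_pos; };

	static int obj_pos(struct common *b)
	{
		return !b->cached
			? container_of(b, struct node_obj,   c)->node_pos
			: container_of(b, struct cached_obj, c)->key_pos;
	}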
*disk_res; - - struct bch_fs_usage_base fs_usage_delta; - - unsigned journal_u64s; - unsigned extra_disk_res; /* XXX kill */ - - __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - /* Entries before this are zeroed out on every bch2_trans_get() call */ - - struct list_head list; - struct closure ref; - - unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; - struct btree_trans_paths trans_paths; - struct btree_path _paths[BTREE_ITER_INITIAL]; - btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; - struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; -}; - -static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return trans->paths + iter->path; -} - -static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) -{ - return iter->key_cache_path - ? trans->paths + iter->key_cache_path - : NULL; -} - -#define BCH_BTREE_WRITE_TYPES() \ - x(initial, 0) \ - x(init_next_bset, 1) \ - x(cache_reclaim, 2) \ - x(journal_reclaim, 3) \ - x(interior, 4) - -enum btree_write_type { -#define x(t, n) BTREE_WRITE_##t, - BCH_BTREE_WRITE_TYPES() -#undef x - BTREE_WRITE_TYPE_NR, -}; - -#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) -#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) - -#define BTREE_FLAGS() \ - x(read_in_flight) \ - x(read_error) \ - x(dirty) \ - x(need_write) \ - x(write_blocked) \ - x(will_make_reachable) \ - x(noevict) \ - x(write_idx) \ - x(accessed) \ - x(write_in_flight) \ - x(write_in_flight_inner) \ - x(just_written) \ - x(dying) \ - x(fake) \ - x(need_rewrite) \ - x(need_rewrite_error) \ - x(need_rewrite_degraded) \ - x(need_rewrite_ptr_written_zero) \ - x(never_write) \ - x(pinned) - -enum btree_flags { - /* First bits for btree node write type */ - BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, -#define x(flag) BTREE_NODE_##flag, - BTREE_FLAGS() -#undef x -}; - -#define x(flag) \ -static inline bool btree_node_ ## flag(struct btree *b) \ -{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ - \ -static inline void clear_btree_node_ ## flag(struct btree *b) \ -{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } - -BTREE_FLAGS() -#undef x - -#define BTREE_NODE_REWRITE_REASON() \ - x(none) \ - x(unknown) \ - x(error) \ - x(degraded) \ - x(ptr_written_zero) - -enum btree_node_rewrite_reason { -#define x(n) BTREE_NODE_REWRITE_##n, - BTREE_NODE_REWRITE_REASON() -#undef x -}; - -static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) -{ - if (btree_node_need_rewrite_ptr_written_zero(b)) - return BTREE_NODE_REWRITE_ptr_written_zero; - if (btree_node_need_rewrite_degraded(b)) - return BTREE_NODE_REWRITE_degraded; - if (btree_node_need_rewrite_error(b)) - return BTREE_NODE_REWRITE_error; - if (btree_node_need_rewrite(b)) - return BTREE_NODE_REWRITE_unknown; - return BTREE_NODE_REWRITE_none; -} - -static inline struct btree_write *btree_current_write(struct btree *b) -{ - return b->writes + btree_node_write_idx(b); -} - -static inline struct btree_write *btree_prev_write(struct btree *b) -{ - return b->writes + (btree_node_write_idx(b) ^ 1); -} - -static inline struct bset_tree *bset_tree_last(struct btree *b) -{ - EBUG_ON(!b->nsets); - return b->set + b->nsets - 1; -} - -static inline void * 
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -{ - return (void *) ((u64 *) b->data + offset); -} - -static inline u16 -__btree_node_ptr_to_offset(const struct btree *b, const void *p) -{ - u16 ret = (u64 *) p - (u64 *) b->data; - - EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); - return ret; -} - -static inline struct bset *bset(const struct btree *b, - const struct bset_tree *t) -{ - return __btree_node_offset_to_ptr(b, t->data_offset); -} - -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = __btree_node_ptr_to_offset(b, i); - set_btree_bset_end(b, t); -} - -static inline struct bset *btree_bset_first(struct btree *b) -{ - return bset(b, b->set); -} - -static inline struct bset *btree_bset_last(struct btree *b) -{ - return bset(b, bset_tree_last(b)); -} - -static inline u16 -__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -{ - return __btree_node_ptr_to_offset(b, k); -} - -static inline struct bkey_packed * -__btree_node_offset_to_key(const struct btree *b, u16 k) -{ - return __btree_node_offset_to_ptr(b, k); -} - -static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -{ - return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -} - -#define btree_bkey_first(_b, _t) \ -({ \ - EBUG_ON(bset(_b, _t)->start != \ - __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ - \ - bset(_b, _t)->start; \ -}) - -#define btree_bkey_last(_b, _t) \ -({ \ - EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ - vstruct_last(bset(_b, _t))); \ - \ - __btree_node_offset_to_key(_b, (_t)->end_offset); \ -}) - -static inline unsigned bset_u64s(struct bset_tree *t) -{ - return t->end_offset - t->data_offset - - sizeof(struct bset) / sizeof(u64); -} - -static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -{ - return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -} - -static inline unsigned bset_byte_offset(struct btree *b, void *i) -{ - return i - (void *) b->data; -} - -enum btree_node_type { - BKEY_TYPE_btree, -#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, - BCH_BTREE_IDS() -#undef x - BKEY_TYPE_NR -}; - -/* Type of a key in btree @id at level @level: */ -static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -{ - return level ? 
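The helpers above store positions inside a btree node as u16 offsets in u64 units from the node's base rather than as pointers: denser (the aux search trees index thousands of keys) and still valid if the buffer moves. The conversion pair, standalone:

	#include <stdint.h>

	struct node { uint64_t *data; };

	static void *off_to_ptr(const struct node *b, uint16_t off)
	{
		return b->data + off;
	}

	static uint16_t ptr_to_off(const struct node *b, const void *p)
	{
		return (uint16_t)((const uint64_t *)p - b->data);
	}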
BKEY_TYPE_btree : (unsigned) id + 1; -} - -/* Type of keys @b contains: */ -static inline enum btree_node_type btree_node_type(struct btree *b) -{ - return __btree_node_type(b->c.level, b->c.btree_id); -} - -const char *bch2_btree_node_type_str(enum btree_node_type); - -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_extents)| \ - BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_reflink)| \ - BIT_ULL(BKEY_TYPE_subvolumes)| \ - BIT_ULL(BKEY_TYPE_btree)) - -#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ - (BIT_ULL(BKEY_TYPE_alloc)| \ - BIT_ULL(BKEY_TYPE_inodes)| \ - BIT_ULL(BKEY_TYPE_stripes)| \ - BIT_ULL(BKEY_TYPE_snapshots)) - -#define BTREE_NODE_TYPE_HAS_TRIGGERS \ - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) - -static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS; -} - -static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS; -} - -static inline bool btree_node_type_has_triggers(enum btree_node_type type) -{ - return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; -} - -static inline bool btree_id_is_extents(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_node_type_is_extents(enum btree_node_type type) -{ - return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); -} - -static inline bool btree_type_has_snapshots(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_snapshot_field(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_has_ptrs(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline bool btree_type_uses_write_buffer(enum btree_id btree) -{ - const u64 mask = 0 -#define x(name, nr, flags, ...) 
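btree_id_is_extents() and friends above fold the btree list into a compile-time constant bitmask: each id with the relevant flag contributes one bit, so the predicate is a single AND at runtime. A reduced version of the trick, with made-up ids:

	#define IS_EXTENTS 1		/* illustrative flag bit */

	#define MY_IDS()			\
		x(extents, 0, IS_EXTENTS)	\
		x(inodes,  1, 0)		\
		x(reflink, 2, IS_EXTENTS)

	static int id_is_extents(unsigned id)
	{
		const unsigned long long mask = 0
	#define x(name, nr, flags) | ((unsigned long long)!!((flags) & IS_EXTENTS) << (nr))
		MY_IDS()
	#undef x
		;

		return (mask >> id) & 1;
	}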
|((!!((flags) & BTREE_IS_write_buffer)) << nr) - BCH_BTREE_IDS() -#undef x - ; - - return BIT_ULL(btree) & mask; -} - -static inline u8 btree_trigger_order(enum btree_id btree) -{ - switch (btree) { - case BTREE_ID_alloc: - return U8_MAX; - case BTREE_ID_stripes: - return U8_MAX - 1; - default: - return btree; - } -} - -struct btree_root { - struct btree *b; - - /* On disk root - see async splits: */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); - u8 level; - u8 alive; - s16 error; -}; - -enum btree_gc_coalesce_fail_reason { - BTREE_GC_COALESCE_FAIL_RESERVE_GET, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, - BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -}; - -enum btree_node_sibling { - btree_prev_sib, - btree_next_sib, -}; - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - -#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c deleted file mode 100644 index ee657b9f4b968d..00000000000000 --- a/fs/bcachefs/btree_update.c +++ /dev/null @@ -1,916 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_iter.h" -#include "btree_journal_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extents.h" -#include "keylist.h" -#include "snapshot.h" -#include "trace.h" - -#include - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->sort_order, r->sort_order) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long ip); - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - if (unlikely(trans->journal_replay_not_finished)) - return 0; - - ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: - bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = pos.snapshot; - int ret; - - if 
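btree_insert_entry_cmp() above chains comparators with the GNU `a ?: b` extension, falling through to the next key only on equality; note the negated level comparison, which reverses that key's order. A portable spelling of the same ordering:

	static int cmp_int(long l, long r)
	{
		return (l > r) - (l < r);
	}

	struct ent { int sort_order, cached, level; };

	static int ent_cmp(const struct ent *l, const struct ent *r)
	{
		int c;

		if ((c = cmp_int(l->sort_order, r->sort_order)))
			return c;
		if ((c = cmp_int(l->cached, r->cached)))
			return c;
		return -cmp_int(l->level, r->level);	/* descending by level */
	}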
(!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_nopreserve, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - int ret = 0; - - darray_for_each(*s, id) { - pos.snapshot = *id; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - break; - - if (k.k->type == KEY_TYPE_deleted) { - struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - break; - } - - bkey_init(&update->k); - update->k.p = pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &iter, update, - BTREE_UPDATE_internal_snapshot_node); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - break; - } - - darray_exit(s); - return ret; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - struct bkey_s_c old, - struct bkey_s_c new) -{ - enum btree_id btree_id = iter->btree_id; - struct bkey_i *update; - struct bpos new_start = bkey_start_pos(new.k); - unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start); - unsigned back_split = bkey_gt(old.k->p, new.k->p); - unsigned middle_split = (front_split || back_split) && - old.k->p.snapshot != new.k->p.snapshot; - unsigned nr_splits = front_split + back_split + middle_split; - int ret = 0, compressed_sectors; - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (nr_splits > 1 && - (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_disk_res += compressed_sectors * (nr_splits - 1); - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_back(new_start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - /* If we're overwriting in a different snapshot - middle split: */ - if (middle_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 
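bch2_trans_update_extent_overwrite() above derives up to three pieces from comparing the endpoints of the old and new extents; the middle split only arises when overwriting across snapshots. The decision logic in isolation, with illustrative types:

	struct ext {
		unsigned long long	start, end;	/* half-open [start, end) */
		unsigned		snapshot;
	};

	struct splits { int front, middle, back; };

	static struct splits extent_splits(const struct ext *old, const struct ext *new)
	{
		struct splits s;

		s.front  = old->start < new->start;	/* old sticks out in front */
		s.back   = old->end   > new->end;	/* old sticks out behind   */
		s.middle = (s.front || s.back) &&
			   old->snapshot != new->snapshot;
		return s;
	}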
0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_internal_snapshot_node|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_internal_snapshot_node| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(trans, &iter); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_insert_entry *i, - enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bkey k; - int ret; - - btree_path_idx_t path_idx = - bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path_idx, 0); - if (ret) - goto out; - - struct btree_path *btree_path = trans->paths + path_idx; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_norun; - - btree_path_set_should_be_locked(trans, btree_path); - ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); -out: - bch2_path_put(trans, path_idx, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - int cmp; - - struct btree_path *path = trans->paths + path_idx; - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= trans->nr_paths); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - n = (struct btree_insert_entry) { - .flags = flags, - .sort_order = btree_trigger_order(path->btree_id), - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path_idx, - .k = k, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - bool overwrite = !cmp && i < trans->updates + trans->nr_updates; - - if (overwrite) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(trans, trans->paths + i->path, true); - - trace_update_by_path(trans, path, i, overwrite); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
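bch2_trans_update_by_path() above keeps the update list sorted and deduplicated: find the insert position, and if an entry at the same position already exists, replace it in place instead of queueing two writes to one key. The core upsert, assuming the caller guarantees capacity:

	#include <string.h>

	struct upd { int key, val; };

	static void upsert(struct upd *a, unsigned *nr, struct upd n)
	{
		unsigned i;

		for (i = 0; i < *nr && a[i].key < n.key; i++)
			;

		if (i < *nr && a[i].key == n.key) {
			a[i] = n;			/* overwrite existing */
			return;
		}

		memmove(&a[i + 1], &a[i], (*nr - i) * sizeof(*a));
		a[i] = n;
		(*nr)++;
	}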
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && !i->old_btree_u64s) - return flush_new_cached_update(trans, i, flags, ip); - - return 0; -} - -static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_path *path) -{ - struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); - - if (!key_cache_path || - !key_cache_path->should_be_locked || - !bpos_eq(key_cache_path->pos, iter->pos)) { - struct bkey_cached *ck; - int ret; - - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_intent| - BTREE_ITER_cached, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); - if (unlikely(ret)) - return ret; - - ck = (void *) trans->paths[iter->key_cache_path].l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); - } - - return 0; -} - -int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - btree_path_idx_t path_idx = iter->update_path ?: iter->path; - int ret; - - if (iter->flags & BTREE_ITER_is_extents) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_key_cache_reclaim) && - (iter->flags & BTREE_ITER_filter_snapshots)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_key_cache_reclaim) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - ret = bch2_trans_update_get_key_cache(trans, iter, path); - if (ret) - return ret; - - path_idx = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_copy(n, k); - return bch2_btree_insert_trans(trans, btree, n, 0); -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - unsigned new_top = buf->u64s + u64s; - unsigned new_size = buf->size; - - BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - - if (new_top > new_size) - new_size = roundup_pow_of_two(new_top); - - void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); - if (IS_ERR(n)) - return n; - - unsigned offset = (u64 *) n - (u64 *) trans->mem; - BUG_ON(offset > U16_MAX); - - if (buf->u64s) - memcpy(n, - btree_trans_subbuf_base(trans, buf), - buf->size * sizeof(u64)); - buf->base = (u64 *) n - (u64 *) 
trans->mem; - buf->size = new_size; - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s = new_top; - return p; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); - int ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(trans, iter); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = bch_err_throw(trans->c, ENOSPC_btree_slot); - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_cached| - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @k: key to insert - * @disk_res: must be non-NULL whenever inserting or potentially - * splitting data extents - * @flags: transaction commit flags - * @iter_flags: btree iter update trigger flags - * - * Returns: 0 on success, error code on failure - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags, - enum btree_iter_update_trigger_flags iter_flags) -{ - return bch2_trans_commit_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, iter_flags)); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.p = iter->pos; - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - unsigned update_flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, update_flags); - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, 
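__bch2_trans_subbuf_alloc() above records the sub-buffer's base as an offset into the transaction's bump allocator, not a pointer, so it survives the backing memory being reallocated and moved. The accessor pair, standalone:

	#include <stdint.h>

	struct subbuf {
		uint16_t base;	/* offset into trans mem, in u64 units */
		uint16_t u64s;	/* used */
		uint16_t size;	/* capacity */
	};

	static uint64_t *subbuf_base(uint64_t *mem, const struct subbuf *b)
	{
		return mem + b->base;
	}

	static uint64_t *subbuf_top(uint64_t *mem, const struct subbuf *b)
	{
		return mem + b->base + b->u64s;
	}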
BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non-extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). - */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret ?: trans_was_restarted(trans, restart_count); -} - -/* - * bch2_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - int ret = PTR_ERR_OR_ZERO(k); - if (ret) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = iter->pos; - if (iter->flags & BTREE_ITER_is_extents) - bch2_key_resize(&k->k, 1); - - return bch2_trans_update(trans, iter, k, 0); -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.type = set ?
KEY_TYPE_set : KEY_TYPE_deleted; - k.k.p = pos; - - return bch2_trans_update_buffered(trans, btree, &k); -} - -static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) -{ - unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); - journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); - return 0; -} - -int bch2_trans_log_str(struct btree_trans *trans, const char *str) -{ - return __bch2_trans_log_str(trans, str, strlen(str)); -} - -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) -{ - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; - - return __bch2_trans_log_str(trans, buf->buf, buf->pos); -} - -int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, - unsigned level, struct bkey_i *k) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -__printf(3, 0) -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - struct printbuf buf = PRINTBUF; - prt_vprintf(&buf, fmt, args); - - unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - if (!test_bit(JOURNAL_running, &c->journal.flags)) { - ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); - if (ret) - goto err; - - struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); - journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); - c->journal.early_journal_entries.nr += jset_u64s(u64s); - } else { - ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - bch2_trans_log_msg(trans, &buf)); - } -err: - printbuf_exit(&buf); - return ret; -} - -__printf(2, 3) -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -__printf(2, 3) -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
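
The log helpers above size journal entries in whole u64s: the string length is rounded up with DIV_ROUND_UP() and the tail is zeroed (memcpy_and_pad()) so the on-disk entry is deterministic. A minimal userspace sketch of that round-up-and-pad step, assuming a simple stand-in for memcpy_and_pad() rather than the kernel's implementation:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* illustrative stand-in for the kernel's memcpy_and_pad() */
static void memcpy_and_pad_demo(void *dst, size_t dst_len,
				const void *src, size_t src_len, int pad)
{
	memcpy(dst, src, src_len);
	memset((char *) dst + src_len, pad, dst_len - src_len);
}

int main(void)
{
	const char *msg = "hello journal";
	unsigned len = strlen(msg);
	unsigned u64s = DIV_ROUND_UP(len, sizeof(uint64_t));
	uint64_t buf[32];

	/* a 13-byte message occupies two u64s; the trailing three
	 * bytes must be zeroed so the entry reads back identically */
	memcpy_and_pad_demo(buf, u64s * sizeof(uint64_t), msg, len, 0);

	printf("len=%u u64s=%u last byte=%d\n", len, u64s,
	       ((char *) buf)[u64s * sizeof(uint64_t) - 1]);
	return 0;
}
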
-{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); - va_end(args); - return ret; -} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h deleted file mode 100644 index 0b98ab959719ac..00000000000000 --- a/fs/bcachefs/btree_update.h +++ /dev/null @@ -1,429 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_UPDATE_H -#define _BCACHEFS_BTREE_UPDATE_H - -#include "btree_iter.h" -#include "journal.h" -#include "snapshot.h" - -struct bch_fs; -struct btree; - -void bch2_btree_node_prep_for_write(struct btree_trans *, - struct btree_path *, struct btree *); -bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, - struct btree *, struct btree_node_iter *, - struct bkey_i *); - -int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); -int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); -void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); - -void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, - struct bkey_i *, u64); - -#define BCH_TRANS_COMMIT_FLAGS() \ - x(no_enospc, "don't check for enospc") \ - x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(no_journal_res, "don't take a journal reservation, instead " \ - "pin journal entry referred to by trans->journal_res.seq") \ - x(journal_reclaim, "operation required for journal reclaim; may return error " \ - "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ - x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") - -enum __bch_trans_commit_flags { - /* First bits for bch_watermark: */ - __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, -#define x(n, ...) __BCH_TRANS_COMMIT_##n, - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -enum bch_trans_commit_flags { -#define x(n, ...)
BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), - BCH_TRANS_COMMIT_FLAGS() -#undef x -}; - -void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); - -int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); - -int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_iter_update_trigger_flags); - -int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct - disk_reservation *, int flags, enum - btree_iter_update_trigger_flags iter_flags); - -int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); - -int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); -int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); -int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); - -static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - return bch2_btree_bit_mod_buffered(trans, btree, pos, false); -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, snapshot_id_list *); - -/* - * For use when splitting extents in existing snapshots: - * - * If @old_pos is an interior snapshot node, iterate over descendant snapshot - * nodes: for every descendant snapshot in which @old_pos is overwritten and - * not visible, emit a whiteout at @new_pos. - */ -static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id btree, - struct bpos old_pos, - struct bpos new_pos) -{ - BUG_ON(old_pos.snapshot != new_pos.snapshot); - - if (!btree_type_has_snapshots(btree) || - bkey_eq(old_pos, new_pos)) - return 0; - - snapshot_id_list s; - int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); - if (ret) - return ret; - - return s.nr - ?
__bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) - : 0; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_iter_update_trigger_flags, - struct bkey_s_c, struct bkey_s_c); - -int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos); - -int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags, - unsigned long); - -static inline int __must_check -bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) -{ - return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); -} - -static inline void *btree_trans_subbuf_base(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base; -} - -static inline void *btree_trans_subbuf_top(struct btree_trans *trans, - struct btree_trans_subbuf *buf) -{ - return (u64 *) trans->mem + buf->base + buf->u64s; -} - -void *__bch2_trans_subbuf_alloc(struct btree_trans *, - struct btree_trans_subbuf *, - unsigned); - -static inline void * -bch2_trans_subbuf_alloc(struct btree_trans *trans, - struct btree_trans_subbuf *buf, - unsigned u64s) -{ - if (buf->u64s + u64s > buf->size) - return __bch2_trans_subbuf_alloc(trans, buf, u64s); - - void *p = btree_trans_subbuf_top(trans, buf); - buf->u64s += u64s; - return p; -} - -static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) -{ - return btree_trans_subbuf_base(trans, &trans->journal_entries); -} - -static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) -{ - return btree_trans_subbuf_top(trans, &trans->journal_entries); -} - -static inline struct jset_entry * -bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) -{ - return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); -} - -int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); - -int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); - -static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - kmsan_check_memory(k, bkey_bytes(&k->k)); - - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); - dump_stack(); - return ret; - } - /* - * Most updates skip the btree write buffer until journal replay is - * finished because synchronization with journal replay relies on having - * a btree node locked - if we're overwriting a key in the journal that - * journal replay hasn't yet replayed, we have to mark it as - * overwritten. 
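
The write-buffer path below packs a key into the transaction's journal-entry buffer: allocate jset_u64s(k->k.u64s) worth of space, fill in a small entry header with journal_entry_init(), then bkey_copy() the payload into e->start. A simplified, self-contained model of that header-plus-payload layout - demo_jset_entry is invented for illustration and is not the on-disk struct jset_entry:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct demo_jset_entry {
	uint16_t u64s;		/* payload size in u64s */
	uint8_t  type;		/* entry type, e.g. write buffer keys */
	uint8_t  btree_id;
	uint32_t pad;
	uint64_t start[];	/* payload begins here */
};

/* total size of an entry, in u64s, header included */
#define jset_u64s(u64s)	((u64s) + sizeof(struct demo_jset_entry) / sizeof(uint64_t))

int main(void)
{
	uint64_t trans_buf[64];			/* stand-in for the transaction's entry buffer */
	uint64_t key[3] = { 1, 2, 3 };		/* stand-in for a packed bkey */
	unsigned key_u64s = 3;

	struct demo_jset_entry *e = (void *) trans_buf;

	e->u64s     = key_u64s;
	e->type     = 1;
	e->btree_id = 7;
	memcpy(e->start, key, key_u64s * sizeof(uint64_t));

	printf("entry: 1 header u64 + %u payload u64s = %u total\n",
	       (unsigned) e->u64s, (unsigned) jset_u64s(e->u64s));
	return 0;
}
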
- * - * But accounting updates don't overwrite, they're deltas, and they have - * to be flushed to the btree strictly in order for journal replay to be - * able to tell which updates need to be applied: - */ - if (k->k.type != KEY_TYPE_accounting && - unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); - bkey_copy(e->start, k); - return 0; -} - -void bch2_trans_commit_hook(struct btree_trans *, - struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *, unsigned); - -int bch2_trans_log_str(struct btree_trans *, const char *); -int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); -int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); - -__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); - -/** - * bch2_trans_commit - insert keys at given iterator positions - * - * This is the main entry point for btree updates. - * - * Return values: - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static inline int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) -{ - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - - return __bch2_trans_commit(trans, flags); -} - -#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ - nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ - (_journal_seq), (_flags))) - -#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ - bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) - -#define trans_for_each_update(_trans, _i) \ - for (struct btree_insert_entry *_i = (_trans)->updates; \ - (_i) < (_trans)->updates + (_trans)->nr_updates; \ - (_i)++) - -static inline void bch2_trans_reset_updates(struct btree_trans *trans) -{ - trans_for_each_update(trans, i) - bch2_path_put(trans, i->path, true); - - trans->nr_updates = 0; - trans->journal_entries.u64s = 0; - trans->journal_entries.size = 0; - trans->accounting.u64s = 0; - trans->accounting.size = 0; - trans->hooks = NULL; - trans->extra_disk_res = 0; -} - -static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, - unsigned type, unsigned min_bytes) -{ - unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); - struct bkey_i *mut; - - if (type && k.k->type != type) - return ERR_PTR(-ENOENT); - - /* extra padding for varint_decode_fast...
*/ - mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); - if (!IS_ERR(mut)) { - bkey_reassemble(mut, k); - - if (unlikely(bytes > bkey_bytes(k.k))) { - memset((void *) mut + bkey_bytes(k.k), 0, - bytes - bkey_bytes(k.k)); - mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); - } - } - return mut; -} - -static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) -{ - return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); -} - -#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) - return ERR_PTR(ret); - - *k = bkey_i_to_s_c(mut); - return mut; -} - -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c *k, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); -} - -#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type); - struct bkey_i *ret = IS_ERR(k.k) - ? 
ERR_CAST(k.k) - : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (IS_ERR(ret)) - bch2_trans_iter_exit(trans, iter); - return ret; -} - -static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); -} - -static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned min_bytes) -{ - struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); - int ret; - - if (IS_ERR(mut)) - return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret); - } - - return mut; -} - -static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, - unsigned min_bytes) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); -} - -static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); -} - -#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ - _btree_id, _pos, _flags, \ - KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) - -static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags, - unsigned type, unsigned val_size) -{ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); - int ret; - - if (IS_ERR(k)) - return k; - - bkey_init(&k->k); - k->k.p = iter->pos; - k->k.type = type; - set_bkey_val_bytes(&k->k, val_size); - - ret = bch2_trans_update(trans, iter, k, flags); - if (unlikely(ret)) - return ERR_PTR(ret); - return k; -} - -#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ - bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ - KEY_TYPE_##_type, sizeof(struct bch_##_type))) - -#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c deleted file mode 100644 index 553059b33bfd62..00000000000000 --- a/fs/bcachefs/btree_update_interior.c +++ /dev/null @@ -1,2854 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" -#include "trace.h" - -#include - -static const char * const bch2_btree_update_modes[] = { -#define x(t) #t, - BTREE_UPDATE_MODES() -#undef x - NULL -}; - 
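
BTREE_UPDATE_MODES() here - like BCH_TRANS_COMMIT_FLAGS() in btree_update.h above - is an x-macro: one list, expanded once to generate enum values and again to generate the matching string table, so the two can never drift apart. A standalone sketch of the pattern, with an invented DEMO_MODES() list:

#include <stdio.h>

#define DEMO_MODES()	\
	x(none)		\
	x(node)		\
	x(root)		\
	x(update)

enum demo_mode {
#define x(t)	DEMO_MODE_##t,
	DEMO_MODES()
#undef x
	DEMO_MODE_NR
};

static const char * const demo_mode_strs[] = {
#define x(t)	#t,
	DEMO_MODES()
#undef x
	NULL
};

int main(void)
{
	for (unsigned m = 0; m < DEMO_MODE_NR; m++)
		printf("%u = %s\n", m, demo_mode_strs[m]);
	return 0;
}
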
-static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); - -static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, struct keylist *); -static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - -/* - * Verify that child nodes correctly span parent node's range: - */ -int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key - : b->data->min_key; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - struct bkey_buf prev; - int ret = 0; - - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, - b->data->min_key)); - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - - if (b == btree_node_root(c, b)) { - if (!bpos_eq(b->data->min_key, POS_MIN)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect min_key: "); - bch2_bpos_to_text(&buf, b->data->min_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); - goto err; - } - - if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "btree root with incorrect max_key: "); - bch2_bpos_to_text(&buf, b->data->max_key); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); - goto err; - } - } - - if (!b->c.level) - goto out; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (k.k->type != KEY_TYPE_btree_ptr_v2) - goto out; - - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - struct bpos expected_min = bkey_deleted(&prev.k->k) - ? 
node_min - : bpos_successor(prev.k->k.p); - - if (!bpos_eq(expected_min, bp.v->min_key)) { - prt_str(&buf, "end of prev node doesn't match start of next node"); - prt_str(&buf, "\nprev "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\nnext "); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); - goto err; - } - - bch2_bkey_buf_reassemble(&prev, c, k); - bch2_btree_and_journal_iter_advance(&iter); - } - - if (bkey_deleted(&prev.k->k)) { - prt_printf(&buf, "empty interior node\n"); - bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); - goto err; - } - - if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_newline(&buf); - - bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); - goto err; - } -out: - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev, c); - printbuf_exit(&buf); - return ret; -err: - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_newline(&buf); - - ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - BUG_ON(!ret); - goto out; -} - -/* Calculate ideal packed bkey format for new btree nodes: */ - -static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -{ - struct bkey_packed *k; - struct bkey uk; - - for_each_bset(b, t) - bset_tree_for_each_key(b, t, k) - if (!bkey_deleted(k)) { - uk = bkey_unpack_key(b, k); - bch2_bkey_format_add_key(s, &uk); - } -} - -static struct bkey_format bch2_btree_calc_format(struct btree *b) -{ - struct bkey_format_state s; - - bch2_bkey_format_init(&s); - bch2_bkey_format_add_pos(&s, b->data->min_key); - bch2_bkey_format_add_pos(&s, b->data->max_key); - __bch2_btree_calc_format(&s, b); - - return bch2_bkey_format_done(&s); -} - -static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, - struct bkey_format *old_f, - struct bkey_format *new_f) -{ - /* stupid integer promotion rules */ - ssize_t delta = - (((int) new_f->key_u64s - old_f->key_u64s) * - (int) nr.packed_keys) + - (((int) new_f->key_u64s - BKEY_U64s) * - (int) nr.unpacked_keys); - - BUG_ON(delta + nr.live_u64s < 0); - - return nr.live_u64s + delta; -} - -/** - * bch2_btree_node_format_fits - check if we could rewrite node with a new format - * - * @c: filesystem handle - * @b: btree node to rewrite - * @nr: number of keys for new node (i.e. b->nr) - * @new_f: bkey format to translate keys to - * - * Returns: true if all re-packed keys will be able to fit in a new node. - * - * Assumes all keys will successfully pack with the new format. 
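
btree_node_u64s_with_format() above computes the post-repack size as a signed delta: each packed key changes by (new key_u64s - old key_u64s), and each currently-unpacked key, stored at BKEY_U64s, changes by (new key_u64s - BKEY_U64s). A self-contained worked example, where BKEY_U64S_DEMO stands in for BKEY_U64s:

#include <stdio.h>
#include <assert.h>

#define BKEY_U64S_DEMO	5	/* size of an unpacked key, in u64s */

static unsigned long u64s_with_format(unsigned long live_u64s,
				      unsigned packed_keys, unsigned unpacked_keys,
				      unsigned old_key_u64s, unsigned new_key_u64s)
{
	/* cast before subtracting: the delta can be negative */
	long delta =
		((long) new_key_u64s - old_key_u64s) * packed_keys +
		((long) new_key_u64s - BKEY_U64S_DEMO) * unpacked_keys;

	assert((long) live_u64s + delta >= 0);
	return (unsigned long) ((long) live_u64s + delta);
}

int main(void)
{
	/* 100 packed keys shrinking from 3 to 2 u64s, plus 10 unpacked
	 * keys (5 u64s) that will now pack to 2: 1000 - 100 - 30 = 870 */
	printf("%lu\n", u64s_with_format(1000, 100, 10, 3, 2));
	return 0;
}
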
- */ -static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, - struct btree_nr_keys nr, - struct bkey_format *new_f) -{ - size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - - return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); -} - -/* Btree node freeing/allocation: */ - -static void __btree_node_free(struct btree_trans *trans, struct btree *b) -{ - struct bch_fs *c = trans->c; - - trace_and_count(c, btree_node_free, trans, b); - - BUG_ON(btree_node_write_blocked(b)); - BUG_ON(btree_node_dirty(b)); - BUG_ON(btree_node_need_write(b)); - BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob.nr); - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable); - - clear_btree_node_noevict(b); -} - -static void bch2_btree_node_free_inmem(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) -{ - struct bch_fs *c = trans->c; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - __btree_node_free(trans, b); - - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - - bch2_trans_node_drop(trans, b); -} - -static void bch2_btree_node_free_never_used(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; - - BUG_ON(!list_empty(&b->write_blocked)); - BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); - - b->will_make_reachable = 0; - closure_put(&as->cl); - - clear_btree_node_will_make_reachable(b); - clear_btree_node_accessed(b); - clear_btree_node_dirty_acct(c, b); - clear_btree_node_need_write(b); - - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); - - BUG_ON(p->nr >= ARRAY_SIZE(p->b)); - p->b[p->nr++] = b; - - six_unlock_intent(&b->c.lock); - - bch2_trans_node_drop(trans, b); -} - -static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, - struct disk_reservation *res, - struct closure *cl, - bool interior_node, - unsigned target, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct write_point *wp; - struct btree *b; - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets obs = { .nr = 0 }; - struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim - ? BTREE_NODE_RESERVE - : 0; - int ret; - - b = bch2_btree_node_mem_alloc(trans, interior_node); - if (IS_ERR(b)) - return b; - - BUG_ON(b->ob.nr); - - mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > nr_reserve) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - obs = a->ob; - bkey_copy(&tmp.k, &a->k); - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; - } - mutex_unlock(&c->btree_reserve_cache_lock); -retry: - ret = bch2_alloc_sectors_start_trans(trans, - target ?: - c->opts.metadata_target ?: - c->opts.foreground_target, - 0, - writepoint_ptr(&c->btree_write_point), - &devs_have, - res->nr_replicas, - min(res->nr_replicas, - c->opts.metadata_replicas_required), - watermark, - target ? 
BCH_WRITE_only_specified_devs : 0, - cl, &wp); - if (unlikely(ret)) - goto err; - - if (wp->sectors_free < btree_sectors(c)) { - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < btree_sectors(c)) - ob->sectors_free = 0; - - bch2_alloc_sectors_done(c, wp); - goto retry; - } - - bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - - bch2_open_bucket_get(c, wp, &obs); - bch2_alloc_sectors_done(c, wp); -out: - bkey_copy(&b->key, &tmp.k); - b->ob = obs; - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - return b; -err: - bch2_btree_node_to_freelist(c, b); - return ERR_PTR(ret); -} - -static struct btree *bch2_btree_node_alloc(struct btree_update *as, - struct btree_trans *trans, - unsigned level) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; - int ret; - - BUG_ON(level >= BTREE_MAX_DEPTH); - BUG_ON(!p->nr); - - b = p->b[--p->nr]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - - set_btree_node_accessed(b); - set_btree_node_dirty_acct(c, b); - set_btree_node_need_write(b); - - bch2_bset_init_first(b, &b->data->keys); - b->c.level = level; - b->c.btree_id = as->btree_id; - b->version_ondisk = c->sb.version; - - memset(&b->nr, 0, sizeof(b->nr)); - b->data->magic = cpu_to_le64(bset_magic(c)); - memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); - b->data->flags = 0; - SET_BTREE_NODE_ID(b->data, as->btree_id); - SET_BTREE_NODE_LEVEL(b->data, level); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); - - bp->v.mem_ptr = 0; - bp->v.seq = b->data->keys.seq; - bp->v.sectors_written = 0; - } - - SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); - - bch2_btree_build_aux_trees(b); - - ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); - BUG_ON(ret); - - trace_and_count(c, btree_node_alloc, trans, b); - bch2_increment_clock(c, btree_sectors(c), WRITE); - return b; -} - -static void btree_set_min(struct btree *b, struct bpos pos) -{ - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; - b->data->min_key = pos; -} - -static void btree_set_max(struct btree *b, struct bpos pos) -{ - b->key.k.p = pos; - b->data->max_key = pos; -} - -static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, - struct btree_trans *trans, - struct btree *b) -{ - struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); - struct bkey_format format = bch2_btree_calc_format(b); - - /* - * The keys might expand with the new format - if they wouldn't fit in - * the btree node anymore, use the old format for now: - */ - if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) - format = b->format; - - SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); - - btree_set_min(n, b->data->min_key); - btree_set_max(n, b->data->max_key); - - n->data->format = format; - btree_node_set_format(n, format); - - bch2_btree_sort_into(as->c, n, b); - - btree_node_reset_sib_u64s(n); - return n; -} - -static struct btree *__btree_root_alloc(struct btree_update *as, - struct btree_trans *trans, unsigned level) -{ - struct btree *b = bch2_btree_node_alloc(as, trans, level); - - btree_set_min(b, POS_MIN); - btree_set_max(b, SPOS_MAX); - b->data->format = bch2_btree_calc_format(b); - - btree_node_set_format(b, 
b->data->format); - bch2_btree_build_aux_trees(b); - - return b; -} - -static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - struct prealloc_nodes *p; - - for (p = as->prealloc_nodes; - p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); - p++) { - while (p->nr) { - struct btree *b = p->b[--p->nr]; - - mutex_lock(&c->btree_reserve_cache_lock); - - if (c->btree_reserve_cache_nr < - ARRAY_SIZE(c->btree_reserve_cache)) { - struct btree_alloc *a = - &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; - - a->ob = b->ob; - b->ob.nr = 0; - bkey_copy(&a->k, &b->key); - } else { - bch2_open_buckets_put(c, &b->ob); - } - - mutex_unlock(&c->btree_reserve_cache_lock); - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(trans, b); - bch2_btree_node_to_freelist(c, b); - } - } -} - -static int bch2_btree_reserve_get(struct btree_trans *trans, - struct btree_update *as, - unsigned nr_nodes[2], - unsigned target, - unsigned flags, - struct closure *cl) -{ - struct btree *b; - unsigned interior; - int ret = 0; - - BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); - - /* - * Protects reaping from the btree node cache and using the btree node - * open bucket reserve: - */ - ret = bch2_btree_cache_cannibalize_lock(trans, cl); - if (ret) - return ret; - - for (interior = 0; interior < 2; interior++) { - struct prealloc_nodes *p = as->prealloc_nodes + interior; - - while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, target, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); - goto err; - } - - p->b[p->nr++] = b; - } - } -err: - bch2_btree_cache_cannibalize_unlock(trans); - return ret; -} - -/* Asynchronous interior node update machinery */ - -static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - - if (as->took_gc_lock) - up_read(&c->gc_lock); - as->took_gc_lock = false; - - bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_pin_flush(&c->journal, &as->journal); - bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as, trans); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], - as->start_time); - - mutex_lock(&c->btree_interior_update_lock); - list_del(&as->unwritten_list); - list_del(&as->list); - - closure_debug_destroy(&as->cl); - mempool_free(as, &c->btree_interior_update_pool); - - /* - * Have to do the wakeup with btree_interior_update_lock still held, - * since being on btree_interior_update_list is our ref on @c: - */ - closure_wake_up(&c->btree_interior_update_wait); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static void btree_update_add_key(struct btree_update *as, - struct keylist *keys, struct btree *b) -{ - struct bkey_i *k = &b->key; - - BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > - ARRAY_SIZE(as->_old_keys)); - - bkey_copy(keys->top, k); - bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; - - bch2_keylist_push(keys); -} - -static bool btree_update_new_nodes_marked_sb(struct btree_update *as) -{ - for_each_keylist_key(&as->new_keys, k) - if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) - return false; - return true; -} - -static void btree_update_new_nodes_mark_sb(struct btree_update *as) -{ - struct bch_fs *c = as->c; - - mutex_lock(&c->sb_lock); - for_each_keylist_key(&as->new_keys, k) - bch2_dev_btree_bitmap_mark(c, 
bkey_i_to_s_c(k)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * The transactional part of an interior btree node update, where we journal the - * update we did to the interior node and update alloc info: - */ -static int btree_update_nodes_written_trans(struct btree_trans *trans, - struct btree_update *as) -{ - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); - int ret = PTR_ERR_OR_ZERO(e); - if (ret) - return ret; - - memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); - - trans->journal_pin = &as->journal; - - for_each_keylist_key(&as->old_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - for_each_keylist_key(&as->new_keys, k) { - unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - - ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_transactional); - if (ret) - return ret; - } - - return 0; -} - -/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ -static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) -{ - struct btree_node *b_data = READ_ONCE(b->data); - - return (b_data ? b_data->keys.seq : 0) == seq; -} - -static void btree_update_nodes_written(struct btree_update *as) -{ - struct bch_fs *c = as->c; - struct btree *b; - struct btree_trans *trans = bch2_trans_get(c); - u64 journal_seq = 0; - unsigned i; - int ret; - - /* - * If we're already in an error state, it might be because a btree node - * was never written, and we might be trying to free that same btree - * node here, but it won't have been marked as allocated and we'll see - * spurious disk usage inconsistencies in the transactional part below - * if we don't skip it: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - if (!btree_update_new_nodes_marked_sb(as)) - btree_update_new_nodes_mark_sb(as); - - /* - * Wait for any in flight writes to finish before we free the old nodes - * on disk. But we haven't pinned those old nodes in the btree cache, - * they might have already been evicted. - * - * The update we're completing deleted references to those nodes from the - * btree, so we know if they've been evicted they can't be pulled back in. - * We just have to check if the nodes we have pointers to are still those - * old nodes, and haven't been reused. - * - * This can't be done locklessly because the data buffer might have been - * vmalloc allocated, and they're not RCU freed. We also need the - * __no_kmsan_checks annotation because even with the btree node read - * lock, nothing tells us that the data buffer has been initialized (if - * the btree node has been reused for a different node, and the data - * buffer swapped for a new data buffer). - */ - for (i = 0; i < as->nr_old_nodes; i++) { - b = as->old_nodes[i]; - - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq_matches) - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, - TASK_UNINTERRUPTIBLE); - } - - /* - * We did an update to a parent node where the pointers we added pointed - * to child nodes that weren't written yet: now, the child nodes have - * been written so we can write out the update to the interior node. 
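
btree_node_seq_matches(), used just below, is an instance of a generation-check pattern: record a sequence number while the object is known-good, and after any window where the cache may have recycled it, compare the remembered value before trusting the pointer. A toy single-threaded model of the idea - the real code also needs the read lock and the kmsan annotation discussed below:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct demo_node {
	uint64_t seq;	/* bumped whenever the node's buffer is reinitialized */
};

static bool node_seq_matches(const struct demo_node *n, uint64_t seq)
{
	return n->seq == seq;
}

int main(void)
{
	struct demo_node n = { .seq = 42 };
	uint64_t remembered = n.seq;

	/* ...meanwhile the cache evicts the node and reuses the slot: */
	n.seq = 43;

	printf("still ours? %s\n",
	       node_seq_matches(&n, remembered) ? "yes" : "no (reused)");
	return 0;
}
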
- */ - - /* - * We can't call into journal reclaim here: we'd block on the journal - * reclaim lock, but we may need to release the open buckets we have - * pinned in order for other btree updates to make forward progress, and - * journal reclaim does btree updates when flushing bkey_cached entries, - * which may require allocations as well. - */ - ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_interior_updates| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_journal_reclaim, - btree_update_nodes_written_trans(trans, as)); - bch2_trans_unlock(trans); - - bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s", bch2_err_str(ret)); -err: - /* - * Ensure transaction is unlocked before using btree_node_lock_nopath() - * (the use of which is always suspect, we need to work on removing this - * in the future) - * - * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so we may - * rarely end up with a locked path besides the one we have here: - */ - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - /* - * We have to be careful because another thread might be getting ready - * to free as->b and calling btree_update_reparent() on us - we'll - * recheck under btree_update_lock below: - */ - b = READ_ONCE(as->b); - if (b) { - /* - * @b is the node we did the final insert into: - * - * On failure to get a journal reservation, we still have to - * unblock the write and allow most of the write path to happen - * so that shutdown works, but the i->journal_seq mechanism - * won't work to prevent the btree write from being visible (we - * didn't get a journal sequence number) - instead - * __bch2_btree_node_write() doesn't do the actual write if - * we're in journal error state: - */ - - btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); - path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); - path->l[b->c.level].b = b; - - bch2_btree_node_lock_write_nofail(trans, path, &b->c); - - mutex_lock(&c->btree_interior_update_lock); - - list_del(&as->write_blocked_list); - if (list_empty(&b->write_blocked)) - clear_btree_node_write_blocked(b); - - /* - * Node might have been freed, recheck under - * btree_interior_update_lock: - */ - if (as->b == b) { - BUG_ON(!b->c.level); - BUG_ON(!btree_node_dirty(b)); - - if (!ret) { - struct bset *last = btree_bset_last(b); - - last->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(last->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - } else { - /* - * If we didn't get a journal sequence number we - * can't write this btree node, because recovery - * won't know to ignore this write: - */ - set_btree_node_never_write(b); - } - } - - mutex_unlock(&c->btree_interior_update_lock); - - mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - six_unlock_write(&b->c.lock); - - btree_node_write_if_need(trans, b, SIX_LOCK_intent); - btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path_idx, true); - } - - bch2_journal_pin_drop(&c->journal, &as->journal); - - mutex_lock(&c->btree_interior_update_lock); - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - BUG_ON(b->will_make_reachable != (unsigned 
long) as); - b->will_make_reachable = 0; - clear_btree_node_will_make_reachable(b); - } - mutex_unlock(&c->btree_interior_update_lock); - - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; - - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - btree_node_write_if_need(trans, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - } - - for (i = 0; i < as->nr_open_buckets; i++) - bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - - bch2_btree_update_free(as, trans); - bch2_trans_put(trans); -} - -static void btree_interior_update_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, btree_interior_update_work); - struct btree_update *as; - - while (1) { - mutex_lock(&c->btree_interior_update_lock); - as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, - struct btree_update, unwritten_list); - if (as && !as->nodes_written) - as = NULL; - mutex_unlock(&c->btree_interior_update_lock); - - if (!as) - break; - - btree_update_nodes_written(as); - } -} - -static CLOSURE_CALLBACK(btree_update_set_nodes_written) -{ - closure_type(as, struct btree_update, cl); - struct bch_fs *c = as->c; - - mutex_lock(&c->btree_interior_update_lock); - as->nodes_written = true; - mutex_unlock(&c->btree_interior_update_lock); - - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -} - -/* - * We're updating @b with pointers to nodes that haven't finished writing yet: - * block @b from being written until @as completes - */ -static void btree_update_updated_node(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - BUG_ON(as->update_level_end < b->c.level); - BUG_ON(!btree_node_dirty(b)); - BUG_ON(!b->c.level); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_node; - as->b = b; - as->update_level_end = b->c.level; - - set_btree_node_write_blocked(b); - list_add(&as->write_blocked_list, &b->write_blocked); - - mutex_unlock(&c->btree_interior_update_lock); -} - -static int bch2_update_reparent_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -static void btree_update_reparent(struct btree_update *as, - struct btree_update *child) -{ - struct bch_fs *c = as->c; - - lockdep_assert_held(&c->btree_interior_update_lock); - - child->b = NULL; - child->mode = BTREE_UPDATE_update; - - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, - bch2_update_reparent_journal_pin_flush); -} - -static void btree_update_updated_root(struct btree_update *as, struct btree *b) -{ - struct bkey_i *insert = &b->key; - struct bch_fs *c = as->c; - - BUG_ON(as->mode != BTREE_UPDATE_none); - - BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > - ARRAY_SIZE(as->journal_entries)); - - as->journal_u64s += - journal_entry_set((void *) &as->journal_entries[as->journal_u64s], - BCH_JSET_ENTRY_btree_root, - b->c.btree_id, b->c.level, - insert, insert->k.u64s); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - as->mode = BTREE_UPDATE_root; - mutex_unlock(&c->btree_interior_update_lock); -} - -/* - * bch2_btree_update_add_new_node: - * - * This causes @as to wait on @b to be written, before it gets to - * bch2_btree_update_nodes_written - * - * Additionally, it sets b->will_make_reachable to prevent any additional writes - * to @b from 
happening besides the first until @b is reachable on disk - * - * And it adds @b to the list of @as's new nodes, so that we can update sector - * counts in bch2_btree_update_nodes_written: - */ -static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -{ - struct bch_fs *c = as->c; - - closure_get(&as->cl); - - mutex_lock(&c->btree_interior_update_lock); - BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); - BUG_ON(b->will_make_reachable); - - as->new_nodes[as->nr_new_nodes++] = b; - b->will_make_reachable = 1UL|(unsigned long) as; - set_btree_node_will_make_reachable(b); - - mutex_unlock(&c->btree_interior_update_lock); - - btree_update_add_key(as, &as->new_keys, b); - - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { - unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; - unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; - - bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = - cpu_to_le16(sectors); - } -} - -/* - * If @b was a new node, drop it from the btree_update that was going to make - * it reachable, releasing that update's ref; otherwise this is a no-op: - */ -static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -{ - struct btree_update *as; - unsigned long v; - unsigned i; - - mutex_lock(&c->btree_interior_update_lock); - /* - * When b->will_make_reachable != 0, it owns a ref on as->cl that's - * dropped when it gets written by bch2_btree_complete_write - the - * xchg() is for synchronization with bch2_btree_complete_write: - */ - v = xchg(&b->will_make_reachable, 0); - clear_btree_node_will_make_reachable(b); - as = (struct btree_update *) (v & ~1UL); - - if (!as) { - mutex_unlock(&c->btree_interior_update_lock); - return; - } - - for (i = 0; i < as->nr_new_nodes; i++) - if (as->new_nodes[i] == b) - goto found; - - BUG(); -found: - array_remove_item(as->new_nodes, as->nr_new_nodes, i); - mutex_unlock(&c->btree_interior_update_lock); - - if (v & 1) - closure_put(&as->cl); -} - -static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -{ - while (b->ob.nr) - as->open_buckets[as->nr_open_buckets++] = - b->ob.v[--b->ob.nr]; -} - -static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - return 0; -} - -/* - * @b is being split/rewritten: it may have pointers to not-yet-written btree - * nodes and thus outstanding btree_updates - redirect @b's - * btree_updates to point to this btree_update: - */ -static void bch2_btree_interior_update_will_free_node(struct btree_update *as, - struct btree *b) -{ - struct bch_fs *c = as->c; - struct btree_update *p, *n; - struct btree_write *w; - - set_btree_node_dying(b); - - if (btree_node_fake(b)) - return; - - mutex_lock(&c->btree_interior_update_lock); - - /* - * Does this node have any btree_update operations preventing - * it from being written? - * - * If so, redirect them to point to this btree_update: we can - * write out our new nodes, but we won't make them visible until those - * operations complete - */ - list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { - list_del_init(&p->write_blocked_list); - btree_update_reparent(as, p); - - /* - * for flush_held_btree_writes() waiting on updates to flush or - * nodes to be writeable: - */ - closure_wake_up(&c->btree_interior_update_wait); - } - - clear_btree_node_dirty_acct(c, b); - clear_btree_node_need_write(b); - clear_btree_node_write_blocked(b); - - /* - * Does this node have unwritten data that has a pin on the journal?
- * - * If so, transfer that pin to the btree_update operation - - * note that if we're freeing multiple nodes, we only need to keep the - * oldest pin of any of the nodes we're freeing. We'll release the pin - * when the new nodes are persistent and reachable on disk: - */ - w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, - bch2_btree_update_will_free_node_journal_pin_flush); - bch2_journal_pin_drop(&c->journal, &w->journal); - - mutex_unlock(&c->btree_interior_update_lock); - - /* - * Is this a node that isn't reachable on disk yet? - * - * Nodes that aren't reachable yet have writes blocked until they're - * reachable - now that we've cancelled any pending writes and moved - * things waiting on that write to wait on this update, we can drop this - * node from the list of nodes that the other update is making - * reachable, prior to freeing it: - */ - btree_update_drop_new_node(c, b); - - btree_update_add_key(as, &as->old_keys, b); - - as->old_nodes[as->nr_old_nodes] = b; - as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; - as->nr_old_nodes++; -} - -static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) -{ - struct bch_fs *c = as->c; - u64 start_time = as->start_time; - - BUG_ON(as->mode == BTREE_UPDATE_none); - - if (as->took_gc_lock) - up_read(&as->c->gc_lock); - as->took_gc_lock = false; - - bch2_btree_reserve_put(as, trans); - - continue_at(&as->cl, btree_update_set_nodes_written, - as->c->btree_interior_update_worker); - - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], - start_time); -} - -static const char * const btree_node_reawrite_reason_strs[] = { -#define x(n) #n, - BTREE_NODE_REWRITE_REASON() -#undef x - NULL, -}; - -static struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, - unsigned target, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_update *as; - u64 start_time = local_clock(); - int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) - ? BCH_DISK_RESERVATION_NOFAIL : 0; - unsigned nr_nodes[2] = { 0, 0 }; - unsigned level_end = level_start; - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret = 0; - u32 restart_count = trans->restart_count; - - BUG_ON(!path->should_be_locked); - - if (watermark == BCH_WATERMARK_copygc) - watermark = BCH_WATERMARK_btree_copygc; - if (watermark < BCH_WATERMARK_btree) - watermark = BCH_WATERMARK_btree; - - flags &= ~BCH_WATERMARK_MASK; - flags |= watermark; - - if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_space_low, &c->journal.flags)) { - if (flags & BCH_TRANS_COMMIT_journal_reclaim) - return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - - ret = drop_locks_do(trans, - ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); - if (ret) - return ERR_PTR(ret); - } - - while (1) { - nr_nodes[!!level_end] += 1 + split; - level_end++; - - ret = bch2_btree_path_upgrade(trans, path, level_end + 1); - if (ret) - return ERR_PTR(ret); - - if (!btree_path_node(path, level_end)) { - /* Allocating new root? 
*/ - nr_nodes[1] += split; - level_end = BTREE_MAX_DEPTH; - break; - } - - /* - * Always check for space for two keys, even if we won't have to - * split at prior level - it might have been a merge instead: - */ - if (bch2_btree_node_insert_fits(path->l[level_end].b, - BKEY_BTREE_PTR_U64s_MAX * 2)) - break; - - split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); - } - - if (!down_read_trylock(&c->gc_lock)) { - ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); - if (ret) { - up_read(&c->gc_lock); - return ERR_PTR(ret); - } - } - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->flags = flags; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level_start = level_start; - as->update_level_end = level_end; - INIT_LIST_HEAD(&as->list); - INIT_LIST_HEAD(&as->unwritten_list); - INIT_LIST_HEAD(&as->write_blocked_list); - bch2_keylist_init(&as->old_keys, as->_old_keys); - bch2_keylist_init(&as->new_keys, as->_new_keys); - bch2_keylist_init(&as->parent_keys, as->inline_keys); - - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - struct btree *b = btree_path_node(path, path->level); - as->node_start = b->data->min_key; - as->node_end = b->data->max_key; - as->node_needed_rewrite = btree_node_rewrite_reason(b); - as->node_written = b->written; - as->node_sectors = btree_buf_bytes(b) >> 9; - as->node_remaining = __bch2_btree_u64s_remaining(b, - btree_bkey_last(b, bset_tree_last(b))); - - /* - * We don't want to allocate if we're in an error state, that can cause - * deadlock on emergency shutdown due to open buckets getting stuck in - * the btree_reserve_cache after allocator shutdown has cleared it out. - * This check needs to come after adding us to the btree_interior_update - * list but before calling bch2_btree_reserve_get, to synchronize with - * __bch2_fs_read_only(). 
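
The loop above sizes the node reserve by walking up from the start level: each level that might split needs 1 + split new nodes, and the walk stops once a parent can absorb two more btree pointers. A toy model of that accounting - the tree depth and the "parent fits" check here are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned nr_nodes[2] = { 0, 0 };	/* [0] = leaf, [1] = interior */
	unsigned level = 0, depth = 3;
	bool split = true;

	while (level < depth) {
		nr_nodes[!!level] += 1 + split;

		/* pretend only the leaf level actually splits */
		bool parent_fits = level >= 1;
		level++;
		if (parent_fits)
			break;
		split = true;
	}

	printf("reserve %u leaf node(s), %u interior node(s)\n",
	       nr_nodes[0], nr_nodes[1]);
	return 0;
}
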
- */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - ret = bch2_disk_reservation_get(c, &as->disk_res, - (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), - READ_ONCE(c->opts.metadata_replicas), - disk_res_flags); - if (ret) - goto err; - - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); - if (bch2_err_matches(ret, ENOSPC) || - bch2_err_matches(ret, ENOMEM)) { - struct closure cl; - - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if (bch2_err_matches(ret, ENOSPC) && - (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - goto err; - } - - closure_init_stack(&cl); - - do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - bch2_trans_unlock(trans); - bch2_wait_on_allocator(c, &cl); - } while (1); - } - - if (ret) { - trace_and_count(c, btree_reserve_get_fail, trans->fn, - _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); - goto err; - } - - ret = bch2_trans_relock(trans); - if (ret) - goto err; - - bch2_trans_verify_not_restarted(trans, restart_count); - return as; -err: - bch2_btree_update_free(as, trans); - if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_reclaim_would_deadlock && - ret != -BCH_ERR_journal_shutdown) - bch_err_fn_ratelimited(c, ret); - return ERR_PTR(ret); -} - -/* Btree root updates: */ - -static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -{ - /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache.lock); - - mutex_lock(&c->btree_root_lock); - bch2_btree_id_root(c, b->c.btree_id)->b = b; - mutex_unlock(&c->btree_root_lock); - - bch2_recalc_btree_reserve(c); -} - -static int bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - bool nofail) -{ - struct bch_fs *c = as->c; - - trace_and_count(c, btree_node_set_root, trans, b); - - struct btree *old = btree_node_root(c, b); - - /* - * Ensure no one is using the old root while we switch to the - * new root: - */ - if (nofail) { - bch2_btree_node_lock_write_nofail(trans, path, &old->c); - } else { - int ret = bch2_btree_node_lock_write(trans, path, &old->c); - if (ret) - return ret; - } - - bch2_btree_set_root_inmem(c, b); - - btree_update_updated_root(as, b); - - /* - * Unlock old root after new root is visible: - * - * The new root isn't persistent, but that's ok: we still have - * an intent lock on the new root, and any updates that would - * depend on the new root would have to update the new root. 
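
bch2_btree_set_root() below relies on lock ordering rather than anything fancier: write-lock the old root, publish the new root pointer, and only then unlock, so no traversal can race through the stale root mid-switch. A minimal pthread sketch of the same publish-then-unlock ordering - illustrative only, not the six-lock machinery the kernel uses (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

struct demo_node {
	int id;
	pthread_rwlock_t lock;
};

static struct demo_node *root;
static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;

static void set_root(struct demo_node *new_root)
{
	struct demo_node *old = root;

	pthread_rwlock_wrlock(&old->lock);	/* block readers of the old root */

	pthread_mutex_lock(&root_lock);
	root = new_root;			/* publish the new root */
	pthread_mutex_unlock(&root_lock);

	pthread_rwlock_unlock(&old->lock);	/* old root no longer reachable */
}

int main(void)
{
	struct demo_node a = { 1, PTHREAD_RWLOCK_INITIALIZER };
	struct demo_node b = { 2, PTHREAD_RWLOCK_INITIALIZER };

	root = &a;
	set_root(&b);
	printf("root is now node %d\n", root->id);
	return 0;
}
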
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
-					struct btree_trans *trans,
-					struct btree_path *path,
-					struct btree *b,
-					struct btree_node_iter *node_iter,
-					struct bkey_i *insert)
-{
-	struct bch_fs *c = as->c;
-	struct bkey_packed *k;
-	struct printbuf buf = PRINTBUF;
-	unsigned long old, new;
-
-	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
-	       !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
-
-	if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
-		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
-	struct bkey_validate_context from = (struct bkey_validate_context) {
-		.from	= BKEY_VALIDATE_btree_node,
-		.level	= b->c.level,
-		.btree	= b->c.btree_id,
-		.flags	= BCH_VALIDATE_commit,
-	};
-	if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
-	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
-		bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
-		dump_stack();
-	}
-
-	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
-	       ARRAY_SIZE(as->journal_entries));
-
-	as->journal_u64s +=
-		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
-				  BCH_JSET_ENTRY_btree_keys,
-				  b->c.btree_id, b->c.level,
-				  insert, insert->k.u64s);
-
-	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
-	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
-		bch2_btree_node_iter_advance(node_iter, b);
-
-	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
-	set_btree_node_dirty_acct(c, b);
-
-	old = READ_ONCE(b->flags);
-	do {
-		new = old;
-
-		new &= ~BTREE_WRITE_TYPE_MASK;
-		new |= BTREE_WRITE_interior;
-		new |= 1 << BTREE_NODE_need_write;
-	} while (!try_cmpxchg(&b->flags, &old, new));
-
-	printbuf_exit(&buf);
-}
-
-static int
-bch2_btree_insert_keys_interior(struct btree_update *as,
-				struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b,
-				struct btree_node_iter node_iter,
-				struct keylist *keys)
-{
-	struct bkey_i *insert = bch2_keylist_front(keys);
-	struct bkey_packed *k;
-
-	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
-	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
-		;
-
-	for (;
-	     insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
-	     insert = bkey_next(insert))
-		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-
-	int ret = bch2_btree_node_check_topology(trans, b);
-	if (ret) {
-		struct printbuf buf = PRINTBUF;
-
-		for (struct bkey_i *k = keys->keys;
-		     k != insert;
-		     k = bkey_next(k)) {
-			bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
-			prt_newline(&buf);
-		}
-
-		bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
-				    (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
-		dump_stack();
-		return ret;
-	}
-
-	memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
-	keys->top_p -= insert->_data - keys->keys_p;
-	return 0;
-}
-
-static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
-{
-	if (insert_keys)
-		for_each_keylist_key(insert_keys, k)
-			if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
-				return true;
-	return false;
-}
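/*
 * [Editorial sketch - illustration only, not part of this patch] The flags
 * update at the end of bch2_insert_fixup_btree_ptr() above is a standard
 * lock-free read-modify-write loop. A portable C11 equivalent; the mask and
 * bit values here are made up for illustration:
 */
#include <stdatomic.h>

#define TOY_WRITE_TYPE_MASK	0x0fUL	/* hypothetical write-type field mask */
#define TOY_WRITE_interior	0x02UL	/* hypothetical write type */
#define TOY_NODE_need_write	4	/* hypothetical flag bit */

static void toy_set_need_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags), new;

	do {
		new = old;
		new &= ~TOY_WRITE_TYPE_MASK;		/* clear the write-type field */
		new |= TOY_WRITE_interior;		/* select the new write type */
		new |= 1UL << TOY_NODE_need_write;	/* mark as needing a write */
		/* on failure, atomic_compare_exchange_weak() reloads 'old': */
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}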
-
-/*
- * Move keys from n1 (original replacement node, now lower node) to n2 (higher
- * node)
- */
-static void __btree_split_node(struct btree_update *as,
-			       struct btree_trans *trans,
-			       struct btree *b,
-			       struct btree *n[2],
-			       struct keylist *insert_keys)
-{
-	struct bkey_packed *k;
-	struct bpos n1_pos = POS_MIN;
-	struct btree_node_iter iter;
-	struct bset *bsets[2];
-	struct bkey_format_state format[2];
-	struct bkey_packed *out[2];
-	struct bkey uk;
-	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
-	struct { unsigned nr_keys, val_u64s; } nr_keys[2];
-	int i;
-
-	memset(&nr_keys, 0, sizeof(nr_keys));
-
-	for (i = 0; i < 2; i++) {
-		BUG_ON(n[i]->nsets != 1);
-
-		bsets[i] = btree_bset_first(n[i]);
-		out[i] = bsets[i]->start;
-
-		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
-		bch2_bkey_format_init(&format[i]);
-	}
-
-	u64s = 0;
-	for_each_btree_node_key(b, k, &iter) {
-		if (bkey_deleted(k))
-			continue;
-
-		uk = bkey_unpack_key(b, k);
-
-		if (b->c.level &&
-		    u64s < n1_u64s &&
-		    u64s + k->u64s >= n1_u64s &&
-		    (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
-		     key_deleted_in_insert(insert_keys, uk.p)))
-			n1_u64s += k->u64s;
-
-		i = u64s >= n1_u64s;
-		u64s += k->u64s;
-		if (!i)
-			n1_pos = uk.p;
-		bch2_bkey_format_add_key(&format[i], &uk);
-
-		nr_keys[i].nr_keys++;
-		nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
-	}
-
-	btree_set_min(n[0], b->data->min_key);
-	btree_set_max(n[0], n1_pos);
-	btree_set_min(n[1], bpos_successor(n1_pos));
-	btree_set_max(n[1], b->data->max_key);
-
-	for (i = 0; i < 2; i++) {
-		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
-		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
-		n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
-		unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
-			nr_keys[i].val_u64s;
-		if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
-			n[i]->data->format = b->format;
-
-		btree_node_set_format(n[i], n[i]->data->format);
-	}
-
-	u64s = 0;
-	for_each_btree_node_key(b, k, &iter) {
-		if (bkey_deleted(k))
-			continue;
-
-		i = u64s >= n1_u64s;
-		u64s += k->u64s;
-
-		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
-					? &b->format: &bch2_bkey_format_current, k))
-			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
-		else
-			bch2_bkey_unpack(b, (void *) out[i], k);
-
-		out[i]->needs_whiteout = false;
-
-		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
-		out[i] = bkey_p_next(out[i]);
-	}
-
-	for (i = 0; i < 2; i++) {
-		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
-		BUG_ON(!bsets[i]->u64s);
-
-		set_btree_bset_end(n[i], n[i]->set);
-
-		btree_node_reset_sib_u64s(n[i]);
-
-		bch2_verify_btree_nr_keys(n[i]);
-
-		BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
-	}
-}
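/*
 * [Editorial sketch - illustration only, not part of this patch]
 * __btree_split_node() above puts the pivot at roughly 3/5 of the node's
 * live u64s rather than 1/2 - presumably so that the higher node, which
 * receives further inserts in ascending-key workloads, starts out emptier.
 * The same accumulation on a plain array of key sizes: with live_u64s = 100
 * the target is 60, so keys go to the lower node until the running total
 * reaches 60.
 */
#include <stddef.h>

/* Returns how many leading keys go to the lower (first) node: */
static size_t toy_split_point(const unsigned *key_u64s, size_t nr_keys,
			      unsigned live_u64s)
{
	unsigned n1_u64s = live_u64s * 3 / 5;
	unsigned u64s = 0;
	size_t i;

	for (i = 0; i < nr_keys; i++) {
		if (u64s >= n1_u64s)	/* this key and the rest go to n2 */
			break;
		u64s += key_u64s[i];	/* key i goes to n1 */
	}
	return i;
}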
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert is from btree node coalescing, if we do the insert after
- * we do the split (and pick the pivot) - the pivot we pick might be between
- * nodes that were coalesced, and thus in the middle of a child node post
- * coalescing:
- */
-static int btree_split_insert_keys(struct btree_update *as,
-				   struct btree_trans *trans,
-				   btree_path_idx_t path_idx,
-				   struct btree *b,
-				   struct keylist *keys)
-{
-	struct btree_path *path = trans->paths + path_idx;
-
-	if (!bch2_keylist_empty(keys) &&
-	    bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
-		struct btree_node_iter node_iter;
-
-		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
-		int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
-		       btree_path_idx_t path, struct btree *b,
-		       struct keylist *keys)
-{
-	struct bch_fs *c = as->c;
-	struct btree *parent = btree_node_parent(trans->paths + path, b);
-	struct btree *n1, *n2 = NULL, *n3 = NULL;
-	btree_path_idx_t path1 = 0, path2 = 0;
-	u64 start_time = local_clock();
-	int ret = 0;
-
-	bch2_verify_btree_nr_keys(b);
-	BUG_ON(!parent && (b != btree_node_root(c, b)));
-	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
-	ret = bch2_btree_node_check_topology(trans, b);
-	if (ret)
-		return ret;
-
-	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
-		struct btree *n[2];
-
-		trace_and_count(c, btree_node_split, trans, b);
-
-		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
-		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
-		__btree_split_node(as, trans, b, n, keys);
-
-		if (keys) {
-			ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
-			      btree_split_insert_keys(as, trans, path, n2, keys);
-			if (ret)
-				goto err;
-			BUG_ON(!bch2_keylist_empty(keys));
-		}
-
-		bch2_btree_build_aux_trees(n2);
-		bch2_btree_build_aux_trees(n1);
-
-		bch2_btree_update_add_new_node(as, n1);
-		bch2_btree_update_add_new_node(as, n2);
-		six_unlock_write(&n2->c.lock);
-		six_unlock_write(&n1->c.lock);
-
-		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-		path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
-		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
-		/*
-		 * Note that on recursive parent_keys == keys, so we
-		 * can't start adding new keys to parent_keys before emptying it
-		 * out (which we did with btree_split_insert_keys() above)
-		 */
-		bch2_keylist_add(&as->parent_keys, &n1->key);
-		bch2_keylist_add(&as->parent_keys, &n2->key);
-
-		if (!parent) {
-			/* Depth increases, make a new root */
-			n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
-			bch2_btree_update_add_new_node(as, n3);
-			six_unlock_write(&n3->c.lock);
-
-			trans->paths[path2].locks_want++;
-			BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
-			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-			mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
-			bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
-			n3->sib_u64s[0] = U16_MAX;
-			n3->sib_u64s[1] = U16_MAX;
-
-			ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-			if (ret)
-				goto err;
-		}
-	} else {
-		trace_and_count(c, btree_node_compact, trans, b);
-
-		n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
-		if (keys) {
-			ret = btree_split_insert_keys(as, trans, path, n1, keys);
-			if (ret)
-				goto err;
-			BUG_ON(!bch2_keylist_empty(keys));
-		}
-
-		bch2_btree_build_aux_trees(n1);
-		bch2_btree_update_add_new_node(as, n1);
-		six_unlock_write(&n1->c.lock);
-
-		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
-		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
-		bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
-		if (parent)
-			bch2_keylist_add(&as->parent_keys, &n1->key);
-	}
-
-	/* New nodes all written, now make them visible: */
-
-	if (parent) {
-		/* Split a non root node */
-		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-	} else if (n3) {
-		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
-	} else {
-		/* Root filled up but didn't need to be split */
-		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
-	}
-
-	if (ret)
-		goto err;
-
-	bch2_btree_interior_update_will_free_node(as, b);
-
-	if (n3) {
-		bch2_btree_update_get_open_buckets(as, n3);
-		bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
-	}
-	if (n2) {
-		bch2_btree_update_get_open_buckets(as, n2);
-		bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
-	}
-	bch2_btree_update_get_open_buckets(as, n1);
-	bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
-
-	/*
-	 * The old node must be freed (in memory) _before_ unlocking the new
-	 * nodes - else another thread could re-acquire a read lock on the old
-	 * node after another thread has locked and updated the new node, thus
-	 * seeing stale data:
-	 */
-	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
-	if (n3)
-		bch2_trans_node_add(trans, trans->paths + path, n3);
-	if (n2)
-		bch2_trans_node_add(trans, trans->paths + path2, n2);
-	bch2_trans_node_add(trans, trans->paths + path1, n1);
-
-	if (n3)
-		six_unlock_intent(&n3->c.lock);
-	if (n2)
-		six_unlock_intent(&n2->c.lock);
-	six_unlock_intent(&n1->c.lock);
-out:
-	if (path2) {
-		__bch2_btree_path_unlock(trans, trans->paths + path2);
-		bch2_path_put(trans, path2, true);
-	}
-	if (path1) {
-		__bch2_btree_path_unlock(trans, trans->paths + path1);
-		bch2_path_put(trans, path1, true);
-	}
-
-	bch2_trans_verify_locks(trans);
-
-	bch2_time_stats_update(&c->times[n2
-			       ? BCH_TIME_btree_node_split
-			       : BCH_TIME_btree_node_compact],
-			       start_time);
-	return ret;
-err:
-	if (n3)
-		bch2_btree_node_free_never_used(as, trans, n3);
-	if (n2)
-		bch2_btree_node_free_never_used(as, trans, n2);
-	bch2_btree_node_free_never_used(as, trans, n1);
-	goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as:			btree_update object
- * @trans:		btree_trans object
- * @path_idx:		path that points to current node
- * @b:			node to insert keys into
- * @keys:		list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
-				  btree_path_idx_t path_idx, struct btree *b,
-				  struct keylist *keys)
-{
-	struct bch_fs *c = as->c;
-	struct btree_path *path = trans->paths + path_idx, *linked;
-	unsigned i;
-	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
-	int old_live_u64s = b->nr.live_u64s;
-	int live_u64s_added, u64s_added;
-	int ret;
-
-	lockdep_assert_held(&c->gc_lock);
-	BUG_ON(!b->c.level);
-	BUG_ON(!as || as->b);
-	bch2_verify_keylist_sorted(keys);
-
-	if (!btree_node_intent_locked(path, b->c.level)) {
-		struct printbuf buf = PRINTBUF;
-		bch2_log_msg_start(c, &buf);
-		prt_printf(&buf, "%s(): node not locked at level %u\n",
-			   __func__, b->c.level);
-		bch2_btree_update_to_text(&buf, as);
-		bch2_btree_path_to_text(&buf, trans, path_idx);
-		bch2_fs_emergency_read_only2(c, &buf);
-
-		bch2_print_str(c, KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-		return -EIO;
-	}
-
-	ret = bch2_btree_node_lock_write(trans, path, &b->c);
-	if (ret)
-		return ret;
-
-	bch2_btree_node_prep_for_write(trans, path, b);
-
-	if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
-		bch2_btree_node_unlock_write(trans, path, b);
-		goto split;
-	}
-
-
-	ret = bch2_btree_node_check_topology(trans, b) ?:
-	      bch2_btree_insert_keys_interior(as, trans, path, b,
-					      path->l[b->c.level].iter, keys);
-	if (ret) {
-		bch2_btree_node_unlock_write(trans, path, b);
-		return ret;
-	}
-
-	trans_for_each_path_with_node(trans, b, linked, i)
-		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
-	bch2_trans_verify_paths(trans);
-
-	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
-	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-	if (u64s_added > live_u64s_added &&
-	    bch2_maybe_compact_whiteouts(c, b))
-		bch2_trans_node_reinit_iter(trans, b);
-
-	btree_update_updated_node(as, b);
-	bch2_btree_node_unlock_write(trans, path, b);
-	return 0;
-split:
-	/*
-	 * We could attempt to avoid the transaction restart, by calling
-	 * bch2_btree_path_upgrade() and allocating more nodes:
-	 */
-	if (b->c.level >= as->update_level_end) {
-		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
-		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
-	}
-
-	return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
-			  btree_path_idx_t path,
-			  unsigned flags)
-{
-	/* btree_split & merge may both cause paths array to be reallocated */
-	struct btree *b = path_l(trans->paths + path)->b;
-	struct btree_update *as;
-	unsigned l;
-	int ret = 0;
-
-	as = bch2_btree_update_start(trans, trans->paths + path,
-				     trans->paths[path].level,
-				     true, 0, flags);
-	if (IS_ERR(as))
-		return PTR_ERR(as);
-
-	ret = btree_split(as, trans, path, b, NULL);
-	if (ret) {
-		bch2_btree_update_free(as, trans);
-		return ret;
-	}
-
-	bch2_btree_update_done(as, trans);
-
-	for (l = trans->paths[path].level + 1;
-	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
-	     l++)
-		ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
-	return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
-				   btree_path_idx_t path_idx)
-{
-	struct bch_fs *c = as->c;
-	struct btree_path *path = trans->paths + path_idx;
-	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
-	BUG_ON(!btree_node_locked(path, b->c.level));
-
-	n = __btree_root_alloc(as, trans, b->c.level + 1);
-
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	path->locks_want++;
-	BUG_ON(btree_node_locked(path, n->c.level));
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, path, n);
-
-	n->sib_u64s[0] = U16_MAX;
-	n->sib_u64s[1] = U16_MAX;
-
-	bch2_keylist_add(&as->parent_keys, &b->key);
-	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
-	int ret = bch2_btree_set_root(as, trans, path, n, true);
-	BUG_ON(ret);
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-	bch2_trans_node_add(trans, path, n);
-	six_unlock_intent(&n->c.lock);
-
-	mutex_lock(&c->btree_cache.lock);
-	list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
-	mutex_unlock(&c->btree_cache.lock);
-
-	bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
-	if (btree_node_fake(b))
-		return bch2_btree_split_leaf(trans, path, flags);
-
-	struct btree_update *as =
-		bch2_btree_update_start(trans, trans->paths + path, b->c.level,
-					true, 0, flags);
-	if (IS_ERR(as))
-		return PTR_ERR(as);
-
-	__btree_increase_depth(as, trans, path);
-	bch2_btree_update_done(as, trans);
-	return 0;
-}
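/*
 * [Editorial sketch - illustration only, not part of this patch] The merge
 * heuristic in __bch2_foreground_maybe_merge() below damps the estimated
 * merged-node size before caching it in b->sib_u64s[]: any excess over the
 * hysteresis floor is halved. Same arithmetic as a standalone helper; e.g.
 * with a floor of 1000 u64s: 3000 -> 2000, 1200 -> 1100, 800 -> 800
 * (unchanged).
 */
static unsigned long toy_damp_sib_u64s(unsigned long sib_u64s,
				       unsigned long hysteresis)
{
	if (sib_u64s > hysteresis) {
		sib_u64s -= hysteresis;	/* excess over the floor */
		sib_u64s /= 2;		/* halve the excess */
		sib_u64s += hysteresis;	/* add the floor back */
	}
	return sib_u64s;
}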
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
-				  btree_path_idx_t path,
-				  unsigned level,
-				  unsigned flags,
-				  enum btree_node_sibling sib)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_update *as;
-	struct bkey_format_state new_s;
-	struct bkey_format new_f;
-	struct bkey_i delete;
-	struct btree *b, *m, *n, *prev, *next, *parent;
-	struct bpos sib_pos;
-	size_t sib_u64s;
-	enum btree_id btree = trans->paths[path].btree_id;
-	btree_path_idx_t sib_path = 0, new_path = 0;
-	u64 start_time = local_clock();
-	int ret = 0;
-
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-	BUG_ON(!trans->paths[path].should_be_locked);
-	BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
-	/*
-	 * Work around a deadlock caused by the btree write buffer not doing
-	 * merges and leaving tons of merges for us to do - we really don't need
-	 * to be doing merges at all from the interior update path, and if the
-	 * interior update path is generating too many new interior updates we
-	 * deadlock:
-	 */
-	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
-		return 0;
-
-	if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
-		flags &= ~BCH_WATERMARK_MASK;
-		flags |= BCH_WATERMARK_btree;
-		flags |= BCH_TRANS_COMMIT_journal_reclaim;
-	}
-
-	b = trans->paths[path].l[level].b;
-
-	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
-	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
-		b->sib_u64s[sib] = U16_MAX;
-		return 0;
-	}
-
-	sib_pos = sib == btree_prev_sib
-		? bpos_predecessor(b->data->min_key)
-		: bpos_successor(b->data->max_key);
-
-	sib_path = bch2_path_get(trans, btree, sib_pos,
-				 U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
-	ret = bch2_btree_path_traverse(trans, sib_path, false);
-	if (ret)
-		goto err;
-
-	btree_path_set_should_be_locked(trans, trans->paths + sib_path);
-
-	m = trans->paths[sib_path].l[level].b;
-
-	if (btree_node_parent(trans->paths + path, b) !=
-	    btree_node_parent(trans->paths + sib_path, m)) {
-		b->sib_u64s[sib] = U16_MAX;
-		goto out;
-	}
-
-	if (sib == btree_prev_sib) {
-		prev = m;
-		next = b;
-	} else {
-		prev = b;
-		next = m;
-	}
-
-	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
-		struct printbuf buf = PRINTBUF;
-
-		printbuf_indent_add_nextline(&buf, 2);
-		prt_printf(&buf, "%s(): ", __func__);
-		ret = __bch2_topology_error(c, &buf);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "prev ends at ");
-		bch2_bpos_to_text(&buf, prev->data->max_key);
-		prt_newline(&buf);
-
-		prt_printf(&buf, "next starts at ");
-		bch2_bpos_to_text(&buf, next->data->min_key);
-
-		bch_err(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-		goto err;
-	}
-
-	bch2_bkey_format_init(&new_s);
-	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
-	__bch2_btree_calc_format(&new_s, prev);
-	__bch2_btree_calc_format(&new_s, next);
-	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
-	new_f = bch2_bkey_format_done(&new_s);
-
-	sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
-		btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
-	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
-		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-		sib_u64s /= 2;
-		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
-	}
-
-	sib_u64s = min(sib_u64s, btree_max_u64s(c));
-	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
-	b->sib_u64s[sib] = sib_u64s;
-
-	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
-		goto out;
-
-	parent = btree_node_parent(trans->paths + path, b);
-	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
-				     0, BCH_TRANS_COMMIT_no_enospc|flags);
-	ret = PTR_ERR_OR_ZERO(as);
-	if (ret)
-		goto err;
-
-	as->node_start = prev->data->min_key;
-	as->node_end = next->data->max_key;
-
-	trace_and_count(c, btree_node_merge, trans, b);
-
-	n = bch2_btree_node_alloc(as, trans, b->c.level);
-
-	SET_BTREE_NODE_SEQ(n->data,
-			   max(BTREE_NODE_SEQ(b->data),
-			       BTREE_NODE_SEQ(m->data)) + 1);
-
-	btree_set_min(n, prev->data->min_key);
-	btree_set_max(n, next->data->max_key);
-
-	n->data->format = new_f;
-	btree_node_set_format(n, new_f);
-
-	bch2_btree_sort_into(c, n, prev);
-	bch2_btree_sort_into(c, n, next);
-
-	bch2_btree_build_aux_trees(n);
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-	bkey_init(&delete.k);
-	delete.k.p = prev->key.k.p;
-	bch2_keylist_add(&as->parent_keys, &delete);
-	bch2_keylist_add(&as->parent_keys, &n->key);
-
-	bch2_trans_verify_paths(trans);
-
-	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-	if (ret)
-		goto err_free_update;
-
-	bch2_btree_interior_update_will_free_node(as, b);
-	bch2_btree_interior_update_will_free_node(as, m);
-
-	bch2_trans_verify_paths(trans);
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
-	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
-	bch2_trans_node_add(trans, trans->paths + path, n);
-
-	bch2_trans_verify_paths(trans);
-
-	six_unlock_intent(&n->c.lock);
-
-	bch2_btree_update_done(as, trans);
-
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
-	if (new_path)
-		bch2_path_put(trans, new_path, true);
-	bch2_path_put(trans, sib_path, true);
-	bch2_trans_verify_locks(trans);
-	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
-		ret = 0;
-	if (!ret)
-		ret = bch2_trans_relock(trans);
-	return ret;
-err_free_update:
-	bch2_btree_node_free_never_used(as, trans, n);
-	bch2_btree_update_free(as, trans);
-	goto out;
-}
-
-static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
-			    struct btree *b)
-{
-	bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
-				  BTREE_MAX_DEPTH, b->c.level,
-				  BTREE_ITER_intent);
-	int ret = bch2_btree_iter_traverse(trans, iter);
-	if (ret)
-		goto err;
-
-	/* has node been freed? */
-	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
-		/* node has been freed: */
-		BUG_ON(!btree_node_dying(b));
-		ret = bch_err_throw(trans->c, btree_node_dying);
-		goto err;
-	}
-
-	BUG_ON(!btree_node_hashed(b));
-	return 0;
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return ret;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
-			    struct btree_iter *iter,
-			    struct btree *b,
-			    unsigned target,
-			    unsigned flags)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *n, *parent;
-	struct btree_update *as;
-	btree_path_idx_t new_path = 0;
-	int ret;
-
-	flags |= BCH_TRANS_COMMIT_no_enospc;
-
-	struct btree_path *path = btree_iter_path(trans, iter);
-	parent = btree_node_parent(path, b);
-	as = bch2_btree_update_start(trans, path, b->c.level,
-				     false, target, flags);
-	ret = PTR_ERR_OR_ZERO(as);
-	if (ret)
-		goto out;
-
-	n = bch2_btree_node_alloc_replacement(as, trans, b);
-
-	bch2_btree_build_aux_trees(n);
-	bch2_btree_update_add_new_node(as, n);
-	six_unlock_write(&n->c.lock);
-
-	new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
-	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
-	bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
-	trace_and_count(c, btree_node_rewrite, trans, b);
-
-	if (parent) {
-		bch2_keylist_add(&as->parent_keys, &n->key);
-		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
-	} else {
-		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
-	}
-
-	if (ret)
-		goto err;
-
-	bch2_btree_interior_update_will_free_node(as, b);
-
-	bch2_btree_update_get_open_buckets(as, n);
-	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
-	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
-	bch2_trans_node_add(trans, trans->paths + iter->path, n);
-	six_unlock_intent(&n->c.lock);
-
-	bch2_btree_update_done(as, trans);
-out:
-	if (new_path)
-		bch2_path_put(trans, new_path, true);
-	bch2_trans_downgrade(trans);
-	return ret;
-err:
-	bch2_btree_node_free_never_used(as, trans, n);
-	bch2_btree_update_free(as, trans);
-	goto out;
-}
-
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
-				enum btree_id btree, unsigned level,
-				struct bkey_i *k, unsigned flags)
-{
-	struct btree_iter iter;
-	bch2_trans_node_iter_init(trans, &iter,
-				  btree, k->k.p,
-				  BTREE_MAX_DEPTH, level, 0);
-	struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-	int ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto out;
-
-	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
-	ret = found
-		? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
-		: -ENOENT;
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
-				enum btree_id btree, unsigned level,
-				struct bpos pos,
-				unsigned target,
-				unsigned flags)
-{
-	BUG_ON(!level);
-
-	/* Traverse one depth lower to get a pointer to the node itself: */
-	struct btree_iter iter;
-	bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
-	struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
-	int ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		goto err;
-
-	ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
-					 struct btree *b, unsigned flags)
-{
-	struct btree_iter iter;
-	int ret = get_iter_to_node(trans, &iter, b);
-	if (ret)
-		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
-	ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-struct async_btree_rewrite {
-	struct bch_fs *c;
-	struct work_struct work;
-	struct list_head list;
-	enum btree_id btree_id;
-	unsigned level;
-	struct bkey_buf key;
-};
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
-	struct async_btree_rewrite *a =
-		container_of(work, struct async_btree_rewrite, work);
-	struct bch_fs *c = a->c;
-
-	int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
-					a->btree_id, a->level, a->key.k, 0));
-	if (!bch2_err_matches(ret, ENOENT) &&
-	    !bch2_err_matches(ret, EROFS))
-		bch_err_fn_ratelimited(c, ret);
-
-	spin_lock(&c->btree_node_rewrites_lock);
-	list_del(&a->list);
-	spin_unlock(&c->btree_node_rewrites_lock);
-
-	closure_wake_up(&c->btree_node_rewrites_wait);
-
-	bch2_bkey_buf_exit(&a->key, c);
-	enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
-	kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
-	struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (!a)
-		return;
-
-	a->c = c;
-	a->btree_id = b->c.btree_id;
-	a->level = b->c.level;
-	INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
-	bch2_bkey_buf_init(&a->key);
-	bch2_bkey_buf_copy(&a->key, c, &b->key);
-
-	bool now = false, pending = false;
-
-	spin_lock(&c->btree_node_rewrites_lock);
-	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
-	    enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
-		list_add(&a->list, &c->btree_node_rewrites);
-		now = true;
-	} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
-		list_add(&a->list, &c->btree_node_rewrites_pending);
-		pending = true;
-	}
-	spin_unlock(&c->btree_node_rewrites_lock);
-
-	if (now) {
-		queue_work(c->btree_node_rewrite_worker, &a->work);
-	} else if (pending) {
-		/* bch2_do_pending_node_rewrites will execute */
-	} else {
-		bch2_bkey_buf_exit(&a->key, c);
-		kfree(a);
-	}
-}
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
-{
-	closure_wait_event(&c->btree_node_rewrites_wait,
-			   list_empty(&c->btree_node_rewrites));
-}
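/*
 * [Editorial sketch - illustration only, not part of this patch]
 * bch2_do_pending_node_rewrites() below uses a common drain idiom: pop one
 * entry at a time under the lock, then hand it to the (potentially sleeping)
 * worker with the lock dropped. A minimal userspace version, with a singly
 * linked list and a pthread mutex standing in for the spinlock:
 */
#include <pthread.h>
#include <stddef.h>

struct toy_pending {
	struct toy_pending *next;
};

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_pending *toy_list;

static void toy_drain_pending(void (*process)(struct toy_pending *))
{
	while (1) {
		pthread_mutex_lock(&toy_lock);
		struct toy_pending *p = toy_list;
		if (p)
			toy_list = p->next;
		pthread_mutex_unlock(&toy_lock);

		if (!p)
			break;

		process(p);	/* lock is dropped: safe to block here */
	}
}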
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
-	while (1) {
-		spin_lock(&c->btree_node_rewrites_lock);
-		struct async_btree_rewrite *a =
-			list_pop_entry(&c->btree_node_rewrites_pending,
-				       struct async_btree_rewrite, list);
-		if (a)
-			list_add(&a->list, &c->btree_node_rewrites);
-		spin_unlock(&c->btree_node_rewrites_lock);
-
-		if (!a)
-			break;
-
-		enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
-		queue_work(c->btree_node_rewrite_worker, &a->work);
-	}
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
-	while (1) {
-		spin_lock(&c->btree_node_rewrites_lock);
-		struct async_btree_rewrite *a =
-			list_pop_entry(&c->btree_node_rewrites_pending,
-				       struct async_btree_rewrite, list);
-		spin_unlock(&c->btree_node_rewrites_lock);
-
-		if (!a)
-			break;
-
-		bch2_bkey_buf_exit(&a->key, c);
-		kfree(a);
-	}
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
-					struct btree_iter *iter,
-					struct btree *b, struct btree *new_hash,
-					struct bkey_i *new_key,
-					unsigned commit_flags,
-					bool skip_triggers)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_iter iter2 = {};
-	struct btree *parent;
-	int ret;
-
-	if (!skip_triggers) {
-		ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
-					   bkey_i_to_s_c(&b->key),
-					   BTREE_TRIGGER_transactional) ?:
-		      bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
-					   bkey_i_to_s(new_key),
-					   BTREE_TRIGGER_transactional);
-		if (ret)
-			return ret;
-	}
-
-	if (new_hash) {
-		bkey_copy(&new_hash->key, new_key);
-		ret = bch2_btree_node_hash_insert(&c->btree_cache,
-				new_hash, b->c.level, b->c.btree_id);
-		BUG_ON(ret);
-	}
-
-	parent = btree_node_parent(btree_iter_path(trans, iter), b);
-	if (parent) {
-		bch2_trans_copy_iter(trans, &iter2, iter);
-
-		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
-				iter2.flags & BTREE_ITER_intent,
-				_THIS_IP_);
-
-		struct btree_path *path2 = btree_iter_path(trans, &iter2);
-		BUG_ON(path2->level != b->c.level);
-		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
-		btree_path_set_level_up(trans, path2);
-
-		trans->paths_sorted = false;
-
-		ret = bch2_btree_iter_traverse(trans, &iter2) ?:
-		      bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
-		if (ret)
-			goto err;
-	} else {
-		BUG_ON(btree_node_root(c, b) != b);
-
-		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
-				jset_u64s(new_key->k.u64s));
-		ret = PTR_ERR_OR_ZERO(e);
-		if (ret)
-			return ret;
-
-		journal_entry_set(e,
-				  BCH_JSET_ENTRY_btree_root,
-				  b->c.btree_id, b->c.level,
-				  new_key, new_key->k.u64s);
-	}
-
-	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
-	if (ret)
-		goto err;
-
-	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-
-		__bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-		bkey_copy(&b->key, new_key);
-		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-		BUG_ON(ret);
-		mutex_unlock(&c->btree_cache.lock);
-	} else {
-		bkey_copy(&b->key, new_key);
-	}
-
-	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
-	bch2_trans_iter_exit(trans, &iter2);
-	return ret;
-err:
-	if (new_hash) {
-		mutex_lock(&c->btree_cache.lock);
-		bch2_btree_node_hash_remove(&c->btree_cache, b);
-		mutex_unlock(&c->btree_cache.lock);
-	}
-	goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
-			       struct btree *b, struct bkey_i *new_key,
-			       unsigned commit_flags, bool skip_triggers)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *new_hash = NULL;
-	struct btree_path *path = btree_iter_path(trans, iter);
-	struct closure cl;
-	int ret = 0;
-
-	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
-	if (ret)
-		return ret;
-
-	closure_init_stack(&cl);
-
-	/*
-	 * check btree_ptr_hash_val() after @b is locked by
-	 * btree_iter_traverse():
-	 */
-	if (btree_ptr_hash_val(new_key) != b->hash_val) {
-		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-		if (ret) {
-			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
-			if (ret)
-				return ret;
-		}
-
-		new_hash = bch2_btree_node_mem_alloc(trans, false);
-		ret = PTR_ERR_OR_ZERO(new_hash);
-		if (ret)
-			goto err;
-	}
-
-	path->intent_ref++;
-	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
-					   commit_flags, skip_triggers);
-	--path->intent_ref;
-
-	if (new_hash)
-		bch2_btree_node_to_freelist(c, new_hash);
-err:
-	closure_sync(&cl);
-	bch2_btree_cache_cannibalize_unlock(trans);
-	return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
-					struct btree *b, struct bkey_i *new_key,
-					unsigned commit_flags, bool skip_triggers)
-{
-	struct btree_iter iter;
-	int ret = get_iter_to_node(trans, &iter, b);
-	if (ret)
-		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
-	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
-			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
-	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
-					 commit_flags, skip_triggers);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
-	BUG_ON(btree_node_root(c, b));
-
-	bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
-	struct bch_fs *c = trans->c;
-	struct closure cl;
-	struct btree *b;
-	int ret;
-
-	closure_init_stack(&cl);
-
-	do {
-		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
-		closure_sync(&cl);
-	} while (ret);
-
-	b = bch2_btree_node_mem_alloc(trans, false);
-	bch2_btree_cache_cannibalize_unlock(trans);
-
-	ret = PTR_ERR_OR_ZERO(b);
-	if (ret)
-		return ret;
-
-	set_btree_node_fake(b);
-	set_btree_node_need_rewrite(b);
-	b->c.level = level;
-	b->c.btree_id = id;
-
-	bkey_btree_ptr_init(&b->key);
-	b->key.k.p = SPOS_MAX;
-	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
-	bch2_bset_init_first(b, &b->data->keys);
-	bch2_btree_build_aux_trees(b);
-
-	b->data->flags = 0;
-	btree_set_min(b, POS_MIN);
-	btree_set_max(b, SPOS_MAX);
-	b->data->format = bch2_btree_calc_format(b);
-	btree_node_set_format(b, b->data->format);
-
-	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
-					  b->c.level, b->c.btree_id);
-	BUG_ON(ret);
-
-	bch2_btree_set_root_inmem(c, b);
-
-	six_unlock_write(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-	return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
-	bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
-	prt_printf(out, "%ps: ", (void *) as->ip_started);
-	bch2_trans_commit_flags_to_text(out, as->flags);
-
-	prt_str(out, " ");
-	bch2_btree_id_to_text(out, as->btree_id);
-	prt_printf(out, " l=%u-%u ",
-		   as->update_level_start,
-		   as->update_level_end);
-	bch2_bpos_to_text(out, as->node_start);
-	prt_char(out, ' ');
-	bch2_bpos_to_text(out, as->node_end);
-	prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
-		   as->node_written,
-		   as->node_sectors,
-		   as->node_remaining,
-		   btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
-
-	prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
-		   bch2_btree_update_modes[as->mode],
-		   as->nodes_written,
-		   closure_nr_remaining(&as->cl),
-		   as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct btree_update *as;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		bch2_btree_update_to_text(out, as);
-	mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
-	bool ret;
-
-	mutex_lock(&c->btree_interior_update_lock);
-	ret = !list_empty(&c->btree_interior_update_list);
-	mutex_unlock(&c->btree_interior_update_lock);
-
-	return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
-	bool ret = bch2_btree_interior_updates_pending(c);
-
-	if (ret)
-		closure_wait_event(&c->btree_interior_update_wait,
-				   !bch2_btree_interior_updates_pending(c));
-	return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
-	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
-	mutex_lock(&c->btree_root_lock);
-
-	r->level = entry->level;
-	r->alive = true;
-	bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
-	mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
-				    struct jset_entry *end,
-				    unsigned long skip)
-{
-	unsigned i;
-
-	mutex_lock(&c->btree_root_lock);
-
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
-		struct btree_root *r = bch2_btree_id_root(c, i);
-
-		if (r->alive && !test_bit(i, &skip)) {
-			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
-					  i, r->level, &r->key, r->key.k.u64s);
-			end = vstruct_next(end);
-		}
-	}
-
-	mutex_unlock(&c->btree_root_lock);
-
-	return end;
-}
-
-static void bch2_btree_alloc_to_text(struct printbuf *out,
-				     struct bch_fs *c,
-				     struct btree_alloc *a)
-{
-	printbuf_indent_add(out, 2);
-	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
-	prt_newline(out);
-
-	struct open_bucket *ob;
-	unsigned i;
-	open_bucket_for_each(c, &a->ob, ob, i)
-		bch2_open_bucket_to_text(out, c, ob);
-
-	printbuf_indent_sub(out, 2);
-}
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
-		bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
-	WARN_ON(!list_empty(&c->btree_node_rewrites));
-	WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
-
-	if (c->btree_node_rewrite_worker)
-		destroy_workqueue(c->btree_node_rewrite_worker);
-	if (c->btree_interior_update_worker)
-		destroy_workqueue(c->btree_interior_update_worker);
-	mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
-	mutex_init(&c->btree_reserve_cache_lock);
-	INIT_LIST_HEAD(&c->btree_interior_update_list);
-	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
-	mutex_init(&c->btree_interior_update_lock);
-	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
-	INIT_LIST_HEAD(&c->btree_node_rewrites);
-	INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
-	spin_lock_init(&c->btree_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
-	c->btree_interior_update_worker =
-		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
-	if (!c->btree_interior_update_worker)
-		return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
-	c->btree_node_rewrite_worker =
-		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
-	if (!c->btree_node_rewrite_worker)
-		return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
-	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
-				      sizeof(struct btree_update)))
-		return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
-
-	return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644
index ac04e45a851594..00000000000000
--- a/fs/bcachefs/btree_update_interior.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
-#define BTREE_UPDATE_NODES_MAX	((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES()	\
-	x(none)			\
-	x(node)			\
-	x(root)			\
-	x(update)
-
-enum btree_update_mode {
-#define x(n)	BTREE_UPDATE_##n,
-	BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- *
- */
-struct btree_update {
-	struct closure cl;
-	struct bch_fs *c;
-	u64 start_time;
-	unsigned long ip_started;
-
-	struct list_head list;
-	struct list_head unwritten_list;
-
-	enum btree_update_mode mode;
-	enum bch_trans_commit_flags flags;
-	unsigned nodes_written:1;
-	unsigned took_gc_lock:1;
-
-	enum btree_id btree_id;
-	struct bpos node_start;
-	struct bpos node_end;
-	enum btree_node_rewrite_reason node_needed_rewrite;
-	u16 node_written;
-	u16 node_sectors;
-	u16 node_remaining;
-
-	unsigned update_level_start;
-	unsigned update_level_end;
-
-	struct disk_reservation disk_res;
-
-	/*
-	 * BTREE_UPDATE_node:
-	 * The update that made the new nodes visible was a regular update to an
-	 * existing interior node - @b. We can't write out the update to @b
-	 * until the new nodes we created are finished writing, so we block @b
-	 * from writing by putting this btree_interior update on the
-	 * @b->write_blocked list with @write_blocked_list:
-	 */
-	struct btree *b;
-	struct list_head write_blocked_list;
-
-	/*
-	 * We may be freeing nodes that were dirty, and thus had journal entries
-	 * pinned: we need to transfer the oldest of those pins to the
-	 * btree_update operation, and release it when the new node(s)
-	 * are all persistent and reachable:
-	 */
-	struct journal_entry_pin journal;
-
-	/* Preallocated nodes we reserve when we start the update: */
-	struct prealloc_nodes {
-		struct btree *b[BTREE_UPDATE_NODES_MAX];
-		unsigned nr;
-	} prealloc_nodes[2];
-
-	/* Nodes being freed: */
-	struct keylist old_keys;
-	u64 _old_keys[BTREE_UPDATE_NODES_MAX *
-		      BKEY_BTREE_PTR_U64s_MAX];
-
-	/* Nodes being added: */
-	struct keylist new_keys;
-	u64 _new_keys[BTREE_UPDATE_NODES_MAX *
-		      BKEY_BTREE_PTR_U64s_MAX];
-
-	/* New nodes, that will be made reachable by this update: */
-	struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
-	unsigned nr_new_nodes;
-
-	struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
-	__le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
-	unsigned nr_old_nodes;
-
-	open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
-				       BCH_REPLICAS_MAX];
-	open_bucket_idx_t nr_open_buckets;
-
-	unsigned journal_u64s;
-	u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
-	/* Only here to reduce stack usage on recursive splits: */
-	struct keylist parent_keys;
-	/*
-	 * Enough room for btree_split's keys without realloc - btree node
-	 * pointers never have crc/compression info, so we only need to account
-	 * for the pointers for three keys
-	 */
-	u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
-						  struct btree_trans *,
-						  struct btree *,
-						  struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
-				  unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-						      btree_path_idx_t path_idx,
-						      unsigned level, unsigned flags,
-						      enum btree_node_sibling sib)
-{
-	struct btree_path *path = trans->paths + path_idx;
-	struct btree *b;
-
-	EBUG_ON(!btree_node_locked(path, level));
-
-	if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
-		return 0;
-
-	b = path->l[level].b;
-	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
-		return 0;
-
-	return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-					      btree_path_idx_t path,
-					      unsigned level,
-					      unsigned flags)
-{
-	bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
-	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						   btree_prev_sib) ?:
-	       bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
-						   btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
-			    struct btree *, unsigned, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
-				enum btree_id, unsigned,
-				struct bkey_i *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
-				enum btree_id, unsigned,
-				struct bpos, unsigned, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
-					 struct btree *, unsigned);
-
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
-			       struct btree *, struct bkey_i *,
-			       unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
-					struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
-						     struct btree *b)
-{
-	unsigned depth = btree_node_root(c, b)->c.level + 1;
-
-	/*
-	 * Number of nodes we might have to allocate in a worst case btree
-	 * split operation - we split all the way up to the root, then allocate
-	 * a new root, unless we're already at max depth:
-	 */
-	if (depth < BTREE_MAX_DEPTH)
-		return (depth - b->c.level) * 2 + 1;
-	else
-		return (depth - b->c.level) * 2 - 1;
-}
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
-	b->sib_u64s[0] = b->nr.live_u64s;
-	b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
-	return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
-	return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
-	return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
-	return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
-	return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
-	return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
-	return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
-	ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
-		b->whiteout_u64s;
-	ssize_t total = btree_buf_bytes(b) >> 3;
-
-	/* Always leave one extra u64 for bch2_varint_decode: */
-	used++;
-
-	return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
-	ssize_t remaining = __bch2_btree_u64s_remaining(b,
-				btree_bkey_last(b, bset_tree_last(b)));
-
-	BUG_ON(remaining < 0);
-
-	if (bset_written(b, btree_bset_last(b)))
-		return 0;
-
-	return remaining;
-}
-
-#define BTREE_WRITE_SET_U64s_BITS	9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
-	/*
-	 * Could buffer up larger amounts of keys for btrees with larger keys,
-	 * pending benchmarking:
-	 */
-	return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
-	struct bset_tree *t = bset_tree_last(b);
-	struct btree_node_entry *bne = max(write_block(b),
-			(void *) btree_bkey_last(b, t));
-	ssize_t remaining_space =
-		__bch2_btree_u64s_remaining(b, bne->keys.start);
-
-	if (unlikely(bset_written(b, bset(b, t)))) {
-		if (b->written + block_sectors(c) <= btree_sectors(c))
-			return bne;
-	} else {
-		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
-		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
-			return bne;
-	}
-
-	return NULL;
-}
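/*
 * [Editorial sketch - illustration only, not part of this patch] Worked
 * example for btree_update_reserve_required() above: a split may allocate
 * two nodes per level from the start level up to the root, plus one node for
 * a new root - unless the tree is already at maximum depth, in which case no
 * new root is possible. TOY_MAX_DEPTH stands in for BTREE_MAX_DEPTH.
 */
#include <assert.h>

#define TOY_MAX_DEPTH 4

static unsigned toy_reserve_required(unsigned root_level, unsigned level)
{
	unsigned depth = root_level + 1;

	return depth < TOY_MAX_DEPTH ? (depth - level) * 2 + 1
				     : (depth - level) * 2 - 1;
}

int main(void)
{
	/* root at level 2, split starting at a leaf: 3 levels * 2 + 1 new root */
	assert(toy_reserve_required(2, 0) == 7);
	/* tree already at max depth: same split, but no new root is possible */
	assert(toy_reserve_required(3, 0) == 7);
	return 0;
}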
-
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
-	struct bkey_packed k;
-
-	BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
-	EBUG_ON(btree_node_just_written(b));
-
-	if (!bkey_pack_pos(&k, pos, b)) {
-		struct bkey *u = (void *) &k;
-
-		bkey_init(u);
-		u->p = pos;
-	}
-
-	k.needs_whiteout = true;
-
-	b->whiteout_u64s += k.u64s;
-	bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
-	if (unlikely(btree_node_need_rewrite(b)))
-		return false;
-
-	return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
-					struct jset_entry *, unsigned long);
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644
index 4b095235a0d221..00000000000000
--- a/fs/bcachefs/btree_write_buffer.c
+++ /dev/null
@@ -1,893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include
-#include
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
-						 struct journal_entry_pin *, u64);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-	return (cmp_int(l->hi, r->hi) ?:
-		cmp_int(l->mi, r->mi) ?:
-		cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
-	int cmp;
-
-	asm("mov   (%[l]), %%rax;"
-	    "sub   (%[r]), %%rax;"
-	    "mov  8(%[l]), %%rax;"
-	    "sbb  8(%[r]), %%rax;"
-	    "mov 16(%[l]), %%rax;"
-	    "sbb 16(%[r]), %%rax;"
-	    : "=@ccae" (cmp)
-	    : [l] "r" (l), [r] "r" (r)
-	    : "rax", "cc");
-
-	EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
-	return cmp;
-#else
-	return __wb_key_ref_cmp(l, r);
-#endif
-}
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
-	const struct btree_write_buffered_key *l = _l;
-	const struct btree_write_buffered_key *r = _r;
-
-	return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
-	const struct wb_key_ref *l = _l;
-	const struct wb_key_ref *r = _r;
-
-	return !((l->hi ^ r->hi)|
-		 (l->mi ^ r->mi)|
-		 ((l->lo >> 24) ^ (r->lo >> 24)));
-}
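/*
 * [Editorial sketch - illustration only, not part of this patch] wb_sort()
 * below is essentially the bottom-up heapsort used by the kernel's
 * lib/sort.c, specialized to wb_key_ref. The variant descends the sift-down
 * path all the way to a leaf with one comparison per level, then backtracks
 * to the insertion point, instead of paying two comparisons per level on the
 * way down. The same algorithm on plain ints:
 */
#include <stddef.h>

static void toy_swap(int *a, int *b) { int t = *a; *a = *b; *b = t; }

static void toy_heapsort(int *base, size_t num)
{
	size_t n = num, a = num / 2;

	if (!a)			/* num < 2: nothing to do */
		return;

	for (;;) {
		size_t b, c, d;

		if (a)			/* building heap: sift down --a */
			--a;
		else if (--n)		/* sorting: extract root to --n */
			toy_swap(&base[0], &base[n]);
		else			/* sort complete */
			break;

		/* Descend to a leaf, taking the larger child at each level: */
		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = base[c] >= base[d] ? c : d;
		if (d == n)		/* last leaf has no sibling */
			b = c;

		/* Backtrack from "b" to where base[a] belongs: */
		while (b != a && base[a] >= base[b])
			b = (b - 1) / 2;

		c = b;			/* destination of base[a] */
		while (b != a) {	/* shift it into place */
			b = (b - 1) / 2;
			toy_swap(&base[b], &base[c]);
		}
	}
}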
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
-	size_t n = num, a = num / 2;
-
-	if (!a)		/* num < 2 || size == 0 */
-		return;
-
-	for (;;) {
-		size_t b, c, d;
-
-		if (a)			/* Building heap: sift down --a */
-			--a;
-		else if (--n)		/* Sorting: Extract root to --n */
-			swap(base[0], base[n]);
-		else			/* Sort complete */
-			break;
-
-		/*
-		 * Sift element at "a" down into heap.  This is the
-		 * "bottom-up" variant, which significantly reduces
-		 * calls to cmp_func(): we find the sift-down path all
-		 * the way to the leaves (one compare per level), then
-		 * backtrack to find where to insert the target element.
-		 *
-		 * Because elements tend to sift down close to the leaves,
-		 * this uses fewer compares than doing two per level
-		 * on the way down.  (A bit more than half as many on
-		 * average, 3/4 worst-case.)
-		 */
-		for (b = a; c = 2*b + 1, (d = c + 1) < n;)
-			b = wb_key_ref_cmp(base + c, base + d) ? c : d;
-		if (d == n)		/* Special case last leaf with no sibling */
-			b = c;
-
-		/* Now backtrack from "b" to the correct location for "a" */
-		while (b != a && wb_key_ref_cmp(base + a, base + b))
-			b = (b - 1) / 2;
-		c = b;			/* Where "a" belongs */
-		while (b != a) {	/* Shift it into place */
-			b = (b - 1) / 2;
-			swap(base[b], base[c]);
-		}
-	}
-}
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
-					  struct btree_iter *iter,
-					  struct btree_write_buffered_key *wb)
-{
-	struct btree_path *path = btree_iter_path(trans, iter);
-
-	bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
-	trans->journal_res.seq = wb->journal_seq;
-
-	return bch2_trans_update(trans, iter, &wb->k,
-				 BTREE_UPDATE_internal_snapshot_node) ?:
-	       bch2_trans_commit(trans, NULL, NULL,
-				 BCH_TRANS_COMMIT_no_enospc|
-				 BCH_TRANS_COMMIT_no_check_rw|
-				 BCH_TRANS_COMMIT_no_journal_res|
-				 BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
-			       struct btree_write_buffered_key *wb,
-			       bool *write_locked,
-			       bool *accounting_accumulated,
-			       size_t *fast)
-{
-	struct btree_path *path;
-	int ret;
-
-	EBUG_ON(!wb->journal_seq);
-	EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
-	EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
-	ret = bch2_btree_iter_traverse(trans, iter);
-	if (ret)
-		return ret;
-
-	if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
-		struct bkey u;
-		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
-
-		if (k.k->type == KEY_TYPE_accounting)
-			bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
-						   bkey_s_c_to_accounting(k));
-	}
-	*accounting_accumulated = true;
-
-	/*
-	 * We can't clone a path that has write locks: unshare it now, before
-	 * set_pos and traverse():
-	 */
-	if (btree_iter_path(trans, iter)->ref > 1)
-		iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
-	path = btree_iter_path(trans, iter);
-
-	if (!*write_locked) {
-		ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
-		if (ret)
-			return ret;
-
-		bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
-		*write_locked = true;
-	}
-
-	if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
-		*write_locked = false;
-		return wb_flush_one_slowpath(trans, iter, wb);
-	}
-
-	EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
-
-	bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
-	(*fast)++;
-	return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
For example, the key may - * have already been modified in the active write buffer in a seq that comes - * before the current transaction. If we were to journal this key again and - * crash, recovery would process updates in the wrong order. - */ -static int -btree_write_buffered_insert(struct btree_trans *trans, - struct btree_write_buffered_key *wb) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_cached|BTREE_ITER_intent); - - trans->journal_res.seq = wb->journal_seq; - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) -{ - struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); - struct journal *j = &c->journal; - - if (!wb->inc.keys.nr) - return; - - bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); - darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { - swap(wb->flushing.keys, wb->inc.keys); - goto out; - } - - size_t nr = min(darray_room(wb->flushing.keys), - wb->sorted.size - wb->flushing.keys.nr); - nr = min(nr, wb->inc.keys.nr); - - memcpy(&darray_top(wb->flushing.keys), - wb->inc.keys.data, - sizeof(wb->inc.keys.data[0]) * nr); - - memmove(wb->inc.keys.data, - wb->inc.keys.data + nr, - sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); - - wb->flushing.keys.nr += nr; - wb->inc.keys.nr -= nr; -out: - if (!wb->inc.keys.nr) - bch2_journal_pin_drop(j, &wb->inc.pin); - else - bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, - bch2_btree_write_buffer_journal_flush); - - if (j->watermark) { - spin_lock(&j->lock); - bch2_journal_set_watermark(j); - spin_unlock(&j->lock); - } - - BUG_ON(wb->sorted.size < wb->flushing.keys.nr); -} - -int bch2_btree_write_buffer_insert_err(struct bch_fs *c, - enum btree_id btree, struct bkey_i *k) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "attempting to do write buffer update on non wb btree="); - bch2_btree_id_to_text(&buf, btree); - prt_str(&buf, "\n"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EROFS; -} - -static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = {}; - size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; - bool write_locked = false; - bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); - int ret = 0; - - ret = bch2_journal_error(&c->journal); - if (ret) - return ret; - - bch2_trans_unlock(trans); - bch2_trans_begin(trans); - - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - mutex_unlock(&wb->inc.lock); - - for (size_t i = 0; i < wb->flushing.keys.nr; i++) { - wb->sorted.data[i].idx = i; - wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; - memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); - } - wb->sorted.nr = wb->flushing.keys.nr; - - /* - * We first sort so that we can detect and 
skip redundant updates, and - * then we attempt to flush in sorted btree order, as this is most - * efficient. - * - * However, since we're not flushing in the order they appear in the - * journal we won't be able to drop our journal pin until everything is - * flushed - which means this could deadlock the journal if we weren't - * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail - * if it would block taking a journal reservation. - * - * If that happens, simply skip the key so we can optimistically insert - * as many keys as possible in the fast path. - */ - wb_sort(wb->sorted.data, wb->sorted.nr); - - darray_for_each(wb->sorted, i) { - struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - - if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); - goto err; - } - - for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) - prefetch(&wb->flushing.keys.data[n->idx]); - - BUG_ON(!k->journal_seq); - - if (!accounting_replay_done && - k->k.k.type == KEY_TYPE_accounting) { - slowpath++; - continue; - } - - if (i + 1 < &darray_top(wb->sorted) && - wb_key_eq(i, i + 1)) { - struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - - if (k->k.k.type == KEY_TYPE_accounting && - n->k.k.type == KEY_TYPE_accounting) - bch2_accounting_accumulate(bkey_i_to_accounting(&n->k), - bkey_i_to_s_c_accounting(&k->k)); - - overwritten++; - n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); - k->journal_seq = 0; - continue; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - - if (path->btree_id != i->btree || - bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - write_locked = false; - - ret = lockrestart_do(trans, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_foreground_maybe_merge(trans, iter.path, 0, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc)); - if (ret) - goto err; - } - } - - if (!iter.path || iter.btree_id != k->btree) { - bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - } - - bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); - btree_iter_path(trans, &iter)->preserve = false; - - bool accounting_accumulated = false; - do { - if (race_fault()) { - ret = bch_err_throw(c, journal_reclaim_would_deadlock); - break; - } - - ret = wb_flush_one(trans, &iter, k, &write_locked, - &accounting_accumulated, &fast); - if (!write_locked) - bch2_trans_begin(trans); - } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - if (!ret) { - k->journal_seq = 0; - } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { - slowpath++; - ret = 0; - } else - break; - } - - if (write_locked) { - struct btree_path *path = btree_iter_path(trans, &iter); - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - if (slowpath) { - /* - * Flush in the order they were present in the journal, so that - * we can release journal pins: - * The fastpath zapped the seq of keys that were successfully flushed so - * we can skip those here. 
- */ - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - - sort_nonatomic(wb->flushing.keys.data, - wb->flushing.keys.nr, - sizeof(wb->flushing.keys.data[0]), - wb_key_seq_cmp, NULL); - - darray_for_each(wb->flushing.keys, i) { - if (!i->journal_seq) - continue; - - if (!accounting_replay_done && - i->k.k.type == KEY_TYPE_accounting) { - could_not_insert++; - continue; - } - - if (!could_not_insert) - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - bch2_trans_begin(trans); - - ret = commit_do(trans, NULL, NULL, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res, - btree_write_buffered_insert(trans, i)); - if (ret) - goto err; - - i->journal_seq = 0; - } - - /* - * If journal replay hasn't finished with accounting keys we - * can't flush accounting keys at all - condense them and leave - * them for next time. - * - * Q: Can the write buffer overflow? - * A: Shouldn't be any actual risk. It's just new accounting - * updates that the write buffer can't flush, and those are only - * going to be generated by interior btree node updates as - * journal replay has to split/rewrite nodes to make room for - * its updates. - * - * And for those new accounting updates, updates to the same - * counters get accumulated as they're flushed from the journal - * to the write buffer - see the patch for eytzinger tree - * accumulation. So we could only overflow if the number of - * distinct counters touched somehow was very large. - */ - if (could_not_insert) { - struct btree_write_buffered_key *dst = wb->flushing.keys.data; - - darray_for_each(wb->flushing.keys, i) - if (i->journal_seq) - *dst++ = *i; - wb->flushing.keys.nr = dst - wb->flushing.keys.data; - } - } -err: - if (ret || !could_not_insert) { - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; - } - - bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0); - return ret; -} - -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) -{ - struct journal *j = &c->journal; - struct journal_buf *buf; - bool blocked; - int ret = 0; - - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { - ret = bch2_journal_keys_to_write_buffer(c, buf); - - if (!blocked && !ret) { - spin_lock(&j->lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&j->lock); - } - - mutex_unlock(&j->buf_lock); - - if (blocked) { - bch2_journal_unblock(j); - break; - } - } - - return ret; -} - -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, - bool *did_work) -{ - struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0, fetch_from_journal_err; - - do { - bch2_trans_unlock(trans); - - 
fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); - - *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; - - /* - * On memory allocation failure, bch2_btree_write_buffer_flush_locked() - * is not guaranteed to empty wb->inc: - */ - mutex_lock(&wb->flushing.lock); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } while (!ret && - (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); - - return ret; -} - -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool did_work = false; - - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); -} - -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - bool did_work = false; - - trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); -} - -/* - * The write buffer requires flushing when going RO: keys in the journal for the - * write buffer don't have a journal pin yet - */ -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) -{ - if (bch2_journal_error(&c->journal)) - return false; - - bool did_work = false; - bch2_trans_run(c, btree_write_buffer_flush_seq(trans, - journal_cur_seq(&c->journal), &did_work)); - return did_work; -} - -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0; - - if (mutex_trylock(&wb->flushing.lock)) { - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); - } - - return ret; -} - -int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return bch_err_throw(c, erofs_no_writes); - - int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - return ret; -} - -/* - * In check and repair code, when checking references to write buffer btrees we - * need to issue a flush before we have a definitive error: this issues a flush - * if this is a key we haven't yet checked. - */ -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_buf tmp; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - - if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { - if (trace_write_buffer_maybe_flush_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, referring_k); - trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } - - bch2_bkey_buf_reassemble(&tmp, c, referring_k); - - if (bkey_is_btree_ptr(referring_k.k)) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(last_flushed, c, tmp.k); - - /* can we avoid the unconditional restart? 
*/ - trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); - ret = bch_err_throw(c, transaction_restart_write_buffer_flush); - } -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; -} - -static void bch2_btree_write_buffer_flush_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; - - mutex_lock(&wb->flushing.lock); - do { - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); -} - -static void wb_accounting_sort(struct btree_write_buffer *wb) -{ - eytzinger0_sort(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, NULL); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree, - struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key new = { .btree = btree }; - - bkey_copy(&new.k, &k->k_i); - - int ret = darray_push(&wb->accounting, new); - if (ret) - return ret; - - wb_accounting_sort(wb); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; -retry: - ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); - if (!ret && dst->wb == &wb->flushing) - ret = darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (unlikely(ret)) { - if (dst->wb == &c->btree_write_buffer.flushing) { - mutex_unlock(&dst->wb->lock); - dst->wb = &c->btree_write_buffer.inc; - bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - goto retry; - } - - return ret; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - BUG_ON(!dst->room); - BUG_ON(!dst->seq); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - if (mutex_trylock(&wb->flushing.lock)) { - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - - /* - * Attempt to skip wb->inc, and add keys directly to - * wb->flushing, saving us a copy later: - */ - - if (!wb->inc.keys.nr) { - dst->wb = &wb->flushing; - } else { - mutex_unlock(&wb->flushing.lock); - dst->wb = &wb->inc; - } - } else { - mutex_lock(&wb->inc.lock); - dst->wb = &wb->inc; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - dst->seq = seq; - - bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - - darray_for_each(wb->accounting, i) - memset(&i->k.v, 0, bkey_val_bytes(&i->k.k)); -} - -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - unsigned live_accounting_keys = 0; - int ret = 0; - - darray_for_each(wb->accounting, i) - if 
(!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) { - i->journal_seq = dst->seq; - live_accounting_keys++; - ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k); - if (ret) - break; - } - - if (live_accounting_keys * 2 < wb->accounting.nr) { - struct btree_write_buffered_key *dst = wb->accounting.data; - - darray_for_each(wb->accounting, src) - if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k))) - *dst++ = *src; - wb->accounting.nr = dst - wb->accounting.data; - wb_accounting_sort(wb); - } - - if (!dst->wb->keys.nr) - bch2_journal_pin_drop(&c->journal, &dst->wb->pin); - - if (bch2_btree_write_buffer_should_flush(c) && - __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); - - if (dst->wb == &wb->flushing) - mutex_unlock(&wb->flushing.lock); - mutex_unlock(&wb->inc.lock); - - return ret; -} - -static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) -{ - if (wb->keys.size >= new_size) - return 0; - - if (!mutex_trylock(&wb->lock)) - return -EINTR; - - int ret = darray_resize(&wb->keys, new_size); - mutex_unlock(&wb->lock); - return ret; -} - -int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb_keys_resize(&wb->flushing, new_size) ?: - wb_keys_resize(&wb->inc, new_size); -} - -void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && - !bch2_journal_error(&c->journal)); - - darray_exit(&wb->accounting); - darray_exit(&wb->sorted); - darray_exit(&wb->flushing.keys); - darray_exit(&wb->inc.keys); -} - -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - mutex_init(&wb->inc.lock); - mutex_init(&wb->flushing.lock); - INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); -} - -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - /* Will be resized by journal as needed: */ - unsigned initial_size = 1 << 16; - - return darray_make_room(&wb->inc.keys, initial_size) ?: - darray_make_room(&wb->flushing.keys, initial_size) ?: - darray_make_room(&wb->sorted, initial_size); -} diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h deleted file mode 100644 index c351d21aca0b88..00000000000000 --- a/fs/bcachefs/btree_write_buffer.h +++ /dev/null @@ -1,113 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_H - -#include "bkey.h" -#include "disk_accounting.h" - -static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; -} - -static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; -} - -struct btree_trans; -int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); -int bch2_btree_write_buffer_tryflush(struct btree_trans *); - -struct 
bkey_buf; -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); - -struct journal_keys_to_wb { - struct btree_write_buffer_keys *wb; - size_t room; - u64 seq; -}; - -static inline int wb_key_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p); -} - -int bch2_accounting_key_to_wb_slowpath(struct bch_fs *, - enum btree_id, struct bkey_i_accounting *); - -static inline int bch2_accounting_key_to_wb(struct bch_fs *c, - enum btree_id btree, struct bkey_i_accounting *k) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key search; - search.btree = btree; - search.k.k.p = k->k.p; - - unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr, - sizeof(wb->accounting.data[0]), - wb_key_cmp, &search); - - if (idx >= wb->accounting.nr) - return bch2_accounting_key_to_wb_slowpath(c, btree, k); - - struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k); - bch2_accounting_accumulate(dst, accounting_i_to_s_c(k)); - return 0; -} - -int bch2_journal_key_to_wb_slowpath(struct bch_fs *, - struct journal_keys_to_wb *, - enum btree_id, struct bkey_i *); - -static inline int __bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!dst->room)) - return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -static inline int bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(c, btree, k); - dump_stack(); - return ret; - } - - EBUG_ON(!dst->seq); - - return k->k.type == KEY_TYPE_accounting - ? 
bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k)) - : __bch2_journal_key_to_wb(c, dst, btree, k); -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); - -int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); -void bch2_fs_btree_write_buffer_exit(struct bch_fs *); -void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); -int bch2_fs_btree_write_buffer_init(struct bch_fs *); - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h deleted file mode 100644 index e9e76e20f43b0b..00000000000000 --- a/fs/bcachefs/btree_write_buffer_types.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H - -#include "darray.h" -#include "journal_types.h" - -#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 -#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) - -struct wb_key_ref { -union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned idx:24; - u8 pos[sizeof(struct bpos)]; - enum btree_id btree:8; -#else - enum btree_id btree:8; - u8 pos[sizeof(struct bpos)]; - unsigned idx:24; -#endif - } __packed; - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - u64 lo; - u64 mi; - u64 hi; -#else - u64 hi; - u64 mi; - u64 lo; -#endif - }; -}; -}; - -struct btree_write_buffered_key { - enum btree_id btree:8; - u64 journal_seq:56; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -struct btree_write_buffer_keys { - DARRAY(struct btree_write_buffered_key) keys; - struct journal_entry_pin pin; - struct mutex lock; -}; - -struct btree_write_buffer { - DARRAY(struct wb_key_ref) sorted; - struct btree_write_buffer_keys inc; - struct btree_write_buffer_keys flushing; - struct work_struct flush_work; - - DARRAY(struct btree_write_buffered_key) accounting; -}; - -#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c deleted file mode 100644 index f25903c10e8a6f..00000000000000 --- a/fs/bcachefs/buckets.c +++ /dev/null @@ -1,1395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "buckets.h" -#include "buckets_waiting_for_journal.h" -#include "disk_accounting.h" -#include "ec.h" -#include "error.h" -#include "inode.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "reflink.h" -#include "replicas.h" -#include "subvolume.h" -#include "trace.h" - -#include <linux/preempt.h> - -void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) -{ - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) -{ - memset(usage, 0, sizeof(*usage)); - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, - sizeof(struct bch_dev_usage_full) / sizeof(u64)); -} - -static u64 reserve_factor(u64 r) -{ - return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -} - -static struct bch_fs_usage_short -__bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - u64 data, reserved; - - ret.capacity = c->capacity - - percpu_u64_get(&c->usage->hidden); - - data = percpu_u64_get(&c->usage->data) + - percpu_u64_get(&c->usage->btree); - reserved = percpu_u64_get(&c->usage->reserved) + - percpu_u64_get(c->online_reserved); - - ret.used = min(ret.capacity, data + reserve_factor(reserved)); - ret.free = ret.capacity - ret.used; - - ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes); - - return ret; -} - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *c) -{ - struct bch_fs_usage_short ret; - - percpu_down_read(&c->mark_lock); - ret = __bch2_fs_usage_read_short(c); - percpu_up_read(&c->mark_lock); - - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *out, - struct bch_dev *ca, - struct bch_dev_usage_full *usage) -{ - if (out->nr_tabstops < 5) { - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - } - - prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); - - for (unsigned i = 0; i < BCH_DATA_NR; i++) { - bch2_prt_data_type(out, i); - prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); - } - - prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); -} - -static int bch2_check_fix_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - bool *do_update) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (!ca) { - if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, - trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - return 0; - } - - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - if (!g) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to invalid bucket on device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - goto out; - } - - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - - if (fsck_err_on(!g->gen_valid, - trans, 
ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - /* this pointer will be dropped */ - *do_update = true; - goto out; - } - } - - /* g->gen_valid == true */ - - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - (g->data_type != BCH_DATA_btree || - data_type == BCH_DATA_btree)) { - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - - *do_update = true; - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - trans, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto out; - - if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), - trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (!p.ptr.cached && - data_type == BCH_DATA_btree) { - switch (g->data_type) { - case BCH_DATA_sb: - bch_err(c, "btree and superblock in the same bucket - cannot repair"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto out; - case BCH_DATA_journal: - ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); - bch_err_msg(c, ret, "error deleting journal bucket %zu", - PTR_BUCKET_NR(ca, &p.ptr)); - if (ret) - goto out; - break; - } - - g->data_type = data_type; - g->stripe_sectors = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } else { - *do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, - trans, ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), - trans, ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - *do_update = true; - } -out: -fsck_err: 
- bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_fix_ptrs(struct btree_trans *trans, - enum btree_id btree, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* We don't yet do btree key updates correctly for when we're RW */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { - ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); - if (ret) - goto err; - } - - if (do_update) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - scoped_guard(rcu) - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - scoped_guard(rcu) - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - - rcu_read_lock(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || - (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - goto restart_drop_ptrs; - } - } - rcu_read_unlock(); -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - if (!(flags & BTREE_TRIGGER_is_root)) { - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); - } else { - struct jset_entry *e = 
bch2_trans_jset_entry_alloc(trans, - jset_u64s(new->k.u64s)); - ret = PTR_ERR_OR_ZERO(e); - if (ret) - goto err; - - journal_entry_set(e, - BCH_JSET_ENTRY_btree_root, - btree, level - 1, - new, new->k.u64s); - - /* - * no locking, we're single threaded and not rw yet, see - * the big assertion above that we repeat here: - */ - BUG_ON(test_bit(BCH_FS_rw, &c->flags)); - - struct btree *b = bch2_btree_id_root(c, btree)->b; - bkey_copy(&b->key, new); - } - } -err: - printbuf_exit(&buf); - return ret; -} - -static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, - struct bkey_s_c k, bool insert, enum bch_sb_error_id id) -{ - struct bch_fs *c = trans->c; - - prt_printf(buf, "\nwhile marking "); - bch2_bkey_val_to_text(buf, c, k); - prt_newline(buf); - - bool print = __bch2_count_fsck_err(c, id, buf); - - int ret = bch2_run_explicit_recovery_pass(c, buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (insert) { - bch2_trans_updates_to_text(buf, trans); - __bch2_inconsistent_error(c, buf); - /* - * If we're in recovery, run_explicit_recovery_pass might give - * us an error code for rewinding recovery - */ - if (!ret) - ret = bch_err_throw(c, bucket_ref_update); - } else { - /* Always ignore overwrite errors, so that deletion works */ - ret = 0; - } - - if (print || insert) - bch2_print_str(c, KERN_ERR, buf->buf); - return ret; -} - -int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 *bucket_sectors) -{ - struct bch_fs *c = trans->c; - size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - struct printbuf buf = PRINTBUF; - bool inserting = sectors > 0; - int ret = 0; - - BUG_ON(!sectors); - - if (unlikely(gen_after(ptr->gen, b_gen))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); - goto out; - } - - if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_too_stale); - goto out; - } - - if (b_gen != ptr->gen && ptr->cached) { - ret = 1; - goto out; - } - - if (unlikely(b_gen != ptr->gen)) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", - ptr->dev, bucket_nr, b_gen, - bucket_gen_get(ca, bucket_nr), - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_stale_dirty_ptr); - goto out; - } - - if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type), - bch2_data_type_str(ptr_data_type)); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); - goto out; - } - - if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { - 
bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type ?: ptr_data_type), - *bucket_sectors, sectors); - - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_bucket_sector_count_overflow); - sectors = -*bucket_sectors; - goto out; - } - - *bucket_sectors += sectors; -out: - printbuf_exit(&buf); - return ret; -} - -void bch2_trans_account_disk_usage_change(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - static int warned_disk_usage = 0; - bool warn = false; - - percpu_down_read(&c->mark_lock); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - - s64 added = src->btree + src->data + src->reserved; - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - s64 should_not_have_added = added - (s64) disk_res_sectors; - if (unlikely(should_not_have_added > 0)) { - u64 old, new; - - old = atomic64_read(&c->sectors_available); - do { - new = max_t(s64, 0, old - should_not_have_added); - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, new)); - - added -= should_not_have_added; - warn = true; - } - - if (added > 0) { - trans->disk_res->sectors -= added; - this_cpu_sub(*c->online_reserved, added); - } - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - percpu_up_read(&c->mark_lock); - - if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - bch2_trans_inconsistent(trans, - "disk usage increased %lli more than %llu sectors reserved)", - should_not_have_added, disk_res_sectors); -} - -/* KEY_TYPE_extent: */ - -static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, - struct bkey_s_c k, - const struct extent_ptr_decoded *p, - s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a, - bool insert) -{ - u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : - !p->ptr.cached ? &a->dirty_sectors : - &a->cached_sectors; - int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type, - a->gen, a->data_type, dst_sectors); - - if (ret) - return ret; - if (insert) - alloc_data_type_set(a, ptr_data_type); - return 0; -} - -static int bch2_trigger_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - s64 *sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); - - *sectors = insert ? 
bp.v.bucket_len : -(s64) bp.v.bucket_len; - - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); - if (unlikely(!ca)) { - if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if (!bucket_valid(ca, bucket.offset)) { - if (insert) { - bch2_dev_bucket_missing(ca, bucket.offset); - ret = bch_err_throw(c, trigger_pointer); - } - goto err; - } - - if (flags & BTREE_TRIGGER_transactional) { - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); - if (ret) - goto err; - - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", - p.ptr.dev, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, trigger_pointer); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int bch2_trigger_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct extent_ptr_decoded p, - enum bch_data_type data_type, - s64 sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if (flags & BTREE_TRIGGER_transactional) { - struct btree_iter iter; - struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_with_updates, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } - - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = bch_err_throw(c, trigger_stripe_pointer); - goto err; - } - - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = data_type; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; - } - - if (flags & BTREE_TRIGGER_gc) { - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.ec.idx); - return bch_err_throw(c, ENOMEM_mark_stripe_ptr); - } - - gc_stripe_lock(m); - - if (!m || !m->alive) { - gc_stripe_unlock(m); - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", - (u64) p.ec.idx); - bch2_bkey_val_to_text(&buf, c, k); - __bch2_inconsistent_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, trigger_stripe_pointer); - } - - m->block_sectors[p.ec.block] += sectors; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); - gc_stripe_unlock(m); - - acc.replicas.data_type = data_type; - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); - if (ret) - return ret; - } - - return 0; -} - -static int __trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - bool gc = flags & BTREE_TRIGGER_gc; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - int ret = 0; - - s64 replicas_sectors = 0; - - struct disk_accounting_pos acc_replicas_key; - memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); - acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; - acc_replicas_key.replicas.data_type = data_type; - acc_replicas_key.replicas.nr_devs = 0; - acc_replicas_key.replicas.nr_required = 1; - - unsigned cur_compression_type = 0; - u64 compression_acct[3] = { 1, 0, 0 }; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = 0; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); - if (ret < 0) - return ret; - - bool stale = ret > 0; - - if (p.ptr.cached && stale) - continue; - - if (p.ptr.cached) { - ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc); - if (ret) - return ret; - } else if (!p.has_ec) { - replicas_sectors += disk_sectors; - replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); - } else { - ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - acc_replicas_key.replicas.nr_required = 0; - } - - if (cur_compression_type && - cur_compression_type != p.crc.compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - - compression_acct[0] = 1; - compression_acct[1] = 0; - compression_acct[2] = 0; - } - - cur_compression_type = p.crc.compression_type; - if (p.crc.compression_type) { - compression_acct[1] += p.crc.uncompressed_size; - compression_acct[2] += p.crc.compressed_size; - } - } - - if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - - if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); - if (ret) - return ret; - } - - if (cur_compression_type) { - if (flags & BTREE_TRIGGER_overwrite) - bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - - ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, - compression, cur_compression_type); - if (ret) - return ret; - } - - if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); - if (ret) - return ret; - } else { - bool insert = !(flags & BTREE_TRIGGER_overwrite); - - s64 v[3] = { - insert ? 1 : -1, - insert ? 
k.k->size : -((s64) k.k->size), - replicas_sectors, - }; - ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); - struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); - unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; - unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); - - /* if pointers aren't changing - nothing to do: */ - if (new_ptrs_bytes == old_ptrs_bytes && - !memcmp(new_ptrs.start, - old_ptrs.start, - new_ptrs_bytes)) - return 0; - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - if (old.k->type) { - int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert); - if (ret) - return ret; - } - - if (new.k->type) { - int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite); - if (ret) - return ret; - } - - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; - - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; - - s = bch2_bkey_sectors_need_rebalance(c, new.s_c); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; - - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); - if (ret) - return ret; - } - - if (need_rebalance_sectors_delta[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); - if (ret) - return ret; - } - } - - return 0; -} - -/* KEY_TYPE_reservation */ - -static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors[1] = { k.k->size }; - - if (flags & BTREE_TRIGGER_overwrite) - sectors[0] = -sectors[0]; - - return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, - persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); - } - - return 0; -} - -int bch2_trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); -} - -/* Mark superblocks: */ - -static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, - unsigned sectors) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - int ret = 0; - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); - if (IS_ERR(a)) - return PTR_ERR(a); - - if (a->v.data_type && type && a->v.data_type != type) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s\n", - 
iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); - - bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - - ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - /* Always print, this is always fatal */ - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - if (!ret) - ret = bch_err_throw(c, metadata_bucket_inconsistency); - goto err; - } - - if (a->v.data_type != type || - a->v.dirty_sectors != sectors) { - a->v.data_type = type; - a->v.dirty_sectors = sectors; - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, - u64 b, enum bch_data_type data_type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - struct bucket *g = gc_bucket(ca, b); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", - ca->dev_idx, bch2_data_type_str(data_type))) - goto err; - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g); - - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type))) - goto err_unlock; - - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) - goto err_unlock; - - g->data_type = data_type; - g->dirty_sectors += sectors; - struct bch_alloc_v4 new = bucket_m_to_alloc(*g); - bucket_unlock(g); - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - return ret; -err_unlock: - bucket_unlock(g); -err: - return bch_err_throw(c, metadata_bucket_inconsistency); -} - -int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - enum bch_data_type type, unsigned sectors, - enum btree_iter_update_trigger_flags flags) -{ - BUG_ON(type != BCH_DATA_free && - type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - if (flags & BTREE_TRIGGER_gc) - return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags); - else if (flags & BTREE_TRIGGER_transactional) - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); - else - BUG(); -} - -static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, u64 start, u64 end, - enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, - enum btree_iter_update_trigger_flags flags) -{ - do { - u64 b = sector_to_bucket(ca, start); - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - if (b != *bucket && *bucket_sectors) { - int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors, flags); - if (ret) - return ret; - - *bucket_sectors = 0; - } - - *bucket = b; - *bucket_sectors += sectors; - start += sectors; - } while (start < end); - - return 0; -} - -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, - enum btree_iter_update_trigger_flags 
flags) -{ - struct bch_fs *c = trans->c; - - mutex_lock(&c->sb_lock); - struct bch_sb_layout layout = ca->disk_sb.sb->layout; - mutex_unlock(&c->sb_lock); - - u64 bucket = 0; - unsigned i, bucket_sectors = 0; - int ret; - - for (i = 0; i < layout.nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout.sb_offset[i]); - - if (offset == BCH_SB_SECTOR) { - ret = bch2_trans_mark_metadata_sectors(trans, ca, - 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout.sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors, flags); - if (ret) - return ret; - } - - if (bucket_sectors) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors, flags); - if (ret) - return ret; - } - - for (i = 0; i < ca->journal.nr; i++) { - ret = bch2_trans_mark_metadata_bucket(trans, ca, - ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size, flags); - if (ret) - return ret; - } - - return 0; -} - -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, - enum btree_iter_update_trigger_flags flags) -{ - int ret = bch2_trans_run(c, - __bch2_trans_mark_dev_sb(trans, ca, flags)); - bch_err_fn(c, ret); - return ret; -} - -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, - enum btree_iter_update_trigger_flags flags) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { - int ret = bch2_trans_mark_dev_sb(c, ca, flags); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); - return ret; - } - } - - return 0; -} - -int bch2_trans_mark_dev_sbs(struct bch_fs *c) -{ - return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); -} - -bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - for (i = 0; i < ca->journal.nr; i++) - if (b == ca->journal.buckets[i]) - return true; - - return false; -} - -/* Disk reservations: */ - -#define SECTORS_CACHE 1024 - -int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ - struct bch_fs_pcpu *pcpu; - u64 old, get; - u64 sectors_available; - int ret; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - pcpu = this_cpu_ptr(c->pcpu); - - if (sectors <= pcpu->sectors_available) - goto out; - - old = atomic64_read(&c->sectors_available); - do { - get = min((u64) sectors + SECTORS_CACHE, old); - - if (get < sectors) { - preempt_enable(); - goto recalculate; - } - } while (!atomic64_try_cmpxchg(&c->sectors_available, - &old, old - get)); - - pcpu->sectors_available += get; - -out: - pcpu->sectors_available -= sectors; - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - return 0; - -recalculate: - mutex_lock(&c->sectors_available_lock); - - percpu_u64_set(&c->pcpu->sectors_available, 0); - sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); - - if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) - sectors = min(sectors, sectors_available); - - if (sectors 
<= sectors_available || - (flags & BCH_DISK_RESERVATION_NOFAIL)) { - atomic64_set(&c->sectors_available, - max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - ret = 0; - } else { - atomic64_set(&c->sectors_available, sectors_available); - ret = bch_err_throw(c, ENOSPC_disk_reservation); - } - - mutex_unlock(&c->sectors_available_lock); - percpu_up_read(&c->mark_lock); - - return ret; -} - -/* Startup/shutdown: */ - -void bch2_buckets_nouse_free(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - kvfree_rcu_mightsleep(ca->buckets_nouse); - ca->buckets_nouse = NULL; - } -} - -int bch2_buckets_nouse_alloc(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - BUG_ON(ca->buckets_nouse); - - ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets_nouse) { - bch2_dev_put(ca); - return bch_err_throw(c, ENOMEM_buckets_nouse); - } - } - - return 0; -} - -static void bucket_gens_free_rcu(struct rcu_head *rcu) -{ - struct bucket_gens *buckets = - container_of(rcu, struct bucket_gens, rcu); - - kvfree(buckets); -} - -int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - bool resize = ca->bucket_gens != NULL; - int ret; - - if (resize) - lockdep_assert_held(&c->state_lock); - - if (resize && ca->buckets_nouse) - return bch_err_throw(c, no_resize_with_buckets_nouse); - - bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), - GFP_KERNEL|__GFP_ZERO); - if (!bucket_gens) { - ret = bch_err_throw(c, ENOMEM_bucket_gens); - goto err; - } - - bucket_gens->first_bucket = ca->mi.first_bucket; - bucket_gens->nbuckets = nbuckets; - bucket_gens->nbuckets_minus_first = - bucket_gens->nbuckets - bucket_gens->first_bucket; - - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); - - if (resize) { - u64 copy = min(bucket_gens->nbuckets, - old_bucket_gens->nbuckets); - memcpy(bucket_gens->b, - old_bucket_gens->b, - sizeof(bucket_gens->b[0]) * copy); - } - - ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, - ca->mi.nbuckets, nbuckets) ?: - bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, - ca->mi.nbuckets, nbuckets); - - rcu_assign_pointer(ca->bucket_gens, bucket_gens); - bucket_gens = old_bucket_gens; - - nbuckets = ca->mi.nbuckets; - - ret = 0; -err: - if (bucket_gens) - call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); - - return ret; -} - -void bch2_dev_buckets_free(struct bch_dev *ca) -{ - kvfree(ca->buckets_nouse); - kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - free_percpu(ca->usage); -} - -int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - ca->usage = alloc_percpu(struct bch_dev_usage_full); - if (!ca->usage) - return bch_err_throw(c, ENOMEM_usage_init); - - return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); -} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h deleted file mode 100644 index 49a3807a5eabff..00000000000000 --- a/fs/bcachefs/buckets.h +++ /dev/null @@ -1,369 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Code for manipulating bucket marks for garbage collection. - * - * Copyright 2014 Datera, Inc. 
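 *
 * The helpers just below convert between sectors and buckets. As a worked
 * example with a hypothetical geometry (not a bcachefs default): if
 * ca->mi.bucket_size is 128 sectors, then sector_to_bucket(ca, 1000) =
 * 1000 / 128 = 7, bucket_to_sector(ca, 7) = 7 * 128 = 896, and
 * bucket_remainder(ca, 1000) = 104, so sector_to_bucket_and_offset(ca,
 * 1000, &off) returns bucket 7 with off = 104.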
- */ - -#ifndef _BUCKETS_H -#define _BUCKETS_H - -#include "buckets_types.h" -#include "extents.h" -#include "sb-members.h" - -static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -#define for_each_bucket(_b, _buckets) \ - for (_b = (_buckets)->b + (_buckets)->first_bucket; \ - _b < (_buckets)->b + (_buckets)->nbuckets; _b++) - -static inline void bucket_unlock(struct bucket *b) -{ - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); - smp_mb__after_atomic(); - wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); -} - -static inline void bucket_lock(struct bucket *b) -{ - wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, - TASK_UNINTERRUPTIBLE); -} - -static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -{ - return bucket_valid(ca, b) - ? genradix_ptr(&ca->buckets_gc, b) - : NULL; -} - -static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->bucket_gens, - lockdep_is_held(&ca->fs->state_lock)); -} - -static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) -{ - struct bucket_gens *gens = bucket_gens(ca); - - if (b - gens->first_bucket >= gens->nbuckets_minus_first) - return NULL; - return gens->b + b; -} - -static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) -{ - u8 *gen = bucket_gen(ca, b); - return gen ? *gen : -1; -} - -static inline int bucket_gen_get(struct bch_dev *ca, size_t b) -{ - guard(rcu)(); - return bucket_gen_get_rcu(ca, b); -} - -static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return sector_to_bucket(ca, ptr->offset); -} - -static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -} - -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - u32 *bucket_offset) -{ - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); -} - -static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); -} - -static inline enum bch_data_type ptr_data_type(const struct bkey *k, - const struct bch_extent_ptr *ptr) -{ - if (bkey_is_btree_ptr(k)) - return BCH_DATA_btree; - - return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -} - -static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -{ - EBUG_ON(sectors < 0); - - return crc_is_compressed(p.crc) - ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, - p.crc.uncompressed_size) - : sectors; -} - -static inline int gen_cmp(u8 a, u8 b) -{ - return (s8) (a - b); -} - -static inline int gen_after(u8 a, u8 b) -{ - return max(0, gen_cmp(a, b)); -} - -static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); - return gen < 0 ? gen : gen_after(gen, ptr->gen); -} - -/** - * dev_ptr_stale() - check if a pointer points into a bucket that has been - * invalidated. - */ -static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) -{ - guard(rcu)(); - return dev_ptr_stale_rcu(ca, ptr); -} - -/* Device usage: */ - -void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); -static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -{ - struct bch_dev_usage ret; - - bch2_dev_usage_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); -static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) -{ - struct bch_dev_usage_full ret; - - bch2_dev_usage_full_read_fast(ca, &ret); - return ret; -} - -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); - -static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) -{ - s64 reserved = 0; - - switch (watermark) { - case BCH_WATERMARK_NR: - BUG(); - case BCH_WATERMARK_stripe: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_normal: - reserved += ca->mi.nbuckets >> 6; - fallthrough; - case BCH_WATERMARK_copygc: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree: - reserved += ca->nr_btree_reserve; - fallthrough; - case BCH_WATERMARK_btree_copygc: - case BCH_WATERMARK_reclaim: - case BCH_WATERMARK_interior_updates: - break; - } - - return reserved; -} - -static inline u64 dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free]- - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 __dev_buckets_available(struct bch_dev *ca, - struct bch_dev_usage usage, - enum bch_watermark watermark) -{ - return max_t(s64, 0, - usage.buckets[BCH_DATA_free] - + usage.buckets[BCH_DATA_cached] - + usage.buckets[BCH_DATA_need_gc_gens] - + usage.buckets[BCH_DATA_need_discard] - - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, watermark)); -} - -static inline u64 dev_buckets_available(struct bch_dev *ca, - enum bch_watermark watermark) -{ - return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); -} - -/* Filesystem usage: */ - -struct bch_fs_usage_short -bch2_fs_usage_read_short(struct bch_fs *); - -int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, - struct bkey_s_c, const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32 *); - -int bch2_check_fix_ptrs(struct btree_trans *, - enum btree_id, unsigned, struct bkey_s_c, - enum btree_iter_update_trigger_flags); - -int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); -int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, 
_old, _new, _flags)\ -({ \ - int ret = 0; \ - \ - if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ - if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ - ret; \ -}) - -void bch2_trans_account_disk_usage_change(struct btree_trans *); - -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, - enum bch_data_type, unsigned, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, - enum btree_iter_update_trigger_flags); -int bch2_trans_mark_dev_sbs(struct bch_fs *); - -bool bch2_is_superblock_bucket(struct bch_dev *, u64); - -static inline const char *bch2_data_type_str(enum bch_data_type type) -{ - return type < BCH_DATA_NR - ? __bch2_data_types[type] - : "(invalid data type)"; -} - -/* disk reservations: */ - -static inline void bch2_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) -{ - if (res->sectors) { - this_cpu_sub(*c->online_reserved, res->sectors); - res->sectors = 0; - } -} - -enum bch_reservation_flags { - BCH_DISK_RESERVATION_NOFAIL = 1 << 0, - BCH_DISK_RESERVATION_PARTIAL = 1 << 1, -}; - -int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, - u64, enum bch_reservation_flags); - -static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, enum bch_reservation_flags flags) -{ -#ifdef __KERNEL__ - u64 old, new; - - old = this_cpu_read(c->pcpu->sectors_available); - do { - if (sectors > old) - return __bch2_disk_reservation_add(c, res, sectors, flags); - - new = old - sectors; - } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new)); - - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - return 0; -#else - return __bch2_disk_reservation_add(c, res, sectors, flags); -#endif -} - -static inline struct disk_reservation -bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -{ - return (struct disk_reservation) { - .sectors = 0, -#if 0 - /* not used yet: */ - .gen = c->capacity_gen, -#endif - .nr_replicas = nr_replicas, - }; -} - -static inline int bch2_disk_reservation_get(struct bch_fs *c, - struct disk_reservation *res, - u64 sectors, unsigned nr_replicas, - int flags) -{ - *res = bch2_disk_reservation_init(c, nr_replicas); - - return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -} - -#define RESERVE_FACTOR 6 - -static inline u64 avail_factor(u64 r) -{ - return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); -} - -void bch2_buckets_nouse_free(struct bch_fs *); -int bch2_buckets_nouse_alloc(struct bch_fs *); - -int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -void bch2_dev_buckets_free(struct bch_dev *); -int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); - -#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h deleted file mode 100644 index 0aed2500ade32f..00000000000000 --- a/fs/bcachefs/buckets_types.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_TYPES_H -#define _BUCKETS_TYPES_H - -#include "bcachefs_format.h" -#include "util.h" - -#define BUCKET_JOURNAL_SEQ_BITS 16 - -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the 
size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1)) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - -struct bucket { - u8 lock; - u8 gen_valid:1; - u8 data_type:7; - u8 gen; - u8 stripe_redundancy; - u32 stripe; - u32 dirty_sectors; - u32 cached_sectors; - u32 stripe_sectors; -} __aligned(sizeof(long)); - -struct bucket_gens { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - u8 b[] __counted_by(nbuckets); -}; - -/* Only info on bucket counts: */ -struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; -}; - -struct bch_dev_usage_full { - struct bch_dev_usage_type { - u64 buckets; - u64 sectors; /* _compressed_ sectors: */ - /* - * XXX - * Why do we have this? Isn't it just buckets * bucket_size - - * sectors? - */ - u64 fragmented; - } d[BCH_DATA_NR]; -}; - -struct bch_fs_usage_base { - u64 hidden; - u64 btree; - u64 data; - u64 cached; - u64 reserved; - u64 nr_inodes; -}; - -struct bch_fs_usage_short { - u64 capacity; - u64 used; - u64 free; - u64 nr_inodes; -}; - -/* - * A reservation for space on disk: - */ -struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; -}; - -#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c deleted file mode 100644 index 832eff93acb667..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets_waiting_for_journal.h" -#include -#include - -static inline struct bucket_hashed * -bucket_hash(struct buckets_waiting_for_journal_table *t, - unsigned hash_seed_idx, u64 dev_bucket) -{ - return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); -} - -static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) -{ - unsigned i; - - t->bits = bits; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) - get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); - memset(t->d, 0, sizeof(t->d[0]) << t->bits); -} - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, - unsigned dev, u64 bucket) -{ - struct buckets_waiting_for_journal_table *t; - u64 dev_bucket = (u64) dev << 56 | bucket; - u64 ret = 0; - - mutex_lock(&b->lock); - t = b->t; - - for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); - - if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq; - break; - } - } - - mutex_unlock(&b->lock); - - return ret; -} - -static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, - struct bucket_hashed *new, - u64 flushed_seq) -{ - struct bucket_hashed *last_evicted = NULL; - unsigned tries, i; - - for (tries = 0; tries < 10; tries++) { - struct bucket_hashed *old, *victim = NULL; - - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { - old = bucket_hash(t, i, new->dev_bucket); - - if (old->dev_bucket == new->dev_bucket || - old->journal_seq <= flushed_seq) {
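/*
 * (Re the BUCKET_LOCK_BITNR definitions in buckets_types.h above: the
 * bit-wait machinery used by bucket_lock()/bucket_unlock() operates on a
 * whole unsigned long, but only the one-byte bucket->lock field may be
 * touched. On little-endian, bit 0 of a long lives in its lowest-addressed
 * byte; on big-endian the lowest-addressed byte holds the most significant
 * bits, so bit BITS_PER_LONG - 1 is used instead. The BUILD_BUG_ON() in
 * bucket_unlock() verifies this placement, roughly:
 *
 *	union { unsigned long ulong; unsigned char byte; } u =
 *		{ .ulong = 1UL << BUCKET_LOCK_BITNR };
 *
 *	the build fails unless u.byte is nonzero, i.e. unless the lock bit
 *	lands in the first byte of the long.
 * )
 */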
*old = *new; - return true; - } - - if (last_evicted != old) - victim = old; - } - - /* hashed to same slot 3 times: */ - if (!victim) - break; - - /* Failed to find an empty slot: */ - swap(*new, *victim); - last_evicted = victim; - } - - return false; -} - -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket, - u64 journal_seq) -{ - struct buckets_waiting_for_journal_table *t, *n; - struct bucket_hashed tmp, new = { - .dev_bucket = (u64) dev << 56 | bucket, - .journal_seq = journal_seq, - }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; - int ret = 0; - - mutex_lock(&b->lock); - - if (likely(bucket_table_insert(b->t, &new, flushed_seq))) - goto out; - - t = b->t; - size = 1UL << t->bits; - for (i = 0; i < size; i++) - nr_elements += t->d[i].journal_seq > flushed_seq; - - new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); -realloc: - n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); - if (!n) { - struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); - ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); - goto out; - } - -retry_rehash: - if (nr_rehashes_this_size == 3) { - new_bits++; - nr_rehashes_this_size = 0; - kvfree(n); - goto realloc; - } - - nr_rehashes++; - nr_rehashes_this_size++; - - bucket_table_init(n, new_bits); - - tmp = new; - BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); - - for (i = 0; i < 1UL << t->bits; i++) { - if (t->d[i].journal_seq <= flushed_seq) - continue; - - tmp = t->d[i]; - if (!bucket_table_insert(n, &tmp, flushed_seq)) - goto retry_rehash; - } - - b->t = n; - kvfree(t); - - pr_debug("took %zu rehashes, table at %zu/%lu elements", - nr_rehashes, nr_elements, 1UL << b->t->bits); -out: - mutex_unlock(&b->lock); - - return ret; -} - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - kvfree(b->t); -} - -#define INITIAL_TABLE_BITS 3 - -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) -{ - struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; - - mutex_init(&b->lock); - - b->t = kvmalloc(sizeof(*b->t) + - (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); - if (!b->t) - return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; - - bucket_table_init(b->t, INITIAL_TABLE_BITS); - return 0; -} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h deleted file mode 100644 index 365619ca44c87e..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H -#define _BUCKETS_WAITING_FOR_JOURNAL_H - -#include "buckets_waiting_for_journal_types.h" - -u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, - unsigned, u64); -int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64, u64); - -void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); -int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h deleted file mode 100644 index e593db061d81b2..00000000000000 --- a/fs/bcachefs/buckets_waiting_for_journal_types.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 
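 *
 * The buckets_waiting_for_journal_table defined below is used as a small
 * cuckoo-style hash table: each dev_bucket key has three candidate slots,
 * one per entry in hash_seeds[]. bucket_table_insert() above probes the
 * candidates and, on conflict, evicts a victim and re-inserts it, bounded
 * at ten tries; bch2_set_bucket_needs_journal_commit() falls back to
 * growing and rehashing the table when insertion still fails. Lookup is a
 * three-probe scan, mirroring bch2_bucket_journal_seq_ready():
 *
 *	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
 *		struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
 *		if (h->dev_bucket == dev_bucket)
 *			return h->journal_seq;
 *	}
 *	return 0;	(not present: nothing to wait on)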
*/ -#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H -#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H - -#include - -struct bucket_hashed { - u64 dev_bucket; - u64 journal_seq; -}; - -struct buckets_waiting_for_journal_table { - unsigned bits; - u64 hash_seeds[3]; - struct bucket_hashed d[]; -}; - -struct buckets_waiting_for_journal { - struct mutex lock; - struct buckets_waiting_for_journal_table *t; -}; - -#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c deleted file mode 100644 index 5ea89aa2b0c42a..00000000000000 --- a/fs/bcachefs/chardev.c +++ /dev/null @@ -1,843 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_CHARDEV - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "buckets.h" -#include "chardev.h" -#include "disk_accounting.h" -#include "fsck.h" -#include "journal.h" -#include "move.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-counters.h" -#include "super-io.h" -#include "thread_with_file.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -/* returns with ref on ca->ref */ -static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, - unsigned flags) -{ - struct bch_dev *ca; - - if (flags & BCH_BY_INDEX) { - if (dev >= c->sb.nr_devices) - return ERR_PTR(-EINVAL); - - ca = bch2_dev_tryget_noerror(c, dev); - if (!ca) - return ERR_PTR(-EINVAL); - } else { - char *path; - - path = strndup_user((const char __user *) - (unsigned long) dev, PATH_MAX); - if (IS_ERR(path)) - return ERR_CAST(path); - - ca = bch2_dev_lookup(c, path); - kfree(path); - } - - return ca; -} - -#if 0 -static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -{ - struct bch_ioctl_assemble arg; - struct bch_fs *c; - u64 *user_devs = NULL; - char **devs = NULL; - unsigned i; - int ret = -EFAULT; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); - if (!user_devs) - return -ENOMEM; - - devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); - - if (copy_from_user(user_devs, user_arg->devs, - sizeof(u64) * arg.nr_devs)) - goto err; - - for (i = 0; i < arg.nr_devs; i++) { - devs[i] = strndup_user((const char __user *)(unsigned long) - user_devs[i], - PATH_MAX); - ret= PTR_ERR_OR_ZERO(devs[i]); - if (ret) - goto err; - } - - c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); - ret = PTR_ERR_OR_ZERO(c); - if (!ret) - closure_put(&c->cl); -err: - if (devs) - for (i = 0; i < arg.nr_devs; i++) - kfree(devs[i]); - kfree(devs); - return ret; -} - -static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -{ - struct bch_ioctl_incremental arg; - const char *err; - char *path; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - err = bch2_fs_open_incremental(path); - kfree(path); - - if (err) { - pr_err("Could not register bcachefs devices: %s", err); - return -EINVAL; - } - - return 0; -} -#endif - -static long bch2_global_ioctl(unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { -#if 0 - case BCH_IOCTL_ASSEMBLE: - return bch2_ioctl_assemble(arg); - case BCH_IOCTL_INCREMENTAL: - return bch2_ioctl_incremental(arg); -#endif - case BCH_IOCTL_FSCK_OFFLINE: { - ret = 
bch2_ioctl_fsck_offline(arg); - break; - } - default: - ret = -ENOTTY; - break; - } - - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static long bch2_ioctl_query_uuid(struct bch_fs *c, - struct bch_ioctl_query_uuid __user *user_arg) -{ - return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); -} - -#if 0 -static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - return bch2_fs_start(c); -} - -static long bch2_ioctl_stop(struct bch_fs *c) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - bch2_fs_stop(c); - return 0; -} -#endif - -static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_add(c, path); - if (!IS_ERR(path)) - kfree(path); - - return ret; -} - -static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - return bch2_dev_remove(c, ca, arg.flags); -} - -static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - char *path; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (arg.flags || arg.pad) - return -EINVAL; - - path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - ret = PTR_ERR_OR_ZERO(path); - if (ret) - return ret; - - ret = bch2_dev_online(c, path); - kfree(path); - return ret; -} - -static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_offline(c, ca, arg.flags); - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_set_state(struct bch_fs *c, - struct bch_ioctl_disk_set_state arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| - BCH_FORCE_IF_METADATA_LOST| - BCH_FORCE_IF_DEGRADED| - BCH_BY_INDEX)) || - arg.pad[0] || arg.pad[1] || arg.pad[2] || - arg.new_state >= BCH_MEMBER_STATE_NR) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); - if (ret) - bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - - bch2_dev_put(ca); - return ret; -} - -struct bch_data_ctx { - struct thread_with_file thr; - - struct bch_fs *c; - struct bch_ioctl_data arg; - struct bch_move_stats stats; -}; - -static int bch2_data_thread(void *arg) -{ - struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); - - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - if (ctx->thr.ret == -BCH_ERR_device_offline) - 
ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; - else { - ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; - ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; - } - enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); - return 0; -} - -static int bch2_data_job_release(struct inode *inode, struct file *file) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - - bch2_thread_with_file_exit(&ctx->thr); - kfree(ctx); - return 0; -} - -static ssize_t bch2_data_job_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - struct bch_fs *c = ctx->c; - struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .ret = ctx->stats.ret, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), - .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), - }; - - if (ctx->arg.op == BCH_DATA_OP_scrub) { - struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); - if (ca) { - struct bch_dev_usage_full u; - bch2_dev_usage_full_read_fast(ca, &u); - for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) - if (ctx->arg.scrub.data_types & BIT(i)) - e.p.sectors_total += u.d[i].sectors; - bch2_dev_put(ca); - } - } else { - e.p.sectors_total = bch2_fs_usage_read_short(c).used; - } - - if (len < sizeof(e)) - return -EINVAL; - - return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); -} - -static const struct file_operations bcachefs_data_ops = { - .release = bch2_data_job_release, - .read = bch2_data_job_read, -}; - -static long bch2_ioctl_data(struct bch_fs *c, - struct bch_ioctl_data arg) -{ - struct bch_data_ctx *ctx; - int ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto put_ref; - } - - if (arg.op >= BCH_DATA_OP_NR || arg.flags) { - ret = -EINVAL; - goto put_ref; - } - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - ret = -ENOMEM; - goto put_ref; - } - - ctx->c = c; - ctx->arg = arg; - - ret = bch2_run_thread_with_file(&ctx->thr, - &bcachefs_data_ops, - bch2_data_thread); - if (ret < 0) - goto cleanup; - return ret; -cleanup: - kfree(ctx); -put_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); - return ret; -} - -static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, - struct bch_ioctl_fs_usage __user *user_arg) -{ - struct bch_ioctl_fs_usage arg = {}; - darray_char replicas = {}; - u32 replica_entries_bytes; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) - return -EFAULT; - - ret = bch2_fs_replicas_usage_read(c, &replicas) ?: - (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); - if (ret) - goto err; - - struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); - arg.capacity = c->capacity; - arg.used = u.used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.replica_entries_bytes = replicas.nr; - - for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct disk_accounting_pos k; - disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); - - bch2_accounting_mem_read(c, - disk_accounting_pos_to_bpos(&k), - &arg.persistent_reserved[i], 1); - } - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&replicas); - return ret; -} - -static long bch2_ioctl_query_accounting(struct bch_fs *c, - struct bch_ioctl_query_accounting __user *user_arg) -{ - struct bch_ioctl_query_accounting arg; - darray_char accounting = {}; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: - bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: - (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: - copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); - if (ret) - goto err; - - arg.capacity = c->capacity; - arg.used = bch2_fs_usage_read_short(c).used; - arg.online_reserved = percpu_u64_get(c->online_reserved); - arg.accounting_u64s = accounting.nr / sizeof(u64); - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&accounting); - return ret; -} - -/* obsolete, didn't allow for new data types: */ -static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, - struct bch_ioctl_dev_usage __user *user_arg) -{ - struct bch_ioctl_dev_usage arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - unsigned i; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - for (i = 0; i < ARRAY_SIZE(arg.d); i++) { - arg.d[i].buckets = src.d[i].buckets; - arg.d[i].sectors = src.d[i].sectors; - arg.d[i].fragmented = src.d[i].fragmented; - } - - bch2_dev_put(ca); - - return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -} - -static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, - struct bch_ioctl_dev_usage_v2 __user *user_arg) -{ - struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad[0] || - arg.pad[1] || - arg.pad[2]) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - src = bch2_dev_usage_full_read(ca); - - arg.state = ca->mi.state; - arg.bucket_size = ca->mi.bucket_size; - arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); - arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); - if (ret) - goto err; - - for (unsigned i = 0; i < arg.nr_data_types; i++) { - struct 
bch_ioctl_dev_usage_type t = { - .buckets = src.d[i].buckets, - .sectors = src.d[i].sectors, - .fragmented = src.d[i].fragmented, - }; - - ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); - if (ret) - goto err; - } -err: - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_read_super(struct bch_fs *c, - struct bch_ioctl_read_super arg) -{ - struct bch_dev *ca = NULL; - struct bch_sb *sb; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || - arg.pad) - return -EINVAL; - - mutex_lock(&c->sb_lock); - - if (arg.flags & BCH_READ_DEV) { - ca = bch2_device_lookup(c, arg.dev, arg.flags); - ret = PTR_ERR_OR_ZERO(ca); - if (ret) - goto err_unlock; - - sb = ca->disk_sb.sb; - } else { - sb = c->disk_sb.sb; - } - - if (vstruct_bytes(sb) > arg.size) { - ret = -ERANGE; - goto err; - } - - ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb)); -err: - bch2_dev_put(ca); -err_unlock: - mutex_unlock(&c->sb_lock); - return ret; -} - -static long bch2_ioctl_disk_get_idx(struct bch_fs *c, - struct bch_ioctl_disk_get_idx arg) -{ - dev_t dev = huge_decode_dev(arg.dev); - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) - if (ca->dev == dev) - return ca->dev_idx; - - return bch_err_throw(c, ENOENT_dev_idx_not_found); -} - -static long bch2_ioctl_disk_resize(struct bch_fs *c, - struct bch_ioctl_disk_resize arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_dev_resize(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, - struct bch_ioctl_disk_resize_journal arg) -{ - struct bch_dev *ca; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((arg.flags & ~BCH_BY_INDEX) || - arg.pad) - return -EINVAL; - - if (arg.nbuckets > U32_MAX) - return -EINVAL; - - ca = bch2_device_lookup(c, arg.dev, arg.flags); - if (IS_ERR(ca)) - return PTR_ERR(ca); - - ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; -} - -#define BCH_IOCTL(_name, _argtype) \ -do { \ - _argtype i; \ - \ - if (copy_from_user(&i, arg, sizeof(i))) \ - return -EFAULT; \ - ret = bch2_ioctl_##_name(c, i); \ - goto out; \ -} while (0) - -long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -{ - long ret; - - switch (cmd) { - case BCH_IOCTL_QUERY_UUID: - return bch2_ioctl_query_uuid(c, arg); - case BCH_IOCTL_FS_USAGE: - return bch2_ioctl_fs_usage(c, arg); - case BCH_IOCTL_DEV_USAGE: - return bch2_ioctl_dev_usage(c, arg); - case BCH_IOCTL_DEV_USAGE_V2: - return bch2_ioctl_dev_usage_v2(c, arg); -#if 0 - case BCH_IOCTL_START: - BCH_IOCTL(start, struct bch_ioctl_start); - case BCH_IOCTL_STOP: - return bch2_ioctl_stop(c); -#endif - case BCH_IOCTL_READ_SUPER: - BCH_IOCTL(read_super, struct bch_ioctl_read_super); - case BCH_IOCTL_DISK_GET_IDX: - BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); - } - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EINVAL; - - switch (cmd) { - case BCH_IOCTL_DISK_ADD: - BCH_IOCTL(disk_add, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_REMOVE: - BCH_IOCTL(disk_remove, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_ONLINE: - BCH_IOCTL(disk_online, struct bch_ioctl_disk); - 
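/*
 * (The BCH_IOCTL() macro defined above keeps this switch compact: each case
 * copies a fixed-size argument struct in from userspace and dispatches to
 * the matching bch2_ioctl_*() handler. For instance the next case,
 * BCH_IOCTL(disk_offline, struct bch_ioctl_disk), expands roughly to:
 *
 *	do {
 *		struct bch_ioctl_disk i;
 *
 *		if (copy_from_user(&i, arg, sizeof(i)))
 *			return -EFAULT;
 *		ret = bch2_ioctl_disk_offline(c, i);
 *		goto out;
 *	} while (0);
 * )
 */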
case BCH_IOCTL_DISK_OFFLINE: - BCH_IOCTL(disk_offline, struct bch_ioctl_disk); - case BCH_IOCTL_DISK_SET_STATE: - BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); - case BCH_IOCTL_DATA: - BCH_IOCTL(data, struct bch_ioctl_data); - case BCH_IOCTL_DISK_RESIZE: - BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); - case BCH_IOCTL_DISK_RESIZE_JOURNAL: - BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - case BCH_IOCTL_FSCK_ONLINE: - BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); - case BCH_IOCTL_QUERY_ACCOUNTING: - return bch2_ioctl_query_accounting(c, arg); - case BCH_IOCTL_QUERY_COUNTERS: - return bch2_ioctl_query_counters(c, arg); - default: - return -ENOTTY; - } -out: - if (ret < 0) - ret = bch2_err_class(ret); - return ret; -} - -static DEFINE_IDR(bch_chardev_minor); - -static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -{ - unsigned minor = iminor(file_inode(filp)); - struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; - void __user *arg = (void __user *) v; - - return c - ? bch2_fs_ioctl(c, cmd, arg) - : bch2_global_ioctl(cmd, arg); -} - -static const struct file_operations bch_chardev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = bch2_chardev_ioctl, - .open = nonseekable_open, -}; - -static int bch_chardev_major; -static const struct class bch_chardev_class = { - .name = "bcachefs", -}; -static struct device *bch_chardev; - -void bch2_fs_chardev_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->chardev)) - device_unregister(c->chardev); - if (c->minor >= 0) - idr_remove(&bch_chardev_minor, c->minor); -} - -int bch2_fs_chardev_init(struct bch_fs *c) -{ - c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); - if (c->minor < 0) - return c->minor; - - c->chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, c->minor), c, - "bcachefs%u-ctl", c->minor); - if (IS_ERR(c->chardev)) - return PTR_ERR(c->chardev); - - return 0; -} - -void bch2_chardev_exit(void) -{ - device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); - class_unregister(&bch_chardev_class); - if (bch_chardev_major > 0) - unregister_chrdev(bch_chardev_major, "bcachefs"); -} - -int __init bch2_chardev_init(void) -{ - int ret; - - bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); - if (bch_chardev_major < 0) - return bch_chardev_major; - - ret = class_register(&bch_chardev_class); - if (ret) - goto major_out; - - bch_chardev = device_create(&bch_chardev_class, NULL, - MKDEV(bch_chardev_major, U8_MAX), - NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) { - ret = PTR_ERR(bch_chardev); - goto class_out; - } - - return 0; - -class_out: - class_unregister(&bch_chardev_class); -major_out: - unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h deleted file mode 100644 index 0f563ca53c36e7..00000000000000 --- a/fs/bcachefs/chardev.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHARDEV_H -#define _BCACHEFS_CHARDEV_H - -#ifndef NO_BCACHEFS_FS - -long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); - -void bch2_fs_chardev_exit(struct bch_fs *); -int bch2_fs_chardev_init(struct bch_fs *); - -void bch2_chardev_exit(void); -int __init bch2_chardev_init(void); - -#else - -static inline long bch2_fs_ioctl(struct bch_fs *c, - unsigned cmd, void __user * arg) -{ - return -ENOTTY; -} - -static inline void 
bch2_fs_chardev_exit(struct bch_fs *c) {} -static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } - -static inline void bch2_chardev_exit(void) {} -static inline int __init bch2_chardev_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c deleted file mode 100644 index a6795e73f0b93f..00000000000000 --- a/fs/bcachefs/checksum.c +++ /dev/null @@ -1,698 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "errcode.h" -#include "error.h" -#include "super.h" -#include "super-io.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * bch2_checksum_state is an abstraction of the checksum state calculated over different pages. - * It features page merging without having the checksum algorithm lose its state. - * For native checksum algorithms (like crc), a default seed value will do. - * For hash-like algorithms, a state needs to be stored. - */ - -struct bch2_checksum_state { - union { - u64 seed; - struct xxh64_state h64state; - }; - unsigned int type; -}; - -static void bch2_checksum_init(struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - state->seed = 0; - break; - case BCH_CSUM_crc32c_nonzero: - state->seed = U32_MAX; - break; - case BCH_CSUM_crc64_nonzero: - state->seed = U64_MAX; - break; - case BCH_CSUM_xxhash: - xxh64_reset(&state->h64state, 0); - break; - default: - BUG(); - } -} - -static u64 bch2_checksum_final(const struct bch2_checksum_state *state) -{ - switch (state->type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return state->seed; - case BCH_CSUM_crc32c_nonzero: - return state->seed ^ U32_MAX; - case BCH_CSUM_crc64_nonzero: - return state->seed ^ U64_MAX; - case BCH_CSUM_xxhash: - return xxh64_digest(&state->h64state); - default: - BUG(); - } -} - -static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) -{ - switch (state->type) { - case BCH_CSUM_none: - return; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc32c: - state->seed = crc32c(state->seed, data, len); - break; - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc64: - state->seed = crc64_be(state->seed, data, len); - break; - case BCH_CSUM_xxhash: - xxh64_update(&state->h64state, data, len); - break; - default: - BUG(); - } -} - -static void bch2_chacha20_init(struct chacha_state *state, - const struct bch_key *key, struct nonce nonce) -{ - u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - - BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); - memcpy(key_words, key, sizeof(key_words)); - le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - - BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); - chacha_init(state, key_words, (const u8 *)nonce.d); - - memzero_explicit(key_words, sizeof(key_words)); -} - -void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) -{ - struct chacha_state state; - - bch2_chacha20_init(&state, key, nonce); - chacha20_crypt(&state, data, data, len); - chacha_zeroize_state(&state); -} - -static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, - struct bch_fs *c, struct nonce nonce) -{ - u8 key[POLY1305_KEY_SIZE] = { 0 }; - - nonce.d[3] ^= BCH_NONCE_POLY; - - bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); - poly1305_init(desc, key); -} - -struct bch_csum bch2_checksum(struct bch_fs *c, unsigned
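/*
 * (On the *_nonzero checksum variants handled below: they seed the CRC with
 * all ones and invert again at finalization, so a region of zeroes is very
 * unlikely to checksum to 0, which helps catch unwritten or zeroed-out
 * data. Sketch of the state lifecycle using the helpers above:
 *
 *	struct bch2_checksum_state s = { .type = BCH_CSUM_crc32c_nonzero };
 *
 *	bch2_checksum_init(&s);			seed = U32_MAX
 *	bch2_checksum_update(&s, data, len);	seed = crc32c(seed, data, len)
 *	u64 csum = bch2_checksum_final(&s);	returns seed ^ U32_MAX
 * )
 */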
type, - struct nonce nonce, const void *data, size_t len) -{ - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - - bch2_checksum_init(&state); - bch2_checksum_update(&state, data, len); - - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - poly1305_update(&dctx, data, len); - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -int bch2_encrypt(struct bch_fs *c, unsigned type, - struct nonce nonce, void *data, size_t len) -{ - if (!bch2_csum_type_is_encryption(type)) - return 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, no_encryption_key); - - bch2_chacha20(&c->chacha20_key, nonce, data, len); - return 0; -} - -static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv; - - switch (type) { - case BCH_CSUM_none: - return (struct bch_csum) { 0 }; - case BCH_CSUM_crc32c_nonzero: - case BCH_CSUM_crc64_nonzero: - case BCH_CSUM_crc32c: - case BCH_CSUM_xxhash: - case BCH_CSUM_crc64: { - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - bch2_checksum_update(&state, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; - } - - case BCH_CSUM_chacha20_poly1305_80: - case BCH_CSUM_chacha20_poly1305_128: { - struct poly1305_desc_ctx dctx; - u8 digest[POLY1305_DIGEST_SIZE]; - struct bch_csum ret = { 0 }; - - bch2_poly1305_init(&dctx, c, nonce); - -#ifdef CONFIG_HIGHMEM - __bio_for_each_segment(bv, bio, *iter, *iter) { - void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - - poly1305_update(&dctx, p, bv.bv_len); - kunmap_local(p); - } -#else - __bio_for_each_bvec(bv, bio, *iter, *iter) - poly1305_update(&dctx, - page_address(bv.bv_page) + bv.bv_offset, - bv.bv_len); -#endif - poly1305_final(&dctx, digest); - - memcpy(&ret, digest, bch_crc_bytes[type]); - return ret; - } - default: - return (struct bch_csum) {}; - } -} - -struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bvec_iter iter = bio->bi_iter; - - return __bch2_checksum_bio(c, type, nonce, bio, &iter); -} - -int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - struct bio_vec bv; - struct bvec_iter iter; - struct chacha_state chacha_state; - int ret = 0; - - if (bch2_fs_inconsistent_on(!c->chacha20_key_set, - c, "attempting to encrypt without encryption key")) - return bch_err_throw(c, no_encryption_key); - - bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); - - bio_for_each_segment(bv, bio, iter) { - void *p; - - /* - * 
chacha_crypt() assumes that the length is a multiple of - * CHACHA_BLOCK_SIZE on any non-final call. - */ - if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { - bch_err_ratelimited(c, "bio not aligned for encryption"); - ret = -EIO; - break; - } - - p = bvec_kmap_local(&bv); - chacha20_crypt(&chacha_state, p, p, bv.bv_len); - kunmap_local(p); - } - chacha_zeroize_state(&chacha_state); - return ret; -} - -struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, - struct bch_csum b, size_t b_len) -{ - struct bch2_checksum_state state; - - state.type = type; - bch2_checksum_init(&state); - state.seed = le64_to_cpu(a.lo); - - BUG_ON(!bch2_checksum_mergeable(type)); - - while (b_len) { - unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); - - bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), page_len); - b_len -= page_len; - } - a.lo = cpu_to_le64(bch2_checksum_final(&state)); - a.lo ^= b.lo; - a.hi ^= b.hi; - return a; -} - -int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, - struct bversion version, - struct bch_extent_crc_unpacked crc_old, - struct bch_extent_crc_unpacked *crc_a, - struct bch_extent_crc_unpacked *crc_b, - unsigned len_a, unsigned len_b, - unsigned new_csum_type) -{ - struct bvec_iter iter = bio->bi_iter; - struct nonce nonce = extent_nonce(version, crc_old); - struct bch_csum merged = { 0 }; - struct crc_split { - struct bch_extent_crc_unpacked *crc; - unsigned len; - unsigned csum_type; - struct bch_csum csum; - } splits[3] = { - { crc_a, len_a, new_csum_type, { 0 }}, - { crc_b, len_b, new_csum_type, { 0 } }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, - }, *i; - bool mergeable = crc_old.csum_type == new_csum_type && - bch2_checksum_mergeable(new_csum_type); - unsigned crc_nonce = crc_old.nonce; - - BUG_ON(len_a + len_b > bio_sectors(bio)); - BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); - BUG_ON(crc_is_compressed(crc_old)); - BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)); - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - iter.bi_size = i->len << 9; - if (mergeable || i->crc) - i->csum = __bch2_checksum_bio(c, i->csum_type, - nonce, bio, &iter); - else - bio_advance_iter(bio, &iter, i->len << 9); - nonce = nonce_add(nonce, i->len << 9); - } - - if (mergeable) - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) - merged = bch2_checksum_merge(new_csum_type, merged, - i->csum, i->len << 9); - else - merged = bch2_checksum_bio(c, crc_old.csum_type, - extent_nonce(version, crc_old), bio); - - if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" - " expected %0llx:%0llx got %0llx:%0llx (old type ", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo); - bch2_prt_csum_type(&buf, crc_old.csum_type); - prt_str(&buf, " new type "); - bch2_prt_csum_type(&buf, new_csum_type); - prt_str(&buf, ")"); - WARN_RATELIMIT(1, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, recompute_checksum); - } - - for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { - if (i->crc) - *i->crc = (struct bch_extent_crc_unpacked) { - .csum_type = i->csum_type, - .compression_type = crc_old.compression_type, - .compressed_size = i->len, - .uncompressed_size = i->len, - .offset = 0, - .live_size = i->len, - .nonce = crc_nonce, - .csum = i->csum, - }; - - if (bch2_csum_type_is_encryption(new_csum_type)) - crc_nonce 
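/*
 * (bch2_checksum_merge() above exploits a linearity property of these CRCs:
 * with a zero seed, crc(A || B) equals crc(A) extended by len(B) zero bytes,
 * xored with crc(B). That is why it streams ZERO_PAGE(0) through the
 * checksum state instead of touching the actual data, and why only the
 * non-inverting crc types (and BCH_CSUM_none) report
 * bch2_checksum_mergeable(). Usage sketch, assuming a and b cover adjacent
 * halves of an extent:
 *
 *	struct bch_csum whole =
 *		bch2_checksum_merge(BCH_CSUM_crc64, a, b, b_len_in_bytes);
 * )
 */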
+= i->len; - } - - return 0; -} - -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KDF: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); - prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); - prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); - prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); -} - -const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - -#ifdef __KERNEL__ -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - struct key *keyring_key; - const struct user_key_payload *ukp; - int ret; - - keyring_key = request_key(&key_type_user, key_description, NULL); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - - down_read(&keyring_key->sem); - ukp = dereference_key_locked(keyring_key); - if (ukp->datalen == sizeof(*key)) { - memcpy(key, ukp->data, ukp->datalen); - ret = 0; - } else { - ret = -EINVAL; - } - up_read(&keyring_key->sem); - key_put(keyring_key); - - return ret; -} -#else -#include - -static int __bch2_request_key(char *key_description, struct bch_key *key) -{ - key_serial_t key_id; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_KEYRING); - if (key_id >= 0) - goto got_key; - - key_id = request_key("user", key_description, NULL, - KEY_SPEC_USER_SESSION_KEYRING); - if (key_id >= 0) - goto got_key; - - return -errno; -got_key: - - if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) - return -1; - - return 0; -} - -#include "crypto.h" -#endif - -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -{ - struct printbuf key_description = PRINTBUF; - int ret; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - ret = __bch2_request_key(key_description.buf, key); - printbuf_exit(&key_description); - -#ifndef __KERNEL__ - if (ret) { - char *passphrase = read_passphrase("Enter passphrase: "); - struct bch_encrypted_key sb_key; - - bch2_passphrase_check(sb, passphrase, - key, &sb_key); - ret = 0; - } -#endif - - /* stash with memfd, pass memfd fd to mount */ - - return ret; -} - -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *sb) -{ - key_serial_t key_id; - struct printbuf key_description = PRINTBUF; - - prt_printf(&key_description, "bcachefs:"); - pr_uuid(&key_description, sb->user_uuid.b); - - key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); - printbuf_exit(&key_description); - if (key_id < 0) - return errno; - - keyctl_revoke(key_id); - - return 0; -} -#endif - -int bch2_decrypt_sb_key(struct bch_fs *c, - struct bch_sb_field_crypt *crypt, - struct bch_key *key) -{ - struct
bch_encrypted_key sb_key = crypt->key; - struct bch_key user_key; - int ret = 0; - - /* is key encrypted? */ - if (!bch2_key_is_encrypted(&sb_key)) - goto out; - - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - /* decrypt real key: */ - bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); - - if (bch2_key_is_encrypted(&sb_key)) { - bch_err(c, "incorrect encryption key"); - ret = -EINVAL; - goto err; - } -out: - *key = sb_key.key; -err: - memzero_explicit(&sb_key, sizeof(sb_key)); - memzero_explicit(&user_key, sizeof(user_key)); - return ret; -} - -#if 0 - -/* - * This seems to be duplicating code in cmd_remove_passphrase() in - * bcachefs-tools, but we might want to switch userspace to use this - and - * perhaps add an ioctl for calling this at runtime, so we can take the - * passphrase off of a mounted filesystem (which has come up). - */ -int bch2_disable_encryption(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - goto out; - - /* is key encrypted? */ - ret = 0; - if (bch2_key_is_encrypted(&crypt->key)) - goto out; - - ret = bch2_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; - - crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); - crypt->key.key = key; - - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* - * For enabling encryption on an existing filesystem: not hooked up yet, but it - * should be - */ -int bch2_enable_encryption(struct bch_fs *c, bool keyed) -{ - struct bch_encrypted_key key; - struct bch_key user_key; - struct bch_sb_field_crypt *crypt; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); - - /* Do we already have an encryption key? 
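 (Context, from bch2_decrypt_sb_key() above: keys are wrapped two levels
 deep. The superblock carries a bch_encrypted_key; the user key fetched
 from the keyring unwraps it in place with ChaCha20, and a magic constant
 shows whether the passphrase was right. Roughly:

	bch2_request_key(c->disk_sb.sb, &user_key);
	bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
	if (bch2_key_is_encrypted(&sb_key))
		the magic did not decrypt: wrong key or passphrase

 )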
*/ - if (bch2_sb_field_get(c->disk_sb.sb, crypt)) - goto err; - - ret = bch2_alloc_ciphers(c); - if (ret) - goto err; - - key.magic = cpu_to_le64(BCH_KEY_MAGIC); - get_random_bytes(&key.key, sizeof(key.key)); - - if (keyed) { - ret = bch2_request_key(c->disk_sb.sb, &user_key); - if (ret) { - bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); - goto err; - } - - ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &key, sizeof(key)); - if (ret) - goto err; - } - - ret = crypto_skcipher_setkey(&c->chacha20->base, - (void *) &key.key, sizeof(key.key)); - if (ret) - goto err; - - crypt = bch2_sb_field_resize(&c->disk_sb, crypt, - sizeof(*crypt) / sizeof(u64)); - if (!crypt) { - ret = bch_err_throw(c, ENOSPC_sb_crypt); - goto err; - } - - crypt->key = key; - - /* write superblock */ - SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); - bch2_write_super(c); -err: - mutex_unlock(&c->sb_lock); - memzero_explicit(&user_key, sizeof(user_key)); - memzero_explicit(&key, sizeof(key)); - return ret; -} -#endif - -void bch2_fs_encryption_exit(struct bch_fs *c) -{ - memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); -} - -int bch2_fs_encryption_init(struct bch_fs *c) -{ - struct bch_sb_field_crypt *crypt; - int ret; - - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); - if (!crypt) - return 0; - - ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); - if (ret) - return ret; - c->chacha20_key_set = true; - return 0; -} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h deleted file mode 100644 index 7bd9cf6104ca12..00000000000000 --- a/fs/bcachefs/checksum.h +++ /dev/null @@ -1,240 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CHECKSUM_H -#define _BCACHEFS_CHECKSUM_H - -#include "bcachefs.h" -#include "extents_types.h" -#include "super-io.h" - -#include -#include - -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_none: - case BCH_CSUM_crc32c: - case BCH_CSUM_crc64: - return true; - default: - return false; - } -} - -struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, - struct bch_csum, size_t); - -#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -#define BCH_NONCE_POLY cpu_to_le32(1 << 31) - -struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, - const void *, size_t); - -/* - * This is used for various on disk data structures - bch_sb, prio_set, bset, - * jset: The checksum is _always_ the first field of these structs - */ -#define csum_vstruct(_c, _type, _nonce, _i) \ -({ \ - const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ - \ - bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ -}) - -static inline void bch2_csum_to_text(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum csum) -{ - const u8 *p = (u8 *) &csum; - unsigned bytes = type < BCH_CSUM_NR ? 
bch_crc_bytes[type] : 16; - - for (unsigned i = 0; i < bytes; i++) - prt_hex_byte(out, p[i]); -} - -static inline void bch2_csum_err_msg(struct printbuf *out, - enum bch_csum_type type, - struct bch_csum expected, - struct bch_csum got) -{ - prt_str(out, "checksum error, type "); - bch2_prt_csum_type(out, type); - prt_str(out, ": got "); - bch2_csum_to_text(out, type, got); - prt_str(out, " should be "); - bch2_csum_to_text(out, type, expected); -} - -void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); - -int bch2_request_key(struct bch_sb *, struct bch_key *); -#ifndef __KERNEL__ -int bch2_revoke_key(struct bch_sb *); -#endif - -int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, - void *data, size_t); - -struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, - struct bch_extent_crc_unpacked, - struct bch_extent_crc_unpacked *, - struct bch_extent_crc_unpacked *, - unsigned, unsigned, unsigned); - -int __bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); - -static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) -{ - return bch2_csum_type_is_encryption(type) - ? __bch2_encrypt_bio(c, type, nonce, bio) - : 0; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; - -int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, - struct bch_key *); - -#if 0 -int bch2_disable_encryption(struct bch_fs *); -int bch2_enable_encryption(struct bch_fs *, bool); -#endif - -void bch2_fs_encryption_exit(struct bch_fs *); -int bch2_fs_encryption_init(struct bch_fs *); - -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, - bool data) -{ - switch (type) { - case BCH_CSUM_OPT_none: - return BCH_CSUM_none; - case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; - case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; - case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_xxhash; - default: - BUG(); - } -} - -static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - struct bch_io_opts opts) -{ - if (opts.nocow) - return 0; - - if (c->sb.encryption_type) - return c->opts.wide_macs - ? BCH_CSUM_chacha20_poly1305_128 - : BCH_CSUM_chacha20_poly1305_80; - - return bch2_csum_opt_to_type(opts.data_checksum, true); -} - -static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -{ - if (c->sb.encryption_type) - return BCH_CSUM_chacha20_poly1305_128; - - return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -} - -static inline bool bch2_checksum_type_valid(const struct bch_fs *c, - unsigned type) -{ - if (type >= BCH_CSUM_NR) - return false; - - if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) - return false; - - return true; -} - -/* returns true if not equal */ -static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -{ - /* - * XXX: need some way of preventing the compiler from optimizing this - * into a form that isn't constant time.. 
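 *
 * A common mitigation - shown here only as a sketch under the
 * hypothetical name csum_neq_ct, not as bcachefs's actual helper - is
 * to force the accumulated difference through a volatile object so the
 * compiler can't short-circuit the comparison:
 *
 *	static inline bool csum_neq_ct(struct bch_csum l, struct bch_csum r)
 *	{
 *		volatile u64 diff = (l.lo ^ r.lo) | (l.hi ^ r.hi);
 *		return diff != 0;
 *	}
 *
 * This computes the same xor/or expression used below.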
- */ - return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -} - -/* for skipping ahead and encrypting/decrypting at an offset: */ -static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -{ - EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - - le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); - return nonce; -} - -static inline struct nonce null_nonce(void) -{ - struct nonce ret; - - memset(&ret, 0, sizeof(ret)); - return ret; -} - -static inline struct nonce extent_nonce(struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - unsigned compression_type = crc_is_compressed(crc) - ? crc.compression_type - : 0; - unsigned size = compression_type ? crc.uncompressed_size : 0; - struct nonce nonce = (struct nonce) {{ - [0] = cpu_to_le32(size << 22), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; - - return nonce_add(nonce, crc.nonce << 9); -} - -static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -{ - return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -} - -static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -{ - __le64 magic = __bch2_sb_magic(sb); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -{ - __le64 magic = bch2_sb_magic(c); - - return (struct nonce) {{ - [0] = 0, - [1] = 0, - [2] = ((__le32 *) &magic)[0], - [3] = ((__le32 *) &magic)[1], - }}; -} - -#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c deleted file mode 100644 index 8e9264b5a84e3d..00000000000000 --- a/fs/bcachefs/clock.c +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "clock.h" - -#include -#include -#include - -static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - return (*_l)->expire < (*_r)->expire; -} - -static const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = NULL, -}; - -void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { - spin_unlock(&clock->timer_lock); - timer->fn(timer); - return; - } - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) - goto out; - - BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); -out: - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -{ - spin_lock(&clock->timer_lock); - - for (size_t i = 0; i < clock->timers.nr; i++) - if (clock->timers.data[i] == timer) { - min_heap_del(&clock->timers, i, &callbacks, NULL); - break; - } - - spin_unlock(&clock->timer_lock); -} - -struct io_clock_wait { - struct io_timer io_timer; - struct task_struct *task; - int expired; -}; - -static void io_clock_wait_fn(struct io_timer *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, io_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) -{ - struct io_clock_wait wait = { - .io_timer.expire = until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = (void *) _RET_IP_, - .task = current, - }; - - 
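	/*
	 * The wait object lives on this stack frame: io_clock_wait_fn() wakes
	 * the task once the IO clock reaches @until, and the timer must be
	 * deleted again before returning so the clock's timer heap never
	 * holds a pointer into a dead stack frame.
	 */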
bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); -} - -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait = { - .io_timer.expire = io_until, - .io_timer.fn = io_clock_wait_fn, - .io_timer.fn2 = (void *) _RET_IP_, - .task = current, - }; - - bch2_io_timer_add(clock, &wait.io_timer); - - set_current_state(TASK_INTERRUPTIBLE); - if (!(kthread && kthread_should_stop())) { - cpu_timeout = schedule_timeout(cpu_timeout); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - bch2_io_timer_del(clock, &wait.io_timer); - return cpu_timeout; -} - -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - - while (!(kthread && kthread_should_stop()) && - cpu_timeout && - atomic64_read(&clock->now) < io_until) - cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); -} - -static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) -{ - struct io_timer *ret = NULL; - - if (clock->timers.nr && - time_after_eq64(now, clock->timers.data[0]->expire)) { - ret = *min_heap_peek(&clock->timers); - min_heap_pop(&clock->timers, &callbacks, NULL); - } - - return ret; -} - -void __bch2_increment_clock(struct io_clock *clock, u64 sectors) -{ - struct io_timer *timer; - u64 now = atomic64_add_return(sectors, &clock->now); - - spin_lock(&clock->timer_lock); - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); - spin_unlock(&clock->timer_lock); -} - -void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -{ - out->atomic++; - spin_lock(&clock->timer_lock); - u64 now = atomic64_read(&clock->now); - - printbuf_tabstop_push(out, 40); - prt_printf(out, "current time:\t%llu\n", now); - - for (unsigned i = 0; i < clock->timers.nr; i++) - prt_printf(out, "%ps %ps:\t%llu\n", - clock->timers.data[i]->fn, - clock->timers.data[i]->fn2, - clock->timers.data[i]->expire); - spin_unlock(&clock->timer_lock); - --out->atomic; -} - -void bch2_io_clock_exit(struct io_clock *clock) -{ - free_heap(&clock->timers); - free_percpu(clock->pcpu_buf); -} - -int bch2_io_clock_init(struct io_clock *clock) -{ - atomic64_set(&clock->now, 0); - spin_lock_init(&clock->timer_lock); - - clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); - - clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); - if (!clock->pcpu_buf) - return -BCH_ERR_ENOMEM_io_clock_init; - - if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -BCH_ERR_ENOMEM_io_clock_init; - - return 0; -} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h deleted file mode 100644 index 8769be2aa21e89..00000000000000 --- a/fs/bcachefs/clock.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_H -#define _BCACHEFS_CLOCK_H - -void bch2_io_timer_add(struct io_clock *, struct io_timer *); -void bch2_io_timer_del(struct io_clock *, struct io_timer *); -unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); -void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); - -void __bch2_increment_clock(struct io_clock *, u64); - -static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, - int rw) -{ - struct io_clock *clock = &c->io_clock[rw]; - - if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= - 
IO_CLOCK_PCPU_SECTORS)) - __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -} - -void bch2_io_clock_schedule_timeout(struct io_clock *, u64); - -void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); - -void bch2_io_clock_exit(struct io_clock *); -int bch2_io_clock_init(struct io_clock *); - -#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h deleted file mode 100644 index 37554e4514fe70..00000000000000 --- a/fs/bcachefs/clock_types.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_CLOCK_TYPES_H -#define _BCACHEFS_CLOCK_TYPES_H - -#include "util.h" - -#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) - -/* - * Clocks/timers in units of sectors of IO: - * - * Note - they use percpu batching, so they're only approximate. - */ - -struct io_timer; -typedef void (*io_timer_fn)(struct io_timer *); - -struct io_timer { - io_timer_fn fn; - void *fn2; - u64 expire; -}; - -/* Amount to buffer up on a percpu counter */ -#define IO_CLOCK_PCPU_SECTORS 128 - -typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap; - -struct io_clock { - atomic64_t now; - u16 __percpu *pcpu_buf; - unsigned max_slop; - - spinlock_t timer_lock; - io_timer_heap timers; -}; - -#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c deleted file mode 100644 index b37b1f325f0ae5..00000000000000 --- a/fs/bcachefs/compress.c +++ /dev/null @@ -1,773 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "checksum.h" -#include "compress.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "opts.h" -#include "super-io.h" - -#include -#include -#include - -static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) -{ - switch (type) { - case BCH_COMPRESSION_TYPE_none: - case BCH_COMPRESSION_TYPE_incompressible: - return BCH_COMPRESSION_OPT_none; - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - return BCH_COMPRESSION_OPT_lz4; - case BCH_COMPRESSION_TYPE_gzip: - return BCH_COMPRESSION_OPT_gzip; - case BCH_COMPRESSION_TYPE_zstd: - return BCH_COMPRESSION_OPT_zstd; - default: - BUG(); - } -} - -/* Bounce buffer: */ -struct bbuf { - void *b; - enum { - BB_NONE, - BB_VMAP, - BB_KMALLOC, - BB_MEMPOOL, - } type; - int rw; -}; - -static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -{ - void *b; - - BUG_ON(size > c->opts.encoded_extent_max); - - b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); - if (b) - return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); - if (b) - return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - - BUG(); -} - -static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -{ - struct bio_vec bv; - struct bvec_iter iter; - void *expected_start = NULL; - - __bio_for_each_bvec(bv, bio, iter, start) { - if (expected_start && - expected_start != page_address(bv.bv_page) + bv.bv_offset) - return false; - - expected_start = page_address(bv.bv_page) + - bv.bv_offset + bv.bv_len; - } - - return true; -} - -static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, - struct bvec_iter start, int rw) -{ - struct bbuf ret; - struct bio_vec bv; - struct bvec_iter iter; - unsigned nr_pages = 0; - struct page *stack_pages[16]; - struct page **pages = NULL; - void *data; - - BUG_ON(start.bi_size > c->opts.encoded_extent_max); - - if 
(!PageHighMem(bio_iter_page(bio, start)) && - bio_phys_contig(bio, start)) - return (struct bbuf) { - .b = page_address(bio_iter_page(bio, start)) + - bio_iter_offset(bio, start), - .type = BB_NONE, .rw = rw - }; - - /* check if we can map the pages contiguously: */ - __bio_for_each_segment(bv, bio, iter, start) { - if (iter.bi_size != start.bi_size && - bv.bv_offset) - goto bounce; - - if (bv.bv_len < iter.bi_size && - bv.bv_offset + bv.bv_len < PAGE_SIZE) - goto bounce; - - nr_pages++; - } - - BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); - - pages = nr_pages > ARRAY_SIZE(stack_pages) - ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) - : stack_pages; - if (!pages) - goto bounce; - - nr_pages = 0; - __bio_for_each_segment(bv, bio, iter, start) - pages[nr_pages++] = bv.bv_page; - - data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (pages != stack_pages) - kfree(pages); - - if (data) - return (struct bbuf) { - .b = data + bio_iter_offset(bio, start), - .type = BB_VMAP, .rw = rw - }; -bounce: - ret = __bounce_alloc(c, start.bi_size, rw); - - if (rw == READ) - memcpy_from_bio(ret.b, bio, start); - - return ret; -} - -static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -{ - return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -} - -static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -{ - switch (buf.type) { - case BB_NONE: - break; - case BB_VMAP: - vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); - break; - case BB_KMALLOC: - kfree(buf.b); - break; - case BB_MEMPOOL: - mempool_free(buf.b, &c->compression_bounce[buf.rw]); - break; - } -} - -static inline void zlib_set_workspace(z_stream *strm, void *workspace) -{ -#ifdef __KERNEL__ - strm->workspace = workspace; -#endif -} - -static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc_unpacked crc) -{ - struct bbuf src_data = { NULL }; - size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc.uncompressed_size << 9; - void *workspace; - int ret = 0, ret2; - - enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); - mempool_t *workspace_pool = &c->compress_workspace[opt]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_type_not_marked_in_sb, - "compression type %s set but not marked in superblock", - __bch2_compression_types[crc.compression_type])) - ret = bch2_check_set_has_compressed_data(c, opt); - else - ret = bch_err_throw(c, compression_workspace_not_initialized); - if (ret) - goto err; - } - - src_data = bio_map_or_bounce(c, src, READ); - - switch (crc.compression_type) { - case BCH_COMPRESSION_TYPE_lz4_old: - case BCH_COMPRESSION_TYPE_lz4: - ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_lz4); - break; - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src_data.b, - .avail_in = src_len, - .next_out = dst_data, - .avail_out = dst_len, - }; - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - zlib_set_workspace(&strm, workspace); - zlib_inflateInit2(&strm, -MAX_WBITS); - ret2 = zlib_inflate(&strm, Z_FINISH); - - mempool_free(workspace, workspace_pool); - - if (ret2 != Z_STREAM_END) - ret = bch_err_throw(c, decompress_gzip); - break; - } - case BCH_COMPRESSION_TYPE_zstd: { - ZSTD_DCtx *ctx; - size_t real_src_len = le32_to_cpup(src_data.b); - - if (real_src_len > src_len - 4) { - ret = bch_err_throw(c, 
decompress_zstd_src_len_bad); - goto err; - } - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - - ret2 = zstd_decompress_dctx(ctx, - dst_data, dst_len, - src_data.b + 4, real_src_len); - - mempool_free(workspace, workspace_pool); - - if (ret2 != dst_len) - ret = bch_err_throw(c, decompress_zstd); - break; - } - default: - BUG(); - } -err: -fsck_err: - bio_unmap_or_unbounce(c, src_data); - return ret; -} - -int bch2_bio_uncompress_inplace(struct bch_write_op *op, - struct bio *bio) -{ - struct bch_fs *c = op->c; - struct bch_extent_crc_unpacked *crc = &op->crc; - struct bbuf data = { NULL }; - size_t dst_len = crc->uncompressed_size << 9; - int ret = 0; - - /* bio must own its pages: */ - BUG_ON(!bio->bi_vcnt); - BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { - bch2_write_op_error(op, op->pos.offset, - "extent too big to decompress (%u > %u)", - crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - } - - data = __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, bio, data.b, *crc); - - if (c->opts.no_data_io) - ret = 0; - - if (ret) { - bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); - goto err; - } - - /* - * XXX: don't have a good way to assert that the bio was allocated with - * enough space, we depend on bch2_move_extent doing the right thing - */ - bio->bi_iter.bi_size = crc->live_size << 9; - - memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); - - crc->csum_type = 0; - crc->compression_type = 0; - crc->compressed_size = crc->live_size; - crc->uncompressed_size = crc->live_size; - crc->offset = 0; - crc->csum = (struct bch_csum) { 0, 0 }; -err: - bio_unmap_or_unbounce(c, data); - return ret; -} - -int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, - struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc_unpacked crc) -{ - struct bbuf dst_data = { NULL }; - size_t dst_len = crc.uncompressed_size << 9; - int ret; - - if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || - crc.compressed_size << 9 > c->opts.encoded_extent_max) - return bch_err_throw(c, decompress_exceeded_max_encoded_extent); - - dst_data = dst_len == dst_iter.bi_size - ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) - : __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, src, dst_data.b, crc); - if (ret) - goto err; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data); - return ret; -} - -static int attempt_compress(struct bch_fs *c, - void *workspace, - void *dst, size_t dst_len, - void *src, size_t src_len, - struct bch_compression_opt compression) -{ - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - - switch (compression_type) { - case BCH_COMPRESSION_TYPE_lz4: - if (compression.level < LZ4HC_MIN_CLEVEL) { - int len = src_len; - int ret = LZ4_compress_destSize( - src, dst, - &len, dst_len, - workspace); - if (len < src_len) - return -len; - - return ret; - } else { - int ret = LZ4_compress_HC( - src, dst, - src_len, dst_len, - compression.level, - workspace); - - return ret ?: -1; - } - case BCH_COMPRESSION_TYPE_gzip: { - z_stream strm = { - .next_in = src, - .avail_in = src_len, - .next_out = dst, - .avail_out = dst_len, - }; - - zlib_set_workspace(&strm, workspace); - if (zlib_deflateInit2(&strm, - compression.level - ? clamp_t(unsigned, compression.level, - Z_BEST_SPEED, Z_BEST_COMPRESSION) - : Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY) != Z_OK) - return 0; - - if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) - return 0; - - if (zlib_deflateEnd(&strm) != Z_OK) - return 0; - - return strm.total_out; - } - case BCH_COMPRESSION_TYPE_zstd: { - /* - * rescale: - * zstd max compression level is 22, our max level is 15 - */ - unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); - ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); - ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); - - /* - * ZSTD requires that when we decompress we pass in the exact - * compressed size - rounding it up to the nearest sector - * doesn't work, so we use the first 4 bytes of the buffer for - * that. - * - * Additionally, the ZSTD code seems to have a bug where it will - * write just past the end of the buffer - so subtract a fudge - * factor (7 bytes) from the dst buffer size to account for - * that. 
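 *
 * The resulting layout - sketched below in illustrative userspace C,
 * where compress_fn and dst_cap are hypothetical stand-ins for the
 * zstd call and the destination buffer size - is a 4-byte little-endian
 * length prefix followed by the compressed payload:
 *
 *	size_t clen = compress_fn(dst + 4, dst_cap - 4 - 7, src, src_len);
 *	uint32_t hdr = htole32((uint32_t) clen);  // exact compressed size
 *	memcpy(dst, &hdr, 4);                     // 4-byte length prefix
 *	// stored size is clen + 4; decompression reads the prefix back to
 *	// recover the exact source length (cf. __bio_uncompress() above)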
- */ - size_t len = zstd_compress_cctx(ctx, - dst + 4, dst_len - 4 - 7, - src, src_len, - &params); - if (zstd_is_error(len)) - return 0; - - *((__le32 *) dst) = cpu_to_le32(len); - return len + 4; - } - default: - BUG(); - } -} - -static unsigned __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - struct bch_compression_opt compression) -{ - struct bbuf src_data = { NULL }, dst_data = { NULL }; - void *workspace; - enum bch_compression_type compression_type = - __bch2_compression_opt_to_type[compression.type]; - unsigned pad; - int ret = 0; - - /* bch2_compression_decode catches unknown compression types: */ - BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); - - mempool_t *workspace_pool = &c->compress_workspace[compression.type]; - if (unlikely(!mempool_initialized(workspace_pool))) { - if (fsck_err(c, compression_opt_not_marked_in_sb, - "compression opt %s set but not marked in superblock", - bch2_compression_opts[compression.type])) { - ret = bch2_check_set_has_compressed_data(c, compression.type); - if (ret) /* memory allocation failure, don't compress */ - return 0; - } else { - return 0; - } - } - - /* If it's only one block, don't bother trying to compress: */ - if (src->bi_iter.bi_size <= c->opts.block_size) - return BCH_COMPRESSION_TYPE_incompressible; - - dst_data = bio_map_or_bounce(c, dst, WRITE); - src_data = bio_map_or_bounce(c, src, READ); - - workspace = mempool_alloc(workspace_pool, GFP_NOFS); - - *src_len = src->bi_iter.bi_size; - *dst_len = dst->bi_iter.bi_size; - - /* - * XXX: this algorithm sucks when the compression code doesn't tell us - * how much would fit, like LZ4 does: - */ - while (1) { - if (*src_len <= block_bytes(c)) { - ret = -1; - break; - } - - ret = attempt_compress(c, workspace, - dst_data.b, *dst_len, - src_data.b, *src_len, - compression); - if (ret > 0) { - *dst_len = ret; - ret = 0; - break; - } - - /* Didn't fit: should we retry with a smaller amount?
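 *
 * LZ4 answers that directly: attempt_compress() returns the negated
 * count of source bytes that actually fit, so we can jump straight to a
 * workable size. gzip and zstd give no such hint, so we halve the
 * overshoot and round down to the filesystem block size before retrying.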
*/ - if (*src_len <= *dst_len) { - ret = -1; - break; - } - - /* - * If ret is negative, it's a hint as to how much data would fit - */ - BUG_ON(-ret >= *src_len); - - if (ret < 0) - *src_len = -ret; - else - *src_len -= (*src_len - *dst_len) / 2; - *src_len = round_down(*src_len, block_bytes(c)); - } - - mempool_free(workspace, workspace_pool); - - if (ret) - goto err; - - /* Didn't get smaller: */ - if (round_up(*dst_len, block_bytes(c)) >= *src_len) - goto err; - - pad = round_up(*dst_len, block_bytes(c)) - *dst_len; - - memset(dst_data.b + *dst_len, 0, pad); - *dst_len += pad; - - if (dst_data.type != BB_NONE && - dst_data.type != BB_VMAP) - memcpy_to_bio(dst, dst->bi_iter, dst_data.b); - - BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); - BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); - BUG_ON(*dst_len & (block_bytes(c) - 1)); - BUG_ON(*src_len & (block_bytes(c) - 1)); - ret = compression_type; -out: - bio_unmap_or_unbounce(c, src_data); - bio_unmap_or_unbounce(c, dst_data); - return ret; -err: - ret = BCH_COMPRESSION_TYPE_incompressible; - goto out; -fsck_err: - ret = 0; - goto out; -} - -unsigned bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned compression_opt) -{ - unsigned orig_dst = dst->bi_iter.bi_size; - unsigned orig_src = src->bi_iter.bi_size; - unsigned compression_type; - - /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ - src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->opts.encoded_extent_max); - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - - compression_type = - __bio_compress(c, dst, dst_len, src, src_len, - bch2_compression_decode(compression_opt)); - - dst->bi_iter.bi_size = orig_dst; - src->bi_iter.bi_size = orig_src; - return compression_type; -} - -static int __bch2_fs_compress_init(struct bch_fs *, u64); - -#define BCH_FEATURE_none 0 - -static const unsigned bch2_compression_opt_to_feature[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -#undef BCH_FEATURE_none - -static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -{ - int ret = 0; - - if ((c->sb.features & f) == f) - return 0; - - mutex_lock(&c->sb_lock); - - if ((c->sb.features & f) == f) { - mutex_unlock(&c->sb_lock); - return 0; - } - - ret = __bch2_fs_compress_init(c, c->sb.features|f); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } - - c->disk_sb.sb->features[0] |= cpu_to_le64(f); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -int bch2_check_set_has_compressed_data(struct bch_fs *c, - unsigned compression_opt) -{ - unsigned compression_type = bch2_compression_decode(compression_opt).type; - - BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); - - return compression_type - ? 
__bch2_check_set_has_compressed_data(c, - 1ULL << bch2_compression_opt_to_feature[compression_type]) - : 0; -} - -void bch2_fs_compress_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) - mempool_exit(&c->compress_workspace[i]); - mempool_exit(&c->compression_bounce[WRITE]); - mempool_exit(&c->compression_bounce[READ]); -} - -static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -{ - ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), - c->opts.encoded_extent_max); - - c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); - - struct { - unsigned feature; - enum bch_compression_opts type; - size_t compress_workspace; - } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, - max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize()) }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, - max(c->zstd_workspace_size, - zstd_dctx_workspace_bound()) }, - }, *i; - bool have_compressed = false; - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) - have_compressed |= (features & (1 << i->feature)) != 0; - - if (!have_compressed) - return 0; - - if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_read_init); - - if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) - return bch_err_throw(c, ENOMEM_compression_bounce_write_init); - - for (i = compression_types; - i < compression_types + ARRAY_SIZE(compression_types); - i++) { - if (!(features & (1 << i->feature))) - continue; - - if (mempool_initialized(&c->compress_workspace[i->type])) - continue; - - if (mempool_init_kvmalloc_pool( - &c->compress_workspace[i->type], - 1, i->compress_workspace)) - return bch_err_throw(c, ENOMEM_compression_workspace_init); - } - - return 0; -} - -static u64 compression_opt_to_feature(unsigned v) -{ - unsigned type = bch2_compression_decode(v).type; - - return BIT_ULL(bch2_compression_opt_to_feature[type]); -} - -int bch2_fs_compress_init(struct bch_fs *c) -{ - u64 f = c->sb.features; - - f |= compression_opt_to_feature(c->opts.compression); - f |= compression_opt_to_feature(c->opts.background_compression); - - return __bch2_fs_compress_init(c, f); -} - -int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, - struct printbuf *err) -{ - char *val = kstrdup(_val, GFP_KERNEL); - char *p = val, *type_str, *level_str; - struct bch_compression_opt opt = { 0 }; - int ret; - - if (!val) - return -ENOMEM; - - type_str = strsep(&p, ":"); - level_str = p; - - ret = match_string(bch2_compression_opts, -1, type_str); - if (ret < 0 && err) - prt_printf(err, "invalid compression type\n"); - if (ret < 0) - goto err; - - opt.type = ret; - - if (level_str) { - unsigned level; - - ret = kstrtouint(level_str, 10, &level); - if (!ret && !opt.type && level) - ret = -EINVAL; - if (!ret && level > 15) - ret = -EINVAL; - if (ret < 0 && err) - prt_printf(err, "invalid compression level\n"); - if (ret < 0) - goto err; - - opt.level = level; - } - - *res = bch2_compression_encode(opt); -err: - kfree(val); - return ret; -} - -void bch2_compression_opt_to_text(struct printbuf *out, u64 v) -{ - struct 
bch_compression_opt opt = bch2_compression_decode(v); - - if (opt.type < BCH_COMPRESSION_OPT_NR) - prt_str(out, bch2_compression_opts[opt.type]); - else - prt_printf(out, "(unknown compression opt %u)", opt.type); - if (opt.level) - prt_printf(out, ":%u", opt.level); -} - -void bch2_opt_compression_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - return bch2_compression_opt_to_text(out, v); -} - -int bch2_opt_compression_validate(u64 v, struct printbuf *err) -{ - if (!bch2_compression_opt_valid(v)) { - prt_printf(err, "invalid compression opt %llu", v); - return -BCH_ERR_invalid_sb_opt_compression; - } - - return 0; -} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h deleted file mode 100644 index bec2f05bfd52ac..00000000000000 --- a/fs/bcachefs/compress.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COMPRESS_H -#define _BCACHEFS_COMPRESS_H - -#include "extents_types.h" - -static const unsigned __bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - -struct bch_compression_opt { - u8 type:4, - level:4; -}; - -static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) -{ - return (struct bch_compression_opt) { - .type = v & 15, - .level = v >> 4, - }; -} - -static inline bool bch2_compression_opt_valid(unsigned v) -{ - struct bch_compression_opt opt = __bch2_compression_decode(v); - - return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); -} - -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) -{ - return bch2_compression_opt_valid(v) - ? __bch2_compression_decode(v) - : (struct bch_compression_opt) { 0 }; -} - -static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) -{ - return opt.type|(opt.level << 4); -} - -static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) -{ - return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; -} - -struct bch_write_op; -int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); -int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc_unpacked); -unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned); - -int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -void bch2_fs_compress_exit(struct bch_fs *); -int bch2_fs_compress_init(struct bch_fs *); - -void bch2_compression_opt_to_text(struct printbuf *, u64); - -int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -int bch2_opt_compression_validate(u64, struct printbuf *); - -#define bch2_opt_compression (struct bch_opt_fn) { \ - .parse = bch2_opt_compression_parse, \ - .to_text = bch2_opt_compression_to_text, \ - .validate = bch2_opt_compression_validate, \ -} - -#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c deleted file mode 100644 index e86d36d23e9e30..00000000000000 --- a/fs/bcachefs/darray.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include "darray.h" - -int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) -{ - if (new_size > d->size) { - new_size = roundup_pow_of_two(new_size); 
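		/*
		 * Rounding the requested size up to a power of two gives
		 * geometric growth: a run of darray_push() calls triggers only
		 * O(log n) reallocations, keeping the copying cost amortized
		 * O(1) per element.
		 */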
- - /* - * This is a workaround: kvmalloc() doesn't support > INT_MAX - * allocations, but vmalloc() does. - * The limit needs to be lifted from kvmalloc, and when it does - * we'll go back to just using that. - */ - size_t bytes; - if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) - return -ENOMEM; - - void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) - : vmalloc_noprof(bytes); - if (!data) - return -ENOMEM; - - if (d->size) - memcpy(data, d->data, d->size * element_size); - if (d->data != d->preallocated) - kvfree(d->data); - d->data = data; - d->size = new_size; - } - - return 0; -} diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h deleted file mode 100644 index 4080ee99aaddba..00000000000000 --- a/fs/bcachefs/darray.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DARRAY_H -#define _BCACHEFS_DARRAY_H - -/* - * Dynamic arrays: - * - * Inspired by CCAN's darray - */ - -#include -#include - -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; -typedef DARRAY(const char *) darray_const_str; - -typedef DARRAY(u8) darray_u8; -typedef DARRAY(u16) darray_u16; -typedef DARRAY(u32) darray_u32; -typedef DARRAY(u64) darray_u64; - -typedef DARRAY(s8) darray_s8; -typedef DARRAY(s16) darray_s16; -typedef DARRAY(s32) darray_s32; -typedef DARRAY(s64) darray_s64; - -int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); - -#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) - -#define __darray_resize(_d, _element_size, _new_size, _gfp) \ - (unlikely((_new_size) > (_d)->size) \ - ? 
__bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ - : 0) - -#define darray_resize_gfp(_d, _new_size, _gfp) \ - __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) - -#define darray_resize(_d, _new_size) \ - darray_resize_gfp(_d, _new_size, GFP_KERNEL) - -#define darray_make_room_gfp(_d, _more, _gfp) \ - darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) - -#define darray_make_room(_d, _more) \ - darray_make_room_gfp(_d, _more, GFP_KERNEL) - -#define darray_room(_d) ((_d).size - (_d).nr) - -#define darray_top(_d) ((_d).data[(_d).nr]) - -#define darray_push_gfp(_d, _item, _gfp) \ -({ \ - int _ret = darray_make_room_gfp((_d), 1, _gfp); \ - \ - if (!_ret) \ - (_d)->data[(_d)->nr++] = (_item); \ - _ret; \ -}) - -#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) - -#define darray_pop(_d) ((_d)->data[--(_d)->nr]) - -#define darray_first(_d) ((_d).data[0]) -#define darray_last(_d) ((_d).data[(_d).nr - 1]) - -#define darray_insert_item(_d, pos, _item) \ -({ \ - size_t _pos = (pos); \ - int _ret = darray_make_room((_d), 1); \ - \ - if (!_ret) \ - array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ - _ret; \ -}) - -#define darray_remove_item(_d, _pos) \ - array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) - -#define darray_find_p(_d, _i, cond) \ -({ \ - typeof((_d).data) _ret = NULL; \ - \ - darray_for_each(_d, _i) \ - if (cond) { \ - _ret = _i; \ - break; \ - } \ - _ret; \ -}) - -#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) - -/* Iteration: */ - -#define __darray_for_each(_d, _i) \ - for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) - -#define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) - -/* Init/exit */ - -#define darray_init(_d) \ -do { \ - (_d)->nr = 0; \ - (_d)->size = ARRAY_SIZE((_d)->preallocated); \ - (_d)->data = (_d)->size ? 
(_d)->preallocated : NULL; \ -} while (0) - -#define darray_exit(_d) \ -do { \ - if (!ARRAY_SIZE((_d)->preallocated) || \ - (_d)->data != (_d)->preallocated) \ - kvfree((_d)->data); \ - darray_init(_d); \ -} while (0) - -#define DEFINE_DARRAY_CLASS(_type) \ -DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) - -#define DEFINE_DARRAY(_type) \ -typedef DARRAY(_type) darray_##_type; \ -DEFINE_DARRAY_CLASS(darray_##_type) - -#define DEFINE_DARRAY_NAMED(_name, _type) \ -typedef DARRAY(_type) _name; \ -DEFINE_DARRAY_CLASS(_name) - -DEFINE_DARRAY_CLASS(darray_char); -DEFINE_DARRAY_CLASS(darray_str) -DEFINE_DARRAY_CLASS(darray_const_str) - -DEFINE_DARRAY_CLASS(darray_u8) -DEFINE_DARRAY_CLASS(darray_u16) -DEFINE_DARRAY_CLASS(darray_u32) -DEFINE_DARRAY_CLASS(darray_u64) - -DEFINE_DARRAY_CLASS(darray_s8) -DEFINE_DARRAY_CLASS(darray_s16) -DEFINE_DARRAY_CLASS(darray_s32) -DEFINE_DARRAY_CLASS(darray_s64) - -#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c deleted file mode 100644 index e848e210a9bf76..00000000000000 --- a/fs/bcachefs/data_update.c +++ /dev/null @@ -1,1021 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "extents.h" -#include "io_write.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "snapshot.h" -#include "subvolume.h" -#include "trace.h" - -#include - -static const char * const bch2_data_update_type_strs[] = { -#define x(t, n, ...) [n] = #t, - BCH_DATA_UPDATE_TYPES() -#undef x - NULL -}; - -static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); -} - -static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return false; - } - } - return true; -} - -static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } -} - -static noinline_for_stack -bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, - const struct bch_extent_ptr *start) -{ - if (!ctxt) { - bkey_for_each_ptr(ptrs, ptr) { - if (ptr == start) - break; - - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - - __bkey_for_each_ptr(start, ptrs.end, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bool locked; - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } - return true; -} - -static bool bkey_nocow_lock(struct bch_fs *c, struct 
moving_context *ctxt, struct bkey_ptrs_c ptrs) -{ - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) - return __bkey_nocow_lock(c, ctxt, ptrs, ptr); - } - - return true; -} - -noinline_for_stack -static void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) -{ - struct bch_fs *c = u->op.c; - struct printbuf buf = PRINTBUF; - - prt_newline(&buf); - - bch2_data_update_to_text(&buf, u); - prt_newline(&buf); - - prt_str_indented(&buf, "new replicas:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - prt_newline(&buf); - - prt_str_indented(&buf, "insert:\t"); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_fail2(struct data_update *m, - struct bkey_s_c new, - struct bkey_s_c wrote, - struct bkey_i *insert, - const char *msg) -{ - struct bch_fs *c = m->op.c; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct printbuf buf = PRINTBUF; - unsigned rewrites_found = 0; - - if (!trace_io_move_fail_enabled()) - return; - - prt_str(&buf, msg); - - if (insert) { - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) - rewrites_found |= ptr_bit; - ptr_bit <<= 1; - } - } - - prt_str(&buf, "rewrites found:\t"); - bch2_prt_u64_base2(&buf, rewrites_found); - prt_newline(&buf); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, new); - - prt_str(&buf, "\nwrote: "); - bch2_bkey_val_to_text(&buf, c, wrote); - - if (insert) { - prt_str(&buf, "\ninsert: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - } - - trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_data_update2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - printbuf_exit(&buf); - - this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); -} - -noinline_for_stack -static int data_update_invalid_bkey(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct 
bch_fs *c = m->op.c; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_newline(&buf); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - return bch_err_throw(c, invalid_bkey); -} - -static int __bch2_data_update_index_update(struct btree_trans *trans, - struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_iter iter; - struct data_update *m = container_of(op, struct data_update, op); - int ret = 0; - - bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (1) { - struct bkey_s_c k; - struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert = NULL; - struct bkey_i_extent *new; - const union bch_extent_entry *entry_c; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_extent_ptr *ptr; - const struct bch_extent_ptr *ptr_c; - struct bpos next_pos; - bool should_check_enospc; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, ptr_bit; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); - - if (!bch2_extents_match(k, old)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); - goto nowork; - } - - insert = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + - bkey_val_bytes(&new->k) + - sizeof(struct bch_extent_rebalance)); - ret = PTR_ERR_OR_ZERO(insert); - if (ret) - goto err; - - bkey_reassemble(insert, k); - - new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); - bch2_cut_front(iter.pos, &new->k_i); - - bch2_cut_front(iter.pos, insert); - bch2_cut_back(new->k.p, insert); - bch2_cut_back(insert->k.p, &new->k_i); - - /* - * @old: extent that we read from - * @insert: key that we're going to update, initialized from - * extent currently in btree - same as @old unless we raced with - * other updates - * @new: extent with new pointers that we'll be adding to @insert - * - * First, drop rewrite_ptrs from @new: - */ - ptr_bit = 1; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if ((ptr_bit & m->data_opts.rewrite_ptrs) && - (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && - !ptr->cached) { - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), ptr); - rewrites_found |= ptr_bit; - } - ptr_bit <<= 1; - } - - if (m->data_opts.rewrite_ptrs && - !rewrites_found && - bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); - goto nowork; - } - - /* - * A replica that we just wrote might conflict with a replica - * that we want to keep, due to racing with another move: - */ -restart_drop_conflicting_replicas: - extent_for_each_ptr(extent_i_to_s(new), ptr) - if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && - !ptr_c->cached) { - 
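			/*
			 * @insert already carries a live, non-cached copy on
			 * this device: keep that one and drop the replica we
			 * just wrote, then restart the scan, since dropping a
			 * pointer invalidates the iteration.
			 */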
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); - goto restart_drop_conflicting_replicas; - } - - if (!bkey_val_u64s(&new->k)) { - trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); - goto nowork; - } - - /* Now, drop pointers that conflict with what we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); - - durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + - bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); - - /* Now, drop excess replicas: */ - scoped_guard(rcu) { -restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; - - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; - } - } - } - - /* Finally, add the pointers we just wrote: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) - bch2_extent_ptr_decoded_append(insert, &p); - - bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); - - ret = bch2_sum_sector_overwrites(trans, &iter, insert, - &should_check_enospc, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - goto err; - - if (disk_sectors_delta > (s64) op->res.sectors) { - ret = bch2_disk_reservation_add(c, &op->res, - disk_sectors_delta - op->res.sectors, - !should_check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto out; - } - - next_pos = insert->k.p; - - /* - * Check for nonce offset inconsistency: - * This is debug code - we've been seeing this bug rarely, and - * it's been hard to reproduce, so this should give us some more - * information when it does occur: - */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), - (struct bkey_validate_context) { - .btree = m->btree_id, - .flags = BCH_VALIDATE_commit, - }); - if (unlikely(invalid)) { - ret = data_update_invalid_bkey(m, old, k, insert); - goto out; - } - - ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: - bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, bkey_start_pos(&insert->k)) ?: - bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: - bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - - if (trace_data_update_enabled()) - trace_data_update2(m, old, k, insert); - - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - - ret = bch2_trans_commit(trans, &op->res, - NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - m->data_opts.btree_insert_flags); - if (ret) - goto err; - - bch2_btree_iter_set_pos(trans, &iter, next_pos); - - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; -next: - while (bkey_ge(iter.pos, 
bch2_keylist_front(&op->insert_keys)->k.p)) { - bch2_keylist_pop_front(&op->insert_keys); - if (bch2_keylist_empty(&op->insert_keys)) - goto out; - } - continue; -nowork: - if (m->stats) { - BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->stats->keys_raced); - atomic64_add(k.k->p.offset - iter.pos.offset, - &m->stats->sectors_raced); - } - - count_event(c, io_move_fail); - - bch2_btree_iter_advance(trans, &iter); - goto next; - } -out: - bch2_trans_iter_exit(trans, &iter); - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - return ret; -} - -int bch2_data_update_index_update(struct bch_write_op *op) -{ - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); -} - -void bch2_data_update_read_done(struct data_update *m) -{ - m->read_done = true; - - /* write bio must own pages: */ - BUG_ON(!m->op.wbio.bio.bi_vcnt); - - m->op.crc = m->rbio.pick.crc; - m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - - this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); - - closure_call(&m->op.cl, bch2_write, NULL, NULL); -} - -void bch2_data_update_exit(struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_s_c k = bkey_i_to_s_c(update->k.k); - - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); - kfree(update->bvecs); - update->bvecs = NULL; - - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); - bkey_put_dev_refs(c, k); - bch2_disk_reservation_put(c, &update->op.res); - bch2_bkey_buf_exit(&update->k, c); -} - -static noinline_for_stack -int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) -{ - struct bch_fs *c = update->op.c; - struct bkey_i_extent *e; - struct write_point *wp; - struct closure cl; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - closure_init_stack(&cl); - bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - - while (bpos_lt(update->op.pos, update->k.k->k.p)) { - unsigned sectors = update->k.k->k.p.offset - - update->op.pos.offset; - - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_slots); - ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(trans, &iter); - bkey_err(k); - })); - bch2_trans_iter_exit(trans, &iter); - - if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) - break; - - e = bkey_extent_init(update->op.insert_keys.top); - e->k.p = update->op.pos; - - ret = bch2_alloc_sectors_start_trans(trans, - update->op.target, - false, - update->op.write_point, - &update->op.devs_have, - update->op.nr_replicas, - update->op.nr_replicas, - update->op.watermark, - 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { - bch2_trans_unlock(trans); - closure_sync(&cl); - continue; - } - - bch_err_fn_ratelimited(c, ret); - - if (ret) - break; - - sectors = min(sectors, wp->sectors_free); - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &update->op.open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - update->op.pos.offset += sectors; - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - bch2_keylist_push(&update->op.insert_keys); - - ret = __bch2_data_update_index_update(trans, &update->op); - - bch2_open_buckets_put(c, &update->op.open_buckets); - - if (ret) - break; - } - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -void 
bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_str_indented(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str_indented(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str_indented(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str_indented(out, "compression:\t"); - bch2_compression_opt_to_text(out, io_opts->background_compression); - prt_newline(out); - - prt_str_indented(out, "opts.replicas:\t"); - prt_u64(out, io_opts->data_replicas); - prt_newline(out); - - prt_str_indented(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); - prt_newline(out); - - prt_str_indented(out, "scrub:\t"); - prt_u64(out, data_opts->scrub); -} - -void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) -{ - prt_str(out, bch2_data_update_type_strs[m->type]); - prt_newline(out); - - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_newline(out); - - prt_str_indented(out, "old key:\t"); - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); -} - -void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) -{ - bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - - if (!m->read_done) { - prt_printf(out, "read:\n"); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, &m->rbio); - } else { - prt_printf(out, "write:\n"); - printbuf_indent_add(out, 2); - bch2_write_op_to_text(out, &m->op); - } - printbuf_indent_sub(out, 4); -} - -int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - n = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - while (data_opts->kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts->kill_ptrs); - - bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts->kill_ptrs ^= 1U << drop; - } - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_all_snapshots iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) - n->k.size = 0; - - return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts, - unsigned buf_bytes) -{ - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); - - m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); - if (!m->bvecs) - return -ENOMEM; - - bio_init(&m->rbio.bio, NULL, m->bvecs, 
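- /* - * Note: the read bio and the write bio share one bvec array - the - * read fills the pages, and the write bio, which owns them (see the - * BUG_ON() in bch2_data_update_read_done()), writes them back out. - */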
nr_vecs, REQ_OP_READ); - bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); - - if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { - kfree(m->bvecs); - m->bvecs = NULL; - return -ENOMEM; - } - - rbio_init(&m->rbio.bio, c, *io_opts, NULL); - m->rbio.data_update = true; - m->rbio.bio.bi_iter.bi_size = buf_bytes; - m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); - m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - return 0; -} - -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - - return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); -} - -static int can_write_extent(struct bch_fs *c, struct data_update *m) -{ - if ((m->op.flags & BCH_WRITE_alloc_nowait) && - unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return bch_err_throw(c, data_update_done_would_block); - - unsigned target = m->op.flags & BCH_WRITE_only_specified_devs - ? m->op.target - : 0; - struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); - - darray_for_each(m->op.devs_have, i) - __clear_bit(*i, devs.d); - - guard(rcu)(); - - unsigned nr_replicas = 0, i; - for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); - if (!ca) - continue; - - struct bch_dev_usage usage; - bch2_dev_usage_read_fast(ca, &usage); - - if (!dev_buckets_free(ca, usage, m->op.watermark)) - continue; - - nr_replicas += ca->mi.durability; - if (nr_replicas >= m->op.nr_replicas) - break; - } - - if (!nr_replicas) - return bch_err_throw(c, data_update_done_no_rw_devs); - if (nr_replicas < m->op.nr_replicas) - return bch_err_throw(c, insufficient_devices); - return 0; -} - -int bch2_data_update_init(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct data_update *m, - struct write_point_specifier wp, - struct bch_io_opts *io_opts, - struct data_update_opts data_opts, - enum btree_id btree_id, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - if (k.k->p.snapshot) { - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { - /* Can't repair yet, waiting on other recovery passes */ - return bch_err_throw(c, data_update_done_no_snapshot); - } - if (ret < 0) - return ret; - if (ret) /* key was deleted */ - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, data_update_done_no_snapshot); - ret = 0; - } - - bch2_bkey_buf_init(&m->k); - bch2_bkey_buf_reassemble(&m->k, c, k); - m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc - ? BCH_DATA_UPDATE_copygc - : BCH_DATA_UPDATE_rebalance; - m->btree_id = btree_id; - m->data_opts = data_opts; - m->ctxt = ctxt; - m->stats = ctxt ? 
ctxt->stats : NULL; - - bch2_write_op_init(&m->op, c, *io_opts); - m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->bversion; - m->op.target = data_opts.target; - m->op.write_point = wp; - m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_pages_stable| - BCH_WRITE_pages_owned| - BCH_WRITE_data_encoded| - BCH_WRITE_move| - m->data_opts.write_flags; - m->op.compression_opt = io_opts->background_compression; - m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - - unsigned durability_have = 0, durability_removing = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned buf_bytes = 0; - bool unwritten = false; - - unsigned ptr_bit = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached) { - guard(rcu)(); - if (ptr_bit & m->data_opts.rewrite_ptrs) { - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); - } - } - - /* - * op->csum_type is normally initialized from the fs/file's - * current options - but if an extent is encrypted, we require - * that it stays encrypted: - */ - if (bch2_csum_type_is_encryption(p.crc.csum_type)) { - m->op.nonce = p.crc.nonce + p.crc.offset; - m->op.csum_type = p.crc.csum_type; - } - - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - m->op.incompressible = true; - - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unwritten |= p.ptr.unwritten; - - ptr_bit <<= 1; - } - - unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); - - /* - * If current extent durability is less than io_opts.data_replicas, - * we're not trying to rereplicate the extent up to data_replicas here - - * unless extra_replicas was specified - * - * Increasing replication is an explicit operation triggered by - * rereplicate, currently, so that users don't get an unexpected -ENOSPC - */ - m->op.nr_replicas = min(durability_removing, durability_required) + - m->data_opts.extra_replicas; - - /* - * If device(s) were set to durability=0 after data was written to them - * we can end up with a durability=0 extent, and the normal algorithm - * that tries not to increase durability doesn't work: - */ - if (!(durability_have + durability_removing)) - m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - - m->op.nr_replicas_required = m->op.nr_replicas; - - /* - * It might turn out that we don't need any new replicas, if the - * replicas or durability settings have been changed since the extent - * was written: - */ - if (!m->op.nr_replicas) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); - if (!ret) - ret = bch_err_throw(c, data_update_done_no_writes_needed); - goto out_bkey_buf_exit; - } - - /* - * Check if the allocation will succeed, to avoid getting an error later - * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless - * read: - * - * This guards against - * - BCH_WRITE_alloc_nowait allocations failing (promotes) - * - 
Destination target full - * - Device(s) in destination target offline - * - Insufficient durability available in destination target - * (i.e. trying to move a durability=2 replica to a target with a - * single durability=2 device) - */ - ret = can_write_extent(c, m); - if (ret) - goto out_bkey_buf_exit; - - if (reserve_sectors) { - ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, - m->data_opts.extra_replicas - ? 0 - : BCH_DISK_RESERVATION_NOFAIL); - if (ret) - goto out_bkey_buf_exit; - } - - if (!bkey_get_dev_refs(c, k)) { - ret = bch_err_throw(c, data_update_done_no_dev_refs); - goto out_put_disk_res; - } - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, ptrs)) { - ret = bch_err_throw(c, nocow_lock_blocked); - goto out_put_dev_refs; - } - - if (unwritten) { - ret = bch2_update_unwritten_extent(trans, m) ?: - bch_err_throw(c, data_update_done_unwritten); - goto out_nocow_unlock; - } - - bch2_trans_unlock(trans); - - ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); - if (ret) - goto out_nocow_unlock; - - return 0; -out_nocow_unlock: - if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); -out_put_dev_refs: - bkey_put_dev_refs(c, k); -out_put_disk_res: - bch2_disk_reservation_put(c, &m->op.res); -out_bkey_buf_exit: - bch2_bkey_buf_exit(&m->k, c); - return ret; -} - -void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { - opts->kill_ptrs |= ptr_bit; - opts->rewrite_ptrs ^= ptr_bit; - } - - ptr_bit <<= 1; - } -} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h deleted file mode 100644 index 5e14d13568de8f..00000000000000 --- a/fs/bcachefs/data_update.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _BCACHEFS_DATA_UPDATE_H -#define _BCACHEFS_DATA_UPDATE_H - -#include "bkey_buf.h" -#include "io_read.h" -#include "io_write_types.h" - -struct moving_context; - -struct data_update_opts { - unsigned rewrite_ptrs; - unsigned kill_ptrs; - u16 target; - u8 extra_replicas; - unsigned btree_insert_flags; - unsigned write_flags; - - int read_dev; - bool scrub; -}; - -void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); - -#define BCH_DATA_UPDATE_TYPES() \ - x(copygc, 0) \ - x(rebalance, 1) \ - x(promote, 2) - -enum bch_data_update_types { -#define x(n, id) BCH_DATA_UPDATE_##n = id, - BCH_DATA_UPDATE_TYPES() -#undef x -}; - -struct data_update { - enum bch_data_update_types type; - /* extent being updated: */ - bool read_done; - enum btree_id btree_id; - struct bkey_buf k; - struct data_update_opts data_opts; - struct moving_context *ctxt; - struct bch_move_stats *stats; - - struct bch_read_bio rbio; - struct bch_write_op op; - struct bio_vec *bvecs; -}; - -struct promote_op { - struct rcu_head rcu; - u64 start_time; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - -void bch2_data_update_to_text(struct printbuf *, struct data_update *); -void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); - -int bch2_data_update_index_update(struct bch_write_op *); - -void bch2_data_update_read_done(struct data_update *); - -int 
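- /* - * Drops the pointers selected by data_opts->kill_ptrs, normalizes - * the resulting extent and commits; see the implementation in - * data_update.c above. - */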
bch2_extent_drop_ptrs(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, - struct bch_io_opts *); - -void bch2_data_update_exit(struct data_update *); -int bch2_data_update_init(struct btree_trans *, struct btree_iter *, - struct moving_context *, - struct data_update *, - struct write_point_specifier, - struct bch_io_opts *, struct data_update_opts, - enum btree_id, struct bkey_s_c); -void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); - -#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c deleted file mode 100644 index 07c2a0f73cc204..00000000000000 --- a/fs/bcachefs/debug.c +++ /dev/null @@ -1,996 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Assorted bcachefs debug code - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_locking.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "data_update.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "fsck.h" -#include "inode.h" -#include "journal_reclaim.h" -#include "super.h" - -#include -#include -#include -#include -#include - -static struct dentry *bch_debug; - -static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, - struct extent_ptr_decoded pick) -{ - struct btree *v = c->verify_data; - struct btree_node *n_ondisk = c->verify_ondisk; - struct btree_node *n_sorted = c->verify_data->data; - struct bset *sorted, *inmemory = &b->data->keys; - struct bio *bio; - bool failed = false; - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_verify_replicas); - if (!ca) - return false; - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); - - submit_bio_wait(bio); - - bio_put(bio); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_verify_replicas); - - memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); - - v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) - return false; - - n_sorted = c->verify_data->data; - sorted = &n_sorted->keys; - - if (inmemory->u64s != sorted->u64s || - memcmp(inmemory->start, - sorted->start, - vstruct_end(inmemory) - (void *) inmemory->start)) { - unsigned offset = 0, sectors; - struct bset *i; - unsigned j; - - console_lock(); - - printk(KERN_ERR "*** in memory:\n"); - bch2_dump_bset(c, b, inmemory, 0); - - printk(KERN_ERR "*** read back in:\n"); - bch2_dump_bset(c, v, sorted, 0); - - while (offset < v->written) { - if (!offset) { - i = &n_ondisk->keys; - sectors = vstruct_blocks(n_ondisk, c->block_bits) << - c->block_bits; - } else { - struct btree_node_entry *bne = - (void *) n_ondisk + (offset << 9); - i = &bne->keys; - - sectors = vstruct_blocks(bne, c->block_bits) << - c->block_bits; - } - - printk(KERN_ERR "*** on disk block %u:\n", offset); - bch2_dump_bset(c, b, i, offset); - - offset += sectors; - } - - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) - if (inmemory->_data[j] != sorted->_data[j]) - break; - - console_unlock(); - 
bch_err(c, "verify failed at key %u", j); - - failed = true; - } - - if (v->written != b->written) { - bch_err(c, "written wrong: expected %u, got %u", - b->written, v->written); - failed = true; - } - - return failed; -} - -void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - struct btree *v; - struct bset *inmemory = &b->data->keys; - struct bkey_packed *k; - bool failed = false; - - if (c->opts.nochanges) - return; - - bch2_btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - if (!c->verify_ondisk) { - c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!c->verify_ondisk) - goto out; - } - - if (!c->verify_data) { - c->verify_data = __bch2_btree_node_mem_alloc(c); - if (!c->verify_data) - goto out; - } - - BUG_ON(b->nsets != 1); - - for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) - ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; - - v = c->verify_data; - bkey_copy(&v->key, &b->key); - v->c.level = b->c.level; - v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v); - - ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); - bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) - failed |= bch2_btree_verify_replica(c, b, p); - - if (failed) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); - printbuf_exit(&buf); - } -out: - mutex_unlock(&c->verify_lock); - bch2_btree_node_io_unlock(b); -} - -void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, - const struct btree *b) -{ - struct btree_node *n_ondisk = NULL; - struct extent_ptr_decoded pick; - struct bch_dev *ca; - struct bio *bio = NULL; - unsigned offset = 0; - int ret; - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { - prt_printf(out, "error getting device to read from: invalid device\n"); - return; - } - - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_btree_node_ondisk_to_text); - if (!ca) { - prt_printf(out, "error getting device to read from: not online\n"); - return; - } - - n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); - if (!n_ondisk) { - prt_printf(out, "memory allocation failure\n"); - goto out; - } - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_buf_bytes(b)), - REQ_OP_READ|REQ_META, - GFP_NOFS, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); - - ret = submit_bio_wait(bio); - if (ret) { - prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); - goto out; - } - - while (offset < btree_sectors(c)) { - struct bset *i; - struct nonce nonce; - struct bch_csum csum; - struct bkey_packed *k; - unsigned sectors; - - if (!offset) { - i = &n_ondisk->keys; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); - - if (bch2_crc_cmp(csum, n_ondisk->csum)) { - prt_printf(out, "invalid checksum\n"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(n_ondisk, c->block_bits); - } else { - struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); - - i = &bne->keys; - - 
if (i->seq != n_ondisk->keys.seq) - break; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { - prt_printf(out, "unknown checksum type at offset %u: %llu\n", - offset, BSET_CSUM_TYPE(i)); - goto out; - } - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - if (bch2_crc_cmp(csum, bne->csum)) { - prt_printf(out, "invalid checksum"); - goto out; - } - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(bne, c->block_bits); - } - - prt_printf(out, " offset %u version %u, journal seq %llu\n", - offset, - le16_to_cpu(i->version), - le64_to_cpu(i->journal_seq)); - offset += sectors; - - printbuf_indent_add(out, 4); - - for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { - struct bkey u; - - bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); - prt_newline(out); - } - - printbuf_indent_sub(out, 4); - } -out: - if (bio) - bio_put(bio); - kvfree(n_ondisk); - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_btree_node_ondisk_to_text); -} - -#ifdef CONFIG_DEBUG_FS - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) -{ - if (i->buf.pos) { - size_t bytes = min_t(size_t, i->buf.pos, i->size); - int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - - i->ret += copied; - i->ubuf += copied; - i->size -= copied; - i->buf.pos -= copied; - memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - - if (i->buf.last_newline >= copied) - i->buf.last_newline -= copied; - if (i->buf.last_field >= copied) - i->buf.last_field -= copied; - - if (copied != bytes) - return -EFAULT; - } - - return i->size ? 0 : i->ret; -} - -static int bch2_dump_open(struct inode *inode, struct file *file) -{ - struct btree_debug *bd = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - file->private_data = i; - i->from = POS_MIN; - i->iter = 0; - i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); - i->id = bd->id; - i->buf = PRINTBUF; - - return 0; -} - -int bch2_dump_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - return 0; -} - -static ssize_t bch2_read_btree(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations btree_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree, -}; - -static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - ssize_t ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - if (bpos_eq(SPOS_MAX, i->from)) - return i->ret; - - return bch2_trans_run(i->c, - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? 
bpos_successor(b->key.k.p) - : b->key.k.p; - - drop_locks_do(trans, bch2_debugfs_flush_buf(i)); - }))) ?: i->ret; -} - -static const struct file_operations btree_format_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_btree_formats, -}; - -static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - return bch2_debugfs_flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - struct btree_path_level *l = - &btree_iter_path(trans, &iter)->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - bch2_debugfs_flush_buf(i); - }))) ?: - i->ret; -} - -static const struct file_operations bfloat_failed_debug_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_read_bfloat_failed, -}; - -static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "%px ", b); - bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); - prt_printf(out, "\n"); - - printbuf_indent_add(out, 2); - - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_btree_node_flags, b->flags); - prt_newline(out); - - prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); - prt_printf(out, "written:\t%u\n", b->written); - prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); - prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[0].journal, b->writes[0].journal.seq); - prt_printf(out, "journal pin %px:\t%llu\n", - &b->writes[1].journal, b->writes[1].journal.seq); - - prt_printf(out, "ob:\t%u\n", b->ob.nr); - - printbuf_indent_sub(out, 2); -} - -static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - do { - ret = bch2_debugfs_flush_buf(i); - if (ret) - return ret; - - i->buf.atomic++; - scoped_guard(rcu) { - struct bucket_table *tbl = - rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - struct rhash_head *pos; - struct btree *b; - - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; - } - } - --i->buf.atomic; - } while (!done); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations cached_btree_nodes_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_cached_btree_nodes_read, -}; - -typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); - -static void 
list_sort(struct list_head *head, list_cmp_fn cmp) -{ - struct list_head *pos; - - list_for_each(pos, head) - while (!list_is_last(pos, head) && - cmp(pos, pos->next) > 0) { - struct list_head *pos2, *next = pos->next; - - list_del(next); - list_for_each(pos2, head) - if (cmp(next, pos2) < 0) - goto pos_found; - BUG(); -pos_found: - list_add_tail(next, pos2); - } -} - -static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) -{ - return cmp_int(l, r); -} - -static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - struct btree_trans *trans; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= i->iter) - continue; - - i->iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - if (!trans->srcu_held) { - closure_put(&trans->ref); - continue; - } - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bch2_btree_trans_to_text(&i->buf, trans); - - prt_printf(&i->buf, "backtrace:\n"); - printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - - closure_put(&trans->ref); - - ret = bch2_debugfs_flush_buf(i); - if (ret) - goto unlocked; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -unlocked: - srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static const struct file_operations btree_transactions_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_transactions_read, -}; - -static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - bool done = false; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (done) - break; - - done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations journal_pins_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_journal_pins_read, -}; - -static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - bch2_btree_updates_to_text(&i->buf, c); - i->iter++; - } - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_updates_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_updates_read, -}; - -static int btree_transaction_stats_open(struct inode *inode, 
struct file *file) -{ - struct bch_fs *c = inode->i_private; - struct dump_iter *i; - - i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) - return -ENOMEM; - - i->iter = 1; - i->c = c; - i->buf = PRINTBUF; - file->private_data = i; - - return 0; -} - -static int btree_transaction_stats_release(struct inode *inode, struct file *file) -{ - struct dump_iter *i = file->private_data; - - printbuf_exit(&i->buf); - kfree(i); - - return 0; -} - -static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - int err; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - while (1) { - struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - - err = bch2_debugfs_flush_buf(i); - if (err) - return err; - - if (!i->size) - break; - - if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || - !bch2_btree_transaction_fns[i->iter]) - break; - - prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); - printbuf_indent_add(&i->buf, 2); - - mutex_lock(&s->lock); - - prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); -#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - printbuf_indent_add(&i->buf, 2); - bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); - printbuf_indent_sub(&i->buf, 2); -#endif - - prt_printf(&i->buf, "Transaction duration:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->duration); - printbuf_indent_sub(&i->buf, 2); - - if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:\n"); - - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); - printbuf_indent_sub(&i->buf, 2); - } - - if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); - - printbuf_indent_add(&i->buf, 2); - prt_str_indented(&i->buf, s->max_paths_text); - printbuf_indent_sub(&i->buf, 2); - } - - mutex_unlock(&s->lock); - - printbuf_indent_sub(&i->buf, 2); - prt_newline(&i->buf); - i->iter++; - } - - if (i->buf.allocation_failure) - return -ENOMEM; - - return i->ret; -} - -static const struct file_operations btree_transaction_stats_op = { - .owner = THIS_MODULE, - .open = btree_transaction_stats_open, - .release = btree_transaction_stats_release, - .read = btree_transaction_stats_read, -}; - -/* walk btree transactions until we find a deadlock and print it */ -static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct btree_trans *trans; - ulong iter = 0; -restart: - seqmutex_lock(&c->btree_trans_lock); - list_sort(&c->btree_trans_list, list_ptr_order_cmp); - - list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans <= iter) - continue; - - iter = (ulong) trans; - - if (!closure_get_not_zero(&trans->ref)) - continue; - - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - - bool found = bch2_check_for_deadlock(trans, out) != 0; - - closure_put(&trans->ref); - - if (found) - return; - - if (!seqmutex_relock(&c->btree_trans_lock, seq)) - goto restart; - } - seqmutex_unlock(&c->btree_trans_lock); -} - -typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); - -static ssize_t bch2_simple_print(struct file *file, char __user *buf, - size_t size, loff_t *ppos, - fs_to_text_fn fn) -{ - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (!i->iter) { - 
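- /* the text is generated once, on the first read; later reads just drain i->buf */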
fn(&i->buf, c); - i->iter++; - } - - if (i->buf.allocation_failure) - ret = -ENOMEM; - - if (!ret) - ret = bch2_debugfs_flush_buf(i); - - return ret ?: i->ret; -} - -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); -} - -static const struct file_operations btree_deadlock_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_btree_deadlock_read, -}; - -static ssize_t bch2_write_points_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) -{ - return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); -} - -static const struct file_operations write_points_ops = { - .owner = THIS_MODULE, - .open = bch2_dump_open, - .release = bch2_dump_release, - .read = bch2_write_points_read, -}; - -void bch2_fs_debug_exit(struct bch_fs *c) -{ - if (!IS_ERR_OR_NULL(c->fs_debug_dir)) - debugfs_remove_recursive(c->fs_debug_dir); -} - -static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) -{ - struct dentry *d; - - d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); - - debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); - - debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); - - debugfs_create_file("bfloat-failed", 0400, d, bd, - &bfloat_failed_debug_ops); -} - -void bch2_fs_debug_init(struct bch_fs *c) -{ - struct btree_debug *bd; - char name[100]; - - if (IS_ERR_OR_NULL(bch_debug)) - return; - - if (c->sb.multi_device) - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); - else - strscpy(name, c->name, sizeof(name)); - - c->fs_debug_dir = debugfs_create_dir(name, bch_debug); - if (IS_ERR_OR_NULL(c->fs_debug_dir)) - return; - - debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, - c->btree_debug, &cached_btree_nodes_ops); - - debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, - c->btree_debug, &btree_transactions_ops); - - debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, - c->btree_debug, &journal_pins_ops); - - debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, - c->btree_debug, &btree_updates_ops); - - debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &btree_transaction_stats_op); - - debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, - c->btree_debug, &btree_deadlock_ops); - - debugfs_create_file("write_points", 0400, c->fs_debug_dir, - c->btree_debug, &write_points_ops); - - bch2_fs_async_obj_debugfs_init(c); - - c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); - if (IS_ERR_OR_NULL(c->btree_debug_dir)) - return; - - for (bd = c->btree_debug; - bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); - bd++) { - bd->id = bd - c->btree_debug; - bch2_fs_debug_btree_init(c, bd); - } -} - -#endif - -void bch2_debug_exit(void) -{ - if (!IS_ERR_OR_NULL(bch_debug)) - debugfs_remove_recursive(bch_debug); -} - -int __init bch2_debug_init(void) -{ - bch_debug = debugfs_create_dir("bcachefs", NULL); - return 0; -} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h deleted file mode 100644 index d88b1194b8acc0..00000000000000 --- a/fs/bcachefs/debug.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DEBUG_H -#define _BCACHEFS_DEBUG_H - -#include "bcachefs.h" - -struct bio; -struct btree; -struct bch_fs; - -void __bch2_btree_verify(struct bch_fs *, struct btree *); -void 
bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, - const struct btree *); - -static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -{ - if (static_branch_unlikely(&bch2_verify_btree_ondisk)) - __bch2_btree_verify(c, b); -} - -#ifdef CONFIG_DEBUG_FS -struct dump_iter { - struct bch_fs *c; - struct async_obj_list *list; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -ssize_t bch2_debugfs_flush_buf(struct dump_iter *); -int bch2_dump_release(struct inode *, struct file *); - -void bch2_fs_debug_exit(struct bch_fs *); -void bch2_fs_debug_init(struct bch_fs *); -#else -static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -static inline void bch2_fs_debug_init(struct bch_fs *c) {} -#endif - -void bch2_debug_exit(void); -int bch2_debug_init(void); - -#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c deleted file mode 100644 index 28875c5c86add7..00000000000000 --- a/fs/bcachefs/dirent.c +++ /dev/null @@ -1,766 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "dirent.h" -#include "fs.h" -#include "keylist.h" -#include "str_hash.h" -#include "subvolume.h" - -#include - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - *out_cf = (struct qstr) QSTR_INIT(NULL, 0); - - if (!bch2_fs_casefold_enabled(trans->c)) - return -EOPNOTSUPP; - - unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); - int ret = PTR_ERR_OR_ZERO(buf); - if (ret) - return ret; - - ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); - if (ret <= 0) - return ret; - - *out_cf = (struct qstr) QSTR_INIT(buf, ret); - return 0; -} -#endif - -static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -{ - if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) - return 0; - - unsigned bkey_u64s = bkey_val_u64s(d.k); - unsigned bkey_bytes = bkey_u64s * sizeof(u64); - u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; -#if CPU_BIG_ENDIAN - unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; -#else - unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8; -#endif - - return bkey_bytes - - (d.v->d_casefold - ? offsetof(struct bch_dirent, d_cf_name_block.d_names) - : offsetof(struct bch_dirent, d_name)) - - trailing_nuls; -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); - } else { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); - } -} - -static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) -{ - if (d.v->d_casefold) { - unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); - unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); - return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); - } else { - return (struct qstr) QSTR_INIT(NULL, 0); - } -} - -static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) -{ - return d.v->d_casefold - ? 
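- /* - * Hashing and comparison use the casefolded copy when a dirent is - * casefolded; readdir and to-text still show the original name. - */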
bch2_dirent_get_casefold_name(d) - : bch2_dirent_get_name(d); -} - -static u64 bch2_dirent_hash(const struct bch_hash_info *info, - const struct qstr *name) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, name->name, name->len); - - /* [0,2) reserved for dots */ - return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -} - -static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_dirent_hash(info, key); -} - -static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_lookup_name(d); - - return bch2_dirent_hash(info, &name); -} - -static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr *r_name = _r; - - return !qstr_eq(l_name, *r_name); -} - -static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_lookup_name(l); - const struct qstr r_name = bch2_dirent_get_lookup_name(r); - - return !qstr_eq(l_name, r_name); -} - -static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type == DT_SUBVOL) - return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; - return true; -} - -const struct bch_hash_desc bch2_dirent_hash_desc = { - .btree_id = BTREE_ID_dirents, - .key_type = KEY_TYPE_dirent, - .hash_key = dirent_hash_key, - .hash_bkey = dirent_hash_bkey, - .cmp_key = dirent_cmp_key, - .cmp_bkey = dirent_cmp_bkey, - .is_visible = dirent_is_visible, -}; - -int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - unsigned name_block_len = bch2_dirent_name_bytes(d); - struct qstr d_name = bch2_dirent_get_name(d); - struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); - int ret = 0; - - bkey_fsck_err_on(!d_name.len, - c, dirent_empty_name, - "empty name"); - - bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, - c, dirent_val_too_big, - "dirent names exceed bkey size (%d + %d > %d)", - d_name.len, d_cf_name.len, name_block_len); - - /* - * Check new keys don't exceed the max length - * (older keys may be larger.) 
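- * Only keys being committed are checked against the current - * BCH_NAME_MAX; keys already on disk are let through unchanged.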
- */ - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, - c, dirent_name_too_long, - "dirent name too big (%u > %u)", - d_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), - c, dirent_name_embedded_nul, - "dirent has stray data after name's NUL"); - - bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), - c, dirent_name_dot_or_dotdot, - "invalid name"); - - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), - c, dirent_name_has_slash, - "name with /"); - - bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, - c, dirent_to_itself, - "dirent points to own directory"); - - if (d.v->d_casefold) { - bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && - d_cf_name.len > BCH_NAME_MAX, - c, dirent_cf_name_too_big, - "dirent w/ cf name too big (%u > %u)", - d_cf_name.len, BCH_NAME_MAX); - - bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), - c, dirent_stray_data_after_cf_name, - "dirent has stray data after cf name's NUL"); - } -fsck_err: - return ret; -} - -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr d_name = bch2_dirent_get_name(d); - - prt_printf(out, "%.*s", d_name.len, d_name.name); - - if (d.v->d_casefold) { - struct qstr d_name = bch2_dirent_get_lookup_name(d); - prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); - } - - prt_str(out, " ->"); - - if (d.v->d_type != DT_SUBVOL) - prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); - else - prt_printf(out, " %u -> %u", - le32_to_cpu(d.v->d_parent_subvol), - le32_to_cpu(d.v->d_child_subvol)); - - prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); -} - -int bch2_dirent_init_name(struct bch_fs *c, - struct bkey_i_dirent *dirent, - const struct bch_hash_info *hash_info, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(hash_info->cf_encoding == NULL && cf_name); - int cf_len = 0; - - if (name->len > BCH_NAME_MAX) - return -ENAMETOOLONG; - - dirent->v.d_casefold = hash_info->cf_encoding != NULL; - - if (!dirent->v.d_casefold) { - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); - } else { - if (!bch2_fs_casefold_enabled(c)) - return -EOPNOTSUPP; - -#ifdef CONFIG_UNICODE - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - - char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; - - if (cf_name) { - cf_len = cf_name->len; - - memcpy(cf_out, cf_name->name, cf_name->len); - } else { - cf_len = utf8_casefold(hash_info->cf_encoding, name, - cf_out, - bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); - if (cf_len <= 0) - return cf_len; - } - - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - (name->len + cf_len)); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); -#endif - } - - unsigned u64s = dirent_val_u64s(name->len, cf_len); - BUG_ON(u64s > bkey_val_u64s(&dirent->k)); - set_bkey_val_u64s(&dirent->k, u64s); - return 0; -} - -struct bkey_i_dirent 
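- /* - * Allocates a maximum-size dirent key; bch2_dirent_init_name() - * then trims it to fit the actual (casefolded) name via - * set_bkey_val_u64s(). - */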
*bch2_dirent_create_key(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - subvol_inum dir, - u8 type, - const struct qstr *name, - const struct qstr *cf_name, - u64 dst) -{ - struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; - - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = BKEY_U64s_MAX; - - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); - } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } - - dirent->v.d_type = type; - dirent->v.d_unused = 0; - - int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); - if (ret) - return ERR_PTR(ret); - - EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; -} - -int bch2_dirent_create_snapshot(struct btree_trans *trans, - u32 dir_subvol, u64 dir, u32 snapshot, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - dirent->k.p.inode = dir; - dirent->k.p.snapshot = snapshot; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i_dirent *dirent; - int ret; - - dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); - ret = PTR_ERR_OR_ZERO(dirent); - if (ret) - return ret; - - ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); - *dir_offset = dirent->k.p.offset; - - return ret; -} - -int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, - struct bkey_s_c_dirent d, subvol_inum *target) -{ - struct bch_subvolume s; - int ret = 0; - - if (d.v->d_type == DT_SUBVOL && - le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) - return 1; - - if (likely(d.v->d_type != DT_SUBVOL)) { - target->subvol = dir.subvol; - target->inum = le64_to_cpu(d.v->d_inum); - } else { - target->subvol = le32_to_cpu(d.v->d_child_subvol); - - ret = bch2_subvolume_get(trans, target->subvol, true, &s); - - target->inum = le64_to_cpu(s.inode); - } - - return ret; -} - -int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, - const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) -{ - struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = {}; - struct btree_iter dst_iter = {}; - struct bkey_s_c old_src, old_dst = bkey_s_c_null; - struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = - POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_update_flags = 0; - bool delete_src, delete_dst; - int ret = 0; - - memset(src_inum, 0, sizeof(*src_inum)); - memset(dst_inum, 0, 
sizeof(*dst_inum)); - - /* Lookup src: */ - ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); - if (ret) - goto out; - old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, &src_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_src); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, src_dir, - bkey_s_c_to_dirent(old_src), src_inum); - if (ret) - goto out; - - /* Lookup dst: */ - ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); - if (ret) - goto out; - if (mode == BCH_RENAME) { - /* - * Note that we're _not_ checking if the target already exists - - * we're relying on the VFS to do that check for us for - * correctness: - */ - ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup); - if (ret) - goto out; - } else { - old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, &dst_name_lookup, - BTREE_ITER_intent); - ret = bkey_err(old_dst); - if (ret) - goto out; - - ret = bch2_dirent_read_target(trans, dst_dir, - bkey_s_c_to_dirent(old_dst), dst_inum); - if (ret) - goto out; - } - - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter.pos.offset; - - /* Create new dst key: */ - new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_dst); - if (ret) - goto out; - - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter.pos; - - /* Create new src key: */ - if (mode == BCH_RENAME_EXCHANGE) { - new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter.pos; - } else { - new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(new_src); - if (ret) - goto out; - - bkey_init(&new_src->k); - new_src->k.p = src_iter.pos; - - if (bkey_le(dst_pos, src_iter.pos) && - bkey_lt(src_iter.pos, dst_iter.pos)) { - /* - * We have a hash collision for the new dst key, - * and new_src - the key we're deleting - is between - * new_dst's hashed slot and the slot we're going to be - * inserting it into - oops. 
This will break the hash - * table if we don't deal with it: - */ - if (mode == BCH_RENAME) { - /* - * If we're not overwriting, we can just insert - * new_dst at the src position: - */ - new_src = new_dst; - new_src->k.p = src_iter.pos; - goto out_set_src; - } else { - /* If we're overwriting, we can't insert new_dst - * at a different slot because it has to - * overwrite old_dst - just make sure to use a - * whiteout when deleting src: - */ - new_src->k.type = KEY_TYPE_hash_whiteout; - } - } else { - /* Check if we need a whiteout to delete src: */ - ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - src_hash, &src_iter); - if (ret < 0) - goto out; - - if (ret) - new_src->k.type = KEY_TYPE_hash_whiteout; - } - } - - if (new_dst->v.d_type == DT_SUBVOL) - new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); - - if ((mode == BCH_RENAME_EXCHANGE) && - new_src->v.d_type == DT_SUBVOL) - new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); - if (ret) - goto out; -out_set_src: - /* - * If we're deleting a subvolume we need to really delete the dirent, - * not just emit a whiteout in the current snapshot - there can only be - * a single dirent that points to a given subvolume. - * - * IOW, we don't maintain multiple versions in different snapshots of - * dirents that point to subvolumes - dirents that point to subvolumes - * are only visible in one particular subvolume so it's not necessary, - * and it would be particularly confusing for fsck to have to deal with. - */ - delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && - new_src->k.p.snapshot != old_src.k->p.snapshot; - - delete_dst = old_dst.k && - bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && - new_dst->k.p.snapshot != old_dst.k->p.snapshot; - - if (!delete_src || !bkey_deleted(&new_src->k)) { - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; - } - - if (delete_src) { - bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; - } - - if (delete_dst) { - bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto out; - } - - if (mode == BCH_RENAME_EXCHANGE) - *src_offset = new_src->k.p.offset; - *dst_offset = new_dst->k.p.offset; -out: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; -} - -int bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) -{ - struct qstr lookup_name; - int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); - if (ret) - return ret; - - struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, &lookup_name, flags); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); - if (ret > 0) - ret = -ENOENT; -err: - if (ret) - bch2_trans_iter_exit(trans, iter); - return ret; -} - -u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum) -{ - struct 
btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - - int ret = lockrestart_do(trans, - bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir, 0, snapshot), - POS(dir, U64_MAX), 0, k, ret) - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) - continue; - ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) -{ - u32 snapshot; - - return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); -} - -static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) -{ - struct qstr name = bch2_dirent_get_name(d); - /* - * Although not required by the kernel code, updating ctx->pos is needed - * for the bcachefs FUSE driver. Without this update, the FUSE - * implementation will be stuck in an infinite loop when reading - * directories (via the bcachefs_fuse_readdir callback). - * In kernel space, ctx->pos is updated by the VFS code. - */ - ctx->pos = d.k->p.offset; - bool ret = dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(d.v->d_type)); - if (ret) - ctx->pos = d.k->p.offset + 1; - return !ret; -} - -int bch2_readdir(struct bch_fs *c, subvol_inum inum, - struct bch_hash_info *hash_info, - struct dir_context *ctx) -{ - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, - POS(inum.inum, ctx->pos), - POS(inum.inum, U64_MAX), - inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_dirent) - continue; - - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - - subvol_inum target; - - bool need_second_pass = false; - int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, - hash_info, &iter, k, &need_second_pass) ?: - bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret2 > 0) - continue; - - ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); - }))); - - bch2_bkey_buf_exit(&sk, c); - - return ret < 0 ? 
ret : 0; -} - -/* fsck */ - -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = bch_err_throw(trans->c, ENOENT_inode); -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h deleted file mode 100644 index 0417608c18d578..00000000000000 --- a/fs/bcachefs/dirent.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_H -#define _BCACHEFS_DIRENT_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_dirent_hash_desc; - -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_validate = bch2_dirent_validate, \ - .val_to_text = bch2_dirent_to_text, \ - .min_val_size = 16, \ -}) - -struct qstr; -struct file; -struct dir_context; -struct bch_fs; -struct bch_hash_info; -struct bch_inode_info; - -#ifdef CONFIG_UNICODE -int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, - const struct qstr *, struct qstr *); -#else -static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - return -EOPNOTSUPP; -} -#endif - -static inline int bch2_maybe_casefold(struct btree_trans *trans, - const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - if (likely(!info->cf_encoding)) { - *out_cf = *str; - return 0; - } else { - return bch2_casefold(trans, info, str, out_cf); - } -} - -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); - -static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) -{ - unsigned bytes = cf_len - ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len - : offsetof(struct bch_dirent, d_name) + len; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -int bch2_dirent_read_target(struct btree_trans *, subvol_inum, - struct bkey_s_c_dirent, subvol_inum *); - -static inline void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - -int bch2_dirent_init_name(struct bch_fs *, - struct bkey_i_dirent *, - const struct bch_hash_info *, - const struct qstr *, - const struct qstr *); -struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, - const struct bch_hash_info *, subvol_inum, u8, - const struct qstr *, const struct qstr *, u64); - -int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); -int bch2_dirent_create(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, - enum btree_iter_update_trigger_flags); - -static inline unsigned vfs_d_type(unsigned type) -{ - return type == DT_SUBVOL ? DT_DIR : type; -} - -enum bch_rename_mode { - BCH_RENAME, - BCH_RENAME_OVERWRITE, - BCH_RENAME_EXCHANGE, -}; - -int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, - const struct qstr *, subvol_inum *, u64 *, - const struct qstr *, subvol_inum *, u64 *, - enum bch_rename_mode); - -int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, - subvol_inum, const struct bch_hash_info *, - const struct qstr *, subvol_inum *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, - const struct bch_hash_info *, - const struct qstr *, subvol_inum *); - -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); -int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); - -int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); - -#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h deleted file mode 100644 index a46dbddd21aad8..00000000000000 --- a/fs/bcachefs/dirent_format.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DIRENT_FORMAT_H -#define _BCACHEFS_DIRENT_FORMAT_H - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. 
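Since the hashing scheme just described underpins everything in this file, a toy model of the lookup may help; this is plain C for illustration only, not bcachefs code, and the table layout and all names are invented:

	#include <stdint.h>
	#include <string.h>

	enum slot_state { SLOT_EMPTY, SLOT_USED, SLOT_WHITEOUT };

	struct slot {
		enum slot_state	state;
		uint64_t	hash;
		char		name[32];
	};

	/* probe linearly from the hashed slot: a whiteout keeps the probe
	 * chain alive, an empty slot terminates it */
	static struct slot *toy_lookup(struct slot *tbl, size_t nr,
				       uint64_t hash, const char *name)
	{
		for (size_t i = 0; i < nr; i++) {
			struct slot *s = &tbl[(hash + i) % nr];

			if (s->state == SLOT_EMPTY)
				return NULL;
			if (s->state == SLOT_USED &&
			    s->hash == hash &&
			    !strcmp(s->name, name))
				return s;
		}
		return NULL;
	}

Marking a deleted slot SLOT_EMPTY instead of SLOT_WHITEOUT would terminate probe chains early and make colliding entries unreachable, which is the point of the whiteout note that follows, and of the collision handling in bch2_dirent_rename() above.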
- * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 d_type:5, - d_unused:2, - d_casefold:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 d_casefold:1, - d_unused:2, - d_type:5; -#endif - - union { - struct { - __u8 d_pad; - __le16 d_name_len; - __le16 d_cf_name_len; - __u8 d_names[]; - } d_cf_name_block __packed; - __DECLARE_FLEX_ARRAY(__u8, d_name); - } __packed; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c deleted file mode 100644 index f7528cd69c73fe..00000000000000 --- a/fs/bcachefs/disk_accounting.c +++ /dev/null @@ -1,1074 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "btree_cache.h" -#include "btree_journal_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "compress.h" -#include "disk_accounting.h" -#include "error.h" -#include "journal_io.h" -#include "replicas.h" - -/* - * Notes on disk accounting: - * - * We have two parallel sets of counters to be concerned with, and both must be - * kept in sync. - * - * - Persistent/on disk accounting, stored in the accounting btree and updated - * via btree write buffer updates that treat new accounting keys as deltas to - * apply to existing values. But reading from a write buffer btree is - * expensive, so we also have - * - * - In memory accounting, where accounting is stored as an array of percpu - * counters, indexed by an eytzinger array of disk accounting keys/bpos (which - * are the same thing, excepting byte swabbing on big endian). - * - * Cheap to read, but non persistent. - * - * Disk accounting updates are generated by transactional triggers; these run as - * keys enter and leave the btree, and can compare old and new versions of keys; - * the output of these triggers are deltas to the various counters. - * - * Disk accounting updates are done as btree write buffer updates, where the - * counters in the disk accounting key are deltas that will be applied to the - * counter in the btree when the key is flushed by the write buffer (or journal - * replay). - * - * To do a disk accounting update: - * - initialize a disk_accounting_pos, to specify which counter is being updated - * - initialize counter deltas, as an array of 1-3 s64s - * - call bch2_disk_accounting_mod() - * - * This queues up the accounting update to be done at transaction commit time. - * Underneath, it's a normal btree write buffer update. - * - * The transaction commit path is responsible for propagating updates to the in - * memory counters, with bch2_accounting_mem_mod(). - * - * The commit path also assigns every disk accounting update a unique version - * number, based on the journal sequence number and offset within that journal - * buffer; this is used by journal replay to determine which updates have been - * done.
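To make the update recipe above concrete, here is a minimal sketch of a caller, modeled on bch2_mod_dev_cached_sectors() below; it assumes a btree_trans named trans is in scope, and the nr_inodes counter with a +1 delta is purely illustrative:

	/* queue a +1 delta against the filesystem-wide inode count; the
	 * delta is applied when the key is flushed by the write buffer
	 * (or journal replay): */
	struct disk_accounting_pos acc;
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;

	s64 d[1] = { 1 };
	int ret = bch2_disk_accounting_mod(trans, &acc, d, 1, false);

The final argument selects the normal counters; passing true instead routes the delta to the shadow gc counters used while gc is running.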
- * - * The transaction commit path also ensures that replicas entry accounting - * updates are properly marked in the superblock (so that we know whether we can - * mount without data being unavailable); it will update the superblock if - * bch2_accounting_mem_mod() tells it to. - */ - -static const char * const disk_accounting_type_strs[] = { -#define x(t, n, ...) [n] = #t, - BCH_DISK_ACCOUNTING_TYPES() -#undef x - NULL -}; - -static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, - s64 *d, unsigned nr) -{ - struct bkey_i_accounting *acc = bkey_accounting_init(k); - - acc->k.p = pos; - set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); - - memcpy_u64s_small(acc->v.d, d, nr); -} - -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) -{ - return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); -} - -static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); - -int bch2_disk_accounting_mod(struct btree_trans *trans, - struct disk_accounting_pos *k, - s64 *d, unsigned nr, bool gc) -{ - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); - - /* Normalize: */ - switch (k->type) { - case BCH_DISK_ACCOUNTING_replicas: - bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); - break; - } - - struct bpos pos = disk_accounting_pos_to_bpos(k); - - if (likely(!gc)) { - struct bkey_i_accounting *a; -#if 0 - for (a = btree_trans_subbuf_base(trans, &trans->accounting); - a != btree_trans_subbuf_top(trans, &trans->accounting); - a = (void *) bkey_next(&a->k_i)) - if (bpos_eq(a->k.p, pos)) { - BUG_ON(nr != bch2_accounting_counters(&a->k)); - acc_u64s(a->v.d, d, nr); - - if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { - unsigned offset = (u64 *) a - - (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); - - trans->accounting.u64s -= a->k.u64s; - memmove_u64s_down(a, - bkey_next(&a->k_i), - trans->accounting.u64s - offset); - } - return 0; - } -#endif - unsigned u64s = sizeof(*a) / sizeof(u64) + nr; - a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - __accounting_key_init(&a->k_i, pos, d, nr); - return 0; - } else { - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - - __accounting_key_init(&k_i.k, pos, d, nr); - - int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - if (ret == -BCH_ERR_btree_insert_need_mark_replicas) - ret = drop_locks_do(trans, - bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: - bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); - return ret; - } -} - -int bch2_mod_dev_cached_sectors(struct btree_trans *trans, - unsigned dev, s64 sectors, - bool gc) -{ - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_replicas_entry_cached(&acc.replicas, dev); - - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); -} - -static inline bool is_zero(char *start, char *end) -{ - BUG_ON(start > end); - - for (; start < end; start++) - if (*start) - return false; - return true; -} - -#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) - -static const unsigned bch2_accounting_type_nr_counters[] = { -#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, - BCH_DISK_ACCOUNTING_TYPES() -#undef x -}; - -int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ -
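	/* Decode the accounting position from the key's bpos and validate it:
	 * each accounting type has a fixed payload, so any bytes between the
	 * end of that payload and the end of the struct must be zero, and the
	 * number of counters in the value must match the type's declared
	 * counter count. */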
struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - void *end = &acc_k + 1; - int ret = 0; - - bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && - bversion_zero(k.k->bversion), - c, accounting_key_version_0, - "accounting key with version=0"); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - end = field_end(acc_k, nr_inodes); - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - end = field_end(acc_k, persistent_reserved); - break; - case BCH_DISK_ACCOUNTING_replicas: - bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, accounting_key_replicas_nr_devs_0, - "accounting key replicas entry with nr_devs=0"); - - bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || - (acc_k.replicas.nr_required > 1 && - acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, accounting_key_replicas_nr_required_bad, - "accounting key replicas entry with bad nr_required"); - - for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, accounting_key_replicas_devs_unsorted, - "accounting key replicas entry with unsorted devs"); - - end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - end = field_end(acc_k, dev_data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - end = field_end(acc_k, compression); - break; - case BCH_DISK_ACCOUNTING_snapshot: - end = field_end(acc_k, snapshot); - break; - case BCH_DISK_ACCOUNTING_btree: - end = field_end(acc_k, btree); - break; - case BCH_DISK_ACCOUNTING_rebalance_work: - end = field_end(acc_k, rebalance_work); - break; - } - - bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, accounting_key_junk_at_end, - "junk at end of accounting key"); - - bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], - c, accounting_key_nr_counters_wrong, - "accounting key with %u counters, should be %u", - bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); -fsck_err: - return ret; -} - -void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) -{ - if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { - prt_printf(out, "unknown type %u", k->type); - return; - } - - prt_str(out, disk_accounting_type_strs[k->type]); - prt_str(out, " "); - - switch (k->type) { - case BCH_DISK_ACCOUNTING_nr_inodes: - break; - case BCH_DISK_ACCOUNTING_persistent_reserved: - prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); - break; - case BCH_DISK_ACCOUNTING_replicas: - bch2_replicas_entry_to_text(out, &k->replicas); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); - bch2_prt_data_type(out, k->dev_data_type.data_type); - break; - case BCH_DISK_ACCOUNTING_compression: - bch2_prt_compression_type(out, k->compression.type); - break; - case BCH_DISK_ACCOUNTING_snapshot: - prt_printf(out, "id=%u", k->snapshot.id); - break; - case BCH_DISK_ACCOUNTING_btree: - prt_str(out, "btree="); - bch2_btree_id_to_text(out, k->btree.id); - break; - } -} - -void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - bch2_accounting_key_to_text(out, &acc_k); - - for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) - prt_printf(out, " %lli", 
acc.v->d[i]); -} - -void bch2_accounting_swab(struct bkey_s k) -{ - for (u64 *p = (u64 *) k.v; - p < (u64 *) bkey_val_end(k); - p++) - *p = swab64(*p); -} - -static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos *acc) -{ - unsafe_memcpy(r, &acc->replicas, - replicas_entry_bytes(&acc->replicas), - "variable length struct"); -} - -static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) -{ - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, p); - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, &acc_k); - return true; - default: - return false; - } -} - -static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) -{ - union bch_replicas_padded r; - return accounting_to_replicas(&r.e, p) - ? bch2_mark_replicas(c, &r.e) - : 0; -} - -/* - * Ensure accounting keys being updated are present in the superblock, when - * applicable (i.e. replicas updates) - */ -int bch2_accounting_update_sb(struct btree_trans *trans) -{ - for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); - i != btree_trans_subbuf_top(trans, &trans->accounting); - i = bkey_next(i)) { - int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); - if (ret) - return ret; - } - - return 0; -} - -static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) -{ - struct bch_accounting_mem *acc = &c->accounting; - - /* raced with another insert, already present: */ - if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p) < acc->k.nr) - return 0; - - struct accounting_mem_entry n = { - .pos = a.k->p, - .bversion = a.k->bversion, - .nr_counters = bch2_accounting_counters(a.k), - .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL), - }; - - if (!n.v[0]) - goto err; - - if (acc->gc_running) { - n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!n.v[1]) - goto err; - } - - if (darray_push(&acc->k, n)) - goto err; - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - if (trace_accounting_mem_insert_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_accounting_to_text(&buf, c, a.s_c); - trace_accounting_mem_insert(c, buf.buf); - printbuf_exit(&buf); - } - return 0; -err: - free_percpu(n.v[1]); - free_percpu(n.v[0]); - return bch_err_throw(c, ENOMEM_disk_accounting); -} - -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - percpu_up_read(&c->mark_lock); - percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a); - percpu_up_write(&c->mark_lock); - percpu_down_read(&c->mark_lock); - return ret; -} - -int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) -{ - union bch_replicas_padded r; - - if (mode != BCH_ACCOUNTING_read && - accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return bch_err_throw(c, btree_insert_need_mark_replicas); - - return __bch2_accounting_mem_insert(c, a); -} - -static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) -{ - for (unsigned i = 
0; i < e->nr_counters; i++) - if (percpu_u64_get(e->v[0] + i) || - (e->v[1] && - percpu_u64_get(e->v[1] + i))) - return false; - return true; -} - -void bch2_accounting_mem_gc(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_write(&c->mark_lock); - struct accounting_mem_entry *dst = acc->k.data; - - darray_for_each(acc->k, src) { - if (accounting_mem_entry_is_zero(src)) { - free_percpu(src->v[0]); - free_percpu(src->v[1]); - } else { - *dst++ = *src; - } - } - - acc->k.nr = dst - acc->k.data; - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - percpu_up_write(&c->mark_lock); -} - -/* - * Read out accounting keys for replicas entries, as an array of - * bch_replicas_usage entries. - * - * Note: this may be deprecated/removed at some point in the future and replaced - * with something more general; it exists to support the ioctl used by the - * 'bcachefs fs usage' command. - */ -int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) -{ - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - darray_init(usage); - - percpu_down_read(&c->mark_lock); - darray_for_each(acc->k, i) { - union { - u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, - BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_usage r; - } u; - u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; - - if (!accounting_to_replicas(&u.r.r, i->pos)) - continue; - - u64 sectors; - bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false); - u.r.sectors = sectors; - - ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); - if (ret) - break; - - memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); - usage->nr += replicas_usage_bytes(&u.r); - } - percpu_up_read(&c->mark_lock); - - if (ret) - darray_exit(usage); - return ret; -} - -int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) -{ - - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - darray_init(out_buf); - - percpu_down_read(&c->mark_lock); - darray_for_each(acc->k, i) { - struct disk_accounting_pos a_p; - bpos_to_disk_accounting_pos(&a_p, i->pos); - - if (!(accounting_types_mask & BIT(a_p.type))) - continue; - - ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + - sizeof(u64) * i->nr_counters); - if (ret) - break; - - struct bkey_i_accounting *a_out = - bkey_accounting_init((void *) &darray_top(*out_buf)); - set_bkey_val_u64s(&a_out->k, i->nr_counters); - a_out->k.p = i->pos; - bch2_accounting_mem_read_counters(acc, i - acc->k.data, - a_out->v.d, i->nr_counters, false); - - if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) - out_buf->nr += bkey_bytes(&a_out->k); - } - - percpu_up_read(&c->mark_lock); - - if (ret) - darray_exit(out_buf); - return ret; -} - -static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) -{ - darray_for_each(acc->k, e) { - free_percpu(e->v[gc]); - e->v[gc] = NULL; - } -} - -int bch2_gc_accounting_start(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - int ret = 0; - - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) { - e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!e->v[1]) { - bch2_accounting_free_counters(acc, true); - ret = bch_err_throw(c, ENOMEM_disk_accounting); - break; - } - } - - acc->gc_running = !ret; - percpu_up_write(&c->mark_lock); - - return ret; -} - -int bch2_gc_accounting_done(struct bch_fs *c) -{ - struct
bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct bpos pos = POS_MIN; - int ret = 0; - - percpu_down_write(&c->mark_lock); - while (1) { - unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &pos); - - if (idx >= acc->k.nr) - break; - - struct accounting_mem_entry *e = acc->k.data + idx; - pos = bpos_successor(e->pos); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, e->pos); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; - - u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; - u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - - unsigned nr = e->nr_counters; - bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); - bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); - - if (memcmp(dst_v, src_v, nr * sizeof(u64))) { - printbuf_reset(&buf); - prt_str(&buf, "accounting mismatch for "); - bch2_accounting_key_to_text(&buf, &acc_k); - - prt_str(&buf, ":\n got"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", dst_v[j]); - - prt_str(&buf, "\nshould be"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", src_v[j]); - - for (unsigned j = 0; j < nr; j++) - src_v[j] -= dst_v[j]; - - bch2_trans_unlock_long(trans); - - if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { - percpu_up_write(&c->mark_lock); - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); - percpu_down_write(&c->mark_lock); - if (ret) - goto err; - - if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - - accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, - bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal, true); - - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - struct bch_fs_usage_base *src = &trans->fs_usage_delta; - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - } - } - } - } -err: -fsck_err: - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - if (k.k->type != KEY_TYPE_accounting) - return 0; - - percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read, false); - percpu_up_read(&c->mark_lock); - return ret; -} - -static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos *acc, - u64 *v, unsigned nr) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0, invalid_dev = -1; - - switch (acc->type) { - case BCH_DISK_ACCOUNTING_replicas: { - union bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); - - for (unsigned i = 0; i < r.e.nr_devs; i++) - if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r.e.devs[i])) { - invalid_dev = r.e.devs[i]; - goto invalid_device; - } - - /* - * All replicas entry checks except for invalid device are done - * in bch2_accounting_validate - */ - BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n%s", - 
(printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_write(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_write(&c->mark_lock); - } - break; - } - - case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { - invalid_dev = acc->dev_data_type.dev; - goto invalid_device; - } - break; - } - -fsck_err: - printbuf_exit(&buf); - return ret; -invalid_device: - if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n%s", - invalid_dev, - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, acc), - buf.buf))) { - for (unsigned i = 0; i < nr; i++) - v[i] = -v[i]; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: - -BCH_ERR_remove_disk_accounting_entry; - } else { - ret = bch_err_throw(c, remove_disk_accounting_entry); - } - goto fsck_err; -} - -/* - * At startup time, initialize the in memory accounting from the btree (and - * journal) - */ -int bch2_accounting_read(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - - /* - * We might run more than once if we rewind to start topology repair or - * btree node scan - and those might cause us to get different results, - * so we can't just skip if we've already run. - * - * Instead, zero out any accounting we have: - */ - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) - percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); - for_each_member_device(c, ca) - percpu_memset(ca->usage, 0, sizeof(*ca->usage)); - percpu_memset(c->usage, 0, sizeof(*c->usage)); - percpu_up_write(&c->mark_lock); - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - iter.flags &= ~BTREE_ITER_with_journal; - int ret = for_each_btree_key_continue(trans, iter, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - - if (k.k->type != KEY_TYPE_accounting) - continue; - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - accounting_read_key(trans, k); - })); - if (ret) - goto err; - - struct journal_keys *keys = &c->journal_keys; - struct journal_key *dst = keys->data; - move_gap(keys, keys->nr); - - darray_for_each(*keys, i) { - if (i->k->k.type == KEY_TYPE_accounting) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - - if (!bch2_accounting_is_mem(&acc_k)) - continue; - - struct bkey_s_c k = bkey_i_to_s_c(i->k); - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, - sizeof(acc->k.data[0]), - accounting_pos_cmp, &k.k->p); - - bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; - - if (applied) - continue; - - if (i + 1 < &darray_top(*keys) && - i[1].k->k.type == KEY_TYPE_accounting && - !journal_key_cmp(i, i + 1)) { - 
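	/* Adjacent journal keys for the same accounting position: fold this
	 * delta into the following (newer) key and drop this one, so each
	 * position is read exactly once. */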
WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); - - i[1].journal_seq = i[0].journal_seq; - - bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), - bkey_s_c_to_accounting(k)); - continue; - } - - ret = accounting_read_key(trans, k); - if (ret) - goto err; - } - - *dst++ = *i; - } - keys->gap = keys->nr = dst - keys->data; - - percpu_down_write(&c->mark_lock); - - darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - memset(v, 0, sizeof(v)); - - for (unsigned j = 0; j < i->nr_counters; j++) - v[j] = percpu_u64_get(i->v[0] + j); - - /* - * If the entry counters are zeroed, it should be treated as - * nonexistent - it might point to an invalid device. - * - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); - free_percpu(i->v[1]); - darray_remove_item(&acc->k, i); - ret = 0; - continue; - } - - if (ret) - goto fsck_err; - } - - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); - - preempt_disable(); - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); - - for (unsigned i = 0; i < acc->k.nr; i++) { - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - switch (k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - usage->reserved += v[0] * k.persistent_reserved.nr_replicas; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); - if (ca) { - struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; - percpu_u64_set(&d->buckets, v[0]); - percpu_u64_set(&d->sectors, v[1]); - percpu_u64_set(&d->fragmented, v[2]); - - if (k.dev_data_type.data_type == BCH_DATA_sb || - k.dev_data_type.data_type == BCH_DATA_journal) - usage->hidden += v[0] * ca->mi.bucket_size; - } - break; - } - } - } - preempt_enable(); -fsck_err: - percpu_up_write(&c->mark_lock); -err: - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) -{ - return bch2_trans_run(c, - bch2_btree_write_buffer_flush_sync(trans) ?: - for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - acc.type == BCH_DISK_ACCOUNTING_dev_data_type && - acc.dev_data_type.dev == dev - ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) - : 0; - })) ?: - bch2_btree_write_buffer_flush_sync(trans)); -} - -int bch2_dev_usage_init(struct bch_dev *ca, bool gc) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - - int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod2(trans, gc, - v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free) ?: - (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); - })); - bch_err_fn(c, ret); - return ret; -} - -void bch2_verify_accounting_clean(struct bch_fs *c) -{ - bool mismatch = false; - struct bch_fs_usage_base base = {}, base_inmem = {}; - - bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); - unsigned nr = bch2_accounting_counters(k.k); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } - - bch2_accounting_mem_read(c, k.k->p, v, nr); - - if (memcmp(a.v->d, v, nr * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " !="); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - { - guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) - continue; - - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - } - - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " in mem"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); - - pr_err("dev accounting mismatch: %s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - } - - 0; - }))); - - acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); - -#define check(x) \ - if (base.x != base_inmem.x) { \ - pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ - mismatch = true; \ - } - - //check(hidden); - check(btree); - check(data); - check(cached); - check(reserved); - check(nr_inodes); - - WARN_ON(mismatch); -} - -void bch2_accounting_gc_free(struct bch_fs *c) -{ - lockdep_assert_held(&c->mark_lock); - - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, true); - acc->gc_running = false; -} - -void bch2_fs_accounting_exit(struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - bch2_accounting_free_counters(acc, false); - darray_exit(&acc->k); -} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h deleted file mode 100644 index d61abebf3e0be4..00000000000000 --- a/fs/bcachefs/disk_accounting.h +++ /dev/null @@ -1,301 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_H -#define _BCACHEFS_DISK_ACCOUNTING_H - -#include "btree_update.h" -#include "eytzinger.h" -#include "sb-members.h" - -static inline void bch2_u64s_neg(u64 *v, unsigned nr) -{ - for (unsigned 
i = 0; i < nr; i++) - v[i] = -v[i]; -} - -static inline unsigned bch2_accounting_counters(const struct bkey *k) -{ - return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); -} - -static inline void bch2_accounting_neg(struct bkey_s_accounting a) -{ - bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); -} - -static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) -{ - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - if (a.v->d[i]) - return false; - return true; -} - -static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, - struct bkey_s_c_accounting src) -{ - for (unsigned i = 0; - i < min(bch2_accounting_counters(&dst->k), - bch2_accounting_counters(src.k)); - i++) - dst->v.d[i] += src.v->d[i]; - - if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) - dst->k.bversion = src.k->bversion; -} - -static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, - enum bch_data_type data_type, - s64 sectors) -{ - switch (data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - case BCH_DATA_parity: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - default: - break; - } -} - -static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) -{ - BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - acc->_pad = p; -#else - memcpy_swab(acc, &p, sizeof(p)); -#endif -} - -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) -{ - struct bpos p; -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p = acc->_pad; -#else - memcpy_swab(&p, acc, sizeof(p)); -#endif - return p; -} - -int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, - s64 *, unsigned, bool); - -#define disk_accounting_key_init(_k, _type, ...) \ -do { \ - memset(&(_k), 0, sizeof(_k)); \ - (_k).type = BCH_DISK_ACCOUNTING_##_type; \ - (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ -} while (0) - -#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ -({ \ - struct disk_accounting_pos pos; \ - disk_accounting_key_init(pos, __VA_ARGS__); \ - bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ -}) - -#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) 
\ - bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) - -int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); - -int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); -void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_accounting_swab(struct bkey_s); - -#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_validate = bch2_accounting_validate, \ - .val_to_text = bch2_accounting_to_text, \ - .swab = bch2_accounting_swab, \ - .min_val_size = 8, \ -}) - -int bch2_accounting_update_sb(struct btree_trans *); - -static inline int accounting_pos_cmp(const void *_l, const void *_r) -{ - const struct bpos *l = _l, *r = _r; - - return bpos_cmp(*l, *r); -} - -enum bch_accounting_mode { - BCH_ACCOUNTING_normal, - BCH_ACCOUNTING_gc, - BCH_ACCOUNTING_read, -}; - -int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); -void bch2_accounting_mem_gc(struct bch_fs *); - -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) -{ - return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc->type != BCH_DISK_ACCOUNTING_inum; -} - -/* - * Update in memory counters so they match the btree update we're doing; called - * from transaction commit path - */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, - struct bkey_s_c_accounting a, - enum bch_accounting_mode mode, - bool write_locked) -{ - struct bch_fs *c = trans->c; - struct bch_accounting_mem *acc = &c->accounting; - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - bool gc = mode == BCH_ACCOUNTING_gc; - - if (gc && !acc->gc_running) - return 0; - - if (!bch2_accounting_is_mem(&acc_k)) - return 0; - - if (mode == BCH_ACCOUNTING_normal) { - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); - } - break; - } - } - } - - unsigned idx; - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = 0; - if (unlikely(write_locked)) - ret = bch2_accounting_mem_insert_locked(c, a, mode); - else - ret = bch2_accounting_mem_insert(c, a, mode); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - -static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) -{ - percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, 
gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); - percpu_up_read(&trans->c->mark_lock); - return ret; -} - -static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, - unsigned idx, u64 *v, unsigned nr, bool gc) -{ - memset(v, 0, sizeof(*v) * nr); - - if (unlikely(idx >= acc->k.nr)) - return; - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - nr = min_t(unsigned, nr, e->nr_counters); - - for (unsigned i = 0; i < nr; i++) - v[i] = percpu_u64_get(e->v[gc] + i); -} - -static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, - u64 *v, unsigned nr) -{ - percpu_down_read(&c->mark_lock); - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p); - - bch2_accounting_mem_read_counters(acc, idx, v, nr, false); - percpu_up_read(&c->mark_lock); -} - -static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -{ - EBUG_ON(!res->ref); - - return (struct bversion) { - .hi = res->seq >> 32, - .lo = (res->seq << 32) | (res->offset + offset), - }; -} - -static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, - struct bkey_i_accounting *a, - unsigned commit_flags) -{ - u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); - - EBUG_ON(bversion_zero(a->k.bversion)); - - return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) - ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false) - : 0; -} - -static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, - struct bkey_i_accounting *a_i, - unsigned commit_flags) -{ - if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - struct bkey_s_accounting a = accounting_i_to_s(a_i); - - bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false); - bch2_accounting_neg(a); - } -} - -int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); -int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); - -int bch2_gc_accounting_start(struct bch_fs *); -int bch2_gc_accounting_done(struct bch_fs *); - -int bch2_accounting_read(struct bch_fs *); - -int bch2_dev_usage_remove(struct bch_fs *, unsigned); -int bch2_dev_usage_init(struct bch_dev *, bool); - -void bch2_verify_accounting_clean(struct bch_fs *c); - -void bch2_accounting_gc_free(struct bch_fs *); -void bch2_fs_accounting_exit(struct bch_fs *); - -#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h deleted file mode 100644 index 8269af1dbe2a09..00000000000000 --- a/fs/bcachefs/disk_accounting_format.h +++ /dev/null @@ -1,225 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H -#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H - -#include "replicas_format.h" - -/* - * Disk accounting - KEY_TYPE_accounting - on disk format: - * - * Here, the key has considerably more structure than a typical key (bpos); an - * accounting key is 'struct disk_accounting_pos', which is a union of bpos. - * - * More specifically: a key is just a multiword integer (where word endianness - * matches native byte order), so we're treating bpos as an opaque 20 byte - * integer and mapping bch_accounting_key to that.
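The mapping just described can be seen with a round trip through the helpers from disk_accounting.h above; a hedged sketch, where the snapshot counter and the id 42 are chosen arbitrarily:

	struct disk_accounting_pos acc;
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_snapshot;
	acc.snapshot.id = 42;

	/* on little endian hosts memcpy_swab() reverses the bytes, so the
	 * leading type byte lands in the most significant position of the
	 * bpos and all keys of one accounting type sort together: */
	struct bpos p = disk_accounting_pos_to_bpos(&acc);

	struct disk_accounting_pos rt;
	bpos_to_disk_accounting_pos(&rt, p);
	/* rt.type == BCH_DISK_ACCOUNTING_snapshot, rt.snapshot.id == 42 */

That sort order is what lets bch2_accounting_read() above skip an entire accounting type by seeking to the position of type + 1.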
- * - * This is a type-tagged union of all our various subtypes; a disk accounting - * key can be device counters, replicas counters, et cetera - it's extensible. - * - * The value is a list of u64s or s64s; the number of counters is specific to a - * given accounting type. - * - * Unlike with other key types, updates are _deltas_, and the deltas are not - * resolved until the update to the underlying btree, done by btree write buffer - * flush or journal replay. - * - * Journal replay in particular requires special handling. The journal tracks a - * range of entries which may not yet have been applied to the btree - it does - * not know definitively whether individual entries are dirty and still need to - * be applied. - * - * To handle this, we use the version field of struct bkey, and give every - * accounting update a unique version number - a total ordering in time; the - * version number is derived from the key's position in the journal. Then - * journal replay can compare the version number of the key from the journal - * with the version number of the key in the btree to determine if a key needs - * to be replayed. - * - * For this to work, we must maintain this strict time ordering of updates as - * they are flushed to the btree, both via write buffer flush and via journal - * replay. This has complications for the write buffer code while journal replay - * is still in progress; the write buffer cannot flush any accounting keys to - * the btree until journal replay has finished replaying its accounting keys, or - * the (newer) version number of the keys from the write buffer will cause - * updates from journal replay to be lost. - */ - -struct bch_accounting { - struct bch_val v; - __u64 d[]; -}; - -#define BCH_ACCOUNTING_MAX_COUNTERS 3 - -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) \ - x(unstriped, 10) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - -/* - * field 1: name - * field 2: id - * field 3: number of counters (max 3) - */ - -#define BCH_DISK_ACCOUNTING_TYPES() \ - x(nr_inodes, 0, 1) \ - x(persistent_reserved, 1, 1) \ - x(replicas, 2, 1) \ - x(dev_data_type, 3, 3) \ - x(compression, 4, 3) \ - x(snapshot, 5, 1) \ - x(btree, 6, 1) \ - x(rebalance_work, 7, 1) \ - x(inum, 8, 3) - -enum disk_accounting_type { -#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, - BCH_DISK_ACCOUNTING_TYPES() -#undef x - BCH_DISK_ACCOUNTING_TYPE_NR, -}; - -/* - * No subtypes - number of inodes in the entire filesystem - * - * XXX: perhaps we could add a per-subvolume counter?
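One consequence of the version ordering described above: replay can decide whether a given journal accounting key still needs to be applied with a single version comparison. A hedged sketch with invented names, mirroring the "applied" check in bch2_accounting_read() above:

	/* the in-memory/btree copy already reflects the journal entry iff
	 * its version is >= the journal key's version */
	static bool accounting_needs_replay(struct bversion in_btree,
					    struct bversion from_journal)
	{
		return bversion_cmp(in_btree, from_journal) < 0;
	}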
- */ -struct bch_acct_nr_inodes { -}; - -/* - * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the - * reservation: - */ -struct bch_acct_persistent_reserved { - __u8 nr_replicas; -}; - -/* - * device, data type counter fields: - * [ - * nr_buckets - * live sectors (in buckets of that data type) - * sectors of internal fragmentation - * ] - * - * XXX: live sectors should've been done differently, you can have multiple data - * types in the same bucket (user, stripe, cached) and this collapses them to - * the bucket data type, and makes the internal fragmentation counter redundant - */ -struct bch_acct_dev_data_type { - __u8 dev; - __u8 data_type; -}; - -/* - * Compression type fields: - * [ - * number of extents - * uncompressed size - * compressed size - * ] - * - * Compression ratio, average extent size (fragmentation). - */ -struct bch_acct_compression { - __u8 type; -}; - -/* - * On disk usage by snapshot id; counts same values as replicas counter, but - * aggregated differently - */ -struct bch_acct_snapshot { - __u32 id; -} __packed; - -struct bch_acct_btree { - __u32 id; -} __packed; - -/* - * inum counter fields: - * [ - * number of extents - * sum of extent sizes - bkey size - * this field is similar to inode.bi_sectors, except here extents in - * different snapshots but the same inode number are all collapsed to the - * same counter - * sum of on disk size - same values tracked by replicas counters - * ] - * - * This tracks on disk fragmentation. - */ -struct bch_acct_inum { - __u64 inum; -} __packed; - -/* - * Simple counter of the amount of data (on disk sectors) rebalance needs to - * move, extents counted here are also in the rebalance_work btree. - */ -struct bch_acct_rebalance_work { -}; - -struct disk_accounting_pos { - union { - struct { - __u8 type; - union { - struct bch_acct_nr_inodes nr_inodes; - struct bch_acct_persistent_reserved persistent_reserved; - struct bch_replicas_entry_v1 replicas; - struct bch_acct_dev_data_type dev_data_type; - struct bch_acct_compression compression; - struct bch_acct_snapshot snapshot; - struct bch_acct_btree btree; - struct bch_acct_rebalance_work rebalance_work; - struct bch_acct_inum inum; - } __packed; - } __packed; - struct bpos _pad; - }; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h deleted file mode 100644 index b1982131b20666..00000000000000 --- a/fs/bcachefs/disk_accounting_types.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H -#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H - -#include "darray.h" - -struct accounting_mem_entry { - struct bpos pos; - struct bversion bversion; - unsigned nr_counters; - u64 __percpu *v[2]; -}; - -struct bch_accounting_mem { - DARRAY(struct accounting_mem_entry) k; - bool gc_running; -}; - -#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c deleted file mode 100644 index cde842ac188632..00000000000000 --- a/fs/bcachefs/disk_groups.c +++ /dev/null @@ -1,591 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "disk_groups.h" -#include "sb-members.h" -#include "super-io.h" - -#include - -static int group_cmp(const void *_l, const void *_r) -{ - const struct bch_disk_group *l = _l; - const struct bch_disk_group *r = _r; - - return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - - (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: - 
((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - - (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: - strncmp(l->label, r->label, sizeof(l->label)); -} - -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g, *sorted = NULL; - unsigned nr_groups = disk_groups_nr(groups); - unsigned i, len; - int ret = 0; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(sb, i); - unsigned group_id; - - if (!BCH_MEMBER_GROUP(&m)) - continue; - - group_id = BCH_MEMBER_GROUP(&m) - 1; - - if (group_id >= nr_groups) { - prt_printf(err, "disk %u has invalid label %u (have %u)", - i, group_id, nr_groups); - return -BCH_ERR_invalid_sb_disk_groups; - } - - if (BCH_GROUP_DELETED(&groups->entries[group_id])) { - prt_printf(err, "disk %u has deleted label %u", i, group_id); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - if (!nr_groups) - return 0; - - for (i = 0; i < nr_groups; i++) { - g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - len = strnlen(g->label, sizeof(g->label)); - if (!len) { - prt_printf(err, "label %u empty", i); - return -BCH_ERR_invalid_sb_disk_groups; - } - } - - sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); - if (!sorted) - return -BCH_ERR_ENOMEM_disk_groups_validate; - - memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); - sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - - for (g = sorted; g + 1 < sorted + nr_groups; g++) - if (!BCH_GROUP_DELETED(g) && - !group_cmp(&g[0], &g[1])) { - prt_printf(err, "duplicate label %llu.%.*s", - BCH_GROUP_PARENT(g), - (int) sizeof(g->label), g->label); - ret = -BCH_ERR_invalid_sb_disk_groups; - goto err; - } -err: - kfree(sorted); - return ret; -} - -static void bch2_sb_disk_groups_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_disk_groups *groups = - field_to_type(f, disk_groups); - struct bch_disk_group *g; - unsigned nr_groups = disk_groups_nr(groups); - - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { - if (g != groups->entries) - prt_printf(out, " "); - - if (BCH_GROUP_DELETED(g)) - prt_printf(out, "[deleted]"); - else - prt_printf(out, "[parent %llu name %s]", - BCH_GROUP_PARENT(g), g->label); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { - .validate = bch2_sb_disk_groups_validate, - .to_text = bch2_sb_disk_groups_to_text -}; - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_disk_groups *groups; - struct bch_disk_groups_cpu *cpu_g, *old_g; - unsigned i, g, nr_groups; - - lockdep_assert_held(&c->sb_lock); - - groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); - nr_groups = disk_groups_nr(groups); - - if (!groups) - return 0; - - cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); - if (!cpu_g) - return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); - - cpu_g->nr = nr_groups; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *src = &groups->entries[i]; - struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; - - dst->deleted = BCH_GROUP_DELETED(src); - dst->parent = BCH_GROUP_PARENT(src); - memcpy(dst->label, src->label, sizeof(dst->label)); - } - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); - struct bch_disk_group_cpu *dst; - - if 
(!bch2_member_alive(&m)) - continue; - - g = BCH_MEMBER_GROUP(&m); - while (g) { - dst = &cpu_g->entries[g - 1]; - __set_bit(i, dst->devs.d); - g = dst->parent; - } - } - - old_g = rcu_dereference_protected(c->disk_groups, - lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->disk_groups, cpu_g); - if (old_g) - kfree_rcu(old_g, rcu); - - return 0; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -{ - struct target t = target_decode(target); - - guard(rcu)(); - - switch (t.type) { - case TARGET_NULL: - return NULL; - case TARGET_DEV: { - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - return ca ? &ca->self : NULL; - } - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - return g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - } - default: - BUG(); - } -} - -bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -{ - struct target t = target_decode(target); - - switch (t.type) { - case TARGET_NULL: - return false; - case TARGET_DEV: - return dev == t.dev; - case TARGET_GROUP: { - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - const struct bch_devs_mask *m = - g && t.group < g->nr && !g->entries[t.group].deleted - ? &g->entries[t.group].devs - : NULL; - - return m ? test_bit(dev, m->d) : false; - } - default: - BUG(); - } -} - -static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, - unsigned parent, - const char *name, unsigned namelen) -{ - unsigned i, nr_groups = disk_groups_nr(groups); - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; i < nr_groups; i++) { - struct bch_disk_group *g = groups->entries + i; - - if (BCH_GROUP_DELETED(g)) - continue; - - if (!BCH_GROUP_DELETED(g) && - BCH_GROUP_PARENT(g) == parent && - strnlen(g->label, sizeof(g->label)) == namelen && - !memcmp(name, g->label, namelen)) - return i; - } - - return -1; -} - -static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, - const char *name, unsigned namelen) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - unsigned i, nr_groups = disk_groups_nr(groups); - struct bch_disk_group *g; - - if (!namelen || namelen > BCH_SB_LABEL_SIZE) - return -EINVAL; - - for (i = 0; - i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); - i++) - ; - - if (i == nr_groups) { - unsigned u64s = - (sizeof(struct bch_sb_field_disk_groups) + - sizeof(struct bch_disk_group) * (nr_groups + 1)) / - sizeof(u64); - - groups = bch2_sb_field_resize(sb, disk_groups, u64s); - if (!groups) - return -BCH_ERR_ENOSPC_disk_label_add; - - nr_groups = disk_groups_nr(groups); - } - - BUG_ON(i >= nr_groups); - - g = &groups->entries[i]; - - memcpy(g->label, name, namelen); - if (namelen < sizeof(g->label)) - g->label[namelen] = '\0'; - SET_BCH_GROUP_DELETED(g, 0); - SET_BCH_GROUP_PARENT(g, parent); - SET_BCH_GROUP_DATA_ALLOWED(g, ~0); - - return i; -} - -int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb->sb, disk_groups); - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - v = __bch2_disk_group_find(groups, v + 1, name, len); - name = next; - } while (*name && v >= 0); - - return v; -} - -int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -{ - 
struct bch_sb_field_disk_groups *groups; - unsigned parent = 0; - int v = -1; - - do { - const char *next = strchrnul(name, '.'); - unsigned len = next - name; - - if (*next == '.') - next++; - - groups = bch2_sb_field_get(sb->sb, disk_groups); - - v = __bch2_disk_group_find(groups, parent, name, len); - if (v < 0) - v = __bch2_disk_group_add(sb, parent, name, len); - if (v < 0) - return v; - - parent = v + 1; - name = next; - } while (*name && v >= 0); - - return v; -} - -static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, - unsigned v) -{ - u16 path[32]; - unsigned nr = 0; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto invalid; - - if (v >= (g ? g->nr : 0)) - goto invalid; - - struct bch_disk_group_cpu *e = g->entries + v; - - if (e->deleted) - goto invalid; - - path[nr++] = v; - - if (!e->parent) - break; - - v = e->parent - 1; - } - - while (nr) { - struct bch_disk_group_cpu *e = g->entries + path[--nr]; - - prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); - if (nr) - prt_printf(out, "."); - } - return; -invalid: - prt_printf(out, "invalid label %u", v); -} - -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_make_room(out, 4096); - - out->atomic++; - guard(rcu)(); - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - - for (unsigned i = 0; i < (g ? g->nr : 0); i++) { - prt_printf(out, "%2u: ", i); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - goto next; - } - - __bch2_disk_path_to_text(out, g, i); - - prt_printf(out, " devs"); - - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); -next: - prt_newline(out); - } - - out->atomic--; -} - -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - out->atomic++; - guard(rcu)(); - __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), - --out->atomic; -} - -void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct bch_sb_field_disk_groups *groups = - bch2_sb_field_get(sb, disk_groups); - struct bch_disk_group *g; - unsigned nr = 0; - u16 path[32]; - - while (1) { - if (nr == ARRAY_SIZE(path)) - goto inval; - - if (v >= disk_groups_nr(groups)) - goto inval; - - g = groups->entries + v; - - if (BCH_GROUP_DELETED(g)) - goto inval; - - path[nr++] = v; - - if (!BCH_GROUP_PARENT(g)) - break; - - v = BCH_GROUP_PARENT(g) - 1; - } - - while (nr) { - v = path[--nr]; - g = groups->entries + v; - - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); - if (nr) - prt_printf(out, "."); - } - return; -inval: - prt_printf(out, "invalid label %u", v); -} - -int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - lockdep_assert_held(&c->sb_lock); - - - if (!strlen(name) || !strcmp(name, "none")) { - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, 0); - } else { - int v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) - return v; - - struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, v + 1); - } - - return bch2_sb_disk_groups_to_cpu(c); -} - -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -{ - int ret; - - mutex_lock(&c->sb_lock); - ret = __bch2_dev_group_set(c, ca, name) ?: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, - struct 
printbuf *err) -{ - struct bch_dev *ca; - int g; - - if (!val) - return -EINVAL; - - if (!c) - return -BCH_ERR_option_needs_open_fs; - - if (!strlen(val) || !strcmp(val, "none")) { - *res = 0; - return 0; - } - - /* Is it a device? */ - ca = bch2_dev_lookup(c, val); - if (!IS_ERR(ca)) { - *res = dev_to_target(ca->dev_idx); - bch2_dev_put(ca); - return 0; - } - - mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, val); - mutex_unlock(&c->sb_lock); - - if (g >= 0) { - *res = group_to_target(g); - return 0; - } - - return -EINVAL; -} - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - return; - case TARGET_DEV: { - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && ca->disk_sb.bdev) - prt_printf(out, "/dev/%s", ca->name); - else if (ca) - prt_printf(out, "offline device %u", t.dev); - else - prt_printf(out, "invalid device %u", t.dev); - - out->atomic--; - return; - } - case TARGET_GROUP: - bch2_disk_path_to_text(out, c, t.group); - return; - default: - BUG(); - } -} - -static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) -{ - struct target t = target_decode(v); - - switch (t.type) { - case TARGET_NULL: - prt_printf(out, "none"); - break; - case TARGET_DEV: { - struct bch_member m = bch2_sb_member_get(sb, t.dev); - - if (bch2_member_exists(sb, t.dev)) { - prt_printf(out, "Device "); - pr_uuid(out, m.uuid.b); - prt_printf(out, " (%u)", t.dev); - } else { - prt_printf(out, "Bad device %u", t.dev); - } - break; - } - case TARGET_GROUP: - bch2_disk_path_to_text_sb(out, sb, t.group); - break; - default: - BUG(); - } -} - -void bch2_opt_target_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - if (c) - bch2_target_to_text(out, c, v); - else - bch2_target_to_text_sb(out, sb, v); -} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h deleted file mode 100644 index 441826fff22436..00000000000000 --- a/fs/bcachefs/disk_groups.h +++ /dev/null @@ -1,111 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_H -#define _BCACHEFS_DISK_GROUPS_H - -#include "disk_groups_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; - -static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -{ - return groups - ? 
(vstruct_end(&groups->field) - - (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) - : 0; -} - -struct target { - enum { - TARGET_NULL, - TARGET_DEV, - TARGET_GROUP, - } type; - union { - unsigned dev; - unsigned group; - }; -}; - -#define TARGET_DEV_START 1 -#define TARGET_GROUP_START (256 + TARGET_DEV_START) - -static inline u16 dev_to_target(unsigned dev) -{ - return TARGET_DEV_START + dev; -} - -static inline u16 group_to_target(unsigned group) -{ - return TARGET_GROUP_START + group; -} - -static inline struct target target_decode(unsigned target) -{ - if (target >= TARGET_GROUP_START) - return (struct target) { - .type = TARGET_GROUP, - .group = target - TARGET_GROUP_START - }; - - if (target >= TARGET_DEV_START) - return (struct target) { - .type = TARGET_DEV, - .group = target - TARGET_DEV_START - }; - - return (struct target) { .type = TARGET_NULL }; -} - -const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); - -static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask devs = c->rw_devs[data_type]; - const struct bch_devs_mask *t = bch2_target_to_mask(c, target); - - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - return devs; -} - -static inline bool bch2_target_accepts_data(struct bch_fs *c, - enum bch_data_type data_type, - u16 target) -{ - struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); - return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); -} - -bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); - -int bch2_disk_path_find(struct bch_sb_handle *, const char *); - -/* Exported for userspace bcachefs-tools: */ -int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); - -void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); -void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); - -void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); - -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - -#define bch2_opt_target (struct bch_opt_fn) { \ - .parse = bch2_opt_target_parse, \ - .to_text = bch2_opt_target_to_text, \ -} - -int bch2_sb_disk_groups_to_cpu(struct bch_fs *); - -int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); - -const char *bch2_sb_validate_disk_groups(struct bch_sb *, - struct bch_sb_field *); - -void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); - -#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h deleted file mode 100644 index 698990bbf1d206..00000000000000 --- a/fs/bcachefs/disk_groups_format.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H -#define _BCACHEFS_DISK_GROUPS_FORMAT_H - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - 
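The TARGET_* scheme in the header above packs three kinds of allocation target into one small integer: 0 means no target, devices occupy [1, 256], and label groups start at 257. A self-contained sketch of just the encode/decode arithmetic (constant names shortened; the on-disk field widths are not modelled here):

#include <assert.h>
#include <stdio.h>

/* One integer namespace: 0 = none, devices from 1, groups from 257. */
enum { DEV_START = 1, GROUP_START = 256 + DEV_START };

static unsigned dev_to_tgt(unsigned dev)     { return DEV_START + dev; }
static unsigned group_to_tgt(unsigned group) { return GROUP_START + group; }

int main(void)
{
	unsigned t = group_to_tgt(3);

	if (t >= GROUP_START)
		printf("group %u\n", t - GROUP_START);
	else if (t >= DEV_START)
		printf("device %u\n", t - DEV_START);
	else
		printf("none\n");

	assert(dev_to_tgt(0) == 1 && group_to_tgt(0) == 257);
	return 0;
}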
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h deleted file mode 100644 index a54ef085b13d46..00000000000000 --- a/fs/bcachefs/disk_groups_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H -#define _BCACHEFS_DISK_GROUPS_TYPES_H - -struct bch_disk_group_cpu { - bool deleted; - u16 parent; - u8 label[BCH_SB_LABEL_SIZE]; - struct bch_devs_mask devs; -}; - -struct bch_disk_groups_cpu { - struct rcu_head rcu; - unsigned nr; - struct bch_disk_group_cpu entries[] __counted_by(nr); -}; - -#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c deleted file mode 100644 index 543dbba9b14f39..00000000000000 --- a/fs/bcachefs/ec.c +++ /dev/null @@ -1,2405 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* erasure coding */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_write.h" -#include "keylist.h" -#include "lru.h" -#include "recovery.h" -#include "replicas.h" -#include "super-io.h" -#include "util.h" - -#include -#include - -#ifdef __KERNEL__ - -#include -#include - -static void raid5_recov(unsigned disks, unsigned failed_idx, - size_t size, void **data) -{ - unsigned i = 2, nr; - - BUG_ON(failed_idx >= disks); - - swap(data[0], data[failed_idx]); - memcpy(data[0], data[1], size); - - while (i < disks) { - nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); - xor_blocks(nr, size, data[0], data + i); - i += nr; - } - - swap(data[0], data[failed_idx]); -} - -static void raid_gen(int nd, int np, size_t size, void **v) -{ - if (np >= 1) - raid5_recov(nd + np, nd, size, v); - if (np >= 2) - raid6_call.gen_syndrome(nd + np, size, v); - BUG_ON(np > 2); -} - -static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -{ - switch (nr) { - case 0: - break; - case 1: - if (ir[0] < nd + 1) - raid5_recov(nd + 1, ir[0], size, v); - else - raid6_call.gen_syndrome(nd + np, size, v); - break; - case 2: - if (ir[1] < nd) { - /* data+data failure. 
*/ - raid6_2data_recov(nd + np, size, ir[0], ir[1], v); - } else if (ir[0] < nd) { - /* data + p/q failure */ - - if (ir[1] == nd) /* data + p failure */ - raid6_datap_recov(nd + np, size, ir[0], v); - else { /* data + q failure */ - raid5_recov(nd + 1, ir[0], size, v); - raid6_call.gen_syndrome(nd + np, size, v); - } - } else { - raid_gen(nd, np, size, v); - } - break; - default: - BUG(); - } -} - -#else - -#include - -#endif - -struct ec_bio { - struct bch_dev *ca; - struct ec_stripe_buf *buf; - size_t idx; - int rw; - u64 submit_time; - struct bio bio; -}; - -/* Stripes btree keys: */ - -int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - int ret = 0; - - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), - c, stripe_pos_bad, - "stripe at bad pos"); - - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), - c, stripe_val_size_bad, - "incorrect value size (%zu < %u)", - bkey_val_u64s(k.k), stripe_val_u64s(s)); - - bkey_fsck_err_on(s->csum_granularity_bits >= 64, - c, stripe_csum_granularity_bad, - "invalid csum granularity (%u >= 64)", - s->csum_granularity_bits); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; - struct bch_stripe s = {}; - - memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); - - unsigned nr_data = s.nr_blocks - s.nr_redundant; - - prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", - s.algorithm, - le16_to_cpu(s.sectors), - nr_data, - s.nr_redundant); - bch2_prt_csum_type(out, s.csum_type); - prt_str(out, " gran "); - if (s.csum_granularity_bits < 64) - prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); - else - prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); - - if (s.disk_label) { - prt_str(out, " label"); - bch2_disk_path_to_text(out, c, s.disk_label - 1); - } - - for (unsigned i = 0; i < s.nr_blocks; i++) { - const struct bch_extent_ptr *ptr = sp->ptrs + i; - - if ((void *) ptr >= bkey_val_end(k)) - break; - - prt_char(out, ' '); - bch2_extent_ptr_to_text(out, c, ptr); - - if (s.csum_type < BCH_CSUM_NR && - i < nr_data && - stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) - prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); - } -} - -/* Triggers: */ - -static int __mark_stripe_bucket(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - struct bpos bucket, - struct bch_alloc_v4 *a, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? 
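raid5_recov() above rebuilds one missing block as the XOR of the parity block and all surviving data blocks (the kernel batches the XORs through xor_blocks()). A toy single-parity version showing why that works:

#include <stdio.h>
#include <string.h>

#define BLOCKS 3
#define SZ 8

/* Toy single-parity (RAID5-style) recovery: parity = XOR of all data
 * blocks, so any one missing block is the XOR of everything else. */
static void xor_into(unsigned char *dst, const unsigned char *src, size_t n)
{
	for (size_t i = 0; i < n; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char d[BLOCKS][SZ] = { "abcdefg", "hijklmn", "opqrstu" };
	unsigned char p[SZ] = { 0 }, rec[SZ] = { 0 };

	for (int i = 0; i < BLOCKS; i++)
		xor_into(p, d[i], SZ);		/* generate parity */

	xor_into(rec, p, SZ);			/* "lose" block 1, rebuild it */
	xor_into(rec, d[0], SZ);
	xor_into(rec, d[2], SZ);

	printf("recovered: %.7s\n", (const char *) rec);	/* prints hijklmn */
	return 0;
}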
le16_to_cpu(s.v->sectors) : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_fs *c = trans->c; - if (deleting) - sectors = -sectors; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->stripe || - a->stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->stripe, s.k->p.offset, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } else { - if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || - a->stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", - bucket.inode, bucket.offset, a->gen, - a->stripe, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, - "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", - bucket.inode, bucket.offset, a->gen, - bch2_data_type_str(a->data_type), - bch2_data_type_str(data_type), - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - if (bch2_trans_inconsistent_on(parity && - (a->dirty_sectors != -sectors || - a->cached_sectors), trans, - "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", - bucket.inode, bucket.offset, a->gen, - a->dirty_sectors, - a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - } - - if (sectors) { - ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, - a->gen, a->data_type, &a->dirty_sectors); - if (ret) - goto err; - } - - if (!deleting) { - a->stripe = s.k->p.offset; - a->stripe_redundancy = s.v->nr_redundant; - alloc_data_type_set(a, data_type); - } else { - a->stripe = 0; - a->stripe_redundancy = 0; - alloc_data_type_set(a, BCH_DATA_user); - } -err: - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned ptr_idx, bool deleting, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); - if (unlikely(!ca)) { - if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (flags & BTREE_TRIGGER_transactional) { - struct extent_ptr_decoded p = { - .ptr = *ptr, - .crc = bch2_extent_crc_unpack(s.k, NULL), - }; - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, - (const union bch_extent_entry *) ptr, &bp); - - struct bkey_i_alloc_v4 *a = - bch2_trans_start_alloc_update(trans, 
bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: - bch2_bucket_backpointer_mod(trans, s.s_c, &bp, - !(flags & BTREE_TRIGGER_overwrite)); - if (ret) - goto err; - } - - if (flags & BTREE_TRIGGER_gc) { - struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", - ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = bch_err_throw(c, mark_stripe); - goto err; - } - - bucket_lock(g); - struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); - alloc_to_bucket(g, new); - bucket_unlock(g); - - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; -} - -static int mark_stripe_buckets(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; - - BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); - - unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false, flags); - if (ret) - return ret; - } - - if (old_s) { - int ret = mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true, flags); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s _new, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_s_c new = _new.s_c; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? 
bkey_s_c_to_stripe(new).v : NULL; - - if (unlikely(flags & BTREE_TRIGGER_check_repair)) - return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); - - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); - - if (flags & BTREE_TRIGGER_transactional) { - int ret = bch2_lru_change(trans, - BCH_LRU_STRIPE_FRAGMENTATION, - idx, - stripe_lru_pos(old_s), - stripe_lru_pos(new_s)); - if (ret) - return ret; - } - - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) - return 0; - - struct gc_stripe *gc = NULL; - if (flags & BTREE_TRIGGER_gc) { - gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - if (!gc) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return bch_err_throw(c, ENOMEM_mark_stripe); - } - - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - * - * Also: when we bring back runtime gc, locking - */ - gc->alive = true; - gc->sectors = le16_to_cpu(new_s->sectors); - gc->nr_blocks = new_s->nr_blocks; - gc->nr_redundant = new_s->nr_redundant; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - gc->ptrs[i] = new_s->ptrs[i]; - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); - } - - if (new_s) { - s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, new); - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); - if (ret) - return ret; - - if (gc) - unsafe_memcpy(&gc->r.e, &acc.replicas, - replicas_entry_bytes(&acc.replicas), "VLA"); - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - - struct disk_accounting_pos acc; - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, old); - int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); - if (ret) - return ret; - } - - int ret = mark_stripe_buckets(trans, old, new, flags); - if (ret) - return ret; - } - - return 0; -} - -/* returns blocknr in stripe that we matched: */ -static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, - struct bkey_s_c k, unsigned *block) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - - bkey_for_each_ptr(ptrs, ptr) - for (i = 0; i < nr_data; i++) - if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, - le16_to_cpu(s->sectors))) { - *block = i; - return ptr; - } - - return NULL; -} - -static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; - - return false; -} - -/* Stripe bufs: */ - -static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) -{ - if (buf->key.k.type == KEY_TYPE_stripe) { - struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); - unsigned i; - - for 
(i = 0; i < s->v.nr_blocks; i++) { - kvfree(buf->data[i]); - buf->data[i] = NULL; - } - } -} - -/* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct bch_fs *c, - struct ec_stripe_buf *buf, - unsigned offset, unsigned size) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1U << v->csum_granularity_bits; - unsigned end = offset + size; - unsigned i; - - BUG_ON(end > le16_to_cpu(v->sectors)); - - offset = round_down(offset, csum_granularity); - end = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)); - - buf->offset = offset; - buf->size = end - offset; - - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); - if (!buf->data[i]) - goto err; - } - - return 0; -err: - ec_stripe_buf_exit(buf); - return bch_err_throw(c, ENOMEM_stripe_buf); -} - -/* Checksumming: */ - -static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, - unsigned block, unsigned offset) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned end = buf->offset + buf->size; - unsigned len = min(csum_granularity, end - offset); - - BUG_ON(offset >= end); - BUG_ON(offset < buf->offset); - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - (len & (csum_granularity - 1))); - - return bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[block] + ((offset - buf->offset) << 9), - len << 9); -} - -static void ec_generate_checksums(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned i, j, csums_per_device = stripe_csums_per_device(v); - - if (!v->csum_type) - return; - - BUG_ON(buf->offset); - BUG_ON(buf->size != le16_to_cpu(v->sectors)); - - for (i = 0; i < v->nr_blocks; i++) - for (j = 0; j < csums_per_device; j++) - stripe_csum_set(v, i, j, - ec_block_checksum(buf, i, j << v->csum_granularity_bits)); -} - -static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned i; - - if (!v->csum_type) - return; - - for (i = 0; i < v->nr_blocks; i++) { - unsigned offset = buf->offset; - unsigned end = buf->offset + buf->size; - - if (!test_bit(i, buf->valid)) - continue; - - while (offset < end) { - unsigned j = offset >> v->csum_granularity_bits; - unsigned len = min(csum_granularity, end - offset); - struct bch_csum want = stripe_csum_get(v, i, j); - struct bch_csum got = ec_block_checksum(buf, i, offset); - - if (bch2_crc_cmp(want, got)) { - struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); - if (ca) { - struct printbuf err = PRINTBUF; - - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - - clear_bit(i, buf->valid); - break; - } - - offset += len; - } - } -} - -/* Erasure coding: */ - -static void ec_generate_ec(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - unsigned bytes = le16_to_cpu(v->sectors) << 9; - - raid_gen(nr_data, 
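ec_block_checksum() above checksums each stripe block in chunks of 1 << csum_granularity_bits sectors, allowing a short tail chunk at the end of the block. A small sketch of just the offset/index arithmetic:

#include <stdio.h>

/* One checksum per (1 << gran_bits)-sector chunk, short tail allowed,
 * as in ec_block_checksum() above. */
int main(void)
{
	unsigned sectors = 100, gran_bits = 5;		/* 32-sector chunks */
	unsigned gran = 1U << gran_bits;
	unsigned csums = (sectors + gran - 1) >> gran_bits;	/* round up */

	for (unsigned off = 0; off < sectors; off += gran) {
		unsigned idx = off >> gran_bits;
		unsigned len = sectors - off < gran ? sectors - off : gran;

		printf("csum %u covers sectors [%u, %u)\n", idx, off, off + len);
	}
	printf("checksums per block: %u\n", csums);
	return 0;
}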
v->nr_redundant, bytes, buf->data); -} - -static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - - return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); -} - -static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - unsigned bytes = buf->size << 9; - - if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - return -1; - } - - for (i = 0; i < nr_data; i++) - if (!test_bit(i, buf->valid)) - failed[nr_failed++] = i; - - raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); - return 0; -} - -/* IO: */ - -static void ec_block_endio(struct bio *bio) -{ - struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); - struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; - struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; - struct bch_dev *ca = ec_bio->ca; - struct closure *cl = bio->bi_private; - int rw = ec_bio->rw; - unsigned ref = rw == READ - ? BCH_DEV_READ_REF_ec_block - : BCH_DEV_WRITE_REF_ec_block; - - bch2_account_io_completion(ca, bio_data_dir(bio), - ec_bio->submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", - str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status)); - clear_bit(ec_bio->idx, ec_bio->buf->valid); - } - - int stale = dev_ptr_stale(ca, ptr); - if (stale) { - bch_err_ratelimited(ca->fs, - "error %s stripe: stale/invalid pointer (%i) after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to", - stale); - clear_bit(ec_bio->idx, ec_bio->buf->valid); - } - - bio_put(&ec_bio->bio); - enumerated_ref_put(&ca->io_ref[rw], ref); - closure_put(cl); -} - -static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - blk_opf_t opf, unsigned idx, struct closure *cl) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; - unsigned offset = 0, bytes = buf->size << 9; - struct bch_extent_ptr *ptr = &v->ptrs[idx]; - enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant - ? BCH_DATA_user - : BCH_DATA_parity; - int rw = op_is_write(opf); - unsigned ref = rw == READ - ? BCH_DEV_READ_REF_ec_block - : BCH_DEV_WRITE_REF_ec_block; - - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); - if (!ca) { - clear_bit(idx, buf->valid); - return; - } - - int stale = dev_ptr_stale(ca, ptr); - if (stale) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer (%i)", - rw == READ ? 
"reading from" : "writing to", - stale); - clear_bit(idx, buf->valid); - return; - } - - - this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); - - while (offset < bytes) { - unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, - DIV_ROUND_UP(bytes, PAGE_SIZE)); - unsigned b = min_t(size_t, bytes - offset, - nr_iovecs << PAGE_SHIFT); - struct ec_bio *ec_bio; - - ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, - nr_iovecs, - opf, - GFP_KERNEL, - &c->ec_bioset), - struct ec_bio, bio); - - ec_bio->ca = ca; - ec_bio->buf = buf; - ec_bio->idx = idx; - ec_bio->rw = rw; - ec_bio->submit_time = local_clock(); - - ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); - ec_bio->bio.bi_end_io = ec_block_endio; - ec_bio->bio.bi_private = cl; - - bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); - - closure_get(cl); - enumerated_ref_get(&ca->io_ref[rw], ref); - - submit_bio(&ec_bio->bio); - - offset += b; - } - - enumerated_ref_put(&ca->io_ref[rw], ref); -} - -static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, - struct ec_stripe_buf *stripe) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_slots); - ret = bkey_err(k); - if (ret) - goto err; - if (k.k->type != KEY_TYPE_stripe) { - ret = -ENOENT; - goto err; - } - bkey_reassemble(&stripe->key, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf = NULL; - struct closure cl; - struct bch_stripe *v; - unsigned i, offset; - const char *msg = NULL; - struct printbuf msgbuf = PRINTBUF; - int ret = 0; - - closure_init_stack(&cl); - - BUG_ON(!rbio->pick.has_ec); - - buf = kzalloc(sizeof(*buf), GFP_NOFS); - if (!buf) - return bch_err_throw(c, ENOMEM_ec_read_extent); - - ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); - if (ret) { - msg = "stripe not found"; - goto err; - } - - v = &bkey_i_to_stripe(&buf->key)->v; - - if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - msg = "pointer doesn't match stripe"; - goto err; - } - - offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; - if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - msg = "read is bigger than stripe"; - goto err; - } - - ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); - if (ret) { - msg = "-ENOMEM"; - goto err; - } - - for (i = 0; i < v->nr_blocks; i++) - ec_block_io(c, buf, REQ_OP_READ, i, &cl); - - closure_sync(&cl); - - if (ec_nr_failed(buf) > v->nr_redundant) { - msg = "unable to read enough blocks"; - goto err; - } - - ec_validate_checksums(c, buf); - - ret = ec_do_recov(c, buf); - if (ret) - goto err; - - memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, - buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -out: - ec_stripe_buf_exit(buf); - kfree(buf); - return ret; -err: - bch2_bkey_val_to_text(&msgbuf, c, orig_k); - bch_err_ratelimited(c, - "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf); - ret = bch_err_throw(c, stripe_reconstruct); - goto out; -} - -/* stripe bucket accounting: */ - -static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) -{ - if (c->gc_pos.phase != GC_PHASE_not_running && - !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return bch_err_throw(c, 
ENOMEM_ec_stripe_mem_alloc); - - return 0; -} - -static int ec_stripe_mem_alloc(struct btree_trans *trans, - struct btree_iter *iter) -{ - return allocate_dropping_locks_errcode(trans, - __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); -} - -/* - * Hash table of open stripes: - * Stripes that are being created or modified are kept in a hash table, so that - * stripe deletion can skip them. - */ - -static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) -{ - unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); - struct ec_stripe_new *s; - - hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) - if (s->idx == idx) - return true; - return false; -} - -static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) -{ - bool ret = false; - - spin_lock(&c->ec_stripes_new_lock); - ret = __bch2_stripe_is_open(c, idx); - spin_unlock(&c->ec_stripes_new_lock); - - return ret; -} - -static bool bch2_try_open_stripe(struct bch_fs *c, - struct ec_stripe_new *s, - u64 idx) -{ - bool ret; - - spin_lock(&c->ec_stripes_new_lock); - ret = !__bch2_stripe_is_open(c, idx); - if (ret) { - unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); - - s->idx = idx; - hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); - } - spin_unlock(&c->ec_stripes_new_lock); - - return ret; -} - -static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) -{ - BUG_ON(!s->idx); - - spin_lock(&c->ec_stripes_new_lock); - hlist_del_init(&s->hash); - spin_unlock(&c->ec_stripes_new_lock); - - s->idx = 0; -} - -/* stripe deletion */ - -static int ec_stripe_delete(struct btree_trans *trans, u64 idx) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - goto err; - - /* - * We expect write buffer races here - * Important: check stripe_is_open with stripe key locked: - */ - if (k.k->type == KEY_TYPE_stripe && - !bch2_stripe_is_open(trans->c, idx) && - stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * XXX - * can we kill this and delete stripes from the trigger? - */ -static void ec_stripe_delete_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, ec_stripe_delete_work); - - bch2_trans_run(c, - bch2_btree_write_buffer_tryflush(trans) ?: - for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), - 0, lru_k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ - ec_stripe_delete(trans, lru_k.k->p.offset); - }))); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); -} - -void bch2_do_stripe_deletes(struct bch_fs *c) -{ - if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && - !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); -} - -/* stripe creation: */ - -static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *old, - struct bkey_i_stripe *new) -{ - struct bch_fs *c = trans->c; - bool create = !old; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - goto err; - - if (bch2_fs_inconsistent_on(k.k->type != (create ? 
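The open-stripes table above is a fixed array of hash chains keyed by stripe index, consulted so that deletion skips stripes that are mid-creation. A userspace approximation (the kernel uses hash_64(), hlists and ec_stripes_new_lock; this sketch substitutes a Fibonacci hash and plain pointers, and skips locking and error handling):

#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 64

struct open_stripe { unsigned long long idx; struct open_stripe *next; };
static struct open_stripe *tbl[NBUCKETS];

static unsigned hash_idx(unsigned long long idx)
{
	return (idx * 0x9e3779b97f4a7c15ULL) >> 58;	/* top 6 bits: 0..63 */
}

static int is_open(unsigned long long idx)
{
	for (struct open_stripe *s = tbl[hash_idx(idx)]; s; s = s->next)
		if (s->idx == idx)
			return 1;
	return 0;
}

static void open_stripe(unsigned long long idx)
{
	struct open_stripe *s = malloc(sizeof(*s));

	s->idx = idx;
	s->next = tbl[hash_idx(idx)];
	tbl[hash_idx(idx)] = s;
}

int main(void)
{
	open_stripe(42);
	printf("%d %d\n", is_open(42), is_open(7));	/* 1 0 */
	return 0;
}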
KEY_TYPE_deleted : KEY_TYPE_stripe), - c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type])) { - ret = -EINVAL; - goto err; - } - - if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - - BUG_ON(old->v.nr_blocks != new->v.nr_blocks); - BUG_ON(old->v.nr_blocks != v->nr_blocks); - - for (unsigned i = 0; i < new->v.nr_blocks; i++) { - unsigned sectors = stripe_blockcount_get(v, i); - - if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "stripe changed nonempty block %u", i); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - /* - * If the stripe ptr changed underneath us, it must have - * been dev_remove_stripes() -> * invalidate_stripe_to_dev() - */ - if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { - BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); - - if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) - new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; - } - - stripe_blockcount_set(&new->v, i, sectors); - } - } - - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int ec_stripe_update_extent(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, u8 gen, - struct ec_stripe_buf *s, - struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) -{ - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ec_ptr = NULL; - struct bch_extent_stripe_ptr stripe_ptr; - struct bkey_i *n; - int ret, dev, block; - - if (bp.v->level) { - struct printbuf buf = PRINTBUF; - struct btree_iter node_iter; - struct btree *b; - - b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); - bch2_trans_iter_exit(trans, &node_iter); - - if (!b) - return 0; - - prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_bkey_val_to_text(&buf, c, bp.s_c); - - bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, erasure_coding_found_btree_node); - } - - k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) { - /* - * extent no longer exists - we could flush the btree - * write buffer and retry to verify, but no need: - */ - return 0; - } - - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) - goto out; - - ptr_c = bkey_matches_stripe(v, k, &block); - /* - * It doesn't generally make sense to erasure code cached ptrs: - * XXX: should we be incrementing a counter? 
- */ - if (!ptr_c || ptr_c->cached) - goto out; - - dev = v->ptrs[block].dev; - - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto out; - - bkey_reassemble(n, k); - - bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); - ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); - BUG_ON(!ec_ptr); - - stripe_ptr = (struct bch_extent_stripe_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, - .block = block, - .redundancy = v->nr_redundant, - .idx = s->key.k.p.offset, - }; - - __extent_entry_insert(n, - (union bch_extent_entry *) ec_ptr, - (union bch_extent_entry *) &stripe_ptr); - - ret = bch2_trans_update(trans, &iter, n, 0); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, - unsigned block) -{ - struct bch_fs *c = trans->c; - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr ptr = v->ptrs[block]; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); - if (!ca) - return bch_err_throw(c, ENOENT_dev_not_found); - - struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket_pos), - bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, - NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, ({ - if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) - break; - - if (bp_k.k->type != KEY_TYPE_backpointer) - continue; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - if (bp.v->btree_id == BTREE_ID_stripes) - continue; - - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bp, &last_flushed); - })); - - bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; -} - -static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned nr_data = v->nr_blocks - v->nr_redundant; - - int ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - for (unsigned i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(trans, s, i); - if (ret) - break; - } -err: - bch2_trans_put(trans); - return ret; -} - -static void zero_out_rest_of_ec_bucket(struct bch_fs *c, - struct ec_stripe_new *s, - unsigned block, - struct open_bucket *ob) -{ - struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, - BCH_DEV_WRITE_REF_ec_bucket_zero); - if (!ca) { - s->err = bch_err_throw(c, erofs_no_writes); - return; - } - - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - memset(s->new_stripe.data[block] + (offset << 9), - 0, - ob->sectors_free << 9); - - int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, - ob->bucket * ca->mi.bucket_size + offset, - ob->sectors_free, - GFP_KERNEL, 0); - - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); - - if (ret) - s->err = ret; -} - -void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) -{ - if (s->idx) - bch2_stripe_close(c, s); - kfree(s); -} - -/* - * data buckets of new stripe all written: create the stripe - */ -static void ec_stripe_create(struct ec_stripe_new *s) -{ - struct bch_fs *c = s->c; - struct open_bucket *ob; - struct bch_stripe *v = 
&bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret; - - BUG_ON(s->h->s == s); - - closure_sync(&s->iodone); - - if (!s->err) { - for (i = 0; i < nr_data; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; - - if (ob->sectors_free) - zero_out_rest_of_ec_bucket(c, s, i, ob); - } - } - - if (s->err) { - if (!bch2_err_matches(s->err, EROFS)) - bch_err(c, "error creating stripe: error writing data buckets"); - ret = s->err; - goto err; - } - - if (s->have_existing_stripe) { - ec_validate_checksums(c, &s->existing_stripe); - - if (ec_do_recov(c, &s->existing_stripe)) { - bch_err(c, "error creating stripe: error reading existing stripe"); - ret = bch_err_throw(c, ec_block_read); - goto err; - } - - for (i = 0; i < nr_data; i++) - if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) - swap(s->new_stripe.data[i], - s->existing_stripe.data[i]); - - ec_stripe_buf_exit(&s->existing_stripe); - } - - BUG_ON(!s->allocated); - BUG_ON(!s->idx); - - ec_generate_ec(&s->new_stripe); - - ec_generate_checksums(&s->new_stripe); - - /* write p/q: */ - for (i = nr_data; i < v->nr_blocks; i++) - ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); - closure_sync(&s->iodone); - - if (ec_nr_failed(&s->new_stripe)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = bch_err_throw(c, ec_block_write); - goto err; - } - - ret = bch2_trans_commit_do(c, &s->res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_key_update(trans, - s->have_existing_stripe - ? bkey_i_to_stripe(&s->existing_stripe.key) - : NULL, - bkey_i_to_stripe(&s->new_stripe.key))); - bch_err_msg(c, ret, "creating stripe key"); - if (ret) { - goto err; - } - - ret = ec_stripe_update_extents(c, &s->new_stripe); - bch_err_msg(c, ret, "error updating extents"); - if (ret) - goto err; -err: - trace_stripe_create(c, s->idx, ret); - - bch2_disk_reservation_put(c, &s->res); - - for (i = 0; i < v->nr_blocks; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; - - if (i < nr_data) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } else { - bch2_open_bucket_put(c, ob); - } - } - - mutex_lock(&c->ec_stripe_new_lock); - list_del(&s->list); - mutex_unlock(&c->ec_stripe_new_lock); - wake_up(&c->ec_stripe_new_wait); - - ec_stripe_buf_exit(&s->existing_stripe); - ec_stripe_buf_exit(&s->new_stripe); - closure_debug_destroy(&s->iodone); - - ec_stripe_new_put(c, s, STRIPE_REF_stripe); -} - -static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) -{ - struct ec_stripe_new *s; - - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->ref[STRIPE_REF_io])) - goto out; - s = NULL; -out: - mutex_unlock(&c->ec_stripe_new_lock); - - return s; -} - -static void ec_stripe_create_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, - struct bch_fs, ec_stripe_create_work); - struct ec_stripe_new *s; - - while ((s = get_pending_stripe(c))) - ec_stripe_create(s); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); -} - -void bch2_ec_do_stripe_creates(struct bch_fs *c) -{ - enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); - - if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); -} - -static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct ec_stripe_new *s = h->s; - - 
lockdep_assert_held(&h->lock); - - BUG_ON(!s->allocated && !s->err); - - h->s = NULL; - s->pending = true; - - mutex_lock(&c->ec_stripe_new_lock); - list_add(&s->list, &c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - ec_stripe_new_put(c, s, STRIPE_REF_io); -} - -static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) -{ - h->s->err = err; - ec_stripe_new_set_pending(c, h); -} - -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) -{ - struct ec_stripe_new *s = ob->ec; - - s->err = err; -} - -void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -{ - struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - if (!ob) - return NULL; - - BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - - struct bch_dev *ca = ob_dev(c, ob); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - - return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); -} - -static int unsigned_cmp(const void *_l, const void *_r) -{ - unsigned l = *((const unsigned *) _l); - unsigned r = *((const unsigned *) _r); - - return cmp_int(l, r); -} - -/* pick most common bucket size: */ -static unsigned pick_blocksize(struct bch_fs *c, - struct bch_devs_mask *devs) -{ - unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; - struct { - unsigned nr, size; - } cur = { 0, 0 }, best = { 0, 0 }; - - for_each_member_device_rcu(c, ca, devs) - sizes[nr++] = ca->mi.bucket_size; - - sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); - - for (unsigned i = 0; i < nr; i++) { - if (sizes[i] != cur.size) { - if (cur.nr > best.nr) - best = cur; - - cur.nr = 0; - cur.size = sizes[i]; - } - - cur.nr++; - } - - if (cur.nr > best.nr) - best = cur; - - return best.size; -} - -static bool may_create_new_stripe(struct bch_fs *c) -{ - return false; -} - -static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i *k, - unsigned nr_data, - unsigned nr_parity, - unsigned stripe_size, - unsigned disk_label) -{ - struct bkey_i_stripe *s = bkey_stripe_init(k); - unsigned u64s; - - s->v.sectors = cpu_to_le16(stripe_size); - s->v.algorithm = 0; - s->v.nr_blocks = nr_data + nr_parity; - s->v.nr_redundant = nr_parity; - s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); - s->v.csum_type = BCH_CSUM_crc32c; - s->v.disk_label = disk_label; - - while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { - BUG_ON(1 << s->v.csum_granularity_bits >= - le16_to_cpu(s->v.sectors) || - s->v.csum_granularity_bits == U8_MAX); - s->v.csum_granularity_bits++; - } - - set_bkey_val_u64s(&s->k, u64s); -} - -static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct ec_stripe_new *s; - - lockdep_assert_held(&h->lock); - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - mutex_init(&s->lock); - closure_init(&s->iodone, NULL); - atomic_set(&s->ref[STRIPE_REF_stripe], 1); - atomic_set(&s->ref[STRIPE_REF_io], 1); - s->c = c; - s->h = h; - s->nr_data = min_t(unsigned, h->nr_active_devs, - BCH_BKEY_PTRS_MAX) - h->redundancy; - s->nr_parity = h->redundancy; - - ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, - h->blocksize, h->disk_label); - return s; -} - -static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct bch_devs_mask devs = h->devs; - unsigned nr_devs, nr_devs_with_durability; - - scoped_guard(rcu) { - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? 
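pick_blocksize() above selects the most common bucket size by sorting the sizes and scanning for the longest run of equal values. The same logic, runnable standalone:

#include <stdio.h>
#include <stdlib.h>

static int cmp_uint(const void *a, const void *b)
{
	unsigned l = *(const unsigned *) a, r = *(const unsigned *) b;

	return (l > r) - (l < r);
}

int main(void)
{
	unsigned sizes[] = { 512, 1024, 1024, 2048, 1024, 512 };
	unsigned n = sizeof(sizes) / sizeof(sizes[0]);
	unsigned best = 0, best_nr = 0, cur_nr = 0;

	qsort(sizes, n, sizeof(unsigned), cmp_uint);

	/* After sorting, equal sizes are adjacent: track the longest run. */
	for (unsigned i = 0; i < n; i++) {
		cur_nr = i && sizes[i] == sizes[i - 1] ? cur_nr + 1 : 1;
		if (cur_nr > best_nr) {
			best_nr = cur_nr;
			best = sizes[i];
		}
	}
	printf("most common bucket size: %u (x%u)\n", best, best_nr);	/* 1024 x3 */
	return 0;
}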
group_to_target(h->disk_label - 1) - : 0); - nr_devs = dev_mask_nr(&h->devs); - - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - nr_devs_with_durability = dev_mask_nr(&h->devs); - - h->blocksize = pick_blocksize(c, &h->devs); - - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - } - - /* - * If we only have redundancy + 1 devices, we're better off with just - * replication: - */ - h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; - - if (h->insufficient_devs) { - const char *err; - - if (nr_devs < h->redundancy + 2) - err = NULL; - else if (nr_devs_with_durability < h->redundancy + 2) - err = "cannot use durability=0 devices"; - else - err = "mismatched bucket sizes"; - - if (err) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", - h->nr_active_devs, h->redundancy + 2, err); - } - - struct bch_devs_mask devs_leaving; - bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); - - if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) - ec_stripe_new_cancel(c, h, -EINTR); - - h->rw_devs_change_count = c->rw_devs_change_count; -} - -static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, - unsigned algo, unsigned redundancy, - enum bch_watermark watermark) -{ - struct ec_stripe_head *h; - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return NULL; - - mutex_init(&h->lock); - BUG_ON(!mutex_trylock(&h->lock)); - - h->disk_label = disk_label; - h->algo = algo; - h->redundancy = redundancy; - h->watermark = watermark; - - list_add(&h->list, &c->ec_stripe_head_list); - return h; -} - -void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) -{ - if (h->s && - h->s->allocated && - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data) == h->s->nr_data) - ec_stripe_new_set_pending(c, h); - - mutex_unlock(&h->lock); -} - -static struct ec_stripe_head * -__bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned disk_label, - unsigned algo, - unsigned redundancy, - enum bch_watermark watermark) -{ - struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - int ret; - - if (!redundancy) - return NULL; - - ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); - if (ret) - return ERR_PTR(ret); - - if (test_bit(BCH_FS_going_ro, &c->flags)) { - h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto err; - } - - list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->disk_label == disk_label && - h->algo == algo && - h->redundancy == redundancy && - h->watermark == watermark) { - ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) { - h = ERR_PTR(ret); - goto err; - } - goto found; - } - - h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); - if (!h) { - h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); - goto err; - } -found: - if (h->rw_devs_change_count != c->rw_devs_change_count) - ec_stripe_head_devs_update(c, h); - - if (h->insufficient_devs) { - mutex_unlock(&h->lock); - h = NULL; - } -err: - mutex_unlock(&c->ec_stripe_head_lock); - return h; -} - -static int new_stripe_alloc_buckets(struct btree_trans *trans, - struct alloc_request *req, - struct ec_stripe_head *h, struct ec_stripe_new *s, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - struct open_bucket *ob; - struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - 
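The `redundancy + 2` cutoff above follows from stripe geometry: with nr_data data blocks and `redundancy` parity blocks, space efficiency is nr_data / (nr_data + redundancy), and at nr_data == 1 that is exactly (redundancy + 1)-way replication, so stripes only pay off once at least two data blocks fit. A quick numeric illustration:

#include <stdio.h>

int main(void)
{
	unsigned redundancy = 2;	/* e.g. RAID6-style stripes */

	for (unsigned nr_devs = redundancy + 1; nr_devs <= 6; nr_devs++) {
		unsigned nr_data = nr_devs - redundancy;

		printf("devs=%u data=%u efficiency=%.2f%s\n",
		       nr_devs, nr_data,
		       (double) nr_data / nr_devs,
		       nr_data < 2 ? "  (no better than replication)" : "");
	}
	return 0;
}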
int ret = 0; - - req->scratch_data_type = req->data_type; - req->scratch_ptrs = req->ptrs; - req->scratch_nr_replicas = req->nr_replicas; - req->scratch_nr_effective = req->nr_effective; - req->scratch_have_cache = req->have_cache; - req->scratch_devs_may_alloc = req->devs_may_alloc; - - req->devs_may_alloc = h->devs; - req->have_cache = true; - - BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); - BUG_ON(v->nr_redundant != s->nr_parity); - - /* * We bypass the sector allocator which normally does this: */ - bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, - c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - - for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { - /* - * Note: we don't yet repair invalid blocks (failed/removed - * devices) when reusing stripes - we still need a codepath to - * walk backpointers and update all extents that point to that - * block when updating the stripe - */ - if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); - - if (i < s->nr_data) - nr_have_data++; - else - nr_have_parity++; - } - - BUG_ON(nr_have_data > s->nr_data); - BUG_ON(nr_have_parity > s->nr_parity); - - req->ptrs.nr = 0; - if (nr_have_parity < s->nr_parity) { - req->nr_replicas = s->nr_parity; - req->nr_effective = nr_have_parity; - req->data_type = BCH_DATA_parity; - - ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); - - open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data + s->nr_parity, - s->nr_data); - BUG_ON(j >= s->nr_data + s->nr_parity); - - s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) - goto err; - } - - req->ptrs.nr = 0; - if (nr_have_data < s->nr_data) { - req->nr_replicas = s->nr_data; - req->nr_effective = nr_have_data; - req->data_type = BCH_DATA_user; - - ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); - - open_bucket_for_each(c, &req->ptrs, ob, i) { - j = find_next_zero_bit(s->blocks_gotten, - s->nr_data, 0); - BUG_ON(j >= s->nr_data); - - s->blocks[j] = req->ptrs.v[i]; - v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, s->blocks_gotten); - } - - if (ret) - goto err; - } -err: - req->data_type = req->scratch_data_type; - req->ptrs = req->scratch_ptrs; - req->nr_replicas = req->scratch_nr_replicas; - req->nr_effective = req->scratch_nr_effective; - req->have_cache = req->scratch_have_cache; - req->devs_may_alloc = req->scratch_devs_may_alloc; - return ret; -} - -static int __get_existing_stripe(struct btree_trans *trans, - struct ec_stripe_head *head, - struct ec_stripe_buf *stripe, - u64 idx) -{ - struct bch_fs *c = trans->c; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), 0); - int ret = bkey_err(k); - if (ret) - goto err; - - /* We expect write buffer races here */ - if (k.k->type != KEY_TYPE_stripe) - goto out; - - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - if (stripe_lru_pos(s.v) <= 1) - goto out; - - if (s.v->disk_label == head->disk_label && - s.v->algorithm == head->algo && - s.v->nr_redundant == head->redundancy && - le16_to_cpu(s.v->sectors) == head->blocksize && - bch2_try_open_stripe(c, head->s, idx)) { - bkey_reassemble(&stripe->key, k); - ret = 1; - } -out: - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) -{ - struct bch_stripe 
*new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; - unsigned i; - - BUG_ON(existing_v->nr_redundant != s->nr_parity); - s->nr_data = existing_v->nr_blocks - - existing_v->nr_redundant; - - int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); - if (ret) { - bch2_stripe_close(c, s); - return ret; - } - - BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); - - /* - * Free buckets we initially allocated - they might conflict with - * blocks from the stripe we're reusing: - */ - for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); - s->blocks[i] = 0; - } - memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); - memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - - for (unsigned i = 0; i < existing_v->nr_blocks; i++) { - if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, s->blocks_gotten); - __set_bit(i, s->blocks_allocated); - } - - ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); - } - - bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); - s->have_existing_stripe = true; - - return 0; -} - -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, - struct ec_stripe_new *s) -{ - struct bch_fs *c = trans->c; - - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - if (may_create_new_stripe(c)) - return -1; - - struct btree_iter lru_iter; - struct bkey_s_c lru_k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), - lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), - 0, lru_k, ret) { - ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &lru_iter); - if (!ret) - ret = bch_err_throw(c, stripe_alloc_blocked); - if (ret == 1) - ret = 0; - if (ret) - return ret; - - return init_new_stripe_from_existing(c, s); -} - -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, - struct ec_stripe_new *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bpos min_pos = POS(0, 1); - struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); - int ret; - - if (!s->res.sectors) { - ret = bch2_disk_reservation_get(c, &s->res, - h->blocksize, - s->nr_parity, - BCH_DISK_RESERVATION_NOFAIL); - if (ret) - return ret; - } - - /* - * Allocate stripe slot - * XXX: we're going to need a bitrange btree of free stripes - */ - for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { - if (bkey_gt(k.k->p, POS(0, U32_MAX))) { - if (start_pos.offset) { - start_pos = min_pos; - bch2_btree_iter_set_pos(trans, &iter, start_pos); - continue; - } - - ret = bch_err_throw(c, ENOSPC_stripe_create); - break; - } - - if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, s, k.k->p.offset)) - break; - } - - c->ec_stripe_hint = iter.pos.offset; - - if (ret) - goto err; - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) { - bch2_stripe_close(c, s); - goto err; - } - - s->new_stripe.key.k.p = iter.pos; -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -err: - bch2_disk_reservation_put(c, &s->res); - goto out; -} - -struct ec_stripe_head 
*bch2_ec_stripe_head_get(struct btree_trans *trans, - struct alloc_request *req, - unsigned algo, - struct closure *cl) -{ - struct bch_fs *c = trans->c; - unsigned redundancy = req->nr_replicas - 1; - unsigned disk_label = 0; - struct target t = target_decode(req->target); - bool waiting = false; - int ret; - - if (t.type == TARGET_GROUP) { - if (t.group > U8_MAX) { - bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); - return NULL; - } - disk_label = t.group + 1; /* 0 == no label */ - } - - struct ec_stripe_head *h = - __bch2_ec_stripe_head_get(trans, disk_label, algo, - redundancy, req->watermark); - if (IS_ERR_OR_NULL(h)) - return h; - - if (!h->s) { - h->s = ec_new_stripe_alloc(c, h); - if (!h->s) { - ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); - bch_err(c, "failed to allocate new stripe"); - goto err; - } - - h->nr_created++; - } - - struct ec_stripe_new *s = h->s; - - if (s->allocated) - goto allocated; - - if (s->have_existing_stripe) - goto alloc_existing; - - /* First, try to allocate a full stripe: */ - enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; - swap(req->watermark, saved_watermark); - ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); - swap(req->watermark, saved_watermark); - - if (!ret) - goto allocate_buf; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, ENOMEM)) - goto err; - - /* - * Not enough buckets available for a full stripe: we must reuse an - * existing stripe: - */ - while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h, s); - if (!ret) - break; - if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) - goto err; - - if (req->watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h, s); - if (ret) - goto err; - goto allocate_buf; - } - - /* XXX freelist_wait? */ - closure_wait(&c->freelist_wait, cl); - waiting = true; - } - - if (waiting) - closure_wake_up(&c->freelist_wait); -alloc_existing: - /* - * Retry allocating buckets, with the watermark for this - * particular write: - */ - ret = new_stripe_alloc_buckets(trans, req, h, s, cl); - if (ret) - goto err; - -allocate_buf: - ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); - if (ret) - goto err; - - s->allocated = true; -allocated: - BUG_ON(!s->idx); - BUG_ON(!s->new_stripe.data[0]); - BUG_ON(trans->restarted); - return h; -err: - bch2_ec_stripe_head_put(c, h); - return ERR_PTR(ret); -} - -/* device removal */ - -int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned dev_idx, - unsigned flags) -{ - if (k.k->type != KEY_TYPE_stripe) - return 0; - - struct bch_fs *c = trans->c; - struct bkey_i_stripe *s = - bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - struct disk_accounting_pos acc; - - s64 sectors = 0; - for (unsigned i = 0; i < s->v.nr_blocks; i++) - sectors -= stripe_blockcount_get(&s->v, i); - - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); - if (ret) - return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - - /* XXX: how much redundancy do we still have?
check degraded flags */ - - unsigned nr_good = 0; - - scoped_guard(rcu) - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev_idx) - ptr->dev = BCH_SB_MEMBER_INVALID; - - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; - } - - if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return bch_err_throw(c, remove_would_lose_data); - - unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; - - if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) - return bch_err_throw(c, remove_would_lose_data); - - sectors = -sectors; - - memset(&acc, 0, sizeof(acc)); - acc.type = BCH_DISK_ACCOUNTING_replicas; - bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); - acc.replicas.data_type = BCH_DATA_user; - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); -} - -static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, - unsigned flags) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) - return 0; - - if (a->stripe_sectors) { - struct bch_fs *c = trans->c; - bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); - return bch_err_throw(c, invalidate_stripe_to_dev); - } - - struct btree_iter iter; - struct bkey_s_c_stripe s = - bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); - int ret = bkey_err(s); - if (ret) - return ret; - - ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, - BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); - }))); - bch_err_fn(c, ret); - return ret; -} - -/* startup/shutdown */ - -static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) -{ - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i; - - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); - if (!h->s) - goto unlock; - - if (!ca) - goto found; - - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { - if (!h->s->blocks[i]) - continue; - - ob = c->open_buckets + h->s->blocks[i]; - if (ob->dev == ca->dev_idx) - goto found; - } - goto unlock; -found: - ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); -unlock: - mutex_unlock(&h->lock); - } - mutex_unlock(&c->ec_stripe_head_lock); -} - -void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -{ - __bch2_ec_stop(c, ca); -} - -void bch2_fs_ec_stop(struct bch_fs *c) -{ - __bch2_ec_stop(c, NULL); -} - -static bool bch2_fs_ec_flush_done(struct bch_fs *c) -{ - sched_annotate_sleep(); - - mutex_lock(&c->ec_stripe_new_lock); - bool ret = list_empty(&c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - return ret; -} - -void bch2_fs_ec_flush(struct bch_fs *c) -{ - wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); -} - -int bch2_stripes_read(struct bch_fs *c) -{ - return 0; -} - -static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, - struct ec_stripe_new *s) -{ - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", - s->idx, s->nr_data, s->nr_parity, -
bitmap_weight(s->blocks_allocated, s->nr_data), - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - - struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; - unsigned i; - for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) - prt_printf(out, " %u", s->blocks[i]); - prt_newline(out); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); - prt_newline(out); -} - -void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct ec_stripe_head *h; - struct ec_stripe_new *s; - - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", - h->disk_label, h->algo, h->redundancy, - bch2_watermarks[h->watermark], - h->nr_created); - - if (h->s) - bch2_new_stripe_to_text(out, c, h->s); - } - mutex_unlock(&c->ec_stripe_head_lock); - - prt_printf(out, "in flight:\n"); - - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) - bch2_new_stripe_to_text(out, c, s); - mutex_unlock(&c->ec_stripe_new_lock); -} - -void bch2_fs_ec_exit(struct bch_fs *c) -{ - struct ec_stripe_head *h; - unsigned i; - - while (1) { - mutex_lock(&c->ec_stripe_head_lock); - h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); - mutex_unlock(&c->ec_stripe_head_lock); - - if (!h) - break; - - if (h->s) { - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) - BUG_ON(h->s->blocks[i]); - - kfree(h->s); - } - kfree(h); - } - - BUG_ON(!list_empty(&c->ec_stripe_new_list)); - - bioset_exit(&c->ec_bioset); -} - -void bch2_fs_ec_init_early(struct bch_fs *c) -{ - spin_lock_init(&c->ec_stripes_new_lock); - - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); - - INIT_LIST_HEAD(&c->ec_stripe_new_list); - mutex_init(&c->ec_stripe_new_lock); - init_waitqueue_head(&c->ec_stripe_new_wait); - - INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); - INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); -} - -int bch2_fs_ec_init(struct bch_fs *c) -{ - return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), - BIOSET_NEED_BVECS); -} - -static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, - struct bkey_s_c k, - struct bkey_buf *last_flushed) -{ - if (k.k->type != KEY_TYPE_stripe) - return 0; - - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - u64 lru_idx = stripe_lru_pos(s.v); - if (lru_idx) { - int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, - k.k->p.offset, lru_idx, k, last_flushed); - if (ret) - return ret; - } - return 0; -} - -int bch2_check_stripe_to_lru_refs(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h deleted file mode 100644 index 548048adf0d573..00000000000000 --- a/fs/bcachefs/ec.h +++ /dev/null @@ -1,309 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_H -#define _BCACHEFS_EC_H - -#include "ec_types.h" -#include "buckets_types.h" -#include "extents_types.h" - -int bch2_stripe_validate(struct bch_fs 
*, struct bkey_s_c, - struct bkey_validate_context); -void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_validate = bch2_stripe_validate, \ - .val_to_text = bch2_stripe_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_stripe, \ - .min_val_size = 8, \ -}) - -static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(le16_to_cpu(s->sectors), - 1 << s->csum_granularity_bits); -} - -static inline unsigned stripe_csum_offset(const struct bch_stripe *s, - unsigned dev, unsigned csum_idx) -{ - EBUG_ON(s->csum_type >= BCH_CSUM_NR); - - unsigned csum_bytes = bch_crc_bytes[s->csum_type]; - - return sizeof(struct bch_stripe) + - sizeof(struct bch_extent_ptr) * s->nr_blocks + - (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -} - -static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, - unsigned idx) -{ - return stripe_csum_offset(s, s->nr_blocks, 0) + - sizeof(u16) * idx; -} - -static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, - unsigned idx) -{ - return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -} - -static inline void stripe_blockcount_set(struct bch_stripe *s, - unsigned idx, unsigned v) -{ - __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); - - *p = cpu_to_le16(v); -} - -static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -{ - return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), - sizeof(u64)); -} - -static inline void *stripe_csum(struct bch_stripe *s, - unsigned block, unsigned csum_idx) -{ - EBUG_ON(block >= s->nr_blocks); - EBUG_ON(csum_idx >= stripe_csums_per_device(s)); - - return (void *) s + stripe_csum_offset(s, block, csum_idx); -} - -static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, - unsigned block, unsigned csum_idx) -{ - struct bch_csum csum = { 0 }; - - memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); - return csum; -} - -static inline void stripe_csum_set(struct bch_stripe *s, - unsigned block, unsigned csum_idx, - struct bch_csum csum) -{ - memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); -} - -#define STRIPE_LRU_POS_EMPTY 1 - -static inline u64 stripe_lru_pos(const struct bch_stripe *s) -{ - if (!s) - return 0; - - unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; - - for (unsigned i = 0; i < nr_data; i++) - blocks_empty += !stripe_blockcount_get(s, i); - - /* Will be picked up by the stripe_delete worker */ - if (blocks_empty == nr_data) - return STRIPE_LRU_POS_EMPTY; - - if (!blocks_empty) - return 0; - - /* invert: more blocks empty = reuse first */ - return LRU_TIME_MAX - blocks_empty; -} - -static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, - const struct bch_extent_ptr *data_ptr, - unsigned sectors) -{ - return (data_ptr->dev == stripe_ptr->dev || - data_ptr->dev == BCH_SB_MEMBER_INVALID || - stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && - data_ptr->gen == stripe_ptr->gen && - data_ptr->offset >= stripe_ptr->offset && - data_ptr->offset < stripe_ptr->offset + sectors; -} - -static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, - struct extent_ptr_decoded p) -{ - unsigned nr_data = s->nr_blocks - s->nr_redundant; - - BUG_ON(!p.has_ec); 
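/*
 * Stripe blocks are laid out data first, then parity: blocks
 * 0..nr_data-1 hold data and the remaining nr_redundant blocks hold
 * parity. An extent pointer can only reference a data block, so a
 * block index in the parity range never matches.
 */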
- - if (p.ec.block >= nr_data) - return false; - - return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, - le16_to_cpu(s->sectors)); -} - -static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, - struct extent_ptr_decoded p) -{ - unsigned nr_data = m->nr_blocks - m->nr_redundant; - - BUG_ON(!p.has_ec); - - if (p.ec.block >= nr_data) - return false; - - return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, - m->sectors); -} - -static inline void gc_stripe_unlock(struct gc_stripe *s) -{ - BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); - - clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); - smp_mb__after_atomic(); - wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); -} - -static inline void gc_stripe_lock(struct gc_stripe *s) -{ - wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, - TASK_UNINTERRUPTIBLE); -} - -struct bch_read_bio; - -struct ec_stripe_buf { - /* might not be buffering the entire stripe: */ - unsigned offset; - unsigned size; - unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - - void *data[BCH_BKEY_PTRS_MAX]; - - __BKEY_PADDED(key, 255); -}; - -struct ec_stripe_head; - -enum ec_stripe_ref { - STRIPE_REF_io, - STRIPE_REF_stripe, - STRIPE_REF_NR -}; - -struct ec_stripe_new { - struct bch_fs *c; - struct ec_stripe_head *h; - struct mutex lock; - struct list_head list; - - struct hlist_node hash; - u64 idx; - - struct closure iodone; - - atomic_t ref[STRIPE_REF_NR]; - - int err; - - u8 nr_data; - u8 nr_parity; - bool allocated; - bool pending; - bool have_existing_stripe; - - unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; - struct disk_reservation res; - - struct ec_stripe_buf new_stripe; - struct ec_stripe_buf existing_stripe; -}; - -struct ec_stripe_head { - struct list_head list; - struct mutex lock; - - unsigned disk_label; - unsigned algo; - unsigned redundancy; - enum bch_watermark watermark; - bool insufficient_devs; - - unsigned long rw_devs_change_count; - - u64 nr_created; - - struct bch_devs_mask devs; - unsigned nr_active_devs; - - unsigned blocksize; - - struct dev_stripe_state block_stripe; - struct dev_stripe_state parity_stripe; - - struct ec_stripe_new *s; -}; - -int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); - -void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); - -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); - -int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); - -void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); - -struct alloc_request; -struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - struct alloc_request *, unsigned, struct closure *); - -void bch2_do_stripe_deletes(struct bch_fs *); -void bch2_ec_do_stripe_creates(struct bch_fs *); -void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); - -static inline void ec_stripe_new_get(struct ec_stripe_new *s, - enum ec_stripe_ref ref) -{ - atomic_inc(&s->ref[ref]); -} - -static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, - enum ec_stripe_ref ref) -{ - BUG_ON(atomic_read(&s->ref[ref]) <= 0); - - if (atomic_dec_and_test(&s->ref[ref])) - switch (ref) { - case STRIPE_REF_stripe: - bch2_ec_stripe_new_free(c, s); - break; - case STRIPE_REF_io: - bch2_ec_do_stripe_creates(c); - break; - default: - BUG(); - } -} - 
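/*
 * Illustrative usage sketch (editorial addition, not part of the
 * original header): the two enumerated refcounts above split lifetime
 * tracking in two. STRIPE_REF_stripe pins the in-memory object itself
 * (the last put frees it via bch2_ec_stripe_new_free()), while
 * STRIPE_REF_io is held across I/O against the stripe being built, and
 * dropping the last io ref kicks bch2_ec_do_stripe_creates(). The
 * function name below is hypothetical:
 */
static inline void example_stripe_io(struct bch_fs *c, struct ec_stripe_new *s)
{
	ec_stripe_new_get(s, STRIPE_REF_io);
	/* ... submit reads/writes against s->new_stripe here ... */
	ec_stripe_new_put(c, s, STRIPE_REF_io);
}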
-int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, unsigned, unsigned); -int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); - -void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -void bch2_fs_ec_stop(struct bch_fs *); -void bch2_fs_ec_flush(struct bch_fs *); - -int bch2_stripes_read(struct bch_fs *); - -void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_ec_exit(struct bch_fs *); -void bch2_fs_ec_init_early(struct bch_fs *); -int bch2_fs_ec_init(struct bch_fs *); - -int bch2_check_stripe_to_lru_refs(struct bch_fs *); - -#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h deleted file mode 100644 index b9770f24f213c1..00000000000000 --- a/fs/bcachefs/ec_format.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_FORMAT_H -#define _BCACHEFS_EC_FORMAT_H - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - - /* - * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2 - * - * we can manage with this because this only needs to point to a - * disk label, not a target: - */ - __u8 disk_label; - - /* - * Variable length sections: - * - Pointers - * - Checksums - * 2D array of [stripe block/device][csum block], with checksum block - * size given by csum_granularity_bits - * - Block sector counts: per-block array of u16s - * - * XXX: - * Either checksums should have come last, or we should have included a - * checksum_size field (the size in bytes of the checksum itself, not - * the blocksize the checksum covers). - * - * Currently we aren't able to access the block sector counts if the - * checksum type is unknown. - */ - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h deleted file mode 100644 index 809446c789518b..00000000000000 --- a/fs/bcachefs/ec_types.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EC_TYPES_H -#define _BCACHEFS_EC_TYPES_H - -#include "bcachefs_format.h" - -union bch_replicas_padded { - u8 bytes[struct_size_t(struct bch_replicas_entry_v1, - devs, BCH_BKEY_PTRS_MAX)]; - struct bch_replicas_entry_v1 e; -}; - -struct stripe { - size_t heap_idx; - u16 sectors; - u8 algorithm; - u8 nr_blocks; - u8 nr_redundant; - u8 blocks_nonempty; - u8 disk_label; -}; - -struct gc_stripe { - u8 lock; - unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ - u16 sectors; - u8 nr_blocks; - u8 nr_redundant; - u16 block_sectors[BCH_BKEY_PTRS_MAX]; - struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - - union bch_replicas_padded r; -}; - -#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c deleted file mode 100644 index 56ab430f209f5e..00000000000000 --- a/fs/bcachefs/enumerated_ref.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "enumerated_ref.h" -#include "util.h" - -#include - -#ifdef ENUMERATED_REF_DEBUG -void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - atomic_long_inc(&ref->refs[idx]); -} - -bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - return atomic_long_inc_not_zero(&ref->refs[idx]); -} - -bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - return !ref->dying && - atomic_long_inc_not_zero(&ref->refs[idx]); -} - -void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -{ - BUG_ON(idx >= ref->nr); - long v = atomic_long_dec_return(&ref->refs[idx]); - - BUG_ON(v < 0); - if (v) - return; - - for (unsigned i = 0; i < ref->nr; i++) - if (atomic_long_read(&ref->refs[i])) - return; - - if (ref->stop_fn) - ref->stop_fn(ref); - complete(&ref->stop_complete); -} -#endif - -#ifndef ENUMERATED_REF_DEBUG -static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) -{ - struct enumerated_ref *ref = - container_of(percpu_ref, struct enumerated_ref, ref); - - if (ref->stop_fn) - ref->stop_fn(ref); - complete(&ref->stop_complete); -} -#endif - -void enumerated_ref_stop_async(struct enumerated_ref *ref) -{ - reinit_completion(&ref->stop_complete); - -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_kill(&ref->ref); -#else - ref->dying = true; - for (unsigned i = 0; i < ref->nr; i++) - enumerated_ref_put(ref, i); -#endif -} - -void enumerated_ref_stop(struct enumerated_ref *ref, - const char * const names[]) -{ - enumerated_ref_stop_async(ref); - while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); - prt_str(&buf, "Outstanding refs:\n"); - enumerated_ref_to_text(&buf, ref, names); - printk(KERN_ERR "%s", buf.buf); - printbuf_exit(&buf); - } -} - -void enumerated_ref_start(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_reinit(&ref->ref); -#else - ref->dying = false; - for (unsigned i = 0; i < ref->nr; i++) { - BUG_ON(atomic_long_read(&ref->refs[i])); - atomic_long_inc(&ref->refs[i]); - } -#endif -} - -void enumerated_ref_exit(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - percpu_ref_exit(&ref->ref); -#else - kfree(ref->refs); - ref->refs = NULL; - ref->nr = 0; -#endif -} - -int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, - void (*stop_fn)(struct enumerated_ref *)) -{ - init_completion(&ref->stop_complete); - ref->stop_fn = stop_fn; - -#ifndef ENUMERATED_REF_DEBUG - return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, - PERCPU_REF_INIT_DEAD, GFP_KERNEL); -#else - ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); - if (!ref->refs) - return -ENOMEM; - - ref->nr = nr; - return 0; -#endif -} - -void enumerated_ref_to_text(struct printbuf *out, - struct enumerated_ref *ref, - const char * const names[]) -{ -#ifdef ENUMERATED_REF_DEBUG - bch2_printbuf_tabstop_push(out, 32); - - for (unsigned i = 0; i < 
ref->nr; i++) - prt_printf(out, "%s\t%li\n", names[i], - atomic_long_read(&ref->refs[i])); -#else - prt_str(out, "(not in debug mode)\n"); -#endif -} diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h deleted file mode 100644 index ec01cf59ef8069..00000000000000 --- a/fs/bcachefs/enumerated_ref.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ENUMERATED_REF_H -#define _BCACHEFS_ENUMERATED_REF_H - -#include "enumerated_ref_types.h" - -/* - * A refcount where the users are enumerated: in debug mode, we create separate - * refcounts for each user, to make leaks and refcount errors easy to track - * down: - */ - -#ifdef ENUMERATED_REF_DEBUG -void enumerated_ref_get(struct enumerated_ref *, unsigned); -bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); -bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); -void enumerated_ref_put(struct enumerated_ref *, unsigned); -#else - -static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) -{ - percpu_ref_get(&ref->ref); -} - -static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - return percpu_ref_tryget(&ref->ref); -} - -static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) -{ - return percpu_ref_tryget_live(&ref->ref); -} - -static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) -{ - percpu_ref_put(&ref->ref); -} -#endif - -static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) -{ -#ifndef ENUMERATED_REF_DEBUG - return percpu_ref_is_zero(&ref->ref); -#else - for (unsigned i = 0; i < ref->nr; i++) - if (atomic_long_read(&ref->refs[i])) - return false; - return true; -#endif -} - -void enumerated_ref_stop_async(struct enumerated_ref *); -void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); -void enumerated_ref_start(struct enumerated_ref *); - -void enumerated_ref_exit(struct enumerated_ref *); -int enumerated_ref_init(struct enumerated_ref *, unsigned, - void (*stop_fn)(struct enumerated_ref *)); - -struct printbuf; -void enumerated_ref_to_text(struct printbuf *, - struct enumerated_ref *, - const char * const[]); - -#endif /* _BCACHEFS_ENUMERATED_REF_H */ diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h deleted file mode 100644 index 0e6076f466d3d8..00000000000000 --- a/fs/bcachefs/enumerated_ref_types.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H -#define _BCACHEFS_ENUMERATED_REF_TYPES_H - -#include <linux/percpu-refcount.h> - -struct enumerated_ref { -#ifdef ENUMERATED_REF_DEBUG - unsigned nr; - bool dying; - atomic_long_t *refs; -#else - struct percpu_ref ref; -#endif - void (*stop_fn)(struct enumerated_ref *); - struct completion stop_complete; -}; - -#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c deleted file mode 100644 index c39cf304c68102..00000000000000 --- a/fs/bcachefs/errcode.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "errcode.h" -#include "trace.h" - -#include <linux/errname.h> - -static const char * const bch2_errcode_strs[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, - BCH_ERRCODES() -#undef x - NULL -}; - -static const unsigned bch2_errcode_parents[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, - BCH_ERRCODES() -#undef x -}; - -__attribute__((const)) -const char *bch2_err_str(int
err) -{ - const char *errstr; - - err = abs(err); - - BUG_ON(err >= BCH_ERR_MAX); - - if (err >= BCH_ERR_START) - errstr = bch2_errcode_strs[err - BCH_ERR_START]; - else if (err) - errstr = errname(err); - else - errstr = "(No error)"; - return errstr ?: "(Invalid error)"; -} - -__attribute__((const)) -bool __bch2_err_matches(int err, int class) -{ - err = abs(err); - class = abs(class); - - BUG_ON(err >= BCH_ERR_MAX); - BUG_ON(class >= BCH_ERR_MAX); - - while (err >= BCH_ERR_START && err != class) - err = bch2_errcode_parents[err - BCH_ERR_START]; - - return err == class; -} - -int __bch2_err_class(int bch_err) -{ - int std_err = -bch_err; - BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - - while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) - std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; - - trace_error_downcast(bch_err, std_err, _RET_IP_); - - return -std_err; -} - -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h deleted file mode 100644 index acc3b7b677041d..00000000000000 --- a/fs/bcachefs/errcode.h +++ /dev/null @@ -1,387 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ERRCODE_H -#define _BCACHEFS_ERRCODE_H - -#define BCH_ERRCODES() \ - x(ERANGE, ERANGE_option_too_small) \ - x(ERANGE, ERANGE_option_too_big) \ - x(EINVAL, injected) \ - x(BCH_ERR_injected, injected_fs_start) \ - x(EINVAL, mount_option) \ - x(BCH_ERR_mount_option, option_name) \ - x(BCH_ERR_mount_option, option_value) \ - x(BCH_ERR_mount_option, option_not_bool) \ - x(ENOMEM, ENOMEM_stripe_buf) \ - x(ENOMEM, ENOMEM_replicas_table) \ - x(ENOMEM, ENOMEM_cpu_replicas) \ - x(ENOMEM, ENOMEM_replicas_gc) \ - x(ENOMEM, ENOMEM_disk_groups_validate) \ - x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ - x(ENOMEM, ENOMEM_mark_snapshot) \ - x(ENOMEM, ENOMEM_mark_stripe) \ - x(ENOMEM, ENOMEM_mark_stripe_ptr) \ - x(ENOMEM, ENOMEM_btree_key_cache_create) \ - x(ENOMEM, ENOMEM_btree_key_cache_fill) \ - x(ENOMEM, ENOMEM_btree_key_cache_insert) \ - x(ENOMEM, ENOMEM_trans_kmalloc) \ - x(ENOMEM, ENOMEM_trans_log_msg) \ - x(ENOMEM, ENOMEM_do_encrypt) \ - x(ENOMEM, ENOMEM_ec_read_extent) \ - x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ - x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ - x(ENOMEM, ENOMEM_fs_btree_cache_init) \ - x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ - x(ENOMEM, ENOMEM_fs_counters_init) \ - x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ - x(ENOMEM, ENOMEM_io_clock_init) \ - x(ENOMEM, ENOMEM_blacklist_table_init) \ - x(ENOMEM, ENOMEM_sb_realloc_injected) \ - x(ENOMEM, ENOMEM_sb_bio_realloc) \ - x(ENOMEM, ENOMEM_sb_buf_realloc) \ - x(ENOMEM, ENOMEM_sb_journal_validate) \ - x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ - x(ENOMEM, ENOMEM_journal_entry_add) \ - x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ - x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ - x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ - x(ENOMEM, ENOMEM_bio_read_init) \ - x(ENOMEM, ENOMEM_bio_read_split_init) \ - x(ENOMEM, ENOMEM_bio_write_init) \ - x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ - x(ENOMEM, ENOMEM_writepage_bioset_init) \ - x(ENOMEM, ENOMEM_dio_read_bioset_init) \ - x(ENOMEM, ENOMEM_dio_write_bioset_init) \ - x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ - x(ENOMEM, ENOMEM_promote_table_init) \ - x(ENOMEM, ENOMEM_async_obj_init) \ - x(ENOMEM, ENOMEM_compression_bounce_read_init) \ - x(ENOMEM, ENOMEM_compression_bounce_write_init) \ - 
x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ - x(EIO, compression_workspace_not_initialized) \ - x(ENOMEM, ENOMEM_bucket_gens) \ - x(ENOMEM, ENOMEM_buckets_nouse) \ - x(ENOMEM, ENOMEM_usage_init) \ - x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ - x(ENOMEM, ENOMEM_btree_node_reclaim) \ - x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ - x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ - x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ - x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ - x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ - x(ENOMEM, ENOMEM_dev_journal_init) \ - x(ENOMEM, ENOMEM_journal_pin_fifo) \ - x(ENOMEM, ENOMEM_journal_buf) \ - x(ENOMEM, ENOMEM_gc_start) \ - x(ENOMEM, ENOMEM_gc_alloc_start) \ - x(ENOMEM, ENOMEM_gc_reflink_start) \ - x(ENOMEM, ENOMEM_gc_gens) \ - x(ENOMEM, ENOMEM_gc_repair_key) \ - x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ - x(ENOMEM, ENOMEM_fsck_add_nlink) \ - x(ENOMEM, ENOMEM_journal_key_insert) \ - x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_read_superblock_clean) \ - x(ENOMEM, ENOMEM_fs_alloc) \ - x(ENOMEM, ENOMEM_fs_name_alloc) \ - x(ENOMEM, ENOMEM_fs_other_alloc) \ - x(ENOMEM, ENOMEM_dev_alloc) \ - x(ENOMEM, ENOMEM_disk_accounting) \ - x(ENOMEM, ENOMEM_stripe_head_alloc) \ - x(ENOMEM, ENOMEM_journal_read_bucket) \ - x(ENOSPC, ENOSPC_disk_reservation) \ - x(ENOSPC, ENOSPC_bucket_alloc) \ - x(ENOSPC, ENOSPC_disk_label_add) \ - x(ENOSPC, ENOSPC_stripe_create) \ - x(ENOSPC, ENOSPC_inode_create) \ - x(ENOSPC, ENOSPC_str_hash_create) \ - x(ENOSPC, ENOSPC_snapshot_create) \ - x(ENOSPC, ENOSPC_subvolume_create) \ - x(ENOSPC, ENOSPC_sb) \ - x(ENOSPC, ENOSPC_sb_journal) \ - x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ - x(ENOSPC, ENOSPC_sb_quota) \ - x(ENOSPC, ENOSPC_sb_replicas) \ - x(ENOSPC, ENOSPC_sb_members) \ - x(ENOSPC, ENOSPC_sb_members_v2) \ - x(ENOSPC, ENOSPC_sb_crypt) \ - x(ENOSPC, ENOSPC_sb_downgrade) \ - x(ENOSPC, ENOSPC_btree_slot) \ - x(ENOSPC, ENOSPC_snapshot_tree) \ - x(ENOENT, ENOENT_bkey_type_mismatch) \ - x(ENOENT, ENOENT_str_hash_lookup) \ - x(ENOENT, ENOENT_str_hash_set_must_replace) \ - x(ENOENT, ENOENT_inode) \ - x(ENOENT, ENOENT_not_subvol) \ - x(ENOENT, ENOENT_not_directory) \ - x(ENOENT, ENOENT_directory_dead) \ - x(ENOENT, ENOENT_subvolume) \ - x(ENOENT, ENOENT_snapshot_tree) \ - x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ - x(ENOENT, ENOENT_dev_not_found) \ - x(ENOENT, ENOENT_dev_bucket_not_found) \ - x(ENOENT, ENOENT_dev_idx_not_found) \ - x(ENOENT, ENOENT_inode_no_backpointer) \ - x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ - x(ENOENT, btree_node_dying) \ - x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ - x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ - x(EEXIST, EEXIST_str_hash_set) \ - x(EEXIST, EEXIST_discard_in_flight_add) \ - x(EEXIST, EEXIST_subvolume_create) \ - x(ENOSPC, open_buckets_empty) \ - x(ENOSPC, freelist_empty) \ - x(BCH_ERR_freelist_empty, no_buckets_found) \ - x(0, transaction_restart) \ - x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ - x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ - x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ - x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ - x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ - 
x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ - x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ - x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ - x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ - x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ - x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ - x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ - x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ - x(BCH_ERR_transaction_restart, transaction_restart_nested) \ - x(BCH_ERR_transaction_restart, transaction_restart_commit) \ - x(0, no_btree_node) \ - x(BCH_ERR_no_btree_node, no_btree_node_relock) \ - x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ - x(BCH_ERR_no_btree_node, no_btree_node_drop) \ - x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ - x(BCH_ERR_no_btree_node, no_btree_node_up) \ - x(BCH_ERR_no_btree_node, no_btree_node_down) \ - x(BCH_ERR_no_btree_node, no_btree_node_init) \ - x(BCH_ERR_no_btree_node, no_btree_node_cached) \ - x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ - x(0, btree_insert_fail) \ - x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(0, backpointer_to_overwritten_btree_node) \ - x(0, journal_reclaim_would_deadlock) \ - x(EINVAL, fsck) \ - x(BCH_ERR_fsck, fsck_ask) \ - x(BCH_ERR_fsck, fsck_fix) \ - x(BCH_ERR_fsck, fsck_delete_bkey) \ - x(BCH_ERR_fsck, fsck_ignore) \ - x(BCH_ERR_fsck, fsck_errors_not_fixed) \ - x(BCH_ERR_fsck, fsck_repair_unimplemented) \ - x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, recovery_will_run) \ - x(BCH_ERR_recovery_will_run, restart_recovery) \ - x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ - x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ - x(0, data_update_done) \ - x(0, bkey_was_deleted) \ - x(BCH_ERR_data_update_done, data_update_done_would_block) \ - x(BCH_ERR_data_update_done, data_update_done_unwritten) \ - x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ - x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ - x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ - x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ - x(EINVAL, device_state_not_allowed) \ - x(EINVAL, member_info_missing) \ - x(EINVAL, mismatched_block_size) \ - x(EINVAL, block_size_too_small) \ - x(EINVAL, bucket_size_too_small) \ - x(EINVAL, device_size_too_small) \ - x(EINVAL, device_size_too_big) \ - x(EINVAL, device_not_a_member_of_filesystem) \ - x(EINVAL, device_has_been_removed) \ - x(EINVAL, device_splitbrain) \ - x(EINVAL, device_already_online) \ - x(EINVAL, filesystem_uuid_already_open) \ - x(EINVAL, insufficient_devices_to_start) \ - x(EINVAL, invalid) \ - x(EINVAL, internal_fsck_err) \ - x(EINVAL, opt_parse_error) \ - x(EINVAL, remove_with_metadata_missing_unimplemented)\ - x(EINVAL, remove_would_lose_data) \ - x(EINVAL, no_resize_with_buckets_nouse) \ - x(EINVAL, inode_unpack_error) \ - x(EINVAL, inode_not_unlinked) \ - x(EINVAL, inode_has_child_snapshot) \ - x(EINVAL, varint_decode_error) \ - x(EINVAL, erasure_coding_found_btree_node) \ - x(EINVAL, 
option_negative) \ - x(EOPNOTSUPP, may_not_use_incompat_feature) \ - x(EROFS, erofs_trans_commit) \ - x(EROFS, erofs_no_writes) \ - x(EROFS, erofs_journal_err) \ - x(EROFS, erofs_sb_err) \ - x(EROFS, erofs_unfixed_errors) \ - x(EROFS, erofs_norecovery) \ - x(EROFS, erofs_nochanges) \ - x(EROFS, erofs_no_alloc_info) \ - x(EROFS, erofs_filesystem_full) \ - x(EROFS, insufficient_devices) \ - x(0, operation_blocked) \ - x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_blocked) \ - x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ - x(BCH_ERR_journal_res_blocked, journal_max_open) \ - x(BCH_ERR_journal_res_blocked, journal_full) \ - x(BCH_ERR_journal_res_blocked, journal_pin_full) \ - x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ - x(BCH_ERR_journal_res_blocked, journal_stuck) \ - x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ - x(BCH_ERR_invalid, invalid_sb) \ - x(BCH_ERR_invalid_sb, invalid_sb_magic) \ - x(BCH_ERR_invalid_sb, invalid_sb_version) \ - x(BCH_ERR_invalid_sb, invalid_sb_features) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ - x(BCH_ERR_invalid_sb, invalid_sb_csum) \ - x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ - x(BCH_ERR_invalid_sb, invalid_sb_offset) \ - x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ - x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ - x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ - x(BCH_ERR_invalid_sb, invalid_sb_layout) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ - x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ - x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ - x(BCH_ERR_invalid_sb, invalid_sb_members) \ - x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ - x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ - x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal) \ - x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ - x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ - x(BCH_ERR_invalid_sb, invalid_sb_clean) \ - x(BCH_ERR_invalid_sb, invalid_sb_quota) \ - x(BCH_ERR_invalid_sb, invalid_sb_errors) \ - x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ - x(BCH_ERR_invalid_sb, invalid_sb_ext) \ - x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ - x(BCH_ERR_invalid, invalid_bkey) \ - x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EROFS, journal_shutdown) \ - x(EIO, journal_flush_err) \ - x(EIO, journal_write_err) \ - x(EIO, btree_node_read_err) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ - x(EIO, sb_not_downgraded) \ - x(EIO, btree_node_write_all_failed) \ - x(EIO, btree_node_read_error) \ - x(EIO, btree_need_topology_repair) \ - x(EIO, bucket_ref_update) \ - x(EIO, trigger_alloc) \ - x(EIO, trigger_pointer) \ - x(EIO, trigger_stripe_pointer) \ - x(EIO, metadata_bucket_inconsistency) \ - x(EIO, mark_stripe) \ - x(EIO, stripe_reconstruct) \ - x(EIO, key_type_error) \ - x(EIO, extent_poisoned) \ - x(EIO, missing_indirect_extent) \ - x(EIO, invalidate_stripe_to_dev) \ - x(EIO, no_encryption_key) \ - x(EIO, insufficient_journal_devices) \ - 
x(EIO, device_offline) \ - x(EIO, EIO_fault_injected) \ - x(EIO, ec_block_read) \ - x(EIO, ec_block_write) \ - x(EIO, recompute_checksum) \ - x(EIO, decompress) \ - x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ - x(BCH_ERR_decompress, decompress_lz4) \ - x(BCH_ERR_decompress, decompress_gzip) \ - x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ - x(BCH_ERR_decompress, decompress_zstd) \ - x(EIO, data_write) \ - x(BCH_ERR_data_write, data_write_io) \ - x(BCH_ERR_data_write, data_write_csum) \ - x(BCH_ERR_data_write, data_write_invalid_ptr) \ - x(BCH_ERR_data_write, data_write_misaligned) \ - x(BCH_ERR_decompress, data_read) \ - x(BCH_ERR_data_read, no_device_to_read_from) \ - x(BCH_ERR_data_read, no_devices_valid) \ - x(BCH_ERR_data_read, data_read_io_err) \ - x(BCH_ERR_data_read, data_read_csum_err) \ - x(BCH_ERR_data_read, data_read_retry) \ - x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ - x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ - x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ - x(BCH_ERR_data_read, data_read_decompress_err) \ - x(BCH_ERR_data_read, data_read_decrypt_err) \ - x(BCH_ERR_data_read, data_read_ptr_stale_race) \ - x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ - x(BCH_ERR_data_read, data_read_no_encryption_key) \ - x(BCH_ERR_data_read, data_read_buffer_too_small) \ - x(BCH_ERR_data_read, data_read_key_overwritten) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ - x(0, nopromote) \ - x(BCH_ERR_nopromote, nopromote_may_not) \ - x(BCH_ERR_nopromote, nopromote_already_promoted) \ - x(BCH_ERR_nopromote, nopromote_unwritten) \ - x(BCH_ERR_nopromote, nopromote_congested) \ - x(BCH_ERR_nopromote, nopromote_in_flight) \ - x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) \ - x(0, remove_disk_accounting_entry) - -enum bch_errcode { - BCH_ERR_START = 2048, -#define x(class, err) BCH_ERR_##err, - BCH_ERRCODES() -#undef x - BCH_ERR_MAX -}; - -__attribute__((const)) const char *bch2_err_str(int); - -__attribute__((const)) bool __bch2_err_matches(int, int); - -__attribute__((const)) -static inline bool _bch2_err_matches(int err, int class) -{ - return err < 0 && __bch2_err_matches(err, class); -} - -#define bch2_err_matches(_err, _class) \ -({ \ - BUILD_BUG_ON(!__builtin_constant_p(_class)); \ - unlikely(_bch2_err_matches(_err, _class)); \ -}) - -int __bch2_err_class(int); - -static inline long bch2_err_class(long err) -{ - return err < 0 ? 
__bch2_err_class(err) : err; -} - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -#include <linux/blk_types.h> -const char *bch2_blk_status_to_str(blk_status_t); - -#endif /* _BCACHEFS_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c deleted file mode 100644 index 267e73d9d7e6ee..00000000000000 --- a/fs/bcachefs/error.c +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_iter.h" -#include "error.h" -#include "journal.h" -#include "namei.h" -#include "recovery_passes.h" -#include "super.h" -#include "thread_with_file.h" - -#define FSCK_ERR_RATELIMIT_NR 10 - -void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) -{ - printbuf_indent_add_nextline(out, 2); - -#ifdef BCACHEFS_LOG_PREFIX - prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); -#endif -} - -bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) -{ - set_bit(BCH_FS_error, &c->flags); - - switch (c->opts.errors) { - case BCH_ON_ERROR_continue: - return false; - case BCH_ON_ERROR_fix_safe: - case BCH_ON_ERROR_ro: - bch2_fs_emergency_read_only2(c, out); - return true; - case BCH_ON_ERROR_panic: - bch2_print_str(c, KERN_ERR, out->buf); - panic(bch2_fmt(c, "panic after error")); - return true; - default: - BUG(); - } -} - -bool bch2_inconsistent_error(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - printbuf_indent_add_nextline(&buf, 2); - - bool ret = __bch2_inconsistent_error(c, &buf); - if (ret) - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - return ret; -} - -__printf(3, 0) -static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, - const char *fmt, va_list args) -{ - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_log_msg_start(c, &buf); - - prt_vprintf(&buf, fmt, args); - prt_newline(&buf); - - if (trans) - bch2_trans_updates_to_text(&buf, trans); - bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_exit(&buf); - return ret; -} - -bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); - va_end(args); - return ret; -} - -bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); - va_end(args); - return ret; -} - -int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) -{ - prt_printf(out, "btree topology error: "); - - set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_in_recovery, &c->flags)) { - __bch2_inconsistent_error(c, out); - return bch_err_throw(c, btree_need_topology_repair); - } else { - return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - bch_err_throw(c, btree_need_topology_repair); - } -} - -int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
-{ - struct printbuf buf = PRINTBUF; - - bch2_log_msg_start(c, &buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - int ret = __bch2_topology_error(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_exit(&buf); - return ret; -} - -void bch2_fatal_error(struct bch_fs *c) -{ - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "fatal error - emergency read only"); -} - -void bch2_io_error_work(struct work_struct *work) -{ - struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); - struct bch_fs *c = ca->fs; - - /* XXX: if it's reads or checksums that are failing, set it to failed */ - - down_write(&c->state_lock); - unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); - - if (write_errors_start && - time_after(jiffies, - write_errors_start + c->opts.write_error_timeout * HZ)) { - if (ca->mi.state >= BCH_MEMBER_STATE_ro) - goto out; - - bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", - c->opts.write_error_timeout, - dev ? "device" : "filesystem"); - if (!dev) - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -out: - up_write(&c->state_lock); -} - -void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) -{ - atomic64_inc(&ca->errors[type]); - - if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) - ca->write_errors_start = jiffies; - - queue_work(system_long_wq, &ca->io_error_work); -} - -enum ask_yn { - YN_NO, - YN_YES, - YN_ALLNO, - YN_ALLYES, -}; - -static enum ask_yn parse_yn_response(char *buf) -{ - buf = strim(buf); - - if (strlen(buf) == 1) - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - return -1; -} - -#ifdef __KERNEL__ -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - struct stdio_redirect *stdio = c->stdio; - - if (c->stdio_filter && c->stdio_filter != current) - stdio = NULL; - - if (!stdio) - return YN_NO; - - if (trans) - bch2_trans_unlock(trans); - - unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0; - darray_char line = {}; - int ret; - - do { - unsigned long t; - bch2_print(c, " (y,n, or Y,N for all errors of this type) "); -rewait: - t = unlock_long_at - ? 
max_t(long, unlock_long_at - jiffies, 0) - : MAX_SCHEDULE_TIMEOUT; - - int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t); - if (r == -ETIME) { - bch2_trans_unlock_long(trans); - unlock_long_at = 0; - goto rewait; - } - - if (r < 0) { - ret = YN_NO; - break; - } - - darray_last(line) = '\0'; - } while ((ret = parse_yn_response(line.data)) < 0); - - darray_exit(&line); - return ret; -} -#else - -#include "tools-util.h" - -static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) -{ - char *buf = NULL; - size_t buflen = 0; - int ret; - - do { - fputs(" (y,n, or Y,N for all errors of this type) ", stdout); - fflush(stdout); - - if (getline(&buf, &buflen, stdin) < 0) - die("error reading from standard input"); - } while ((ret = parse_yn_response(buf)) < 0); - - free(buf); - return ret; -} - -#endif - -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, - enum bch_sb_error_id id) -{ - struct fsck_err_state *s; - - list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->id == id) { - /* - * move it to the head of the list: repeated fsck errors - * are common - */ - list_move(&s->list, &c->fsck_error_msgs); - return s; - } - - s = kzalloc(sizeof(*s), GFP_NOFS); - if (!s) { - if (!c->fsck_alloc_msgs_err) - bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); - c->fsck_alloc_msgs_err = true; - return NULL; - } - - INIT_LIST_HEAD(&s->list); - s->id = id; - list_add(&s->list, &c->fsck_error_msgs); - return s; -} - -/* s/fix?/fixing/ s/recreate?/recreating/ */ -static void prt_actioning(struct printbuf *out, const char *action) -{ - unsigned len = strlen(action); - - BUG_ON(action[len - 1] != '?'); - --len; - - if (action[len - 1] == 'e') - --len; - - prt_bytes(out, action, len); - prt_str(out, "ing"); -} - -static const u8 fsck_flags_extra[] = { -#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, - BCH_SB_ERRS() -#undef x -}; - -static int do_fsck_ask_yn(struct bch_fs *c, - struct btree_trans *trans, - struct printbuf *question, - const char *action) -{ - prt_str(question, ", "); - prt_str(question, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", question->buf); - else - bch2_print_str(c, KERN_ERR, question->buf); - - int ask = bch2_fsck_ask_yn(c, trans); - - if (trans) { - int ret = bch2_trans_relock(trans); - if (ret) - return ret; - } - - return ask; -} - -static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) -{ - bch2_sb_error_count(c, id); - - struct fsck_err_state *s = fsck_err_get(c, id); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(msg, s->last_msg)) { - *repeat = true; - *print = false; - return s; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(msg, GFP_KERNEL); - - if (c->opts.ratelimit_errors && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - *suppress = true; - else - *print = false; - } - - s->nr++; - } - return s; -} - -bool __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, struct printbuf *msg) -{ - bch2_sb_error_count(c, id); - - mutex_lock(&c->fsck_error_msgs_lock); - bool print = true, repeat = false, suppress = false; - - count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); - mutex_unlock(&c->fsck_error_msgs_lock); - - if (suppress) - prt_printf(msg, "Ratelimiting new instances of 
previous error\n"); - - return print && !repeat; -} - -int bch2_fsck_err_opt(struct bch_fs *c, - enum bch_fsck_flags flags, - enum bch_sb_error_id err) -{ - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (test_bit(BCH_FS_in_fsck, &c->flags)) { - if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) - return bch_err_throw(c, fsck_repair_unimplemented); - - switch (c->opts.fix_errors) { - case FSCK_FIX_exit: - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_yes: - if (flags & FSCK_CAN_FIX) - return bch_err_throw(c, fsck_fix); - fallthrough; - case FSCK_FIX_no: - if (flags & FSCK_CAN_IGNORE) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - case FSCK_FIX_ask: - if (flags & FSCK_AUTOFIX) - return bch_err_throw(c, fsck_fix); - return bch_err_throw(c, fsck_ask); - default: - BUG(); - } - } else { - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) - return bch_err_throw(c, fsck_fix); - - if (c->opts.errors == BCH_ON_ERROR_continue && - (flags & FSCK_CAN_IGNORE)) - return bch_err_throw(c, fsck_ignore); - return bch_err_throw(c, fsck_errors_not_fixed); - } -} - -int __bch2_fsck_err(struct bch_fs *c, - struct btree_trans *trans, - enum bch_fsck_flags flags, - enum bch_sb_error_id err, - const char *fmt, ...) -{ - va_list args; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = 0; - const char *action_orig = "fix?", *action = action_orig; - - might_sleep(); - - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - flags |= fsck_flags_extra[err]; - - if (!c) - c = trans->c; - - /* - * Ugly: if there's a transaction in the current task it has to be - * passed in to unlock if we prompt for user input. - * - * But, plumbing a transaction and transaction restarts into - * bkey_validate() is problematic. - * - * So: - * - make all bkey errors AUTOFIX, they're simple anyways (we just - * delete the key) - * - and we don't need to warn if we're not prompting - */ - WARN_ON((flags & FSCK_CAN_FIX) && - !(flags & FSCK_AUTOFIX) && - !trans && - bch2_current_has_btree_trans(c)); - - if (test_bit(err, c->sb.errors_silent)) - return flags & FSCK_CAN_FIX - ? bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - - printbuf_indent_add_nextline(out, 2); - -#ifdef BCACHEFS_LOG_PREFIX - if (strncmp(fmt, "bcachefs", 8)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - - /* Custom fix/continue/recreate/etc.? 
*/ - if (out->buf[out->pos - 1] == '?') { - const char *p = strrchr(out->buf, ','); - if (p) { - out->pos = p - out->buf; - action = kstrdup(p + 2, GFP_KERNEL); - if (!action) { - ret = -ENOMEM; - goto err; - } - } - } - - mutex_lock(&c->fsck_error_msgs_lock); - bool repeat = false, print = true, suppress = false; - bool inconsistent = false, exiting = false; - struct fsck_err_state *s = - count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); - if (repeat) { - ret = s->ret; - goto err_unlock; - } - - if ((flags & FSCK_AUTOFIX) && - (c->opts.errors == BCH_ON_ERROR_continue || - c->opts.errors == BCH_ON_ERROR_fix_safe)) { - prt_str(out, ", "); - if (flags & FSCK_CAN_FIX) { - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - - goto print; - } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { - if (c->opts.errors != BCH_ON_ERROR_continue || - !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { - prt_str_indented(out, ", shutting down\n" - "error not marked as autofix and not in fsck\n" - "run fsck, and forward to devs so error can be marked for self-healing"); - inconsistent = true; - print = true; - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } - } else if (c->opts.fix_errors == FSCK_FIX_exit) { - prt_str(out, ", exiting"); - ret = bch_err_throw(c, fsck_errors_not_fixed); - } else if (flags & FSCK_CAN_FIX) { - int fix = s && s->fix - ? s->fix - : c->opts.fix_errors; - - if (fix == FSCK_FIX_ask) { - print = false; - - ret = do_fsck_ask_yn(c, trans, out, action); - if (ret < 0) - goto err_unlock; - - if (ret >= YN_ALLNO && s) - s->fix = ret == YN_ALLNO - ? FSCK_FIX_no - : FSCK_FIX_yes; - - ret = ret & 1 - ? 
bch_err_throw(c, fsck_fix) - : bch_err_throw(c, fsck_ignore); - } else if (fix == FSCK_FIX_yes || - (c->opts.nochanges && - !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_fix); - } else { - prt_str(out, ", not "); - prt_actioning(out, action); - ret = bch_err_throw(c, fsck_ignore); - } - } else { - if (flags & FSCK_CAN_IGNORE) { - prt_str(out, ", continuing"); - ret = bch_err_throw(c, fsck_ignore); - } else { - prt_str(out, " (repair unimplemented)"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - } - } - - if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && - (c->opts.fix_errors == FSCK_FIX_exit || - !(flags & FSCK_CAN_IGNORE))) - ret = bch_err_throw(c, fsck_errors_not_fixed); - - if (test_bit(BCH_FS_in_fsck, &c->flags) && - (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && - !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { - exiting = true; - print = true; - } -print: - prt_newline(out); - - if (inconsistent) - __bch2_inconsistent_error(c, out); - else if (exiting) - prt_printf(out, "Unable to continue, halting\n"); - else if (suppress) - prt_printf(out, "Ratelimiting new instances of previous error\n"); - - if (print) { - /* possibly strip an empty line, from printbuf_indent_add */ - while (out->pos && out->buf[out->pos - 1] == ' ') - --out->pos; - printbuf_nul_terminate(out); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_str(c, KERN_ERR, out->buf); - } - - if (s) - s->ret = ret; - - if (trans && - !(flags & FSCK_ERR_NO_LOG) && - ret == -BCH_ERR_fsck_fix) - ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: - /* - * We don't yet track whether the filesystem currently has errors, for - * log_fsck_err()s: that would require us to track for every error type - * which recovery pass corrects it, to get the fsck exit status correct: - */ - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - /* nothing */ - } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } - - if (action != action_orig) - kfree(action); - printbuf_exit(&buf); - - BUG_ON(!ret); - return ret; -} - -static const char * const bch2_bkey_validate_contexts[] = { -#define x(n) #n, - BKEY_VALIDATE_CONTEXTS() -#undef x - NULL -}; - -int __bch2_bkey_fsck_err(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - enum bch_sb_error_id err, - const char *fmt, ...) 
-{ - if (from.flags & BCH_VALIDATE_silent) - return bch_err_throw(c, fsck_delete_bkey); - - unsigned fsck_flags = 0; - if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { - if (test_bit(err, c->sb.errors_silent)) - return bch_err_throw(c, fsck_delete_bkey); - - fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; - } - if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) - fsck_flags |= fsck_flags_extra[err]; - - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "invalid bkey in %s", - bch2_bkey_validate_contexts[from.from]); - - if (from.from == BKEY_VALIDATE_journal) - prt_printf(&buf, " journal seq=%llu offset=%u", - from.journal_seq, from.journal_offset); - - prt_str(&buf, " btree="); - bch2_btree_id_to_text(&buf, from.btree); - prt_printf(&buf, " level=%u: ", from.level); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); - printbuf_exit(&buf); - return ret; -} - -static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) -{ - struct fsck_err_state *s, *n; - - mutex_lock(&c->fsck_error_msgs_lock); - - list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { - if (print && s->ratelimited && s->last_msg) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); - - list_del(&s->list); - kfree(s->last_msg); - kfree(s); - } - - mutex_unlock(&c->fsck_error_msgs_lock); -} - -void bch2_flush_fsck_errs(struct bch_fs *c) -{ - __bch2_flush_fsck_errs(c, true); -} - -void bch2_free_fsck_errs(struct bch_fs *c) -{ - __bch2_flush_fsck_errs(c, false); -} - -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) -{ - u32 restart_count = trans->restart_count; - int ret = 0; - - if (inum.subvol) { - ret = bch2_inum_to_path(trans, inum, out); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); - prt_printf(out, " offset %llu: ", offset); - - return trans_was_restarted(trans, restart_count); -} - -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) -{ - bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); -} - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bpos pos) -{ - int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); - if (ret) - return ret; - - prt_printf(out, " offset %llu: ", pos.offset << 8); - return 0; -} - -void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, - struct bpos pos) -{ - bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); -} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h deleted file mode 100644 index 0c3c3a24fc6f6d..00000000000000 --- a/fs/bcachefs/error.h +++ /dev/null @@ -1,258 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_ERROR_H -#define _BCACHEFS_ERROR_H - -#include <linux/list.h> -#include <linux/printk.h> -#include "bkey_types.h" -#include "sb-errors.h" - -struct bch_dev; -struct bch_fs; -struct work_struct; - -/* - * XXX: separate out errors that indicate on disk data is inconsistent, and flag - * superblock as such - */ - -/* Error messages: */ - -void __bch2_log_msg_start(const char *, struct printbuf *); - -static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) -{ - __bch2_log_msg_start(c->name,
out); -} - -/* - * Inconsistency errors: The on disk data is inconsistent. If these occur during - * initial recovery, they don't indicate a bug in the running code - we walk all - * the metadata before modifying anything. If they occur at runtime, they - * indicate either a bug in the running code or (less likely) data is being - * silently corrupted under us. - * - * XXX: audit all inconsistent errors and make sure they're all recoverable, in - * BCH_ON_ERROR_CONTINUE mode - */ - -bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); -bool bch2_inconsistent_error(struct bch_fs *); -__printf(2, 3) -bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); - -#define bch2_fs_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_fs_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -__printf(2, 3) -bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); - -#define bch2_trans_inconsistent_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - bch2_trans_inconsistent(__VA_ARGS__); \ - _ret; \ -}) - -int __bch2_topology_error(struct bch_fs *, struct printbuf *); -__printf(2, 3) -int bch2_fs_topology_error(struct bch_fs *, const char *, ...); - -/* - * Fsck errors: inconsistency errors we detect at mount time, and should ideally - * be able to repair: - */ - -struct fsck_err_state { - struct list_head list; - enum bch_sb_error_id id; - u64 nr; - bool ratelimited; - int ret; - int fix; - char *last_msg; -}; - -#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) - -bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); -#define bch2_count_fsck_err(_c, _err, ...) \ - __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) - -int bch2_fsck_err_opt(struct bch_fs *, - enum bch_fsck_flags, - enum bch_sb_error_id); - -__printf(5, 6) __cold -int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, - enum bch_fsck_flags, - enum bch_sb_error_id, - const char *, ...); -#define bch2_fsck_err(c, _flags, _err_type, ...) \ - __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\ - type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\ - _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) - -void bch2_flush_fsck_errs(struct bch_fs *); -void bch2_free_fsck_errs(struct bch_fs *); - -#define fsck_err_wrap(_do) \ -({ \ - int _ret = _do; \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ - ret = _ret; \ - goto fsck_err; \ - } \ - \ - bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ -}) - -#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) - -/* These macros return true if error should be fixed: */ - -/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ - -#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ -({ \ - might_sleep(); \ - \ - if (type_is(c, struct bch_fs *)) \ - WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\ - \ - (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ -}) - -#define mustfix_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) - -#define fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define fsck_err_on(cond, c, _err_type, ...) 
\ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) - -#define log_fsck_err_on(cond, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - if (_ret) \ - log_fsck_err(__VA_ARGS__); \ - _ret; \ -}) - -enum bch_validate_flags; -__printf(5, 6) -int __bch2_bkey_fsck_err(struct bch_fs *, - struct bkey_s_c, - struct bkey_validate_context from, - enum bch_sb_error_id, - const char *, ...); - -/* - * for now, bkey fsck errors are always handled by deleting the entire key - - * this will change at some point - */ -#define bkey_fsck_err(c, _err_type, _err_msg, ...) \ -do { \ - int _ret = __bch2_bkey_fsck_err(c, k, from, \ - BCH_FSCK_ERR_##_err_type, \ - _err_msg, ##__VA_ARGS__); \ - if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ - !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ - ret = _ret; \ - ret = bch_err_throw(c, fsck_delete_bkey); \ - goto fsck_err; \ -} while (0) - -#define bkey_fsck_err_on(cond, ...) \ -do { \ - if (unlikely(cond)) \ - bkey_fsck_err(__VA_ARGS__); \ -} while (0) - -/* - * Fatal errors: these don't indicate a bug, but we can't continue running in RW - * mode - pretty much just due to metadata IO errors: - */ - -void bch2_fatal_error(struct bch_fs *); - -#define bch2_fs_fatal_error(c, _msg, ...) \ -do { \ - bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \ - bch2_fatal_error(c); \ -} while (0) - -#define bch2_fs_fatal_err_on(cond, c, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_fs_fatal_error(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * IO errors: either recoverable metadata IO (because we have replicas), or data - * IO - we need to log it and print out a message, but we don't (necessarily) - * want to shut down the fs: - */ - -void bch2_io_error_work(struct work_struct *); - -/* Does the error handling without logging a message */ -void bch2_io_error(struct bch_dev *, enum bch_member_error_type); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -static inline void bch2_account_io_success_fail(struct bch_dev *ca, - enum bch_member_error_type type, - bool success) -{ - if (likely(success)) { - if (type == BCH_MEMBER_ERROR_write && - ca->write_errors_start) - ca->write_errors_start = 0; - } else { - bch2_io_error(ca, type); - } -} - -static inline void bch2_account_io_completion(struct bch_dev *ca, - enum bch_member_error_type type, - u64 submit_time, bool success) -{ - if (unlikely(!ca)) - return; - - if (type != BCH_MEMBER_ERROR_checksum) - bch2_latency_acct(ca, submit_time, type); - - bch2_account_io_success_fail(ca, type, success); -} - -int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); - -void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); - -int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); -void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); - -#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c deleted file mode 100644 index e76e58a568bffc..00000000000000 --- a/fs/bcachefs/extent_update.c +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include 
"buckets.h" -#include "debug.h" -#include "extents.h" -#include "extent_update.h" - -/* - * This counts the number of iterators to the alloc & ec btrees we'll need - * inserting/removing this extent: - */ -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0, lru = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - /* Might also be updating LRU btree */ - if (entry->ptr.cached) - lru++; - - fallthrough; - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - /* - * Updating keys in the alloc btree may also update keys in the - * freespace or discard btrees: - */ - return lru + ret * 2; -} - -#define EXTENT_ITERS_MAX 64 - -static int count_iters_for_insert(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters) -{ - int ret = 0, ret2 = 0; - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - *end = bpos_min(*end, k.k->p); - ret = 1; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = REFLINK_P_IDX(p.v); - unsigned sectors = bpos_min(*end, p.k->p).offset - - bkey_start_offset(p.k); - struct btree_iter iter; - struct bkey_s_c r_k; - - for_each_btree_key_norestart(trans, iter, - BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_slots, r_k, ret2) { - if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) - break; - - /* extent_update_to_keys(), for the reflink_v update */ - *nr_iters += 1; - - *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - - if (*nr_iters >= EXTENT_ITERS_MAX) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += min_t(u64, k.k->size, - r_k.k->p.offset - idx); - - *end = bpos_min(*end, pos); - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - break; - } - } - - return ret2 ?: ret; -} - -int bch2_extent_atomic_end(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos *end) -{ - unsigned nr_iters = 0; - - struct btree_iter copy; - bch2_trans_copy_iter(trans, ©, iter); - - int ret = bch2_btree_iter_traverse(trans, ©); - if (ret) - goto err; - - struct bkey_s_c k; - for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { - unsigned offset = 0; - - if (bkey_gt(iter->pos, bkey_start_pos(k.k))) - offset = iter->pos.offset - bkey_start_offset(k.k); - - ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, ©); - return ret < 0 ? 
ret : 0; -} - -int bch2_extent_trim_atomic(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) -{ - struct bpos end = k->k.p; - int ret = bch2_extent_atomic_end(trans, iter, &end); - if (ret) - return ret; - - /* tracepoint */ - - if (bpos_lt(end, k->k.p)) { - if (trace_extent_trim_atomic_enabled()) { - CLASS(printbuf, buf)(); - bch2_bpos_to_text(&buf, end); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); - trace_extent_trim_atomic(trans->c, buf.buf); - } - bch2_cut_back(end, k); - } - return 0; -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h deleted file mode 100644 index 34467db53f4575..00000000000000 --- a/fs/bcachefs/extent_update.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENT_UPDATE_H -#define _BCACHEFS_EXTENT_UPDATE_H - -#include "bcachefs.h" - -int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bpos *); -int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, - struct bkey_i *); - -#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c deleted file mode 100644 index 83cbd77dcb9cce..00000000000000 --- a/fs/bcachefs/extents.c +++ /dev/null @@ -1,1735 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> - * - * Code for managing the extent btree and dynamically updating the writeback - * dirty sector count. - */ - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "buckets.h" -#include "checksum.h" -#include "compress.h" -#include "debug.h" -#include "disk_groups.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "journal.h" -#include "rebalance.h" -#include "replicas.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" -#include "util.h" - -static const char * const bch2_extent_flags_strs[] = { -#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, - BCH_EXTENT_FLAGS() -#undef x - NULL, -}; - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *, - struct bch_extent_crc_unpacked, - enum bch_extent_entry_type); - -void bch2_io_failures_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_io_failures *failed) -{ - static const char * const error_types[] = { - "btree validate", "io", "checksum", "ec reconstruct", NULL - }; - - for (struct bch_dev_io_failures *f = failed->devs; - f < failed->devs + failed->nr; - f++) { - unsigned errflags = - ((!!f->failed_btree_validate) << 0) | - ((!!f->failed_io) << 1) | - ((!!f->failed_csum_nr) << 2) | - ((!!f->failed_ec) << 3); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - scoped_guard(rcu) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); - if (ca) - prt_str(out, ca->name); - else - prt_printf(out, "(invalid device %u)", f->dev); - } - --out->atomic; - - prt_char(out, ' '); - - if (!errflags) { - prt_str(out, "no error - confused"); - } else if (is_power_of_2(errflags)) { - prt_bitflags(out, error_types, errflags); - prt_str(out, " error"); - } else { - prt_str(out, "errors: "); - prt_bitflags(out, error_types, errflags); - } - prt_newline(out); - } -} - -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, - unsigned dev) -{ -
struct bch_dev_io_failures *i; - - for (i = f->devs; i < f->devs + f->nr; i++) - if (i->dev == dev) - return i; - - return NULL; -} - -void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p, - bool csum_error) -{ - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); - - if (!f) { - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; - memset(f, 0, sizeof(*f)); - f->dev = p->ptr.dev; - } - - if (p->do_ec_reconstruct) - f->failed_ec = true; - else if (!csum_error) - f->failed_io = true; - else - f->failed_csum_nr++; -} - -void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, - unsigned dev) -{ - struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); - - if (!f) { - BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - - f = &failed->devs[failed->nr++]; - memset(f, 0, sizeof(*f)); - f->dev = dev; - } - - f->failed_btree_validate = true; -} - -static inline u64 dev_latency(struct bch_dev *ca) -{ - return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; -} - -static inline int dev_failed(struct bch_dev *ca) -{ - return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; -} - -/* - * returns true if p1 is better than p2: - */ -static inline bool ptr_better(struct bch_fs *c, - const struct extent_ptr_decoded p1, - u64 p1_latency, - struct bch_dev *ca1, - const struct extent_ptr_decoded p2, - u64 p2_latency) -{ - struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - - int failed_delta = dev_failed(ca1) - dev_failed(ca2); - if (unlikely(failed_delta)) - return failed_delta < 0; - - if (static_branch_unlikely(&bch2_force_reconstruct_read)) - return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - - if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) - return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - - int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; - if (unlikely(crc_retry_delta)) - return crc_retry_delta < 0; - - /* Pick at random, biased in favor of the faster device: */ - - return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; -} - -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. - */ -int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_failures *failed, - struct extent_ptr_decoded *pick, - int dev) -{ - bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; - bool have_dirty_ptrs = false, have_pick = false; - - if (k.k->type == KEY_TYPE_error) - return bch_err_throw(c, key_type_error); - - rcu_read_lock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 pick_latency; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - have_dirty_ptrs |= !p.ptr.cached; - - /* - * Unwritten extent: no need to actually read, treat it as a - * hole and return 0s: - */ - if (p.ptr.unwritten) { - rcu_read_unlock(); - return 0; - } - - /* Are we being asked to read from a specific device? 
*/ - if (dev >= 0 && p.ptr.dev != dev) - continue; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - - if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { - rcu_read_unlock(); - int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); - if (ret) - return ret; - rcu_read_lock(); - } - - if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) - continue; - - struct bch_dev_io_failures *f = - unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (unlikely(f)) { - p.crc_retry_nr = f->failed_csum_nr; - p.has_ec &= ~f->failed_ec; - - if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { - have_io_errors |= f->failed_io; - have_io_errors |= f->failed_btree_validate; - have_io_errors |= f->failed_ec; - } - have_csum_errors |= !!f->failed_csum_nr; - - if (p.has_ec && (f->failed_io || f->failed_csum_nr)) - p.do_ec_reconstruct = true; - else if (f->failed_io || - f->failed_btree_validate || - f->failed_csum_nr > c->opts.checksum_err_retry_nr) - continue; - } - - have_missing_devs |= ca && !bch2_dev_is_online(ca); - - if (!ca || !bch2_dev_is_online(ca)) { - if (!p.has_ec) - continue; - p.do_ec_reconstruct = true; - } - - if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) - p.do_ec_reconstruct = true; - - u64 p_latency = dev_latency(ca); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - p_latency *= p_latency; - - if (!have_pick || - ptr_better(c, - p, p_latency, ca, - *pick, pick_latency)) { - *pick = p; - pick_latency = p_latency; - have_pick = true; - } - } - rcu_read_unlock(); - - if (have_pick) - return 1; - if (!have_dirty_ptrs) - return 0; - if (have_missing_devs) - return bch_err_throw(c, no_device_to_read_from); - if (have_csum_errors) - return bch_err_throw(c, data_read_csum_err); - if (have_io_errors) - return bch_err_throw(c, data_read_io_err); - - /* - * If we get here, we have pointers (bkey_ptrs_validate() ensures that), - * but they don't point to valid devices: - */ - return bch_err_throw(c, no_devices_valid); -} - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, - c, btree_ptr_val_too_big, - "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, btree_ptr_v2_val_too_big, - "value too big (%zu > %zu)", - bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - - bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, btree_ptr_v2_min_key_bad, - "min_key > key"); - - if ((from.flags & BCH_VALIDATE_write) && - c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) - bkey_fsck_err_on(!bp.v->sectors_written, - c, btree_ptr_v2_written_0, - "sectors_written == 0"); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - 
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - prt_printf(out, "seq %llx written %u min_key %s", - le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors_written), - BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); - - bch2_bpos_to_text(out, bp.v->min_key); - prt_printf(out, " "); - bch2_bkey_ptrs_to_text(out, c, k); -} - -void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, - unsigned big_endian, int write, - struct bkey_s k) -{ - struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); - - compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); - - if (version < bcachefs_metadata_version_inode_btree_change && - btree_id_is_extents(btree_id) && - !bkey_eq(bp.v->min_key, POS_MIN)) - bp.v->min_key = write - ? bpos_nosnap_predecessor(bp.v->min_key) - : bpos_nosnap_successor(bp.v->min_key); -} - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -{ - struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); - struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); - union bch_extent_entry *en_l; - const union bch_extent_entry *en_r; - struct extent_ptr_decoded lp, rp; - bool use_right_ptr; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return false; - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - if (en_l < l_ptrs.end || en_r < r_ptrs.end) - return false; - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - lp.crc = bch2_extent_crc_unpack(l.k, NULL); - rp.crc = bch2_extent_crc_unpack(r.k, NULL); - - guard(rcu)(); - - while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && - __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { - if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != - rp.ptr.offset + rp.crc.offset || - lp.ptr.dev != rp.ptr.dev || - lp.ptr.gen != rp.ptr.gen || - lp.ptr.unwritten != rp.ptr.unwritten || - lp.has_ec != rp.has_ec) - return false; - - /* Extents may not straddle buckets: */ - struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); - bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - - if (!same_bucket) - return false; - - if (lp.has_ec != rp.has_ec || - (lp.has_ec && - (lp.ec.block != rp.ec.block || - lp.ec.redundancy != rp.ec.redundancy || - lp.ec.idx != rp.ec.idx))) - return false; - - if (lp.crc.compression_type != rp.crc.compression_type || - lp.crc.nonce != rp.crc.nonce) - return false; - - if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= - lp.crc.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (lp.crc.live_size <= rp.crc.offset) { - /* can use right extent's crc entry */ - } else { - /* check if checksums can be merged: */ - if (lp.crc.csum_type != rp.crc.csum_type || - lp.crc.nonce != rp.crc.nonce || - crc_is_compressed(lp.crc) || - !bch2_checksum_mergeable(lp.crc.csum_type)) - return false; - - if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || - rp.crc.offset) - return false; - - if (lp.crc.csum_type && - lp.crc.uncompressed_size + - rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end && en_r < r_ptrs.end) { - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = 
bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.uncompressed_size + crc_r.uncompressed_size > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return false; - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - use_right_ptr = false; - en_l = l_ptrs.start; - en_r = r_ptrs.start; - while (en_l < l_ptrs.end) { - if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && - use_right_ptr) - en_l->ptr = en_r->ptr; - - if (extent_entry_is_crc(en_l)) { - struct bch_extent_crc_unpacked crc_l = - bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - struct bch_extent_crc_unpacked crc_r = - bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - use_right_ptr = false; - - if (crc_l.offset + crc_l.live_size + crc_r.live_size <= - crc_l.uncompressed_size) { - /* can use left extent's crc entry */ - } else if (crc_l.live_size <= crc_r.offset) { - /* can use right extent's crc entry */ - crc_r.offset -= crc_l.live_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, - extent_entry_type(en_l)); - use_right_ptr = true; - } else { - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); - } - } - - en_l = extent_entry_next(en_l); - en_r = extent_entry_next(en_r); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - int ret = 0; - - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, - c, reservation_key_nr_replicas_invalid, - "invalid nr_replicas (%u)", r.v->nr_replicas); -fsck_err: - return ret; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - prt_printf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} - -bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); - - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true; -} - -/* Extent checksum entries: */ - -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) -{ - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); -} - -static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, - struct bch_extent_crc_unpacked n) -{ - return !crc_is_compressed(u) && - u.csum_type && - u.uncompressed_size > u.live_size && - bch2_csum_type_is_encryption(u.csum_type) == - bch2_csum_type_is_encryption(n.csum_type); -} - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, - struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union 
bch_extent_entry *i; - - if (!n.csum_type) - return false; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (can_narrow_crc(crc, n)) - return true; - - return false; -} - -/* - * We're writing another replica for this extent, so while we've got the data in - * memory we'll be computing a new checksum for the currently live data. - * - * If there are other replicas we aren't moving, and they are checksummed but - * not compressed, we can modify them to point to only the data that is - * currently live (so that readers won't have to bounce) while we've got the - * checksum we need: - */ -bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked u; - struct extent_ptr_decoded p; - union bch_extent_entry *i; - bool ret = false; - - /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) { - bkey_for_each_crc(&k->k, ptrs, u, i) - if (!crc_is_compressed(u) && - u.csum_type && - u.live_size == u.uncompressed_size) { - n = u; - goto found; - } - return false; - } -found: - BUG_ON(crc_is_compressed(n)); - BUG_ON(n.offset); - BUG_ON(n.live_size != k->k.size); - -restart_narrow_pointers: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - bkey_for_each_ptr_decode(&k->k, ptrs, p, i) - if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); - p.ptr.offset += p.crc.offset; - p.crc = n; - bch2_extent_ptr_decoded_append(k, &p); - ret = true; - goto restart_narrow_pointers; - } - - return ret; -} - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src, - enum bch_extent_entry_type type) -{ -#define common_fields(_src) \ - .type = BIT(type), \ - .csum_type = _src.csum_type, \ - .compression_type = _src.compression_type, \ - ._compressed_size = _src.compressed_size - 1, \ - ._uncompressed_size = _src.uncompressed_size - 1, \ - .offset = _src.offset - - switch (type) { - case BCH_EXTENT_ENTRY_crc32: - dst->crc32 = (struct bch_extent_crc32) { - common_fields(src), - .csum = (u32 __force) *((__le32 *) &src.csum.lo), - }; - break; - case BCH_EXTENT_ENTRY_crc64: - dst->crc64 = (struct bch_extent_crc64) { - common_fields(src), - .nonce = src.nonce, - .csum_lo = (u64 __force) src.csum.lo, - .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), - }; - break; - case BCH_EXTENT_ENTRY_crc128: - dst->crc128 = (struct bch_extent_crc128) { - common_fields(src), - .nonce = src.nonce, - .csum = src.csum, - }; - break; - default: - BUG(); - } -#undef set_common_fields -} - -void bch2_extent_crc_append(struct bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - enum bch_extent_entry_type type; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - type = BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new, type); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -/* Generic code for keys with pointers: */ - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - 
return bch2_bkey_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation - ? bkey_s_c_to_reservation(k).v->nr_replicas - : bch2_bkey_dirty_devs(k).nr; -} - -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; - - if (k.k->type == KEY_TYPE_reservation) { - ret = bkey_s_c_to_reservation(k).v->nr_replicas; - } else { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - ret += !p.ptr.cached && !crc_is_compressed(p.crc); - } - - return ret; -} - -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && crc_is_compressed(p.crc)) - ret += p.crc.compressed_size; - - return ret; -} - -bool bch2_bkey_is_incompressible(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - - bkey_for_each_crc(k.k, ptrs, crc, entry) - if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) - return true; - return false; -} - -unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p = { 0 }; - unsigned replicas = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (p.has_ec) - replicas += p.ec.redundancy; - - replicas++; - - } - - return replicas; -} - -static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) -{ - if (p->ptr.cached) - return 0; - - return p->has_ec - ? p->ec.redundancy + 1 - : ca->mi.durability; -} - -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - return ca ? 
__extent_ptr_durability(ca, p) : 0; -} - -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -{ - struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - - if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) - return 0; - - return __extent_ptr_durability(ca, p); -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) - durability += bch2_extent_ptr_durability(c, &p); - return durability; -} - -void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - union bch_extent_entry *next = extent_entry_next(entry); - - memmove_u64s(entry, next, (u64 *) end - (u64 *) next); - k->k.u64s -= extent_entry_u64s(entry); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - if (p->has_ec) { - p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec)); - } -} - -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) -{ - union bch_extent_entry *i = ptrs.start; - - if (i == entry) - return NULL; - - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; -} - -/* - * Returns pointer to the next entry after the one being dropped: - */ -void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry = to_entry(ptr), *next; - bool drop_crc = true; - - if (k.k->type == KEY_TYPE_stripe) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - - for (next = extent_entry_next(entry); - next != ptrs.end; - next = extent_entry_next(next)) { - if (extent_entry_is_crc(next)) { - break; - } else if (extent_entry_is_ptr(next)) { - drop_crc = false; - break; - } - } - - extent_entry_drop(k, entry); - - while ((entry = extent_entry_prev(ptrs, entry))) { - if (extent_entry_is_ptr(entry)) - break; - - if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) - extent_entry_drop(k, entry); - } -} - -void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) -{ - if (k.k->type != KEY_TYPE_stripe) { - struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == ptr->dev && p.has_ec) { - ptr->dev = BCH_SB_MEMBER_INVALID; - return; - } - } - - bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - - bch2_bkey_drop_ptr_noerror(k, ptr); - - /* - * If we deleted all the dirty pointers and there's still cached - * pointers, we could set the cached pointers to dirty if they're not - * stale - but to do that correctly we'd need to grab an open_bucket - * reference so that we don't race with bucket reuse: - */ - if (have_dirty && - !bch2_bkey_dirty_devs(k.s_c).nr) { - k.k->type = KEY_TYPE_error; - set_bkey_val_u64s(k.k, 0); - } else if (!bch2_bkey_nr_ptrs(k.s_c)) { - k.k->type = KEY_TYPE_deleted; - set_bkey_val_u64s(k.k, 0); - } -} - -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -} - -void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) -{ - bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); -} - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; - - return NULL; -} - -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_dev *ca; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (ca = bch2_dev_rcu(c, ptr->dev)) && - (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) - return true; - - return false; -} - -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; - - return false; -} - -/* - * Returns true if two extents refer to the same data: - */ -bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) -{ - if (k1.k->type != k2.k->type) - return false; - - if (bkey_extent_is_direct_data(k1.k)) { - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry1, *entry2; - struct extent_ptr_decoded p1, p2; - - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) - return false; - - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - - /* - * This checks that the two pointers point - * to the same region on disk - adjusting - * for the difference in where the extents - * start, since one may have been trimmed: - */ - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && - - /* - * This additionally checks that the - * extents overlap on disk, since the - * previous check may trigger spuriously - * when one extent is immediately partially - * overwritten with another extent (so that - * on disk they are adjacent) and - * compression is in use: - */ - ((p1.ptr.offset >= p2.ptr.offset && - p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || - 
(p2.ptr.offset >= p1.ptr.offset && - p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) - return true; - - return false; - } else { - /* KEY_TYPE_deleted, etc. */ - return true; - } -} - -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) -{ - struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); - union bch_extent_entry *entry2; - struct extent_ptr_decoded p2; - - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) - if (p1.ptr.dev == p2.ptr.dev && - p1.ptr.gen == p2.ptr.gen && - (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return &entry2->ptr; - - return NULL; -} - -static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, - struct bch_extent_ptr *ptr) -{ - unsigned target = opts->promote_target ?: opts->foreground_target; - - if (target && !bch2_dev_in_target(c, ptr->dev, target)) - return false; - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - - return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); -} - -void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k, - struct bch_extent_ptr *ptr) -{ - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool have_cached_ptr; - unsigned drop_dev = ptr->dev; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - /* - * Check if it's erasure coded - stripes can't contain cached - * data. Possibly something we can fix in the future? - */ - if (&entry->ptr == ptr && p.has_ec) - goto drop; - - if (p.ptr.cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { - bch2_bkey_drop_ptr_noerror(k, &entry->ptr); - ptr = NULL; - goto restart_drop_ptrs; - } - - have_cached_ptr = true; - } - } - - if (!ptr) - bkey_for_each_ptr(ptrs, ptr2) - if (ptr2->dev == drop_dev) - ptr = ptr2; - - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) - goto drop; - - ptr->cached = true; - return; -drop: - bch2_bkey_drop_ptr_noerror(k, ptr); -} - -/* - * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. - */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_dev *ca; - - guard(rcu)(); - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - (!(ca = bch2_dev_rcu(c, ptr->dev)) || - dev_ptr_stale_rcu(ca, ptr) > 0)); - - return bkey_deleted(k.k); -} - -/* - * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. - * - * Like bch2_extent_normalize(), but also only keeps a single cached pointer on - * the promote target. 
- */ -bool bch2_extent_normalize_by_opts(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s k) -{ - struct bkey_ptrs ptrs; - bool have_cached_ptr; - - guard(rcu)(); -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(k); - have_cached_ptr = false; - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->cached) { - if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr(k, ptr); - goto restart_drop_ptrs; - } - have_cached_ptr = true; - } - - return bkey_deleted(k.k); -} - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) -{ - out->atomic++; - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ca->mi.durability != 1) - prt_printf(out, " d=%u", ca->mi.durability); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - int stale = dev_ptr_stale_rcu(ca, ptr); - if (stale > 0) - prt_printf(out, " stale"); - else if (stale) - prt_printf(out, " invalid"); - } - --out->atomic; -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) -{ - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", - crc->compressed_size, - crc->uncompressed_size, - crc->offset, crc->nonce); - bch2_prt_csum_type(out, crc->csum_type); - prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); - prt_str(out, " compress "); - bch2_prt_compression_type(out, crc->compression_type); -} - -static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, - const struct bch_extent_rebalance *r) -{ - prt_str(out, "rebalance:"); - - prt_printf(out, " replicas=%u", r->data_replicas); - if (r->data_replicas_from_inode) - prt_str(out, " (inode)"); - - prt_str(out, " checksum="); - bch2_prt_csum_opt(out, r->data_checksum); - if (r->data_checksum_from_inode) - prt_str(out, " (inode)"); - - if (r->background_compression || r->background_compression_from_inode) { - prt_str(out, " background_compression="); - bch2_compression_opt_to_text(out, r->background_compression); - - if (r->background_compression_from_inode) - prt_str(out, " (inode)"); - } - - if (r->background_target || r->background_target_from_inode) { - prt_str(out, " background_target="); - if (c) - bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - - if (r->background_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->promote_target || r->promote_target_from_inode) { - prt_str(out, " promote_target="); - if (c) - bch2_target_to_text(out, c, r->promote_target); - else - prt_printf(out, "%u", r->promote_target); - - if (r->promote_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->erasure_code || r->erasure_code_from_inode) { - prt_printf(out, " ec=%u", r->erasure_code); - if (r->erasure_code_from_inode) - prt_str(out, " (inode)"); - } -} - -void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - bool first = true; - - if (c) - prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); - - bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - 
prt_printf(out, " "); - - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); - break; - - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bch2_extent_crc_unpacked_to_text(out, &crc); - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: { - const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; - - prt_printf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); - break; - } - case BCH_EXTENT_ENTRY_rebalance: - bch2_extent_rebalance_to_text(out, c, &entry->rebalance); - break; - - case BCH_EXTENT_ENTRY_flags: - prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); - break; - - default: - prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); - return; - } - - first = false; - } -} - -static int extent_ptr_validate(struct bch_fs *c, - struct bkey_s_c k, - struct bkey_validate_context from, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata) -{ - int ret = 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - /* bad pointers are repaired by check_fix_ptrs(): */ - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - if (!ca) { - rcu_read_unlock(); - return 0; - } - u32 bucket_offset; - u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - unsigned first_bucket = ca->mi.first_bucket; - u64 nbuckets = ca->mi.nbuckets; - unsigned bucket_size = ca->mi.bucket_size; - rcu_read_unlock(); - - bkey_fsck_err_on(bucket >= nbuckets, - c, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, - c, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, - c, ptr_spans_multiple_buckets, - "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, bucket_size); -fsck_err: - return ret; -} - -int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - unsigned size_ondisk = k.k->size; - unsigned nonce = UINT_MAX; - unsigned nr_ptrs = 0; - bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; - int ret = 0; - - if (bkey_is_btree_ptr(k.k)) - size_ondisk = btree_sectors(c); - - bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, - c, extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - - bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), - c, btree_ptr_has_non_ptr, - "has non ptr field"); - - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); - if (ret) - return ret; - - bkey_fsck_err_on(entry->ptr.cached && have_ec, - c, ptr_cached_and_erasure_coded, - "cached, erasure coded ptr"); - - if (!entry->ptr.unwritten) - have_written = true; - else - have_unwritten = true; - - have_ec = false; 
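
As a reading aid, a condensed sketch of the bucket-bounds arithmetic that extent_ptr_validate() above performs; the function name is invented and a uniform bucket size is assumed, whereas the real code reads first_bucket, nbuckets, and bucket_size per device under RCU:

	#include <stdbool.h>
	#include <stdint.h>

	static bool toy_ptr_in_bounds(uint64_t offset,	/* in 512-byte sectors */
				      unsigned size_ondisk,
				      uint64_t first_bucket, uint64_t nbuckets,
				      unsigned bucket_size)
	{
		uint64_t bucket        = offset / bucket_size;	/* sector_to_bucket_and_offset() */
		unsigned bucket_offset = offset % bucket_size;

		return bucket >= first_bucket &&			/* ptr_before_first_bucket */
		       bucket < nbuckets &&				/* ptr_after_last_bucket */
		       bucket_offset + size_ondisk <= bucket_size;	/* ptr_spans_multiple_buckets */
	}
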
- crc_since_last_ptr = false; - nr_ptrs++; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), - c, ptr_crc_csum_type_unknown, - "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, - c, ptr_crc_compression_type_unknown, - "invalid compression type"); - - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - bkey_fsck_err_on(!crc_is_compressed(crc) && - crc.compressed_size != crc.uncompressed_size, - c, ptr_crc_uncompressed_size_mismatch, - "not compressed but compressed != uncompressed size"); - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, ptr_crc_nonce_mismatch, - "incorrect nonce"); - } - - bkey_fsck_err_on(crc_since_last_ptr, - c, ptr_crc_redundant, - "redundant crc entry"); - crc_since_last_ptr = true; - - size_ondisk = crc.compressed_size; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, - c, ptr_stripe_redundant, - "redundant stripe entry"); - have_ec = true; - break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif - break; - } - case BCH_EXTENT_ENTRY_flags: - bkey_fsck_err_on(entry != ptrs.start, - c, extent_flags_not_at_start, - "extent flags entry not at start"); - break; - } - } - - bkey_fsck_err_on(!nr_ptrs, - c, extent_ptrs_no_ptrs, - "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, - c, extent_ptrs_too_many_ptrs, - "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, - c, extent_ptrs_written_and_unwritten, - "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, - c, extent_ptrs_unwritten, - "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, - c, extent_ptrs_redundant_crc, - "redundant crc entry"); - bkey_fsck_err_on(have_ec, - c, extent_ptrs_redundant_stripe, - "redundant stripe entry"); -fsck_err: - return ret; -} - -void bch2_ptr_swab(struct bkey_s k) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - u64 *d; - - for (d = (u64 *) ptrs.start; - d != (u64 *) ptrs.end; - d++) - *d = swab64(*d); - - for (entry = ptrs.start; - entry < ptrs.end; - entry = extent_entry_next(entry)) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo 
= swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - break; - case BCH_EXTENT_ENTRY_rebalance: - break; - default: - /* Bad entry type: will be caught by validate() */ - return; - } - } -} - -int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) -{ - int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); - if (ret) - return ret; - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { - ptrs.start->flags.flags = flags; - } else { - struct bch_extent_flags f = { - .type = BIT(BCH_EXTENT_ENTRY_flags), - .flags = flags, - }; - __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); - } - - return 0; -} - -/* Generic extent code: */ - -int bch2_cut_front_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 sub; - - if (bkey_le(where, bkey_start_pos(k.k))) - return 0; - - EBUG_ON(bkey_gt(where, k.k->p)); - - sub = where.offset - bkey_start_offset(k.k); - - k.k->size -= sub; - - if (!k.k->size) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - bool seen_crc = false; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - if (!seen_crc) - entry->ptr.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.offset += sub; - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.offset += sub; - break; - case BCH_EXTENT_ENTRY_stripe_ptr: - case BCH_EXTENT_ENTRY_rebalance: - case BCH_EXTENT_ENTRY_flags: - break; - } - - if (extent_entry_is_crc(entry)) - seen_crc = true; - } - - break; - } - case KEY_TYPE_reflink_p: { - struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - - SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); - break; - } - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: { - void *p = bkey_inline_data_p(k); - unsigned bytes = bkey_inline_data_bytes(k.k); - - sub = min_t(u64, sub << 9, bytes); - - memmove(p, p + sub, bytes - sub); - - new_val_u64s -= sub >> 3; - break; - } - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} - -int bch2_cut_back_s(struct bpos where, struct bkey_s k) -{ - unsigned new_val_u64s = bkey_val_u64s(k.k); - int val_u64s_delta; - u64 len = 0; - - if (bkey_ge(where, k.k->p)) - return 0; - - EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); - - len = where.offset - bkey_start_offset(k.k); - - k.k->p.offset = where.offset; - k.k->size = len; - - if (!len) { - k.k->type = KEY_TYPE_deleted; - new_val_u64s = 0; - } - - switch (k.k->type) { - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - new_val_u64s = (bkey_inline_data_offset(k.k) + - min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; - break; - } - - val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; - BUG_ON(val_u64s_delta < 0); - - 
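
A toy model, with invented names, of the offset bookkeeping in bch2_cut_front_s() above, which is the subtle part: with no crc entry the pointer's device offset simply advances, while with a crc entry the pointer stays put and the crc's offset into the originally-written extent grows instead. Everything here is in sectors:

	#include <stdbool.h>
	#include <stdint.h>

	struct toy_extent {
		uint64_t start, size;	/* logical extent */
		uint64_t ptr_offset;	/* on-disk position */
		uint64_t crc_offset;	/* offset into the originally-written extent */
		bool	 has_crc;
	};

	static void toy_cut_front(uint64_t where, struct toy_extent *e)
	{
		if (where <= e->start)
			return;

		uint64_t sub = where - e->start;

		e->start += sub;
		e->size  -= sub;
		if (e->has_crc)
			e->crc_offset += sub;	/* must read back the whole original write */
		else
			e->ptr_offset += sub;	/* can point straight at the live data */
	}

Cutting the back (bch2_cut_back_s()) is simpler: only the key's end position and size change, so no per-entry fixups are needed, except for inline data, where the value itself is truncated.
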
set_bkey_val_u64s(k.k, new_val_u64s); - memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); - return -val_u64s_delta; -} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h deleted file mode 100644 index b8590e51b76e62..00000000000000 --- a/fs/bcachefs/extents.h +++ /dev/null @@ -1,768 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_H -#define _BCACHEFS_EXTENTS_H - -#include "bcachefs.h" -#include "bkey.h" -#include "extents_types.h" - -struct bch_fs; -struct btree_trans; - -/* extent entries: */ - -#define extent_entry_last(_e) \ - ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) - -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ -}) - -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *) && \ - !type_is(_entry, struct bch_extent_stripe_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *) ||\ - type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) - -#define extent_entry_next_safe(_entry, _end) \ - (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ - ? extent_entry_next(_entry) \ - : _end) - -static inline unsigned -__extent_entry_type(const union bch_extent_entry *e) -{ - return e->type ? 
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -} - -static inline enum bch_extent_entry_type -extent_entry_type(const union bch_extent_entry *e) -{ - int ret = __ffs(e->type); - - EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); - - return ret; -} - -static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -{ - switch (extent_entry_type(entry)) { -#define x(f, n) \ - case BCH_EXTENT_ENTRY_##f: \ - return sizeof(struct bch_extent_##f); - BCH_EXTENT_ENTRY_TYPES() -#undef x - default: - BUG(); - } -} - -static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -{ - return extent_entry_bytes(entry) / sizeof(u64); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -} - -static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -{ - union bch_extent_entry *next = extent_entry_next(entry); - - /* stripes have ptrs, but their layout doesn't work with this code */ - BUG_ON(k.k->type == KEY_TYPE_stripe); - - memmove_u64s_down(entry, next, - (u64 *) bkey_val_end(k) - (u64 *) next); - k.k->u64s -= (u64 *) next - (u64 *) entry; -} - -static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -} - -static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) -{ - return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; -} - -static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -{ - switch (__extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - return true; - default: - return false; - } -} - -union bch_extent_crc { - u8 type; - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; -}; - -#define __entry_to_crc(_entry) \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const union bch_extent_crc *) (_entry), \ - (union bch_extent_crc *) (_entry)) - -#define entry_to_crc(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ - \ - __entry_to_crc(_entry); \ -}) - -static inline struct bch_extent_crc_unpacked -bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -{ -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - .compressed_size = _crc._compressed_size + 1, \ - .uncompressed_size = _crc._uncompressed_size + 1, \ - .offset = _crc.offset, \ - .live_size = k->size - - if (!crc) - return (struct bch_extent_crc_unpacked) { - .compressed_size = k->size, - .uncompressed_size = k->size, - .live_size = k->size, - }; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc32), - }; - - *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; - return ret; - } - case BCH_EXTENT_ENTRY_crc64: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc64), - .nonce = crc->crc64.nonce, - .csum.lo = (__force __le64) crc->crc64.csum_lo, - }; - - *((__le16 *) &ret.csum.hi) = (__le16 __force) 
crc->crc64.csum_hi; - - return ret; - } - case BCH_EXTENT_ENTRY_crc128: { - struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { - common_fields(crc->crc128), - .nonce = crc->crc128.nonce, - .csum = crc->crc128.csum, - }; - - return ret; - } - default: - BUG(); - } -#undef common_fields -} - -static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -{ - return (crc.compression_type != BCH_COMPRESSION_TYPE_none && - crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -} - -static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) -{ - return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); -} - -void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *); - -/* bkey_ptrs: generically over any key type that has ptrs */ - -struct bkey_ptrs_c { - const union bch_extent_entry *start; - const union bch_extent_entry *end; -}; - -struct bkey_ptrs { - union bch_extent_entry *start; - union bch_extent_entry *end; -}; - -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } - case KEY_TYPE_btree_ptr_v2: { - struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); - - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - default: - return (struct bkey_ptrs_c) { NULL, NULL }; - } -} - -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); - - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; -} - -#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ - for ((_entry) = (_start); \ - (_entry) < (_end); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define __bkey_ptr_next(_ptr, _end) \ -({ \ - typeof(_end) _entry; \ - \ - __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ - if (extent_entry_is_ptr(_entry)) \ - break; \ - \ - _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -}) - -#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) - -#define bkey_extent_entry_for_each(_p, _entry) \ - bkey_extent_entry_for_each_from(_p, _entry, _p.start) - -#define __bkey_for_each_ptr(_start, _end, _ptr) \ - for (typeof(_start) (_ptr) = (_start); \ - ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ - (_ptr)++) - -#define bkey_ptr_next(_p, _ptr) \ - __bkey_ptr_next(_ptr, (_p).end) - -#define bkey_for_each_ptr(_p, _ptr) \ - __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) - -#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -({ \ - __label__ out; \ - \ - (_ptr).has_ec = false; \ - (_ptr).do_ec_reconstruct = false; \ - (_ptr).crc_retry_nr = 0; \ - \ - __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (__extent_entry_type(_entry)) { \ - case BCH_EXTENT_ENTRY_ptr: \ - (_ptr).ptr = _entry->ptr; \ - goto out; \ - case BCH_EXTENT_ENTRY_crc32: \ - case BCH_EXTENT_ENTRY_crc64: \ - case BCH_EXTENT_ENTRY_crc128: \ - (_ptr).crc = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_entry)); \ - break; \ - case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec = _entry->stripe_ptr; \ - (_ptr).has_ec = true; \ - break; \ - default: \ - /* nothing */ \ - break; \ - } \ -out: \ - _entry < (_end); \ -}) - -#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ - for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ - (_entry) = _start; \ - __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next_safe(_entry, _end)) - -#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ - __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ - _ptr, _entry) - -#define bkey_crc_next(_k, _end, _crc, _iter) \ -({ \ - __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack(_k, \ - entry_to_crc(_iter)); \ - break; \ - } \ - \ - (_iter) < (_end); \ -}) - -#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ - (_iter) = (_start); \ - bkey_crc_next(_k, _end, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) - -#define bkey_for_each_crc(_k, _p, _crc, _iter) \ - __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) - -/* Iterate over pointers in KEY_TYPE_extent: */ - -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) - -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) - -/* utility code common to all keys with pointers: */ - -void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_failures *); -struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, - unsigned); -void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *, bool); -void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); -int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, - struct bch_io_failures *, - struct extent_ptr_decoded *, int); - -/* KEY_TYPE_btree_ptr: */ - -int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); - -int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - struct 
bkey_validate_context); -void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, - int, struct bkey_s); - -#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_validate, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_extent, \ -}) - -#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_validate = bch2_btree_ptr_v2_validate, \ - .val_to_text = bch2_btree_ptr_v2_to_text, \ - .swab = bch2_ptr_swab, \ - .compat = bch2_btree_ptr_v2_compat, \ - .trigger = bch2_trigger_extent, \ - .min_val_size = 40, \ -}) - -/* KEY_TYPE_extent: */ - -bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_validate = bch2_bkey_ptrs_validate, \ - .val_to_text = bch2_bkey_ptrs_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_extent_normalize, \ - .key_merge = bch2_extent_merge, \ - .trigger = bch2_trigger_extent, \ -}) - -/* KEY_TYPE_reservation: */ - -int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); - -#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_validate = bch2_reservation_validate, \ - .val_to_text = bch2_reservation_to_text, \ - .key_merge = bch2_reservation_merge, \ - .trigger = bch2_trigger_reservation, \ - .min_val_size = 8, \ -}) - -/* Extent checksum entries: */ - -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -void bch2_extent_crc_append(struct bkey_i *, - struct bch_extent_crc_unpacked); - -/* Generic code for keys with pointers: */ - -static inline bool bkey_is_btree_ptr(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_direct_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_inline_data(const struct bkey *k) -{ - return k->type == KEY_TYPE_inline_data || - k->type == KEY_TYPE_indirect_inline_data; -} - -static inline unsigned bkey_inline_data_offset(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_inline_data: - return sizeof(struct bch_inline_data); - case KEY_TYPE_indirect_inline_data: - return sizeof(struct bch_indirect_inline_data); - default: - BUG(); - } -} - -static inline unsigned bkey_inline_data_bytes(const struct bkey *k) -{ - return bkey_val_bytes(k) - bkey_inline_data_offset(k); -} - -#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - return bkey_extent_is_direct_data(k) || - bkey_extent_is_inline_data(k) || - k->type == KEY_TYPE_reflink_p; -} - -/* - * Should extent be counted under inode->i_sectors? 
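
A sketch of the inline-data layout that bkey_inline_data_offset() and bkey_inline_data_bytes() above decode: a fixed per-type header followed by the file bytes, so the byte count is simply what remains of the value. The struct is a stand-in, not the real bch_inline_data:

	#include <stddef.h>
	#include <stdint.h>

	struct toy_inline_data {
		uint64_t	hdr;	/* stands in for struct bch_val plus type header */
		unsigned char	data[];
	};

	static size_t toy_inline_data_bytes(size_t val_bytes)
	{
		return val_bytes - offsetof(struct toy_inline_data, data);
	}
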
- */ -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reservation: - case KEY_TYPE_reflink_p: - case KEY_TYPE_reflink_v: - case KEY_TYPE_inline_data: - case KEY_TYPE_indirect_inline_data: - case KEY_TYPE_error: - return true; - default: - return false; - } -} - -static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->unwritten) - return true; - return false; -} - -static inline bool bkey_extent_is_reservation(struct bkey_s_c k) -{ - return k.k->type == KEY_TYPE_reservation || - bkey_extent_is_unwritten(k); -} - -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (!ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(p, ptr) - if (ptr->cached) - ret.data[ret.nr++] = ptr->dev; - - return ret; -} - -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -bool bch2_bkey_is_incompressible(struct bkey_s_c); -unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); - -unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); -unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); - -const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); - -static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) -{ - return (void *) bch2_bkey_has_device_c(k.s_c, dev); -} - -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); - -static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *dest; - - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); - *dest = ptr; - k->k.u64s++; - break; - default: - BUG(); - } -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *, - struct extent_ptr_decoded *); -void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); -void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -void bch2_bkey_drop_device(struct bkey_s, unsigned); - -#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - 
bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr_noerror(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -do { \ - __label__ _again; \ - struct bkey_ptrs _ptrs; \ -_again: \ - _ptrs = bch2_bkey_ptrs(_k); \ - \ - bkey_for_each_ptr(_ptrs, _ptr) \ - if (_cond) { \ - bch2_bkey_drop_ptr(_k, _ptr); \ - goto _again; \ - } \ -} while (0) - -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); -bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -struct bch_extent_ptr * -bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); - -void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, - struct bkey_s, struct bch_extent_ptr *); - -bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); - -void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, - struct bch_extent_ptr ptr2) -{ - return (ptr1.cached == ptr2.cached && - ptr1.unwritten == ptr2.unwritten && - ptr1.offset == ptr2.offset && - ptr1.dev == ptr2.dev && - ptr1.gen == ptr2.gen); -} - -void bch2_ptr_swab(struct bkey_s); - -/* Generic extent code: */ - -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_lt(k->p, m->p); - int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); - - return (cmp1 << 1) + cmp2; -} - -int bch2_cut_front_s(struct bpos, struct bkey_s); -int bch2_cut_back_s(struct bpos, struct bkey_s); - -static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - bch2_cut_front_s(where, bkey_i_to_s(k)); -} - -static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -{ - bch2_cut_back_s(where, bkey_i_to_s(k)); -} - -/** - * bch_key_resize - adjust size of @k - * - * bkey_start_offset(k) will be preserved, modifies where the extent ends - */ -static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -{ - k->p.offset -= k->size; - k->p.offset += new_size; - k->size = new_size; -} - -static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) -{ - if (ptrs.start != ptrs.end && - extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) - return ptrs.start->flags.flags; - return 0; -} - -static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) -{ - return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); -} - -int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); - -#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h deleted file mode 100644 index 74c0252cbd984d..00000000000000 --- a/fs/bcachefs/extents_format.h +++ /dev/null @@ -1,304 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_FORMAT_H -#define _BCACHEFS_EXTENTS_FORMAT_H - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * 
bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. 
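
Decoding the tag scheme just described takes one find-first-set. Note that the example bit patterns in this comment appear to predate the current numbering: per the BCH_EXTENT_ENTRY_TYPES() list in the next hunk, a bare pointer (type 0) is tagged 0b1, crc32 (type 1) is 0b10, and so on. A userspace sketch of __extent_entry_type(), with an invented name:

	#include <strings.h>	/* ffs() */

	#define TOY_ENTRY_MAX 7	/* BCH_EXTENT_ENTRY_MAX */

	static unsigned toy_entry_type(unsigned char type_byte)
	{
		/* ffs() is 1-based where the kernel's __ffs() is 0-based */
		return type_byte ? (unsigned) ffs(type_byte) - 1 : TOY_ENTRY_MAX;
	}
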
- */ - -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) \ - x(flags, 6) -#define BCH_EXTENT_ENTRY_MAX 7 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -#define BCH_EXTENT_FLAGS() \ - x(poisoned, 0) - -enum bch_extent_flags_e { -#define x(n, v) BCH_EXTENT_FLAG_##n = v, - BCH_EXTENT_FLAGS() -#undef x -}; - -struct bch_extent_flags { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:7, - flags:57; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 flags:57, - type:7; -#endif -}; - -/* bch_extent_rebalance: */ -#include "rebalance_format.h" - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. 
-#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h deleted file mode 100644 index b23ce4a373c024..00000000000000 --- a/fs/bcachefs/extents_types.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_EXTENTS_TYPES_H -#define _BCACHEFS_EXTENTS_TYPES_H - -#include "bcachefs_format.h" - -struct bch_extent_crc_unpacked { - u32 compressed_size; - u32 uncompressed_size; - u32 live_size; - - u8 csum_type; - u8 compression_type; - - u16 offset; - - u16 nonce; - - struct bch_csum csum; -}; - -struct extent_ptr_decoded { - bool has_ec; - bool do_ec_reconstruct; - u8 crc_retry_nr; - struct bch_extent_crc_unpacked crc; - struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec; -}; - -struct bch_io_failures { - u8 nr; - struct bch_dev_io_failures { - u8 dev; - unsigned failed_csum_nr:6, - failed_io:1, - failed_btree_validate:1, - failed_ec:1; - } devs[BCH_REPLICAS_MAX + 1]; -}; - -#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c deleted file mode 100644 index 0e742555cb0af9..00000000000000 --- a/fs/bcachefs/eytzinger.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "eytzinger.h" - -/** - * is_aligned - is this pointer & size okay for word-wide copying? - * @base: pointer to data - * @size: size of each element - * @align: required alignment (typically 4 or 8) - * - * Returns true if elements can be copied using word loads and stores. - * The size must be a multiple of the alignment, and the base address must - * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. - * - * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" - * to "if ((a | b) & mask)", so we do that by hand. 
- */ -__attribute_const__ __always_inline -static bool is_aligned(const void *base, size_t size, unsigned char align) -{ - unsigned char lsbits = (unsigned char)size; - - (void)base; -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - lsbits |= (unsigned char)(uintptr_t)base; -#endif - return (lsbits & (align - 1)) == 0; -} - -/** - * swap_words_32 - swap two elements in 32-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 4) - * - * Exchange the two objects in memory. This exploits base+index addressing, - * which basically all CPUs have, to minimize loop overhead computations. - * - * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the - * bottom of the loop, even though the zero flag is still valid from the - * subtract (since the intervening mov instructions don't alter the flags). - * Gcc 8.1.0 doesn't have that problem. - */ -static void swap_words_32(void *a, void *b, size_t n) -{ - do { - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - } while (n); -} - -/** - * swap_words_64 - swap two elements in 64-bit chunks - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size (must be a multiple of 8) - * - * Exchange the two objects in memory. This exploits base+index - * addressing, which basically all CPUs have, to minimize loop overhead - * computations. - * - * We'd like to use 64-bit loads if possible. If they're not, emulating - * one requires base+index+4 addressing which x86 has but most other - * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, - * but it's possible to have 64-bit loads without 64-bit pointers (e.g. - * x32 ABI). Are there any cases the kernel needs to worry about? - */ -static void swap_words_64(void *a, void *b, size_t n) -{ - do { -#ifdef CONFIG_64BIT - u64 t = *(u64 *)(a + (n -= 8)); - *(u64 *)(a + n) = *(u64 *)(b + n); - *(u64 *)(b + n) = t; -#else - /* Use two 32-bit transfers to avoid base+index+4 addressing */ - u32 t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; - - t = *(u32 *)(a + (n -= 4)); - *(u32 *)(a + n) = *(u32 *)(b + n); - *(u32 *)(b + n) = t; -#endif - } while (n); -} - -/** - * swap_bytes - swap two elements a byte at a time - * @a: pointer to the first element to swap - * @b: pointer to the second element to swap - * @n: element size - * - * This is the fallback if alignment doesn't allow using larger chunks. - */ -static void swap_bytes(void *a, void *b, size_t n) -{ - do { - char t = ((char *)a)[--n]; - ((char *)a)[n] = ((char *)b)[n]; - ((char *)b)[n] = t; - } while (n); -} - -/* - * The values are arbitrary as long as they can't be confused with - * a pointer, but small integers make for the smallest compare - * instructions. - */ -#define SWAP_WORDS_64 (swap_r_func_t)0 -#define SWAP_WORDS_32 (swap_r_func_t)1 -#define SWAP_BYTES (swap_r_func_t)2 -#define SWAP_WRAPPER (swap_r_func_t)3 - -struct wrapper { - cmp_func_t cmp; - swap_func_t swap_func; -}; - -/* - * The function pointer is last to make tail calls most efficient if the - * compiler decides not to inline this function. 
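
The hand-folded test that the comment above describes, in isolation. This is the strict variant that also checks the base pointer; under CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the real is_aligned() checks only the size:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	static bool toy_is_aligned(const void *base, size_t size, unsigned align)
	{
		/* one test instead of two: (a & m || b & m) becomes ((a | b) & m) */
		return (((uintptr_t) base | size) & (align - 1)) == 0;
	}
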
- */ -static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) -{ - if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap_func(a, b, (int)size); - return; - } - - if (swap_func == SWAP_WORDS_64) - swap_words_64(a, b, size); - else if (swap_func == SWAP_WORDS_32) - swap_words_32(a, b, size); - else if (swap_func == SWAP_BYTES) - swap_bytes(a, b, size); - else - swap_func(a, b, (int)size, priv); -} - -#define _CMP_WRAPPER ((cmp_r_func_t)0L) - -static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) -{ - if (cmp == _CMP_WRAPPER) - return ((const struct wrapper *)priv)->cmp(a, b); - return cmp(a, b, priv); -} - -static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) -{ - return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - cmp_func, priv); -} - -static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) -{ - do_swap(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, - size, swap_func, priv); -} - -static void eytzinger1_sort_r(void *base1, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - unsigned i, j, k; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) - swap_func = NULL; - - if (!swap_func) { - if (is_aligned(base1, size, 8)) - swap_func = SWAP_WORDS_64; - else if (is_aligned(base1, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ - for (i = n / 2; i >= 1; --i) { - /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2, k < n;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 == n) - j *= 2; - - /* Backtrack to the correct location. */ - while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. */ - for (k = j; j != i;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } - - /* sort */ - for (i = n; i > 1; --i) { - eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); - - /* Find the sift-down path all the way to the leaves. */ - for (j = 1; k = j * 2, k + 1 < i;) - j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - - /* Special case for the last leaf with no sibling. */ - if (j * 2 + 1 == i) - j *= 2; - - /* Backtrack to the correct location. */ - while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) - j /= 2; - - /* Shift the element into its correct place. 
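
eytzinger1_sort_r() above uses the bottom-up sift: race to a leaf along the larger children, then backtrack, which saves comparisons because the displaced root almost always belongs near the leaves. For contrast, a sketch of the textbook top-down sift on a plain 1-based max-heap (this is not the patch's code, and it pays up to two comparisons per level where the bottom-up walk pays one on the way down):

	#include <stddef.h>

	static void toy_sift_down(int *h, size_t i, size_t n)
	{
		for (;;) {
			size_t c = 2 * i;

			if (c > n)
				break;
			if (c + 1 <= n && h[c + 1] > h[c])
				c++;			/* pick the larger child */
			if (h[i] >= h[c])
				break;			/* heap property restored */

			int t = h[i];
			h[i] = h[c];
			h[c] = t;
			i = c;
		}
	}
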
*/ - for (k = j; j > 1;) { - j /= 2; - eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); - } - } -} - -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - void *base1 = base - size; - - return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -{ - struct wrapper w = { - .cmp = cmp_func, - .swap_func = swap_func, - }; - - return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -} - -#if 0 -#include -#include -#include - -static u64 cmp_count; - -static int mycmp(const void *a, const void *b) -{ - u32 _a = *(u32 *)a; - u32 _b = *(u32 *)b; - - cmp_count++; - if (_a < _b) - return -1; - else if (_a > _b) - return 1; - else - return 0; -} - -static int test(void) -{ - size_t N, i; - ktime_t start, end; - s64 delta; - u32 *arr; - - for (N = 10000; N <= 100000; N += 10000) { - arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); - cmp_count = 0; - - for (i = 0; i < N; i++) - arr[i] = get_random_u32(); - - start = ktime_get(); - eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); - end = ktime_get(); - - delta = ktime_us_delta(end, start); - printk(KERN_INFO "time: %lld\n", delta); - printk(KERN_INFO "comparisons: %lld\n", cmp_count); - - u32 prev = 0; - - eytzinger0_for_each(i, N) { - if (prev > arr[i]) - goto err; - prev = arr[i]; - } - - kfree(arr); - } - return 0; - -err: - kfree(arr); - return -1; -} -#endif diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h deleted file mode 100644 index 643c1f7160615d..00000000000000 --- a/fs/bcachefs/eytzinger.h +++ /dev/null @@ -1,300 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _EYTZINGER_H -#define _EYTZINGER_H - -#include -#include - -#ifdef EYTZINGER_DEBUG -#include -#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) -#else -#define EYTZINGER_BUG_ON(cond) -#endif - -/* - * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array. - * - * Consider using an eytzinger tree any time you would otherwise be doing binary - * search over an array. Binary search is a worst case scenario for branch - * prediction and prefetching, but in an eytzinger tree every node's children - * are adjacent in memory, thus we can prefetch children before knowing the - * result of the comparison, assuming multiple nodes fit on a cacheline. - * - * Two variants are provided, for one based indexing and zero based indexing. - * - * Zero based indexing is more convenient, but one based indexing has better - * alignment and thus better performance because each new level of the tree - * starts at a power of two, and thus if element 0 was cacheline aligned, each - * new level will be as well. - */ - -static inline unsigned eytzinger1_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + child; -} - -static inline unsigned eytzinger1_left_child(unsigned i) -{ - return eytzinger1_child(i, 0); -} - -static inline unsigned eytzinger1_right_child(unsigned i) -{ - return eytzinger1_child(i, 1); -} - -static inline unsigned eytzinger1_first(unsigned size) -{ - return size ? 
rounddown_pow_of_two(size) : 0; -} - -static inline unsigned eytzinger1_last(unsigned size) -{ - return rounddown_pow_of_two(size + 1) - 1; -} - -static inline unsigned eytzinger1_next(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_right_child(i) <= size) { - i = eytzinger1_right_child(i); - - i <<= __fls(size) - __fls(i); - i >>= i > size; - } else { - i >>= ffz(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -{ - EYTZINGER_BUG_ON(i == 0 || i > size); - - if (eytzinger1_left_child(i) <= size) { - i = eytzinger1_left_child(i) + 1; - - i <<= __fls(size) - __fls(i); - i -= 1; - i >>= i > size; - } else { - i >>= __ffs(i) + 1; - } - - return i; -} - -static inline unsigned eytzinger1_extra(unsigned size) -{ - return size - ? (size + 1 - rounddown_pow_of_two(size)) << 1 - : 0; -} - -static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - unsigned b = __fls(i); - unsigned shift = __fls(size) - b; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - i ^= 1U << b; - i <<= 1; - i |= 1; - i <<= shift; - - /* - * sign bit trick: - * - * if (i > extra) - * i -= (i - extra) >> 1; - */ - s = extra - i; - i += (s >> 1) & (s >> 31); - - return i; -} - -static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, - unsigned extra) -{ - unsigned shift; - int s; - - EYTZINGER_BUG_ON(!i || i > size); - - /* - * sign bit trick: - * - * if (i > extra) - * i += i - extra; - */ - s = extra - i; - i -= s & (s >> 31); - - shift = __ffs(i); - - i >>= shift + 1; - i |= 1U << (__fls(size) - shift); - - return i; -} - -static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -} - -static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -{ - return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -} - -#define eytzinger1_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger1_first((_size)); \ - (_i) != 0; \ - (_i) = eytzinger1_next((_i), (_size))) - -/* Zero based indexing version: */ - -static inline unsigned eytzinger0_child(unsigned i, unsigned child) -{ - EYTZINGER_BUG_ON(child > 1); - - return (i << 1) + 1 + child; -} - -static inline unsigned eytzinger0_left_child(unsigned i) -{ - return eytzinger0_child(i, 0); -} - -static inline unsigned eytzinger0_right_child(unsigned i) -{ - return eytzinger0_child(i, 1); -} - -static inline unsigned eytzinger0_first(unsigned size) -{ - return eytzinger1_first(size) - 1; -} - -static inline unsigned eytzinger0_last(unsigned size) -{ - return eytzinger1_last(size) - 1; -} - -static inline unsigned eytzinger0_next(unsigned i, unsigned size) -{ - return eytzinger1_next(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -{ - return eytzinger1_prev(i + 1, size) - 1; -} - -static inline unsigned eytzinger0_extra(unsigned size) -{ - return eytzinger1_extra(size); -} - -static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, - unsigned extra) -{ - return __eytzinger1_to_inorder(i + 1, size, extra) - 1; -} - -static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, - unsigned extra) -{ - return __inorder_to_eytzinger1(i + 1, size, extra) - 1; -} - -static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -{ - return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -} - -static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned 
size) -{ - return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -} - -#define eytzinger0_for_each(_i, _size) \ - for (unsigned (_i) = eytzinger0_first((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_next((_i), (_size))) - -#define eytzinger0_for_each_prev(_i, _size) \ - for (unsigned (_i) = eytzinger0_last((_size)); \ - (_i) != -1; \ - (_i) = eytzinger0_prev((_i), (_size))) - -/* return greatest node <= @search, or -1 if not found */ -static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n) + 1; - return n - 1; -} - -/* return smallest node > @search, or -1 if not found */ -static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -/* return smallest node >= @search, or -1 if not found */ -static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) -{ - void *base1 = base - size; - unsigned n = 1; - - while (n <= nr) - n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); - n >>= __ffs(n + 1) + 1; - return n - 1; -} - -#define eytzinger0_find(base, nr, size, _cmp, search) \ -({ \ - size_t _size = (size); \ - void *_base1 = (void *)(base) - _size; \ - const void *_search = (search); \ - size_t _nr = (nr); \ - size_t _i = 1; \ - int _res; \ - \ - while (_i <= _nr && \ - (_res = _cmp(_search, _base1 + _i * _size))) \ - _i = eytzinger1_child(_i, _res > 0); \ - _i - 1; \ -}) - -void eytzinger0_sort_r(void *, size_t, size_t, - cmp_r_func_t, swap_r_func_t, const void *); -void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); - -#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c deleted file mode 100644 index 2faec143eb31c0..00000000000000 --- a/fs/bcachefs/fast_list.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Fast, unordered lists - * - * Supports add, remove, and iterate - * - * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot - * allocation and freeing. - * - * This means that adding, removing, and iterating over items is lockless, - * except when refilling/emptying the percpu slot buffers. - */ - -#include "fast_list.h" - -struct fast_list_pcpu { - u32 nr; - u32 entries[31]; -}; - -static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) -{ - int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); - if (unlikely(idx < 0)) - return 0; - - if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { - ida_free(&l->slots_allocated, idx); - return 0; - } - - return idx; -} - -/** - * fast_list_get_idx - get a slot in a fast_list - * @l: list to get slot in - * - * This allocates a slot in the radix tree without storing to it, so that we can - * take the potential memory allocation failure early and do the list add later - * when we can't take an allocation failure. 
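 *
 * (editor's note: a hypothetical caller of the reserve-early/store-later
 * pattern described above -- "my_obj" and its "list_idx" field are
 * invented for illustration; the final store is exactly what
 * fast_list_add() below does:
 *
 *	int idx = fast_list_get_idx(l);	// may sleep, may allocate
 *	if (idx < 0)
 *		return idx;		// -ENOMEM; nothing to unwind
 *	obj->list_idx = idx;
 *	...
 *	// later, on a path that must not take an allocation failure:
 *	*genradix_ptr_inlined(&l->items, idx) = obj;
 * )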
- * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_get_idx(struct fast_list *l) -{ - unsigned long flags; - int idx; -retry: - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(!lp->nr)) { - u32 entries[16], nr = 0; - - local_irq_restore(flags); - while (nr < ARRAY_SIZE(entries) && - (idx = fast_list_alloc_idx(l, GFP_KERNEL))) - entries[nr++] = idx; - local_irq_save(flags); - - lp = this_cpu_ptr(l->buffer); - - while (nr && lp->nr < ARRAY_SIZE(lp->entries)) - lp->entries[lp->nr++] = entries[--nr]; - - if (unlikely(nr)) { - local_irq_restore(flags); - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); - goto retry; - } - - if (unlikely(!lp->nr)) { - local_irq_restore(flags); - return -ENOMEM; - } - } - - idx = lp->entries[--lp->nr]; - local_irq_restore(flags); - - return idx; -} - -/** - * fast_list_add - add an item to a fast_list - * @l: list - * @item: item to add - * - * Allocates a slot in the radix tree and stores to it and then returns the - * slot index, which must be passed to fast_list_remove(). - * - * Returns: positive integer on success, -ENOMEM on failure - */ -int fast_list_add(struct fast_list *l, void *item) -{ - int idx = fast_list_get_idx(l); - if (idx < 0) - return idx; - - *genradix_ptr_inlined(&l->items, idx) = item; - return idx; -} - -/** - * fast_list_remove - remove an item from a fast_list - * @l: list - * @idx: item's slot index - * - * Zeroes out the slot in the radix tree and frees the slot for future - * fast_list_add() operations. - */ -void fast_list_remove(struct fast_list *l, unsigned idx) -{ - u32 entries[16], nr = 0; - unsigned long flags; - - if (!idx) - return; - - *genradix_ptr_inlined(&l->items, idx) = NULL; - - local_irq_save(flags); - struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); - - if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) - while (nr < ARRAY_SIZE(entries)) - entries[nr++] = lp->entries[--lp->nr]; - - lp->entries[lp->nr++] = idx; - local_irq_restore(flags); - - if (unlikely(nr)) - while (nr) - ida_free(&l->slots_allocated, entries[--nr]); -} - -void fast_list_exit(struct fast_list *l) -{ - /* XXX: warn if list isn't empty */ - free_percpu(l->buffer); - ida_destroy(&l->slots_allocated); - genradix_free(&l->items); -} - -int fast_list_init(struct fast_list *l) -{ - genradix_init(&l->items); - ida_init(&l->slots_allocated); - l->buffer = alloc_percpu(*l->buffer); - if (!l->buffer) - return -ENOMEM; - return 0; -} diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h deleted file mode 100644 index 73c9bf591fd6ef..00000000000000 --- a/fs/bcachefs/fast_list.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _LINUX_FAST_LIST_H -#define _LINUX_FAST_LIST_H - -#include -#include -#include - -struct fast_list_pcpu; - -struct fast_list { - GENRADIX(void *) items; - struct ida slots_allocated;; - struct fast_list_pcpu __percpu - *buffer; -}; - -static inline void *fast_list_iter_peek(struct genradix_iter *iter, - struct fast_list *list) -{ - void **p; - while ((p = genradix_iter_peek(iter, &list->items)) && !*p) - genradix_iter_advance(iter, &list->items); - - return p ? 
*p : NULL; -} - -#define fast_list_for_each_from(_list, _iter, _i, _start) \ - for (_iter = genradix_iter_init(&(_list)->items, _start); \ - (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ - genradix_iter_advance(&(_iter), &(_list)->items)) - -#define fast_list_for_each(_list, _iter, _i) \ - fast_list_for_each_from(_list, _iter, _i, 0) - -int fast_list_get_idx(struct fast_list *l); -int fast_list_add(struct fast_list *l, void *item); -void fast_list_remove(struct fast_list *l, unsigned idx); -void fast_list_exit(struct fast_list *l); -int fast_list_init(struct fast_list *l); - -#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h deleted file mode 100644 index d8153fe27037ef..00000000000000 --- a/fs/bcachefs/fifo.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FIFO_H -#define _BCACHEFS_FIFO_H - -#include "util.h" - -#define FIFO(type) \ -struct { \ - size_t front, back, size, mask; \ - type *data; \ -} - -#define DECLARE_FIFO(type, name) FIFO(type) name - -#define fifo_buf_size(fifo) \ - ((fifo)->size \ - ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ - : 0) - -#define init_fifo(fifo, _size, _gfp) \ -({ \ - (fifo)->front = (fifo)->back = 0; \ - (fifo)->size = (_size); \ - (fifo)->mask = (fifo)->size \ - ? roundup_pow_of_two((fifo)->size) - 1 \ - : 0; \ - (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ -}) - -#define free_fifo(fifo) \ -do { \ - kvfree((fifo)->data); \ - (fifo)->data = NULL; \ -} while (0) - -#define fifo_swap(l, r) \ -do { \ - swap((l)->front, (r)->front); \ - swap((l)->back, (r)->back); \ - swap((l)->size, (r)->size); \ - swap((l)->mask, (r)->mask); \ - swap((l)->data, (r)->data); \ -} while (0) - -#define fifo_move(dest, src) \ -do { \ - typeof(*((dest)->data)) _t; \ - while (!fifo_full(dest) && \ - fifo_pop(src, _t)) \ - fifo_push(dest, _t); \ -} while (0) - -#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) - -#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) - -#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) - -#define fifo_entry_idx_abs(fifo, p) \ - ((((p) >= &fifo_peek_front(fifo) \ - ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ - (((p) - (fifo)->data))) - -#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) - -#define fifo_push_back_ref(f) \ - (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) - -#define fifo_push_front_ref(f) \ - (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) - -#define fifo_push_back(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_push_front(fifo, new) \ -({ \ - typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ - if (_r) \ - *_r = (new); \ - _r != NULL; \ -}) - -#define fifo_pop_front(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_pop_back(fifo, i) \ -({ \ - bool _r = !fifo_empty((fifo)); \ - if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ - _r; \ -}) - -#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -#define fifo_peek(fifo) fifo_peek_front(fifo) - -#define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (typecheck(typeof((_fifo)->front), _iter), \ - (_iter) = (_fifo)->front; \ - ((_iter != (_fifo)->back) && \ - (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - (_iter)++) - -#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c deleted file mode 100644 index 1c54b9b5bd6953..00000000000000 --- a/fs/bcachefs/fs-io-buffered.c +++ /dev/null @@ -1,1109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include -#include -#include - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) - folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio *folio; - - *iter = (struct readpages_iter) { ractl->mapping }; - - while ((folio = __readahead_folio(ractl))) { - if (!bch2_folio_create(folio, GFP_KERNEL) || - darray_push(&iter->folios, folio)) { - bch2_folio_release(folio); - ractl->_nr_pages += folio_nr_pages(folio); - ractl->_index -= folio_nr_pages(folio); - return iter->folios.nr ? 
0 : -ENOMEM; - } - - folio_put(folio); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); - - if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) - break; - - unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); - - /* ensure proper alignment */ - order = min(order, __ffs(folio_offset|BIT(31))); - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_retry_if_stale| - BCH_READ_may_promote; - int ret = 0; - - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); - bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_slots); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors; - s64 offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - if 
(readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - goto err; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - /* - * Careful there's a landmine here if bch2_read_extent() ever - * starts returning transaction restarts here. - * - * We've changed rbio->bi_iter.bi_size to be "bytes we can read - * from this extent" with the swap call, and we restore it - * below. That restore needs to come before checking for - * errors. - * - * But unlike __bch2_read(), we use the rbio bvec iter, not one - * on the stack, so we can't do the restore right after the - * bch2_read_extent() call: we don't own that iterator anymore - * if BCH_READ_last_fragment is set, since we may have submitted - * that rbio instead of cloning it. - */ - - if (flags & BCH_READ_last_fragment) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); -err: - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct folio *folio; - struct readpages_iter readpages_iter; - struct blk_plug plug; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - int ret = readpages_iter_init(&readpages_iter, ractl); - if (ret) - return; - - /* - * Besides being a general performance optimization, plugging helps with - * avoiding btree transaction srcu warnings - submitting a bio can - * block, and we don't want todo that with the transaction locked. - * - * However, plugged bios are submitted when we schedule; we ideally - * would have our own scheduler hook to call unlock_long() before - * scheduling. 
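 *
 * (editor's note: the stock block-layer pattern in use here, for
 * reference -- bios submitted between the two calls sit on the task's
 * plug list and are dispatched in a batch by blk_finish_plug(), or
 * earlier if the task blocks, which is exactly the caveat above;
 * more_bios() and next_bio() are invented stand-ins:
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	while (more_bios())
 *		submit_bio(next_bio());	// queued on the plug, not yet issued
 *	blk_finish_plug(&plug);		// batch submission happens here
 * )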
- */ - blk_start_plug(&plug); - bch2_pagecache_add_get(inode); - - struct btree_trans *trans = bch2_trans_get(c); - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_readpages_end_io); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(trans); - } - bch2_trans_put(trans); - - bch2_pagecache_add_put(inode); - blk_finish_plug(&plug); - darray_exit(&readpages_iter.folios); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - struct blk_plug plug; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - BUG_ON(folio_test_uptodate(folio)); - BUG_ON(folio_test_dirty(folio)); - - if (!bch2_folio_create(folio, GFP_KERNEL)) - return -ENOMEM; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - c, - opts, - bch2_read_single_folio_end_io); - rbio->bio.bi_private = &done; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - blk_start_plug(&plug); - bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); - blk_finish_plug(&plug); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; - struct blk_plug plug; -}; - -/* - * Determine when a writepage io is full. We have to limit writepage bios to a - * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to - * what the bounce path in bch2_write_extent() can handle. In theory we could - * loosen this restriction for non-bounce I/O, but we don't have that context - * here. Ideally, we can up this limit and make it configurable in the future - * when the bounce path can be enhanced to accommodate larger source bios. 
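 *
 * (editor's note: with BIO_MAX_VECS = 256 and 4 KiB pages -- the values
 * in current kernels -- the cap enforced below works out to
 * 256 * 4096 B = 1 MiB per writepage bio, i.e. the check is:
 *
 *	bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE
 * )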
- */ -static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) -{ - struct bio *bio = &io->op.wbio.bio; - return bio_full(bio, len) || - (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_wrote_data_inline) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? - BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * The writeback flag is effectively our ref on the inode - - * fixup i_blocks before calling folio_end_writeback: - */ - bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_inum.subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? 
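 * (editor's note) Three cases are distinguished below:
 *	folio_end_pos(folio) <= i_size: fully inside EOF, write as-is;
 *	folio_pos(folio) >= i_size:     fully past EOF, skip -- a truncate
 *					is in progress;
 *	otherwise:                      straddles EOF -- zero the region
 *					past EOF, then write.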
*/ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - bch2_folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bch_io_full(w->io, sectors << 9) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - - bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); - - blk_start_plug(&w->plug); - int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); - if (w->io) - bch2_writepage_do_io(w); - blk_finish_plug(&w->plug); - kfree(w->tmp); - 
kfree(w); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - goto err_unlock; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - if (folio_test_uptodate(folio)) - goto out; - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *foliop = folio; - return 0; -err: - folio_unlock(folio); - folio_put(folio); -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *fs, struct folio **fi) -{ - while (fs->data + fs->nr > fi) { - struct folio *f = darray_pop(fs); - - folio_unlock(f); - 
folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space *mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios fs; - struct folio *f; - unsigned copied = 0, f_offset, f_copied; - u64 end = pos + len, f_pos, f_len; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&fs); - - ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, - FGP_WRITEBEGIN | fgf_set_order(len), - mapping_gfp_mask(mapping), &fs); - if (ret) - goto out; - - BUG_ON(!fs.nr); - - f = darray_first(fs); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(fs); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - ssize_t f_reserved; - - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len); - - if (unlikely(f_reserved != f_len)) { - if (f_reserved < 0) { - if (f == darray_first(fs)) { - ret = f_reserved; - goto out; - } - - folios_trunc(&fs, fi); - end = min(end, folio_end_pos(darray_last(fs))); - } else { - if (!folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - folios_trunc(&fs, fi + 1); - end = f_pos + f_reserved; - } - - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(fs, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); - if (!f_copied) { - folios_trunc(&fs, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - iov_iter_revert(iter, f_copied); - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&fs, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&fs, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(fs)); - darray_for_each(fs, fi) { - f = *fi; - f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(fs, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. 
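 * (editor's note: "short write" here means copy_folio_from_iter_atomic()
 * copied fewer bytes than requested, so i_size was never advanced far
 * enough to cover the trailing folio that was instantiated above.)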
- * Clean up the mapping before we return. - */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&fs); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); -} - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) -{ - bioset_exit(&c->writepage_bioset); -} - -int bch2_fs_fs_io_buffered_init(struct bch_fs *c) -{ - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h deleted file mode 100644 index 14de91c2765607..00000000000000 --- a/fs/bcachefs/fs-io-buffered.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_BUFFERED_H -#define _BCACHEFS_FS_IO_BUFFERED_H - -#ifndef NO_BCACHEFS_FS - -int bch2_read_single_folio(struct folio *, struct address_space *); -int bch2_read_folio(struct file *, struct folio *); - -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); - -int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, - unsigned len, struct folio **, void **); -int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, - unsigned len, unsigned copied, struct folio *, void *); - -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_buffered_exit(struct bch_fs *); -int bch2_fs_fs_io_buffered_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c deleted file mode 100644 index 1f5154d9676bda..00000000000000 --- a/fs/bcachefs/fs-io-direct.c +++ /dev/null @@ -1,704 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "enumerated_ref.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "io_read.h" -#include "io_write.h" - -#include -#include -#include -#include - -/* O_DIRECT reads */ - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static CLOSURE_CALLBACK(bch2_dio_read_complete) -{ - closure_type(dio, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = 
blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - struct blk_plug plug; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - bool split = false; - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - /* bios must be 512 byte aligned: */ - if ((offset|iter->count) & (SECTOR_SIZE - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - if (shorten >= iter->count) - shorten = 0; - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - dio->cl.closure_get_happened = true; - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. - */ - dio->should_dirty = iter_is_iovec(iter); - - blk_start_plug(&plug); - - goto start; - while (iter->count) { - split = true; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - struct bch_read_bio *rbio = - rbio_init(bio, - c, - opts, - split - ? 
bch2_direct_IO_read_split_endio - : bch2_direct_IO_read_endio); - - bch2_read(c, rbio, inode_inum(inode)); - } - - blk_finish_plug(&plug); - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret = 0; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = filemap_read(iocb, iter, ret); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - const struct iovec *iov; - unsigned loop:1, - extending:1, - sync:1, - flush:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; -retry: - bch2_trans_begin(trans); - - err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return err ? 
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static CLOSURE_CALLBACK(bch2_dio_write_flush_done) -{ - closure_type(dio, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} - -static noinline void bch2_dio_write_flush(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - - dio->flush = 0; - - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, - &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } - - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } -} - -static __always_inline long bch2_dio_write_done(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); - - kfree(dio->iov); - - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; - } - return ret; -} - -static __always_inline void bch2_dio_write_end(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > 
inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } - - bio_release_pages(bio, false); - - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); -} - -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_inum.subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_sync; - dio->op.flags |= BCH_WRITE_check_enospc; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); - } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); - - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; -} - -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) -{ - struct mm_struct *mm = dio->mm; - - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); - - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *op) -{ - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); -} - -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) - return -EROFS; - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err_put_write_ref; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err_put_write_ref; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err_put_write_ref; - - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { - ret = -EINVAL; - goto err_put_write_ref; - } - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); - - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->iov = NULL; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = 
iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = bch2_write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; - } - - ret = bch2_dio_write_loop(dio); -out: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); -err_put_write_ref: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); - goto out; -} - -void bch2_fs_fs_io_direct_exit(struct bch_fs *c) -{ - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); -} - -int bch2_fs_fs_io_direct_init(struct bch_fs *c) -{ - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_read_bioset_init; - - if (bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h deleted file mode 100644 index 814621ec7f81dc..00000000000000 --- a/fs/bcachefs/fs-io-direct.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_DIRECT_H -#define _BCACHEFS_FS_IO_DIRECT_H - -#ifndef NO_BCACHEFS_FS -ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); - -void bch2_fs_fs_io_direct_exit(struct bch_fs *); -int bch2_fs_fs_io_direct_init(struct bch_fs *); -#else -static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} -static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c deleted file mode 100644 index c2cc405822f2b2..00000000000000 --- a/fs/bcachefs/fs-io-pagecache.c +++ /dev/null @@ -1,827 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "btree_iter.h" -#include "extents.h" -#include "fs-io.h" -#include "fs-io-pagecache.h" -#include "subvolume.h" - -#include -#include - -int bch2_filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - fgf_t fgp_flags, gfp_t gfp, - folios *fs) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR(f)) - break; - - BUG_ON(fs->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(fs, f); - } - - if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return fs->nr ? 
0 : ret; -} - -/* pagecache_block must be held */ -int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -#if 0 -/* Useful for debug tracing: */ -static const char * const bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; -#endif - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -/* for newly allocated folios: */ -struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - bch2_folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **fs, unsigned nr_folios) -{ - u64 offset = folio_sector(fs[0]); - bool need_set = false; - - for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - unsigned folio_idx = 0; - - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inum.inum, offset), - POS(inum.inum, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - unsigned nr_ptrs = 
bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - 0; - }))); -} - -void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 *start, u64 end, - bool nonblocking) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = *start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - int ret = 0; - - if (end <= *start) - return 0; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblocking) - folio_lock(folio); - else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - ret = -EAGAIN; - break; - } - - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - - BUG_ON(end <= folio_start); - - *start = min(end, folio_end); - - struct bch_folio *s = bch2_folio(folio); - if (s) { - unsigned folio_offset = max(*start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - - spin_lock(&s->lock); - for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, j, - folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - 
folio_batch_release(&fbatch); - cond_resched(); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - return ret; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int __bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len, - bool partial) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - struct disk_reservation disk_res = {}; - size_t reserved = len; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, - partial ? BCH_DISK_RESERVATION_PARTIAL : 0); - if (unlikely(ret)) - return ret; - - if (unlikely(disk_res.sectors != disk_sectors)) { - disk_sectors = quota_sectors = 0; - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - if (disk_sectors > disk_res.sectors) { - /* - * Make sure to get a reservation that's - * aligned to the filesystem blocksize: - */ - unsigned reserved_offset = round_down(i << 9, block_bytes(c)); - reserved = clamp(reserved_offset, offset, offset + len) - offset; - - if (!reserved) { - bch2_disk_reservation_put(c, &disk_res); - return bch_err_throw(c, ENOSPC_disk_reservation); - } - break; - } - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - } - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); - if (unlikely(ret)) { - bch2_disk_reservation_put(c, &disk_res); - return ret; - } - } - - res->disk.sectors += disk_res.sectors; - return partial ? 
reserved : 0; -} - -int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); -} - -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - size_t offset, size_t len) -{ - return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -void bch2_set_folio_dirty(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = 
file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - vm_fault_t ret; - - loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c)); - unsigned offset = file_offset - folio_pos(folio); - unsigned len = max(PAGE_SIZE, block_bytes(c)); - - BUG_ON(offset + len > folio_size(folio)); - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get - * a bch2_write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - u64 isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || file_offset >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(unsigned, len, isize - file_offset); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, offset, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* fseek: */ - -static int folio_data_offset(struct folio *folio, loff_t pos, - unsigned min_replicas) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - if (s) - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty && - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) - return i << SECTOR_SHIFT; - - return -1; -} - -loff_t bch2_seek_pagecache_data(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct folio_batch fbatch; - pgoff_t start_index = start_offset >> PAGE_SHIFT; - pgoff_t end_index = end_offset >> PAGE_SHIFT; - pgoff_t index = start_index; - unsigned i; - loff_t ret; - int offset; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(vinode->i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblock) { - folio_lock(folio); - } else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - return -EAGAIN; - } - - offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset), - min_replicas); - if (offset >= 0) { - ret = clamp(folio_pos(folio) + offset, - start_offset, end_offset); - folio_unlock(folio); - folio_batch_release(&fbatch); - return ret; - } - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - return end_offset; -} - -/* - * Search for a hole in a folio. - * - * The filemap layer returns -ENOENT if no folio exists, so reuse the same error - * code to indicate a pagecache hole exists at the returned offset. 
Otherwise - * return 0 if the folio is filled with data, or an error code. This function - * can return -EAGAIN if nonblock is specified. - */ -static int folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas, bool nonblock) -{ - struct folio *folio; - struct bch_folio *s; - unsigned i, sectors; - int ret = -ENOENT; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); - if (IS_ERR(folio)) - return PTR_ERR(folio); - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = 0; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - loff_t ret = 0; - - while (!ret && offset < end_offset) - ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); - - if (ret && ret != -ENOENT) - return ret; - return min(offset, end_offset); -} - -int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h deleted file mode 100644 index fad911cf506801..00000000000000 --- a/fs/bcachefs/fs-io-pagecache.h +++ /dev/null @@ -1,176 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_PAGECACHE_H -#define _BCACHEFS_FS_IO_PAGECACHE_H - -#include - -typedef DARRAY(struct folio *) folios; - -int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, fgf_t, gfp_t, folios *); -int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. This breaks much of the range based processing in the - * buffered write path. 
- */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - u8 nr_replicas:4, - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - replicas_reserved:4; - u8 state; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -/* Helper for when we need to add debug instrumentation: */ -static inline void bch2_folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -/* for newly allocated folios: */ -static inline void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static inline void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_get_private(folio); -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); -struct bch_folio *bch2_folio_create(struct folio *, gfp_t); - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); -void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); - -void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); - -int bch2_get_folio_disk_reservation(struct bch_fs *, - struct bch_inode_info *, - struct folio *, bool); - -void bch2_folio_reservation_put(struct bch_fs *, - struct bch_inode_info *, - struct bch2_folio_reservation *); -int bch2_folio_reservation_get(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - size_t, size_t); - -void bch2_set_folio_dirty(struct bch_fs *, - struct bch_inode_info *, - struct folio *, - struct bch2_folio_reservation *, - unsigned, unsigned); - -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - -loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); -loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); -int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c deleted file mode 100644 index a233f45875e966..00000000000000 --- a/fs/bcachefs/fs-io.c +++ /dev/null @@ -1,1102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "io_misc.h" -#include "keylist.h" -#include "quota.h" -#include "reflink.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct nocow_flush { - struct closure *cl; - struct bch_dev *ca; - struct bio bio; -}; - -static void nocow_flush_endio(struct bio *_bio) -{ - - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); - enumerated_ref_put(&bio->ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush); - bio_put(&bio->bio); -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) -{ - struct nocow_flush *bio; - struct bch_dev *ca; - struct bch_devs_mask devs; - unsigned dev; - - dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); - if (dev == BCH_SB_MEMBERS_MAX) - return; - - devs = inode->ei_devs_need_flush; - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - scoped_guard(rcu) { - ca = rcu_dereference(c->devs[dev]); - if (ca && 
!enumerated_ref_tryget(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - } - - if (!ca) - continue; - - bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_WRITE|REQ_PREFLUSH, - GFP_KERNEL, - &c->nocow_flush_bioset), - struct nocow_flush, bio); - bio->cl = cl; - bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - - bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - if (sectors < 0) - sectors = -inode->v.i_blocks; - else - sectors = 0; - } - - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -/* fsync: */ - -static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, - u64 *seq) -{ - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - struct btree_iter iter; - int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); - if (ret) - return ret; - - u64 cur_seq = journal_cur_seq(&trans->c->journal); - *seq = min(cur_seq, u.bi_journal_seq); - - if (fsck_err_on(u.bi_journal_seq > cur_seq, - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - cur_seq, - (bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = cur_seq; - ret = bch2_inode_write(trans, &iter, &u); - } -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -/* - * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an - * insert trigger: look up the btree inode instead - */ -static int bch2_flush_inode(struct bch_fs *c, - struct bch_inode_info *inode) -{ - if (c->opts.journal_flush_disabled) - return 0; - - if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) - return -EROFS; - - u64 seq; - int ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: - bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: - bch2_inode_flush_nocow_writes(c, inode); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); - return ret; -} - -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, err; - - trace_bch2_fsync(file, datasync); - - ret = file_write_and_wait_range(file, start, end); - if (ret) - goto out; - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - goto out; - ret = bch2_flush_inode(c, inode); -out: - ret = bch2_err_class(ret); - if (ret == -EROFS) - ret = -EIO; - - err = file_check_and_advance_wb_err(file); - if (!ret) - ret = err; - - return ret; -} - -/* truncate: */ - -static inline int range_has_data(struct bch_fs *c, u32 subvol, - struct bpos start, - struct bpos end) -{ - return bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, - subvol, 0, k, ({ - bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); - }))); -} - -static int __bch2_truncate_folio(struct bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - struct bch_folio *s; - unsigned start_offset; - unsigned end_offset; - unsigned i; - struct folio *folio; - s64 i_sectors_delta = 0; - int ret = 0; - u64 end_pos; - - folio = filemap_lock_folio(mapping, index); - if (IS_ERR_OR_NULL(folio)) { - /* - * XXX: we're doing two index lookups when we end up reading the - * folio - */ - ret = range_has_data(c, inode->ei_inum.subvol, - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), - POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); - if (ret <= 0) - return ret; - - folio = __filemap_get_folio(mapping, index, - FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR(folio)) { - ret = -ENOMEM; - goto out; - } - } - - BUG_ON(start >= folio_end_pos(folio)); - BUG_ON(end <= folio_pos(folio)); - - start_offset = max(start, folio_pos(folio)) - folio_pos(folio); - end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); - - /* Folio boundary? Nothing to do */ - if (start_offset == 0 && - end_offset == folio_size(folio)) { - ret = 0; - goto unlock; - } - - s = bch2_folio_create(folio, 0); - if (!s) { - ret = -ENOMEM; - goto unlock; - } - - if (!folio_test_uptodate(folio)) { - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto unlock; - } - - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto unlock; - - for (i = round_up(start_offset, block_bytes(c)) >> 9; - i < round_down(end_offset, block_bytes(c)) >> 9; - i++) { - s->s[i].nr_replicas = 0; - - i_sectors_delta -= s->s[i].state == SECTOR_dirty; - bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); - } - - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - /* - * Caller needs to know whether this folio will be written out by - * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update. - * - * Note that we shouldn't ever see a folio beyond EOF, but check and - * warn if so. 
This has been observed due to a failure to clean up folios - after a short write and there's still a chance reclaim will fix - things up. - */ - WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); - end_pos = folio_end_pos(folio); - if (inode->v.i_size > folio_pos(folio)) - end_pos = min_t(u64, inode->v.i_size, end_pos); - ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; - - folio_zero_segment(folio, start_offset, end_offset); - - /* - * Bit of a hack - we don't want truncate to fail due to -ENOSPC. - * - * XXX: because we aren't currently tracking whether the folio has actual - * data in it (vs. just 0s, or only partially written) this is wrong. ick. - */ - BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); - - /* - * This removes any writeable userspace mappings; we need to force - * .page_mkwrite to be called again before any mmapped writes, to - * redirty the full page: - */ - folio_mkclean(folio); - filemap_dirty_folio(mapping, folio); -unlock: - folio_unlock(folio); - folio_put(folio); -out: - return ret; -} - -static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) -{ - return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, - from, ANYSINT_MAX(loff_t)); -} - -static int bch2_truncate_folios(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, - start, end); - - if (ret >= 0 && - start >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_folio(inode, - (end - 1) >> PAGE_SHIFT, - start, end); - return ret; -} - -static int bch2_extend(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct bch_inode_unpacked *inode_u, - struct iattr *iattr) -{ - struct address_space *mapping = inode->v.i_mapping; - int ret; - - /* - * sync appends: - * - * this has to be done _before_ extending i_size: - */ - ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); - if (ret) - return ret; - - truncate_setsize(&inode->v, iattr->ia_size); - - return bch2_setattr_nonsize(idmap, inode, iattr); -} - -int bchfs_truncate(struct mnt_idmap *idmap, - struct bch_inode_info *inode, struct iattr *iattr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - struct bch_inode_unpacked inode_u; - s64 i_sectors_delta = 0; - int ret = 0; - - /* - * If the truncate call will change the size of the file, the - * cmtimes should be updated. If the size will not change, we - * do not need to update the cmtimes. - */ - if (iattr->ia_size != inode->v.i_size) { - if (!(iattr->ia_valid & ATTR_MTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_mtime); - if (!(iattr->ia_valid & ATTR_CTIME)) - ktime_get_coarse_real_ts64(&iattr->ia_ctime); - iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; - } - - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(inode); - - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); - if (ret) - goto err; - - /* - * check this before next assertion; on filesystem error our normal - * invariants are a bit broken (truncate has to truncate the page cache - * before the inode).
- */ - ret = bch2_journal_error(&c->journal); - if (ret) - goto err; - - WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && - inode->v.i_size < inode_u.bi_size, - "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", - (u64) inode->v.i_size, inode_u.bi_size); - - if (iattr->ia_size > inode->v.i_size) { - ret = bch2_extend(idmap, inode, &inode_u, iattr); - goto err; - } - - iattr->ia_valid &= ~ATTR_SIZE; - - ret = bch2_truncate_folio(inode, iattr->ia_size); - if (unlikely(ret < 0)) - goto err; - ret = 0; - - truncate_setsize(&inode->v, iattr->ia_size); - - /* - * When extending, we're going to write the new i_size to disk - * immediately so we need to flush anything above the current on disk - * i_size first: - * - * Also, when extending we need to flush the page that i_size currently - * straddles - if it's mapped to userspace, we need to ensure that - * userspace has to redirty it and call .mkwrite -> set_page_dirty - * again to allocate the part of the page that was extended. - */ - if (iattr->ia_size > inode_u.bi_size) - ret = filemap_write_and_wait_range(mapping, - inode_u.bi_size, - iattr->ia_size - 1); - else if (iattr->ia_size & (PAGE_SIZE - 1)) - ret = filemap_write_and_wait_range(mapping, - round_down(iattr->ia_size, PAGE_SIZE), - iattr->ia_size - 1); - if (ret) - goto err; - - ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - if (unlikely(ret)) { - /* - * If we error here, VFS caches are now inconsistent with btree - */ - set_bit(EI_INODE_ERROR, &inode->ei_flags); - goto err; - } - - if (unlikely(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal))) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, - "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, - inode->ei_inode.bi_sectors); - - bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_setattr_nonsize(idmap, inode, iattr); -err: - bch2_pagecache_block_put(inode); - return bch2_err_class(ret); -} - -/* fallocate: */ - -static int inode_update_times_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - return 0; -} - -static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 end = offset + len; - u64 block_start = round_up(offset, block_bytes(c)); - u64 block_end = round_down(end, block_bytes(c)); - bool truncated_last_page; - int ret = 0; - - ret = bch2_truncate_folios(inode, offset, end); - if (unlikely(ret < 0)) - goto err; - - truncated_last_page = ret; - - truncate_pagecache_range(&inode->v, offset, end - 1); - - if (block_start < block_end) { - s64 i_sectors_delta = 0; - - ret = bch2_fpunch(c, inode_inum(inode), - block_start >> 9, block_end >> 9, - &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - } - - mutex_lock(&inode->ei_update_lock); - if (end >= inode->v.i_size && !truncated_last_page) { - ret = bch2_write_inode_size(c, inode, inode->v.i_size, - ATTR_MTIME|ATTR_CTIME); - } else { - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&inode->ei_update_lock); 
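A minimal standalone sketch of the block-alignment arithmetic used by the deleted bchfs_fpunch() here, assuming a 4096-byte filesystem block (the real code uses block_bytes(c)): partial blocks at either edge of the punch range are zeroed through the page cache via bch2_truncate_folios(), and only the aligned middle, when block_start < block_end, is punched from the extents btree. The example numbers are illustrative only.

	#include <stdint.h>
	#include <stdio.h>

	#define BLOCK 4096ULL

	static uint64_t round_up_blk(uint64_t v)   { return (v + BLOCK - 1) & ~(BLOCK - 1); }
	static uint64_t round_down_blk(uint64_t v) { return v & ~(BLOCK - 1); }

	int main(void)
	{
		uint64_t offset = 1000, len = 10000, end = offset + len;	/* end = 11000 */
		uint64_t block_start = round_up_blk(offset);	/* 4096: first whole block */
		uint64_t block_end   = round_down_blk(end);	/* 8192: end of last whole block */

		/* head [1000,4096) and tail [8192,11000) are zeroed via the folio
		 * path; the aligned middle [4096,8192) is punched from the btree,
		 * mirroring the block_start < block_end check in bchfs_fpunch(). */
		printf("zero [%llu,%llu) punch [%llu,%llu) zero [%llu,%llu)\n",
		       (unsigned long long) offset, (unsigned long long) block_start,
		       (unsigned long long) block_start, (unsigned long long) block_end,
		       (unsigned long long) block_end, (unsigned long long) end);
		return 0;
	}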
-err: - return ret; -} - -static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, - loff_t offset, loff_t len, - bool insert) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; - s64 i_sectors_delta = 0; - int ret = 0; - - if ((offset | len) & (block_bytes(c) - 1)) - return -EINVAL; - - if (insert) { - if (offset >= inode->v.i_size) - return -EINVAL; - } else { - if (offset + len >= inode->v.i_size) - return -EINVAL; - } - - ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); - if (ret) - return ret; - - if (insert) - i_size_write(&inode->v, inode->v.i_size + len); - - ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, - insert, &i_sectors_delta); - if (!ret && !insert) - i_size_write(&inode->v, inode->v.i_size - len); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - - return ret; -} - -static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, - u64 start_sector, u64 end_sector) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts; - int ret = 0; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inode->v.i_ino, start_sector), - BTREE_ITER_slots|BTREE_ITER_intent); - - while (!ret) { - s64 i_sectors_delta = 0; - struct quota_res quota_res = { 0 }; - struct bkey_s_c k; - unsigned sectors; - bool is_allocation; - u64 hole_start, hole_end; - u32 snapshot; - - bch2_trans_begin(trans); - - if (bkey_ge(iter.pos, end_pos)) - break; - - ret = bch2_subvolume_get_snapshot(trans, - inode->ei_inum.subvol, &snapshot); - if (ret) - goto bkey_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_slot(trans, &iter); - if ((ret = bkey_err(k))) - goto bkey_err; - - hole_start = iter.pos.offset; - hole_end = bpos_min(k.k->p, end_pos).offset; - is_allocation = bkey_extent_is_allocation(k.k); - - /* already reserved */ - if (bkey_extent_is_reservation(k) && - bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (bkey_extent_is_data(k.k) && - !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(trans, &iter); - continue; - } - - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - /* - * Lock ordering - can't be holding btree locks while - * blocking on a folio lock: - */ - if (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, true)) { - ret = drop_locks_do(trans, - (bch2_clamp_data_hole(&inode->v, - &hole_start, - &hole_end, - opts.data_replicas, false), 0)); - if (ret) - goto bkey_err; - } - bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); - - if (ret) - goto bkey_err; - - if (hole_start == hole_end) - continue; - } - - sectors = hole_end - hole_start; - - if (!is_allocation) { - ret = bch2_quota_reservation_add(c, inode, - &quota_res, sectors, true); - if (unlikely(ret)) - goto bkey_err; - } - - ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, - sectors, opts, &i_sectors_delta, - writepoint_hashed((unsigned long) current)); - if (ret) - goto bkey_err; - - bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - - if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) { - ret = drop_locks_do(trans, - bch2_mark_pagecache_reserved(inode, &hole_start, -
iter.pos.offset, false)); - if (ret) - goto bkey_err; - } -bkey_err: - bch2_quota_reservation_put(c, inode, &quota_res); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - - bch2_fpunch_at(trans, &iter, inode_inum(inode), - end_sector, &i_sectors_delta); - bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); - bch2_quota_reservation_put(c, inode, &quota_res); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 end = offset + len; - u64 block_start = round_down(offset, block_bytes(c)); - u64 block_end = round_up(end, block_bytes(c)); - bool truncated_last_page = false; - int ret, ret2 = 0; - - if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, end); - if (ret) - return ret; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_folios(inode, offset, end); - if (unlikely(ret < 0)) - return ret; - - truncated_last_page = ret; - - truncate_pagecache_range(&inode->v, offset, end - 1); - - block_start = round_up(offset, block_bytes(c)); - block_end = round_down(end, block_bytes(c)); - } - - ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - - /* - * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, - * so that the VFS cache i_size is consistent with the btree i_size: - */ - if (ret && - !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) - return ret; - - if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) - end = inode->v.i_size; - - if (end >= inode->v.i_size && - (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || - !(mode & FALLOC_FL_KEEP_SIZE))) { - spin_lock(&inode->v.i_lock); - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - mutex_lock(&inode->ei_update_lock); - ret2 = bch2_write_inode_size(c, inode, end, 0); - mutex_unlock(&inode->ei_update_lock); - } - - return ret ?: ret2; -} - -long bch2_fallocate_dispatch(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) - return -EROFS; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(inode); - - ret = file_modified(file); - if (ret) - goto err; - - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - ret = bchfs_fallocate(inode, mode, offset, len); - else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - ret = bchfs_fpunch(inode, offset, len); - else if (mode == FALLOC_FL_INSERT_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, true); - else if (mode == FALLOC_FL_COLLAPSE_RANGE) - ret = bchfs_fcollapse_finsert(inode, offset, len, false); - else - ret = -EOPNOTSUPP; -err: - bch2_pagecache_block_put(inode); - inode_unlock(&inode->v); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); - - return bch2_err_class(ret); -} - -/* - * Take a quota reservation for unallocated blocks in a given file range - * Does not check pagecache - */ -static int quota_reserve_range(struct bch_inode_info *inode, - struct quota_res *res, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 sectors =
end - start; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, - BTREE_ID_extents, - POS(inode->v.i_ino, start), - POS(inode->v.i_ino, end - 1), - inode->ei_inum.subvol, 0, k, ({ - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - - 0; - }))); - - return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); -} - -loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - loff_t len, unsigned remap_flags) -{ - struct bch_inode_info *src = file_bch_inode(file_src); - struct bch_inode_info *dst = file_bch_inode(file_dst); - struct bch_fs *c = src->v.i_sb->s_fs_info; - struct quota_res quota_res = { 0 }; - s64 i_sectors_delta = 0; - u64 aligned_len; - loff_t ret = 0; - - if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) - return -EINVAL; - - if ((pos_src & (block_bytes(c) - 1)) || - (pos_dst & (block_bytes(c) - 1))) - return -EINVAL; - - if (src == dst && - abs(pos_src - pos_dst) < len) - return -EINVAL; - - lock_two_nondirectories(&src->v, &dst->v); - bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - - inode_dio_wait(&src->v); - inode_dio_wait(&dst->v); - - ret = generic_remap_file_range_prep(file_src, pos_src, - file_dst, pos_dst, - &len, remap_flags); - if (ret < 0 || len == 0) - goto err; - - aligned_len = round_up((u64) len, block_bytes(c)); - - ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + len - 1); - if (ret) - goto err; - - ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9, - (pos_dst + aligned_len) >> 9); - if (ret) - goto err; - - if (!(remap_flags & REMAP_FILE_DEDUP)) - file_update_time(file_dst); - - bch2_mark_pagecache_unallocated(src, pos_src >> 9, - (pos_src + aligned_len) >> 9); - - /* - * XXX: we'd like to be telling bch2_remap_range() if we have - * permission to write to the source file, and thus if io path option - * changes should be propagated through the copy, but we need mnt_idmap - * from the pathwalk, awkward - */ - ret = bch2_remap_range(c, - inode_inum(dst), pos_dst >> 9, - inode_inum(src), pos_src >> 9, - aligned_len >> 9, - pos_dst + len, &i_sectors_delta, - false); - if (ret < 0) - goto err; - - /* - * due to alignment, we might have remapped slightly more than requested - */ - ret = min((u64) ret << 9, (u64) len); - - bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta); - - spin_lock(&dst->v.i_lock); - if (pos_dst + ret > dst->v.i_size) - i_size_write(&dst->v, pos_dst + ret); - spin_unlock(&dst->v.i_lock); - - if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) - ret = bch2_flush_inode(c, dst); -err: - bch2_quota_reservation_put(c, dst, &quota_res); - bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); - unlock_two_nondirectories(&src->v, &dst->v); - - return bch2_err_class(ret); -} - -/* fseek: */ - -static loff_t bch2_seek_data(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_data = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, 0, k, ({ - if (bkey_extent_is_data(k.k)) { - next_data = max(offset,
bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - 0; - }))); - if (ret) - return ret; - - if (next_data > offset) - next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data, 0, false); - - if (next_data >= isize) - return -ENXIO; - - return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -} - -static loff_t bch2_seek_hole(struct file *file, u64 offset) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum inum = inode_inum(inode); - u64 isize, next_hole = MAX_LFS_FILESIZE; - - isize = i_size_read(&inode->v); - if (offset >= isize) - return -ENXIO; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), - POS(inode->v.i_ino, U64_MAX), - inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino || - !bkey_extent_is_data(k.k)) { - loff_t start_offset = k.k->p.inode == inode->v.i_ino - ? max(offset, bkey_start_offset(k.k) << 9) - : offset; - loff_t end_offset = k.k->p.inode == inode->v.i_ino - ? MAX_LFS_FILESIZE - : k.k->p.offset << 9; - - /* - * Found a hole in the btree, now make sure it's - * a hole in the pagecache. We might have to - * keep searching if this hole is entirely dirty - * in the page cache: - */ - bch2_trans_unlock(trans); - loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, - start_offset, end_offset, 0, false); - if (pagecache_hole < end_offset) { - next_hole = pagecache_hole; - break; - } - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - 0; - }))); - if (ret) - return ret; - - if (next_hole > isize) - next_hole = isize; - - return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -} - -loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: - ret = generic_file_llseek(file, offset, whence); - break; - case SEEK_DATA: - ret = bch2_seek_data(file, offset); - break; - case SEEK_HOLE: - ret = bch2_seek_hole(file, offset); - break; - default: - ret = -EINVAL; - break; - } - - return bch2_err_class(ret); -} - -void bch2_fs_fsio_exit(struct bch_fs *c) -{ - bioset_exit(&c->nocow_flush_bioset); -} - -int bch2_fs_fsio_init(struct bch_fs *c) -{ - if (bioset_init(&c->nocow_flush_bioset, - 1, offsetof(struct nocow_flush, bio), 0)) - return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; - - return 0; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h deleted file mode 100644 index ca70346e68dc3d..00000000000000 --- a/fs/bcachefs/fs-io.h +++ /dev/null @@ -1,184 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IO_H -#define _BCACHEFS_FS_IO_H - -#ifndef NO_BCACHEFS_FS - -#include "buckets.h" -#include "fs.h" -#include "io_write_types.h" -#include "quota.h" - -#include - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define 
__bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -struct quota_res { - u64 sectors; -}; - -#ifdef CONFIG_BCACHEFS_QUOTA - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors > inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static inline void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static inline int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, - struct quota_res *, s64); - -static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -void bch2_inode_flush_nocow_writes_async(struct bch_fs *, - struct bch_inode_info *, struct closure *); - -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); - -int bch2_fsync(struct file *, loff_t, loff_t, int); - -int bchfs_truncate(struct mnt_idmap *, - struct bch_inode_info *, struct iattr *); -long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); - -loff_t bch2_remap_file_range(struct 
file *, loff_t, struct file *, - loff_t, loff_t, unsigned); - -loff_t bch2_llseek(struct file *, loff_t, int); - -void bch2_fs_fsio_exit(struct bch_fs *); -int bch2_fs_fsio_init(struct bch_fs *); -#else -static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -#endif - -#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c deleted file mode 100644 index 4e72e654da9666..00000000000000 --- a/fs/bcachefs/fs-ioctl.c +++ /dev/null @@ -1,442 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "chardev.h" -#include "dirent.h" -#include "fs.h" -#include "fs-ioctl.h" -#include "namei.h" -#include "quota.h" - -#include -#include -#include -#include -#include -#include - -#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -static int bch2_reinherit_attrs_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - - return !bch2_reinherit_attrs(bi, &dir->ei_inode); -} - -static int bch2_ioc_reinherit_attrs(struct bch_fs *c, - struct file *file, - struct bch_inode_info *src, - const char __user *name) -{ - struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); - struct bch_inode_info *dst; - struct inode *vinode = NULL; - char *kname = NULL; - struct qstr qstr; - int ret = 0; - subvol_inum inum; - - kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); - if (!kname) - return -ENOMEM; - - ret = strncpy_from_user(kname, name, BCH_NAME_MAX); - if (unlikely(ret < 0)) - goto err1; - - qstr.len = ret; - qstr.name = kname; - - ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); - if (ret) - goto err1; - - vinode = bch2_vfs_inode_get(c, inum); - ret = PTR_ERR_OR_ZERO(vinode); - if (ret) - goto err1; - - dst = to_bch_ei(vinode); - - ret = mnt_want_write_file(file); - if (ret) - goto err2; - - bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); - - if (inode_attr_changing(src, dst, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst, - src->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err3; - } - - ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -err3: - bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); - - /* return true if we did work */ - if (ret >= 0) - ret = !ret; - - mnt_drop_write_file(file); -err2: - iput(vinode); -err1: - kfree(kname); - - return ret; -} - -static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg) -{ - return put_user(inode->v.i_generation, arg); -} - -static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) -{ - int ret; - size_t len; - char label[BCH_SB_LABEL_SIZE]; - - BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); - - mutex_lock(&c->sb_lock); - memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - - len = strnlen(label, BCH_SB_LABEL_SIZE); - if (len == BCH_SB_LABEL_SIZE) { - bch_warn(c, - "label is too long, return the first %zu bytes", - --len); - } - - ret = copy_to_user(user_label, label, len); - - return ret ? 
-EFAULT : 0; -} - -static int bch2_ioc_setlabel(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - const char __user *user_label) -{ - int ret; - char label[BCH_SB_LABEL_SIZE]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(label, user_label, sizeof(label))) - return -EFAULT; - - if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) { - bch_err(c, - "unable to set label with more than %d bytes", - BCH_SB_LABEL_SIZE - 1); - return -EINVAL; - } - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - mutex_lock(&c->sb_lock); - strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - mnt_drop_write_file(file); - return ret; -} - -static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) -{ - u32 flags; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(flags, arg)) - return -EFAULT; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by ioctl type %u", flags); - - switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: - ret = bdev_freeze(c->vfs_sb->s_bdev); - if (ret) - break; - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - bdev_thaw(c->vfs_sb->s_bdev); - break; - case FSOP_GOING_FLAGS_LOGFLUSH: - bch2_journal_flush(&c->journal); - fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only2(c, &buf); - break; - default: - ret = -EINVAL; - goto noprint; - } - - bch2_print_str(c, KERN_ERR, buf.buf); -noprint: - printbuf_exit(&buf); - return ret; -} - -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - struct inode *dir; - struct bch_inode_info *inode; - struct user_namespace *s_user_ns; - struct dentry *dst_dentry; - struct path src_path, dst_path; - int how = LOOKUP_FOLLOW; - int error; - subvol_inum snapshot_src = { 0 }; - unsigned lookup_flags = 0; - unsigned create_flags = BCH_CREATE_SUBVOL; - - if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| - BCH_SUBVOL_SNAPSHOT_RO)) - return -EINVAL; - - if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - (arg.src_ptr || - (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) - return -EINVAL; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) - create_flags |= BCH_CREATE_SNAPSHOT; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) - create_flags |= BCH_CREATE_SNAPSHOT_RO; - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { - /* sync_inodes_sb enforce s_umount is locked */ - down_read(&c->vfs_sb->s_umount); - sync_inodes_sb(c->vfs_sb); - up_read(&c->vfs_sb->s_umount); - } - - if (arg.src_ptr) { - error = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.src_ptr, - how, &src_path); - if (error) - goto err1; - - if (src_path.dentry->d_sb->s_fs_info != c) { - path_put(&src_path); - error = -EXDEV; - goto err1; - } - - snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); - } - - dst_dentry = user_path_create(arg.dirfd, - (const char __user *)(unsigned long)arg.dst_ptr, - &dst_path, lookup_flags); - error = PTR_ERR_OR_ZERO(dst_dentry); - if (error) - goto err2; - - if (dst_dentry->d_sb->s_fs_info != c) { - error = -EXDEV; - goto err3; - } - - if (dst_dentry->d_inode) { - error = bch_err_throw(c, EEXIST_subvolume_create); - goto err3; - } - - dir = dst_path.dentry->d_inode; - if (IS_DEADDIR(dir)) { - error = bch_err_throw(c, ENOENT_directory_dead); - goto err3; - } - - s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, 
current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) { - error = -EOVERFLOW; - goto err3; - } - - error = inode_permission(file_mnt_idmap(filp), - dir, MAY_WRITE | MAY_EXEC); - if (error) - goto err3; - - if (!IS_POSIXACL(dir)) - arg.mode &= ~current_umask(); - - error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); - if (error) - goto err3; - - if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - !arg.src_ptr) - snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; - - down_write(&c->snapshot_create_lock); - inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), - dst_dentry, arg.mode|S_IFDIR, - 0, snapshot_src, create_flags); - up_write(&c->snapshot_create_lock); - - error = PTR_ERR_OR_ZERO(inode); - if (error) - goto err3; - - d_instantiate(dst_dentry, &inode->v); - fsnotify_mkdir(dir, dst_dentry); -err3: - done_path_create(&dst_path, dst_dentry); -err2: - if (arg.src_ptr) - path_put(&src_path); -err1: - return error; -} - -static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - const char __user *name = (void __user *)(unsigned long)arg.dst_ptr; - struct path path; - struct inode *dir; - struct dentry *victim; - int ret = 0; - - if (arg.flags) - return -EINVAL; - - victim = user_path_locked_at(arg.dirfd, name, &path); - if (IS_ERR(victim)) - return PTR_ERR(victim); - - dir = d_inode(path.dentry); - if (victim->d_sb->s_fs_info != c) { - ret = -EXDEV; - goto err; - } - - ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: - __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); - d_invalidate(victim); - } -err: - inode_unlock(dir); - dput(victim); - path_put(&path); - return ret; -} - -long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - long ret; - - switch (cmd) { - case BCHFS_IOC_REINHERIT_ATTRS: - ret = bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); - break; - - case FS_IOC_GETVERSION: - ret = bch2_ioc_getversion(inode, (u32 __user *) arg); - break; - - case FS_IOC_SETVERSION: - ret = -ENOTTY; - break; - - case FS_IOC_GETFSLABEL: - ret = bch2_ioc_getlabel(c, (void __user *) arg); - break; - - case FS_IOC_SETFSLABEL: - ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg); - break; - - case FS_IOC_GOINGDOWN: - ret = bch2_ioc_goingdown(c, (u32 __user *) arg); - break; - - case BCH_IOCTL_SUBVOLUME_CREATE: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? -EFAULT - : bch2_ioctl_subvolume_create(c, file, i); - break; - } - - case BCH_IOCTL_SUBVOLUME_DESTROY: { - struct bch_ioctl_subvolume i; - - ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) - ? 
-EFAULT - : bch2_ioctl_subvolume_destroy(c, file, i); - break; - } - - default: - ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); - break; - } - - return bch2_err_class(ret); -} - -#ifdef CONFIG_COMPAT -long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; - case FS_IOC32_GETVERSION: - cmd = FS_IOC_GETVERSION; - break; - case FS_IOC_GETFSLABEL: - case FS_IOC_SETFSLABEL: - break; - default: - return -ENOIOCTLCMD; - } - return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h deleted file mode 100644 index a657e4994b7153..00000000000000 --- a/fs/bcachefs/fs-ioctl.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_IOCTL_H -#define _BCACHEFS_FS_IOCTL_H - -long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); - -#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c deleted file mode 100644 index 687af0eea0c2b4..00000000000000 --- a/fs/bcachefs/fs.c +++ /dev/null @@ -1,2768 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "chardev.h" -#include "dirent.h" -#include "errcode.h" -#include "extents.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-ioctl.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fs-io-pagecache.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "journal.h" -#include "keylist.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "snapshot.h" -#include "super.h" -#include "xattr.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *bch2_inode_cache; - -static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, - struct bch_inode_info *, - struct bch_inode_unpacked *, - struct bch_subvolume *); - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) -{ - static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, - }; - - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); - - if (bch2_inode_casefold(c, &inode->ei_inode)) - inode->v.i_flags |= S_CASEFOLD; - else - inode->v.i_flags &= ~S_CASEFOLD; -} - -void bch2_inode_update_after_write(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - unsigned fields) -{ - struct bch_fs *c = trans->c; - - BUG_ON(bi->bi_inum != inode->v.i_ino); - - bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); - - set_nlink(&inode->v, bch2_inode_nlink_get(bi)); - i_uid_write(&inode->v, bi->bi_uid); - i_gid_write(&inode->v, bi->bi_gid); - inode->v.i_mode = bi->bi_mode; - - if (fields & ATTR_SIZE) - i_size_write(&inode->v, bi->bi_size); - - if (fields & ATTR_ATIME) - 
inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); - if (fields & ATTR_MTIME) - inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); - if (fields & ATTR_CTIME) - inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); - - inode->ei_inode = *bi; - - bch2_inode_flags_to_vfs(c, inode); -} - -int __must_check bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); - if (ret) - goto err; - - struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); - - ret = (set ? set(trans, inode, &inode_u, p) : 0); - if (ret) - goto err; - - struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); - bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - - if (rebalance_changed) { - ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); - if (ret) - goto err; - } - - ret = bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* - * the btree node lock protects inode->ei_inode, not ei_update_lock; - * this is important for inode updates via bchfs_write_index_update - */ - if (!ret) - bch2_inode_update_after_write(trans, inode, &inode_u, fields); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (rebalance_changed) - bch2_rebalance_wakeup(c); - - bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %llu:%llu not found when updating", - bch2_err_str(ret), - inode_inum(inode).subvol, - inode_inum(inode).inum); - - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; -} - -int bch2_fs_quota_transfer(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_qid new_qid, - unsigned qtypes, - enum quota_acct_mode mode) -{ - unsigned i; - int ret; - - qtypes &= enabled_qtypes(c); - - for (i = 0; i < QTYP_NR; i++) - if (new_qid.q[i] == inode->ei_qid.q[i]) - qtypes &= ~(1U << i); - - if (!qtypes) - return 0; - - mutex_lock(&inode->ei_quota_lock); - - ret = bch2_quota_transfer(c, qtypes, new_qid, - inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved, - mode); - if (!ret) - for (i = 0; i < QTYP_NR; i++) - if (qtypes & (1 << i)) - inode->ei_qid.q[i] = new_qid.q[i]; - - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) -{ - const subvol_inum *inum = data; - siphash_key_t k = { .key[0] = seed }; - - return siphash_2u64(inum->subvol, inum->inum, &k); -} - -static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) -{ - const struct bch_inode_info *inode = data; - - return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); -} - -static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct bch_inode_info *inode = obj; - const subvol_inum *v = arg->key; - - return !subvol_inum_eq(inode->ei_inum, *v); -} - -static const struct rhashtable_params bch2_vfs_inodes_params = { - .head_offset = offsetof(struct bch_inode_info, hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum), - .key_len = sizeof(subvol_inum), - .hashfn = bch2_vfs_inode_hash_fn, - .obj_hashfn = bch2_vfs_inode_obj_hash_fn, - .obj_cmpfn = bch2_vfs_inode_cmp_fn, - .automatic_shrinking = true, -}; - -static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { - .head_offset = offsetof(struct bch_inode_info, by_inum_hash), - .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), - .key_len = sizeof(u64), - .automatic_shrinking = true, -}; - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) -{ - struct bch_fs *c = trans->c; - struct rhltable *ht = &c->vfs_inodes_by_inum_table; - u64 inum = p.offset; - DARRAY(u32) subvols; - int ret = 0; - - if (!test_bit(BCH_FS_started, &c->flags)) - return false; - - darray_init(&subvols); -restart_from_top: - - /* - * Tweaked version of __rhashtable_lookup(); we need to get a list of - * subvolumes in which the given inode number is open. - * - * For this to work, we don't include the subvolume ID in the key that - * we hash - all inodes with the same inode number regardless of - * subvolume will hash to the same slot. - * - * This will be less than ideal if the same file is ever open - * simultaneously in many different snapshots: - */ - rcu_read_lock(); - struct rhash_lock_head __rcu *const *bkt; - struct rhash_head *he; - unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); -restart: - hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); - bkt = rht_bucket(tbl, hash); - do { - struct bch_inode_info *inode; - - rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum) { - ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, - GFP_NOWAIT|__GFP_NOWARN); - if (ret) { - rcu_read_unlock(); - ret = darray_make_room(&subvols, 1); - if (ret) - goto err; - subvols.nr = 0; - goto restart_from_top; - } - } - } - /* An object might have been moved to a different hash chain, - * while we walk along it - better check and retry. 
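 *
 * As an illustrative aside (not from this patch, assuming <stdint.h>
 * and <stddef.h>), the trick of hashing on a *partial* key, so that a
 * single walk yields every subvolume a given inode number is open in,
 * can be modelled in plain C with a chained table keyed on the inum
 * alone; all names below are invented:
 *
 *	struct toy_ent { uint64_t inum; uint32_t subvol; struct toy_ent *next; };
 *	static struct toy_ent *tbl[256];
 *
 *	static size_t toy_collect(uint64_t inum, uint32_t *out)
 *	{
 *		size_t nr = 0;
 *		// the hash ignores subvol, so every (inum, *) entry lands
 *		// on the same chain and one walk finds them all
 *		for (struct toy_ent *e = tbl[inum & 255]; e; e = e->next)
 *			if (e->inum == inum)
 *				out[nr++] = e->subvol;
 *		return nr;
 *	}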
- */ - } while (he != RHT_NULLS_MARKER(bkt)); - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); - if (unlikely(tbl)) - goto restart; - rcu_read_unlock(); - - darray_for_each(subvols, i) { - u32 snap; - ret = bch2_subvolume_get_snapshot(trans, *i, &snap); - if (ret) - goto err; - - ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); - if (ret) - break; - } -err: - darray_exit(&subvols); - return ret; -} - -static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); -} - -static void __wait_on_freeing_inode(struct bch_fs *c, - struct bch_inode_info *inode, - subvol_inum inum) -{ - wait_queue_head_t *wq; - struct wait_bit_queue_entry wait; - - wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->v.i_lock); - - if (__bch2_inode_hash_find(c, inum) == inode) - schedule_timeout(HZ * 10); - finish_wait(wq, &wait.wq_entry); -} - -static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, - subvol_inum inum) -{ - struct bch_inode_info *inode; -repeat: - inode = __bch2_inode_hash_find(c, inum); - if (inode) { - spin_lock(&inode->v.i_lock); - if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { - spin_unlock(&inode->v.i_lock); - return NULL; - } - if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { - if (!trans) { - __wait_on_freeing_inode(c, inode, inum); - } else { - int ret = drop_locks_do(trans, - (__wait_on_freeing_inode(c, inode, inum), 0)); - if (ret) - return ERR_PTR(ret); - } - goto repeat; - } - __iget(&inode->v); - spin_unlock(&inode->v.i_lock); - } - - return inode; -} - -static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) -{ - spin_lock(&inode->v.i_lock); - bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); - spin_unlock(&inode->v.i_lock); - - if (remove) { - int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - ret = rhashtable_remove_fast(&c->vfs_inodes_table, - &inode->hash, bch2_vfs_inodes_params); - BUG_ON(ret); - inode->v.i_hash.pprev = NULL; - /* - * This pairs with the bch2_inode_hash_find() -> - * __wait_on_freeing_inode() path - */ - inode_wake_up_bit(&inode->v, __I_NEW); - } -} - -static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, - struct btree_trans *trans, - struct bch_inode_info *inode) -{ - struct bch_inode_info *old = inode; - - set_bit(EI_INODE_HASHED, &inode->ei_flags); -retry: - if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, - &inode->ei_inum, - &inode->hash, - bch2_vfs_inodes_params))) { - old = bch2_inode_hash_find(c, trans, inode->ei_inum); - if (!old) - goto retry; - - clear_bit(EI_INODE_HASHED, &inode->ei_flags); - - /* - * bcachefs doesn't use I_NEW; we have no use for it since we - * only insert fully created inodes in the inode hash table. But - * discard_new_inode() expects it to be set... - */ - inode->v.i_state |= I_NEW; - /* - * We don't want bch2_evict_inode() to delete the inode on disk, - * we just raced and had another inode in cache. Normally new - * inodes don't have nlink == 0 - except tmpfiles do... 
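 *
 * (Illustrative aside, not part of this patch: the zero link count of
 * a tmpfile is visible from plain userspace C - "/mnt" below is just a
 * placeholder mount point:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <sys/stat.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/mnt", O_TMPFILE | O_RDWR, 0600);
 *		struct stat st;
 *
 *		if (fd >= 0 && !fstat(fd, &st))
 *			printf("nlink = %ju\n", (uintmax_t) st.st_nlink);  // prints 0
 *		return 0;
 *	}
 *
 * hence the explicit set_nlink() below before discarding.)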
- */ - set_nlink(&inode->v, 1); - discard_new_inode(&inode->v); - return old; - } else { - int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, - &inode->by_inum_hash, - bch2_vfs_inodes_by_inum_params); - BUG_ON(ret); - - inode_fake_hash(&inode->v); - - inode_sb_list_add(&inode->v); - - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - return inode; - } -} - -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - BUG(); -} - -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) -{ - struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, - bch2_inode_cache, gfp); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - inode->ei_flags = 0; - mutex_init(&inode->ei_quota_lock); - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { - kmem_cache_free(bch2_inode_cache, inode); - return NULL; - } - - return inode; -} - -/* - * Allocate a new inode, dropping/retaking btree locks if necessary: - */ -static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) -{ - struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); - - if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); - if (ret && inode) { - __destroy_inode(&inode->v); - kmem_cache_free(bch2_inode_cache, inode); - } - if (ret) - return ERR_PTR(ret); - } - - return inode; -} - -static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - struct bch_inode_info *inode = bch2_new_inode(trans); - if (IS_ERR(inode)) - return inode; - - bch2_vfs_inode_init(trans, inum, inode, bi, subvol); - - return bch2_inode_hash_insert(trans->c, trans, inode); - -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) -{ - struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); - if (inode) - return &inode->v; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - bch2_trans_put(trans); - - return ret ? 
ERR_PTR(ret) : &inode->v; -} - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *idmap, - struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode; - struct bch_inode_unpacked inode_u; - struct posix_acl *default_acl = NULL, *acl = NULL; - subvol_inum inum; - struct bch_subvolume subvol; - u64 journal_seq = 0; - kuid_t kuid; - kgid_t kgid; - int ret; - - /* - * preallocate acls + vfs inode before btree transaction, so that - * nothing can fail after the transaction succeeds: - */ -#ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); - if (ret) - return ERR_PTR(ret); -#endif - inode = __bch2_new_inode(c, GFP_NOFS); - if (unlikely(!inode)) { - inode = ERR_PTR(-ENOMEM); - goto err; - } - - bch2_inode_init_early(c, &inode_u); - - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_lock(&dir->ei_update_lock); - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); - kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: - bch2_create_trans(trans, - inode_inum(dir), &dir_u, &inode_u, - !(flags & BCH_CREATE_TMPFILE) - ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), kuid), - from_kgid(i_user_ns(&dir->v), kgid), - mode, rdev, - default_acl, acl, snapshot_src, flags) ?: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_before_quota; - - inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; - inum.inum = inode_u.bi_inum; - - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_trans_commit(trans, NULL, &journal_seq, 0); - if (unlikely(ret)) { - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, - KEY_TYPE_QUOTA_WARN); -err_before_quota: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err_trans; - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - mutex_unlock(&dir->ei_update_lock); - } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); - set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); - - /* - * we must insert the new inode into the inode cache before calling - * bch2_trans_exit() and dropping locks, else we could race with another - * thread pulling the inode in and modifying it: - * - * also, calling bch2_inode_hash_insert() without passing in the - * transaction object is sketchy - if we could ever end up in - * __wait_on_freeing_inode(), we'd risk deadlock. - * - * But that shouldn't be possible, since we still have the inode locked - * that we just created, and we _really_ can't take a transaction - * restart here. 
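 *
 * (Sketch of the broader pattern in play, with invented names - every
 * allocation that can fail happens before the commit point, so the
 * post-commit path is infallible:
 *
 *	struct obj *o = alloc_obj();    // fallible: do it up front
 *	if (!o)
 *		return -ENOMEM;
 *	int ret = commit_txn(&txn);     // fallible: nothing published yet
 *	if (ret) {
 *		free_obj(o);
 *		return ret;
 *	}
 *	publish(o);                     // past here, failure is not an option
 *	return 0;
 *
 * which is why the ACLs and the VFS inode were preallocated at the top
 * of this function.)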
- */ - inode = bch2_inode_hash_insert(c, NULL, inode); - bch2_trans_put(trans); -err: - posix_acl_release(default_acl); - posix_acl_release(acl); - return inode; -err_trans: - if (!(flags & BCH_CREATE_TMPFILE)) - mutex_unlock(&dir->ei_update_lock); - - bch2_trans_put(trans); - make_bad_inode(&inode->v); - iput(&inode->v); - inode = ERR_PTR(ret); - goto err; -} - -/* methods */ - -static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_hash_info *dir_hash_info, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - subvol_inum inum = {}; - struct printbuf buf = PRINTBUF; - - struct qstr lookup_name; - int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); - if (ret) - return ERR_PTR(ret); - - struct btree_iter dirent_iter = {}; - struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, &lookup_name, 0); - ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, &inum); - if (ret > 0) - ret = -ENOENT; - if (ret) - goto err; - - struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); - if (inode) - goto out; - - /* - * Note: if check/repair needs it, we commit before - * bch2_inode_hash_init_insert(), as after that point we can't take a - * restart - not in the top level loop with a commit_do(), like we - * usually do: - */ - - struct bch_subvolume subvol; - struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); - - /* - * don't remove it: check_inodes might find another inode that points - * back to this dirent - */ - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - c, "dirent to missing inode:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); - if (ret) - goto err; -out: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - return inode; -err: - inode = ERR_PTR(ret); - goto out; -} - -static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, - unsigned int flags) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - - struct bch_inode_info *inode; - bch2_trans_do(c, - PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), - &hash, &dentry->d_name))); - if (IS_ERR(inode)) - inode = NULL; - - if (!inode && IS_CASEFOLDED(vdir)) { - /* - * Do not cache a negative dentry in casefolded directories - * as it would need to be invalidated in the following situation: - * - Lookup file "blAH" in a casefolded directory - * - Creation of file "BLAH" in a casefolded directory - * - Lookup file "blAH" in a casefolded directory - * which would fail if we had a negative dentry. - * - * We should come back to this when VFS has a method to handle - * this edgecase. 
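 *
 * (Illustrative aside, not from this patch - the sequence above, seen
 * from userspace, assuming "dir" already has the casefold flag set,
 * e.g. via chattr +F:
 *
 *	open("dir/blAH", O_RDONLY);     // fails, ENOENT
 *	creat("dir/BLAH", 0644);        // create under a different case
 *	open("dir/blAH", O_RDONLY);     // must now succeed
 *
 * a cached negative dentry from the first lookup would wrongly fail
 * the third call.)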
- */ - return NULL; - } - - return d_splice_alias(&inode->v, dentry); -} - -static int bch2_mknod(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, - (subvol_inum) { 0 }, 0); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_instantiate(dentry, &inode->v); - return 0; -} - -static int bch2_create(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - umode_t mode, bool excl) -{ - return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); -} - -static int __bch2_link(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_inode_info *dir, - struct dentry *dentry) -{ - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - mutex_lock(&inode->ei_update_lock); - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_link_trans(trans, - inode_inum(dir), &dir_u, - inode_inum(inode), &inode_u, - &dentry->d_name)); - - if (likely(!ret)) { - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); - } - - bch2_trans_put(trans); - mutex_unlock(&inode->ei_update_lock); - return ret; -} - -static int bch2_link(struct dentry *old_dentry, struct inode *vdir, - struct dentry *dentry) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - return bch2_err_class(ret); - - ihold(&inode->v); - d_instantiate(dentry, &inode->v); - return 0; -} - -int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - bool deleting_snapshot) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_inode_unpacked dir_u, inode_u; - int ret; - - bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - - struct btree_trans *trans = bch2_trans_get(c); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_unlink_trans(trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); - if (unlikely(ret)) - goto err; - - bch2_inode_update_after_write(trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - bch2_inode_update_after_write(trans, inode, &inode_u, - ATTR_MTIME); - - if (inode_u.bi_subvol) { - /* - * Subvolume deletion is asynchronous, but we still want to tell - * the VFS that it's been deleted here: - */ - set_nlink(&inode->v, 0); - } - - if (IS_CASEFOLDED(vdir)) - d_invalidate(dentry); -err: - bch2_trans_put(trans); - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); - - return ret; -} - -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -{ - struct bch_inode_info *dir= to_bch_ei(vdir); - struct bch_fs *c = dir->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: - __bch2_unlink(vdir, dentry, false); - return bch2_err_class(ret); -} - -static int bch2_symlink(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, - const char *symname) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir), *inode; - int ret; - - inode = 
__bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - inode_lock(&inode->v); - ret = page_symlink(&inode->v, symname, strlen(symname) + 1); - inode_unlock(&inode->v); - - if (unlikely(ret)) - goto err; - - ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); - if (unlikely(ret)) - goto err; - - ret = __bch2_link(c, inode, dir, dentry); - if (unlikely(ret)) - goto err; - - d_instantiate(dentry, &inode->v); - return 0; -err: - iput(&inode->v); - return bch2_err_class(ret); -} - -static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) -{ - return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); -} - -static int bch2_rename2(struct mnt_idmap *idmap, - struct inode *src_vdir, struct dentry *src_dentry, - struct inode *dst_vdir, struct dentry *dst_dentry, - unsigned flags) -{ - struct bch_fs *c = src_vdir->i_sb->s_fs_info; - struct bch_inode_info *src_dir = to_bch_ei(src_vdir); - struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); - struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); - struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); - struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; - struct btree_trans *trans; - enum bch_rename_mode mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME; - bool whiteout = !!(flags & RENAME_WHITEOUT); - int ret; - - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) - return -EINVAL; - - if (mode == BCH_RENAME_OVERWRITE) { - ret = filemap_write_and_wait_range(src_inode->v.i_mapping, - 0, LLONG_MAX); - if (ret) - return ret; - } - - bch2_lock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - trans = bch2_trans_get(c); - - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); - if (ret) - goto err_tx_restart; - - if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, src_inode, - dst_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, dst_inode, - src_dir->ei_qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - } -retry: - bch2_trans_begin(trans); - - ret = bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode); - if (unlikely(ret)) - goto err_tx_restart; - - if (whiteout) { - whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); - ret = PTR_ERR_OR_ZERO(whiteout_inode_u); - if (unlikely(ret)) - goto err_tx_restart; - bch2_inode_init_early(c, whiteout_inode_u); - - ret = bch2_create_trans(trans, - inode_inum(src_dir), &src_dir_u, - whiteout_inode_u, - &src_dentry->d_name, - from_kuid(i_user_ns(&src_dir->v), current_fsuid()), - from_kgid(i_user_ns(&src_dir->v), current_fsgid()), - S_IFCHR|WHITEOUT_MODE, 0, - NULL, NULL, (subvol_inum) { 0 }, 0) ?: - bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (unlikely(ret)) - goto err_tx_restart; - } - - ret = 
bch2_trans_commit(trans, NULL, NULL, 0); - if (unlikely(ret)) { -err_tx_restart: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - goto err; - } - - BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); - BUG_ON(dst_inode && - dst_inode->v.i_ino != dst_inode_u.bi_inum); - - bch2_inode_update_after_write(trans, src_dir, &src_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - if (src_dir != dst_dir) - bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, - ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); - - bch2_inode_update_after_write(trans, src_inode, &src_inode_u, - ATTR_CTIME); - - if (dst_inode) - bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, - ATTR_CTIME); -err: - bch2_trans_put(trans); - - bch2_fs_quota_transfer(c, src_inode, - bch_qid(&src_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - if (dst_inode) - bch2_fs_quota_transfer(c, dst_inode, - bch_qid(&dst_inode->ei_inode), - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_NOCHECK); - - bch2_unlock_inodes(INODE_UPDATE_LOCK, - src_dir, - dst_dir, - src_inode, - dst_inode); - - return bch2_err_class(ret); -} - -static void bch2_setattr_copy(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - unsigned int ia_valid = attr->ia_valid; - kuid_t kuid; - kgid_t kgid; - - if (ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); - } - if (ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); - } - - if (ia_valid & ATTR_SIZE) - bi->bi_size = attr->ia_size; - - if (ia_valid & ATTR_ATIME) - bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); - - if (ia_valid & ATTR_MODE) { - umode_t mode = attr->ia_mode; - kgid_t gid = ia_valid & ATTR_GID - ? 
kgid - : inode->v.i_gid; - - if (!in_group_or_capable(idmap, &inode->v, - make_vfsgid(idmap, i_user_ns(&inode->v), gid))) - mode &= ~S_ISGID; - bi->bi_mode = mode; - } -} - -int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct iattr *attr) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid; - struct btree_trans *trans; - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode_u; - struct posix_acl *acl = NULL; - kuid_t kuid; - kgid_t kgid; - int ret; - - mutex_lock(&inode->ei_update_lock); - - qid = inode->ei_qid; - - if (attr->ia_valid & ATTR_UID) { - kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); - } - - if (attr->ia_valid & ATTR_GID) { - kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); - } - - ret = bch2_fs_quota_transfer(c, inode, qid, ~0, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - goto err; - - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - kfree(acl); - acl = NULL; - - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent); - if (ret) - goto btree_err; - - bch2_setattr_copy(idmap, inode, &inode_u, attr); - - if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, - inode_u.bi_mode, &acl); - if (ret) - goto btree_err; - } - - ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_trans_iter_exit(trans, &inode_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (unlikely(ret)) - goto err_trans; - - bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); - - if (acl) - set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -err_trans: - bch2_trans_put(trans); -err: - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static int bch2_getattr(struct mnt_idmap *idmap, - const struct path *path, struct kstat *stat, - u32 request_mask, unsigned query_flags) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); - - stat->dev = inode->v.i_sb->s_dev; - stat->ino = inode->v.i_ino; - stat->mode = inode->v.i_mode; - stat->nlink = inode->v.i_nlink; - stat->uid = vfsuid_into_kuid(vfsuid); - stat->gid = vfsgid_into_kgid(vfsgid); - stat->rdev = inode->v.i_rdev; - stat->size = i_size_read(&inode->v); - stat->atime = inode_get_atime(&inode->v); - stat->mtime = inode_get_mtime(&inode->v); - stat->ctime = inode_get_ctime(&inode->v); - stat->blksize = block_bytes(c); - stat->blocks = inode->v.i_blocks; - - stat->subvol = inode->ei_inum.subvol; - stat->result_mask |= STATX_SUBVOL; - - if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { - stat->result_mask |= STATX_DIOALIGN; - /* - * this is incorrect; we should be tracking this in superblock, - * and checking the alignment of open devices - */ - stat->dio_mem_align = SECTOR_SIZE; - stat->dio_offset_align = block_bytes(c); - } - - if (request_mask & STATX_BTIME) { - stat->result_mask |= STATX_BTIME; - stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); - } - - if (inode->ei_inode.bi_flags & BCH_INODE_immutable) - stat->attributes |= STATX_ATTR_IMMUTABLE; - stat->attributes_mask |= 
STATX_ATTR_IMMUTABLE; - - if (inode->ei_inode.bi_flags & BCH_INODE_append) - stat->attributes |= STATX_ATTR_APPEND; - stat->attributes_mask |= STATX_ATTR_APPEND; - - if (inode->ei_inode.bi_flags & BCH_INODE_nodump) - stat->attributes |= STATX_ATTR_NODUMP; - stat->attributes_mask |= STATX_ATTR_NODUMP; - - return 0; -} - -static int bch2_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *iattr) -{ - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; - - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - setattr_prepare(idmap, dentry, iattr); - if (ret) - return ret; - - return iattr->ia_valid & ATTR_SIZE - ? bchfs_truncate(idmap, inode, iattr) - : bch2_setattr_nonsize(idmap, inode, iattr); -} - -static int bch2_tmpfile(struct mnt_idmap *idmap, - struct inode *vdir, struct file *file, umode_t mode) -{ - struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), - file->f_path.dentry, mode, 0, - (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); - - if (IS_ERR(inode)) - return bch2_err_class(PTR_ERR(inode)); - - d_mark_tmpfile(file, &inode->v); - d_instantiate(file->f_path.dentry, &inode->v); - return finish_open_simple(file, 0); -} - -struct bch_fiemap_extent { - struct bkey_buf kbuf; - unsigned flags; -}; - -static int bch2_fill_extent(struct bch_fs *c, - struct fiemap_extent_info *info, - struct bch_fiemap_extent *fe) -{ - struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); - unsigned flags = fe->flags; - - BUG_ON(!k.k->size); - - if (bkey_extent_is_direct_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int ret; - - if (k.k->type == KEY_TYPE_reflink_v) - flags |= FIEMAP_EXTENT_SHARED; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int flags2 = 0; - u64 offset = p.ptr.offset; - - if (p.ptr.unwritten) - flags2 |= FIEMAP_EXTENT_UNWRITTEN; - - if (p.crc.compression_type) - flags2 |= FIEMAP_EXTENT_ENCODED; - else - offset += p.crc.offset; - - if ((offset & (block_sectors(c) - 1)) || - (k.k->size & (block_sectors(c) - 1))) - flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; - - ret = fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - offset << 9, - k.k->size << 9, flags|flags2); - if (ret) - return ret; - } - - return 0; - } else if (bkey_extent_is_inline_data(k.k)) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DATA_INLINE); - } else if (k.k->type == KEY_TYPE_reservation) { - return fiemap_fill_next_extent(info, - bkey_start_offset(k.k) << 9, - 0, k.k->size << 9, - flags| - FIEMAP_EXTENT_DELALLOC| - FIEMAP_EXTENT_UNWRITTEN); - } else { - BUG(); - } -} - -/* - * Scan a range of an inode for data in pagecache. - * - * Intended to be retryable, so don't modify the output params until success is - * imminent. - */ -static int -bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, - bool nonblock) -{ - loff_t dstart, dend; - - dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); - if (dstart < 0) - return dstart; - - if (dstart == *end) { - *start = dstart; - return 0; - } - - dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); - if (dend < 0) - return dend; - - /* race */ - BUG_ON(dstart == dend); - - *start = dstart; - *end = dend; - return 0; -} - -/* - * Scan a range of pagecache that corresponds to a file mapping hole in the - * extent btree. 
If data is found, fake up an extent key so it looks like a - * delalloc extent to the rest of the fiemap processing code. - */ -static int -bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, - u64 start, u64 end, struct bch_fiemap_extent *cur) -{ - struct bch_fs *c = trans->c; - struct bkey_i_extent *delextent; - struct bch_extent_ptr ptr = {}; - loff_t dstart = start << 9, dend = end << 9; - int ret; - - /* - * We hold btree locks here so we cannot block on folio locks without - * dropping trans locks first. Run a nonblocking scan for the common - * case of no folios over holes and fall back on failure. - * - * Note that dropping locks like this is technically racy against - * writeback inserting to the extent tree, but a non-sync fiemap scan is - * fundamentally racy with writeback anyways. Therefore, just report the - * range as delalloc regardless of whether we have to cycle trans locks. - */ - ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); - if (ret == -EAGAIN) - ret = drop_locks_do(trans, - bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); - if (ret < 0) - return ret; - - /* - * Create a fake extent key in the buffer. We have to add a dummy extent - * pointer for the fill code to add an extent entry. It's explicitly - * zeroed to reflect delayed allocation (i.e. phys offset 0). - */ - bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); - delextent = bkey_extent_init(cur->kbuf.k); - delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); - delextent->k.size = (dend - dstart) >> 9; - bch2_bkey_append_ptr(&delextent->k_i, ptr); - - cur->flags = FIEMAP_EXTENT_DELALLOC; - - return 0; -} - -static int bch2_next_fiemap_extent(struct btree_trans *trans, - struct bch_inode_info *inode, - u64 start, u64 end, - struct bch_fiemap_extent *cur) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); - if (ret) - return ret; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->ei_inum.inum, start, snapshot), 0); - - struct bkey_s_c k = - bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); - ret = bkey_err(k); - if (ret) - goto err; - - u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; - - ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); - if (ret) - goto err; - - struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); - - /* - * Does the pagecache or the btree take precedence? - * - * It _should_ be the pagecache, so that we correctly report delalloc - * extents when dirty in the pagecache (we're COW, after all). - * - * But we'd have to add per-sector writeback tracking to - * bch_folio_state, otherwise we report delalloc extents for clean - * cached data in the pagecache. - * - * We should do this, but even then fiemap won't report stable mappings: - * on bcachefs data moves around in the background (copygc, rebalance) - * and we don't provide a way for userspace to lock that out. 
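 *
 * (Illustrative aside, not part of this patch: what all of this feeds
 * is the ordinary FS_IOC_FIEMAP ioctl, which can be exercised from
 * userspace - "file" is a placeholder path:
 *
 *	#include <linux/fiemap.h>
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("file", O_RDONLY);
 *		struct fiemap *fm = calloc(1, sizeof(*fm) +
 *					   16 * sizeof(struct fiemap_extent));
 *		if (fd < 0 || !fm)
 *			return 1;
 *		fm->fm_length = ~0ULL;          // map the whole file
 *		fm->fm_extent_count = 16;
 *		if (!ioctl(fd, FS_IOC_FIEMAP, fm))
 *			for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
 *				printf("%llu+%llu flags %x\n",
 *				       (unsigned long long) fm->fm_extents[i].fe_logical,
 *				       (unsigned long long) fm->fm_extents[i].fe_length,
 *				       fm->fm_extents[i].fe_flags);
 *		return 0;
 *	}
 *
 * extents produced by the delalloc path above carry
 * FIEMAP_EXTENT_DELALLOC in fe_flags.)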
- */ - if (k.k && - bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), - pagecache_start)) { - bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); - bch2_cut_front(iter.pos, cur->kbuf.k); - bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); - cur->flags = 0; - } else if (k.k) { - bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); - } - - if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { - unsigned sectors = cur->kbuf.k->k.size; - s64 offset_into_extent = 0; - enum btree_id data_btree = BTREE_ID_extents; - ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, - &cur->kbuf); - if (ret) - goto err; - - struct bkey_i *k = cur->kbuf.k; - sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); - - bch2_cut_front(POS(k->k.p.inode, - bkey_start_offset(&k->k) + offset_into_extent), - k); - bch2_key_resize(&k->k, sectors); - k->k.p = iter.pos; - k->k.p.offset += k->k.size; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - u64 start, u64 len) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans *trans; - struct bch_fiemap_extent cur, prev; - int ret = 0; - - ret = fiemap_prep(&ei->v, info, start, &len, 0); - if (ret) - return ret; - - if (start + len < start) - return -EINVAL; - - start >>= 9; - u64 end = (start + len) >> 9; - - bch2_bkey_buf_init(&cur.kbuf); - bch2_bkey_buf_init(&prev.kbuf); - bkey_init(&prev.kbuf.k->k); - - trans = bch2_trans_get(c); - - while (start < end) { - ret = lockrestart_do(trans, - bch2_next_fiemap_extent(trans, ei, start, end, &cur)); - if (ret) - goto err; - - BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); - BUG_ON(cur.kbuf.k->k.p.offset > end); - - if (bkey_start_offset(&cur.kbuf.k->k) == end) - break; - - start = cur.kbuf.k->k.p.offset; - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, &prev); - if (ret) - goto err; - } - - bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); - prev.flags = cur.flags; - } - - if (!bkey_deleted(&prev.kbuf.k->k)) { - bch2_trans_unlock(trans); - prev.flags |= FIEMAP_EXTENT_LAST; - ret = bch2_fill_extent(c, info, &prev); - } -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&cur.kbuf, c); - bch2_bkey_buf_exit(&prev.kbuf, c); - - return bch2_err_class(ret < 0 ? 
ret : 0); -} - -static const struct vm_operations_struct bch_vm_ops = { - .fault = bch2_page_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = bch2_page_mkwrite, -}; - -static int bch2_mmap_prepare(struct vm_area_desc *desc) -{ - file_accessed(desc->file); - - desc->vm_ops = &bch_vm_ops; - return 0; -} - -/* Directories: */ - -static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -{ - return generic_file_llseek_size(file, offset, whence, - S64_MAX, S64_MAX); -} - -static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - - if (!dir_emit_dots(file, ctx)) - return 0; - - int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); - - bch_err_fn(c, ret); - return bch2_err_class(ret); -} - -static int bch2_open(struct inode *vinode, struct file *file) -{ - if (file->f_flags & (O_WRONLY|O_RDWR)) { - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); - if (ret) - return ret; - } - - file->f_mode |= FMODE_CAN_ODIRECT; - - return generic_file_open(vinode, file); -} - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_sync] = FS_XFLAG_SYNC, - [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_append] = FS_XFLAG_APPEND, - [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, - [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, -}; - -static int bch2_fileattr_get(struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); - - if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) - fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; - - if (bch2_inode_casefold(c, &inode->ei_inode)) - fa->flags |= FS_CASEFOLD_FL; - - fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return 0; -} - -struct flags_set { - unsigned mask; - unsigned flags; - unsigned projid; - bool set_project; - bool set_casefold; - bool casefold; -}; - -static int fssetxattr_inode_update_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = trans->c; - struct flags_set *s = p; - - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not inode->i_flags: - */ - if (!S_ISREG(bi->bi_mode) && - !S_ISDIR(bi->bi_mode) && - (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) - return -EINVAL; - - if (s->casefold != bch2_inode_casefold(c, bi)) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); - if (ret) - return ret; - } - - if (s->set_project) { - bi->bi_project = s->projid; - bi->bi_fields_set |= BIT(Inode_opt_project); - } - - bi->bi_flags &= ~s->mask; - bi->bi_flags |= s->flags; - - bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); - return 0; -} - -static int 
bch2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, - struct file_kattr *fa) -{ - struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct flags_set s = {}; - int ret; - - if (fa->fsx_valid) { - fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - - s.mask = map_defined(bch_flags_to_xflags); - s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); - if (fa->fsx_xflags) - return -EOPNOTSUPP; - - if (fa->fsx_projid >= U32_MAX) - return -EINVAL; - - /* - * inode fields accessible via the xattr interface are stored with a +1 - * bias, so that 0 means unset: - */ - if ((inode->ei_inode.bi_project || - fa->fsx_projid) && - inode->ei_inode.bi_project != fa->fsx_projid + 1) { - s.projid = fa->fsx_projid + 1; - s.set_project = true; - } - } - - if (fa->flags_valid) { - s.mask = map_defined(bch_flags_to_uflags); - - s.set_casefold = true; - s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; - fa->flags &= ~FS_CASEFOLD_FL; - - s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); - if (fa->flags) - return -EOPNOTSUPP; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - (s.set_project - ? bch2_set_projid(c, inode, fa->fsx_projid) - : 0) ?: - bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static const struct file_operations bch_file_operations = { - .open = bch2_open, - .llseek = bch2_llseek, - .read_iter = bch2_read_iter, - .write_iter = bch2_write_iter, - .mmap_prepare = bch2_mmap_prepare, - .get_unmapped_area = thp_get_unmapped_area, - .fsync = bch2_fsync, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = bch2_fallocate_dispatch, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif - .remap_file_range = bch2_remap_file_range, -}; - -static const struct inode_operations bch_file_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .fiemap = bch2_fiemap, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_dir_inode_operations = { - .lookup = bch2_lookup, - .create = bch2_create, - .link = bch2_link, - .unlink = bch2_unlink, - .symlink = bch2_symlink, - .mkdir = bch2_mkdir, - .rmdir = bch2_unlink, - .mknod = bch2_mknod, - .rename = bch2_rename2, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .tmpfile = bch2_tmpfile, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct file_operations bch_dir_file_operations = { - .llseek = bch2_dir_llseek, - .read = generic_read_dir, - .iterate_shared = bch2_vfs_readdir, - .fsync = bch2_fsync, - .unlocked_ioctl = bch2_fs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = bch2_compat_fs_ioctl, -#endif -}; - -static const struct inode_operations bch_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - 
.fileattr_set = bch2_fileattr_set, -}; - -static const struct inode_operations bch_special_inode_operations = { - .getattr = bch2_getattr, - .setattr = bch2_setattr, - .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_inode_acl = bch2_get_acl, - .set_acl = bch2_set_acl, -#endif - .fileattr_get = bch2_fileattr_get, - .fileattr_set = bch2_fileattr_set, -}; - -static const struct address_space_operations bch_address_space_operations = { - .read_folio = bch2_read_folio, - .writepages = bch2_writepages, - .readahead = bch2_readahead, - .dirty_folio = filemap_dirty_folio, - .write_begin = bch2_write_begin, - .write_end = bch2_write_end, - .invalidate_folio = bch2_invalidate_folio, - .release_folio = bch2_release_folio, -#ifdef CONFIG_MIGRATION - .migrate_folio = filemap_migrate_folio, -#endif - .error_remove_folio = generic_error_remove_folio, -}; - -struct bcachefs_fid { - u64 inum; - u32 subvol; - u32 gen; -} __packed; - -struct bcachefs_fid_with_parent { - struct bcachefs_fid fid; - struct bcachefs_fid dir; -} __packed; - -static int bcachefs_fid_valid(int fh_len, int fh_type) -{ - switch (fh_type) { - case FILEID_BCACHEFS_WITHOUT_PARENT: - return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); - case FILEID_BCACHEFS_WITH_PARENT: - return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); - default: - return false; - } -} - -static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) -{ - return (struct bcachefs_fid) { - .inum = inode->ei_inum.inum, - .subvol = inode->ei_inum.subvol, - .gen = inode->ei_inode.bi_generation, - }; -} - -static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, - struct inode *vdir) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_inode_info *dir = to_bch_ei(vdir); - int min_len; - - if (!S_ISDIR(inode->v.i_mode) && dir) { - struct bcachefs_fid_with_parent *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - - fid->fid = bch2_inode_to_fid(inode); - fid->dir = bch2_inode_to_fid(dir); - - *len = min_len; - return FILEID_BCACHEFS_WITH_PARENT; - } else { - struct bcachefs_fid *fid = (void *) fh; - - min_len = sizeof(*fid) / sizeof(u32); - if (*len < min_len) { - *len = min_len; - return FILEID_INVALID; - } - *fid = bch2_inode_to_fid(inode); - - *len = min_len; - return FILEID_BCACHEFS_WITHOUT_PARENT; - } -} - -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - struct bcachefs_fid fid) -{ - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { - .subvol = fid.subvol, - .inum = fid.inum, - }); - if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { - iput(vinode); - vinode = ERR_PTR(-ESTALE); - } - return vinode; -} - -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type)) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); -} - -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, - int fh_len, int fh_type) -{ - struct bcachefs_fid_with_parent *fid = (void *) _fid; - - if (!bcachefs_fid_valid(fh_len, fh_type) || - fh_type != FILEID_BCACHEFS_WITH_PARENT) - return NULL; - - return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); -} - -static struct dentry *bch2_get_parent(struct dentry *child) -{ - struct bch_inode_info *inode = 
to_bch_ei(child->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - subvol_inum parent_inum = { - .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_inum.subvol, - .inum = inode->ei_inode.bi_dir, - }; - - return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); -} - -static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) -{ - struct bch_inode_info *inode = to_bch_ei(child->d_inode); - struct bch_inode_info *dir = to_bch_ei(parent->d_inode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter1; - struct btree_iter iter2; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - struct bch_inode_unpacked inode_u; - subvol_inum target; - u32 snapshot; - struct qstr dirent_name; - unsigned name_len = 0; - int ret; - - if (!S_ISDIR(dir->v.i_mode)) - return -EINVAL; - - trans = bch2_trans_get(c); - - bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, - POS(dir->ei_inode.bi_inum, 0), 0); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); - bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); - - ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); - if (ret) - goto err; - - if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - - k = bch2_btree_iter_peek_slot(trans, &iter1); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_dirent) { - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - goto err; - } - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret > 0) - ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); - if (ret) - goto err; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } else { - /* - * File with multiple hardlinks and our backref is to the wrong - * directory - linear search: - */ - for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { - if (k.k->p.inode > dir->ei_inode.bi_inum) - break; - - if (k.k->type != KEY_TYPE_dirent) - continue; - - d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); - if (ret < 0) - break; - if (ret) - continue; - - if (subvol_inum_eq(target, inode->ei_inum)) - goto found; - } - } - - ret = -ENOENT; - goto err; -found: - dirent_name = bch2_dirent_get_name(d); - - name_len = min_t(unsigned, dirent_name.len, NAME_MAX); - memcpy(name, dirent_name.name, name_len); - name[name_len] = '\0'; -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter1); - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_put(trans); - - return ret; -} - -static const struct export_operations bch_export_ops = { - .encode_fh = bch2_encode_fh, - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, - .get_parent = bch2_get_parent, - .get_name = bch2_get_name, -}; - -static void bch2_vfs_inode_init(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct bch_subvolume *subvol) -{ - inode->v.i_ino = inum.inum; - inode->ei_inum = inum; - inode->ei_inode.bi_inum = inum.inum; - bch2_inode_update_after_write(trans, inode, bi, ~0); - - inode->v.i_blocks = bi->bi_sectors; - 
inode->v.i_rdev = bi->bi_dev; - inode->v.i_generation = bi->bi_generation; - inode->v.i_size = bi->bi_size; - - inode->ei_flags = 0; - inode->ei_quota_reserved = 0; - inode->ei_qid = bch_qid(bi); - - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - - inode->v.i_mapping->a_ops = &bch_address_space_operations; - - switch (inode->v.i_mode & S_IFMT) { - case S_IFREG: - inode->v.i_op = &bch_file_inode_operations; - inode->v.i_fop = &bch_file_operations; - break; - case S_IFDIR: - inode->v.i_op = &bch_dir_inode_operations; - inode->v.i_fop = &bch_dir_file_operations; - break; - case S_IFLNK: - inode_nohighmem(&inode->v); - inode->v.i_op = &bch_symlink_inode_operations; - break; - default: - init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); - inode->v.i_op = &bch_special_inode_operations; - break; - } - - mapping_set_folio_min_order(inode->v.i_mapping, - get_order(trans->c->opts.block_size)); -} - -static void bch2_free_inode(struct inode *vinode) -{ - kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); -} - -static int inode_update_times_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); - bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); - bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); - - return 0; -} - -static int bch2_vfs_write_inode(struct inode *vinode, - struct writeback_control *wbc) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - int ret; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return bch2_err_class(ret); -} - -static void bch2_evict_inode(struct inode *vinode) -{ - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(vinode); - bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); - - /* - * evict() has waited for outstanding writeback, we'll do no more IO - * through this inode: it's safe to remove from VFS inode hashtable here - * - * Do that now so that other threads aren't blocked from pulling it back - * in, there's no reason for them to be: - */ - if (!delete) - bch2_inode_hash_remove(c, inode); - - truncate_inode_pages_final(&inode->v.i_data); - - clear_inode(&inode->v); - - BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - - if (delete) { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), - KEY_TYPE_QUOTA_WARN); - bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, - KEY_TYPE_QUOTA_WARN); - int ret = bch2_inode_rm(c, inode_inum(inode)); - if (ret && !bch2_err_matches(ret, EROFS)) { - bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", - inode->ei_inum.subvol, - inode->ei_inum.inum); - bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); - } - - /* - * If we are deleting, we need it present in the vfs hash table - * so that fsck can check if unlinked inodes are still open: - */ - bch2_inode_hash_remove(c, inode); - } - - mutex_lock(&c->vfs_inodes_lock); - list_del_init(&inode->ei_vfs_inode_list); - mutex_unlock(&c->vfs_inodes_lock); -} - -void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) -{ - struct bch_inode_info *inode; - DARRAY(struct bch_inode_info *) grabbed; - bool clean_pass = false, this_pass_clean; - 
- /* - * Initially, we scan for inodes without I_DONTCACHE, then mark them to - * be pruned with d_mark_dontcache(). - * - * Once we've had a clean pass where we didn't find any inodes without - * I_DONTCACHE, we wait for them to be freed: - */ - - darray_init(&grabbed); - darray_make_room(&grabbed, 1024); -again: - cond_resched(); - this_pass_clean = true; - - mutex_lock(&c->vfs_inodes_lock); - list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) - continue; - - if (!(inode->v.i_state & I_DONTCACHE) && - !(inode->v.i_state & I_FREEING) && - igrab(&inode->v)) { - this_pass_clean = false; - - if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { - iput(&inode->v); - break; - } - } else if (clean_pass && this_pass_clean) { - struct wait_bit_queue_entry wqe; - struct wait_queue_head *wq_head; - - wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); - prepare_to_wait_event(wq_head, &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&c->vfs_inodes_lock); - - schedule(); - finish_wait(wq_head, &wqe.wq_entry); - goto again; - } - } - mutex_unlock(&c->vfs_inodes_lock); - - darray_for_each(grabbed, i) { - inode = *i; - d_mark_dontcache(&inode->v); - d_prune_aliases(&inode->v); - iput(&inode->v); - } - grabbed.nr = 0; - - if (!clean_pass || !this_pass_clean) { - clean_pass = this_pass_clean; - goto again; - } - - darray_exit(&grabbed); -} - -static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct bch_fs *c = sb->s_fs_info; - struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); - unsigned shift = sb->s_blocksize_bits - 9; - /* - * this assumes inodes take up 64 bytes, which is a decent average - * number: - */ - u64 avail_inodes = ((usage.capacity - usage.used) << 3); - - buf->f_type = BCACHEFS_STATFS_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = usage.capacity >> shift; - buf->f_bfree = usage.free >> shift; - buf->f_bavail = avail_factor(usage.free) >> shift; - - buf->f_files = usage.nr_inodes + avail_inodes; - buf->f_ffree = avail_inodes; - - buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); - buf->f_namelen = BCH_NAME_MAX; - - return 0; -} - -static int bch2_sync_fs(struct super_block *sb, int wait) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - trace_bch2_sync_fs(sb, wait); - - if (c->opts.journal_flush_disabled) - return 0; - - if (!wait) { - bch2_journal_flush_async(&c->journal, NULL); - return 0; - } - - ret = bch2_journal_flush(&c->journal); - return bch2_err_class(ret); -} - -static struct bch_fs *bch2_path_to_fs(const char *path) -{ - struct bch_fs *c; - dev_t dev; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); - - c = bch2_dev_to_fs(dev); - if (c) - closure_put(&c->cl); - return c ?: ERR_PTR(-ENOENT); -} - -static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - bool first = true; - - guard(rcu)(); - for_each_online_member_rcu(c, ca) { - if (!first) - seq_putc(seq, ':'); - first = false; - seq_puts(seq, ca->disk_sb.sb_name); - } - - return 0; -} - -static int bch2_show_options(struct seq_file *seq, struct dentry *root) -{ - struct bch_fs *c = root->d_sb->s_fs_info; - struct printbuf buf = PRINTBUF; - - bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, - OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); - printbuf_nul_terminate(&buf); - seq_printf(seq, ",%s", buf.buf); - - int ret = buf.allocation_failure ? 
-ENOMEM : 0; - printbuf_exit(&buf); - return ret; -} - -static void bch2_put_super(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - __bch2_fs_stop(c); -} - -/* - * bcachefs doesn't currently integrate intwrite freeze protection but the - * internal write references serve the same purpose. Therefore reuse the - * read-only transition code to perform the quiesce. The caveat is that we don't - * currently have the ability to block tasks that want a write reference while - * the superblock is frozen. This is fine for now, but we should either add - * blocking support or find a way to integrate sb_start_intwrite() and friends. - */ -static int bch2_freeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - return 0; -} - -static int bch2_unfreeze(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (test_bit(BCH_FS_emergency_ro, &c->flags)) - return 0; - - down_write(&c->state_lock); - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - return ret; -} - -static const struct super_operations bch_super_operations = { - .alloc_inode = bch2_alloc_inode, - .free_inode = bch2_free_inode, - .write_inode = bch2_vfs_write_inode, - .evict_inode = bch2_evict_inode, - .sync_fs = bch2_sync_fs, - .statfs = bch2_statfs, - .show_devname = bch2_show_devname, - .show_options = bch2_show_options, - .put_super = bch2_put_super, - .freeze_fs = bch2_freeze, - .unfreeze_fs = bch2_unfreeze, -}; - -static int bch2_set_super(struct super_block *s, void *data) -{ - s->s_fs_info = data; - return 0; -} - -static int bch2_noset_super(struct super_block *s, void *data) -{ - return -EBUSY; -} - -typedef DARRAY(struct bch_fs *) darray_fs; - -static int bch2_test_super(struct super_block *s, void *data) -{ - struct bch_fs *c = s->s_fs_info; - darray_fs *d = data; - - if (!c) - return false; - - darray_for_each(*d, i) - if (c != *i) - return false; - return true; -} - -static int bch2_fs_get_tree(struct fs_context *fc) -{ - struct bch_fs *c; - struct super_block *sb; - struct inode *vinode; - struct bch2_opts_parse *opts_parse = fc->fs_private; - struct bch_opts opts = opts_parse->opts; - darray_const_str devs; - darray_fs devs_to_fs = {}; - int ret; - - opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - opt_set(opts, nostart, true); - - if (!fc->source || strlen(fc->source) == 0) - return -EINVAL; - - ret = bch2_split_devs(fc->source, &devs); - if (ret) - return ret; - - darray_for_each(devs, i) { - ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); - if (ret) - goto err; - } - - sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); - if (!IS_ERR(sb)) - goto got_sb; - - c = bch2_fs_open(&devs, &opts); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - if (opt_defined(opts, discard)) - set_bit(BCH_FS_discard_mount_opt_set, &c->flags); - - /* Some options can't be parsed until after the fs is started: */ - opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); - if (ret) - goto err_stop_fs; - - bch2_opts_apply(&c->opts, opts); - - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; - - /* - * We might be doing a RO mount because other options required it, or we - * have no alloc info and it's a small image with no room to regenerate - * it - */ - if (c->opts.read_only) - fc->sb_flags |= SB_RDONLY; - - sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, 
c); - ret = PTR_ERR_OR_ZERO(sb); - if (ret) - goto err_stop_fs; -got_sb: - c = sb->s_fs_info; - - if (sb->s_root) { - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { - ret = -EBUSY; - goto err_put_super; - } - goto out; - } - - sb->s_blocksize = block_bytes(c); - sb->s_blocksize_bits = ilog2(block_bytes(c)); - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_op = &bch_super_operations; - sb->s_export_op = &bch_export_ops; -#ifdef CONFIG_BCACHEFS_QUOTA - sb->s_qcop = &bch2_quotactl_operations; - sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -#endif - sb->s_xattr = bch2_xattr_handlers; - sb->s_magic = BCACHEFS_STATFS_MAGIC; - sb->s_time_gran = c->sb.nsec_per_time_unit; - sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; - sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - - if (c->sb.multi_device) - super_set_sysfs_name_uuid(sb); - else - strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); - - sb->s_shrink->seeks = 0; - c->vfs_sb = sb; - strscpy(sb->s_id, c->name, sizeof(sb->s_id)); - - ret = super_setup_bdi(sb); - if (ret) - goto err_put_super; - - sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - - scoped_guard(rcu) { - for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; - - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - break; - } - } - - c->dev = sb->s_dev; - -#ifdef CONFIG_BCACHEFS_POSIX_ACL - if (c->opts.acl) - sb->s_flags |= SB_POSIXACL; -#endif - - sb->s_shrink->seeks = 0; - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) - sb->s_encoding = c->cf_encoding; - generic_set_sb_d_ops(sb); -#endif - - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); - ret = PTR_ERR_OR_ZERO(vinode); - bch_err_msg(c, ret, "mounting: error getting root inode"); - if (ret) - goto err_put_super; - - sb->s_root = d_make_root(vinode); - if (!sb->s_root) { - bch_err(c, "error mounting: error allocating root dentry"); - ret = -ENOMEM; - goto err_put_super; - } - - sb->s_flags |= SB_ACTIVE; -out: - fc->root = dget(sb->s_root); -err: - darray_exit(&devs_to_fs); - bch2_darray_str_exit(&devs); - if (ret) - pr_err("error: %s", bch2_err_str(ret)); - /* - * On an inconsistency error in recovery we might see an -EROFS derived - * error code (from the journal), but we don't want to return that to - * userspace as that causes util-linux to retry the mount RO - which is - * confusing: - */ - if (bch2_err_matches(ret, EROFS) && ret != -EROFS) - ret = -EIO; - return bch2_err_class(ret); - -err_stop_fs: - bch2_fs_stop(c); - goto err; - -err_put_super: - if (!sb->s_root) - __bch2_fs_stop(c); - deactivate_locked_super(sb); - goto err; -} - -static void bch2_kill_sb(struct super_block *sb) -{ - struct bch_fs *c = sb->s_fs_info; - - generic_shutdown_super(sb); - bch2_fs_free(c); -} - -static void bch2_fs_context_free(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = fc->fs_private; - - if (opts) { - printbuf_exit(&opts->parse_later); - kfree(opts); - } -} - -static int bch2_fs_parse_param(struct fs_context *fc, - struct fs_parameter *param) -{ - /* - * the "source" param, i.e., the name of the device(s) to mount, - * is handled by the VFS layer.
- */ - if (!strcmp(param->key, "source")) - return -ENOPARAM; - - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = NULL; - - /* for reconfigure, we already have a struct bch_fs */ - if (fc->root) - c = fc->root->d_sb->s_fs_info; - - int ret = bch2_parse_one_mount_opt(c, &opts->opts, - &opts->parse_later, param->key, - param->string); - if (ret) - pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); - - return bch2_err_class(ret); -} - -static int bch2_fs_reconfigure(struct fs_context *fc) -{ - struct super_block *sb = fc->root->d_sb; - struct bch2_opts_parse *opts = fc->fs_private; - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - - if (opts->opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts->opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts->opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts->opts, errors)) - c->opts.errors = opts->opts.errors; -err: - return bch2_err_class(ret); -} - -static const struct fs_context_operations bch2_context_ops = { - .free = bch2_fs_context_free, - .parse_param = bch2_fs_parse_param, - .get_tree = bch2_fs_get_tree, - .reconfigure = bch2_fs_reconfigure, -}; - -static int bch2_init_fs_context(struct fs_context *fc) -{ - struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); - - if (!opts) - return -ENOMEM; - - opts->parse_later = PRINTBUF; - - fc->ops = &bch2_context_ops; - fc->fs_private = opts; - - return 0; -} - -void bch2_fs_vfs_exit(struct bch_fs *c) -{ - if (c->vfs_inodes_by_inum_table.ht.tbl) - rhltable_destroy(&c->vfs_inodes_by_inum_table); - if (c->vfs_inodes_table.tbl) - rhashtable_destroy(&c->vfs_inodes_table); -} - -int bch2_fs_vfs_init(struct bch_fs *c) -{ - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: - rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); -} - -static struct file_system_type bcache_fs_type = { - .owner = THIS_MODULE, - .name = "bcachefs", - .init_fs_context = bch2_init_fs_context, - .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, -}; - -MODULE_ALIAS_FS("bcachefs"); - -void bch2_vfs_exit(void) -{ - unregister_filesystem(&bcache_fs_type); - kmem_cache_destroy(bch2_inode_cache); -} - -int __init bch2_vfs_init(void) -{ - int ret = -ENOMEM; - - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT); - if (!bch2_inode_cache) - goto err; - - ret = register_filesystem(&bcache_fs_type); - if (ret) - goto err; - - return 0; -err: - bch2_vfs_exit(); - return ret; -} - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h deleted file mode 100644 index dd2198541455b2..00000000000000 --- a/fs/bcachefs/fs.h +++ /dev/null @@ -1,215 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_H -#define _BCACHEFS_FS_H - -#include "inode.h" -#include "opts.h" -#include "str_hash.h" -#include "quota_types.h" -#include "two_state_shared_lock.h" - -#include <linux/seqlock.h> -#include <linux/stat.h> - -struct bch_inode_info { - struct inode v; - struct rhash_head hash; - struct rhlist_head by_inum_hash; - subvol_inum ei_inum; - - struct list_head ei_vfs_inode_list; - unsigned long ei_flags; - - struct mutex
ei_update_lock; - u64 ei_quota_reserved; - unsigned long ei_last_dirtied; - two_state_lock_t ei_pagecache_lock; - - struct mutex ei_quota_lock; - struct bch_qid ei_qid; - - /* - * When we've been doing nocow writes we'll need to issue flushes to the - * underlying block devices - * - * XXX: a device may have had a flush issued by some other codepath. It - * would be better to keep for each device a sequence number that's - * incremented when we issue a cache flush, and track here the sequence - * number that needs flushing. - */ - struct bch_devs_mask ei_devs_need_flush; - - /* copy of inode in btree: */ - struct bch_inode_unpacked ei_inode; -}; - -#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) -#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) -#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) - -#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) -#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) - -static inline subvol_inum inode_inum(struct bch_inode_info *inode) -{ - return inode->ei_inum; -} - -/* - * Set if we've gotten a btree error for this inode, and thus the vfs inode and - * btree inode may be inconsistent: - */ -#define EI_INODE_ERROR 0 - -/* - * Set if the inode is in a snapshot subvolume - we don't do quota accounting in - * those: - */ -#define EI_INODE_SNAPSHOT 1 -#define EI_INODE_HASHED 2 - -#define to_bch_ei(_inode) \ - container_of_or_null(_inode, struct bch_inode_info, v) - -static inline int ptrcmp(void *l, void *r) -{ - return cmp_int(l, r); -} - -enum bch_inode_lock_op { - INODE_PAGECACHE_BLOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), -}; - -#define bch2_lock_inodes(_locks, ...) \ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ - \ - for (i = 1; i < ARRAY_SIZE(a); i++) \ - if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_get(a[i]);\ - if ((_locks) & INODE_UPDATE_LOCK) \ - mutex_lock_nested(&a[i]->ei_update_lock, i);\ - } \ -} while (0) - -#define bch2_unlock_inodes(_locks, ...)
\ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ - \ - for (i = 1; i < ARRAY_SIZE(a); i++) \ - if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_put(a[i]);\ - if ((_locks) & INODE_UPDATE_LOCK) \ - mutex_unlock(&a[i]->ei_update_lock); \ - } \ -} while (0) - -static inline struct bch_inode_info *file_bch_inode(struct file *file) -{ - return to_bch_ei(file_inode(file)); -} - -static inline bool inode_attr_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode, - enum inode_opt_id id) -{ - return !(inode->ei_inode.bi_fields_set & (1 << id)) && - bch2_inode_opt_get(&dir->ei_inode, id) != - bch2_inode_opt_get(&inode->ei_inode, id); -} - -static inline bool inode_attrs_changing(struct bch_inode_info *dir, - struct bch_inode_info *inode) -{ - unsigned id; - - for (id = 0; id < Inode_opt_nr; id++) - if (inode_attr_changing(dir, inode, id)) - return true; - - return false; -} - -struct bch_inode_unpacked; - -#ifndef NO_BCACHEFS_FS - -struct bch_inode_info * -__bch2_create(struct mnt_idmap *, struct bch_inode_info *, - struct dentry *, umode_t, dev_t, subvol_inum, unsigned); - -int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); - -int bch2_fs_quota_transfer(struct bch_fs *, - struct bch_inode_info *, - struct bch_qid, - unsigned, - enum quota_acct_mode); - -static inline int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) -{ - struct bch_qid qid = inode->ei_qid; - - qid.q[QTYP_PRJ] = projid; - - return bch2_fs_quota_transfer(c, inode, qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); -} - -struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); - -/* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, void *); - -void bch2_inode_update_after_write(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - unsigned); -int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); - -int bch2_setattr_nonsize(struct mnt_idmap *, - struct bch_inode_info *, - struct iattr *); -int __bch2_unlink(struct inode *, struct dentry *, bool); - -void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); - -void bch2_fs_vfs_exit(struct bch_fs *); -int bch2_fs_vfs_init(struct bch_fs *); - -void bch2_vfs_exit(void); -int bch2_vfs_init(void); - -#else - -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) - -static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } - -static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) {} - -static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} -static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } - -static inline void bch2_vfs_exit(void) {} -static inline int bch2_vfs_init(void) { return 0; } - -#endif /* NO_BCACHEFS_FS */ - -#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c deleted file mode 100644 index 15c1e890d299b2..00000000000000 --- a/fs/bcachefs/fsck.c +++ /dev/null @@ -1,3363 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bcachefs_ioctl.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "buckets.h" -#include "darray.h" 
-#include "dirent.h" -#include "error.h" -#include "fs.h" -#include "fsck.h" -#include "inode.h" -#include "io_misc.h" -#include "keylist.h" -#include "namei.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "super.h" -#include "thread_with_file.h" -#include "xattr.h" - -#include -#include /* struct qstr */ - -static int dirent_points_to_inode_nowarn(struct bch_fs *c, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - if (d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum) - return 0; - return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); -} - -static void dirent_inode_mismatch_msg(struct printbuf *out, - struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - prt_str(out, "inode points to dirent that does not point back:"); - prt_newline(out); - bch2_bkey_val_to_text(out, c, dirent.s_c); - prt_newline(out); - bch2_inode_unpacked_to_text(out, inode); -} - -static int dirent_points_to_inode(struct bch_fs *c, - struct bkey_s_c_dirent dirent, - struct bch_inode_unpacked *inode) -{ - int ret = dirent_points_to_inode_nowarn(c, dirent, inode); - if (ret) { - struct printbuf buf = PRINTBUF; - dirent_inode_mismatch_msg(&buf, c, dirent, inode); - bch_warn(c, "%s", buf.buf); - printbuf_exit(&buf); - } - return ret; -} - -/* - * XXX: this is handling transaction restarts without returning - * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: - */ -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 sectors = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; - 0; - })); - - return ret ?: sectors; -} - -static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, - u32 snapshot) -{ - u64 subdirs = 0; - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ({ - if (k.k->type == KEY_TYPE_dirent && - bkey_s_c_to_dirent(k).v->d_type == DT_DIR) - subdirs++; - 0; - })); - - return ret ?: subdirs; -} - -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, &s); - - *snapshot = le32_to_cpu(s.snapshot); - *inum = le64_to_cpu(s.inode); - return ret; -} - -static int lookup_dirent_in_snapshot(struct btree_trans *trans, - struct bch_hash_info hash_info, - subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - *target = le64_to_cpu(d.v->d_inum); - *type = d.v->d_type; - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -/* - * Find any subvolume associated with a tree of snapshots - * We can't rely on master_subvol - it might have been deleted. 
- */ -static int find_snapshot_tree_subvol(struct btree_trans *trans, - u32 tree_id, u32 *subvol) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - if (le32_to_cpu(s.v->tree) != tree_id) - continue; - - if (s.v->subvol) { - *subvol = le32_to_cpu(s.v->subvol); - goto found; - } - } - ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound, - u64 reattaching_inum) -{ - struct bch_fs *c = trans->c; - struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = {}; - u64 inum = 0; - unsigned d_type = 0; - int ret; - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, - bch2_snapshot_tree(c, snapshot), &st); - if (ret) - return ret; - - u32 subvolid; - ret = find_snapshot_tree_subvol(trans, - bch2_snapshot_tree(c, snapshot), &subvolid); - bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", - bch2_snapshot_tree(c, snapshot)); - if (ret) - return ret; - - struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, subvolid, false, &subvol); - bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); - if (ret) - return ret; - - if (!subvol.inode) { - struct btree_iter iter; - struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.inode = cpu_to_le64(reattaching_inum); - bch2_trans_iter_exit(trans, &iter); - } - - subvol_inum root_inum = { - .subvol = subvolid, - .inum = le64_to_cpu(subvol.inode) - }; - - struct bch_inode_unpacked root_inode; - struct bch_hash_info root_hash_info; - ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); - bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", - root_inum.inum, subvolid); - if (ret) - return ret; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type, snapshot); - if (bch2_err_matches(ret, ENOENT)) - goto create_lostfound; - - bch_err_fn(c, ret); - if (ret) - return ret; - - if (d_type != DT_DIR) { - bch_err(c, "error looking up lost+found: not a directory"); - return bch_err_throw(c, ENOENT_not_directory); - } - - /* - * The bch2_check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); - bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", - inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); - return ret; - -create_lostfound: - /* - * we always create lost+found in the root snapshot; we don't want - * different branches of the snapshot tree to have different lost+found - */ - snapshot = le32_to_cpu(st.root_snapshot); - /* - * XXX: we could have a nicer log message here if we had a nice way to - * walk backpointers to print a path - */ - struct printbuf path = PRINTBUF; - ret = bch2_inum_to_path(trans, root_inum, &path); - if (ret) - goto err; - - bch_notice(c, "creating 
%s/lost+found in subvol %llu snapshot %u", - path.buf, root_inum.subvol, snapshot); - printbuf_exit(&path); - - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); - lostfound->bi_dir = root_inode.bi_inum; - lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); - - root_inode.bi_nlink++; - - ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(trans, &lostfound_iter); - if (ret) - goto err; - - ret = bch2_dirent_create_snapshot(trans, - 0, root_inode.bi_inum, snapshot, &root_hash_info, - mode_to_type(lostfound->bi_mode), - &lostfound_str, - lostfound->bi_inum, - &lostfound->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_internal_snapshot_node); -err: - bch_err_msg(c, ret, "creating lost+found"); - bch2_trans_iter_exit(trans, &lostfound_iter); - return ret; -} - -static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) -{ - if (inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) - return false; - - /* - * Subvolume roots are special: older versions of subvolume roots may be - * disconnected; it's only the newest version that matters. - * - * We only keep a single dirent pointing to a subvolume root, i.e. - * older versions of snapshots will not have a different dirent pointing - * to the same subvolume root. - * - * This is because dirents that point to subvolumes are only visible in - * the parent subvolume - versioning is not needed - and keeping them - * around would break fsck, because when we're crossing subvolumes we - * don't have a consistent snapshot ID to check the inode <-> dirent - * relationships. - * - * Thus, a subvolume root that's been renamed after a snapshot will have - * a disconnected older version - that's expected. - * - * Note that taking a snapshot always updates the root inode (to update - * the dirent backpointer), so a subvolume root inode with - * BCH_INODE_has_child_snapshot is never visible.
- */ - if (inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) - return false; - - return !bch2_inode_has_backpointer(inode) && - !(inode->bi_flags & BCH_INODE_unlinked); -} - -static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, - SPOS(d_pos.inode, d_pos.offset, snapshot), - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bpos_eq(k.k->p, d_pos)) { - /* - * delete_at() doesn't work because the update path doesn't - * internally use BTREE_ITER_with_updates yet - */ - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; - - bkey_init(&k->k); - k->k.type = KEY_TYPE_whiteout; - k->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked lostfound; - char name_buf[20]; - int ret; - - u32 dirent_snapshot = inode->bi_snapshot; - if (inode->bi_subvol) { - inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; - - struct btree_iter subvol_iter; - struct bkey_i_subvolume *subvol = - bch2_bkey_get_mut_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, inode->bi_subvol), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(subvol); - if (ret) - return ret; - - subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; - bch2_trans_iter_exit(trans, &subvol_iter); - - u64 root_inum; - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &dirent_snapshot, &root_inum); - if (ret) - return ret; - - snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); - } else { - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); - } - - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); - if (ret) - return ret; - - bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); - - lostfound.bi_nlink += S_ISDIR(inode->bi_mode); - - /* ensure lost+found inode is also present in inode snapshot */ - if (!inode->bi_subvol) { - BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot)); - lostfound.bi_snapshot = inode->bi_snapshot; - } - - ret = __bch2_fsck_write_inode(trans, &lostfound); - if (ret) - return ret; - - struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); - struct qstr name = QSTR(name_buf); - - inode->bi_dir = lostfound.bi_inum; - - ret = bch2_dirent_create_snapshot(trans, - inode->bi_parent_subvol, lostfound.bi_inum, - dirent_snapshot, - &dir_hash, - inode_d_type(inode), - &name, - inode->bi_subvol ?: inode->bi_inum, - &inode->bi_dir_offset, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret) { - bch_err_msg(c, ret, "error creating dirent"); - return ret; - } - - ret = __bch2_fsck_write_inode(trans, inode); - if (ret) - return ret; - - { - CLASS(printbuf, buf)(); - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, - inode->bi_snapshot, NULL, &buf); - if (ret) - return ret; - - bch_info(c, "reattached at %s", buf.buf); - } - - /* - * Fix up inodes in child snapshots: if they should also be reattached, - * update the backpointer field; if they should not be, we need to emit - * whiteouts for the dirent we just created.
- */ - if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { - snapshot_id_list whiteouts_done; - struct btree_iter iter; - struct bkey_s_c k; - - darray_init(&whiteouts_done); - - for_each_btree_key_reverse_norestart(trans, iter, - BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), - BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bkey_is_inode(k.k) || - !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || - snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) - continue; - - struct bch_inode_unpacked child_inode; - ret = bch2_inode_unpack(k, &child_inode); - if (ret) - break; - - if (!inode_should_reattach(&child_inode)) { - ret = maybe_delete_dirent(trans, - SPOS(lostfound.bi_inum, inode->bi_dir_offset, - dirent_snapshot), - k.k->p.snapshot); - if (ret) - break; - - ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); - if (ret) - break; - } else { - iter.snapshot = k.k->p.snapshot; - child_inode.bi_dir = inode->bi_dir; - child_inode.bi_dir_offset = inode->bi_dir_offset; - - ret = bch2_inode_write_flags(trans, &iter, &child_inode, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - break; - } - } - darray_exit(&whiteouts_done); - bch2_trans_iter_exit(trans, &iter); - } - - return ret; -} - -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static int remove_backpointer(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_has_backpointer(inode)) - return 0; - - u32 snapshot = inode->bi_snapshot; - - if (inode->bi_parent_subvol) { - int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); - if (ret) - return ret; - } - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - bch2_fsck_remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) -{ - struct bch_fs *c = trans->c; - - struct bch_inode_unpacked inode; - int ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &inode); - if (ret) - return ret; - - ret = remove_backpointer(trans, &inode); - if (!bch2_err_matches(ret, ENOENT)) - bch_err_msg(c, ret, "removing dirent"); - if (ret) - return ret; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - return ret; -} - -static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) -{ - struct bch_fs *c = trans->c; - - if (!bch2_snapshot_is_leaf(c, snapshotid)) { - bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - /* - * If inum isn't set, that means we're being called from check_dirents, - * not check_inodes - the root of this subvolume doesn't exist or we - * would have found it there: - */ - if (!inum) { - struct btree_iter inode_iter = {}; - struct bch_inode_unpacked new_inode; - u64 cpu = raw_smp_processor_id(); - - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 
0, S_IFDIR|0755, 0, NULL); - - new_inode.bi_subvol = subvolid; - - int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, &new_inode); - bch2_trans_iter_exit(trans, &inode_iter); - if (ret) - return ret; - - inum = new_inode.bi_inum; - } - - bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); - - struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); - int ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - return ret; - - bkey_subvolume_init(&new_subvol->k_i); - new_subvol->k.p.offset = subvolid; - new_subvol->v.snapshot = cpu_to_le32(snapshotid); - new_subvol->v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); - if (ret) - return ret; - - struct btree_iter iter; - struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, snapshotid), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch_err_msg(c, ret, "getting snapshot %u", snapshotid); - if (ret) - return ret; - - u32 snapshot_tree = le32_to_cpu(s->v.tree); - - s->v.subvol = cpu_to_le32(subvolid); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); - bch2_trans_iter_exit(trans, &iter); - - struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, snapshot_tree), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(st); - bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); - if (ret) - return ret; - - if (!st->v.master_subvol) - st->v.master_subvol = cpu_to_le32(subvolid); - - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) -{ - struct bch_fs *c = trans->c; - unsigned i_mode = S_IFREG; - u64 i_size = 0; - - switch (btree) { - case BTREE_ID_extents: { - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); - int ret = bkey_err(k); - if (ret) - return ret; - - i_size = k.k->p.offset << 9; - break; - } - case BTREE_ID_dirents: - i_mode = S_IFDIR; - break; - case BTREE_ID_xattrs: - break; - default: - BUG(); - } - - struct bch_inode_unpacked new_inode; - bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); - new_inode.bi_size = i_size; - new_inode.bi_inum = inum; - new_inode.bi_snapshot = snapshot; - - return __bch2_fsck_write_inode(trans, &new_inode); -} - -static inline void snapshots_seen_exit(struct snapshots_seen *s) -{ - darray_exit(&s->ids); -} - -static inline void snapshots_seen_init(struct snapshots_seen *s) -{ - memset(s, 0, sizeof(*s)); -} - -static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - u32 *i; - __darray_for_each(s->ids, i) { - if (*i == id) - return 0; - if (*i > id) - break; - } - - int ret = darray_insert_item(&s->ids, i - s->ids.data, id); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - -static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, - enum btree_id btree_id, struct bpos pos) -{ - if (!bkey_eq(s->pos, pos)) - s->ids.nr = 0; - s->pos = pos; - - return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); -} - -/** - * 
key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, - * and @ancestor hasn't been overwritten in @seen - * - * @c: filesystem handle - * @seen: list of snapshot ids already seen at current position - * @id: descendent snapshot id - * @ancestor: ancestor snapshot id - * - * Returns: whether key in @ancestor snapshot is visible in @id snapshot - */ -static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, - u32 id, u32 ancestor) -{ - EBUG_ON(id > ancestor); - - if (id == ancestor) - return true; - - if (!bch2_snapshot_is_ancestor(c, id, ancestor)) - return false; - - /* - * We know that @id is a descendant of @ancestor; we're checking if - * we've seen a key that overwrote @ancestor - i.e. also a descendent of - * @ancestor and with @id as a descendent. - * - * But we already know that we're scanning IDs between @id and @ancestor - * numerically, since snapshot ID lists are kept sorted, so if we find - * an id that's an ancestor of @id we're done: - */ - darray_for_each_reverse(seen->ids, i) - if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) - return false; - - return true; -} - -/** - * ref_visible - given a key with snapshot id @src that points to a key with - * snapshot id @dst, test whether there is some snapshot in which @dst is - * visible. - * - * @c: filesystem handle - * @s: list of snapshot IDs already seen at @src - * @src: snapshot ID of src key - * @dst: snapshot ID of dst key - * Returns: true if there is some snapshot in which @dst is visible - * - * Assumes we're visiting @src keys in natural key order - */ -static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) -{ - return dst <= src - ? key_visible_in_snapshot(c, s, dst, src) - : bch2_snapshot_is_ancestor(c, src, dst); -} - -static int ref_visible2(struct bch_fs *c, - u32 src, struct snapshots_seen *src_seen, - u32 dst, struct snapshots_seen *dst_seen) -{ - if (dst > src) { - swap(dst, src); - swap(dst_seen, src_seen); - } - return key_visible_in_snapshot(c, src_seen, dst, src); -} - -#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) - -struct inode_walker_entry { - struct bch_inode_unpacked inode; - bool whiteout; - u64 count; - u64 i_size; -}; - -struct inode_walker { - bool first_this_inode; - bool have_inodes; - bool recalculate_sums; - struct bpos last_pos; - - DARRAY(struct inode_walker_entry) inodes; - snapshot_id_list deletes; -}; - -static void inode_walker_exit(struct inode_walker *w) -{ - darray_exit(&w->inodes); - darray_exit(&w->deletes); -} - -static struct inode_walker inode_walker_init(void) -{ - return (struct inode_walker) { 0, }; -} - -static int add_inode(struct bch_fs *c, struct inode_walker *w, - struct bkey_s_c inode) -{ - int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { - .whiteout = !bkey_is_inode(inode.k), - })); - if (ret) - return ret; - - struct inode_walker_entry *n = &darray_last(w->inodes); - if (!n->whiteout) { - return bch2_inode_unpack(inode, &n->inode); - } else { - n->inode.bi_inum = inode.k->p.offset; - n->inode.bi_snapshot = inode.k->p.snapshot; - return 0; - } -} - -static int get_inodes_all_snapshots(struct btree_trans *trans, - struct inode_walker *w, u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - /* - * We no
longer have inodes for w->last_pos; clear this to avoid - * screwing up check_i_sectors/check_subdir_count if we take a - * transaction restart here: - */ - w->have_inodes = false; - w->recalculate_sums = false; - w->inodes.nr = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - ret = add_inode(c, w, k); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - w->first_this_inode = true; - w->have_inodes = true; - return 0; -} - -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - continue; - - ret = bkey_is_inode(k.k) - ? add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static struct inode_walker_entry * -lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - - struct inode_walker_entry *i = darray_find_p(w->inodes, i, - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); - - if (!i) - return NULL; - - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, - trans, snapshot_key_missing_inode_snapshot, - "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" - "unexpected because we should always update the inode when we update a key in that inode\n" - "%s", - w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - if (!i->whiteout) { - struct bch_inode_unpacked new = i->inode; - new.bi_snapshot = k.k->p.snapshot; - ret = __bch2_fsck_write_inode(trans, &new); - } else { - struct bkey_i whiteout; - bkey_init(&whiteout.k); - whiteout.k.type = KEY_TYPE_whiteout; - whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); - ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &whiteout, - BTREE_UPDATE_internal_snapshot_node); - } - - if (ret) - goto fsck_err; - - ret = bch2_trans_commit(trans, NULL, NULL, 0); - if (ret) - goto fsck_err; - - struct inode_walker_entry new_entry = *i; - - new_entry.inode.bi_snapshot = k.k->p.snapshot; - new_entry.count = 0; - new_entry.i_size = 0; - - while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) - --i; - - size_t pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new_entry); - if (ret) - goto fsck_err; - - ret = bch_err_throw(c, transaction_restart_nested); - goto fsck_err; - } - - printbuf_exit(&buf); - return i; -fsck_err: - printbuf_exit(&buf); - return ERR_PTR(ret); -} - -static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, - struct bkey_s_c k) -{ - if (w->last_pos.inode != k.k->p.inode) { - int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); - if (ret) - return ERR_PTR(ret); - } - - w->last_pos = k.k->p; - - return lookup_inode_for_snapshot(trans, w, k); -} - -/* - * Prefer to 
delete the first one, since that will be the one at the wrong - * offset: - * return value: 0 -> delete k1, 1 -> delete k2 - */ -int bch2_fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) -{ - if (new->k.type != KEY_TYPE_dirent) - return 0; - - struct bkey_i_dirent *d = bkey_i_to_dirent(new); - struct inode_walker target = inode_walker_init(); - int ret = 0; - - if (d->v.d_type == DT_SUBVOL) { - bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); - ret = -BCH_ERR_fsck_repair_unimplemented; - } else { - ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); - if (ret) - goto err; - - darray_for_each(target.inodes, i) { - i->inode.bi_dir_offset = d->k.p.offset; - ret = __bch2_fsck_write_inode(trans, &i->inode); - if (ret) - goto err; - } - } -err: - inode_walker_exit(&target); - return ret; -} - -static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - if (inode->bi_subvol) { - u64 inum; - int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); - if (ret) - return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); - } - - return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); -} - -static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int check_inode_dirent_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - bool *write_inode) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - u32 inode_snapshot = inode->bi_snapshot; - struct btree_iter dirent_iter = {}; - struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); - int ret = bkey_err(d); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && - inode->bi_subvol && - (inode->bi_flags & BCH_INODE_has_child_snapshot)) { - /* Older version of a renamed subvolume root: we won't have a - * correct dirent for it. That's expected, see - * inode_should_reattach(). - * - * We don't clear the backpointer field when doing the rename - * because there might be arbitrarily many versions in older - * snapshots. - */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - goto out; - } - - if (fsck_err_on(ret, - trans, inode_points_to_missing_dirent, - "inode points to missing dirent\n%s", - (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), - trans, inode_points_to_wrong_dirent, - "%s", - (printbuf_reset(&buf), - dirent_inode_mismatch_msg(&buf, c, d, inode), - buf.buf))) { - /* - * We just clear the backpointer fields for now. If we find a - * dirent that points to this inode in check_dirents(), we'll - * update it then; then when we get to check_path() if the - * backpointer is still 0 we'll reattach it. 
- */ - inode->bi_dir = 0; - inode->bi_dir_offset = 0; - *write_inode = true; - } -out: - ret = 0; -fsck_err: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int check_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_inode_unpacked *snapshot_root, - struct snapshots_seen *s) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct bch_inode_unpacked u; - bool do_update = false; - int ret; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - goto err; - if (ret) - return 0; - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - goto err; - - if (snapshot_root->bi_inum != u.bi_inum) { - ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); - if (ret) - goto err; - } - - if (u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { - ret = bch2_repair_inode_hash_info(trans, snapshot_root); - BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); - if (ret) - goto err; - } - - ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); - if (ret) - goto err; - - if (bch2_inode_has_backpointer(&u)) { - ret = check_inode_dirent_inode(trans, &u, &do_update); - if (ret) - goto err; - } - - if (fsck_err_on(bch2_inode_has_backpointer(&u) && - (u.bi_flags & BCH_INODE_unlinked), - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - } - - if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) { - /* Check for this early so that check_unreachable_inode() will reattach it */ - - ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot); - if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty) - goto err; - - fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty, - "dir unlinked but not empty\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf)); - u.bi_flags &= ~BCH_INODE_unlinked; - do_update = true; - ret = 0; - } - - if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, - trans, inode_dir_has_nonzero_i_size, - "directory %llu:%u with nonzero i_size %lli", - u.bi_inum, u.bi_snapshot, u.bi_size)) { - u.bi_size = 0; - do_update = true; - } - - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), - trans, inode_has_child_snapshots_wrong, - "inode has_child_snapshots flag wrong (should be %u)\n%s", - ret, - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - if (ret) - u.bi_flags |= BCH_INODE_has_child_snapshot; - else - u.bi_flags &= ~BCH_INODE_has_child_snapshot; - do_update = true; - } - ret = 0; - - if ((u.bi_flags & BCH_INODE_unlinked) && - !(u.bi_flags & BCH_INODE_has_child_snapshot)) { - if (!test_bit(BCH_FS_started, &c->flags)) { - /* - * If we're not in online fsck, don't delete unlinked - * inodes, just make sure they're on the deleted list. - * - * They might be referred to by a logged operation - - * i.e. we might have crashed in the middle of a - * truncate on an unlinked but open file - so we want to - * let the delete_dead_inodes kill it after resuming - * logged ops. 
- */ - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - goto err_noprint; - - fsck_err_on(!ret, - trans, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); - - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); - if (ret) - goto err; - } else { - ret = bch2_inode_or_descendents_is_open(trans, k.k->p); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, inode_unlinked_and_not_open, - "inode %llu:%u unlinked and not open", - u.bi_inum, u.bi_snapshot)) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - goto err_noprint; - } - ret = 0; - } - } - - if (fsck_err_on(u.bi_parent_subvol && - (u.bi_subvol == 0 || - u.bi_subvol == BCACHEFS_ROOT_SUBVOL), - trans, inode_bi_parent_nonzero, - "inode %llu:%u has subvol %u but nonzero parent subvol %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { - u.bi_parent_subvol = 0; - do_update = true; - } - - if (u.bi_subvol) { - struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); - goto do_update; - } - - if (fsck_err_on(ret, - trans, inode_bi_subvol_missing, - "inode %llu:%u bi_subvol points to missing subvolume %u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol) || - fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || - !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), - k.k->p.snapshot), - trans, inode_bi_subvol_wrong, - "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", - u.bi_inum, k.k->p.snapshot, u.bi_subvol, - le64_to_cpu(s.inode), - le32_to_cpu(s.snapshot))) { - u.bi_subvol = 0; - u.bi_parent_subvol = 0; - do_update = true; - } - } - - if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), - trans, inode_journal_seq_in_future, - "inode journal seq in future (currently at %llu)\n%s", - journal_cur_seq(&c->journal), - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &u), - buf.buf))) { - u.bi_journal_seq = journal_cur_seq(&c->journal); - do_update = true; - } -do_update: - if (do_update) { - ret = __bch2_fsck_write_inode(trans, &u); - bch_err_msg(c, ret, "in fsck updating inode"); - if (ret) - goto err_noprint; - } -err: -fsck_err: - bch_err_fn(c, ret); -err_noprint: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_inodes(struct bch_fs *c) -{ - struct bch_inode_unpacked snapshot_root = {}; - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &snapshot_root, &s))); - - snapshots_seen_exit(&s); - bch_err_fn(c, ret); - return ret; -} - -static int find_oldest_inode_needs_reattach(struct btree_trans *trans, - struct bch_inode_unpacked *inode) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - /* - * We look for inodes to reattach in natural key order, leaves first, - * but we should do the reattach at the oldest version that needs to be - * reattached: - */ - for_each_btree_key_norestart(trans, iter, - BTREE_ID_inodes, - SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), - BTREE_ITER_all_snapshots, k, 
ret) { - if (k.k->p.offset != inode->bi_inum) - break; - - if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) - continue; - - if (!bkey_is_inode(k.k)) - break; - - struct bch_inode_unpacked parent_inode; - ret = bch2_inode_unpack(k, &parent_inode); - if (ret) - break; - - if (!inode_should_reattach(&parent_inode)) - break; - - *inode = parent_inode; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static int check_unreachable_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!bkey_is_inode(k.k)) - return 0; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - return ret; - - if (!inode_should_reattach(&inode)) - return 0; - - ret = find_oldest_inode_needs_reattach(trans, &inode); - if (ret) - return ret; - - if (fsck_err(trans, inode_unreachable, - "unreachable inode:\n%s", - (bch2_inode_unpacked_to_text(&buf, &inode), - buf.buf))) - ret = reattach_inode(trans, &inode); -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Reattach unreachable (but not unlinked) inodes - * - * Run after check_inodes() and check_dirents(), so we know that inode - * backpointer fields point to valid dirents, and every inode that has a dirent - * that points to it has its backpointer field set - so we're just looking for - * non-unlinked inodes without backpointers: - * - * XXX: this is racy w.r.t. hardlink removal in online fsck - */ -int bch2_check_unreachable_inodes(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_unreachable_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) -{ - switch (btree) { - case BTREE_ID_extents: - return S_ISREG(mode) || S_ISLNK(mode); - case BTREE_ID_dirents: - return S_ISDIR(mode); - case BTREE_ID_xattrs: - return true; - default: - BUG(); - } -} - -static int check_key_has_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct inode_walker *inode, - struct inode_walker_entry *i, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter2 = {}; - int ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - bool have_inode = i && !i->whiteout; - - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) - goto reconstruct; - - if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) - goto out; - - prt_printf(&buf, ", "); - - bool have_old_inode = false; - darray_for_each(inode->inodes, i2) - if (!i2->whiteout && - bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && - btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { - prt_printf(&buf, "but found good inode in older snapshot\n"); - bch2_inode_unpacked_to_text(&buf, &i2->inode); - prt_newline(&buf); - have_old_inode = true; - break; - } - - struct bkey_s_c k2; - unsigned nr_keys = 0; - - prt_printf(&buf, "found keys:\n"); - - for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, - SPOS(k.k->p.inode, 0, k.k->p.snapshot), - POS(k.k->p.inode, U64_MAX), - 0, k2, ret) { - nr_keys++; - if (nr_keys <= 10) { - bch2_bkey_val_to_text(&buf, c, k2); - prt_newline(&buf); - } - if (nr_keys >= 100) - break; - } - - if (ret) - goto
err; - - if (nr_keys > 100) - prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); - else if (nr_keys > 10) - prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); - - if (!have_inode) { - if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, - "key in missing inode%s", buf.buf)) { - /* - * Maybe a deletion that raced with data move, or something - * weird like that? But if we know the inode was deleted, or - * it's just a few keys, we can safely delete them. - * - * If it's many keys, we should probably recreate the inode - */ - if (have_old_inode || nr_keys <= 2) - goto delete; - else - goto reconstruct; - } - } else { - /* - * not autofix, this one would be a giant wtf - bit error in the - * inode corrupting i_mode? - * - * may want to try repairing inode instead of deleting - */ - if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o%s", - i->inode.bi_mode, buf.buf)) - goto delete; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter2); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -delete: - /* - * XXX: print out more info - * count up extents for this inode, check if we have different inode in - * an older snapshot version, perhaps decide if we want to reconstitute - */ - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - goto out; -reconstruct: - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - inode->last_pos.inode--; - ret = bch_err_throw(c, transaction_restart_nested); - goto out; -} - -static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_sectors == i->count) - continue; - - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); - - if (w->recalculate_sums) - i->count = count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - } - - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), - trans, inode_i_sectors_wrong, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, - i->inode.bi_sectors, i->count)) { - i->inode.bi_sectors = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_i_sectors_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -struct extent_end { - u32 snapshot; - u64 offset; - struct snapshots_seen seen; -}; - -struct extent_ends { - struct bpos last_pos; - DARRAY(struct extent_end) e; -}; - -static void extent_ends_reset(struct extent_ends *extent_ends) -{ - darray_for_each(extent_ends->e, i) - snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; -} - -static void extent_ends_exit(struct extent_ends *extent_ends) -{ - extent_ends_reset(extent_ends); - darray_exit(&extent_ends->e); -} - -static void extent_ends_init(struct extent_ends *extent_ends) -{ - memset(extent_ends, 0, sizeof(*extent_ends)); -} - -static 
int extent_ends_at(struct bch_fs *c, - struct extent_ends *extent_ends, - struct snapshots_seen *seen, - struct bkey_s_c k) -{ - struct extent_end *i, n = (struct extent_end) { - .offset = k.k->p.offset, - .snapshot = k.k->p.snapshot, - .seen = *seen, - }; - - n.seen.ids.data = kmemdup(seen->ids.data, - sizeof(seen->ids.data[0]) * seen->ids.size, - GFP_KERNEL); - if (!n.seen.ids.data) - return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); - - __darray_for_each(extent_ends->e, i) { - if (i->snapshot == k.k->p.snapshot) { - snapshots_seen_exit(&i->seen); - *i = n; - return 0; - } - - if (i->snapshot >= k.k->p.snapshot) - break; - } - - return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); -} - -static int overlapping_extents_found(struct btree_trans *trans, - enum btree_id btree, - struct bpos pos1, struct snapshots_seen *pos1_seen, - struct bkey pos2, - bool *fixed, - struct extent_end *extent_end) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = {}; - struct bkey_s_c k1, k2; - int ret; - - BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); - - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k1); - if (ret) - goto err; - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k1); - - if (!bpos_eq(pos1, k1.k->p)) { - prt_str(&buf, "\nwanted\n "); - bch2_bpos_to_text(&buf, pos1); - prt_str(&buf, "\n"); - bch2_bkey_to_text(&buf, &pos2); - - bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - bch2_trans_copy_iter(trans, &iter2, &iter1); - - while (1) { - bch2_btree_iter_advance(trans, &iter2); - - k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); - ret = bkey_err(k2); - if (ret) - goto err; - - if (bpos_ge(k2.k->p, pos2.p)) - break; - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k2); - - if (bpos_gt(k2.k->p, pos2.p) || - pos2.size != k2.k->size) { - bch_err(c, "%s: error finding second overlapping extent when repairing%s", - __func__, buf.buf); - ret = bch_err_throw(c, internal_fsck_err); - goto err; - } - - prt_printf(&buf, "\noverwriting %s extent", - pos1.snapshot >= pos2.p.snapshot ?
"first" : "second"); - - if (fsck_err(trans, extent_overlapping, - "overlapping extents%s", buf.buf)) { - struct btree_iter *old_iter = &iter1; - struct disk_reservation res = { 0 }; - - if (pos1.snapshot < pos2.p.snapshot) { - old_iter = &iter2; - swap(k1, k2); - } - - trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); - - ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_internal_snapshot_node, - k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); - bch2_disk_reservation_put(c, &res); - - bch_info(c, "repair ret %s", bch2_err_str(ret)); - - if (ret) - goto err; - - *fixed = true; - - if (pos1.snapshot == pos2.p.snapshot) { - /* - * We overwrote the first extent, and did the overwrite - * in the same snapshot: - */ - extent_end->offset = bkey_start_offset(&pos2); - } else if (pos1.snapshot > pos2.p.snapshot) { - /* - * We overwrote the first extent in pos2's snapshot: - */ - ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); - } else { - /* - * We overwrote the second extent - restart - * check_extent() from the top: - */ - ret = bch_err_throw(c, transaction_restart_nested); - } - } -fsck_err: -err: - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_iter_exit(trans, &iter1); - printbuf_exit(&buf); - return ret; -} - -static int check_overlapping_extents(struct btree_trans *trans, - struct snapshots_seen *seen, - struct extent_ends *extent_ends, - struct bkey_s_c k, - struct btree_iter *iter, - bool *fixed) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - /* transaction restart, running again */ - if (bpos_eq(extent_ends->last_pos, k.k->p)) - return 0; - - if (extent_ends->last_pos.inode != k.k->p.inode) - extent_ends_reset(extent_ends); - - darray_for_each(extent_ends->e, i) { - if (i->offset <= bkey_start_offset(k.k)) - continue; - - if (!ref_visible2(c, - k.k->p.snapshot, seen, - i->snapshot, &i->seen)) - continue; - - ret = overlapping_extents_found(trans, iter->btree_id, - SPOS(iter->pos.inode, - i->offset, - i->snapshot), - &i->seen, - *k.k, fixed, i); - if (ret) - goto err; - } - - extent_ends->last_pos = k.k->p; -err: - return ret; -} - -static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc_is_encoded(crc) && - crc.uncompressed_size > encoded_extent_max_sectors) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); - printbuf_exit(&buf); - } - - return 0; -} - -static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct inode_walker *inode, - struct snapshots_seen *s, - struct extent_ends *extent_ends, - struct disk_reservation *res) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { - ret = check_i_sectors(trans, inode); - if (ret) - goto err; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(extent_i); - if (ret) - goto err; - - ret = check_key_has_inode(trans, iter, inode, extent_i, k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_whiteout) { - ret = check_overlapping_extents(trans, s, extent_ends, k, iter, - &inode->recalculate_sums); - if (ret) - goto err; - - /* - * Check inodes in reverse order, from oldest snapshots to - * newest, starting from the inode that matches this extent's - * snapshot. If we didn't have one, iterate over all inodes: - */ - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; - - if (fsck_err_on(k.k->p.offset > last_block && - !bkey_extent_is_reservation(k), - trans, extent_past_end_of_inode, - "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: - bch2_fpunch_snapshot(trans, - SPOS(i->inode.bi_inum, - last_block, - i->inode.bi_snapshot), - POS(i->inode.bi_inum, U64_MAX)); - if (ret) - goto err; - - iter->k.type = KEY_TYPE_whiteout; - break; - } - } - } - - ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - if (bkey_extent_is_allocation(k.k)) { - for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->whiteout || - i->inode.bi_snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) - continue; - - i->count += k.k->size; - } - } - - if (k.k->type != KEY_TYPE_whiteout) { - ret = extent_ends_at(c, extent_ends, s, k); - if (ret) - goto err; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Walk extents: verify that extents have a corresponding S_ISREG inode, and - * that i_size and i_sectors are consistent - */ -int bch2_check_extents(struct bch_fs *c) -{ - struct inode_walker w = inode_walker_init(); - struct snapshots_seen s; - struct extent_ends extent_ends; - struct disk_reservation res = { 0 }; - - snapshots_seen_init(&s); - extent_ends_init(&extent_ends); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors_notnested(trans, &w)); - - bch2_disk_reservation_put(c, &res); - extent_ends_exit(&extent_ends); - inode_walker_exit(&w); - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_check_indirect_extents(struct bch_fs *c) -{ - struct disk_reservation res = { 0 }; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_prefetch, k, - &res, NULL, -
BCH_TRANS_COMMIT_no_enospc, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - }))); - - bch2_disk_reservation_put(c, &res); - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - s64 count2; - - darray_for_each(w->inodes, i) { - if (i->inode.bi_nlink == i->count) - continue; - - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); - if (count2 < 0) - return count2; - - if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); - i->count = count2; - if (i->inode.bi_nlink == i->count) - continue; - } - - if (i->inode.bi_nlink != i->count) { - CLASS(printbuf, buf)(); - - lockrestart_do(trans, - bch2_inum_snapshot_to_path(trans, w->last_pos.inode, - i->inode.bi_snapshot, NULL, &buf)); - - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory with wrong i_nlink: got %u, should be %llu\n%s", - i->inode.bi_nlink, i->count, buf.buf)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } - } - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - -static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) -{ - u32 restart_count = trans->restart_count; - return check_subdir_count_notnested(trans, w) ?: - trans_was_restarted(trans, restart_count); -} - -/* find a subvolume that's a descendant of @snapshot: */ -static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { - bch2_trans_iter_exit(trans, &iter); - *subvolid = k.k->p.offset; - goto found; - } - } - if (!ret) - ret = -ENOENT; -found: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -noinline_for_stack -static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) -{ - struct bch_fs *c = trans->c; - struct btree_iter subvol_iter = {}; - struct bch_inode_unpacked subvol_root; - u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 parent_snapshot; - u32 new_parent_subvol = 0; - u64 parent_inum; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || - (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { - int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret2 && !bch2_err_matches(ret, ENOENT)) - return ret2; - } - - if (ret && - !new_parent_subvol && - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { - /* - * Couldn't find a subvol for dirent's snapshot - but we lost - * subvols, so we need to reconstruct: - */ - ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); - if (ret) - return ret; - - parent_snapshot = d.k->p.snapshot; - } - - if (fsck_err_on(ret, - trans, dirent_to_missing_parent_subvol, -
"dirent parent_subvol points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || - fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), - trans, dirent_not_visible_in_parent_subvol, - "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", - parent_snapshot, - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - if (!new_parent_subvol) { - bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return bch_err_throw(c, fsck_repair_unimplemented); - } - - struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); - ret = PTR_ERR_OR_ZERO(new_dirent); - if (ret) - goto err; - - new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); - } - - struct bkey_s_c_subvolume s = - bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, target_subvol), - 0, subvolume); - ret = bkey_err(s.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - if (fsck_err(trans, dirent_to_missing_subvol, - "dirent points to missing subvolume\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return bch2_fsck_remove_dirent(trans, d.k->p); - ret = 0; - goto out; - } - - if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { - printbuf_reset(&buf); - - prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", - parent_subvol); - - ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, - le64_to_cpu(s.v->inode) }, &buf); - if (ret) - goto err; - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, s.s_c); - - if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = cpu_to_le32(parent_subvol); - } - } - - u64 target_inum = le64_to_cpu(s.v->inode); - u32 target_snapshot = le32_to_cpu(s.v->snapshot); - - ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, - &subvol_root, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (ret) { - bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, - trans, inode_bi_parent_wrong, - "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", - target_inum, - subvol_root.bi_parent_subvol, parent_subvol)) { - subvol_root.bi_parent_subvol = parent_subvol; - subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &subvol_root); - if (ret) - goto err; - } - - ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); - if (ret) - goto err; -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_iter); - printbuf_exit(&buf); - return ret; -} - -static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *dir, - struct inode_walker *target, - struct snapshots_seen *s, - bool *need_second_pass) -{ - struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; - } - - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_whiteout) - goto out; - - if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_dirents_count(trans, dir); - if (ret) - goto err; - } - - i = walk_inode(trans, dir, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret < 0) - goto err; - - ret = check_key_has_inode(trans, iter, dir, i, k); - if (ret) - goto err; - - if (!i || i->whiteout) - goto out; - - if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - dir->first_this_inode = false; - - hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; - - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, - iter, k, need_second_pass); - if (ret < 0) - goto err; - if (ret) { - /* dirent has been deleted */ - ret = 0; - goto out; - } - - if (k.k->type != KEY_TYPE_dirent) - goto out; - - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - /* check casefold */ - if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, - trans, dirent_casefold_mismatch, - "dirent casefold does not match dir casefold\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_parent_subvol) - : 0, - }; - u64 target = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) - : le64_to_cpu(d.v->d_inum); - struct qstr name = bch2_dirent_get_name(d); - - struct bkey_i_dirent *new_d = - bch2_dirent_create_key(trans, hash_info, dir_inum, - d.v->d_type, &name, NULL, target); - ret = PTR_ERR_OR_ZERO(new_d); - if (ret) - goto out; - - new_d->k.p.inode = d.k->p.inode; - new_d->k.p.snapshot = d.k->p.snapshot; - - struct btree_iter dup_iter = {}; - ret = bch2_hash_delete_at(trans, - bch2_dirent_hash_desc, hash_info, iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_str_hash_repair_key(trans, s, - &bch2_dirent_hash_desc, hash_info, - iter, bkey_i_to_s_c(&new_d->k_i), - &dup_iter, bkey_s_c_null, - need_second_pass); - goto out; - } - - if (d.v->d_type == DT_SUBVOL) { - ret = check_dirent_to_subvol(trans, iter, d); - if (ret) - goto err; - } else { - ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); - if (ret) - goto err; - - if (fsck_err_on(!target->inodes.nr, - trans, dirent_to_missing_inode, - "dirent points to missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - ret = bch2_fsck_remove_dirent(trans, d.k->p); - if (ret) - goto err; - } - - darray_for_each(target->inodes, i) { - ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); - if (ret) - goto err; - } - - darray_for_each(target->deletes, i) - if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), - trans, dirent_to_overwritten_inode, - "dirent points to inode overwritten in snapshot %u:\n%s", - *i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) { - struct btree_iter delete_iter; - bch2_trans_iter_init(trans, &delete_iter, - BTREE_ID_dirents, - SPOS(k.k->p.inode, k.k->p.offset, *i), - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, - &delete_iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &delete_iter); - if (ret) - goto err; - - } - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; - - 
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { - if (d.v->d_type == DT_DIR) - i->count++; - i->i_size += bkey_bytes(d.k); - } -out: -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -int bch2_check_dirents(struct bch_fs *c) -{ - struct inode_walker dir = inode_walker_init(); - struct inode_walker target = inode_walker_init(); - struct snapshots_seen s; - struct bch_hash_info hash_info; - bool need_second_pass = false, did_second_pass = false; - int ret; - - snapshots_seen_init(&s); -again: - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, - &need_second_pass)) ?: - check_subdir_count_notnested(trans, &dir)); - - if (!ret && need_second_pass && !did_second_pass) { - bch_info(c, "check_dirents requires second pass"); - swap(did_second_pass, need_second_pass); - goto again; - } - - if (!ret && need_second_pass) { - bch_err(c, "dirents not repairing"); - ret = -EINVAL; - } - - snapshots_seen_exit(&s); - inode_walker_exit(&dir); - inode_walker_exit(&target); - bch_err_fn(c, ret); - return ret; -} - -static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct bch_hash_info *hash_info, - struct inode_walker *inode) -{ - struct bch_fs *c = trans->c; - - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret < 0) - return ret; - if (ret) - return 0; - - struct inode_walker_entry *i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(i); - if (ret) - return ret; - - ret = check_key_has_inode(trans, iter, inode, i, k); - if (ret) - return ret; - - if (!i || i->whiteout) - return 0; - - if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &i->inode); - inode->first_this_inode = false; - - bool need_second_pass = false; - return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, - iter, k, &need_second_pass); -} - -/* - * Walk xattrs: verify that they all have a corresponding inode - */ -int bch2_check_xattrs(struct bch_fs *c) -{ - struct inode_walker inode = inode_walker_init(); - struct bch_hash_info hash_info; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_xattr(trans, &iter, k, &hash_info, &inode))); - - inode_walker_exit(&inode); - bch_err_fn(c, ret); - return ret; -} - -static int check_root_trans(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked root_inode; - u32 snapshot; - u64 inum; - int ret; - - ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, - "root subvol missing")) { - struct bkey_i_subvolume *root_subvol = - bch2_trans_kmalloc(trans, sizeof(*root_subvol)); - ret = PTR_ERR_OR_ZERO(root_subvol); - if (ret) - goto err; - - snapshot = U32_MAX; - inum = BCACHEFS_ROOT_INO; - - bkey_subvolume_init(&root_subvol->k_i); - root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol->v.flags = 0; - root_subvol->v.snapshot = cpu_to_le32(snapshot); - root_subvol->v.inode = cpu_to_le64(inum); - ret = 
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); - bch_err_msg(c, ret, "writing root subvol"); - if (ret) - goto err; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, - &root_inode, 0); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (mustfix_fsck_err_on(ret, - trans, root_dir_missing, - "root directory missing") || - mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), - trans, root_inode_not_dir, - "root inode not a directory")) { - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode.bi_inum = inum; - root_inode.bi_snapshot = snapshot; - - ret = __bch2_fsck_write_inode(trans, &root_inode); - bch_err_msg(c, ret, "writing root inode"); - } -err: -fsck_err: - return ret; -} - -/* Get root directory, create if it doesn't exist: */ -int bch2_check_root(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_root_trans(trans)); - bch_err_fn(c, ret); - return ret; -} - -static bool darray_u32_has(darray_u32 *d, u32 v) -{ - darray_for_each(*d, i) - if (*i == v) - return true; - return false; -} - -static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct btree_iter parent_iter = {}; - darray_u32 subvol_path = {}; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol_inum start = { - .subvol = k.k->p.offset, - .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), - }; - - while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { - ret = darray_push(&subvol_path, k.k->p.offset); - if (ret) - goto err; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - struct bch_inode_unpacked subvol_root; - ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &subvol_root); - if (ret) - break; - - u32 parent = le32_to_cpu(s.v->fs_path_parent); - - if (darray_u32_has(&subvol_path, parent)) { - printbuf_reset(&buf); - prt_printf(&buf, "subvolume loop: "); - - ret = bch2_inum_to_path(trans, start, &buf); - if (ret) - goto err; - - if (fsck_err(trans, subvol_loop, "%s", buf.buf)) - ret = reattach_subvol(trans, s); - break; - } - - bch2_trans_iter_exit(trans, &parent_iter); - bch2_trans_iter_init(trans, &parent_iter, - BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(trans, &parent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, - trans, subvol_unreachable, - "unreachable subvolume %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, s.s_c), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - } -fsck_err: -err: - printbuf_exit(&buf); - darray_exit(&subvol_path); - bch2_trans_iter_exit(trans, &parent_iter); - return ret; -} - -int bch2_check_subvolume_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_path(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int bch2_bi_depth_renumber_one(struct btree_trans *trans, - u64 inum, u32 snapshot, - u32 new_depth) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), 0); - - struct bch_inode_unpacked inode; - int ret = bkey_err(k) ?: - !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(k, &inode); - if (ret) - goto err; - - if (inode.bi_depth != new_depth) { - inode.bi_depth = new_depth; - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, - u32 snapshot, u32 new_bi_depth) -{ - u32 restart_count = trans->restart_count; - int ret = 0; - - darray_for_each_reverse(*path, i) { - ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); - bch_err_fn(trans->c, ret); - if (ret) - break; - - new_bi_depth++; - } - - return ret ?: trans_was_restarted(trans, restart_count); -} - -static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - darray_u64 path = {}; - struct printbuf buf = PRINTBUF; - u32 snapshot = inode_k.k->p.snapshot; - bool redo_bi_depth = false; - u32 min_bi_depth = U32_MAX; - int ret = 0; - - struct bpos start = inode_k.k->p; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(inode_k, &inode); - if (ret) - return ret; - - /* - * If we're running full fsck, check_dirents() will have already run, - * and we shouldn't see any missing backpointers here - otherwise that's - * handled separately, by check_unreachable_inodes - */ - while (!inode.bi_subvol && - bch2_inode_has_backpointer(&inode)) { - struct btree_iter dirent_iter; - struct bkey_s_c_dirent d; - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); - ret = bkey_err(d.s_c); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto out; - - if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) - bch2_trans_iter_exit(trans, &dirent_iter); - - if (bch2_err_matches(ret, ENOENT)) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, inode_k); - bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", - bch2_err_str(ret), buf.buf); - goto out; - } - - bch2_trans_iter_exit(trans, &dirent_iter); - - ret = darray_push(&path, inode.bi_inum); - if (ret) - return ret; - - bch2_trans_iter_exit(trans, &inode_iter); - inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inode.bi_dir, snapshot), 0); - - struct bch_inode_unpacked parent_inode; - ret = bkey_err(inode_k) ?: - !bkey_is_inode(inode_k.k) ?
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &parent_inode); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err_msg(c, ret, "error looking up parent directory"); - goto out; - } - - min_bi_depth = parent_inode.bi_depth; - - if (parent_inode.bi_depth < inode.bi_depth && - min_bi_depth < U16_MAX) - break; - - inode = parent_inode; - redo_bi_depth = true; - - if (darray_find(path, inode.bi_inum)) { - printbuf_reset(&buf); - prt_printf(&buf, "directory structure loop in snapshot %u: ", - snapshot); - - ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); - if (ret) - goto out; - - if (c->opts.verbose) { - prt_newline(&buf); - darray_for_each(path, i) - prt_printf(&buf, "%llu ", *i); - } - - if (fsck_err(trans, dir_loop, "%s", buf.buf)) { - ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); - if (ret) - goto out; - - ret = reattach_inode(trans, &inode); - bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); - } - - goto out; - } - } - - if (inode.bi_subvol) - min_bi_depth = 0; - - if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); -out: -fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - darray_exit(&path); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * Check for loops in the directory structure: all other connectivity issues - * have been fixed by prior passes - */ -int bch2_check_directory_structure(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!S_ISDIR(bkey_inode_mode(k))) - continue; - - if (bch2_inode_flags(k) & BCH_INODE_unlinked) - continue; - - check_path_loop(trans, k); - }))); - - bch_err_fn(c, ret); - return ret; -} - -struct nlink_table { - size_t nr; - size_t size; - - struct nlink { - u64 inum; - u32 snapshot; - u32 count; - } *d; -}; - -static int add_nlink(struct bch_fs *c, struct nlink_table *t, - u64 inum, u32 snapshot) -{ - if (t->nr == t->size) { - size_t new_size = max_t(size_t, 128UL, t->size * 2); - void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", - new_size); - return bch_err_throw(c, ENOMEM_fsck_add_nlink); - } - - if (t->d) - memcpy(d, t->d, t->size * sizeof(t->d[0])); - kvfree(t->d); - - t->d = d; - t->size = new_size; - } - - - t->d[t->nr++] = (struct nlink) { - .inum = inum, - .snapshot = snapshot, - }; - - return 0; -} - -static int nlink_cmp(const void *_l, const void *_r) -{ - const struct nlink *l = _l; - const struct nlink *r = _r; - - return cmp_int(l->inum, r->inum); -} - -static void inc_link(struct bch_fs *c, struct snapshots_seen *s, - struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum, u32 snapshot) -{ - struct nlink *link, key = { - .inum = inum, .snapshot = U32_MAX, - }; - - if (inum < range_start || inum >= range_end) - return; - - link = __inline_bsearch(&key, links->d, links->nr, - sizeof(links->d[0]), nlink_cmp); - if (!link) - return; - - while (link > links->d && link[0].inum == link[-1].inum) - --link; - - for (; link < links->d + links->nr && link->inum == inum; link++) - if (ref_visible(c, s, snapshot, link->snapshot)) { - link->count++; - if (link->snapshot >= snapshot) - break; - } -} - -noinline_for_stack -static int check_nlinks_find_hardlinks(struct 
bch_fs *c, - struct nlink_table *t, - u64 start, u64 *end) -{ - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - struct bch_inode_unpacked u; - _ret3 = bch2_inode_unpack(k, &u); - if (_ret3) - break; - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; - - /* - * Previous passes ensured that bi_nlink is nonzero if - * it had multiple hardlinks: - */ - if (!u.bi_nlink) - continue; - - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end) -{ - struct snapshots_seen s; - - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_intent| - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - if (k.k->type == KEY_TYPE_dirent) { - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), d.k->p.snapshot); - } - 0; - }))); - - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); - return ret; -} - -static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct nlink_table *links, - size_t *idx, u64 range_end) -{ - struct bch_inode_unpacked u; - struct nlink *link = &links->d[*idx]; - int ret = 0; - - if (k.k->p.offset >= range_end) - return 1; - - if (!bkey_is_inode(k.k)) - return 0; - - ret = bch2_inode_unpack(k, &u); - if (ret) - return ret; - - if (S_ISDIR(u.bi_mode)) - return 0; - - if (!u.bi_nlink) - return 0; - - while ((cmp_int(link->inum, k.k->p.offset) ?: - cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { - BUG_ON(*idx == links->nr); - link = &links->d[++*idx]; - } - - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, - trans, inode_wrong_nlink, - "inode %llu type %s has wrong i_nlink (%u, should be %u)", - u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], - bch2_inode_nlink_get(&u), link->count)) { - bch2_inode_nlink_set(&u, link->count); - ret = __bch2_fsck_write_inode(trans, &u); - } -fsck_err: - return ret; -} - -noinline_for_stack -static int check_nlinks_update_hardlinks(struct bch_fs *c, - struct nlink_table *links, - u64 range_start, u64 range_end) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); - if (ret < 0) { - bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); - return ret; - } - - return 0; -} - -int bch2_check_nlinks(struct bch_fs *c) -{ - struct nlink_table links = { 0 }; - u64 this_iter_range_start, next_iter_range_start = 0; - int ret = 0; - - do { - this_iter_range_start = next_iter_range_start; - next_iter_range_start = U64_MAX; - - ret = 
check_nlinks_find_hardlinks(c, &links, - this_iter_range_start, - &next_iter_range_start); - - ret = check_nlinks_walk_dirents(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - ret = check_nlinks_update_hardlinks(c, &links, - this_iter_range_start, - next_iter_range_start); - if (ret) - break; - - links.nr = 0; - } while (next_iter_range_start != U64_MAX); - - kvfree(links.d); - bch_err_fn(c, ret); - return ret; -} - -static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p; - struct bkey_i_reflink_p *u; - - if (k.k->type != KEY_TYPE_reflink_p) - return 0; - - p = bkey_s_c_to_reflink_p(k); - - if (!p.v->front_pad && !p.v->back_pad) - return 0; - - u = bch2_trans_kmalloc(trans, sizeof(*u)); - int ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(&u->k_i, k); - u->v.front_pad = 0; - u->v.back_pad = 0; - - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); -} - -int bch2_fix_reflink_p(struct bch_fs *c) -{ - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) - return 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_intent|BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - fix_reflink_p_key(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -#ifndef NO_BCACHEFS_CHARDEV - -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_const_str devs = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - 
bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(&devs, &thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? - */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_in_fsck, &c->flags); - - int ret = bch2_run_online_recovery_passes(c, ~0ULL); - - clear_bit(BCH_FS_in_fsck, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->recovery.run_lock)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->recovery.run_lock); - bch2_ro_ref_put(c); - } - return ret; -} - -#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h deleted file mode 100644 index e5fe7cf7b25141..00000000000000 --- a/fs/bcachefs/fsck.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FSCK_H -#define _BCACHEFS_FSCK_H - -#include "str_hash.h" - -/* records snapshot IDs of overwrites at @pos */ -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - -int bch2_fsck_update_backpointers(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct bkey_i *); - -int bch2_check_inodes(struct bch_fs *); -int bch2_check_extents(struct bch_fs *); -int
bch2_check_indirect_extents(struct bch_fs *); -int bch2_check_dirents(struct bch_fs *); -int bch2_check_xattrs(struct bch_fs *); -int bch2_check_root(struct bch_fs *); -int bch2_check_subvolume_structure(struct bch_fs *); -int bch2_check_unreachable_inodes(struct bch_fs *); -int bch2_check_directory_structure(struct bch_fs *); -int bch2_check_nlinks(struct bch_fs *); -int bch2_fix_reflink_p(struct bch_fs *); - -long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); -long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); - -#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c deleted file mode 100644 index ef4cc7395b86b2..00000000000000 --- a/fs/bcachefs/inode.c +++ /dev/null @@ -1,1566 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_write_buffer.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "buckets.h" -#include "compress.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "fs.h" -#include "inode.h" -#include "namei.h" -#include "opts.h" -#include "str_hash.h" -#include "snapshot.h" -#include "subvolume.h" -#include "varint.h" - -#include - -#include - -#define x(name, ...) #name, -const char * const bch2_inode_opts[] = { - BCH_INODE_OPTS() - NULL, -}; - -static const char * const bch2_inode_flag_strs[] = { - BCH_INODE_FLAGS() - NULL -}; -#undef x - -static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); -static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); - -static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; - -static int inode_decode_field(const u8 *in, const u8 *end, - u64 out[2], unsigned *out_bits) -{ - __be64 be[2] = { 0, 0 }; - unsigned bytes, shift; - u8 *p; - - if (in >= end) - return -BCH_ERR_inode_unpack_error; - - if (!*in) - return -BCH_ERR_inode_unpack_error; - - /* - * position of highest set bit indicates number of bytes: - * shift = number of bits to remove in high byte: - */ - shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ - bytes = byte_table[shift - 1]; - - if (in + bytes > end) - return -BCH_ERR_inode_unpack_error; - - p = (u8 *) be + 16 - bytes; - memcpy(p, in, bytes); - *p ^= (1 << 8) >> shift; - - out[0] = be64_to_cpu(be[0]); - out[1] = be64_to_cpu(be[1]); - *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); - - return bytes; -} - -static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode_v3 *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - int ret; - - bkey_inode_v3_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); - packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); - packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); - packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); - SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); - SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); - - -#define x(_name, _bits) \ - nr_fields++; \ - \ - if (inode->_name) { \ - ret = bch2_varint_encode_fast(out, inode->_name); \ - out += ret; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } else { \ - *out++ = 0; \ - \ - if (_bits > 64) \ - *out++ = 0; \ - } - - BCH_INODE_FIELDS_v3() -#undef x - BUG_ON(out > end); - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODEv3_NR_FIELDS(&k->v, nr_fields); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - struct bch_inode_unpacked unpacked; - - ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); - BUG_ON(ret); - BUG_ON(unpacked.bi_inum != inode->bi_inum); - BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); - BUG_ON(unpacked.bi_sectors != inode->bi_sectors); - BUG_ON(unpacked.bi_size != inode->bi_size); - BUG_ON(unpacked.bi_version != inode->bi_version); - BUG_ON(unpacked.bi_mode != inode->bi_mode); - -#define x(_name, _bits) if (unpacked._name != inode->_name) \ - panic("unpacked %llu should be %llu", \ - (u64) unpacked._name, (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - } -} - -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bch2_inode_pack_inlined(packed, inode); -} - -static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) -{ - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - u64 field[2]; - unsigned fieldnr = 0, field_bits; - int ret; - -#define x(_name, _bits) \ - if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ - unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ - memset((void *) unpacked + offset, 0, \ - sizeof(*unpacked) - offset); \ - return 0; \ - } \ - \ - ret = inode_decode_field(in, end, field, &field_bits); \ - if (ret < 0) \ - return ret; \ - \ - if (field_bits > sizeof(unpacked->_name) * 8) \ - return -BCH_ERR_inode_unpack_error; \ - \ - unpacked->_name = field[1]; \ - in += ret; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, - const u8 *in, const u8 *end, - unsigned nr_fields) -{ - unsigned fieldnr = 0; - int ret; - u64 v[2]; - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v2() -#undef x - - /* XXX: signal if there were more fields than expected? */ - return 0; -} - -static int bch2_inode_unpack_v3(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); - unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); - unsigned fieldnr = 0; - int ret; - u64 v[2]; - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); - unpacked->bi_size = le64_to_cpu(inode.v->bi_size); - unpacked->bi_version = le64_to_cpu(inode.v->bi_version); - unpacked->bi_mode = INODEv3_MODE(inode.v); - -#define x(_name, _bits) \ - if (fieldnr < nr_fields) { \ - ret = bch2_varint_decode_fast(in, end, &v[0]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - \ - if (_bits > 64) { \ - ret = bch2_varint_decode_fast(in, end, &v[1]); \ - if (ret < 0) \ - return ret; \ - in += ret; \ - } else { \ - v[1] = 0; \ - } \ - } else { \ - v[0] = v[1] = 0; \ - } \ - \ - unpacked->_name = v[0]; \ - if (v[1] || v[0] != unpacked->_name) \ - return -BCH_ERR_inode_unpack_error; \ - fieldnr++; - - BCH_INODE_FIELDS_v3() -#undef x - - /* XXX: signal if there were more fields than expected? 
*/ - return 0; -} - -static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - memset(unpacked, 0, sizeof(*unpacked)); - - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= 0; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - if (INODEv1_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv1_NR_FIELDS(inode.v)); - } else { - return bch2_inode_unpack_v1(inode, unpacked); - } - break; - } - case KEY_TYPE_inode_v2: { - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_snapshot = inode.k->p.snapshot; - unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - - return bch2_inode_unpack_v2(unpacked, inode.v->fields, - bkey_val_end(inode), - INODEv2_NR_FIELDS(inode.v)); - } - default: - BUG(); - } -} - -int bch2_inode_unpack(struct bkey_s_c k, - struct bch_inode_unpacked *unpacked) -{ - return likely(k.k->type == KEY_TYPE_inode_v3) - ? bch2_inode_unpack_v3(k, unpacked) - : bch2_inode_unpack_slowpath(k, unpacked); -} - -int __bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags, - bool warn) -{ - u32 snapshot; - int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); - if (ret) - return ret; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (ret) - goto err; - - ret = bch2_inode_unpack(k, inode); - if (ret) - goto err; - - return 0; -err: - if (warn) - bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); - bch2_trans_iter_exit(trans, iter); - return ret; -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, - u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode, - unsigned flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), flags); - int ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, root); - goto out; - } - } - /* We're only called when we know we have an inode for @inum */ - BUG_ON(!ret); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_write_flags(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_inode_buf *inode_p; - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack_inlined(inode_p, inode); - inode_p->inode.k.p.snapshot = iter->snapshot; - return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); -} - -int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = inode->bi_snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_internal_snapshot_node); -} - -int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fsck_write_inode(trans, inode)); - bch_err_fn(trans->c, ret); - return ret; -} - -struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) -{ - struct bch_inode_unpacked u; - struct bkey_inode_buf *inode_p; - int ret; - - if (!bkey_is_inode(&k->k)) - return ERR_PTR(-ENOENT); - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return ERR_CAST(inode_p); - - ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); - if (ret) - return ERR_PTR(ret); - - bch2_inode_pack(inode_p, &u); - return &inode_p->inode.k_i; -} - -static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bch_inode_unpacked unpacked; - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); - - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, - c, inode_pos_blockdev_range, - "fs inode in blockdev range"); - - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), - c, 
inode_unpack_error, - "invalid variable length fields"); - - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, - c, inode_checksum_type_invalid, - "invalid data checksum type (%u >= %u)", - unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - - bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), - c, inode_compression_type_invalid, - "invalid compression opt %u", unpacked.bi_compression - 1); - - bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, - c, inode_unlinked_but_nlink_nonzero, - "flagged as unlinked but bi_nlink != 0"); - - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), - c, inode_subvol_root_but_not_dir, - "subvolume root but not a directory"); -fsck_err: - return ret; -} - -int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - int ret = 0; - - bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - int ret = 0; - - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - int ret = 0; - - bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), - c, inode_v3_fields_start_bad, - "invalid fields_start (got %llu, min %u max %zu)", - INODEv3_FIELDS_START(inode.v), - INODEv3_FIELDS_START_INITIAL, - bkey_val_u64s(inode.k)); - - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, - c, inode_str_hash_invalid, - "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - - ret = __bch2_inode_validate(c, k, from); -fsck_err: - return ret; -} - -static void __bch2_inode_unpacked_to_text(struct printbuf *out, - struct bch_inode_unpacked *inode) -{ - prt_printf(out, "\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o\n", inode->bi_mode); - - prt_str(out, "flags="); - prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, "(%x)\n", inode->bi_flags); - - prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); - prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); - prt_printf(out, "hash_type="); - bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); - prt_newline(out); - prt_printf(out, "bi_size=%llu\n", inode->bi_size); - prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); - prt_printf(out, "bi_version=%llu\n", inode->bi_version); - -#define x(_name, _bits) \ - prt_printf(out, #_name "=%llu\n", (u64) inode->_name); - BCH_INODE_FIELDS_v3() -#undef x - - bch2_printbuf_strip_trailing_newline(out); - printbuf_indent_sub(out, 2); -} - -void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) -{ - prt_printf(out, "inum: %llu:%u ", inode->bi_inum,
inode->bi_snapshot); - __bch2_inode_unpacked_to_text(out, inode); -} - -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_inode_unpacked inode; - - if (bch2_inode_unpack(k, &inode)) { - prt_printf(out, "(unpack error)"); - return; - } - - __bch2_inode_unpacked_to_text(out, &inode); -} - -static inline u64 bkey_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); - return; - case KEY_TYPE_inode_v2: - bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); - return; - case KEY_TYPE_inode_v3: - bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); - return; - default: - BUG(); - } -} - -static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) -{ - unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; - - return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); -} - -static struct bkey_s_c -bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos pos, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, *iter, btree, - bpos_successor(pos), - SPOS(pos.inode, pos.offset, U32_MAX), - flags|BTREE_ITER_all_snapshots, k, ret) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) - return k; - - bch2_trans_iter_exit(trans, iter); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -} - -static struct bkey_s_c -bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos, unsigned flags) -{ - struct bkey_s_c k; -again: - k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); - if (!k.k || - bkey_err(k) || - bkey_is_inode(k.k)) - return k; - - bch2_trans_iter_exit(trans, iter); - pos = k.k->p; - goto again; -} - -int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_max_norestart(trans, iter, - BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_with_updates, k, ret) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && - bkey_is_inode(k.k)) { - ret = 1; - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int update_inode_has_children(struct btree_trans *trans, - struct bkey_s k, - bool have_child) -{ - if (!have_child) { - int ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) - return ret < 0 ? 
ret : 0; - } - - u64 f = bkey_inode_flags(k.s_c); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) - bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); - - return 0; -} - -static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, - bool have_child) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, - &iter, pos, BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - - if (!have_child) { - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret) { - ret = ret < 0 ? ret : 0; - goto err; - } - } - - u64 f = bkey_inode_flags(k); - if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { - struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, - BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_trigger_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - - if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - BUG_ON(!trans->journal_res.seq); - bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); - } - - s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { - int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); - if (ret) - return ret; - } - - if (flags & BTREE_TRIGGER_transactional) { - int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - - (int) bkey_is_unlinked_inode(old); - if (unlinked_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, unlinked_delta > 0); - if (ret) - return ret; - } - - /* - * If we're creating or deleting an inode at this snapshot ID, - * and there might be an inode in a parent snapshot ID, we might - * need to set or clear the has_child_snapshot flag on the - * parent. 
- */ - int deleted_delta = (int) bkey_is_inode(new.k) - - (int) bkey_is_inode(old.k); - if (deleted_delta && - bch2_snapshot_parent(c, new.k->p.snapshot)) { - int ret = update_parent_inode_has_children(trans, new.k->p, - deleted_delta > 0); - if (ret) - return ret; - } - - /* - * When an inode is first updated in a new snapshot, we may need - * to clear has_child_snapshot - */ - if (deleted_delta > 0) { - int ret = update_inode_has_children(trans, new, false); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode, - c, inode_pos_inode_nonzero, - "nonzero k.p.inode"); -fsck_err: - return ret; -} - -void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); - - prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -} - -int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, - c, inode_alloc_cursor_inode_bad, - "k.p.inode bad"); -fsck_err: - return ret; -} - -void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); - - prt_printf(out, "idx %llu generation %llu", - le64_to_cpu(i.v->idx), - le64_to_cpu(i.v->gen)); -} - -void bch2_inode_init_early(struct bch_fs *c, - struct bch_inode_unpacked *inode_u) -{ - enum bch_str_hash_type str_hash = - bch2_str_hash_opt_to_type(c, c->opts.str_hash); - - memset(inode_u, 0, sizeof(*inode_u)); - - SET_INODE_STR_HASH(inode_u, str_hash); - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); -} - -void bch2_inode_init_late(struct bch_fs *c, - struct bch_inode_unpacked *inode_u, u64 now, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - inode_u->bi_mode = mode; - inode_u->bi_uid = uid; - inode_u->bi_gid = gid; - inode_u->bi_dev = rdev; - inode_u->bi_atime = now; - inode_u->bi_mtime = now; - inode_u->bi_ctime = now; - inode_u->bi_otime = now; - - if (parent && parent->bi_mode & S_ISGID) { - inode_u->bi_gid = parent->bi_gid; - if (S_ISDIR(mode)) - inode_u->bi_mode |= S_ISGID; - } - - if (parent) { -#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; - BCH_INODE_OPTS() -#undef x - } - - if (!S_ISDIR(mode)) - inode_u->bi_casefold = 0; - - if (bch2_inode_casefold(c, inode_u)) - inode_u->bi_flags |= BCH_INODE_has_case_insensitive; -} - -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) -{ - bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(c, inode_u, bch2_current_time(c), - uid, gid, mode, rdev, parent); -} - -static struct bkey_i_inode_alloc_cursor * -bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) -{ - struct bch_fs *c = trans->c; - - u64 cursor_idx = c->opts.inodes_32bit ? 
0 : cpu + 1; - - cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), - BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ERR_PTR(ret); - - struct bkey_i_inode_alloc_cursor *cursor = - k.k->type == KEY_TYPE_inode_alloc_cursor - ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) - : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); - ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - goto err; - - if (c->opts.inodes_32bit) { - *min = BLOCKDEV_INODE_MAX; - *max = INT_MAX; - } else { - cursor->v.bits = c->opts.shard_inode_numbers_bits; - - unsigned bits = 63 - c->opts.shard_inode_numbers_bits; - - *min = max(cpu << bits, (u64) INT_MAX + 1); - *max = (cpu << bits) | ~(ULLONG_MAX << bits); - } - - if (le64_to_cpu(cursor->v.idx) < *min) - cursor->v.idx = cpu_to_le64(*min); - - if (le64_to_cpu(cursor->v.idx) >= *max) { - cursor->v.idx = cpu_to_le64(*min); - le32_add_cpu(&cursor->v.gen, 1); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret ? ERR_PTR(ret) : cursor; -} - -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) -{ - u64 min, max; - struct bkey_i_inode_alloc_cursor *cursor = - bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); - int ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - return ret; - - u64 start = le64_to_cpu(cursor->v.idx); - u64 pos = start; - - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - struct bkey_s_c k; -again: - while ((k = bch2_btree_iter_peek(trans, iter)).k && - !(ret = bkey_err(k)) && - bkey_lt(k.k->p, POS(0, max))) { - if (pos < iter->pos.offset) - goto found_slot; - - /* - * We don't need to iterate over keys in every snapshot once - * we've found just one: - */ - pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - } - - if (!ret && pos < max) - goto found_slot; - - if (!ret && start == min) - ret = bch_err_throw(trans->c, ENOSPC_inode_create); - - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - /* Retry from start */ - pos = start = min; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); - le32_add_cpu(&cursor->v.gen, 1); - goto again; -found_slot: - bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return ret; - } - - inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = le64_to_cpu(cursor->v.gen); - cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); - return 0; -} - -static int bch2_inode_delete_keys(struct btree_trans *trans, - subvol_inum inum, enum btree_id id) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i delete; - struct bpos end = POS(inum.inum, U64_MAX); - u32 snapshot; - int ret = 0; - - /* - * We're never going to be deleting partial extents, no need to use an - * extent iterator: - */ - bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_intent); - - while (1) { - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - k = bch2_btree_iter_peek_max(trans, &iter, end); - ret = bkey_err(k); - if (ret) - goto err; 
- - if (!k.k) - break; - - bkey_init(&delete.k); - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_is_extents) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = {}; - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); - if (ret) - goto err2; - - /* - * If this was a directory, there shouldn't be any real dirents left - - * but there could be whiteouts (from hash collisions) that we should - * delete: - * - * XXX: the dirent code ideally would delete whiteouts when they're no - * longer needed - */ - ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); - if (ret) - goto err2; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - BTREE_ITER_intent|BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum.inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) - goto err2; - - ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); -err2: - bch2_trans_put(trans); - return ret; -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -{ - if (bi->bi_flags & BCH_INODE_unlinked) - bi->bi_flags &= ~BCH_INODE_unlinked; - else { - if (bi->bi_nlink == U32_MAX) - return -EINVAL; - - bi->bi_nlink++; - } - - return 0; -} - -void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) -{ - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { - bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", - bi->bi_inum); - return; - } - - if (bi->bi_flags & BCH_INODE_unlinked) { - bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); - return; - } - - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_unlinked; -} - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) -{ - struct bch_opts ret = { 0 }; -#define x(_name, _bits) \ - if (inode->bi_##_name) \ - opt_set(ret, _name, inode->bi_##_name - 1); - BCH_INODE_OPTS() -#undef x - return ret; -} - -void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) -{ -#define x(_name, _bits) \ - if ((inode)->bi_##_name) { \ - opts->_name = inode->bi_##_name - 1; \ - opts->_name##_from_inode = true; \ - } else { \ - opts->_name = c->opts._name; \ - opts->_name##_from_inode = false; \ - } - BCH_INODE_OPTS() -#undef x - - bch2_io_opts_fixups(opts); -} - -int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct 
bch_io_opts *opts) -{ - struct bch_inode_unpacked inode; - int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); - - if (ret) - return ret; - - bch2_inode_opts_get(opts, trans->c, &inode); - return 0; -} - -int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *bi, unsigned v) -{ - struct bch_fs *c = trans->c; - -#ifndef CONFIG_UNICODE - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif - - if (c->opts.casefold_disabled) - return -EOPNOTSUPP; - - int ret = 0; - /* Not supported on individual files. */ - if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; - - /* - * Make sure the dir is empty, as otherwise we'd need to - * rehash everything and update the dirent keys. - */ - ret = bch2_empty_dir_trans(trans, inum); - if (ret < 0) - return ret; - - ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); - if (ret) - return ret; - - bch2_check_set_feature(c, BCH_FEATURE_casefolding); - - bi->bi_casefold = v + 1; - bi->bi_fields_set |= BIT(Inode_opt_casefold); - - return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -} - -static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - int ret; - - do { - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); - if (ret) - goto err; -retry: - bch2_trans_begin(trans); - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_intent); - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - bch2_inode_unpack(k, &inode_u); - - /* Subvolume root? 
*/ - if (inode_u.bi_subvol) - bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - return ret ?: -BCH_ERR_transaction_restart_nested; -} - -/* - * After deleting an inode, there may be versions in older snapshots that should - * also be deleted - if they're not referenced by sibling snapshots and not open - * in other subvolumes: - */ -static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; -next_parent: - ret = lockrestart_do(trans, - bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); - if (ret || !k.k) - return ret; - - bool unlinked = bkey_is_unlinked_inode(k); - pos = k.k->p; - bch2_trans_iter_exit(trans, &iter); - - if (!unlinked) - return 0; - - ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); - if (ret) - return ret < 0 ? ret : 0; - - ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); - if (ret) - return ret; - goto next_parent; -} - -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: - delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); -} - -static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, - bool from_deleted_inodes) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - struct printbuf buf = PRINTBUF; - int ret; - - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_missing, - "nonexistent inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = bch2_inode_unpack(k, &inode); - if (ret) - goto out; - - if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(from_deleted_inodes && - bch2_err_matches(ret, ENOTEMPTY), - trans, deleted_inode_is_dir, - "non empty directory %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - } - - ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_not_unlinked, - "non-deleted inode %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) - ? 
0 : bch_err_throw(c, inode_has_child_snapshot); - - if (fsck_err_on(from_deleted_inodes && ret, - trans, deleted_inode_has_child_snapshots, - "inode with child snapshots %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; - if (ret) - goto out; - - ret = bch2_inode_has_child_snapshots(trans, k.k->p); - if (ret < 0) - goto out; - - if (ret) { - if (fsck_err(trans, inode_has_child_snapshots_wrong, - "inode has_child_snapshots flag wrong (should be set)\n%s", - (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &inode), - buf.buf))) { - inode.bi_flags |= BCH_INODE_has_child_snapshot; - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto out; - } - - if (!from_deleted_inodes) { - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - bch_err_throw(c, inode_has_child_snapshot); - goto out; - } - - goto delete; - - } - - if (from_deleted_inodes) { - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } - - ret = 1; - } -out: -fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - printbuf_exit(&buf); - return ret; -delete: - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); - goto out; -} - -static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) -{ - u32 snapshot; - - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); -} - -int bch2_delete_dead_inodes(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - int ret; - - /* - * if we ran check_inodes() unlinked inodes will have already been - * cleaned up but the write buffer will be out of sync; therefore we - * always need a write buffer flush - */ - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - /* - * Weird transaction restart handling here because on successful delete, - * bch2_inode_rm_snapshot() will return a nested transaction restart, - * but we can't retry because the btree write buffer won't have been - * flushed and we'd spin: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, k.k->p, true); - if (ret > 0) { - bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", - k.k->p.offset, k.k->p.snapshot); - - ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - /* - * We don't want to loop here: a transaction restart - * error here means we handled a transaction restart and - * we're actually done, but if we loop we'll retry the - * same key because the write buffer hasn't been flushed - * yet - */ - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - } - - ret; - })); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h deleted file mode 100644 index b8ec3e628d9055..00000000000000 --- a/fs/bcachefs/inode.h +++ /dev/null @@ -1,319 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_INODE_H -#define _BCACHEFS_INODE_H - -#include "bkey.h" -#include "bkey_methods.h" -#include "opts.h" -#include "snapshot.h" - -extern const char * const bch2_inode_opts[]; - -int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, - struct
bkey_validate_context); -int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); - -static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) -{ - return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 - ? __bch2_inode_has_child_snapshots(trans, pos) - : 0; -} - -int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_validate = bch2_inode_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 16, \ -}) - -#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v2_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 32, \ -}) - -#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_validate = bch2_inode_v3_validate, \ - .val_to_text = bch2_inode_to_text, \ - .trigger = bch2_trigger_inode, \ - .min_val_size = 48, \ -}) - -static inline bool bkey_is_inode(const struct bkey *k) -{ - return k->type == KEY_TYPE_inode || - k->type == KEY_TYPE_inode_v2 || - k->type == KEY_TYPE_inode_v3; -} - -int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_validate = bch2_inode_generation_validate, \ - .val_to_text = bch2_inode_generation_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ - .key_validate = bch2_inode_alloc_cursor_validate, \ - .val_to_text = bch2_inode_alloc_cursor_to_text, \ - .min_val_size = 16, \ -}) - -#if 0 -typedef struct { - u64 lo; - u32 hi; -} __packed __aligned(4) u96; -#endif -typedef u64 u96; - -struct bch_inode_unpacked { - u64 bi_inum; - u32 bi_snapshot; - u64 bi_journal_seq; - __le64 bi_hash_seed; - u64 bi_size; - u64 bi_sectors; - u64 bi_version; - u32 bi_flags; - u16 bi_mode; - -#define x(_name, _bits) u##_bits _name; - BCH_INODE_FIELDS_v3() -#undef x -}; -BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); - -struct bkey_inode_buf { - struct bkey_i_inode_v3 inode; - -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[0 + BCH_INODE_FIELDS_v3()]; -#undef x -}; - -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); -struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); - -void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); - -int __bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, subvol_inum, unsigned, bool); - -static inline int bch2_inode_peek_nowarn(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, 
inode, inum, flags, false); -} - -static inline int bch2_inode_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode, - subvol_inum inum, unsigned flags) -{ - return __bch2_inode_peek(trans, iter, inode, inum, flags, true); -} - -int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, - struct bch_inode_unpacked *, unsigned); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - -int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, - struct bch_inode_unpacked *root); - -int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); - -static inline int bch2_inode_write(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode) -{ - return bch2_inode_write_flags(trans, iter, inode, 0); -} - -int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); -int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); - -void bch2_inode_init_early(struct bch_fs *, - struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); -void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, - uid_t, gid_t, umode_t, dev_t, - struct bch_inode_unpacked *); - -int bch2_inode_create(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u32, u64); - -int bch2_inode_rm(struct bch_fs *, subvol_inum); - -#define inode_opt_get(_c, _inode, _name) \ - ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) - -static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum inode_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - inode->bi_##_name = v; \ - break; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, - enum inode_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Inode_opt_##_name: \ - return inode->bi_##_name; - BCH_INODE_OPTS() -#undef x - default: - BUG(); - } -} - -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - -static inline u8 inode_d_type(struct bch_inode_unpacked *inode) -{ - return inode->bi_subvol ? 
DT_SUBVOL : mode_to_type(inode->bi_mode); -} - -static inline u32 bch2_inode_flags(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); - case KEY_TYPE_inode_v2: - return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); - default: - return 0; - } -} - -static inline unsigned bkey_inode_mode(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); - case KEY_TYPE_inode_v2: - return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); - case KEY_TYPE_inode_v3: - return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); - default: - return 0; - } -} - -static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ - return bi->bi_casefold - ? bi->bi_casefold - 1 - : c->opts.casefold; -} - -static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) -{ - return bi->bi_dir || bi->bi_dir_offset; -} - -/* i_nlink: */ - -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 2 : 1; -} - -static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -{ - return bi->bi_flags & BCH_INODE_unlinked - ? 0 - : bi->bi_nlink + nlink_bias(bi->bi_mode); -} - -static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, - unsigned nlink) -{ - if (nlink) { - bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_unlinked; - } else { - bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_unlinked; - } -} - -int bch2_inode_nlink_inc(struct bch_inode_unpacked *); -void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); - -struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); -int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, unsigned); - -#include "rebalance.h" - -static inline struct bch_extent_rebalance -bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) -{ - struct bch_io_opts io_opts; - bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(c, &io_opts); -} - -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - -static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - -int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); -int bch2_delete_dead_inodes(struct bch_fs *); - -#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h deleted file mode 100644 index 1f00938b1bdc8a..00000000000000 --- a/fs/bcachefs/inode_format.h +++ /dev/null @@ -1,185 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_INODE_FORMAT_H -#define _BCACHEFS_INODE_FORMAT_H - -#define BLOCKDEV_INODE_MAX 4096 -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct 
bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) \ - x(bi_depth, 32) \ - x(bi_inodes_32bit, 8) \ - x(bi_casefold, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) \ - x(inodes_32bit, 8) \ - x(casefold, 8) - -enum inode_opt_id { -#define x(name, ...) 
\ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -/* - * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - - * for overlayfs - */ -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) \ - x(has_case_insensitive, 10) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -struct bch_inode_alloc_cursor { - struct bch_val v; - __u8 bits; - __u8 pad; - __le32 gen; - __le64 idx; -}; - -#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c deleted file mode 100644 index 07023667a475f6..00000000000000 --- a/fs/bcachefs/io_misc.c +++ /dev/null @@ -1,570 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * io_misc.c - fallocate, fpunch, truncate: - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "clock.h" -#include "error.h" -#include "extents.h" -#include "extent_update.h" -#include "inode.h" -#include "io_misc.h" -#include "io_write.h" -#include "logged_ops.h" -#include "rebalance.h" -#include "subvolume.h" - -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - u64 sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets = { 0 }; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated = 0, new_replicas; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - - k = bch2_btree_iter_peek_slot(trans, iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - new_replicas = max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err_noprint; - - bch2_bkey_buf_reassemble(&old, c, k); - - if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - 
reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = bch_err_throw(c, transaction_restart_nested); - if (ret) - goto err; - - sectors = min_t(u64, sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); - prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } -err_noprint: - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock_long(trans); - bch2_wait_on_allocator(c, &cl); - } - - return ret; -} - -/* For fsck */ -int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) -{ - u32 restart_count = trans->restart_count; - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bkey_i delete; - - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - start, end, 0, k, - &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bkey_init(&delete.k); - delete.k.p = iter.pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); - - bch2_extent_trim_atomic(trans, &iter, &delete) ?: - bch2_trans_update(trans, &iter, &delete, 0); - })); - - bch2_disk_reservation_put(c, &disk_res); - return ret ?: trans_was_restarted(trans, restart_count); -} - -/* - * Returns -BCH_ERR_transaction_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, iter, snapshot); - - /* - * peek_max() doesn't have ideal semantics for extents: - */ - k =
bch2_btree_iter_peek_max(trans, iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_intent); - - ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - -/* truncate: */ - -void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); -} - -static int truncate_set_isize(struct btree_trans *trans, - subvol_inum inum, - u64 new_i_size, - bool warn) -{ - struct btree_iter iter = {}; - struct bch_inode_unpacked inode_u; - int ret; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?: - (inode_u.bi_size = new_i_size, 0) ?: - bch2_inode_write(trans, &iter, &inode_u); - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter fpunch_iter; - struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - u64 new_i_size = le64_to_cpu(op->v.new_i_size); - bool warn_errors = i_sectors_delta != NULL; - int ret; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL)); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, - POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_intent); - ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); - bch2_trans_iter_exit(trans, &fpunch_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; -err: - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_truncate(trans, op_k, NULL); -} - -int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) -{ - struct bkey_i_logged_op_truncate op; - - bkey_logged_op_truncate_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.new_i_size = cpu_to_le64(new_i_size); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} - -/* finsert/fcollapse: */ - -void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); - - prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); - prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); - prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); - prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); -} - -static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, - u64 offset, s64 len, bool warn) -{ - struct btree_iter iter; - struct bch_inode_unpacked inode_u; - int ret; - - offset <<= 9; - len <<= 9; - - ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn); - if (ret) - return ret; - - if (len > 0) { - if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { - ret = -EFBIG; - goto err; - } - - if (offset >= inode_u.bi_size) { - ret = -EINVAL; - goto err; - } - } - - inode_u.bi_size += len; - inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); - - ret = bch2_inode_write(trans, &iter, &inode_u); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, - struct bkey_i *op_k, - u64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); - subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - struct bch_io_opts opts; - u64 dst_offset = le64_to_cpu(op->v.dst_offset); - u64 src_offset = le64_to_cpu(op->v.src_offset); - s64 shift = dst_offset - src_offset; - u64 len = abs(shift); - u64 pos = le64_to_cpu(op->v.pos); - bool insert = shift > 0; - u32 snapshot; - bool warn_errors = i_sectors_delta != NULL; - int ret = 0; - - ret = bch2_inum_opts_get(trans, inum, &opts); - if (ret) - return ret; - - /* - * check for missing subvolume before fpunch, as in resume we don't want - * it to be a fatal error - */ - ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); - if (ret) - return ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, 0), - BTREE_ITER_intent); - - switch (op->v.state) { -case LOGGED_OP_FINSERT_start: - op->v.state = LOGGED_OP_FINSERT_shift_extents; - - if (insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, len, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - if (ret) - goto err; - } else { - bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); - - ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_logged_op_update(trans, &op->k_i)); - } - - fallthrough; -case LOGGED_OP_FINSERT_shift_extents: - while (1) { - struct 
disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete, *copy; - struct bkey_s_c k; - struct bpos src_pos = POS(inum.inum, src_offset); - - bch2_trans_begin(trans); - - ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, - warn_errors); - if (ret) - goto btree_err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); - - k = insert - ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); - if ((ret = bkey_err(k))) - goto btree_err; - - if (!k.k || - k.k->p.inode != inum.inum || - bkey_le(k.k->p, POS(inum.inum, src_offset))) - break; - - copy = bch2_bkey_make_mut_noupdate(trans, k); - if ((ret = PTR_ERR_OR_ZERO(copy))) - goto btree_err; - - if (insert && - bkey_lt(bkey_start_pos(k.k), src_pos)) { - bch2_cut_front(src_pos, copy); - - /* Splitting compressed extent? */ - bch2_disk_reservation_add(c, &disk_res, - copy->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), - BCH_DISK_RESERVATION_NOFAIL); - } - - bkey_init(&delete.k); - delete.k.p = copy->k.p; - delete.k.p.snapshot = snapshot; - delete.k.size = copy->k.size; - - copy->k.p.offset += shift; - copy->k.p.snapshot = snapshot; - - op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: - bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); -btree_err: - bch2_disk_reservation_put(c, &disk_res); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - pos = le64_to_cpu(op->v.pos); - } - - op->v.state = LOGGED_OP_FINSERT_finish; - - if (!insert) { - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - adjust_i_size(trans, inum, 0, 0, warn_errors) ?: - bch2_logged_op_update(trans, &op->k_i)); - } - - break; -case LOGGED_OP_FINSERT_finish: - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - if (warn_errors) - bch_err_fn(c, ret); - return ret; -} - -int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) -{ - return __bch2_resume_logged_op_finsert(trans, op_k, NULL); -} - -int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 len, bool insert, - s64 *i_sectors_delta) -{ - struct bkey_i_logged_op_finsert op; - s64 shift = insert ? len : -len; - - bkey_logged_op_finsert_init(&op.k_i); - op.v.subvol = cpu_to_le32(inum.subvol); - op.v.inum = cpu_to_le64(inum.inum); - op.v.dst_offset = cpu_to_le64(offset + shift); - op.v.src_offset = cpu_to_le64(offset); - op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); - - /* - * Logged ops aren't atomic w.r.t. 
snapshot creation: creating a - * snapshot while they're in progress, then crashing, will result in the - * resume only proceeding in one of the snapshots - */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); - int ret = bch2_logged_op_start(trans, &op.k_i); - if (ret) - goto out; - ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); - ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - - return ret; -} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h deleted file mode 100644 index b93e4d4b3c0c50..00000000000000 --- a/fs/bcachefs/io_misc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_MISC_H -#define _BCACHEFS_IO_MISC_H - -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - u64, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_truncate_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); - -int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); - -void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ - .val_to_text = bch2_logged_op_finsert_to_text, \ - .min_val_size = 24, \ -}) - -int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); - -int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); - -#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c deleted file mode 100644 index e0874ad9a6cf24..00000000000000 --- a/fs/bcachefs/io_read.c +++ /dev/null @@ -1,1543 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Some low level IO code, and hacks for various block layer limitations - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "data_update.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "io_read.h" -#include "io_misc.h" -#include "io_write.h" -#include "reflink.h" -#include "subvolume.h" -#include "trace.h" - -#include -#include -#include - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_read_corrupt_ratio; -module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(read_corrupt_ratio, ""); -#endif - -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - guard(rcu)(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - - return get_random_u32_below(nr * CONGESTED_MAX) < total; -} - -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - -#endif - -/* Cache promotion on read */ - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), - .automatic_shrinking = true, -}; - -static inline bool have_io_error(struct bch_io_failures *failed) -{ - return failed && failed->nr; -} - -static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) -{ - EBUG_ON(rbio->split); - - return rbio->data_update - ? 
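/* a read done on behalf of an internal move has its rbio embedded in a struct data_update: */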
container_of(rbio, struct data_update, rbio) - : NULL; -} - -static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) -{ - struct data_update *u = rbio_data_update(orig); - if (!u) - return false; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev && - u->data_opts.rewrite_ptrs & BIT(i)) - return true; - i++; - } - - return false; -} - -static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags, - struct bch_io_failures *failed) -{ - if (!have_io_error(failed)) { - BUG_ON(!opts.promote_target); - - if (!(flags & BCH_READ_may_promote)) - return bch_err_throw(c, nopromote_may_not); - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return bch_err_throw(c, nopromote_already_promoted); - - if (bkey_extent_is_unwritten(k)) - return bch_err_throw(c, nopromote_unwritten); - - if (bch2_target_congested(c, opts.promote_target)) - return bch_err_throw(c, nopromote_congested); - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return bch_err_throw(c, nopromote_in_flight); - - return 0; -} - -static noinline void promote_free(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - struct bch_fs *c = rbio->c; - - int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - - async_object_list_del(c, promote, op->list_idx); - async_object_list_del(c, rbio, rbio->list_idx); - - bch2_data_update_exit(&op->write); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.rbio.c; - - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - promote_free(&op->write.rbio); -} - -static void promote_start_work(struct work_struct *work) -{ - struct promote_op *op = container_of(work, struct promote_op, work); - - bch2_data_update_read_done(&op->write); -} - -static noinline void promote_start(struct bch_read_bio *rbio) -{ - struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - - trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - - INIT_WORK(&op->work, promote_start_work); - queue_work(rbio->c->write_ref_wq, &op->work); -} - -static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - unsigned sectors, - struct bch_read_bio *orig, - struct bch_io_failures *failed) -{ - struct bch_fs *c = trans->c; - int ret; - - struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - - if (!have_io_error(failed)) { - update_opts.target = orig->opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags |= BCH_WRITE_cached; - update_opts.write_flags |= BCH_WRITE_only_specified_devs; - } else { - update_opts.target = orig->opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev) && - !ptr_being_rewritten(orig, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - if (!update_opts.rewrite_ptrs) - return NULL; - } - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) - 
return ERR_PTR(-BCH_ERR_nopromote_no_writes); - - struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - ret = bch_err_throw(c, nopromote_enomem); - goto err_put; - } - - op->start_time = local_clock(); - op->pos = pos; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) { - ret = bch_err_throw(c, nopromote_in_flight); - goto err; - } - - ret = async_object_list_add(c, promote, op, &op->list_idx); - if (ret < 0) - goto err_remove_hash; - - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, - writepoint_hashed((unsigned long) current), - &orig->opts, - update_opts, - btree_id, k); - op->write.type = BCH_DATA_UPDATE_promote; - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) - goto err_remove_list; - - rbio_init_fragment(&op->write.rbio.bio, orig); - op->write.rbio.bounce = true; - op->write.rbio.promote = true; - op->write.op.end_io = promote_done; - - return &op->write.rbio; -err_remove_list: - async_object_list_del(c, promote, op->list_idx); -err_remove_hash: - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); -err: - bio_free_pages(&op->write.op.wbio.bio); - /* We may have added to the rhashtable and thus need rcu freeing: */ - kfree_rcu(op, rcu); -err_put: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); - return ERR_PTR(ret); -} - -noinline -static struct bch_read_bio *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - unsigned flags, - struct bch_read_bio *orig, - bool *bounce, - bool *read_full, - struct bch_io_failures *failed) -{ - /* - * We're in the retry path, but we don't know what to repair yet, and we - * don't want to do a promote here: - */ - if (failed && !failed->nr) - return NULL; - - struct bch_fs *c = trans->c; - /* - * if failed != NULL we're not actually doing a promote, we're - * recovering from an io/checksum error - */ - bool promote_full = (have_io_error(failed) || - *read_full || - READ_ONCE(c->opts.promote_whole_extents)); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - int ret; - - ret = should_promote(c, k, pos, orig->opts, flags, failed); - if (ret) - goto nopromote; - - struct bch_read_bio *promote = - __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? 
BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, sectors, orig, failed); - if (!promote) - return NULL; - - ret = PTR_ERR_OR_ZERO(promote); - if (ret) - goto nopromote; - - *bounce = true; - *read_full = promote_full; - - if (have_io_error(failed)) - orig->self_healing = true; - - return promote; -nopromote: - trace_io_read_nopromote(c, ret); - return NULL; -} - -void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) -{ - if (!op->write.read_done) { - prt_printf(out, "parent read: %px\n", op->write.rbio.parent); - printbuf_indent_add(out, 2); - bch2_read_bio_to_text(out, op->write.rbio.parent); - printbuf_indent_sub(out, 2); - } - - bch2_data_update_to_text(out, &op->write); -} - -/* Read */ - -static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - int ret = lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9)); - if (ret) - return ret; - - if (rbio->data_update) - prt_str(out, "(internal move) "); - - return 0; -} - -static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, - struct bch_read_bio *rbio, struct bpos read_pos) -{ - bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); -} - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - } - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (unlikely(rbio->promote)) { - if (!rbio->bio.bi_status) - promote_start(rbio); - else - promote_free(rbio); - } else { - async_object_list_del(rbio->c, rbio, rbio->list_idx); - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - bio_put(&rbio->bio); - } - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - if (rbio->list_idx) - async_object_list_del(rbio->c, rbio, rbio->list_idx); -#endif - bio_endio(&rbio->bio); -} - -static void get_rbio_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bkey_buf *sk) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - rbio->data_btree, rbio->data_pos, 0))); - if (ret) - return; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) - if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { - bch2_bkey_buf_reassemble(sk, trans->c, k); - break; - } - - bch2_trans_iter_exit(trans, &iter); -} - -static noinline int 
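/*
 * Optionally (gated by the poison_extents_on_checksum_error module
 * parameter) mark an extent poisoned after an unrecoverable checksum
 * error, so subsequent reads fail fast and the data update path knows
 * it is safe to regenerate the checksum:
 */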
maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, - enum btree_id btree, struct bkey_s_c read_k) -{ - if (!bch2_poison_extents_on_checksum_error) - return 0; - - struct bch_fs *c = trans->c; - - struct data_update *u = rbio_data_update(rbio); - if (u) - read_k = bkey_i_to_s_c(u->k.k); - - u64 flags = bch2_bkey_extent_flags(read_k); - if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_and_val_eq(k, read_k)) - goto out; - - struct bkey_i *new = bch2_trans_kmalloc(trans, - bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); - ret = PTR_ERR_OR_ZERO(new) ?: - (bkey_reassemble(new, k), 0) ?: - bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: - bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - - /* - * Propagate key change back to data update path, in particular so it - * knows the extent has been poisoned and it's safe to change the - * checksum - */ - if (u && !ret) - bch2_bkey_buf_copy(&u->k, c, new); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct data_update *u = container_of(rbio, struct data_update, rbio); -retry: - bch2_trans_begin(trans); - - struct btree_iter iter; - struct bkey_s_c k; - int ret = lockrestart_do(trans, - bkey_err(k = bch2_bkey_get_iter(trans, &iter, - u->btree_id, bkey_start_pos(&u->k.k->k), - 0))); - if (ret) - goto err; - - if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { - /* extent we wanted to read no longer exists: */ - rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); - goto err; - } - - ret = __bch2_read_extent(trans, rbio, bvec_iter, - bkey_start_pos(&u->k.k->k), - u->btree_id, - bkey_i_to_s_c(u->k.k), - 0, failed, flags, -1); -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_data_read_retry)) - goto retry; - - if (ret) { - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - } - - BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); - return ret; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - struct btree_trans *trans = bch2_trans_get(c); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bkey_init(&sk.k->k); - - trace_io_read_retry(&rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], - bvec_iter_sectors(rbio->bvec_iter)); - - get_rbio_extent(trans, rbio, &sk); - - if (!bkey_deleted(&sk.k->k) && - bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(&failed, &rbio->pick, - rbio->ret == -BCH_ERR_data_read_retry_csum_err); - - if (!rbio->split) { - rbio->bio.bi_status = 0; - rbio->ret = 0; - } - - unsigned subvol = rbio->subvol; - struct bpos read_pos = rbio->read_pos; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_in_retry; - flags &= 
~BCH_READ_may_promote; - flags &= ~BCH_READ_last_fragment; - flags |= BCH_READ_must_clone; - - int ret = rbio->data_update - ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); - - if (ret) { - rbio->ret = ret; - rbio->bio.bi_status = BLK_STS_IOERR; - } - - if (failed.nr || ret) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, - (subvol_inum) { subvol, read_pos.inode }, - read_pos.offset << 9)); - if (rbio->data_update) - prt_str(&buf, "(internal move) "); - - prt_str(&buf, "data read error, "); - if (!ret) { - prt_str(&buf, "successful retry"); - if (rbio->self_healing) - prt_str(&buf, ", self healing"); - } else - prt_str(&buf, bch2_err_str(ret)); - prt_newline(&buf); - - - if (!bkey_deleted(&sk.k->k)) { - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); - prt_newline(&buf); - } - - bch2_io_failures_to_text(&buf, c, &failed); - - bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - bch2_rbio_done(rbio); - bch2_bkey_buf_exit(&sk, c); - bch2_trans_put(trans); -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, - int ret, blk_status_t blk_error) -{ - BUG_ON(ret >= 0); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_parent(rbio)->saw_error = true; - - if (rbio->flags & BCH_READ_in_retry) - return; - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } else { - rbio = bch2_rbio_free(rbio); - - rbio->ret = ret; - rbio->bio.bi_status = blk_error; - - bch2_rbio_done(rbio); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_slots|BTREE_ITER_intent); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->bversion, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? 
*/ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); -} - -static void bch2_read_decompress_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decompression error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -static void bch2_read_decrypt_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "decrypt error"); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct bch_read_bio *parent = bch2_rbio_parent(rbio); - struct bio *src = &rbio->bio; - struct bio *dst = &parent->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; - - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { - rbio->flags |= BCH_READ_must_bounce; - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, - BLK_STS_IOERR); - goto out; - } - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); - - if (!csum_good) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. - */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (likely(!parent->data_update)) { - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - } else { - if (rbio->split) - rbio->parent->pick = rbio->pick; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - } - - if (likely(!(rbio->flags & BCH_READ_in_retry))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - goto out; -decompression_err: - bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; -decrypt_err: - bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); - goto out; -} - -static void bch2_read_endio(struct bio *bio) -{ 
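/*
 * bio completion handler: may be called in interrupt context, so work
 * that is expensive or can block - checksum verification, decompression,
 * decryption, retries - is punted to a workqueue via bch2_rbio_punt(),
 * with the workqueue chosen by how much work is left to do:
 */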
- struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - rbio->submit_time, !bio->bi_status); - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (unlikely(bio->bi_status)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || - (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, io_read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_retry_if_stale) - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(ca, &ptr), - BTREE_ITER_cached); - - int gen = bucket_gen_get(ca, iter.pos.offset); - if (gen >= 0) { - prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", gen); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - } else { - prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", - iter.pos.inode, iter.pos.offset); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "first bucket %u nbuckets %llu\n", - ca->mi.first_bucket, ca->mi.nbuckets); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags, int dev) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - struct data_update *u = rbio_data_update(orig); - int ret = 0; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], - bvec_iter_sectors(iter)); - goto out_read_done; - } - - if 
((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && - !orig->data_update) - return bch_err_throw(c, extent_poisoned); -retry_pick: - ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); - - /* hole or reservation - just zero fill: */ - if (!ret) - goto hole; - - if (unlikely(ret < 0)) { - if (ret == -BCH_ERR_data_read_csum_err) { - int ret2 = maybe_poison_extent(trans, orig, data_btree, k); - if (ret2) { - ret = ret2; - goto err; - } - - trace_and_count(c, io_read_fail_and_poison, &orig->bio); - } - - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "%s\n ", bch2_err_str(ret)); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - goto err; - } - - if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && - !c->chacha20_key_set) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, data_read_no_encryption_key); - goto err; - } - - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, - BCH_DEV_READ_REF_io_read); - - /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_in_retry) && - !pick.ptr.cached && - ca && - unlikely(dev_ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick, false); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - goto retry_pick; - } - - if (likely(!u)) { - if (!(flags & BCH_READ_last_fragment) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_must_clone; - - narrow_crcs = !(flags & BCH_READ_in_retry) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_user_mapped)) - flags |= BCH_READ_must_bounce; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_user_mapped)) || - (flags & BCH_READ_must_bounce)))) { - read_full = true; - bounce = true; - } - } else { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { - if (ca) - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_io_read); - rbio->ret = bch_err_throw(c, data_read_buffer_too_small); - goto out_read_done; - } - - iter.bi_size = pick.crc.compressed_size << 9; - read_full = true; - } - - if (orig->opts.promote_target || have_io_error(failed)) - rbio = promote_alloc(trans, iter, k, &pick, flags, orig, - &bounce, &read_full, failed); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - 
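/*
 * Partial reads of checksummed or compressed extents were excluded
 * above (they force read_full and a bounce buffer), so here the crc
 * can simply be narrowed to describe exactly the sectors this bio
 * reads:
 */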
pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - } - - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init_fragment(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - } else if (flags & BCH_READ_must_clone) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig); - rbio->bio.bi_iter = iter; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->submit_time = local_clock(); - if (!rbio->split) - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = ca != NULL; - rbio->narrow_crcs = narrow_crcs; - rbio->ret = 0; - rbio->context = 0; - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->bversion; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - async_object_list_add(c, rbio, rbio, &rbio->list_idx); - - if (rbio->bounce) - trace_and_count(c, io_read_bounce, &rbio->bio); - - if (!u) - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - else - this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (ca && pick.ptr.cached && !u) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, io_read_split, &orig->bio); - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - if (!(flags & BCH_READ_in_retry)) - bch2_trans_unlock(trans); - else - bch2_trans_unlock_long(trans); - - if (likely(!rbio->pick.do_ec_reconstruct)) { - if (unlikely(!rbio->have_ioref)) { - bch2_rbio_error(rbio, - -BCH_ERR_data_read_retry_device_offline, - BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & 
BCH_READ_in_retry))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, - BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_in_retry))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_in_retry))) { - return 0; - } else { - bch2_trans_unlock(trans); - - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->ret; - rbio = bch2_rbio_free(rbio); - - if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) - bch2_mark_io_failure(failed, &pick, - ret == -BCH_ERR_data_read_retry_csum_err); - - return ret; - } - -err: - if (flags & BCH_READ_in_retry) - return ret; - - orig->bio.bi_status = BLK_STS_IOERR; - orig->ret = ret; - goto out_read_done; - -hole: - this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], - bvec_iter_sectors(iter)); - /* - * won't normally happen in the data update (bch2_move_extent()) path, - * but if we retry and the extent we wanted to read no longer exists we - * have to signal that: - */ - if (u) - orig->ret = bch_err_throw(c, data_read_key_overwritten); - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if ((flags & BCH_READ_last_fragment) && - !(flags & BCH_READ_in_retry)) - bch2_rbio_done(orig); - return 0; -} - -int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, - struct bkey_buf *prev_read, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - enum btree_id data_btree; - int ret; - - EBUG_ON(rbio->data_update); - - bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, bvec_iter.bi_sector), - BTREE_ITER_slots); - - while (1) { - data_btree = BTREE_ID_extents; - - bch2_trans_begin(trans); - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - - bch2_btree_iter_set_pos(trans, &iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - s64 offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - goto err; - - k = bkey_i_to_s_c(sk.k); - - if (unlikely(flags & BCH_READ_in_retry)) { - if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) - failed->nr = 0; - bch2_bkey_buf_copy(prev_read, c, sk.k); - } - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_last_fragment; - - ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags, -1); - swap(bvec_iter.bi_size, bytes); - - if (ret) - goto err; - - if (flags & 
BCH_READ_last_fragment) - break; - - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -err: - if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) - flags |= BCH_READ_must_bounce; - - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_data_read_retry)) - break; - } - - if (unlikely(ret)) { - if (ret != -BCH_ERR_extent_poisoned) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, - bvec_iter.bi_sector << 9)); - prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - rbio->bio.bi_status = BLK_STS_IOERR; - rbio->ret = ret; - - if (!(flags & BCH_READ_in_retry)) - bch2_rbio_done(rbio); - } - - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static const char * const bch2_read_bio_flags[] = { -#define x(n) #n, - BCH_READ_FLAGS() -#undef x - NULL -}; - -void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) -{ - u64 now = local_clock(); - prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); - prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); - - if (!rbio->split) - prt_printf(out, "end_io:\t%ps\n", rbio->end_io); - else - prt_printf(out, "parent:\t%px\n", rbio->parent); - - prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); - - prt_printf(out, "promote:\t%u\n", rbio->promote); - prt_printf(out, "bounce:\t%u\n", rbio->bounce); - prt_printf(out, "split:\t%u\n", rbio->split); - prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); - prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); - prt_printf(out, "context:\t%u\n", rbio->context); - - int ret = READ_ONCE(rbio->ret); - if (ret < 0) - prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); - else - prt_printf(out, "ret:\t%i\n", ret); - - prt_printf(out, "flags:\t"); - bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); - prt_newline(out); - - bch2_bio_to_text(out, &rbio->bio); -} - -void bch2_fs_io_read_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); - mempool_exit(&c->bio_bounce_pages); -} - -int bch2_fs_io_read_init(struct bch_fs *c) -{ - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); - - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_init); - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return bch_err_throw(c, ENOMEM_bio_read_split_init); - - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return bch_err_throw(c, ENOMEM_promote_table_init); - - return 0; -} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h deleted file mode 100644 index 9c5ddbf861b39c..00000000000000 --- a/fs/bcachefs/io_read.h +++ /dev/null @@ -1,216 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_READ_H -#define _BCACHEFS_IO_READ_H - -#include "bkey_buf.h" -#include "btree_iter.h" -#include "extents_types.h" -#include "reflink.h" - -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * 
was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 data_update:1, - promote:1, - bounce:1, - split:1, - have_ioref:1, - narrow_crcs:1, - saw_error:1, - self_healing:1, - context:2; - }; - u16 _state; - }; - s16 ret; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - -#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - s64 *offset_into_extent, - struct bkey_buf *extent) -{ - if (extent->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, - offset_into_extent, - bkey_i_to_s_c_reflink_p(extent->k), - true, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(c, missing_indirect_extent); - } - - bch2_bkey_buf_reassemble(extent, c, k); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -#define BCH_READ_FLAGS() \ - x(retry_if_stale) \ - x(may_promote) \ - x(user_mapped) \ - x(last_fragment) \ - x(must_bounce) \ - x(must_clone) \ - x(in_retry) - -enum __bch_read_flags { -#define x(n) __BCH_READ_##n, - BCH_READ_FLAGS() -#undef x -}; - -enum bch_read_flags { -#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), - BCH_READ_FLAGS() -#undef x -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned, int); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags, -1); - /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ - WARN(ret, "unhandled error from __bch2_read_extent()"); -} - -int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, - struct bch_io_failures *, struct bkey_buf *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - BUG_ON(rbio->_state); - - rbio->subvol = inum.subvol; - - bch2_trans_run(c, - __bch2_read(trans, rbio, 
rbio->bio.bi_iter, inum, NULL, NULL, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped)); -} - -static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, - struct bch_read_bio *orig) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->c = orig->c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->split = true; - rbio->parent = orig; - rbio->opts = orig->opts; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_fs *c, - struct bch_io_opts opts, - bio_end_io_t end_io) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->start_time = local_clock(); - rbio->c = c; - rbio->_state = 0; - rbio->flags = 0; - rbio->ret = 0; - rbio->opts = opts; - rbio->bio.bi_end_io = end_io; -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - rbio->list_idx = 0; -#endif - return rbio; -} - -struct promote_op; -void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); -void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); - -void bch2_fs_io_read_exit(struct bch_fs *); -int bch2_fs_io_read_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c deleted file mode 100644 index 88b1eec8eff319..00000000000000 --- a/fs/bcachefs/io_write.c +++ /dev/null @@ -1,1780 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "bkey_buf.h" -#include "bset.h" -#include "btree_update.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extent_update.h" -#include "inode.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "move.h" -#include "nocow_locking.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include -#include - -#ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_write_corrupt_ratio; -module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); -MODULE_PARM_DESC(write_corrupt_ratio, ""); -#endif - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - -static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, - u64 now, int rw) -{ - u64 latency_capable = - ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; - /* ideally we'd be taking into account the device's variance here: */ - u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); - s64 latency_over = io_latency - latency_threshold; - - if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ - if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), - &ca->congested); - - ca->congested_last = now; - } else if (atomic_read(&ca->congested) > 0) { - atomic_dec(&ca->congested); - } -} - -void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -{ - atomic64_t *latency = &ca->cur_latency[rw]; - u64 now = local_clock(); - u64 io_latency = time_after64(now, submit_time) - ? 
now - submit_time - : 0; - u64 old, new; - - old = atomic64_read(latency); - do { - /* - * If the io latency was reasonably close to the current - * latency, skip doing the update and atomic operation - most of - * the time: - */ - if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0U << 5)) - break; - - new = ewma_add(old, io_latency, 5); - } while (!atomic64_try_cmpxchg(latency, &old, new)); - - bch2_congested_acct(ca, io_latency, now, rw); - - __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); -} - -#endif - -/* Allocate, free from mempool: */ - -void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -{ - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); - bio->bi_vcnt = 0; -} - -static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -{ - struct page *page; - - if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) { - mutex_lock(&c->bio_bounce_pages_lock); - *using_mempool = true; - goto pool_alloc; - - } - } else { -pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); - } - - return page; -} - -void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t size) -{ - bool using_mempool = false; - - while (size) { - struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - BUG_ON(!bio_add_page(bio, page, len, 0)); - size -= len; - } - - if (using_mempool) - mutex_unlock(&c->bio_bounce_pages_lock); -} - -/* Extent update path: */ - -int bch2_sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *usage_increasing, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); - bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); - int ret = 0; - - *usage_increasing = false; - *i_sectors_delta = 0; - *disk_sectors_delta = 0; - - bch2_trans_copy_iter(trans, &iter, extent_iter); - - for_each_btree_key_max_continue_norestart(trans, iter, - new->k.p, BTREE_ITER_slots, old, ret) { - s64 sectors = min(new->k.p.offset, old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k)); - - *i_sectors_delta += sectors * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); - *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot - ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) - : 0; - - if (!*usage_increasing && - (new->k.p.snapshot != old.k->p.snapshot || - new_replicas > bch2_bkey_replicas(c, old) || - (!new_compressed && bch2_bkey_sectors_compressed(old)))) - *usage_increasing = true; - - if (bkey_ge(old.k->p, new->k.p)) - break; - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) -{ - /* - * Crazy performance optimization: - * Every extent update needs to also update the inode: the inode trigger - * will set bi->journal_seq to the journal sequence number of this - * transaction - for fsync. 
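Stepping back to bch2_latency_acct() above: the cmpxchg loop keeps the per-device latency as a shift-based exponentially weighted moving average. A minimal standalone sketch of that update rule, assuming the same weight of 5 as the ewma_add() call (names and numbers here are illustrative, not from the source)::

	#include <stdio.h>

	/* new = (old * (2^w - 1) + sample) / 2^w, in pure integer math;
	 * with w = 5 a single sample contributes 1/32 of its value.
	 */
	static unsigned long long ewma_update(unsigned long long old,
					      unsigned long long sample,
					      unsigned w)
	{
		return ((old << w) - old + sample) >> w;
	}

	int main(void)
	{
		unsigned long long latency = 1000;	/* running average, ns */

		/* one slow IO only nudges the average: */
		latency = ewma_update(latency, 33000, 5);
		printf("%llu\n", latency);		/* prints 2000 */
		return 0;
	}

The skip test in the loop above (sample close to the current average, plus the low-bits-of-now check) just avoids the atomic update for samples that would barely move this average.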
- * - * But if that's the only reason we're updating the inode (we're not - * updating bi_size or bi_sectors), then we don't need the inode update - * to be journalled - if we crash, the bi_journal_seq update will be - * lost, but that's fine. - */ - unsigned inode_update_flags = BTREE_UPDATE_nojournal; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_intent| - BTREE_ITER_cached); - int ret = bkey_err(k); - if (unlikely(ret)) - return ret; - - /* - * varint_decode_fast(), in the inode .invalid method, reads up to 7 - * bytes past the end of the buffer: - */ - struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); - ret = PTR_ERR_OR_ZERO(k_mut); - if (unlikely(ret)) - goto err; - - bkey_reassemble(k_mut, k); - - if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) { - k_mut = bch2_inode_to_v3(trans, k_mut); - ret = PTR_ERR_OR_ZERO(k_mut); - if (unlikely(ret)) - goto err; - } - - struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); - - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && - new_i_size > le64_to_cpu(inode->v.bi_size)) { - inode->v.bi_size = cpu_to_le64(new_i_size); - inode_update_flags = 0; - } - - if (i_sectors_delta) { - s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); - if (unlikely(bi_sectors + i_sectors_delta < 0)) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", - extent_iter->pos.inode, bi_sectors, i_sectors_delta); - - bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - if (i_sectors_delta < 0) - i_sectors_delta = -bi_sectors; - else - i_sectors_delta = 0; - } - - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); - inode_update_flags = 0; - } - - /* - * extents, dirents and xattrs updates require that an inode update also - * happens - to ensure that if a key exists in one of those btrees with - * a given snapshot ID an inode is also present - so we may have to skip - * the nojournal optimization: - */ - if (inode->k.p.snapshot != iter.snapshot) { - inode->k.p.snapshot = iter.snapshot; - inode_update_flags = 0; - } - - ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_internal_snapshot_node| - inode_update_flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_extent_update(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 new_i_size, - s64 *i_sectors_delta_total, - bool check_enospc) -{ - struct bpos next_pos; - bool usage_increasing; - s64 i_sectors_delta = 0, disk_sectors_delta = 0; - int ret; - - /* - * This traverses us the iterator without changing iter->path->pos to - * search_key() (which is pos + 1 for extents): we want there to be a - * path already traversed at iter->pos because - * bch2_trans_extent_update() will use it to attempt extent merging - */ - ret = __bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(trans, iter, k); - if (ret) - return ret; - - next_pos = k->k.p; - - ret = bch2_sum_sector_overwrites(trans, iter, k, - &usage_increasing, - &i_sectors_delta, - &disk_sectors_delta); - if (ret) - return ret; - - if (disk_res && - disk_sectors_delta > (s64) disk_res->sectors) { - ret = 
bch2_disk_reservation_add(trans->c, disk_res, - disk_sectors_delta - disk_res->sectors, - !check_enospc || !usage_increasing - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - return ret; - } - - /* - * Note: - * We always have to do an inode update - even when i_size/i_sectors - * aren't changing - for fsync to work properly; fsync relies on - * inode->bi_journal_seq which is updated by the trigger code: - */ - ret = bch2_extent_update_i_size_sectors(trans, iter, - min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: - bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc); - if (unlikely(ret)) - return ret; - - if (i_sectors_delta_total) - *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(trans, iter, next_pos); - return 0; -} - -static int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct bkey_buf sk; - struct keylist *keys = &op->insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - subvol_inum inum = { - .subvol = op->subvol, - .inum = k->k.p.inode, - }; - int ret; - - BUG_ON(!inum.subvol); - - bch2_bkey_buf_init(&sk); - - do { - bch2_trans_begin(trans); - - k = bch2_keylist_front(keys); - bch2_bkey_buf_copy(&sk, c, k); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, - &sk.k->k.p.snapshot); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); - - ret = bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_ge(iter.pos, k->k.p)) - bch2_keylist_pop_front(&op->insert_keys); - else - bch2_cut_front(iter.pos, k); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - - return ret; -} - -/* Writes */ - -void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) -{ - struct printbuf buf = PRINTBUF; - - if (op->subvol) { - bch2_inum_offset_err_msg(op->c, &buf, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - } else { - struct bpos pos = op->pos; - pos.offset = offset; - bch2_inum_snap_offset_err_msg(op->c, &buf, pos); - } - - prt_str(&buf, "write error: "); - - va_list args; - va_start(args, fmt); - prt_vprintf(&buf, fmt, args); - va_end(args); - - if (op->flags & BCH_WRITE_move) { - struct data_update *u = container_of(op, struct data_update, op); - - prt_printf(&buf, "\n from internal move "); - bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); - } - - bch_err_ratelimited(op->c, "%s", buf.buf); - printbuf_exit(&buf); -} - -void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, - enum bch_data_type type, - const struct bkey_i *k, - bool nocow) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - struct bch_write_bio *n; - unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; - unsigned ref_idx = type == BCH_DATA_btree - ? 
BCH_DEV_READ_REF_btree_node_write - : BCH_DEV_WRITE_REF_io_write; - - BUG_ON(c->opts.nochanges); - - const struct bch_extent_ptr *last = NULL; - bkey_for_each_ptr(ptrs, ptr) - last = ptr; - - bkey_for_each_ptr(ptrs, ptr) { - /* - * XXX: btree writes should be using io_ref[WRITE], but we - * aren't retrying failed btree writes yet (due to device - * removal/ro): - */ - struct bch_dev *ca = nocow - ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - - if (ptr != last) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); - - n->bio.bi_end_io = wbio->bio.bi_end_io; - n->bio.bi_private = wbio->bio.bi_private; - n->parent = wbio; - n->split = true; - n->bounce = false; - n->put_bio = true; - n->bio.bi_opf = wbio->bio.bi_opf; - bio_inc_remaining(&wbio->bio); - } else { - n = wbio; - n->split = false; - } - - n->c = c; - n->dev = ptr->dev; - n->have_ioref = ca != NULL; - n->nocow = nocow; - n->submit_time = local_clock(); - n->inode_offset = bkey_start_offset(&k->k); - if (nocow) - n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); - n->bio.bi_iter.bi_sector = ptr->offset; - - if (likely(n->have_ioref)) { - this_cpu_add(ca->io_done->sectors[WRITE][type], - bio_sectors(&n->bio)); - - bio_set_dev(&n->bio, ca->disk_sb.bdev); - - if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { - bio_endio(&n->bio); - continue; - } - - submit_bio(&n->bio); - } else { - n->bio.bi_status = BLK_STS_REMOVED; - bio_endio(&n->bio); - } - } -} - -static void __bch2_write(struct bch_write_op *); - -static void bch2_write_done(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - - EBUG_ON(op->open_buckets.nr); - - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - bch2_disk_reservation_put(c, &op->res); - - if (!(op->flags & BCH_WRITE_move)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); - bch2_keylist_free(&op->insert_keys, op->inline_keys); - - EBUG_ON(cl->parent); - closure_debug_destroy(cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - struct bkey_i *src, *dst = keys->keys, *n; - - for (src = keys->keys; src != keys->top; src = n) { - n = bkey_next(src); - - if (bkey_extent_is_direct_data(&src->k)) { - bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, - test_bit(ptr->dev, op->failed.d)); - - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return bch_err_throw(c, data_write_io); - } - - if (dst != src) - memmove_u64s_down(dst, src, src->k.u64s); - dst = bkey_next(dst); - } - - keys->top = dst; - return 0; -} - -/** - * __bch2_write_index - after a write, update index to point to new data - * @op: bch_write_op to process - */ -static void __bch2_write_index(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; - unsigned dev; - int ret = 0; - - if (unlikely(op->flags & BCH_WRITE_io_error)) { - ret = bch2_write_drop_io_error_ptrs(op); - if (ret) - goto err; - } - - if (!bch2_keylist_empty(keys)) { - u64 sectors_start = keylist_sectors(keys); - - ret = !(op->flags & BCH_WRITE_move) - ? 
bch2_write_index_default(op) - : bch2_data_update_index_update(op); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - BUG_ON(keylist_sectors(keys) && !ret); - - op->written += sectors_start - keylist_sectors(keys); - - if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - bch2_write_op_error(op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) - goto err; - } -out: - /* If a bucket wasn't written, we can't erasure code it: */ - for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); - - bch2_open_buckets_put(c, &op->open_buckets); - return; -err: - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_submitted; - goto out; -} - -static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) -{ - if (state != wp->state) { - struct task_struct *p = current; - u64 now = ktime_get_ns(); - u64 runtime = p->se.sum_exec_runtime + - (now - p->se.exec_start); - - if (state == WRITE_POINT_runnable) - wp->last_runtime = runtime; - else if (wp->state == WRITE_POINT_runnable) - wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; - - if (wp->last_state_change && - time_after64(now, wp->last_state_change)) - wp->time[wp->state] += now - wp->last_state_change; - wp->state = state; - wp->last_state_change = now; - } -} - -static inline void wp_update_state(struct write_point *wp, bool running) -{ - enum write_point_state state; - - state = running ? WRITE_POINT_runnable: - !list_empty(&wp->writes) ? WRITE_POINT_waiting_io - : WRITE_POINT_stopped; - - __wp_update_state(wp, state); -} - -static CLOSURE_CALLBACK(bch2_write_index) -{ - closure_type(op, struct bch_write_op, cl); - struct write_point *wp = op->wp; - struct workqueue_struct *wq = index_update_wq(op); - unsigned long flags; - - if ((op->flags & BCH_WRITE_submitted) && - (op->flags & BCH_WRITE_move)) - bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - - spin_lock_irqsave(&wp->writes_lock, flags); - if (wp->state == WRITE_POINT_waiting_io) - __wp_update_state(wp, WRITE_POINT_waiting_work); - list_add_tail(&op->wp_list, &wp->writes); - spin_unlock_irqrestore(&wp->writes_lock, flags); - - queue_work(wq, &wp->index_update_work); -} - -static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) -{ - op->wp = wp; - - if (wp->state == WRITE_POINT_stopped) { - spin_lock_irq(&wp->writes_lock); - __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock_irq(&wp->writes_lock); - } -} - -void bch2_write_point_do_index_updates(struct work_struct *work) -{ - struct write_point *wp = - container_of(work, struct write_point, index_update_work); - struct bch_write_op *op; - - while (1) { - spin_lock_irq(&wp->writes_lock); - op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); - wp_update_state(wp, op != NULL); - spin_unlock_irq(&wp->writes_lock); - - if (!op) - break; - - op->flags |= BCH_WRITE_in_worker; - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_submitted)) - __bch2_write(op); - else - bch2_write_done(&op->cl); - } -} - -static void bch2_write_endio(struct bio *bio) -{ - struct closure *cl = bio->bi_private; - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; - struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->have_ioref - ? bch2_dev_have_ref(c, wbio->dev) - : NULL; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - wbio->submit_time, !bio->bi_status); - - if (unlikely(bio->bi_status)) { - if (ca) - bch_err_inum_offset_ratelimited(ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - else - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); - set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_io_error; - } - - if (wbio->nocow) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - POS(ca->dev_idx, wbio->nocow_bucket), - BUCKET_NOCOW_LOCK_UPDATE); - set_bit(wbio->dev, op->devs_need_flush->d); - } - - if (wbio->have_ioref) - enumerated_ref_put(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - if (wbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - if (wbio->put_bio) - bio_put(bio); - - if (parent) - bio_endio(&parent->bio); - else - closure_put(cl); -} - -static void init_append_extent(struct bch_write_op *op, - struct write_point *wp, - struct bversion version, - struct bch_extent_crc_unpacked crc) -{ - struct bkey_i_extent *e; - - op->pos.offset += crc.uncompressed_size; - - e = bkey_extent_init(op->insert_keys.top); - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.bversion = version; - - if (crc.csum_type || - crc.compression_type || - crc.nonce) - bch2_extent_crc_append(&e->k_i, crc); - - bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_cached); - - if (!(op->flags & BCH_WRITE_move)) - bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - - bch2_keylist_push(&op->insert_keys); -} - -static struct bio *bch2_write_bio_alloc(struct bch_fs *c, - struct write_point *wp, - struct bio *src, - bool *page_alloc_failed, - void *buf) -{ - struct bch_write_bio *wbio; - struct bio *bio; - unsigned output_available = - min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available + - (buf - ? 
((unsigned long) buf & (PAGE_SIZE - 1)) - : 0), PAGE_SIZE); - - pages = min(pages, BIO_MAX_VECS); - - bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOFS, &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = src->bi_opf; - - if (buf) { - bch2_bio_map(bio, buf, output_available); - return bio; - } - - wbio->bounce = true; - - /* - * We can't use mempool for more than c->sb.encoded_extent_max - * worth of pages, but we'd like to allocate more if we can: - */ - bch2_bio_alloc_pages_pool(c, bio, - min_t(unsigned, output_available, - c->opts.encoded_extent_max)); - - if (bio->bi_iter.bi_size < output_available) - *page_alloc_failed = - bch2_bio_alloc_pages(bio, - output_available - - bio->bi_iter.bi_size, - GFP_NOFS) != 0; - - return bio; -} - -static int bch2_write_rechecksum(struct bch_fs *c, - struct bch_write_op *op, - unsigned new_csum_type) -{ - struct bio *bio = &op->wbio.bio; - struct bch_extent_crc_unpacked new_crc; - - /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ - - if (bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(new_csum_type)) - new_csum_type = op->crc.csum_type; - - int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); - if (ret) - return ret; - - bio_advance(bio, op->crc.offset << 9); - bio->bi_iter.bi_size = op->crc.live_size << 9; - op->crc = new_crc; - return 0; -} - -static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -{ - struct bch_fs *c = op->c; - struct bio *bio = &op->wbio.bio; - struct bch_csum csum; - int ret = 0; - - BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - - /* Can we just write the entire extent as is? 
*/ - if (op->crc.uncompressed_size == op->crc.live_size && - op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && - op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || - op->incompressible)) { - if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - return 1; - } - - /* - * If the data is compressed and we couldn't write the entire extent as - * is, we have to decompress it: - */ - if (crc_is_compressed(op->crc)) { - /* Last point we can still verify checksum: */ - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - if (bch2_csum_type_is_encryption(op->crc.csum_type)) { - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - ret = bch2_bio_uncompress_inplace(op, bio); - if (ret) - return ret; - } - - /* - * No longer have compressed data after this point - data might be - * encrypted: - */ - - /* - * If the data is checksummed and we're only writing a subset, - * rechecksum and adjust bio to point to currently live data: - */ - if (op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) { - ret = bch2_write_rechecksum(c, op, op->csum_type); - if (ret) - return ret; - } - - /* - * If we want to compress the data, it has to be decrypted: - */ - if (bch2_csum_type_is_encryption(op->crc.csum_type) && - (op->compression_opt || op->crc.csum_type != op->csum_type)) { - struct nonce nonce = extent_nonce(op->version, op->crc); - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - goto csum_err; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); - if (ret) - return ret; - - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - } - - return 0; -csum_err: - bch2_write_op_error(op, op->pos.offset, - "error verifying existing checksum while moving existing data (memory corruption?)\n" - " expected %0llx:%0llx got %0llx:%0llx type %s", - op->crc.csum.hi, - op->crc.csum.lo, - csum.hi, - csum.lo, - op->crc.csum_type < BCH_CSUM_NR - ? 
__bch2_csum_types[op->crc.csum_type] - : "(unknown)"); - return bch_err_throw(c, data_write_csum); -} - -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, - struct bio **_dst) -{ - struct bch_fs *c = op->c; - struct bio *src = &op->wbio.bio, *dst = src; - struct bvec_iter saved_iter; - void *ec_buf; - unsigned total_output = 0, total_input = 0; - bool bounce = false; - bool page_alloc_failed = false; - int ret, more = 0; - - if (op->incompressible) - op->compression_opt = 0; - - BUG_ON(!bio_sectors(src)); - - ec_buf = bch2_writepoint_ec_buf(c, wp); - - if (unlikely(op->flags & BCH_WRITE_data_encoded)) { - ret = bch2_write_prep_encoded_data(op, wp); - if (ret < 0) - goto err; - if (ret) { - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; - } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; - } - } - - if (ec_buf || - op->compression_opt || - (op->csum_type && - !(op->flags & BCH_WRITE_pages_stable)) || - (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_pages_owned))) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } - -#ifdef CONFIG_BCACHEFS_DEBUG - unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); - if (!bounce && write_corrupt_ratio) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bounce = true; - } -#endif - saved_iter = dst->bi_iter; - - do { - struct bch_extent_crc_unpacked crc = { 0 }; - struct bversion version = op->version; - size_t dst_len = 0, src_len = 0; - - if (page_alloc_failed && - dst->bi_iter.bi_size < (wp->sectors_free << 9) && - dst->bi_iter.bi_size < c->opts.encoded_extent_max) - break; - - BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_data_encoded) && - bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_opt && !bounce); - - crc.compression_type = op->incompressible - ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_opt - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_opt) - : 0; - if (!crc_is_compressed(crc)) { - dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); - - if (op->csum_type) - dst_len = min_t(unsigned, dst_len, - c->opts.encoded_extent_max); - - if (bounce) { - swap(dst->bi_iter.bi_size, dst_len); - bio_copy_data(dst, src); - swap(dst->bi_iter.bi_size, dst_len); - } - - src_len = dst_len; - } - - BUG_ON(!src_len || !dst_len); - - if (bch2_csum_type_is_encryption(op->csum_type)) { - if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version); - } else { - crc.nonce = op->nonce; - op->nonce += src_len >> 9; - } - } - - if ((op->flags & BCH_WRITE_data_encoded) && - !crc_is_compressed(crc) && - bch2_csum_type_is_encryption(op->crc.csum_type) == - bch2_csum_type_is_encryption(op->csum_type)) { - u8 compression_type = crc.compression_type; - u16 nonce = crc.nonce; - /* - * Note: when we're using rechecksum(), we need to be - * checksumming @src because it has all the data our - * existing checksum covers - if we bounced (because we - * were trying to compress), @dst will only have the - * part of the data the new checksum will cover. - * - * But normally we want to be checksumming post bounce, - * because part of the reason for bouncing is so the - * data can't be modified (by userspace) while it's in - * flight. 
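The bch_extent_crc_unpacked size arithmetic running through this function is easier to see with concrete numbers. A toy model, with field names taken from the source but invented values::

	#include <stdio.h>

	/* the four size fields as used by the read and write paths here: */
	struct crc_sizes {
		unsigned compressed_size;	/* sectors stored on disk */
		unsigned uncompressed_size;	/* sectors the checksum covers */
		unsigned offset;		/* start of live data within that */
		unsigned live_size;		/* sectors the extent references */
	};

	int main(void)
	{
		/* a 128-sector checksummed extent, compressed to 32 sectors,
		 * of which only the middle 64 sectors are still referenced: */
		struct crc_sizes crc = { 32, 128, 16, 64 };

		/* a reader must fetch and decompress the whole checksummed
		 * region before it can copy out the live part: */
		printf("read %u sectors, return sectors [%u, %u)\n",
		       crc.compressed_size, crc.offset,
		       crc.offset + crc.live_size);
		return 0;
	}
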
- */ - ret = bch2_rechecksum_bio(c, src, version, op->crc, - &crc, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->csum_type); - if (ret) - goto err; - /* - * bch2_rechecksum_bio() sets compression_type on crc from op->crc; - * this isn't always correct, as sometimes we're changing - * an extent from uncompressed to incompressible. - */ - crc.compression_type = compression_type; - crc.nonce = nonce; - } else { - if ((op->flags & BCH_WRITE_data_encoded) && - (ret = bch2_rechecksum_bio(c, src, version, op->crc, - NULL, &op->crc, - src_len >> 9, - bio_sectors(src) - (src_len >> 9), - op->crc.csum_type))) - goto err; - - crc.compressed_size = dst_len >> 9; - crc.uncompressed_size = src_len >> 9; - crc.live_size = src_len >> 9; - - swap(dst->bi_iter.bi_size, dst_len); - ret = bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - if (ret) - goto err; - - crc.csum = bch2_checksum_bio(c, op->csum_type, - extent_nonce(version, crc), dst); - crc.csum_type = op->csum_type; - swap(dst->bi_iter.bi_size, dst_len); - } - - init_append_extent(op, wp, version, crc); - -#ifdef CONFIG_BCACHEFS_DEBUG - if (write_corrupt_ratio) { - swap(dst->bi_iter.bi_size, dst_len); - bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); - swap(dst->bi_iter.bi_size, dst_len); - } -#endif - - if (dst != src) - bio_advance(dst, dst_len); - bio_advance(src, src_len); - total_output += dst_len; - total_input += src_len; - } while (dst->bi_iter.bi_size && - src->bi_iter.bi_size && - wp->sectors_free && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); - - more = src->bi_iter.bi_size != 0; - - dst->bi_iter = saved_iter; - - if (dst == src && more) { - BUG_ON(total_output != total_input); - - dst = bio_split(src, total_input >> 9, - GFP_NOFS, &c->bio_write); - wbio_init(dst)->put_bio = true; - /* copy WRITE_SYNC flag */ - dst->bi_opf = src->bi_opf; - } - - dst->bi_iter.bi_size = total_output; -do_write: - *_dst = dst; - return more; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); - - return ret; -} - -static bool bch2_extent_is_writeable(struct bch_write_op *op, - struct bkey_s_c k) -{ - struct bch_fs *c = op->c; - struct bkey_s_c_extent e; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; - unsigned replicas = 0; - - if (k.k->type != KEY_TYPE_extent) - return false; - - e = bkey_s_c_to_extent(k); - - guard(rcu)(); - extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) - return false; - - replicas += bch2_extent_ptr_durability(c, &p); - } - - return replicas >= op->opts.data_replicas; -} - -static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *orig, - struct bkey_s_c k, - u64 new_i_size) -{ - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { - /* trace this */ - return 0; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bch2_cut_front(bkey_start_pos(&orig->k), new); - bch2_cut_back(orig->k.p, new); - - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) - ptr->unwritten = 0; - - /* - * Note that we're not calling bch2_subvol_get_snapshot() in this path - - * that was done when we kicked off the write, and here it's important - * that we update the extent that we wrote to - even if a snapshot has - * since been created. 
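The bch2_cut_front()/bch2_cut_back() pair at the top of bch2_nocow_write_convert_one_unwritten() above trims the existing key to just the range this write covered before clearing the unwritten flag. A toy version of that trimming, assuming half-open sector intervals (a simplification of real bkeys)::

	#include <stdio.h>

	struct toy_key { unsigned long long start, end; };

	static void cut_front(unsigned long long pos, struct toy_key *k)
	{
		if (pos > k->start)
			k->start = pos;
	}

	static void cut_back(unsigned long long pos, struct toy_key *k)
	{
		if (pos < k->end)
			k->end = pos;
	}

	int main(void)
	{
		struct toy_key k = { 0, 128 };	/* existing unwritten extent */

		cut_front(32, &k);		/* range the nocow write covered */
		cut_back(96, &k);
		printf("mark [%llu, %llu) written\n", k.start, k.end);
		return 0;
	}
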
The write is still outstanding, so we're ok - * w.r.t. snapshot atomicity: - */ - return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: - bch2_trans_update(trans, iter, new, - BTREE_UPDATE_internal_snapshot_node); -} - -static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); - })); - if (ret) - break; - } - - bch2_trans_put(trans); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch2_write_op_error(op, bkey_start_offset(&insert->k), - "btree update error: %s", bch2_err_str(ret)); - } - - if (ret) - op->error = ret; -} - -static void __bch2_nocow_write_done(struct bch_write_op *op) -{ - if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = bch_err_throw(op->c, data_write_io); - } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) - bch2_nocow_write_convert_unwritten(op); -} - -static CLOSURE_CALLBACK(bch2_nocow_write_done) -{ - closure_type(op, struct bch_write_op, cl); - - __bch2_nocow_write_done(op); - bch2_write_done(cl); -} - -struct bucket_to_lock { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; -}; - -static void bch2_nocow_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - u32 snapshot; - struct bucket_to_lock *stale_at; - int stale, ret; - - if (op->flags & BCH_WRITE_move) - return; - - darray_init(&buckets); - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); - if (unlikely(ret)) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_slots); - while (1) { - struct bio *bio = &op->wbio.bio; - - buckets.nr = 0; - - ret = bch2_trans_relock(trans); - if (ret) - break; - - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - break; - - /* fall back to normal cow write path? 
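That fallback test is bch2_extent_is_writeable() above; condensed, the rule it implements looks like the sketch below (toy_ptr and nocow_ok are illustrative names, not from the source)::

	#include <stdbool.h>

	/* Paraphrase of bch2_extent_is_writeable(): in-place (nocow)
	 * overwrites are only safe when no pointer carries checksummed or
	 * compressed data, none is part of an erasure coded stripe, and
	 * the existing copies are durable enough:
	 */
	struct toy_ptr { bool encoded; bool has_ec; unsigned durability; };

	static bool nocow_ok(const struct toy_ptr *ptr, unsigned nr,
			     unsigned replicas_want)
	{
		unsigned durability = 0;

		for (unsigned i = 0; i < nr; i++) {
			if (ptr[i].encoded || ptr[i].has_ec)
				return false;
			durability += ptr[i].durability;
		}
		return durability >= replicas_want;
	}
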
*/ - if (unlikely(k.k->p.snapshot != snapshot || - !bch2_extent_is_writeable(op, k))) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) - break; - - /* Get iorefs before dropping btree locks: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_io_write); - if (unlikely(!ca)) - goto err_get_ioref; - - struct bpos b = PTR_BUCKET_POS(ca, ptr); - struct nocow_lock_bucket *l = - bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); - prefetch(l); - - /* XXX allocating memory with btree locks held - rare */ - darray_push_gfp(&buckets, ((struct bucket_to_lock) { - .b = b, .gen = ptr->gen, .l = l, - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) - op->flags |= BCH_WRITE_convert_unwritten; - } - - /* Unlock before taking nocow locks, doing IO: */ - bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(trans); - - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - - darray_for_each(buckets, i) { - struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); - - __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, - bucket_to_u64(i->b), - BUCKET_NOCOW_LOCK_UPDATE); - - int gen = bucket_gen_get(ca, i->b.offset); - stale = gen < 0 ? gen : gen_after(gen, i->gen); - if (unlikely(stale)) { - stale_at = i; - goto err_bucket_stale; - } - } - - bio = &op->wbio.bio; - if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { - bio = bio_split(bio, k.k->p.offset - op->pos.offset, - GFP_KERNEL, &c->bio_write); - wbio_init(bio)->put_bio = true; - bio->bi_opf = op->wbio.bio.bi_opf; - } else { - op->flags |= BCH_WRITE_submitted; - } - - op->pos.offset += bio_sectors(bio); - op->written += bio_sectors(bio); - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - closure_get(&op->cl); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - op->insert_keys.top, true); - - bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_submitted) - break; - bch2_btree_iter_advance(trans, &iter); - } -out: - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - darray_exit(&buckets); - - if (ret) { - bch2_write_op_error(op, op->pos.offset, - "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - op->error = ret; - op->flags |= BCH_WRITE_submitted; - } - - /* fallback to cow write path? 
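The staleness test above relies on 8-bit bucket generation numbers that wrap, so "newer" is decided by signed distance. My reconstruction of that comparison (the real helper is gen_after(); this sketch is not the source)::

	#include <stdio.h>

	static int gen_newer(unsigned char a, unsigned char b)
	{
		signed char d = (signed char)(a - b);	/* wraps safely */
		return d > 0;
	}

	int main(void)
	{
		/* bucket reused (gen bumped) after the key was decoded: */
		printf("%d\n", gen_newer(3, 2));	/* 1: pointer stale */
		/* the comparison still works across wraparound: */
		printf("%d\n", gen_newer(1, 255));	/* 1 */
		return 0;
	}
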
*/ - if (!(op->flags & BCH_WRITE_submitted)) { - closure_sync(&op->cl); - __bch2_nocow_write_done(op); - op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_sync) { - closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl.work); - } else { - /* - * XXX - * needs to run out of process context because ei_quota_lock is - * a mutex - */ - continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); - } - return; -err_get_ioref: - darray_for_each(buckets, i) - enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - /* Fall back to COW path: */ - goto out; -err_bucket_stale: - darray_for_each(buckets, i) { - bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); - if (i == stale_at) - break; - } - - struct printbuf buf = PRINTBUF; - if (bch2_fs_inconsistent_on(stale < 0, c, - "pointer to invalid bucket in nocow path on device %llu\n %s", - stale_at->b.inode, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch_err_throw(c, data_write_invalid_ptr); - } else { - /* We can retry this: */ - ret = bch_err_throw(c, transaction_restart); - } - printbuf_exit(&buf); - - goto err_get_ioref; -} - -static void __bch2_write(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct write_point *wp = NULL; - struct bio *bio = NULL; - unsigned nofs_flags; - int ret; - - nofs_flags = memalloc_nofs_save(); - - if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { - bch2_nocow_write(op); - if (op->flags & BCH_WRITE_submitted) - goto out_nofs_restore; - } -again: - memset(&op->failed, 0, sizeof(op->failed)); - - do { - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - - /* +1 for possible cache device: */ - if (op->open_buckets.nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets.v)) - break; - - if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)) - break; - - /* - * The copygc thread is now global, which means it's no longer - * freeing up space on specific disks, which means that - * allocations for specific disks may hang arbitrarily long: - */ - ret = bch2_trans_run(c, lockrestart_do(trans, - bch2_alloc_sectors_start_trans(trans, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->watermark, - op->flags, - &op->cl, &wp))); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - break; - - goto err; - } - - EBUG_ON(!wp); - - bch2_open_bucket_get(c, wp, &op->open_buckets); - ret = bch2_write_extent(op, wp, &bio); - - bch2_alloc_sectors_done_inlined(c, wp); -err: - if (ret <= 0) { - op->flags |= BCH_WRITE_submitted; - - if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_alloc_nowait)) - bch2_write_op_error(op, op->pos.offset, - "%s(): %s", __func__, bch2_err_str(ret)); - op->error = ret; - break; - } - } - - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio->bi_opf |= REQ_OP_WRITE; - - closure_get(bio->bi_private); - - key_to_write = (void *) (op->insert_keys.keys_p + - key_to_write_offset); - - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write, false); - } while (ret); - - /* - * Sync or no? - * - * If we're running asynchronously, we may still want to block - * synchronously here if we weren't able to submit all of the IO at - * once, as that signals backpressure to the caller. 
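A condensed restatement of the completion choice made right below this comment; the flag values are illustrative stand-ins (the real ones are generated from BCH_WRITE_FLAGS() in io_write_types.h)::

	#include <stdio.h>

	#define W_SYNC		(1u << 0)
	#define W_SUBMITTED	(1u << 1)
	#define W_IN_WORKER	(1u << 2)

	/* block the submitter if the write is synchronous, or if not all
	 * IO could be issued and nobody else will finish it for us: */
	static int write_must_block(unsigned flags)
	{
		return (flags & W_SYNC) ||
		       (!(flags & W_SUBMITTED) && !(flags & W_IN_WORKER));
	}

	int main(void)
	{
		printf("%d\n", write_must_block(0));		/* 1: backpressure */
		printf("%d\n", write_must_block(W_SUBMITTED));	/* 0: async done */
		return 0;
	}
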
- */ - if ((op->flags & BCH_WRITE_sync) || - (!(op->flags & BCH_WRITE_submitted) && - !(op->flags & BCH_WRITE_in_worker))) { - bch2_wait_on_allocator(c, &op->cl); - - __bch2_write_index(op); - - if (!(op->flags & BCH_WRITE_submitted)) - goto again; - bch2_write_done(&op->cl); - } else { - bch2_write_queue(op, wp); - continue_at(&op->cl, bch2_write_index, NULL); - } -out_nofs_restore: - memalloc_nofs_restore(nofs_flags); -} - -static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -{ - struct bio *bio = &op->wbio.bio; - struct bvec_iter iter; - struct bkey_i_inline_data *id; - unsigned sectors; - int ret; - - memset(&op->failed, 0, sizeof(op->failed)); - - op->flags |= BCH_WRITE_wrote_data_inline; - op->flags |= BCH_WRITE_submitted; - - bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); - - ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_U64s + DIV_ROUND_UP(data_len, 8)); - if (ret) { - op->error = ret; - goto err; - } - - sectors = bio_sectors(bio); - op->pos.offset += sectors; - - id = bkey_inline_data_init(op->insert_keys.top); - id->k.p = op->pos; - id->k.bversion = op->version; - id->k.size = sectors; - - iter = bio->bi_iter; - iter.bi_size = data_len; - memcpy_from_bio(id->v.data, bio, iter); - - while (data_len & 7) - id->v.data[data_len++] = '\0'; - set_bkey_val_bytes(&id->k, data_len); - bch2_keylist_push(&op->insert_keys); - - __bch2_write_index(op); -err: - bch2_write_done(&op->cl); -} - -/** - * bch2_write() - handle a write to a cache device or flash only volume - * @cl: &bch_write_op->cl - * - * This is the starting point for any data to end up in a cache device; it could - * be from a normal write, or a writeback write, or a write to a flash only - * volume - it's also used by the moving garbage collector to compact data in - * mostly empty buckets. - * - * It first writes the data to the cache, creating a list of keys to be inserted - * (if the data won't fit in a single open bucket, there will be multiple keys); - * after the data is written it calls bch_journal, and after the keys have been - * added to the next journal write they're inserted into the btree. - * - * If op->discard is true, instead of inserting the data it invalidates the - * region of the cache represented by op->bio and op->inode. 
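bch2_write_data_inline() above stores small writes directly in the bkey value, zero-padding to a whole number of u64s. A userspace re-creation of just the padding step (buffer and sizes invented)::

	#include <stdio.h>

	int main(void)
	{
		unsigned char val[16] = "hello";	/* 5 bytes of user data */
		unsigned data_len = 5;

		while (data_len & 7)			/* same loop as above */
			val[data_len++] = '\0';

		printf("value padded to %u bytes (%u u64s)\n",
		       data_len, data_len / 8);		/* 8 bytes, 1 u64 */
		return 0;
	}
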
- */ -CLOSURE_CALLBACK(bch2_write) -{ - closure_type(op, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; - struct bch_fs *c = op->c; - unsigned data_len; - - EBUG_ON(op->cl.parent); - BUG_ON(!op->nr_replicas); - BUG_ON(!op->write_point.v); - BUG_ON(bkey_eq(op->pos, POS_MAX)); - - async_object_list_add(c, write_op, op, &op->list_idx); - - if (op->flags & BCH_WRITE_only_specified_devs) - op->flags |= BCH_WRITE_alloc_nowait; - - op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); - op->start_time = local_clock(); - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - - if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = bch_err_throw(c, data_write_misaligned); - goto err; - } - - if (c->opts.nochanges) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move) && - !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = bch_err_throw(c, erofs_no_writes); - goto err; - } - - if (!(op->flags & BCH_WRITE_move)) - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - data_len = min_t(u64, bio->bi_iter.bi_size, - op->new_i_size - (op->pos.offset << 9)); - - if (c->opts.inline_data && - data_len <= min(block_bytes(c) / 2, 1024U)) { - bch2_write_data_inline(op, data_len); - return; - } - - __bch2_write(op); - return; -err: - bch2_disk_reservation_put(c, &op->res); - - closure_debug_destroy(&op->cl); - async_object_list_del(c, write_op, op->list_idx); - if (op->end_io) - op->end_io(op); -} - -static const char * const bch2_write_flags[] = { -#define x(f) #f, - BCH_WRITE_FLAGS() -#undef x - NULL -}; - -void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - prt_printf(out, "pos:\t"); - bch2_bpos_to_text(out, op->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "started:\t"); - bch2_pr_time_units(out, local_clock() - op->start_time); - prt_newline(out); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_write_flags, op->flags); - prt_newline(out); - - prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); - prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); - - prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); - prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); - - printbuf_indent_sub(out, 2); -} - -void bch2_fs_io_write_exit(struct bch_fs *c) -{ - bioset_exit(&c->replica_set); - bioset_exit(&c->bio_write); -} - -int bch2_fs_io_write_init(struct bch_fs *c) -{ - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || - bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return bch_err_throw(c, ENOMEM_bio_write_init); - - return 0; -} diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h deleted file mode 100644 index 2c0a8f35ee1feb..00000000000000 --- a/fs/bcachefs/io_write.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_H -#define _BCACHEFS_IO_WRITE_H - -#include "checksum.h" -#include "io_write_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -void 
bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -__printf(3, 4) -void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -CLOSURE_CALLBACK(bch2_write); -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -void bch2_fs_io_write_exit(struct bch_fs *); -int bch2_fs_io_write_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h deleted file mode 100644 index 5da4eb8bb6f6d8..00000000000000 --- a/fs/bcachefs/io_write_types.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_WRITE_TYPES_H -#define _BCACHEFS_IO_WRITE_TYPES_H - -#include "alloc_types.h" -#include "btree_types.h" -#include "buckets_types.h" -#include "extents_types.h" -#include "keylist_types.h" -#include "opts.h" -#include "super_types.h" - -#include -#include - -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -struct bch_write_bio { - struct_group(wbio, - struct bch_fs *c; - struct bch_write_bio *parent; - - u64 submit_time; - u64 inode_offset; - u64 nocow_bucket; - - struct bch_devs_list failed; - u8 dev; - - unsigned split:1, - bounce:1, - put_bio:1, - have_ioref:1, - nocow:1, - used_mempool:1, - first_btree_write:1; - ); - - struct bio bio; -}; - -struct bch_write_op { - struct closure cl; - struct bch_fs *c; - void (*end_io)(struct bch_write_op *); - u64 start_time; - -#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS - unsigned list_idx; -#endif - - unsigned written; /* sectors */ - u16 flags; - s16 
error; /* dio write path expects it to hold -ERESTARTSYS... */ - - unsigned compression_opt:8; - unsigned csum_type:4; - unsigned nr_replicas:4; - unsigned nr_replicas_required:4; - unsigned watermark:3; - unsigned incompressible:1; - unsigned stripe_waited:1; - - struct bch_devs_list devs_have; - u16 target; - u16 nonce; - struct bch_io_opts opts; - - u32 subvol; - struct bpos pos; - struct bversion version; - - /* For BCH_WRITE_data_encoded: */ - struct bch_extent_crc_unpacked crc; - - struct write_point_specifier write_point; - - struct write_point *wp; - struct list_head wp_list; - - struct disk_reservation res; - - struct open_buckets open_buckets; - - u64 new_i_size; - s64 i_sectors_delta; - - struct bch_devs_mask failed; - - struct keylist insert_keys; - u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; - - /* - * Bitmask of devices that have had nocow writes issued to them since - * last flush: - */ - struct bch_devs_mask *devs_need_flush; - - /* Must be last: */ - struct bch_write_bio wbio; -}; - -#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c deleted file mode 100644 index ddfeb0dafc9d84..00000000000000 --- a/fs/bcachefs/journal.c +++ /dev/null @@ -1,1832 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs journalling code, for btree insertions - * - * Copyright 2012 Google, Inc. - */ - -#include "bcachefs.h" -#include "alloc_foreground.h" -#include "bkey_methods.h" -#include "btree_gc.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "trace.h" - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); -} - -static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) -{ - union journal_res_state s = READ_ONCE(j->reservations); - unsigned i = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + i; - - prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); - - if (!buf->write_started) - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - - struct closure *cl = &buf->io; - int r = atomic_read(&cl->remaining); - prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); - - if (buf->data) { - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); - } - - prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); - - prt_printf(out, "flags:\t"); - if (buf->noflush) - prt_str(out, "noflush "); - if (buf->must_flush) - prt_str(out, "must_flush "); - if (buf->separate_flush) - prt_str(out, "separate_flush "); - if (buf->need_flush_to_write_buffer) - prt_str(out, "need_flush_to_write_buffer "); - if (buf->write_started) - prt_str(out, "write_started "); - if (buf->write_allocated) - prt_str(out, "write_allocated "); - if (buf->write_done) - prt_str(out, "write_done"); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -static 
void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) -{ - lockdep_assert_held(&j->lock); - out->atomic++; - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) - bch2_journal_buf_to_text(out, j, seq); - prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); - - --out->atomic; -} - -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - struct journal_buf *buf = NULL; - - EBUG_ON(seq > journal_cur_seq(j)); - - if (journal_seq_unwritten(j, seq)) - buf = j->buf + (seq & JOURNAL_BUF_MASK); - return buf; -} - -static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) -{ - for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) - INIT_LIST_HEAD(&p->unflushed[i]); - for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) - INIT_LIST_HEAD(&p->flushed[i]); - atomic_set(&p->count, count); - p->devs.nr = 0; -} - -/* - * Detect stuck journal conditions and trigger shutdown. Technically the journal - * can end up stuck for a variety of reasons, such as a blocked I/O, journal - * reservation lockup, etc. Since this is a fatal error with potentially - * unpredictable characteristics, we want to be fairly conservative before we - * decide to shut things down. - * - * Consider the journal stuck when it appears full with no ability to commit - * btree transactions, to discard journal buckets, nor acquire priority - * (reserved watermark) reservation. - */ -static inline bool -journal_error_check_stuck(struct journal *j, int error, unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool stuck = false; - struct printbuf buf = PRINTBUF; - - buf.atomic++; - - if (!(error == -BCH_ERR_journal_full || - error == -BCH_ERR_journal_pin_full) || - nr_unwritten_journal_entries(j) || - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) - return stuck; - - spin_lock(&j->lock); - - if (j->can_discard) { - spin_unlock(&j->lock); - return stuck; - } - - stuck = true; - - /* - * The journal shutdown path will set ->err_seq, but do it here first to - * serialize against concurrent failures and avoid duplicate error - * reports. - */ - if (j->err_seq) { - spin_unlock(&j->lock); - return stuck; - } - j->err_seq = journal_cur_seq(j); - - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"), - bch2_err_str(error)); - bch2_print_str(c, KERN_ERR, buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - dump_stack(); - - return stuck; -} - -void bch2_journal_do_writes(struct journal *j) -{ - for (u64 seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *w = j->buf + idx; - - if (w->write_started && !w->write_allocated) - break; - if (w->write_started) - continue; - - if (!journal_state_seq_count(j, j->reservations, seq)) { - j->seq_write_started = seq; - w->write_started = true; - closure_call(&w->io, bch2_journal_write, j->wq, NULL); - } - - break; - } -} - -/* - * Final processing when the last reference of a journal buffer has been - * dropped. Drop the pin list reference acquired at journal entry open and write - * the buffer, if requested. 
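 *
 * For orientation - a rough sketch (not taken from any real caller; id,
 * level and u64s are placeholders) of how a reservation reaches this
 * point, using the journal_res API declared in journal.h, error paths
 * elided:
 *
 *	struct journal_res res = {};
 *	int ret = bch2_journal_res_get(j, &res, jset_u64s(u64s), 0, NULL);
 *	if (!ret) {
 *		bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
 *				       id, level, u64s);
 *		bch2_journal_res_put(j, &res);
 *	}
 *
 * When that put drops the last buffer reference, __bch2_journal_buf_put()
 * lands here.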
- */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq) -{ - lockdep_assert_held(&j->lock); - - if (__bch2_journal_pin_put(j, seq)) - bch2_journal_reclaim_fast(j); - bch2_journal_do_writes(j); - - /* - * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an - * open journal entry - */ - wake_up(&j->wait); -} - -/* - * Close the currently open journal entry: - * - * We don't close a journal_buf until the next journal_buf is finished writing, - * and can be opened again - this also initializes the next journal_buf: - */ -static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = journal_cur_buf(j); - union journal_res_state old, new; - unsigned sectors; - - BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && - closed_val != JOURNAL_ENTRY_ERROR_VAL); - - lockdep_assert_held(&j->lock); - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = closed_val; - - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || - old.cur_entry_offset == new.cur_entry_offset) - return; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (!__journal_entry_is_open(old)) - return; - - if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) - old.cur_entry_offset = j->cur_entry_offset_if_blocked; - - /* Close out old buffer: */ - buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - - if (trace_journal_entry_close_enabled() && trace) { - struct printbuf pbuf = PRINTBUF; - pbuf.atomic++; - - prt_str(&pbuf, "entry size: "); - prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); - prt_newline(&pbuf); - bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); - trace_journal_entry_close(c, pbuf.buf); - printbuf_exit(&pbuf); - } - - sectors = vstruct_blocks_plus(buf->data, c->block_bits, - buf->u64s_reserved) << c->block_bits; - if (unlikely(sectors > buf->sectors)) { - struct printbuf err = PRINTBUF; - err.atomic++; - - prt_printf(&err, "journal entry overran reserved space: %u > %u\n", - sectors, buf->sectors); - prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", - le32_to_cpu(buf->data->u64s), buf->u64s_reserved, - j->cur_entry_u64s, - c->block_bits); - prt_printf(&err, "fatal error - emergency read only"); - bch2_journal_halt_locked(j); - - bch_err(c, "%s", err.buf); - printbuf_exit(&err); - return; - } - - buf->sectors = sectors; - - /* - * We have to set last_seq here, _before_ opening a new journal entry: - * - * A thread may replace an old pin with a new pin on its current - * journal reservation - the expectation being that the journal will - * contain either what the old pin protected or what the new pin - * protects. - * - * After the old pin is dropped journal_last_seq() won't include the old - * pin, so we can only write the updated last_seq on the entry that - * contains whatever the new pin protects. - * - * Restated, we can _not_ update last_seq for a given entry if there - * could be a newer entry open with reservations/pins that have been - * taken against it. 
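 * (Concretely: suppose a thread with a reservation against entry 10 drops
 * its pin on entry 5 and re-pins entry 10. The advanced last_seq (> 5) may
 * only be written with entry 10 or later: if entry 9 advertised it and
 * entry 10 never reached disk, replay would find neither the state the old
 * pin protected nor the update the new pin protects.)
 *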
- * - * Hence, we want to update/set last_seq on the current journal entry right - * before we open a new one: - */ - buf->last_seq = journal_last_seq(j); - buf->data->last_seq = cpu_to_le64(buf->last_seq); - BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - - cancel_delayed_work(&j->write_work); - - bch2_journal_space_available(j); - - __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); -} - -void bch2_journal_halt_locked(struct journal *j) -{ - lockdep_assert_held(&j->lock); - - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); -} - -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - bch2_journal_halt_locked(j); - spin_unlock(&j->lock); -} - -static bool journal_entry_want_write(struct journal *j) -{ - bool ret = !journal_entry_is_open(j) || - journal_cur_seq(j) == journal_last_unwritten_seq(j); - - /* Don't close it yet if we already have a write in flight: */ - if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - else if (nr_unwritten_journal_entries(j)) { - struct journal_buf *buf = journal_cur_buf(j); - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - } - - return ret; -} - -bool bch2_journal_entry_close(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - ret = journal_entry_want_write(j); - spin_unlock(&j->lock); - - return ret; -} - -/* - * should _only_ be called from journal_res_get() - when we actually want a - * journal reservation - journal entry is open means journal is dirty: - */ -static int journal_entry_open(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf = j->buf + - ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); - union journal_res_state old, new; - int u64s; - - lockdep_assert_held(&j->lock); - BUG_ON(journal_entry_is_open(j)); - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if (j->cur_entry_error) - return j->cur_entry_error; - - int ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (!fifo_free(&j->pin)) - return bch_err_throw(c, journal_pin_full); - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return bch_err_throw(c, journal_max_in_flight); - - if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return bch_err_throw(c, journal_max_open); - - if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { - bch_err(c, "cannot start: journal seq overflow"); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - if (!j->free_buf && !buf->data) - return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ - - BUG_ON(!j->cur_entry_sectors); - - if (!buf->data) { - swap(buf->data, j->free_buf); - swap(buf->buf_size, j->free_buf_size); - } - - buf->expires = - (journal_cur_seq(j) == j->flushed_seq_ondisk - ? 
jiffies - : j->last_flush_write) + - msecs_to_jiffies(c->opts.journal_flush_delay); - - buf->u64s_reserved = j->entry_u64s_reserved; - buf->disk_sectors = j->cur_entry_sectors; - buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); - - u64s = (int) (buf->sectors << 9) / sizeof(u64) - - journal_entry_overhead(j); - u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - - if (u64s <= (ssize_t) j->early_journal_entries.nr) - return bch_err_throw(c, journal_full); - - if (fifo_empty(&j->pin) && j->reclaim_thread) - wake_up_process(j->reclaim_thread); - - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - journal_pin_list_init(fifo_push_ref(&j->pin), 1); - - if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { - bch_err(c, "attempting to open blacklisted journal seq %llu", - journal_cur_seq(j)); - if (bch2_fs_emergency_read_only_locked(c)) - bch_err(c, "fatal error - emergency read only"); - return bch_err_throw(c, journal_shutdown); - } - - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); - - bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; - buf->need_flush_to_write_buffer = true; - buf->write_started = false; - buf->write_allocated = false; - buf->write_done = false; - - memset(buf->data, 0, sizeof(*buf->data)); - buf->data->seq = cpu_to_le64(journal_cur_seq(j)); - buf->data->u64s = 0; - - if (j->early_journal_entries.nr) { - memcpy(buf->data->_data, j->early_journal_entries.data, - j->early_journal_entries.nr * sizeof(u64)); - le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); - } - - /* - * Must be set before marking the journal entry as open: - */ - j->cur_entry_u64s = u64s; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); - - new.idx++; - BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); - - journal_state_inc(&new); - - /* Handle any already added entries */ - new.cur_entry_offset = le32_to_cpu(buf->data->u64s); - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - if (nr_unwritten_journal_entries(j) == 1) - mod_delayed_work(j->wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); - journal_wake(j); - - if (j->early_journal_entries.nr) - darray_exit(&j->early_journal_entries); - return 0; -} - -static bool journal_quiesced(struct journal *j) -{ - bool ret = atomic64_read(&j->seq) == j->seq_ondisk; - - if (!ret) - bch2_journal_entry_close(j); - return ret; -} - -static void journal_quiesce(struct journal *j) -{ - wait_event(j->wait, journal_quiesced(j)); -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(work, struct journal, write_work.work); - - spin_lock(&j->lock); - if (__journal_entry_is_open(j->reservations)) { - long delta = journal_cur_buf(j)->expires - jiffies; - - if (delta > 0) - mod_delayed_work(j->wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } - spin_unlock(&j->lock); -} - -static void journal_buf_prealloc(struct journal *j) -{ - if (j->free_buf && - j->free_buf_size >= j->buf_size_want) - return; - - unsigned buf_size = j->buf_size_want; - - spin_unlock(&j->lock); - 
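	/*
	 * kvmalloc() may sleep, hence the allocation is done with j->lock
	 * dropped; the checks below are redone under the lock in case another
	 * thread installed a (large enough) buffer in the meantime:
	 */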
void *buf = kvmalloc(buf_size, GFP_NOFS); - spin_lock(&j->lock); - - if (buf && - (!j->free_buf || - buf_size > j->free_buf_size)) { - swap(buf, j->free_buf); - swap(buf_size, j->free_buf_size); - } - - if (unlikely(buf)) { - spin_unlock(&j->lock); - /* kvfree can sleep */ - kvfree(buf); - spin_lock(&j->lock); - } -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned flags) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - bool can_discard; - int ret; -retry: - if (journal_res_get_fast(j, res, flags)) - return 0; - - ret = bch2_journal_error(j); - if (unlikely(ret)) - return ret; - - if (j->blocked) - return bch_err_throw(c, journal_blocked); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = bch_err_throw(c, journal_full); - can_discard = j->can_discard; - goto out; - } - - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = bch_err_throw(c, journal_max_in_flight); - goto out; - } - - spin_lock(&j->lock); - - journal_buf_prealloc(j); - - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call bch2_journal_entry_close() - * unnecessarily - */ - if (journal_res_get_fast(j, res, flags)) { - ret = 0; - goto unlock; - } - - /* - * If we couldn't get a reservation because the current buf filled up, - * and we had room for a bigger entry on disk, signal that we want to - * realloc the journal bufs: - */ - buf = journal_cur_buf(j); - if (journal_entry_is_open(j) && - buf->buf_size >> 9 < buf->disk_sectors && - buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; -unlock: - can_discard = j->can_discard; - spin_unlock(&j->lock); -out: - if (likely(!ret)) - return 0; - if (ret == -BCH_ERR_journal_retry_open) - goto retry; - - if (journal_error_check_stuck(j, ret, flags)) - ret = bch_err_throw(c, journal_stuck); - - if (ret == -BCH_ERR_journal_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - if (ret == -BCH_ERR_journal_max_open && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && - trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_printbuf_make_room(&buf, 4096); - - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); - - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - count_event(c, journal_entry_full); - } - - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - if ((ret == -BCH_ERR_journal_full || - ret == -BCH_ERR_journal_pin_full) && - !(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - } - - return ret; -} - -static unsigned max_dev_latency(struct bch_fs *c) 
-{ - u64 nsecs = 0; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); - - return nsecs_to_jiffies(nsecs); -} - -/* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used to set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the next - * write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned flags, - struct btree_trans *trans) -{ - int ret; - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - HZ)) - return ret; - - if (trans) - bch2_trans_unlock_long(trans); - - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); - - remaining_wait = max(0, remaining_wait - HZ); - - if (closure_wait_event_timeout(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK), - remaining_wait)) - return ret; - - struct printbuf buf = PRINTBUF; - bch2_journal_debug_to_text(&buf, j); - bch2_print_str(c, KERN_ERR, buf.buf); - prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); - printbuf_exit(&buf); - - closure_wait_event(&j->async_wait, - !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || - (flags & JOURNAL_RES_GET_NONBLOCK)); - return ret; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *j, - struct journal_entry_res *res, - unsigned new_u64s) -{ - union journal_res_state state; - int d = new_u64s - res->u64s; - - spin_lock(&j->lock); - - j->entry_u64s_reserved += d; - if (d <= 0) - goto out; - - j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - state = READ_ONCE(j->reservations); - - if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && - state.cur_entry_offset > j->cur_entry_u64s) { - j->cur_entry_u64s += d; - /* - * Not enough room in current journal entry, have to flush it: - */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - } else { - journal_cur_buf(j)->u64s_reserved += d; - } -out: - spin_unlock(&j->lock); - res->u64s += d; -} - -/* journal flushing: */ - -/** - * bch2_journal_flush_seq_async - wait for a journal entry to be written - * @j: journal object - * @seq: seq to flush - * @parent: closure object to wait with - * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -BCH_ERR_journal_flush_err if @seq will never be flushed - * - * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary - */ -int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - int ret = 0; - - if (seq <= j->flushed_seq_ondisk) - return 1; - - spin_lock(&j->lock); - - if (WARN_ONCE(seq > journal_cur_seq(j), - "requested to flush journal seq %llu, but currently at %llu", - seq, journal_cur_seq(j))) - goto out; - - /* Recheck under lock: */ - if (j->err_seq && seq >= j->err_seq) { - ret = bch_err_throw(c, journal_flush_err); - 
goto out; - } - - if (seq <= j->flushed_seq_ondisk) { - ret = 1; - goto out; - } - - /* if seq was written, but not flushed - flush a newer one instead */ - seq = max(seq, journal_last_unwritten_seq(j)); - -recheck_need_open: - if (seq > journal_cur_seq(j)) { - struct journal_res res = { 0 }; - - if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - - spin_unlock(&j->lock); - - /* - * We're called from bch2_journal_flush_seq() -> wait_event(); - * but this might block. We won't usually block, so we won't - * livelock: - */ - sched_annotate_sleep(); - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - seq = res.seq; - buf = journal_seq_to_buf(j, seq); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - bch2_journal_res_put(j, &res); - - spin_lock(&j->lock); - goto want_write; - } - - /* - * if write was kicked off without a flush, or if we promised it - * wouldn't be a flush, flush the next sequence number instead - */ - buf = journal_seq_to_buf(j, seq); - if (buf->noflush) { - seq++; - goto recheck_need_open; - } - - buf->must_flush = true; - j->flushing_seq = max(j->flushing_seq, seq); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); -want_write: - if (seq == journal_cur_seq(j)) - journal_entry_want_write(j); -out: - spin_unlock(&j->lock); - return ret; -} - -int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) -{ - u64 start_time = local_clock(); - int ret, ret2; - - /* - * Don't update time_stats when @seq is already flushed: - */ - if (seq <= j->flushed_seq_ondisk) - return 0; - - ret = wait_event_state(j->wait, - (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), - task_state); - - if (!ret) - bch2_time_stats_update(j->flush_seq_time, start_time); - - return ret ?: ret2 < 0 ? 
ret2 : 0; -} - -/* - * bch2_journal_flush_async - if there is an open journal entry, or a journal - * still being written, write it and wait for the write to complete - */ -void bch2_journal_flush_async(struct journal *j, struct closure *parent) -{ - bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); -} - -int bch2_journal_flush(struct journal *j) -{ - return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); -} - -/* - * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the - * range [start, end) - */ -bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 unwritten_seq; - bool ret = false; - - if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) - return false; - - if (c->journal.flushed_seq_ondisk >= start) - return false; - - spin_lock(&j->lock); - if (c->journal.flushed_seq_ondisk >= start) - goto out; - - for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < end; - unwritten_seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - - /* journal flush already in flight, or flush requested */ - if (buf->must_flush) - goto out; - - buf->noflush = true; - } - - ret = true; -out: - spin_unlock(&j->lock); - return ret; -} - -static int __bch2_journal_meta(struct journal *j) -{ - struct journal_res res = {}; - int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); - if (ret) - return ret; - - struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); - buf->must_flush = true; - - if (!buf->flush_time) { - buf->flush_time = local_clock() ?: 1; - buf->expires = jiffies; - } - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); -} - -int bch2_journal_meta(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) - return bch_err_throw(c, erofs_no_writes); - - int ret = __bch2_journal_meta(j); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); - return ret; -} - -/* block/unlock the journal: */ - -void bch2_journal_unblock(struct journal *j) -{ - spin_lock(&j->lock); - if (!--j->blocked && - j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = j->cur_entry_offset_if_blocked; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - } - spin_unlock(&j->lock); - - journal_wake(j); -} - -static void __bch2_journal_block(struct journal *j) -{ - if (!j->blocked++) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - j->cur_entry_offset_if_blocked = old.cur_entry_offset; - - if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) - break; - - new.v = old.v; - new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - - if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); - } -} - -void bch2_journal_block(struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_block(j); - spin_unlock(&j->lock); - - journal_quiesce(j); -} - -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 
max_seq, bool *blocked) -{ - struct journal_buf *ret = NULL; - - /* We're inside wait_event(), but using mutex_lock(): */ - sched_annotate_sleep(); - mutex_lock(&j->buf_lock); - spin_lock(&j->lock); - max_seq = min(max_seq, journal_cur_seq(j)); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= max_seq; - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + idx; - - if (buf->need_flush_to_write_buffer) { - union journal_res_state s; - s.v = atomic64_read_acquire(&j->reservations.counter); - - unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); - - if (open && !*blocked) { - __bch2_journal_block(j); - s.v = atomic64_read_acquire(&j->reservations.counter); - *blocked = true; - } - - ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open - ? ERR_PTR(-EAGAIN) - : buf; - break; - } - } - - spin_unlock(&j->lock); - if (IS_ERR_OR_NULL(ret)) - mutex_unlock(&j->buf_lock); - return ret; -} - -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, - u64 max_seq, bool *blocked) -{ - struct journal_buf *ret; - *blocked = false; - - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, - max_seq, blocked)) != ERR_PTR(-EAGAIN)); - if (IS_ERR_OR_NULL(ret) && *blocked) - bch2_journal_unblock(j); - - return ret; -} - -/* allocate journal on a device: */ - -static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - u64 *new_bucket_seq = NULL, *new_buckets = NULL; - struct open_bucket **ob = NULL; - long *bu = NULL; - unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; - int ret = 0; - - BUG_ON(nr <= ja->nr); - - bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); - ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); - new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); - new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); - if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err_free; - } - - for (nr_got = 0; nr_got < nr_want; nr_got++) { - enum bch_watermark watermark = new_fs - ? 
BCH_WATERMARK_btree - : BCH_WATERMARK_normal; - - ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; - - if (!new_fs) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size, BTREE_TRIGGER_transactional)); - if (ret) { - bch2_open_bucket_put(c, ob[nr_got]); - bch_err_msg(c, ret, "marking new journal buckets"); - break; - } - } - - bu[nr_got] = ob[nr_got]->bucket; - } - - if (!nr_got) - goto err_free; - - /* Don't return an error if we successfully allocated some buckets: */ - ret = 0; - - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - mutex_lock(&c->sb_lock); - } - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - - BUG_ON(ja->discard_idx > ja->nr); - - pos = ja->discard_idx ?: ja->nr; - - memmove(new_buckets + pos + nr_got, - new_buckets + pos, - sizeof(new_buckets[0]) * (ja->nr - pos)); - memmove(new_bucket_seq + pos + nr_got, - new_bucket_seq + pos, - sizeof(new_bucket_seq[0]) * (ja->nr - pos)); - - for (i = 0; i < nr_got; i++) { - new_buckets[pos + i] = bu[i]; - new_bucket_seq[pos + i] = 0; - } - - nr = ja->nr + nr_got; - - ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); - if (ret) - goto err_unblock; - - bch2_write_super(c); - - /* Commit: */ - if (c) - spin_lock(&c->journal.lock); - - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = nr; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; - - if (c) - spin_unlock(&c->journal.lock); -err_unblock: - if (c) { - bch2_journal_unblock(&c->journal); - mutex_unlock(&c->sb_lock); - } - - if (ret && !new_fs) - for (i = 0; i < nr_got; i++) - bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0, - BTREE_TRIGGER_transactional)); -err_free: - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); - - kfree(new_bucket_seq); - kfree(new_buckets); - kfree(ob); - kfree(bu); - return ret; -} - -static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, - unsigned nr, bool new_fs) -{ - struct journal_device *ja = &ca->journal; - int ret = 0; - - struct closure cl; - closure_init_stack(&cl); - - /* don't handle reducing nr of buckets yet: */ - if (nr < ja->nr) - return 0; - - while (!ret && ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0, 0 }; - - /* - * note: journal buckets aren't really counted as _sectors_ used yet, so - * we don't need the disk reservation to avoid the BUG_ON() in buckets.c - * when space used goes up without a reservation - but we do need the - * reservation to ensure we'll actually be able to allocate: - * - * XXX: that's not right, disk reservations only ensure a - * filesystem-wide allocation will succeed, this is a device - * specific allocation - we can hang here: - */ - if (!new_fs) { - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; - } - - ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - - if (ret == -BCH_ERR_bucket_alloc_blocked || - ret == -BCH_ERR_open_buckets_empty) - 
ret = 0; /* wait and retry */ - - bch2_disk_reservation_put(c, &disk_res); - bch2_wait_on_allocator(c, &cl); - } - - return ret; -} - -/* - * Allocate more journal space at runtime - not currently making use of it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) -{ - down_write(&c->state_lock); - int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); - up_write(&c->state_lock); - - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) -{ - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_device *ja = &ca->journal; - - guard(mutex)(&c->sb_lock); - unsigned pos; - for (pos = 0; pos < ja->nr; pos++) - if (ja->buckets[pos] == b) - break; - - if (pos == ja->nr) { - bch_err(ca, "journal bucket %llu not found when deleting", b); - return -EINVAL; - } - - u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!new_buckets) - return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - - memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); - memmove(&new_buckets[pos], - &new_buckets[pos + 1], - (ja->nr - 1 - pos) * sizeof(new_buckets[0])); - - int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: - bch2_write_super(c); - if (ret) { - kfree(new_buckets); - return ret; - } - - scoped_guard(spinlock, &j->lock) { - if (pos < ja->discard_idx) - --ja->discard_idx; - if (pos < ja->dirty_idx_ondisk) - --ja->dirty_idx_ondisk; - if (pos < ja->dirty_idx) - --ja->dirty_idx; - if (pos < ja->cur_idx) - --ja->cur_idx; - - ja->nr--; - - memmove(&ja->buckets[pos], - &ja->buckets[pos + 1], - (ja->nr - pos) * sizeof(ja->buckets[0])); - - memmove(&ja->bucket_seq[pos], - &ja->bucket_seq[pos + 1], - (ja->nr - pos) * sizeof(ja->bucket_seq[0])); - - bch2_journal_space_available(j); - } - - kfree(new_buckets); - return 0; -} - -int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) -{ - struct bch_fs *c = ca->fs; - - if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) - return 0; - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - unsigned nr; - int ret; - - if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); - goto err; - } - - /* 1/128th of the device by default: */ - nr = ca->mi.nbuckets >> 7; - - /* - * clamp journal size to 8192 buckets or 8GB (in sectors), whichever - * is smaller (e.g. with 1MB buckets, i.e. 2048 sectors, - * (1 << 24) / 2048 = 8192 buckets and the two limits coincide; with - * larger buckets the 8GB cap is the smaller one): - */ - nr = clamp_t(unsigned, nr, - BCH_JOURNAL_BUCKETS_MIN, - min(1 << 13, - (1 << 24) / ca->mi.bucket_size)); - - ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); -err: - bch_err_fn(ca, ret); - return ret; -} - -int bch2_fs_journal_alloc(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { - if (ca->journal.nr) - continue; - - int ret = bch2_dev_journal_alloc(ca, true); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_journal_alloc); - return ret; - } - } - - return 0; -} - -/* startup/shutdown: */ - -static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -{ - bool ret = false; - u64 seq; - - spin_lock(&j->lock); - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j) && !ret; - seq++) { - struct journal_buf *buf = journal_seq_to_buf(j, seq); - - if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) - ret = true; - } - 
spin_unlock(&j->lock); - - return ret; -} - -void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -{ - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -} - -void bch2_fs_journal_stop(struct journal *j) -{ - if (!test_bit(JOURNAL_running, &j->flags)) - return; - - bch2_journal_reclaim_stop(j); - bch2_journal_flush_all_pins(j); - - wait_event(j->wait, bch2_journal_entry_close(j)); - - /* - * Always write a new journal entry, to make sure the clock hands are up - * to date (and match the superblock) - */ - __bch2_journal_meta(j); - - journal_quiesce(j); - cancel_delayed_work_sync(&j->write_work); - - WARN(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j), - "journal shutdown error: cur seq %llu but last empty seq %llu", - journal_cur_seq(j), j->last_empty_seq); - - if (!bch2_journal_error(j)) - clear_bit(JOURNAL_running, &j->flags); -} - -int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - struct journal_replay *i, **_i; - struct genradix_iter iter; - bool had_entries = false; - - /* - * - * XXX pick most recent non blacklisted sequence number - */ - - cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); - - if (cur_seq >= JOURNAL_SEQ_MAX) { - bch_err(c, "cannot start: journal seq overflow"); - return -EINVAL; - } - - /* Clean filesystem? */ - if (!last_seq) - last_seq = cur_seq; - - u64 nr = cur_seq - last_seq; - - /* - * Extra fudge factor, in case we crashed when the journal pin fifo was - * nearly or completely full. We'll need to be able to open additional - * journal entries (at least a few) in order for journal replay to get - * going: - */ - nr += nr / 4; - - nr = max(nr, JOURNAL_PIN); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return bch_err_throw(c, ENOMEM_journal_pin_fifo); - } - - j->replay_journal_seq = last_seq; - j->replay_journal_seq_end = cur_seq; - j->last_seq_ondisk = last_seq; - j->flushed_seq_ondisk = cur_seq - 1; - j->seq_write_started = cur_seq - 1; - j->seq_ondisk = cur_seq - 1; - j->pin.front = last_seq; - j->pin.back = cur_seq; - atomic64_set(&j->seq, cur_seq - 1); - - u64 seq; - fifo_for_each_entry_ptr(p, &j->pin, seq) - journal_pin_list_init(p, 1); - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - BUG_ON(seq >= cur_seq); - - if (seq < last_seq) - continue; - - if (journal_entry_empty(&i->j)) - j->last_empty_seq = le64_to_cpu(i->j.seq); - - p = journal_seq_pin(j, seq); - - p->devs.nr = 0; - darray_for_each(i->ptrs, ptr) - bch2_dev_list_add_dev(&p->devs, ptr->dev); - - had_entries = true; - } - - if (!had_entries) - j->last_empty_seq = cur_seq - 1; /* to match j->seq */ - - spin_lock(&j->lock); - j->last_flush_write = jiffies; - - j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); - spin_unlock(&j->lock); - - return 0; -} - -void bch2_journal_set_replay_done(struct journal *j) -{ - /* - * journal_space_available must happen before setting JOURNAL_running - * JOURNAL_running must happen before JOURNAL_replay_done - */ - spin_lock(&j->lock); - bch2_journal_space_available(j); - - set_bit(JOURNAL_need_flush_write, &j->flags); - set_bit(JOURNAL_running, &j->flags); - set_bit(JOURNAL_replay_done, 
&j->flags); - spin_unlock(&j->lock); -} - -/* init/exit: */ - -void bch2_dev_journal_exit(struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - kfree(ja->bio[i]); - ja->bio[i] = NULL; - } - - kfree(ja->buckets); - kfree(ja->bucket_seq); - ja->buckets = NULL; - ja->bucket_seq = NULL; -} - -int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets = - bch2_sb_field_get(sb, journal); - struct bch_sb_field_journal_v2 *journal_buckets_v2 = - bch2_sb_field_get(sb, journal_v2); - - ja->nr = 0; - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - - for (unsigned i = 0; i < nr; i++) - ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); - } else if (journal_buckets) { - ja->nr = bch2_nr_journal_buckets(journal_buckets); - } - - ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->bucket_seq) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - - for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); - if (!ja->bio[i]) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - ja->bio[i]->ca = ca; - ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); - } - - ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); - if (!ja->buckets) - return bch_err_throw(c, ENOMEM_dev_journal_init); - - if (journal_buckets_v2) { - unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned dst = 0; - - for (unsigned i = 0; i < nr; i++) - for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) - ja->buckets[dst++] = - le64_to_cpu(journal_buckets_v2->d[i].start) + j; - } else if (journal_buckets) { - for (unsigned i = 0; i < ja->nr; i++) - ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); - } - - return 0; -} - -void bch2_fs_journal_exit(struct journal *j) -{ - if (j->wq) - destroy_workqueue(j->wq); - - darray_exit(&j->early_journal_entries); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - kvfree(j->buf[i].data); - kvfree(j->free_buf); - free_fifo(&j->pin); -} - -void bch2_fs_journal_init_early(struct journal *j) -{ - static struct lock_class_key res_key; - - mutex_init(&j->buf_lock); - spin_lock_init(&j->lock); - spin_lock_init(&j->err_lock); - init_waitqueue_head(&j->wait); - INIT_DELAYED_WORK(&j->write_work, journal_write_work); - init_waitqueue_head(&j->reclaim_wait); - init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->reclaim_lock); - mutex_init(&j->discard_lock); - - lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - - atomic64_set(&j->reservations.counter, - ((union journal_res_state) - { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); -} - -int bch2_fs_journal_init(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; - j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); - if (!j->free_buf) - return bch_err_throw(c, ENOMEM_journal_buf); - - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) - j->buf[i].idx = i; - - j->wq = alloc_workqueue("bcachefs_journal", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); - if (!j->wq) - return bch_err_throw(c, 
ENOMEM_fs_other_alloc); - return 0; -} - -/* debug: */ - -static const char * const bch2_journal_flags_strs[] = { -#define x(n) #n, - JOURNAL_FLAGS() -#undef x - NULL -}; - -void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state s; - unsigned long now = jiffies; - u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 28); - out->atomic++; - - guard(rcu)(); - s = READ_ONCE(j->reservations); - - prt_printf(out, "flags:\t"); - prt_bitflags(out, bch2_journal_flags_strs, j->flags); - prt_newline(out); - prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); - prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); - prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "average write size:\t"); - prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); - prt_newline(out); - prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); - prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) - ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t%u\n", j->blocked); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); - prt_printf(out, "current entry:\t"); - - switch (s.cur_entry_offset) { - case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error\n"); - break; - case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed\n"); - break; - case JOURNAL_ENTRY_BLOCKED_VAL: - prt_printf(out, "blocked\n"); - break; - default: - prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); - break; - } - - prt_printf(out, "unwritten entries:\n"); - bch2_journal_bufs_to_text(out, j); - - prt_printf(out, "space:\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "discarded\t%u:%u\n", - j->space[journal_space_discarded].next_entry, - j->space[journal_space_discarded].total); - prt_printf(out, "clean ondisk\t%u:%u\n", - j->space[journal_space_clean_ondisk].next_entry, - j->space[journal_space_clean_ondisk].total); - prt_printf(out, "clean\t%u:%u\n", - j->space[journal_space_clean].next_entry, - j->space[journal_space_clean].total); - prt_printf(out, "total\t%u:%u\n", - j->space[journal_space_total].next_entry, - j->space[journal_space_total].total); - printbuf_indent_sub(out, 2); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->mi.durability) - continue; - - struct journal_device *ja = &ca->journal; - - if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) - continue; - - if (!ja->nr) - continue; - - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "durability %u:\n", ca->mi.durability); - printbuf_indent_add(out, 2); - prt_printf(out, "nr\t%u\n", ja->nr); - prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); - printbuf_indent_sub(out, 2); - } - - prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - - --out->atomic; -} - -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -{ - spin_lock(&j->lock); - __bch2_journal_debug_to_text(out, j); - spin_unlock(&j->lock); -} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h deleted file mode 100644 index 977907038d98d0..00000000000000 --- a/fs/bcachefs/journal.h +++ /dev/null @@ -1,465 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_H -#define _BCACHEFS_JOURNAL_H - -/* - * THE JOURNAL: - * - * The primary purpose of the journal is to log updates (insertions) to the - * b-tree, to avoid having to do synchronous updates to the b-tree on disk. - * - * Without the journal, the b-tree is always internally consistent on - * disk - and in fact, in the earliest incarnations bcache didn't have a journal - * but did handle unclean shutdowns by doing all index updates synchronously - * (with coalescing). 
- * - * Updates to interior nodes still happen synchronously and without the journal - * (for simplicity) - this may change eventually but updates to interior nodes - * are rare enough it's not a huge priority. - * - * This means the journal is relatively separate from the b-tree; it consists of - * just a list of keys and journal replay consists of just redoing those - * insertions in the same order that they appear in the journal. - * - * PERSISTENCE: - * - * For synchronous updates (where we're waiting on the index update to hit - * disk), the journal entry will be written out immediately (or as soon as - * possible, if the write for the previous journal entry was still in flight). - * - * Synchronous updates are specified by passing a closure (@flush_cl) to - * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter - * down to the journalling code. That closure will wait on the journal write to - * complete (via closure_wait()). - * - * If the index update wasn't synchronous, the journal entry will be - * written out after 10 ms have elapsed, by default (the delay_ms field - * in struct journal). - * - * JOURNAL ENTRIES: - * - * A journal entry is variable size (struct jset), it's got a fixed length - * header and then a variable number of struct jset_entry entries. - * - * Journal entries are identified by monotonically increasing 64 bit sequence - * numbers - jset->seq; other places in the code refer to this sequence number. - * - * A jset_entry entry contains one or more bkeys (which is what gets inserted - * into the b-tree). We need a container to indicate which b-tree the key is - * for; also, the roots of the various b-trees are stored in jset_entry entries - * (one for each b-tree) - this lets us add new b-tree types without changing - * the on disk format. - * - * We also keep some things in the journal header that are logically part of the - * superblock - all the things that are frequently updated. This is for future - * bcache on raw flash support; the superblock (which will become another - * journal) can't be moved or wear leveled, so it contains just enough - * information to find the main journal, and the superblock only has to be - * rewritten when we want to move/wear level the main journal. - * - * JOURNAL LAYOUT ON DISK: - * - * The journal is written to a ringbuffer of buckets (which is kept in the - * superblock); the individual buckets are not necessarily contiguous on disk - * which means that journal entries are not allowed to span buckets, but also - * that we can resize the journal at runtime if desired (unimplemented). - * - * The journal buckets exist in the same pool as all the other buckets that are - * managed by the allocator and garbage collection - garbage collection marks - * the journal buckets as metadata buckets. - * - * OPEN/DIRTY JOURNAL ENTRIES: - * - * Open/dirty journal entries are journal entries that contain b-tree updates - * that have not yet been written out to the b-tree on disk. We have to track - * which journal entries are dirty, and we also have to avoid wrapping around - * the journal and overwriting old but still dirty journal entries with new - * journal entries. - * - * On disk, this is represented with the "last_seq" field of struct jset; - * last_seq is the first sequence number that journal replay has to replay. - * - * To avoid overwriting dirty journal entries on disk, we keep a mapping (in - * journal_device->bucket_seq) from each journal bucket to the highest sequence - * number of any journal entry it contains. 
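 * (Roughly, as an illustrative restatement rather than code quoted from the
 * tree: bucket i still holds dirty entries while bucket_seq[i] >=
 * journal_last_seq().)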
Then, by comparing that against last_seq we - * can determine whether that journal bucket contains dirty journal entries or - * not. - * - * To track which journal entries are dirty, we maintain a fifo of refcounts - * (where each entry corresponds to a specific sequence number) - when a ref - * goes to 0, that journal entry is no longer dirty. - * - * Journalling of index updates is done at the same time as the b-tree itself is - * being modified (see btree_insert_key()); when we add the key to the journal - * the pending b-tree write takes a ref on the journal entry the key was added - * to. If a pending b-tree write would need to take refs on multiple dirty - * journal entries, it only keeps the ref on the oldest one (since a newer - * journal entry will still be replayed if an older entry was dirty). - * - * JOURNAL FILLING UP: - * - * There are two ways the journal could fill up; either we could run out of - * space to write to, or we could have too many open journal entries and run out - * of room in the fifo of refcounts. Since those refcounts are decremented - * without any locking we can't safely resize that fifo, so we handle it the - * same way. - * - * If the journal fills up, we start flushing dirty btree nodes until we can - * allocate space for a journal write again - preferentially flushing btree - * nodes that are pinning the oldest journal entries first. - */ - -#include - -#include "journal_types.h" - -struct bch_fs; - -static inline void journal_wake(struct journal *j) -{ - wake_up(&j->wait); - closure_wake_up(&j->async_wait); -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - return atomic64_read(&j->seq); -} - -static inline u64 journal_last_unwritten_seq(struct journal *j) -{ - return j->seq_ondisk + 1; -} - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - unsigned idx = (journal_cur_seq(j) & - JOURNAL_BUF_MASK & - ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; - - return j->buf + idx; -} - -static inline int journal_state_count(union journal_res_state s, int idx) -{ - switch (idx) { - case 0: return s.buf0_count; - case 1: return s.buf1_count; - case 2: return s.buf2_count; - case 3: return s.buf3_count; - } - BUG(); -} - -static inline int journal_state_seq_count(struct journal *j, - union journal_res_state s, u64 seq) -{ - if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) - return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); - else - return 0; -} - -static inline void journal_state_inc(union journal_res_state *s) -{ - s->buf0_count += s->idx == 0; - s->buf1_count += s->idx == 1; - s->buf2_count += s->idx == 2; - s->buf3_count += s->idx == 3; -} - -/* - * Amount of space that will be taken up by some keys in the journal (i.e. 
- * including the jset header) - */ -static inline unsigned jset_u64s(unsigned u64s) -{ - return u64s + sizeof(struct jset_entry) / sizeof(u64); -} - -static inline int journal_entry_overhead(struct journal *j) -{ - return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -} - -static inline struct jset_entry * -bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -{ - struct jset *jset = buf->data; - struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); - - memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(u64s); - - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); - - return entry; -} - -static inline struct jset_entry * -journal_res_entry(struct journal *j, struct journal_res *res) -{ - return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); -} - -static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - unsigned u64s) -{ - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; - entry->pad[0] = 0; - entry->pad[1] = 0; - entry->pad[2] = 0; - return jset_u64s(u64s); -} - -static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, - enum btree_id id, unsigned level, - const void *data, unsigned u64s) -{ - unsigned ret = journal_entry_init(entry, type, id, level, u64s); - - memcpy_u64s_small(entry->_data, data, u64s); - return ret; -} - -static inline struct jset_entry * -bch2_journal_add_entry(struct journal *j, struct journal_res *res, - unsigned type, enum btree_id id, - unsigned level, unsigned u64s) -{ - struct jset_entry *entry = journal_res_entry(j, res); - unsigned actual = journal_entry_init(entry, type, id, level, u64s); - - EBUG_ON(!res->ref); - EBUG_ON(actual > res->u64s); - - res->offset += actual; - res->u64s -= actual; - return entry; -} - -static inline bool journal_entry_empty(struct jset *j) -{ - if (j->seq != j->last_seq) - return false; - - vstruct_for_each(j, i) - if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) - return false; - return true; -} - -/* - * Drop reference on a buffer index and return true if the count has hit zero. - */ -static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) -{ - union journal_res_state s; - - s.v = atomic64_sub_return(((union journal_res_state) { - .buf0_count = idx == 0, - .buf1_count = idx == 1, - .buf2_count = idx == 2, - .buf3_count = idx == 3, - }).v, &j->reservations.counter); - return s; -} - -bool bch2_journal_entry_close(struct journal *); -void bch2_journal_do_writes(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64); - -static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq); -} - -static inline void bch2_journal_buf_put(struct journal *j, u64 seq) -{ - unsigned idx = seq & JOURNAL_STATE_BUF_MASK; - union journal_res_state s; - - s = journal_state_buf_put(j, idx); - if (!journal_state_count(s, idx)) { - spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq); - spin_unlock(&j->lock); - } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) - wake_up(&j->wait); -} - -/* - * This function releases the journal write structure so other threads can - * then proceed to add their keys as well. 
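The space accounting in jset_u64s() is simple arithmetic: a reservation pays for its key u64s plus the jset_entry header, which packs into a single u64 (sizeof(struct jset_entry) / sizeof(u64) evaluates to 1 in the on-disk format). A standalone sketch of that calculation:

```c
#include <stdio.h>

/* Assumed here: the jset_entry header occupies exactly one u64. */
#define JSET_ENTRY_HEADER_U64S 1

static unsigned toy_jset_u64s(unsigned u64s)
{
	/* key payload plus the per-entry header */
	return u64s + JSET_ENTRY_HEADER_U64S;
}

int main(void)
{
	/* e.g. a bkey occupying 5 u64s costs 6 u64s (48 bytes) of journal space: */
	unsigned key_u64s = 5;

	printf("key u64s %u -> journal u64s %u (%u bytes)\n",
	       key_u64s, toy_jset_u64s(key_u64s), toy_jset_u64s(key_u64s) * 8);
	return 0;
}
```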
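The buffer reference counting here follows the usual last-put-does-the-work pattern: journal_state_buf_put() drops a per-buffer count, and the caller that observes the count hitting zero performs the final put. A toy userspace model of that pattern (C11 atomics, illustrative names, not the packed journal_res_state encoding the real code uses):

```c
#include <stdatomic.h>
#include <stdio.h>

/* One ref is held while the journal entry is open for new reservations: */
static atomic_uint buf_count = 1;

static void toy_buf_put(void)
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&buf_count, 1) == 1)
		printf("last ref dropped: buffer may now be written out\n");
	else
		printf("ref dropped, buffer still in use\n");
}

int main(void)
{
	atomic_fetch_add(&buf_count, 1);	/* a writer takes a ref */
	toy_buf_put();				/* the writer finishes */
	toy_buf_put();				/* entry closed: count hits zero */
	return 0;
}
```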
- */ -static inline void bch2_journal_res_put(struct journal *j, - struct journal_res *res) -{ - if (!res->ref) - return; - - lock_release(&j->res_map, _THIS_IP_); - - while (res->u64s) - bch2_journal_add_entry(j, res, - BCH_JSET_ENTRY_btree_keys, - 0, 0, 0); - - bch2_journal_buf_put(j, res->seq); - - res->ref = 0; -} - -int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, struct btree_trans *); - -/* First bits for BCH_WATERMARK: */ -enum journal_res_flags { - __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, - __JOURNAL_RES_GET_CHECK, -}; - -#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) -#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) - -static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res, - unsigned flags) -{ - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - - /* - * Check if there is still room in the current journal - * entry, smp_rmb() guarantees that reads from reservations.counter - * occur before accessing cur_entry_u64s: - */ - smp_rmb(); - if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) - return 0; - - EBUG_ON(!journal_state_count(new, new.idx)); - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) - return 0; - - new.cur_entry_offset += res->u64s; - journal_state_inc(&new); - - /* - * If the refcount would overflow, we have to wait: - * XXX - tracepoint this: - */ - if (!journal_state_count(new, new.idx)) - return 0; - - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - - res->ref = true; - res->offset = old.cur_entry_offset; - res->seq = journal_cur_seq(j); - res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; - return 1; -} - -static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s, unsigned flags, - struct btree_trans *trans) -{ - int ret; - - EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); - - res->u64s = u64s; - - if (journal_res_get_fast(j, res, flags)) - goto out; - - ret = bch2_journal_res_get_slowpath(j, res, flags, trans); - if (ret) - return ret; -out: - if (!(flags & JOURNAL_RES_GET_CHECK)) { - lock_acquire_shared(&j->res_map, 0, - (flags & JOURNAL_RES_GET_NONBLOCK) != 0, - NULL, _THIS_IP_); - EBUG_ON(!res->ref); - BUG_ON(!res->seq); - } - return 0; -} - -/* journal_entry_res: */ - -void bch2_journal_entry_res_resize(struct journal *, - struct journal_entry_res *, - unsigned); - -int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -void bch2_journal_flush_async(struct journal *, struct closure *); - -int bch2_journal_flush_seq(struct journal *, u64, unsigned); -int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64, u64); -int bch2_journal_meta(struct journal *); - -void bch2_journal_halt_locked(struct journal *); -void bch2_journal_halt(struct journal *); - -static inline int bch2_journal_error(struct journal *j) -{ - return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? 
-BCH_ERR_journal_shutdown : 0; -} - -struct bch_dev; - -void bch2_journal_unblock(struct journal *); -void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); - -void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); -void bch2_journal_debug_to_text(struct printbuf *, struct journal *); - -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); -int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); - -int bch2_dev_journal_alloc(struct bch_dev *, bool); -int bch2_fs_journal_alloc(struct bch_fs *); - -void bch2_dev_journal_stop(struct journal *, struct bch_dev *); - -void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64, u64); -void bch2_journal_set_replay_done(struct journal *); - -void bch2_dev_journal_exit(struct bch_dev *); -int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -void bch2_fs_journal_exit(struct journal *); -void bch2_fs_journal_init_early(struct journal *); -int bch2_fs_journal_init(struct journal *); - -#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c deleted file mode 100644 index 9e028dbcc3d02d..00000000000000 --- a/fs/bcachefs/journal_io.c +++ /dev/null @@ -1,2242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_io.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "checksum.h" -#include "disk_groups.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "replicas.h" -#include "sb-clean.h" -#include "trace.h" - -#include -#include -#include - -void bch2_journal_pos_from_member_info_set(struct bch_fs *c) -{ - lockdep_assert_held(&c->sb_lock); - - for_each_member_device(c, ca) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); - m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); - } -} - -void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - - unsigned idx = le32_to_cpu(m.last_journal_bucket); - if (idx < ca->journal.nr) - ca->journal.cur_idx = idx; - unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); - if (offset <= ca->mi.bucket_size) - ca->journal.sectors_free = ca->mi.bucket_size - offset; - } - mutex_unlock(&c->sb_lock); -} - -static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); - prt_printf(out, "%s %u:%u:%u (sector %llu)", - ca ? 
ca->name : "(invalid dev)", - p->dev, p->bucket, p->bucket_offset, p->sector); - bch2_dev_put(ca); -} - -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) -{ - darray_for_each(j->ptrs, i) { - if (i != j->ptrs.data) - prt_printf(out, " "); - bch2_journal_ptr_to_text(out, c, i); - } -} - -static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) -{ - for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); - break; - } -} - -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(out, &j->j); - prt_char(out, ' '); - bch2_journal_ptrs_to_text(out, c, j); -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) -{ - if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { - *csum = (struct bch_csum) {}; - return false; - } - - *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); - return !bch2_crc_cmp(j->csum, *csum); -} - -static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) -{ - return (seq - c->journal_entries_base_seq) & (~0U >> 1); -} - -static void __journal_replay_free(struct bch_fs *c, - struct journal_replay *i) -{ - struct journal_replay **p = - genradix_ptr(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); - - BUG_ON(*p != i); - *p = NULL; - kvfree(i); -} - -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) -{ - if (blacklisted) - i->ignore_blacklisted = true; - else - i->ignore_not_dirty = true; - - if (!c->opts.read_entire_journal) - __journal_replay_free(c, i); -} - -struct journal_list { - struct closure cl; - u64 last_seq; - struct mutex lock; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_ptr entry_ptr, - struct journal_list *jlist, struct jset *j) -{ - struct genradix_iter iter; - struct journal_replay **_i, *i, *dup; - size_t bytes = vstruct_bytes(j); - u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; - struct printbuf buf = PRINTBUF; - int ret = JOURNAL_ENTRY_ADD_OK; - - if (last_seq && c->opts.journal_rewind) - last_seq = min(last_seq, c->opts.journal_rewind); - - if (!c->journal.oldest_seq_found_ondisk || - le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) - c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); - - /* Is this entry older than the range we need? */ - if (!c->opts.read_entire_journal && - le64_to_cpu(j->seq) < jlist->last_seq) - return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - - /* - * genradixes are indexed by a ulong, not a u64, so we can't index them - * by sequence number directly: Assume instead that they will all fall - * within the range of +-2 billion of the first one we find.
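A standalone sketch of that windowing: a 64-bit sequence number is mapped to a 31-bit radix index relative to a base chosen near the first entry found (the real code picks the base as max_t(s64, 1, seq - S32_MAX); names here are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

static unsigned toy_radix_idx(uint64_t base_seq, uint64_t seq)
{
	/* truncate to 32 bits, then mask down to a 31-bit index */
	return (uint32_t) (seq - base_seq) & (~0U >> 1);
}

int main(void)
{
	uint64_t first_seq = 1234567890123ULL;
	/* pick the base so earlier-but-nearby entries still map in range: */
	uint64_t base_seq = first_seq - INT32_MAX;

	printf("idx of first seq: %u\n", toy_radix_idx(base_seq, first_seq));
	printf("idx of next seq:  %u\n", toy_radix_idx(base_seq, first_seq + 1));
	return 0;
}
```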
- */ - if (!c->journal_entries_base_seq) - c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); - - /* Drop entries we don't need anymore */ - if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { - genradix_for_each_from(&c->journal_entries, iter, _i, - journal_entry_radix_idx(c, jlist->last_seq)) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (le64_to_cpu(i->j.seq) >= last_seq) - break; - - journal_replay_free(c, i, false); - } - } - - jlist->last_seq = max(jlist->last_seq, last_seq); - - _i = genradix_ptr_alloc(&c->journal_entries, - journal_entry_radix_idx(c, le64_to_cpu(j->seq)), - GFP_KERNEL); - if (!_i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - /* - * Duplicate journal entries? If so we want the one that didn't have a - * checksum error: - */ - dup = *_i; - if (dup) { - bool identical = bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes); - bool not_identical = !identical && - entry_ptr.csum_good && - dup->csum_good; - - bool same_device = false; - darray_for_each(dup->ptrs, ptr) - if (ptr->dev == ca->dev_idx) - same_device = true; - - ret = darray_push(&dup->ptrs, entry_ptr); - if (ret) - goto out; - - bch2_journal_replay_to_text(&buf, c, dup); - - fsck_err_on(same_device, - c, journal_entry_dup_same_device, - "duplicate journal entry on same device\n%s", - buf.buf); - - fsck_err_on(not_identical, - c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries\n%s", - buf.buf); - - if (entry_ptr.csum_good && !identical) - goto replace; - - goto out; - } -replace: - i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) - return bch_err_throw(c, ENOMEM_journal_entry_add); - - darray_init(&i->ptrs); - i->csum_good = entry_ptr.csum_good; - i->ignore_blacklisted = false; - i->ignore_not_dirty = false; - unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - - if (dup) { - /* The first ptr should represent the jset we kept: */ - darray_for_each(dup->ptrs, ptr) - darray_push(&i->ptrs, *ptr); - __journal_replay_free(c, dup); - } else { - darray_push(&i->ptrs, entry_ptr); - } - - *_i = i; -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* this fills in a range with empty jset_entries: */ -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) - memset(entry, 0, sizeof(*entry)); -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -static void journal_entry_err_msg(struct printbuf *out, - u32 version, - struct jset *jset, - struct jset_entry *entry) -{ - prt_str(out, "invalid journal entry, version="); - bch2_version_to_text(out, version); - - if (entry) { - prt_str(out, " type="); - bch2_prt_jset_entry_type(out, entry->type); - } - - if (!jset) { - prt_printf(out, " in superblock"); - } else { - - prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); - - if (entry) - prt_printf(out, " offset=%zi/%u", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - } - - prt_str(out, ": "); -} - -#define journal_entry_err(c, version, jset, entry, _err, msg, ...) 
\ -({ \ - struct printbuf _buf = PRINTBUF; \ - \ - journal_entry_err_msg(&_buf, version, jset, entry); \ - prt_printf(&_buf, msg, ##__VA_ARGS__); \ - \ - switch (from.flags & BCH_VALIDATE_write) { \ - case READ: \ - mustfix_fsck_err(c, _err, "%s", _buf.buf); \ - break; \ - case WRITE: \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ - if (bch2_fs_inconsistent(c, \ - "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = bch_err_throw(c, fsck_errors_not_fixed); \ - goto fsck_err; \ - } \ - break; \ - } \ - \ - printbuf_exit(&_buf); \ - true; \ -}) - -#define journal_entry_err_on(cond, ...) \ - ((cond) ? journal_entry_err(__VA_ARGS__) : false) - -#define FSCK_DELETED_KEY 5 - -static int journal_validate_key(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - struct bkey_i *k, - struct bkey_validate_context from, - unsigned version, int big_endian) -{ - enum bch_validate_flags flags = from.flags; - int write = flags & BCH_VALIDATE_write; - void *next = vstruct_next(entry); - int ret = 0; - - if (journal_entry_err_on(!k->k.u64s, - c, version, jset, entry, - journal_entry_bkey_u64s_0, - "k->u64s 0")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), - c, version, jset, entry, - journal_entry_bkey_past_end, - "extends past end of journal entry")) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, version, jset, entry, - journal_entry_bkey_bad_format, - "bad format %u", k->k.format)) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - - if (!write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); - if (ret == -BCH_ERR_fsck_delete_bkey) { - le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return FSCK_DELETED_KEY; - } - if (ret) - goto fsck_err; - - if (write) - bch2_bkey_compat(from.level, from.btree, version, big_endian, - write, NULL, bkey_to_packed(k)); -fsck_err: - return ret; -} - -static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - - from.level = entry->level; - from.btree = entry->btree_id; - - while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - continue; - else if (ret) - return ret; - - k = bkey_next(k); - } - - return 0; -} - -static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bool first = true; - - jset_entry_for_each_key(entry, k) { - /* We may be called on entries that haven't been validated: */ - if (!k->k.u64s) - break; - - if (!first) { - prt_newline(out); - bch2_prt_jset_entry_type(out, entry->type); - prt_str(out, ": "); - } - bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); - prt_char(out, ' '); - 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); - first = false; - } -} - -static int journal_entry_btree_root_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct bkey_i *k = entry->start; - int ret = 0; - - from.root = true; - from.level = entry->level + 1; - from.btree = entry->btree_id; - - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, - c, version, jset, entry, - journal_entry_btree_root_bad_size, - "invalid btree root journal entry: wrong number of keys")) { - void *next = vstruct_next(entry); - /* - * we don't want to null out this jset_entry, - * just the contents, so that later we can tell - * we were _supposed_ to have a btree root - */ - entry->u64s = 0; - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); - if (ret == FSCK_DELETED_KEY) - ret = 0; -fsck_err: - return ret; -} - -static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - /* obsolete, don't care: */ - return 0; -} - -static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ -} - -static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, version, jset, entry, - journal_entry_blacklist_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -fsck_err: - return ret; -} - -static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist *bl = - container_of(entry, struct jset_entry_blacklist, entry); - - prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); -} - -static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_blacklist_v2 *bl_entry; - int ret = 0; - - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, version, jset, entry, - journal_entry_blacklist_v2_bad_size, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - - if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), - c, version, jset, entry, - journal_entry_blacklist_v2_start_past_end, - "invalid journal seq blacklist entry: start > end")) { - journal_entry_null_range(entry, vstruct_next(entry)); - } -out: -fsck_err: - return ret; -} - -static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_blacklist_v2 *bl = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - prt_printf(out, "start=%llu end=%llu", - le64_to_cpu(bl->start), - le64_to_cpu(bl->end)); -} - -static int 
journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u), - c, version, jset, entry, - journal_entry_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - prt_str(out, "type="); - bch2_prt_fs_usage_type(out, u->entry.btree_id); - prt_printf(out, " v=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - struct printbuf err = PRINTBUF; - int ret = 0; - - if (journal_entry_err_on(bytes < sizeof(*u) || - bytes < sizeof(*u) + u->r.nr_devs, - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } - - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), - c, version, jset, entry, - journal_entry_data_usage_bad_size, - "invalid journal entry usage: %s", err.buf)) { - journal_entry_null_range(entry, vstruct_next(entry)); - goto out; - } -out: -fsck_err: - printbuf_exit(&err); - return ret; -} - -static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); - - bch2_replicas_entry_to_text(out, &u->r); - prt_printf(out, "=%llu", le64_to_cpu(u->v)); -} - -static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - int ret = 0; - - if (journal_entry_err_on(bytes != sizeof(*clock), - c, version, jset, entry, - journal_entry_clock_bad_size, - "bad size")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(clock->rw > 1, - c, version, jset, entry, - journal_entry_clock_bad_rw, - "bad rw")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); -} - -static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - struct jset_entry_dev_usage *u = - 
container_of(entry, struct jset_entry_dev_usage, entry); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned expected = sizeof(*u); - int ret = 0; - - if (journal_entry_err_on(bytes < expected, - c, version, jset, entry, - journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - - if (journal_entry_err_on(u->pad, - c, version, jset, entry, - journal_entry_dev_usage_bad_pad, - "bad pad")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - -fsck_err: - return ret; -} - -static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_dev_usage *u = - container_of(entry, struct jset_entry_dev_usage, entry); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - if (vstruct_bytes(entry) < sizeof(*u)) - return; - - prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); - - printbuf_indent_add(out, 2); - for (i = 0; i < nr_types; i++) { - prt_newline(out); - bch2_prt_data_type(out, i); - prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", - le64_to_cpu(u->d[i].buckets), - le64_to_cpu(u->d[i].sectors), - le64_to_cpu(u->d[i].fragmented)); - } - printbuf_indent_sub(out, 2); -} - -static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return 0; -} - -static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - - prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); -} - -static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_log_bkey_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - from.flags = 0; - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, from); -} - -static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - -static int journal_entry_datetime_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - unsigned bytes = vstruct_bytes(entry); - unsigned expected = 16; - int ret = 0; - - if (journal_entry_err_on(vstruct_bytes(entry) < expected, - c, version, jset, entry, - 
journal_entry_dev_usage_bad_size, - "bad size (%u < %u)", - bytes, expected)) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } -fsck_err: - return ret; -} - -static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - struct jset_entry_datetime *datetime = - container_of(entry, struct jset_entry_datetime, entry); - - bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); -} - -struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); - void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); -}; - -static const struct jset_entry_ops bch2_jset_entry_ops[] = { -#define x(f, nr) \ - [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_##f##_validate, \ - .to_text = journal_entry_##f##_to_text, \ - }, - BCH_JSET_ENTRY_TYPES() -#undef x -}; - -int bch2_journal_entry_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - struct bkey_validate_context from) -{ - return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, from) - : 0; -} - -void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - bch2_prt_jset_entry_type(out, entry->type); - - if (entry->type < BCH_JSET_ENTRY_NR) { - prt_str(out, ": "); - bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } -} - -static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - - unsigned version = le32_to_cpu(jset->version); - int ret = 0; - - vstruct_for_each(jset, entry) { - from.journal_offset = (u64 *) entry - jset->_data; - - if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), - c, version, jset, entry, - journal_entry_past_jset_end, - "journal entry extends past end of jset")) { - jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); - break; - } - - ret = bch2_journal_entry_validate(c, jset, entry, version, - JSET_BIG_ENDIAN(jset), from); - if (ret) - break; - } -fsck_err: - return ret; -} - -static int jset_validate(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - enum bch_validate_flags flags) -{ - struct bkey_validate_context from = { - .flags = flags, - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: incompatible journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, version, jset, NULL, - jset_unknown_csum, - "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - ret = JOURNAL_ENTRY_BAD; - - /* last_seq is ignored when JSET_NO_FLUSH is true */ - if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, version, jset, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(jset->last_seq), - le64_to_cpu(jset->seq))) { - jset->last_seq = jset->seq; - return JOURNAL_ENTRY_BAD; - } - - ret = jset_validate_entries(c, jset, flags); -fsck_err: - return ret; -} - -static int jset_validate_early(struct bch_fs *c, - struct bch_dev *ca, - struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read) -{ - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(jset->seq), - }; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - unsigned version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, - jset_unsupported_version, - "%s sector %llu seq %llu: unknown journal entry version %u.%u", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), - BCH_VERSION_MAJOR(version), - BCH_VERSION_MINOR(version))) { - /* don't try to continue: */ - return -EINVAL; - } - - size_t bytes = vstruct_bytes(jset); - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, version, jset, NULL, - jset_past_bucket_end, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - /* the bios are sized for this many pages, max: */ - if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - new_size = roundup_pow_of_two(new_size); - n = kvmalloc(new_size, GFP_KERNEL); - if (!n) - return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); - - kvfree(b->data); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false, csum_good; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { - struct bio *bio; - unsigned nr_bvecs; -reread: - sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - nr_bvecs = buf_pages(buf->data, sectors_read << 9); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); - - bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, buf->data, sectors_read << 9); - - u64 submit_time = local_clock(); - ret = submit_bio_wait(bio); - kfree(bio); - - if (!ret && bch2_meta_read_fault("journal")) - ret = bch_err_throw(c, 
EIO_fault_injected); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, - submit_time, !ret); - - if (ret) { - bch_err_dev_ratelimited(ca, - "journal read error: sector %llu", offset); - /* - * We don't error out of the recovery process - * here, since the relevant journal entry may be - * found on a different device, and missing or - * no journal entries will be handled later - */ - return 0; - } - - j = buf->data; - } - - ret = jset_validate_early(c, ca, j, offset, - end - offset, sectors_read); - switch (ret) { - case 0: - sectors = vstruct_sectors(j, c->block_bits); - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(c, buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - /* - * On checksum error we don't really trust the size - * field of the journal entry we read, so try reading - * again at next block boundary: - */ - sectors = block_sectors(c); - goto next_block; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > ja->highest_seq_found) { - ja->highest_seq_found = le64_to_cpu(j->seq); - ja->cur_idx = bucket; - ja->sectors_free = ca->mi.bucket_size - - bucket_remainder(ca, offset) - sectors; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - struct bch_csum csum; - csum_good = jset_csum_good(c, j, &csum); - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); - - if (!csum_good) { - /* - * Don't print an error here, we'll print the error - * later if we need this journal entry - */ - saw_bad = true; - } - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), - j->encrypted_start, - vstruct_end(j) - (void *) j->encrypted_start); - bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); - - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct journal_ptr) { - .csum_good = csum_good, - .csum = csum, - .dev = ca->dev_idx, - .bucket = bucket, - .bucket_offset = offset - - bucket_to_sector(ca, ja->buckets[bucket]), - .sector = offset, - }, jlist, j); - mutex_unlock(&jlist->lock); - - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static CLOSURE_CALLBACK(bch2_journal_read_device) -{ - closure_type(ja, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct bch_fs *c = ca->fs; - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct journal_read_buf buf = { NULL, 0 }; - unsigned i; - int ret = 0; - - if (!ja->nr) - goto out; - - ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - for (i = 0; i < ja->nr; i++) { - ret = journal_read_bucket(ca, &buf, jlist, i); - if (ret) - goto err; - } - - /* - * Set dirty_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->discard_idx = ja->dirty_idx_ondisk = - ja->dirty_idx = (ja->cur_idx + 1) % 
ja->nr; -out: - bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvfree(buf.data); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); - closure_return(cl); - return; -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -} - -noinline_for_stack -static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) -{ - struct printbuf buf = PRINTBUF; - enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); - bool have_good = false; - - prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); - bch2_journal_datetime_to_text(&buf, &j->j); - prt_newline(&buf); - - darray_for_each(j->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_ptr_to_text(&buf, c, ptr); - prt_char(&buf, ' '); - bch2_csum_to_text(&buf, csum_type, ptr->csum); - prt_newline(&buf); - } else { - have_good = true; - } - - prt_printf(&buf, "should be "); - bch2_csum_to_text(&buf, csum_type, j->j.csum); - - if (have_good) - prt_printf(&buf, "\n(had good copy on another device)"); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - -noinline_for_stack -static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - struct genradix_iter radix_iter; - struct journal_replay *i, **_i, *prev = NULL; - u64 seq = start_seq; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - u64 missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - u64 missing_end = seq - 1; - - printbuf_reset(&buf); - prt_printf(&buf, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - missing_start, missing_end, - start_seq, end_seq); - - prt_printf(&buf, "\nprev at "); - if (prev) { - bch2_journal_ptrs_to_text(&buf, c, prev); - prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf, "(none)"); - - prt_printf(&buf, "\nnext at "); - bch2_journal_ptrs_to_text(&buf, c, i); - prt_printf(&buf, ", continue?"); - - fsck_err(c, journal_entries_missing, "%s", buf.buf); - } - - prev = i; - seq++; - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_journal_read(struct bch_fs *c, - u64 *last_seq, - u64 *blacklist_seq, - u64 *start_seq) -{ - struct journal_list jlist; - struct journal_replay *i, **_i; - struct genradix_iter radix_iter; - struct printbuf buf = PRINTBUF; - bool degraded = false, last_write_torn = false; - u64 seq; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.last_seq = 0; - jlist.ret = 0; - - for_each_member_device(c, ca) { - if (!c->opts.fsck && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) - continue; - - if ((ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro) && - enumerated_ref_tryget(&ca->io_ref[READ], - BCH_DEV_READ_REF_journal_read)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_unbound_wq, - &jlist.cl); - else - degraded = true; - } - - while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) - ; - - if (jlist.ret) - return jlist.ret; - - *last_seq = 0; - *start_seq = 0; - *blacklist_seq = 0; - - /* - * Find most recent flush entry, and ignore newer non flush entries - - * those entries will be blacklisted: - */ - genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - if (!*start_seq) - *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; - - if (JSET_NO_FLUSH(&i->j)) { - i->ignore_blacklisted = true; - continue; - } - - if (!last_write_torn && !i->csum_good) { - last_write_torn = true; - i->ignore_blacklisted = true; - continue; - } - - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .journal_seq = le64_to_cpu(i->j.seq), - }; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, le32_to_cpu(i->j.version), &i->j, NULL, - jset_last_seq_newer_than_seq, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq))) - i->j.last_seq = i->j.seq; - - *last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; - break; - } - - if (!*start_seq) { - bch_info(c, "journal read done, but no entries found"); - return 0; - } - - if (!*last_seq) { - fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, - "journal read done, but no entries found after dropping non-flushes"); - return 0; - } - - printbuf_reset(&buf); - prt_printf(&buf, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); - - /* - * Drop blacklisted entries and entries older than last_seq (or start of - * journal rewind: - */ - u64 drop_before = *last_seq; - if (c->opts.journal_rewind) { - drop_before = min(drop_before, c->opts.journal_rewind); - prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); - } - - *last_seq = drop_before; - if (*start_seq != *blacklist_seq) - prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); - bch_info(c, "%s", buf.buf); - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if 
(journal_replay_ignore(i)) - continue; - - seq = le64_to_cpu(i->j.seq); - if (seq < drop_before) { - journal_replay_free(c, i, false); - continue; - } - - if (bch2_journal_seq_is_blacklisted(c, seq, true)) { - fsck_err_on(!JSET_NO_FLUSH(&i->j), c, - jset_seq_blacklisted, - "found blacklisted journal entry %llu", seq); - i->ignore_blacklisted = true; - } - } - - ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); - if (ret) - goto err; - - genradix_for_each(&c->journal_entries, radix_iter, _i) { - union bch_replicas_padded replicas = { - .e.data_type = BCH_DATA_journal, - .e.nr_devs = 0, - .e.nr_required = 1, - }; - - i = *_i; - if (journal_replay_ignore(i)) - continue; - - /* - * Don't print checksum errors until we know we're going to use - * a given journal entry: - */ - darray_for_each(i->ptrs, ptr) - if (!ptr->csum_good) { - bch2_journal_print_checksum_error(c, i); - break; - } - - ret = jset_validate(c, - bch2_dev_have_ref(c, i->ptrs.data[0].dev), - &i->j, - i->ptrs.data[0].sector, - READ); - if (ret) - goto err; - - darray_for_each(i->ptrs, ptr) - replicas_entry_add_dev(&replicas.e, ptr->dev); - - bch2_replicas_entry_sort(&replicas.e); - - printbuf_reset(&buf); - bch2_replicas_entry_to_text(&buf, &replicas.e); - - if (!degraded && - !bch2_replicas_marked(c, &replicas.e) && - (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n%s", - le64_to_cpu(i->j.seq), buf.buf))) { - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - } - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* journal write: */ - -static void journal_advance_devs_to_next_bucket(struct journal *j, - struct dev_alloc_list *devs, - unsigned sectors, __le64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - guard(rcu)(); - darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); - } - } -} - -static void __journal_write_alloc(struct journal *j, - struct journal_buf *w, - struct dev_alloc_list *devs, - unsigned sectors, - unsigned *replicas, - unsigned replicas_want) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - darray_for_each(*devs, i) { - struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, - BCH_DEV_WRITE_REF_journal_write); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_rw || - !ja->nr || - bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) { - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - continue; - } - - bch2_dev_stripe_increment(ca, &j->wp.stripe); - - bch2_bkey_append_ptr(&w->key, - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]) + - ca->mi.bucket_size - - ja->sectors_free, - .dev = ca->dev_idx, - }); - - ja->sectors_free -= sectors; - ja->bucket_seq[ja->cur_idx] = 
le64_to_cpu(w->data->seq); - - *replicas += ca->mi.durability; - - if (*replicas >= replicas_want) - break; - } -} - -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned *replicas) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_devs_mask devs; - struct dev_alloc_list devs_sorted; - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - unsigned target = c->opts.metadata_target ?: - c->opts.foreground_target; - unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); - unsigned replicas_need = min_t(unsigned, replicas_want, - READ_ONCE(c->opts.metadata_replicas_required)); - bool advance_done = false; - -retry_target: - devs = target_rw_devs(c, BCH_DATA_journal, target); - bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); -retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - - if (likely(*replicas >= replicas_want)) - goto done; - - if (!advance_done) { - journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); - advance_done = true; - goto retry_alloc; - } - - if (*replicas < replicas_want && target) { - /* Retry from all devices: */ - target = 0; - advance_done = false; - goto retry_target; - } -done: - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - -#if 0 - /* - * XXX: we need a way to alert the user when we go degraded for any - * reason - */ - if (*replicas < min(replicas_want, - dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { - } -#endif - - return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; -} - -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; - - if (buf->buf_size >= new_size) - return; - - size_t btree_write_buffer_size = new_size / 64; - - if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) - return; - - new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); - if (!new_buf) - return; - - memcpy(new_buf, buf->data, buf->buf_size); - - spin_lock(&j->lock); - swap(buf->data, new_buf); - swap(buf->buf_size, new_size); - spin_unlock(&j->lock); - - kvfree(new_buf); -} - -static CLOSURE_CALLBACK(journal_write_done) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - u64 seq = le64_to_cpu(w->data->seq); - int err = 0; - - bch2_time_stats_update(!JSET_NO_FLUSH(w->data) - ? 
j->flush_write_time - : j->noflush_write_time, j->write_start_time); - - if (!w->devs_written.nr) { - err = bch_err_throw(c, journal_write_err); - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - err = bch2_mark_replicas(c, &replicas.e); - } - - if (err && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - if (err == -BCH_ERR_journal_write_err) - prt_printf(&buf, "unable to write journal to sufficient devices\n"); - else - prt_printf(&buf, "journal write error marking replicas: %s\n", - bch2_err_str(err)); - - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - - closure_debug_destroy(cl); - - spin_lock(&j->lock); - if (seq >= j->pin.front) - journal_seq_pin(j, seq)->devs = w->devs_written; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; - w->write_done = true; - - if (!j->free_buf || j->free_buf_size < w->buf_size) { - swap(j->free_buf, w->data); - swap(j->free_buf_size, w->buf_size); - } - - if (w->data) { - void *buf = w->data; - w->data = NULL; - w->buf_size = 0; - - spin_unlock(&j->lock); - kvfree(buf); - spin_lock(&j->lock); - } - - bool completed = false; - bool do_discards = false; - - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - w = j->buf + (seq & JOURNAL_BUF_MASK); - if (!w->write_done) - break; - - if (!j->err_seq && !w->noflush) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - - closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); - do_discards = true; - } - - j->seq_ondisk = seq; - - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); - - closure_wake_up(&w->wait); - completed = true; - } - - if (completed) { - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); - - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - - journal_wake(j); - } - - if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { - struct journal_buf *buf = journal_cur_buf(j); - long delta = buf->expires - jiffies; - - /* - * We don't close a journal entry to write it while there's - * previous entries still in flight - the current journal entry - * might want to be written now: - */ - mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); - } - - /* - * We don't typically trigger journal writes from here - the next journal - * write will be triggered immediately after the previous one is - * allocated, in bch2_journal_write() - but the journal write error path - * is special: - */ - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - if (do_discards) - bch2_do_discards(c); -} - -static void journal_write_endio(struct bio *bio) -{ - struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); - struct bch_dev *ca = jbio->ca; - struct journal *j = &ca->fs->journal; - struct journal_buf *w = j->buf + jbio->buf_idx; - - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, - jbio->submit_time, !bio->bi_status); - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, - "error writing journal entry %llu: %s", - le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)); - - unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); -
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&w->io); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); -} - -static CLOSURE_CALLBACK(journal_write_submit) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned sectors = vstruct_sectors(w->data, c->block_bits); - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - struct journal_device *ja = &ca->journal; - struct journal_bio *jbio = ja->bio[w->idx]; - struct bio *bio = &jbio->bio; - - jbio->submit_time = local_clock(); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); - - BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); - ca->prev_journal_sector = bio->bi_iter.bi_sector; - - if (!JSET_NO_FLUSH(w->data)) - bio->bi_opf |= REQ_FUA; - if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) - bio->bi_opf |= REQ_PREFLUSH; - - bch2_bio_map(bio, w->data, sectors << 9); - - trace_and_count(c, journal_write, bio); - closure_bio_submit(bio, cl); - - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } - - continue_at(cl, journal_write_done, j->wq); -} - -static CLOSURE_CALLBACK(journal_write_preflush) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - /* - * Wait for previous journal writes to complete; they won't necessarily - * be flushed if they're still in flight - */ - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - spin_lock(&j->lock); - if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { - closure_wait(&j->async_wait, cl); - spin_unlock(&j->lock); - continue_at(cl, journal_write_preflush, j->wq); - return; - } - spin_unlock(&j->lock); - } - - if (w->separate_flush) { - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { - enumerated_ref_get(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_journal_write); - - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; - bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - - continue_at(cl, journal_write_submit, j->wq); - } else { - /* - * no need to punt to another work item if we're not waiting on - * preflushes - */ - journal_write_submit(&cl->work); - } -} - -static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end; - struct jset *jset = w->data; - struct journal_keys_to_wb wb = { NULL }; - unsigned u64s; - unsigned long btree_roots_have = 0; - u64 seq = le64_to_cpu(jset->seq); - int ret; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be.
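In journal_write_submit() above, the block-layer flags are chosen per write: flush writes get REQ_FUA, and also REQ_PREFLUSH unless a separate flush bio was already issued. A toy restatement of that selection, with stand-in flag values rather than the real REQ_* constants:

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the block-layer flags: */
#define TOY_REQ_FUA	 (1 << 0)	/* this write durable before completion */
#define TOY_REQ_PREFLUSH (1 << 1)	/* flush earlier writes first */

static unsigned journal_write_opf(bool no_flush, bool separate_flush)
{
	unsigned opf = 0;

	if (!no_flush)
		opf |= TOY_REQ_FUA;
	if (!no_flush && !separate_flush)
		opf |= TOY_REQ_PREFLUSH;
	return opf;
}

int main(void)
{
	printf("flush write, inline flush:   %#x\n", journal_write_opf(false, false));
	printf("flush write, separate flush: %#x\n", journal_write_opf(false, true));
	printf("noflush write:               %#x\n", journal_write_opf(true, false));
	return 0;
}
```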
- * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each(jset, i) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* - * New btree roots are set by journalling them; when the journal - * entry gets written we have to propagate them to - * c->btree_roots - * - * But, every journal entry we write has to contain all the - * btree roots (at least for now); so after we copy btree roots - * to c->btree_roots we have to get any missing btree roots and - * add them to this journal entry: - */ - switch (i->type) { - case BCH_JSET_ENTRY_btree_root: - bch2_journal_entry_to_btree_root(c, i); - __set_bit(i->btree_id, &btree_roots_have); - break; - case BCH_JSET_ENTRY_write_buffer_keys: - EBUG_ON(!w->need_flush_to_write_buffer); - - if (!wb.wb) - bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - - jset_entry_for_each_key(i, k) { - ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); - if (ret) { - bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - bch2_journal_keys_to_write_buffer_end(c, &wb); - return ret; - } - } - i->type = BCH_JSET_ENTRY_btree_keys; - break; - } - } - - if (wb.wb) { - ret = bch2_journal_keys_to_write_buffer_end(c, &wb); - if (ret) { - bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", - bch2_err_str(ret)); - return ret; - } - } - - spin_lock(&c->journal.lock); - w->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); - - start = end = vstruct_last(jset); - - end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - - struct jset_entry_datetime *d = - container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); - d->entry.type = BCH_JSET_ENTRY_datetime; - d->seconds = cpu_to_le64(ktime_get_real_seconds()); - - bch2_journal_super_entries_add_common(c, &end, seq); - u64s = (u64 *) end - (u64 *) start; - - WARN_ON(u64s > j->entry_u64s_reserved); - - le32_add_cpu(&jset->u64s, u64s); - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - - if (sectors > w->sectors) { - bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", - vstruct_bytes(jset), w->sectors << 9, - u64s, w->u64s_reserved, j->entry_u64s_reserved); - return -EINVAL; - } - - return 0; -} - -static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset *jset = w->data; - u64 seq = le64_to_cpu(jset->seq); - bool validate_before_checksum = false; - int ret = 0; - - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(c->sb.version); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - - if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = seq; - - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) - validate_before_checksum = true; - - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) - validate_before_checksum = true; - - if (validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) - return ret; - - 
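The compaction walk above iterates variable-length jset entries by their u64s count and simply skips empty ones. A standalone sketch of that style of walk, with illustrative types rather than the real jset layout (where the count excludes the shared header fields):

/* Sketch: entries are a u64s count plus payload; zero-length
 * entries are skipped during the walk. */
#include <stdint.h>
#include <stdio.h>

struct entry {
	uint16_t u64s;    /* payload size in u64s (simplified) */
	uint16_t type;
	uint64_t data[];
};

static struct entry *entry_next(struct entry *e)
{
	return (struct entry *)((uint64_t *)(e + 1) + e->u64s);
}

int main(void)
{
	uint64_t buf[32] = {0};
	struct entry *e = (struct entry *)buf;

	e->u64s = 2; e->type = 1;          /* one real entry      */
	e = entry_next(e);
	e->u64s = 0; e->type = 0;          /* empty: gets skipped */
	e = entry_next(e);
	e->u64s = 1; e->type = 1;

	struct entry *end = entry_next(e);
	for (e = (struct entry *)buf; e < end; e = entry_next(e))
		if (e->u64s)
			printf("entry type %u, %u u64s\n", e->type, e->u64s);
	return 0;
}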
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - if (!validate_before_checksum && - (ret = jset_validate(c, NULL, jset, 0, WRITE))) - return ret; - - unsigned sectors = vstruct_sectors(jset, c->block_bits); - unsigned bytes = vstruct_bytes(jset); - memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - return 0; -} - -static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int error = bch2_journal_error(j); - - /* - * If the journal is in an error state - we did an emergency shutdown - - * we prefer to continue doing journal writes. We just mark them as - * noflush so they'll never be used, but they'll still be visible to the - * list_journal tool - this helps in debugging. - * - * There's a caveat: the first journal write after marking the - * superblock dirty must always be a flush write, because on startup - * from a clean shutdown we didn't necessarily read the journal and the - * new journal write might overwrite whatever was in the journal - * previously - we can't leave the journal without any flush writes in - * it. - * - * So if we're in an error state, and we're still starting up, we don't - * write anything at all. - */ - if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return error; - - if (error || - w->noflush || - (!w->must_flush && - time_before(jiffies, j->last_flush_write + - msecs_to_jiffies(c->opts.journal_flush_delay)) && - test_bit(JOURNAL_may_skip_flush, &j->flags))) { - w->noflush = true; - SET_JSET_NO_FLUSH(w->data, true); - w->data->last_seq = 0; - w->last_seq = 0; - - j->nr_noflush_writes++; - } else { - w->must_flush = true; - j->last_flush_write = jiffies; - j->nr_flush_writes++; - clear_bit(JOURNAL_need_flush_write, &j->flags); - } - - return 0; -} - -CLOSURE_CALLBACK(bch2_journal_write) -{ - closure_type(w, struct journal_buf, io); - struct journal *j = container_of(w, struct journal, buf[w->idx]); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - union bch_replicas_padded replicas; - unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); - int ret; - - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - BUG_ON(!w->write_started); - BUG_ON(w->write_allocated); - BUG_ON(w->write_done); - - j->write_start_time = local_clock(); - - spin_lock(&j->lock); - if (nr_rw_members > 1) - w->separate_flush = true; - - ret = bch2_journal_write_pick_flush(j, w); - spin_unlock(&j->lock); - - if (unlikely(ret)) - goto err; - - mutex_lock(&j->buf_lock); - journal_buf_realloc(j, w); - - ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); - - if (unlikely(ret)) - goto err; - - unsigned replicas_allocated = 0; - while (1) { - ret = journal_write_alloc(j, w, &replicas_allocated); - if (!ret || !j->can_discard) - break; - - bch2_journal_do_discards(j); - } - - if (unlikely(ret)) - goto err_allocate_write; - - ret = bch2_journal_write_checksum(j, w); - if (unlikely(ret)) - goto err; - - spin_lock(&j->lock); - /* - * write is allocated, no longer need to account for it in - * bch2_journal_space_available(): - */ - w->sectors = 0; - w->write_allocated = true; - j->entry_bytes_written += vstruct_bytes(w->data); - - /* - * journal entry has been compacted and allocated, recalculate space - * available: - */ - bch2_journal_space_available(j); - bch2_journal_do_writes(j); - spin_unlock(&j->lock); - - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - - /* - * Mark journal replicas before we submit the write
to guarantee - * recovery will find the journal entries after a crash. - */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - ret = bch2_mark_replicas(c, &replicas.e); - if (ret) - goto err; - - if (c->opts.nochanges) - goto no_io; - - if (!JSET_NO_FLUSH(w->data)) - continue_at(cl, journal_write_preflush, j->wq); - else - continue_at(cl, journal_write_submit, j->wq); - return; -err_allocate_write: - if (!bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - - bch2_journal_debug_to_text(&buf, j); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } -err: - bch2_fatal_error(c); -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); - } - - continue_at(cl, journal_write_done, j->wq); -} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h deleted file mode 100644 index 6fa82c4050fea1..00000000000000 --- a/fs/bcachefs/journal_io.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_IO_H -#define _BCACHEFS_JOURNAL_IO_H - -#include "darray.h" - -void bch2_journal_pos_from_member_info_set(struct bch_fs *); -void bch2_journal_pos_from_member_info_resume(struct bch_fs *); - -struct journal_ptr { - bool csum_good; - struct bch_csum csum; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; -}; - -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; - - bool csum_good; - bool ignore_blacklisted; - bool ignore_not_dirty; - /* must be last: */ - struct jset j; -}; - -static inline bool journal_replay_ignore(struct journal_replay *i) -{ - return !i || i->ignore_blacklisted || i->ignore_not_dirty; -} - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (entry->type == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (struct jset_entry *entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define jset_entry_for_each_key(_e, _k) \ - for (struct bkey_i *_k = (_e)->start; \ - _k < vstruct_last(_e); \ - _k = bkey_next(_k)) - -#define for_each_jset_key(k, entry, jset) \ - for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ - jset_entry_for_each_key(entry, k) - -int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, - struct bkey_validate_context); -void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, - struct jset_entry *); - -void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct journal_replay *); - -int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); - -CLOSURE_CALLBACK(bch2_journal_write); - -static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of 
data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c deleted file mode 100644 index 0042d43b8e57f8..00000000000000 --- a/fs/bcachefs/journal_reclaim.c +++ /dev/null @@ -1,1037 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "errcode.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "replicas.h" -#include "sb-members.h" -#include "trace.h" - -#include <linux/kthread.h> -#include <linux/sched/mm.h> - -static bool __should_discard_bucket(struct journal *, struct journal_device *); - -/* Free space calculations: */ - -static unsigned journal_space_from(struct journal_device *ja, - enum journal_space_from from) -{ - switch (from) { - case journal_space_discarded: - return ja->discard_idx; - case journal_space_clean_ondisk: - return ja->dirty_idx_ondisk; - case journal_space_clean: - return ja->dirty_idx; - default: - BUG(); - } -} - -unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja, - enum journal_space_from from) -{ - if (!ja->nr) - return 0; - - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (available && ja->dirty_idx_ondisk == ja->dirty_idx) - --available; - - return available; -} - -void bch2_journal_set_watermark(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool low_on_space = j->space[journal_space_clean].total * 4 <= - j->space[journal_space_total].total; - bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - bool low_on_wb = bch2_btree_write_buffer_must_wait(c); - unsigned watermark = low_on_space || low_on_pin || low_on_wb - ?
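bch2_journal_dev_buckets_available above is ring arithmetic: count the buckets between the write cursor and the chosen reclaim cursor modulo the ring size, holding one bucket back. A worked standalone example with made-up indices (the real code only holds the last bucket back conditionally):

#include <stdio.h>

static unsigned buckets_available(unsigned free_idx, unsigned cur_idx,
				  unsigned nr)
{
	unsigned available = (free_idx - cur_idx - 1 + nr) % nr;

	/* don't hand out the very last bucket */
	return available ? available - 1 : 0;
}

int main(void)
{
	printf("%u\n", buckets_available(2, 6, 8)); /* wraps around: 3 -> 2 */
	printf("%u\n", buckets_available(7, 6, 8)); /* adjacent: 0          */
	return 0;
}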
BCH_WATERMARK_reclaim - : BCH_WATERMARK_stripe; - - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) - trace_and_count(c, journal_full, c); - - mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static struct journal_space -journal_dev_space_available(struct journal *j, struct bch_dev *ca, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja = &ca->journal; - unsigned sectors, buckets, unwritten; - unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); - u64 seq; - - if (from == journal_space_total) - return (struct journal_space) { - .next_entry = bucket_size_aligned, - .total = bucket_size_aligned * ja->nr, - }; - - buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = round_down(ja->sectors_free, block_sectors(c)); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j); - seq++) { - unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; - - if (!unwritten) - continue; - - /* entry won't fit on this device, skip: */ - if (unwritten > bucket_size_aligned) - continue; - - if (unwritten >= sectors) { - if (!buckets) { - sectors = 0; - break; - } - - buckets--; - sectors = bucket_size_aligned; - } - - sectors -= unwritten; - } - - if (sectors < ca->mi.bucket_size && buckets) { - buckets--; - sectors = bucket_size_aligned; - } - - return (struct journal_space) { - .next_entry = sectors, - .total = sectors + buckets * bucket_size_aligned, - }; -} - -static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, - enum journal_space_from from) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned pos, nr_devs = 0; - struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; - unsigned min_bucket_size = U32_MAX; - - BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr || - !ca->mi.durability) - continue; - - min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); - - space = journal_dev_space_available(j, ca, from); - if (!space.next_entry) - continue; - - for (pos = 0; pos < nr_devs; pos++) - if (space.total > dev_space[pos].total) - break; - - array_insert_item(dev_space, nr_devs, pos, space); - } - - if (nr_devs < nr_devs_want) - return (struct journal_space) { 0, 0 }; - - /* - * It's possible for bucket size to be misaligned w.r.t.
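The watermark selection completed above is driven by two simple thresholds: clean space at or below a quarter of the total, or the pin FIFO more than three-quarters full. A worked example with illustrative numbers:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned clean = 200, total = 1000;        /* sectors (made up)  */
	unsigned fifo_free = 100, fifo_size = 512;

	bool low_on_space = clean * 4 <= total;        /* 800 <= 1000: yes */
	bool low_on_pin   = fifo_free < fifo_size / 4; /* 100 < 128:   yes */

	printf("watermark = %s\n",
	       low_on_space || low_on_pin ? "reclaim" : "stripe");
	return 0;
}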
the filesystem - * block size: - */ - min_bucket_size = round_down(min_bucket_size, block_sectors(c)); - - /* - * We sorted largest to smallest, and we want the smallest out of the - * @nr_devs_want largest devices: - */ - space = dev_space[nr_devs_want - 1]; - space.next_entry = min(space.next_entry, min_bucket_size); - return space; -} - -void bch2_journal_space_available(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned clean, clean_ondisk, total; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); - unsigned nr_online = 0, nr_devs_want; - bool can_discard = false; - int ret = 0; - - lockdep_assert_held(&j->lock); - guard(rcu)(); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (ja->dirty_idx != ja->cur_idx && - ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - - while (ja->dirty_idx_ondisk != ja->dirty_idx && - ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - - can_discard |= __should_discard_bucket(j, ja); - - max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); - nr_online++; - } - - j->can_discard = can_discard; - - if (nr_online < metadata_replicas_required(c)) { - if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = bch_err_throw(c, insufficient_journal_devices); - goto out; - } - - nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - - for (unsigned i = 0; i < journal_space_nr; i++) - j->space[i] = __journal_space_available(j, nr_devs_want, i); - - clean_ondisk = j->space[journal_space_clean_ondisk].total; - clean = j->space[journal_space_clean].total; - total = j->space[journal_space_total].total; - - if (!j->space[journal_space_discarded].next_entry) - ret = bch_err_throw(c, journal_full); - - if ((j->space[journal_space_clean_ondisk].next_entry < - j->space[journal_space_clean_ondisk].total) && - (clean - clean_ondisk <= total / 8) && - (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_may_skip_flush, &j->flags); - else - clear_bit(JOURNAL_may_skip_flush, &j->flags); - - bch2_journal_set_watermark(j); -out: - j->cur_entry_sectors = !ret - ? 
j->space[journal_space_discarded].next_entry - : 0; - j->cur_entry_error = ret; - - if (!ret) - journal_wake(j); -} - -/* Discards - last part of journal reclaim: */ - -static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - unsigned min_free = max(4, ja->nr / 8); - - return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < - min_free && - ja->discard_idx != ja->dirty_idx_ondisk; -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - spin_lock(&j->lock); - bool ret = __should_discard_bucket(j, ja); - spin_unlock(&j->lock); - - return ret; -} - -/* - * Advance ja->discard_idx as long as it points to buckets that are no longer - * dirty, issuing discards if necessary: - */ -void bch2_journal_do_discards(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - - mutex_lock(&j->discard_lock); - - for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { - struct journal_device *ja = &ca->journal; - - while (should_discard_bucket(j, ja)) { - if (!c->opts.nochanges && - bch2_discard_opt_enabled(c, ca) && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOFS); - - spin_lock(&j->lock); - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - - bch2_journal_space_available(j); - spin_unlock(&j->lock); - } - } - - mutex_unlock(&j->discard_lock); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, holding it open to ensure it gets replayed during recovery: - */ - -void bch2_journal_reclaim_fast(struct journal *j) -{ - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!fifo_empty(&j->pin) && - j->pin.front <= j->seq_ondisk && - !atomic_read(&fifo_peek_front(&j->pin).count)) { - j->pin.front++; - popped = true; - } - - if (popped) { - bch2_journal_space_available(j); - __closure_wake_up(&j->reclaim_flush_wait); - } -} - -bool __bch2_journal_pin_put(struct journal *j, u64 seq) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - return atomic_dec_and_test(&pin_list->count); -} - -void bch2_journal_pin_put(struct journal *j, u64 seq) -{ - if (__bch2_journal_pin_put(j, seq)) { - spin_lock(&j->lock); - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - } -} - -static inline bool __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list; - - if (!journal_pin_active(pin)) - return false; - - if (j->flush_in_progress == pin) - j->flush_in_progress_dropped = true; - - pin_list = journal_seq_pin(j, pin->seq); - pin->seq = 0; - list_del_init(&pin->list); - - if (j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - /* - * Unpinning a journal entry may make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - */ - return atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin); -} - -void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - spin_lock(&j->lock); - if (__journal_pin_drop(j, pin)) - bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); -} - -static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, - journal_pin_flush_fn fn) -{ - if (fn == 
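The pinning machinery above is a FIFO of per-sequence refcounts: the front (journal_last_seq) only advances while the front entry's count is zero, which is what bch2_journal_reclaim_fast does. A standalone sketch of that lifecycle (illustrative, not bcachefs code):

#include <stdio.h>

#define PIN_NR 8

static unsigned count[PIN_NR];
static unsigned long long front, back; /* [front, back) are live seqs */

static unsigned long long pin_add(void)
{
	count[back % PIN_NR]++;
	return back++;
}

static void pin_put(unsigned long long seq)
{
	count[seq % PIN_NR]--;
	while (front < back && !count[front % PIN_NR])
		front++;               /* reclaim_fast equivalent */
}

int main(void)
{
	unsigned long long a = pin_add(), b = pin_add(), c = pin_add();

	pin_put(b);                     /* out of order: front stays   */
	printf("last_seq=%llu\n", front); /* 0 */
	pin_put(a);                     /* front skips past a and b    */
	printf("last_seq=%llu\n", front); /* 2 */
	pin_put(c);
	printf("last_seq=%llu\n", front); /* 3 */
	return 0;
}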
bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) { - unsigned idx = fn == bch2_btree_node_flush1; - struct btree *b = container_of(pin, struct btree, writes[idx].journal); - - return JOURNAL_PIN_TYPE_btree0 - b->c.level; - } else if (fn == bch2_btree_key_cache_journal_flush) - return JOURNAL_PIN_TYPE_key_cache; - else - return JOURNAL_PIN_TYPE_other; -} - -static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn, - enum journal_pin_type type) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - /* - * flush_fn is how we identify journal pins in debugfs, so must always - * exist, even if it doesn't do anything: - */ - BUG_ON(!flush_fn); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - if (list_empty(&pin_list->unflushed[type]) && - j->reclaim_flush_wait.list.first) - __closure_wake_up(&j->reclaim_flush_wait); - - list_add(&pin->list, &pin_list->unflushed[type]); -} - -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - u64 seq = READ_ONCE(src->seq); - - if (seq < journal_last_seq(j)) { - /* - * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on - * the src pin - with the pin dropped, the entry to pin might no - * longer exist, but that means there's no longer anything to - * copy and we can bail out here: - */ - spin_unlock(&j->lock); - return; - } - - bool reclaim = __journal_pin_drop(j, dst); - - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_set(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - BUG_ON(seq < journal_last_seq(j)); - - bool reclaim = __journal_pin_drop(j, pin); - - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); - - if (reclaim) - bch2_journal_reclaim_fast(j); - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); - - spin_unlock(&j->lock); -} - -/** - * bch2_journal_pin_flush: ensure journal pin callback is no longer running - * @j: journal object - * @pin: pin to flush - */ -void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -{ - BUG_ON(journal_pin_active(pin)); - - wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -} - -/* - * Journal reclaim: flush references to open journal entries to reclaim space in - * the journal - * - * May be done by the journal code in the background as needed to free up space - * for more journal entries, or as part of doing a clean shutdown, or to migrate - * data off of a specific device: - */ - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret = NULL; - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > seq_to_flush && !allowed_above_seq) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if (((BIT(i) &
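journal_pin_type above buckets btree-node pins by node level, exploiting the enum order from journal_types.h (btree3 first, btree0 last) so that JOURNAL_PIN_TYPE_btree0 - level lands on the right member. A tiny sketch of the mapping, with the enum copied in and level clamping omitted:

#include <stdio.h>

enum pin_type { btree3, btree2, btree1, btree0, key_cache, other };

static enum pin_type btree_pin_type(unsigned level)
{
	return (enum pin_type)(btree0 - level); /* level 0 -> btree0 */
}

int main(void)
{
	printf("%d %d\n", btree_pin_type(0), btree_pin_type(2)); /* 3 1 */
	return 0;
}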
allowed_below_seq) && *seq <= seq_to_flush) || - (BIT(i) & allowed_above_seq)) { - ret = list_first_entry_or_null(&pin_list->unflushed[i], - struct journal_entry_pin, list); - if (ret) - return ret; - } - } - - return NULL; -} - -/* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, - u64 seq_to_flush, - unsigned allowed_below_seq, - unsigned allowed_above_seq, - unsigned min_any, - unsigned min_key_cache) -{ - struct journal_entry_pin *pin; - size_t nr_flushed = 0; - journal_pin_flush_fn flush_fn; - u64 seq; - int err; - - lockdep_assert_held(&j->reclaim_lock); - - while (1) { - unsigned allowed_above = allowed_above_seq; - unsigned allowed_below = allowed_below_seq; - - if (min_any) { - allowed_above |= ~0; - allowed_below |= ~0; - } - - if (min_key_cache) { - allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); - allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); - } - - cond_resched(); - - j->last_flushed = jiffies; - - spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, - allowed_below, - allowed_above, &seq); - if (pin) { - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; - j->flush_in_progress_dropped = false; - flush_fn = pin->flush; - } - spin_unlock(&j->lock); - - if (!pin) - break; - - if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) - min_key_cache--; - - if (min_any) - min_any--; - - err = flush_fn(j, pin, seq); - - spin_lock(&j->lock); - /* Pin might have been dropped or rearmed: */ - if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); - j->flush_in_progress = NULL; - j->flush_in_progress_dropped = false; - spin_unlock(&j->lock); - - wake_up(&j->pin_flush_wait); - - if (err) - break; - - nr_flushed++; - } - - return nr_flushed; -} - -static u64 journal_seq_to_flush(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - u64 seq_to_flush = 0; - - guard(spinlock)(&j->lock); - guard(rcu)(); - - for_each_rw_member_rcu(c, ca) { - struct journal_device *ja = &ca->journal; - unsigned nr_buckets, bucket_to_flush; - - if (!ja->nr) - continue; - - /* Try to keep the journal at most half full: */ - nr_buckets = ja->nr / 2; - - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; - seq_to_flush = max(seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - } - - /* Also flush if the pin fifo is more than half full */ - return max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); -} - -/** - * __bch2_journal_reclaim - free up journal buckets - * @j: journal object - * @direct: direct or background reclaim? - * @kicked: requested to run since we last ran? - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
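journal_seq_to_flush above targets keeping each device's journal at most half full: it looks half the ring ahead of the write cursor and flushes everything pinned at or before that bucket's sequence number. A worked example with made-up bucket sequence numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long bucket_seq[8] = { 40, 44, 48, 52, 10, 14, 20, 30 };
	unsigned nr = 8, cur_idx = 3;

	unsigned bucket_to_flush = (cur_idx + nr / 2) % nr; /* 7 */

	/* flushing everything pinned at or before seq 30 frees up the
	 * older half of the ring */
	printf("seq_to_flush=%llu\n", bucket_seq[bucket_to_flush]);
	return 0;
}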
- */ -static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_cache *bc = &c->btree_cache; - bool kthread = (current->flags & PF_KTHREAD) != 0; - u64 seq_to_flush; - size_t min_nr, min_key_cache, nr_flushed; - unsigned flags; - int ret = 0; - - /* - * We can't invoke memory reclaim while holding the reclaim_lock - - * journal reclaim is required to make progress for memory reclaim - * (cleaning the caches), so we can't get stuck in memory reclaim while - * we're holding the reclaim lock: - */ - lockdep_assert_held(&j->reclaim_lock); - flags = memalloc_noreclaim_save(); - - do { - if (kthread && kthread_should_stop()) - break; - - ret = bch2_journal_error(j); - if (ret) - break; - - /* XXX shove journal discards off to another thread */ - bch2_journal_do_discards(j); - - seq_to_flush = journal_seq_to_flush(j); - min_nr = 0; - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(c->opts.journal_reclaim_delay))) - min_nr = 1; - - if (j->watermark != BCH_WATERMARK_stripe) - min_nr = 1; - - size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; - if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) - min_nr = 1; - - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - - trace_and_count(c, journal_reclaim_start, c, - direct, kicked, - min_nr, min_key_cache, - atomic_long_read(&bc->nr_dirty), btree_cache_live, - atomic_long_read(&c->btree_key_cache.nr_dirty), - atomic_long_read(&c->btree_key_cache.nr_keys)); - - nr_flushed = journal_flush_pins(j, seq_to_flush, - ~0, 0, - min_nr, min_key_cache); - - if (direct) - j->nr_direct_reclaim += nr_flushed; - else - j->nr_background_reclaim += nr_flushed; - trace_and_count(c, journal_reclaim_finish, c, nr_flushed); - - if (nr_flushed) - wake_up(&j->reclaim_wait); - } while ((min_nr || min_key_cache) && nr_flushed && !direct); - - memalloc_noreclaim_restore(flags); - - return ret; -} - -int bch2_journal_reclaim(struct journal *j) -{ - return __bch2_journal_reclaim(j, true, true); -} - -static int bch2_journal_reclaim_thread(void *arg) -{ - struct journal *j = arg; - struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned long delay, now; - bool journal_empty; - int ret = 0; - - set_freezable(); - - j->last_flushed = jiffies; - - while (!ret && !kthread_should_stop()) { - bool kicked = j->reclaim_kicked; - - j->reclaim_kicked = false; - - mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false, kicked); - mutex_unlock(&j->reclaim_lock); - - now = jiffies; - delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); - j->next_reclaim = j->last_flushed + delay; - - if (!time_in_range(j->next_reclaim, now, now + delay)) - j->next_reclaim = now + delay; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - if (kthread_should_stop()) - break; - if (j->reclaim_kicked) - break; - - spin_lock(&j->lock); - journal_empty = fifo_empty(&j->pin); - spin_unlock(&j->lock); - - long timeout = j->next_reclaim - jiffies; - - if (journal_empty) - schedule(); - else if (timeout > 0) - schedule_timeout(timeout); - else - break; - } - __set_current_state(TASK_RUNNING); - } - - return 0; -} - -void bch2_journal_reclaim_stop(struct journal *j) -{ - struct task_struct *p = j->reclaim_thread; - - j->reclaim_thread = NULL; - - if (p) { - kthread_stop(p); - 
put_task_struct(p); - } -} - -int bch2_journal_reclaim_start(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct task_struct *p; - int ret; - - if (j->reclaim_thread) - return 0; - - p = kthread_create(bch2_journal_reclaim_thread, j, - "bch-reclaim/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating journal reclaim thread"); - if (ret) - return ret; - - get_task_struct(p); - j->reclaim_thread = p; - wake_up_process(p); - return 0; -} - -static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - struct journal_entry_pin_list *pin_list; - u64 seq; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { - if (seq > seq_to_flush) - break; - - for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) - if ((BIT(i) & types) && - (!list_empty(&pin_list->unflushed[i]) || - !list_empty(&pin_list->flushed[i]))) { - spin_unlock(&j->lock); - return true; - } - } - spin_unlock(&j->lock); - - return false; -} - -static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, - unsigned types) -{ - return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || - journal_pins_still_flushing(j, seq_to_flush, types); -} - -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - bool *did_work) -{ - int ret = 0; - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&j->reclaim_lock); - - for (int type = JOURNAL_PIN_TYPE_NR - 1; - type >= 0; - --type) - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { - *did_work = true; - goto unlock; - } - - if (seq_to_flush > journal_cur_seq(j)) - bch2_journal_entry_close(j); - - spin_lock(&j->lock); - /* - * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers - */ - ret = !test_bit(JOURNAL_replay_done, &j->flags) || - journal_last_seq(j) > seq_to_flush || - !fifo_used(&j->pin); - - spin_unlock(&j->lock); -unlock: - mutex_unlock(&j->reclaim_lock); - - return ret; -} - -bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -{ - /* time_stats this */ - bool did_work = false; - - if (!test_bit(JOURNAL_running, &j->flags)) - return false; - - closure_wait_event(&j->reclaim_flush_wait, - journal_flush_done(j, seq_to_flush, &did_work)); - - return did_work; -} - -int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - u64 iter, seq = 0; - int ret = 0; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); - - bch2_journal_flush_pins(j, seq); - - ret = bch2_journal_error(j); - if (ret) - return ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - - /* - * Now that we've populated replicas_gc, write to the journal to mark - * active journal devices. This handles the case where the journal might - * be empty. Otherwise we could clear all journal replicas and - * temporarily put the fs into an unrecoverable state. Journal recovery - * expects to find devices marked for journal data on unclean mount. 
- */ - ret = bch2_journal_meta(&c->journal); - if (ret) - goto err; - - seq = 0; - spin_lock(&j->lock); - while (!ret) { - union bch_replicas_padded replicas; - - seq = max(seq, journal_last_seq(j)); - if (seq >= j->pin.back) - break; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - journal_seq_pin(j, seq)->devs); - seq++; - - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); - } - } - spin_unlock(&j->lock); -err: - ret = bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - -bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *pin; - - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); - return true; - } - - *seq = max(*seq, j->pin.front); - - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); - return true; - } - - out->atomic++; - - pin_list = journal_seq_pin(j, *seq); - - prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); - - prt_printf(out, "unflushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) - list_for_each_entry(pin, &pin_list->unflushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - prt_printf(out, "flushed:\n"); - for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) - list_for_each_entry(pin, &pin_list->flushed[i], list) - prt_printf(out, "\t%px %ps\n", pin, pin->flush); - - printbuf_indent_sub(out, 2); - - --out->atomic; - spin_unlock(&j->lock); - - return false; -} - -void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -{ - u64 seq = 0; - - while (!bch2_journal_seq_pins_to_text(out, j, &seq)) - seq++; -} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h deleted file mode 100644 index 0a73d7134e1cc6..00000000000000 --- a/fs/bcachefs/journal_reclaim.h +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -#define _BCACHEFS_JOURNAL_RECLAIM_H - -#define JOURNAL_PIN (32 * 1024) - -static inline void journal_reclaim_kick(struct journal *j) -{ - struct task_struct *p = READ_ONCE(j->reclaim_thread); - - j->reclaim_kicked = true; - if (p) - wake_up_process(p); -} - -unsigned bch2_journal_dev_buckets_available(struct journal *, - struct journal_device *, - enum journal_space_from); -void bch2_journal_set_watermark(struct journal *); -void bch2_journal_space_available(struct journal *); - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->seq != 0; -} - -static inline struct journal_entry_pin_list * -journal_seq_pin(struct journal *j, u64 seq) -{ - EBUG_ON(seq < j->pin.front || seq >= j->pin.back); - - return &j->pin.data[seq & j->pin.mask]; -} - -void bch2_journal_reclaim_fast(struct journal *); -bool __bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_put(struct journal *, u64); -void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); - -void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_copy(struct journal *, - struct 
journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) - bch2_journal_pin_set(j, seq, pin, flush_fn); -} - -void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); - -void bch2_journal_do_discards(struct journal *); -int bch2_journal_reclaim(struct journal *); - -void bch2_journal_reclaim_stop(struct journal *); -int bch2_journal_reclaim_start(struct journal *); - -bool bch2_journal_flush_pins(struct journal *, u64); - -static inline bool bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); -} - -int bch2_journal_flush_device_pins(struct journal *, int); - -void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); - -#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c deleted file mode 100644 index 0cb9b93f13e79f..00000000000000 --- a/fs/bcachefs/journal_sb.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "journal_sb.h" -#include "darray.h" - -#include <linux/sort.h> - -/* BCH_SB_FIELD_journal: */ - -static int u64_cmp(const void *_l, const void *_r) -{ - const u64 *l = _l; - const u64 *r = _r; - - return cmp_int(*l, *r); -} - -static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - unsigned nr; - unsigned i; - u64 *b; - - nr = bch2_nr_journal_buckets(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_validate; - - for (i = 0; i < nr; i++) - b[i] = le64_to_cpu(journal->buckets[i]); - - sort(b, nr, sizeof(u64), u64_cmp, NULL); - - if (!b[0]) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0] < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0], le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1], le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) { - prt_printf(err, "duplicate journal buckets %llu", b[i]); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal *journal = field_to_type(f, journal); - unsigned i, nr = bch2_nr_journal_buckets(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal = { - .validate = bch2_sb_journal_validate, - .to_text = bch2_sb_journal_to_text, -}; - -struct u64_range { - u64 start; - u64 end; -}; - -static int u64_range_cmp(const void *_l, const void *_r) -{ - const struct u64_range *l = _l; - const struct u64_range *r = _r; - - return cmp_int(l->start, r->start); -} - -static int bch2_sb_journal_v2_validate(struct
bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - int ret = -BCH_ERR_invalid_sb_journal; - u64 sum = 0; - unsigned nr; - unsigned i; - struct u64_range *b; - - nr = bch2_sb_field_journal_v2_nr_entries(journal); - if (!nr) - return 0; - - b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); - if (!b) - return -BCH_ERR_ENOMEM_sb_journal_v2_validate; - - for (i = 0; i < nr; i++) { - b[i].start = le64_to_cpu(journal->d[i].start); - b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); - - if (b[i].end <= b[i].start) { - prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].nr)); - goto err; - } - - sum += le64_to_cpu(journal->d[i].nr); - } - - sort(b, nr, sizeof(*b), u64_range_cmp, NULL); - - if (!b[0].start) { - prt_printf(err, "journal bucket at sector 0"); - goto err; - } - - if (b[0].start < le16_to_cpu(m.first_bucket)) { - prt_printf(err, "journal bucket %llu before first bucket %u", - b[0].start, le16_to_cpu(m.first_bucket)); - goto err; - } - - if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { - prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", - b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); - goto err; - } - - for (i = 0; i + 1 < nr; i++) { - if (b[i].end > b[i + 1].start) { - prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", - b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); - goto err; - } - } - - if (sum > UINT_MAX) { - prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); - goto err; - } - - ret = 0; -err: - kfree(b); - return ret; -} - -static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); - unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); - - prt_printf(out, "Buckets: "); - for (i = 0; i < nr; i++) - prt_printf(out, " %llu-%llu", - le64_to_cpu(journal->d[i].start), - le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { - .validate = bch2_sb_journal_v2_validate, - .to_text = bch2_sb_journal_v2_to_text, -}; - -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, - u64 *buckets, unsigned nr) -{ - struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr_compacted = 1; - - if (c) - lockdep_assert_held(&c->sb_lock); - - if (!nr) { - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); - return 0; - } - - for (i = 0; i + 1 < nr; i++) - if (buckets[i] + 1 != buckets[i + 1]) - nr_compacted++; - - j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, - (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); - if (!j) - return bch_err_throw(c, ENOSPC_sb_journal); - - bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - - j->d[dst].start = cpu_to_le64(buckets[0]); - j->d[dst].nr = cpu_to_le64(1); - - for (i = 1; i < nr; i++) { - if (buckets[i] == buckets[i - 1] + 1) { - le64_add_cpu(&j->d[dst].nr, 1); - } else { - dst++; - j->d[dst].start = cpu_to_le64(buckets[i]); - j->d[dst].nr = cpu_to_le64(1); - } - } - - BUG_ON(dst + 1 != nr_compacted); - return 0; -} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h deleted file mode 100644 
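bch2_journal_buckets_to_sb above run-length compacts a sorted bucket list into (start, nr) extents, which is what the journal_v2 superblock field stores. A standalone sketch of that compaction (plain arrays instead of the superblock field):

#include <stdio.h>

struct range { unsigned long long start, nr; };

static unsigned compact(const unsigned long long *b, unsigned nr,
			struct range *out)
{
	unsigned dst = 0;

	out[0] = (struct range) { b[0], 1 };
	for (unsigned i = 1; i < nr; i++) {
		if (b[i] == b[i - 1] + 1)
			out[dst].nr++;              /* extend current run */
		else
			out[++dst] = (struct range) { b[i], 1 };
	}
	return dst + 1;
}

int main(void)
{
	unsigned long long buckets[] = { 10, 11, 12, 40, 41, 100 };
	struct range r[6];
	unsigned n = compact(buckets, 6, r);

	for (unsigned i = 0; i < n; i++)
		printf("%llu+%llu\n", r[i].start, r[i].nr); /* 10+3 40+2 100+1 */
	return 0;
}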
index ba40a7e8d90a32..00000000000000 --- a/fs/bcachefs/journal_sb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include "super-io.h" -#include "vstructs.h" - -static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -{ - return j - ? (__le64 *) vstruct_end(&j->field) - j->buckets - : 0; -} - -static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) -{ - if (!j) - return 0; - - return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal; -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; - -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c deleted file mode 100644 index af4fe416d9ecac..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist.c +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "eytzinger.h" -#include "journal.h" -#include "journal_seq_blacklist.h" -#include "super-io.h" - -/* - * journal_seq_blacklist machinery: - * - * To guarantee order of btree updates after a crash, we need to detect when a - * btree node entry (bset) is newer than the newest journal entry that was - * successfully written, and ignore it - effectively ignoring any btree updates - * that didn't make it into the journal. - * - * If we didn't do this, we might have two btree nodes, a and b, both with - * updates that weren't written to the journal yet: if b was updated after a, - * but b was flushed and not a - oops; on recovery we'll find that the updates - * to b happened, but not the updates to a that happened before it. - * - * Ignoring bsets that are newer than the newest journal entry is always safe, - * because everything they contain will also have been journalled - and must - * still be present in the journal on disk until a journal entry has been - * written _after_ that bset was written. - * - * To accomplish this, bsets record the newest journal sequence number they - * contain updates for; then, on startup, the btree code queries the journal - * code to ask "Is this sequence number newer than the newest journal entry? If - * so, ignore it." - * - * When this happens, we must blacklist that journal sequence number: the - * journal must not write any entries with that sequence number, and it must - * record that it was blacklisted so that a) on recovery we don't think we have - * missing journal entries and b) so that the btree code continues to ignore - * that bset, until that btree node is rewritten. 
- */ - -static unsigned sb_blacklist_u64s(unsigned nr) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - - return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -} - -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -{ - struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i = 0, nr; - int ret = 0; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - nr = blacklist_nr_entries(bl); - - while (i < nr) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (end < le64_to_cpu(e->start)) - break; - - if (start > le64_to_cpu(e->end)) { - i++; - continue; - } - - /* - * Entry is contiguous or overlapping with new entry: merge it - * with new entry, and delete: - */ - - start = min(start, le64_to_cpu(e->start)); - end = max(end, le64_to_cpu(e->end)); - array_remove_item(bl->start, nr, i); - } - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr + 1)); - if (!bl) { - ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); - goto out; - } - - array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { - .start = cpu_to_le64(start), - .end = cpu_to_le64(end), - })); - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); - - ret = bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret ?: bch2_blacklist_table_initialize(c); -} - -static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) -{ - const struct journal_seq_blacklist_table_entry *l = _l; - const struct journal_seq_blacklist_table_entry *r = _r; - - return cmp_int(l->start, r->start); -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, - bool dirty) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - struct journal_seq_blacklist_table_entry search = { .start = seq }; - int idx; - - if (!t) - return false; - - idx = eytzinger0_find_le(t->entries, t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - &search); - if (idx < 0) - return false; - - BUG_ON(t->entries[idx].start > seq); - - if (seq >= t->entries[idx].end) - return false; - - if (dirty) - t->entries[idx].dirty = true; - return true; -} - -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) -{ - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - - if (!t || !t->nr) - return 0; - - return t->entries[eytzinger0_last(t->nr)].end - 1; -} - -int bch2_blacklist_table_initialize(struct bch_fs *c) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - struct journal_seq_blacklist_table *t; - unsigned i, nr = blacklist_nr_entries(bl); - - if (!bl) - return 0; - - t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); - if (!t) - return bch_err_throw(c, ENOMEM_blacklist_table_init); - - t->nr = nr; - - for (i = 0; i < nr; i++) { - t->entries[i].start = le64_to_cpu(bl->start[i].start); - t->entries[i].end = le64_to_cpu(bl->start[i].end); - } - - eytzinger0_sort(t->entries, - t->nr, - sizeof(t->entries[0]), - journal_seq_blacklist_table_cmp, - NULL); - - kfree(c->journal_seq_blacklist_table); - c->journal_seq_blacklist_table = t; - return 0; -} - -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - unsigned i, nr = blacklist_nr_entries(bl); - - for (i 
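bch2_journal_seq_blacklist_add above merges the new range with every existing entry it overlaps or abuts before inserting it. A standalone sketch of that merge-on-insert behaviour, with plain arrays instead of the superblock field and inclusive boundaries as above:

#include <stdio.h>

struct bl { unsigned long long start, end; };

static unsigned bl_add(struct bl *e, unsigned nr,
		       unsigned long long start, unsigned long long end)
{
	unsigned i = 0;

	while (i < nr) {
		if (end < e[i].start)
			break;              /* insert before entry i */
		if (start > e[i].end) {
			i++;
			continue;
		}
		/* overlapping or contiguous: absorb entry i and delete it */
		start = start < e[i].start ? start : e[i].start;
		end   = end   > e[i].end   ? end   : e[i].end;
		for (unsigned j = i; j + 1 < nr; j++)
			e[j] = e[j + 1];
		nr--;
	}

	for (unsigned j = nr; j > i; j--)
		e[j] = e[j - 1];
	e[i] = (struct bl) { start, end };
	return nr + 1;
}

int main(void)
{
	struct bl e[8] = { { 1, 3 }, { 10, 12 } };
	unsigned nr = bl_add(e, 2, 2, 11); /* bridges both entries */

	for (unsigned i = 0; i < nr; i++)
		printf("%llu-%llu\n", e[i].start, e[i].end); /* 1-12 */
	return 0;
}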
= 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = bl->start + i; - - if (le64_to_cpu(e->start) >= - le64_to_cpu(e->end)) { - prt_printf(err, "entry %u start >= end (%llu >= %llu)", - i, le64_to_cpu(e->start), le64_to_cpu(e->end)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - - if (i + 1 < nr && - le64_to_cpu(e[0].end) > - le64_to_cpu(e[1].start)) { - prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", - i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); - return -BCH_ERR_invalid_sb_journal_seq_blacklist; - } - } - - return 0; -} - -static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_journal_seq_blacklist *bl = - field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); - - for (i = bl->start; i < bl->start + nr; i++) { - if (i != bl->start) - prt_printf(out, " "); - - prt_printf(out, "%llu-%llu", - le64_to_cpu(i->start), - le64_to_cpu(i->end)); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { - .validate = bch2_sb_journal_seq_blacklist_validate, - .to_text = bch2_sb_journal_seq_blacklist_to_text -}; - -bool bch2_blacklist_entries_gc(struct bch_fs *c) -{ - struct journal_seq_blacklist_entry *src, *dst; - - struct bch_sb_field_journal_seq_blacklist *bl = - bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); - if (!bl) - return false; - - unsigned nr = blacklist_nr_entries(bl); - dst = bl->start; - - struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; - BUG_ON(nr != t->nr); - - src = bl->start; - eytzinger0_for_each(i, nr) { - BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); - BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - - if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) - *dst++ = *src; - src++; - } - - unsigned new_nr = dst - bl->start; - if (new_nr == nr) - return false; - - bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); - return true; -} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h deleted file mode 100644 index f06942ccfcddaa..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H - -static inline unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - -bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); -int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -int bch2_blacklist_table_initialize(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; - -bool bch2_blacklist_entries_gc(struct bch_fs *); - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h deleted file mode 100644 index 2566b12dbc045e..00000000000000 --- a/fs/bcachefs/journal_seq_blacklist_format.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H -#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H - -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h deleted file mode 100644 index 51104bbb99dae7..00000000000000 --- a/fs/bcachefs/journal_types.h +++ /dev/null @@ -1,342 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_JOURNAL_TYPES_H -#define _BCACHEFS_JOURNAL_TYPES_H - -#include <linux/cache.h> -#include <linux/workqueue.h> - -#include "alloc_types.h" -#include "super_types.h" -#include "fifo.h" - -/* btree write buffer steals 8 bits for its own purposes: */ -#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) - -#define JOURNAL_STATE_BUF_BITS 2 -#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) -#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) - -#define JOURNAL_BUF_BITS 4 -#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) -#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) - -/* - * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to - * the journal that are being staged or in flight. - */ -struct journal_buf { - struct closure io; - struct jset *data; - - __BKEY_PADDED(key, BCH_REPLICAS_MAX); - struct bch_devs_list devs_written; - - struct closure_waitlist wait; - u64 last_seq; /* copy of data->last_seq */ - long expires; - u64 flush_time; - - unsigned buf_size; /* size in bytes of @data */ - unsigned sectors; /* maximum size for current entry */ - unsigned disk_sectors; /* maximum size entry could have been, if - buf_size was bigger */ - unsigned u64s_reserved; - bool noflush:1; /* write has already been kicked off, and was noflush */ - bool must_flush:1; /* something wants a flush */ - bool separate_flush:1; - bool need_flush_to_write_buffer:1; - bool write_started:1; - bool write_allocated:1; - bool write_done:1; - u8 idx; -}; - -/* - * Something that makes a journal entry dirty - i.e.
a btree node that has to be
- * flushed:
- */
-
-enum journal_pin_type {
-	JOURNAL_PIN_TYPE_btree3,
-	JOURNAL_PIN_TYPE_btree2,
-	JOURNAL_PIN_TYPE_btree1,
-	JOURNAL_PIN_TYPE_btree0,
-	JOURNAL_PIN_TYPE_key_cache,
-	JOURNAL_PIN_TYPE_other,
-	JOURNAL_PIN_TYPE_NR,
-};
-
-struct journal_entry_pin_list {
-	struct list_head	unflushed[JOURNAL_PIN_TYPE_NR];
-	struct list_head	flushed[JOURNAL_PIN_TYPE_NR];
-	atomic_t		count;
-	struct bch_devs_list	devs;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef int (*journal_pin_flush_fn)(struct journal *j,
-				struct journal_entry_pin *, u64);
-
-struct journal_entry_pin {
-	struct list_head	list;
-	journal_pin_flush_fn	flush;
-	u64			seq;
-};
-
-struct journal_res {
-	bool			ref;
-	u16			u64s;
-	u32			offset;
-	u64			seq;
-};
-
-union journal_res_state {
-	struct {
-		atomic64_t	counter;
-	};
-
-	struct {
-		u64		v;
-	};
-
-	struct {
-		u64		cur_entry_offset:22,
-				idx:2,
-				buf0_count:10,
-				buf1_count:10,
-				buf2_count:10,
-				buf3_count:10;
-	};
-};
-
-/* bytes: */
-#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX		(4U << 22) /* 16M */
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- * note - cur_entry_offset is in units of u64s
- */
-#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 22) - 1)
-
-#define JOURNAL_ENTRY_BLOCKED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 2)
-#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)
-
-struct journal_space {
-	/* Units of 512-byte sectors: */
-	unsigned	next_entry; /* How big the next journal entry can be */
-	unsigned	total;
-};
-
-enum journal_space_from {
-	journal_space_discarded,
-	journal_space_clean_ondisk,
-	journal_space_clean,
-	journal_space_total,
-	journal_space_nr,
-};
-
-#define JOURNAL_FLAGS()		\
-	x(replay_done)		\
-	x(running)		\
-	x(may_skip_flush)	\
-	x(need_flush_write)	\
-	x(space_low)
-
-enum journal_flags {
-#define x(n)	JOURNAL_##n,
-	JOURNAL_FLAGS()
-#undef x
-};
-
-struct journal_bio {
-	struct bch_dev		*ca;
-	unsigned		buf_idx;
-	u64			submit_time;
-
-	struct bio		bio;
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
-	/* Fastpath stuff up front: */
-	struct {
-
-	union journal_res_state reservations;
-	enum bch_watermark	watermark;
-
-	} __aligned(SMP_CACHE_BYTES);
-
-	unsigned long		flags;
-
-	/* Max size of current journal entry */
-	unsigned		cur_entry_u64s;
-	unsigned		cur_entry_sectors;
-
-	/* Reserved space in journal entry to be used just prior to write */
-	unsigned		entry_u64s_reserved;
-
-
-	/*
-	 * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
-	 * insufficient devices:
-	 */
-	int			cur_entry_error;
-	unsigned		cur_entry_offset_if_blocked;
-
-	unsigned		buf_size_want;
-	/*
-	 * We may queue up some things to be journalled (log messages) before
-	 * the journal has actually started - stash them here:
-	 */
-	darray_u64		early_journal_entries;
-
-	/*
-	 * Protects journal_buf->data, when accessing without a journal
-	 * reservation: for synchronization between the btree write buffer code
-	 * and the journal write path:
-	 */
-	struct mutex		buf_lock;
-	/*
-	 * Two journal entries -- one is currently open for new entries, the
-	 * other is possibly being written out.
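For readers new to the x-macro idiom used by JOURNAL_FLAGS() above: each x(name) entry expands once per expansion site, so the same list can generate the enum here and, elsewhere, things like name tables. A minimal sketch of what the preprocessor produces, plus the typical pattern for testing a flag with the kernel's atomic bit helpers; journal_running() is an illustrative name, not a helper defined in this file:

	enum journal_flags {
		JOURNAL_replay_done,
		JOURNAL_running,
		JOURNAL_may_skip_flush,
		JOURNAL_need_flush_write,
		JOURNAL_space_low,
	};

	/* flags is the unsigned long bitmask member of struct journal: */
	static inline bool journal_running(struct journal *j)
	{
		return test_bit(JOURNAL_running, &j->flags);
	}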
- */ - struct journal_buf buf[JOURNAL_BUF_NR]; - void *free_buf; - unsigned free_buf_size; - - spinlock_t lock; - - /* if nonzero, we may not open a new journal entry: */ - unsigned blocked; - - /* Used when waiting because the journal was full */ - wait_queue_head_t wait; - struct closure_waitlist async_wait; - struct closure_waitlist reclaim_flush_wait; - - struct delayed_work write_work; - struct workqueue_struct *wq; - - /* Sequence number of most recent journal entry (last entry in @pin) */ - atomic64_t seq; - - u64 seq_write_started; - /* seq, last_seq from the most recent journal entry successfully written */ - u64 seq_ondisk; - u64 flushed_seq_ondisk; - u64 flushing_seq; - u64 last_seq_ondisk; - u64 err_seq; - u64 last_empty_seq; - u64 oldest_seq_found_ondisk; - - /* - * FIFO of journal entries whose btree updates have not yet been - * written out. - * - * Each entry is a reference count. The position in the FIFO is the - * entry's sequence number relative to @seq. - * - * The journal entry itself holds a reference count, put when the - * journal entry is written out. Each btree node modified by the journal - * entry also holds a reference count, put when the btree node is - * written. - * - * When a reference count reaches zero, the journal entry is no longer - * needed. When all journal entries in the oldest journal bucket are no - * longer needed, the bucket can be discarded and reused. - */ - struct { - u64 front, back, size, mask; - struct journal_entry_pin_list *data; - } pin; - - struct journal_space space[journal_space_nr]; - - u64 replay_journal_seq; - u64 replay_journal_seq_end; - - struct write_point wp; - spinlock_t err_lock; - - struct mutex reclaim_lock; - /* - * Used for waiting until journal reclaim has freed up space in the - * journal: - */ - wait_queue_head_t reclaim_wait; - struct task_struct *reclaim_thread; - bool reclaim_kicked; - unsigned long next_reclaim; - u64 nr_direct_reclaim; - u64 nr_background_reclaim; - - unsigned long last_flushed; - struct journal_entry_pin *flush_in_progress; - bool flush_in_progress_dropped; - wait_queue_head_t pin_flush_wait; - - /* protects advancing ja->discard_idx: */ - struct mutex discard_lock; - bool can_discard; - - unsigned long last_flush_write; - - u64 write_start_time; - - u64 nr_flush_writes; - u64 nr_noflush_writes; - u64 entry_bytes_written; - - struct bch2_time_stats *flush_write_time; - struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *flush_seq_time; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map res_map; -#endif -} __aligned(SMP_CACHE_BYTES); - -/* - * Embedded in struct bch_dev. First three fields refer to the array of journal - * buckets, in bch_sb. - */ -struct journal_device { - /* - * For each journal bucket, contains the max sequence number of the - * journal writes it contains - so we know when a bucket can be reused. 
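A sketch of the reuse test this bucket_seq bookkeeping enables, assuming (per the comments above) that last_seq_ondisk tracks the oldest sequence number the filesystem still needs; journal_bucket_reusable() is illustrative only, and the actual reclaim code advances ja->discard_idx under discard_lock rather than testing arbitrary bucket indices:

	static inline bool journal_bucket_reusable(struct journal *j,
						   struct journal_device *ja,
						   unsigned bucket)
	{
		/*
		 * bucket_seq[] records the newest journal sequence number
		 * written to each bucket; once that is older than the oldest
		 * sequence number still needed on disk, every entry in the
		 * bucket is dead and the bucket can be discarded and reused:
		 */
		return ja->bucket_seq[bucket] < j->last_seq_ondisk;
	}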
- */ - u64 *bucket_seq; - - unsigned sectors_free; - - /* - * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: - */ - unsigned discard_idx; /* Next bucket to discard */ - unsigned dirty_idx_ondisk; - unsigned dirty_idx; - unsigned cur_idx; /* Journal bucket we're currently writing to */ - unsigned nr; - - u64 *buckets; - - /* Bio for journal reads/writes to this device */ - struct journal_bio *bio[JOURNAL_BUF_NR]; - - /* for bch_journal_read_device */ - struct closure read; - u64 highest_seq_found; -}; - -/* - * journal_entry_res - reserve space in every journal entry: - */ -struct journal_entry_res { - unsigned u64s; -}; - -#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c deleted file mode 100644 index 1b828bddd11bf1..00000000000000 --- a/fs/bcachefs/keylist.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey.h" -#include "keylist.h" - -int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, - size_t nr_inline_u64s, size_t new_u64s) -{ - size_t oldsize = bch2_keylist_u64s(l); - size_t newsize = oldsize + new_u64s; - u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; - u64 *new_keys; - - newsize = roundup_pow_of_two(newsize); - - if (newsize <= nr_inline_u64s || - (old_buf && roundup_pow_of_two(oldsize) == newsize)) - return 0; - - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); - if (!new_keys) - return -ENOMEM; - - if (!old_buf) - memcpy_u64s(new_keys, inline_u64s, oldsize); - - l->keys_p = new_keys; - l->top_p = new_keys + oldsize; - - return 0; -} - -void bch2_keylist_pop_front(struct keylist *l) -{ - l->top_p -= bch2_keylist_front(l)->k.u64s; - - memmove_u64s_down(l->keys, - bkey_next(l->keys), - bch2_keylist_u64s(l)); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *l) -{ - for_each_keylist_key(l, k) - BUG_ON(bkey_next(k) != l->top && - bpos_ge(k->k.p, bkey_next(k)->k.p)); -} -#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h deleted file mode 100644 index e687e0e9aede1c..00000000000000 --- a/fs/bcachefs/keylist.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_H -#define _BCACHEFS_KEYLIST_H - -#include "keylist_types.h" - -int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_pop_front(struct keylist *); - -static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -{ - l->top_p = l->keys_p = inline_keys; -} - -static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -{ - if (l->keys_p != inline_keys) - kfree(l->keys_p); -} - -static inline void bch2_keylist_push(struct keylist *l) -{ - l->top = bkey_next(l->top); -} - -static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -{ - bkey_copy(l->top, k); - bch2_keylist_push(l); -} - -static inline bool bch2_keylist_empty(struct keylist *l) -{ - return l->top == l->keys; -} - -static inline size_t bch2_keylist_u64s(struct keylist *l) -{ - return l->top_p - l->keys_p; -} - -static inline size_t bch2_keylist_bytes(struct keylist *l) -{ - return bch2_keylist_u64s(l) * sizeof(u64); -} - -static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -{ - return l->keys; -} - -#define for_each_keylist_key(_keylist, _k) \ - for (struct bkey_i *_k = (_keylist)->keys; \ - _k != (_keylist)->top; \ - _k = bkey_next(_k)) - -static inline u64 keylist_sectors(struct keylist *keys) -{ - u64 ret = 0; - - 
for_each_keylist_key(keys, k) - ret += k->k.size; - return ret; -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_verify_keylist_sorted(struct keylist *); -#else -static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -#endif - -#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h deleted file mode 100644 index 4b3ff7d8a87560..00000000000000 --- a/fs/bcachefs/keylist_types.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_KEYLIST_TYPES_H -#define _BCACHEFS_KEYLIST_TYPES_H - -struct keylist { - union { - struct bkey_i *keys; - u64 *keys_p; - }; - union { - struct bkey_i *top; - u64 *top_p; - }; -}; - -#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c deleted file mode 100644 index 75f27ec26f85f8..00000000000000 --- a/fs/bcachefs/logged_ops.c +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "error.h" -#include "io_misc.h" -#include "logged_ops.h" -#include "super.h" - -struct bch_logged_op_fn { - u8 type; - int (*resume)(struct btree_trans *, struct bkey_i *); -}; - -static const struct bch_logged_op_fn logged_op_fns[] = { -#define x(n) { \ - .type = KEY_TYPE_logged_op_##n, \ - .resume = bch2_resume_logged_op_##n, \ -}, - BCH_LOGGED_OPS() -#undef x -}; - -static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) -{ - for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) - if (logged_op_fns[i].type == type) - return logged_op_fns + i; - return NULL; -} - -static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct printbuf buf = PRINTBUF; - int ret = 0; - - fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), - trans, logged_op_but_clean, - "filesystem marked as clean but have logged op\n%s", - (bch2_bkey_val_to_text(&buf, c, k), - buf.buf)); - - struct bkey_buf sk; - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - - const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type); - if (fn) - fn->resume(trans, sk.k); - - ret = bch2_logged_op_finish(trans, sk.k); - - bch2_bkey_buf_exit(&sk, c); -fsck_err: - printbuf_exit(&buf); - return ret ?: trans_was_restarted(trans, restart_count); -} - -int bch2_resume_logged_ops(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_logged_ops, 0), - POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), - BTREE_ITER_prefetch, k, - resume_logged_op(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); - if (ret) - return ret; - - k->k.p = iter.pos; - - ret = bch2_trans_update(trans, &iter, k, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) -{ - return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_logged_op_start(trans, k)); -} - -int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); - /* - 
* This needs to be a fatal error because we've left an unfinished - * operation in the logged ops btree. - * - * We should only ever see an error here if the filesystem has already - * been shut down, but make sure of that here: - */ - if (ret) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - bch2_fs_fatal_error(c, "deleting logged operation %s: %s", - buf.buf, bch2_err_str(ret)); - printbuf_exit(&buf); - } - - return ret; -} diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h deleted file mode 100644 index 30ae9ef737dd95..00000000000000 --- a/fs/bcachefs/logged_ops.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_H -#define _BCACHEFS_LOGGED_OPS_H - -#include "bkey.h" - -#define BCH_LOGGED_OPS() \ - x(truncate) \ - x(finsert) - -static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) -{ - return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); -} - -int bch2_resume_logged_ops(struct bch_fs *); -int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); -int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); - -#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h deleted file mode 100644 index cfb67c95d4c8a0..00000000000000 --- a/fs/bcachefs/logged_ops_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H -#define _BCACHEFS_LOGGED_OPS_FORMAT_H - -enum logged_ops_inums { - LOGGED_OPS_INUM_logged_ops, - LOGGED_OPS_INUM_inode_cursors, -}; - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - -#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c deleted file mode 100644 index 57b5b3263b083a..00000000000000 --- a/fs/bcachefs/lru.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "ec.h" -#include "error.h" -#include "lru.h" -#include "recovery.h" - -/* KEY_TYPE_lru is obsolete: */ -int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(!lru_pos_time(k.k->p), - c, lru_entry_at_time_0, - "lru entry at time=0"); -fsck_err: - return ret; -} - -void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct bch_lru *lru = bkey_s_c_to_lru(k).v; - - prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); -} - -void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) -{ - prt_printf(out, "%llu:%llu -> %llu:%llu", - lru_pos_id(lru), - lru_pos_time(lru), - u64_to_bucket(lru.offset).inode, - u64_to_bucket(lru.offset).offset); -} - -static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, - u64 dev_bucket, u64 time, bool set) -{ - return time - ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) - : 0; -} - -int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -} - -int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); -} - -int __bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - if (old_time == new_time) - return 0; - - return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: - bch2_lru_set(trans, lru_id, dev_bucket, new_time); -} - -static const char * const bch2_lru_types[] = { -#define x(n) #n, - BCH_LRU_TYPES() -#undef x - NULL -}; - -int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, - u64 dev_bucket, - u64 time, - struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter lru_iter; - struct bkey_s_c lru_k = - bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), 0); - int ret = bkey_err(lru_k); - if (ret) - return ret; - - if (lru_k.k->type != KEY_TYPE_set) { - ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, dev_bucket, time); - if (ret) - goto err; - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf); - return ret; -} - -static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) -{ - enum bch_lru_type type = lru_type(lru_k); - - switch (type) { - case BCH_LRU_read: - case BCH_LRU_fragmentation: - return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); - case BCH_LRU_stripes: - return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); - default: - BUG(); - } -} - -static u64 bkey_lru_type_idx(struct bch_fs *c, - enum bch_lru_type type, - struct bkey_s_c k) -{ - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - - switch (type) { - case BCH_LRU_read: - a = bch2_alloc_to_v4(k, &a_convert); - return alloc_lru_idx_read(*a); - case BCH_LRU_fragmentation: { - a = bch2_alloc_to_v4(k, &a_convert); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - return ca - ? alloc_lru_idx_fragmentation(*a, ca) - : 0; - } - case BCH_LRU_stripes: - return k.k->type == KEY_TYPE_stripe - ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) - : 0; - default: - BUG(); - } -} - -static int bch2_check_lru_key(struct btree_trans *trans, - struct btree_iter *lru_iter, - struct bkey_s_c lru_k, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - struct bbpos bp = lru_pos_to_bp(lru_k); - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); - int ret = bkey_err(k); - if (ret) - goto err; - - enum bch_lru_type type = lru_type(lru_k); - u64 idx = bkey_lru_type_idx(c, type, k); - - if (lru_pos_time(lru_k.k->p) != idx) { - ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); - if (ret) - goto err; - - if (fsck_err(trans, lru_entry_bad, - "incorrect lru entry: lru %s time %llu\n" - "%s\n" - "for %s", - bch2_lru_types[type], - lru_pos_time(lru_k.k->p), - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -int bch2_check_lrus(struct bch_fs *c) -{ - struct bkey_buf last_flushed; - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed))); - - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); - return ret; - -} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h deleted file mode 100644 index 8abd0aa2083aea..00000000000000 --- a/fs/bcachefs/lru.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_LRU_H -#define _BCACHEFS_LRU_H - -static inline u64 lru_pos_id(struct bpos pos) -{ - return pos.inode >> LRU_TIME_BITS; -} - -static inline u64 lru_pos_time(struct bpos pos) -{ - return pos.inode & ~(~0ULL << LRU_TIME_BITS); -} - -static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) -{ - struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); - - EBUG_ON(time > LRU_TIME_MAX); - EBUG_ON(lru_pos_id(pos) != lru_id); - EBUG_ON(lru_pos_time(pos) != time); - EBUG_ON(pos.offset != dev_bucket); - - return pos; -} - -static inline enum bch_lru_type lru_type(struct bkey_s_c l) -{ - u16 lru_id = l.k->p.inode >> 48; - - switch (lru_id) { - case BCH_LRU_BUCKET_FRAGMENTATION: - return BCH_LRU_fragmentation; - case BCH_LRU_STRIPE_FRAGMENTATION: - return BCH_LRU_stripes; - default: - return BCH_LRU_read; - } -} - -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); -void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -void bch2_lru_pos_to_text(struct printbuf *, struct bpos); - -#define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_validate = bch2_lru_validate, \ - .val_to_text = bch2_lru_to_text, \ - .min_val_size = 8, \ -}) - -int bch2_lru_del(struct btree_trans *, u16, u64, u64); -int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); - -static inline int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) -{ - return old_time != new_time - ? 
__bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
-		: 0;
-}
-
-struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
-int bch2_check_lrus(struct bch_fs *);
-
-#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
deleted file mode 100644
index b7392ad8e41f0a..00000000000000
--- a/fs/bcachefs/lru_format.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_FORMAT_H
-#define _BCACHEFS_LRU_FORMAT_H
-
-struct bch_lru {
-	struct bch_val		v;
-	__le64			idx;
-} __packed __aligned(8);
-
-#define BCH_LRU_TYPES()		\
-	x(read)			\
-	x(fragmentation)	\
-	x(stripes)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
-	BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_BUCKET_FRAGMENTATION	((1U << 16) - 1)
-#define BCH_LRU_STRIPE_FRAGMENTATION	((1U << 16) - 2)
-
-#define LRU_TIME_BITS	48
-#define LRU_TIME_MAX	((1ULL << LRU_TIME_BITS) - 1)
-
-#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
deleted file mode 100644
index 0ea9f30803a2b3..00000000000000
--- a/fs/bcachefs/mean_and_variance.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions for incremental mean and variance.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * Copyright © 2022 Daniel B. Hill
- *
- * Author: Daniel B. Hill
- *
- * Description:
- *
- * This includes some incremental algorithms for mean and variance calculation
- *
- * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- *
- * Create a struct and if it's the weighted variant set the w field (weight = 2^k).
- *
- * Use mean_and_variance[_weighted]_update() on the struct to update its state.
- *
- * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation
- * is deferred to these functions for performance reasons.
- *
- * see lib/math/mean_and_variance_test.c for examples of usage.
- *
- * DO NOT access the mean and variance fields of the weighted variants directly.
- * DO NOT change the weight after calling update.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "mean_and_variance.h"
-
-u128_u u128_div(u128_u n, u64 d)
-{
-	u128_u r;
-	u64 rem;
-	u64 hi = u128_hi(n);
-	u64 lo = u128_lo(n);
-	u64 h = hi & ((u64) U32_MAX << 32);
-	u64 l = (hi & (u64) U32_MAX) << 32;
-
-	r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
-	r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
-	r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
-	return r;
-}
-EXPORT_SYMBOL_GPL(u128_div);
-
-/**
- * mean_and_variance_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- */
-s64 mean_and_variance_get_mean(struct mean_and_variance s)
-{
-	return s.n ?
div64_u64(s.sum, s.n) : 0; -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); - -/** - * mean_and_variance_get_variance() - get variance from @s1 - * @s1: mean and variance number of samples and sums - * - * see linked pdf equation 12. - */ -u64 mean_and_variance_get_variance(struct mean_and_variance s1) -{ - if (s1.n) { - u128_u s2 = u128_div(s1.sum_squares, s1.n); - u64 s3 = abs(mean_and_variance_get_mean(s1)); - - return u128_lo(u128_sub(s2, u128_square(s3))); - } else { - return 0; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); - -/** - * mean_and_variance_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - */ -u32 mean_and_variance_get_stddev(struct mean_and_variance s) -{ - return int_sqrt64(mean_and_variance_get_variance(s)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); - -/** - * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s: mean and variance number of samples and their sums - * @x: new value to include in the &mean_and_variance_weighted - * @initted: caller must track whether this is the first use or not - * @weight: ewma weight - * - * see linked pdf: function derived from equations 140-143 where alpha = 2^w. - * values are stored bitshifted for performance and added precision. - */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, - s64 x, bool initted, u8 weight) -{ - // previous weighted variance. - u8 w = weight; - u64 var_w0 = s->variance; - // new value weighted. - s64 x_w = x << w; - s64 diff_w = x_w - s->mean; - s64 diff = fast_divpow2(diff_w, w); - // new mean weighted. - s64 u_w1 = s->mean + diff; - - if (!initted) { - s->mean = x_w; - s->variance = 0; - } else { - s->mean = u_w1; - s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; - } -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); - -/** - * mean_and_variance_weighted_get_mean() - get mean from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, - u8 weight) -{ - return fast_divpow2(s.mean, weight); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); - -/** - * mean_and_variance_weighted_get_variance() -- get variance from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, - u8 weight) -{ - // always positive don't need fast divpow2 - return s.variance >> weight; -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); - -/** - * mean_and_variance_weighted_get_stddev() - get standard deviation from @s - * @s: mean and variance number of samples and their sums - * @weight: ewma weight - */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, - u8 weight) -{ - return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); -} -EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); - -MODULE_AUTHOR("Daniel B. 
Hill"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h deleted file mode 100644 index 47e4a3c3d26e73..00000000000000 --- a/fs/bcachefs/mean_and_variance.h +++ /dev/null @@ -1,203 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef MEAN_AND_VARIANCE_H_ -#define MEAN_AND_VARIANCE_H_ - -#include -#include -#include -#include - -#define SQRT_U64_MAX 4294967295ULL - -/* - * u128_u: u128 user mode, because not all architectures support a real int128 - * type - * - * We don't use this version in userspace, because in userspace we link with - * Rust and rustc has issues with u128. - */ - -#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) - -typedef struct { - unsigned __int128 v; -} __aligned(16) u128_u; - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .v = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.v; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.v >> 64; -} - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - a.v += b.v; - return a; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - a.v -= b.v; - return a; -} - -static inline u128_u u128_shl(u128_u a, s8 shift) -{ - a.v <<= shift; - return a; -} - -static inline u128_u u128_square(u64 a) -{ - u128_u b = u64_to_u128(a); - - b.v *= b.v; - return b; -} - -#else - -typedef struct { - u64 hi, lo; -} __aligned(16) u128_u; - -/* conversions */ - -static inline u128_u u64_to_u128(u64 a) -{ - return (u128_u) { .lo = a }; -} - -static inline u64 u128_lo(u128_u a) -{ - return a.lo; -} - -static inline u64 u128_hi(u128_u a) -{ - return a.hi; -} - -/* arithmetic */ - -static inline u128_u u128_add(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo + b.lo; - c.hi = a.hi + b.hi + (c.lo < a.lo); - return c; -} - -static inline u128_u u128_sub(u128_u a, u128_u b) -{ - u128_u c; - - c.lo = a.lo - b.lo; - c.hi = a.hi - b.hi - (c.lo > a.lo); - return c; -} - -static inline u128_u u128_shl(u128_u i, s8 shift) -{ - u128_u r; - - r.lo = i.lo << (shift & 63); - if (shift < 64) - r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63)); - else { - r.hi = i.lo << (-shift & 63); - r.lo = 0; - } - return r; -} - -static inline u128_u u128_square(u64 i) -{ - u128_u r; - u64 h = i >> 32, l = i & U32_MAX; - - r = u128_shl(u64_to_u128(h*h), 64); - r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); - r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); - r = u128_add(r, u64_to_u128(l*l)); - return r; -} - -#endif - -static inline u128_u u64s_to_u128(u64 hi, u64 lo) -{ - u128_u c = u64_to_u128(hi); - - c = u128_shl(c, 64); - c = u128_add(c, u64_to_u128(lo)); - return c; -} - -u128_u u128_div(u128_u n, u64 d); - -struct mean_and_variance { - s64 n; - s64 sum; - u128_u sum_squares; -}; - -/* expontentially weighted variant */ -struct mean_and_variance_weighted { - s64 mean; - u64 variance; -}; - -/** - * fast_divpow2() - fast approximation for n / (1 << d) - * @n: numerator - * @d: the power of 2 denominator. - * - * note: this rounds towards 0. - */ -static inline s64 fast_divpow2(s64 n, u8 d) -{ - return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -} - -/** - * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 - * and return it. - * @s1: the mean_and_variance to update. - * @v1: the new sample. - * - * see linked pdf equation 12. 
-
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v
- * @s: the mean_and_variance to update.
- * @v: the new sample.
- *
- * see linked pdf equation 12.
- */
-static inline void
-mean_and_variance_update(struct mean_and_variance *s, s64 v)
-{
-	s->n++;
-	s->sum += v;
-	s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
-}
-
-s64 mean_and_variance_get_mean(struct mean_and_variance s);
-u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
-		s64 v, bool initted, u8 weight);
-
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
-		u8 weight);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
-		u8 weight);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
-		u8 weight);
-
-#endif // MEAN_AND_VARIANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
deleted file mode 100644
index e9d9c0212e44b1..00000000000000
--- a/fs/bcachefs/mean_and_variance_test.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include
-
-#include "mean_and_variance.h"
-
-#define MAX_SQR	(SQRT_U64_MAX*SQRT_U64_MAX)
-
-static void mean_and_variance_basic_test(struct kunit *test)
-{
-	struct mean_and_variance s = {};
-
-	mean_and_variance_update(&s, 2);
-	mean_and_variance_update(&s, 2);
-
-	KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
-	KUNIT_EXPECT_EQ(test, s.n, 2);
-
-	mean_and_variance_update(&s, 4);
-	mean_and_variance_update(&s, 4);
-
-	KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
-	KUNIT_EXPECT_EQ(test, s.n, 4);
-}
-
-/*
- * Test values computed using a spreadsheet from the pseudocode at the bottom:
- * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- */
-
-static void mean_and_variance_weighted_test(struct kunit *test)
-{
-	struct mean_and_variance_weighted s = { };
-
-	mean_and_variance_weighted_update(&s, 10, false, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
-	mean_and_variance_weighted_update(&s, 20, true, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
-	mean_and_variance_weighted_update(&s, 30, true, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-
-	s = (struct mean_and_variance_weighted) { };
-
-	mean_and_variance_weighted_update(&s, -10, false, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
-	mean_and_variance_weighted_update(&s, -20, true, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
-	mean_and_variance_weighted_update(&s, -30, true, 2);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-}
-
-static void mean_and_variance_weighted_advanced_test(struct kunit *test)
-{
-	struct mean_and_variance_weighted s = { };
-	bool initted = false;
-	s64 i;
-
-	for (i = 10; i <= 100; i += 10) {
-		mean_and_variance_weighted_update(&s, i, initted, 8);
-		initted = true;
-	}
-
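	/*
	 * Context for the expected values just below: with weight 8, each
	 * update closes only 1/256 of the gap between the new sample and the
	 * current bitshifted mean, so ten samples ramping from 10 to 100 move
	 * the estimate from 10 to just 11 even though the plain mean of the
	 * data is 55; the variance expectation is smoothed the same way.
	 */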
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - - s = (struct mean_and_variance_weighted) { }; - initted = false; - - for (i = -10; i >= -100; i -= 10) { - mean_and_variance_weighted_update(&s, i, initted, 8); - initted = true; - } - - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); -} - -static void do_mean_and_variance_test(struct kunit *test, - s64 initial_value, - s64 initial_n, - s64 n, - unsigned weight, - s64 *data, - s64 *mean, - s64 *stddev, - s64 *weighted_mean, - s64 *weighted_stddev) -{ - struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { }; - - for (unsigned i = 0; i < initial_n; i++) { - mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value, false, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); - } - - for (unsigned i = 0; i < n; i++) { - mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i], true, weight); - - KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); - } - - KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); -} - -/* Test behaviour with a single outlier, then back to steady state: */ -static void mean_and_variance_test_1(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; - s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -/* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 22, 32, 40, 46, 50 }; - s64 stddev[] = { 32, 39, 42, 44, 45 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - -static void mean_and_variance_fast_divpow2(struct kunit *test) -{ - s64 i; - u8 d; - - for (i = 0; i < 100; i++) { - d = 0; - KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); - KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); - for (d = 1; d < 32; d++) { - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), - div_u64(i, 1 << d), "%lld %u", i, d); - KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), - div_u64(i, 1 << d), "%lld %u", -i, d); - } - } -} - -static void mean_and_variance_u128_basic_test(struct kunit *test) -{ - u128_u a = u64s_to_u128(0, U64_MAX); - u128_u a1 = u64s_to_u128(0, 1); - u128_u b = u64s_to_u128(1, 0); - u128_u c = u64s_to_u128(0, 1LLU << 63); - u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); - - KUNIT_EXPECT_EQ(test, 
u128_hi(u128_add(a, a1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); - KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); - KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); - - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); - - KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); - KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); -} - -static struct kunit_case mean_and_variance_test_cases[] = { - KUNIT_CASE(mean_and_variance_fast_divpow2), - KUNIT_CASE(mean_and_variance_u128_basic_test), - KUNIT_CASE(mean_and_variance_basic_test), - KUNIT_CASE(mean_and_variance_weighted_test), - KUNIT_CASE(mean_and_variance_weighted_advanced_test), - KUNIT_CASE(mean_and_variance_test_1), - KUNIT_CASE(mean_and_variance_test_2), - {} -}; - -static struct kunit_suite mean_and_variance_test_suite = { - .name = "mean and variance tests", - .test_cases = mean_and_variance_test_cases -}; - -kunit_test_suite(mean_and_variance_test_suite); - -MODULE_AUTHOR("Daniel B. Hill"); -MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); -MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c deleted file mode 100644 index f296cce95338ce..00000000000000 --- a/fs/bcachefs/migrate.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Code for moving data off a device. - */ - -#include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "ec.h" -#include "errcode.h" -#include "extents.h" -#include "io_write.h" -#include "journal.h" -#include "keylist.h" -#include "migrate.h" -#include "move.h" -#include "progress.h" -#include "replicas.h" -#include "super-io.h" - -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, unsigned flags, bool metadata) -{ - unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; - unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; - unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; - unsigned nr_good; - - bch2_bkey_drop_device(k, dev_idx); - - nr_good = bch2_bkey_durability(c, k.s_c); - if ((!nr_good && !(flags & lost)) || - (nr_good < replicas && !(flags & degraded))) - return bch_err_throw(c, remove_would_lose_data); - - return 0; -} - -static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, unsigned dev_idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_buf k; - - bch2_bkey_buf_init(&k); - bch2_bkey_buf_copy(&k, c, &b->key); - - int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: - bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); - - bch_err_fn(c, ret); - bch2_bkey_buf_exit(&k, c); - return ret; -} - -static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - unsigned dev_idx, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - if (!bch2_bkey_has_device_c(k, dev_idx)) - return 0; - - n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); - if (ret) - return ret; - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_all_snapshots iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k)) - n->k.size = 0; - return 0; -} - -static int bch2_dev_btree_drop_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - unsigned dev_idx, - struct bkey_buf *last_flushed, - unsigned flags) -{ - struct btree_iter iter; - struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); - int ret = PTR_ERR_OR_ZERO(b); - if (ret) - return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 
0 : ret; - - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_dev_usrdata_drop(struct bch_fs *c, - struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id id; - int ret = 0; - - for (id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) - continue; - - ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); - })); - if (ret) - break; - } - - bch2_trans_put(trans); - - return ret; -} - -static int bch2_dev_metadata_drop(struct bch_fs *c, - struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans; - struct btree_iter iter; - struct closure cl; - struct btree *b; - struct bkey_buf k; - unsigned id; - int ret; - - /* don't handle this yet: */ - if (flags & BCH_FORCE_IF_METADATA_LOST) - return bch_err_throw(c, remove_with_metadata_missing_unimplemented); - - trans = bch2_trans_get(c); - bch2_bkey_buf_init(&k); - closure_init_stack(&cl); - - for (id = 0; id < BTREE_ID_NR; id++) { - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_prefetch); -retry: - ret = 0; - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); - - if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) - goto next; - - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - - if (ret) - break; -next: - bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - } - - bch2_btree_interior_updates_flush(c); - ret = 0; -err: - bch2_bkey_buf_exit(&k, c); - bch2_trans_put(trans); - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); - - return ret; -} - -static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, - struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, - unsigned flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, - last_flushed); - int ret = bkey_err(k); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - return 0; - if (ret) - return ret; - - if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) - goto out; - - /* - * XXX: pass flags arg to invalidate_stripe_to_dev and handle it - * properly - */ - - if (bkey_is_btree_ptr(k.k)) - ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); - else if (k.k->type == KEY_TYPE_stripe) - ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); - else - ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - struct btree_trans *trans = bch2_trans_get(c); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - int ret = bch2_btree_write_buffer_flush_sync(trans) ?: - 
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, - POS(dev_idx, 0), - POS(dev_idx, U64_MAX), 0, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (k.k->type != KEY_TYPE_backpointer) - continue; - - data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), - &last_flushed, flags); - - })); - - bch2_bkey_buf_exit(&last_flushed, trans->c); - bch2_trans_put(trans); - bch_err_fn(c, ret); - return ret; -} - -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) -{ - struct progress_indicator_state progress; - bch2_progress_init(&progress, c, - BIT_ULL(BTREE_ID_extents)| - BIT_ULL(BTREE_ID_reflink)); - - return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, &progress, dev_idx, flags); -} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h deleted file mode 100644 index 30018140711b7d..00000000000000 --- a/fs/bcachefs/migrate.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MIGRATE_H -#define _BCACHEFS_MIGRATE_H - -int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); -int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); - -#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c deleted file mode 100644 index eec591e947bdad..00000000000000 --- a/fs/bcachefs/move.c +++ /dev/null @@ -1,1494 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "bkey_buf.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_write_buffer.h" -#include "compress.h" -#include "disk_groups.h" -#include "ec.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "io_read.h" -#include "io_write.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "move.h" -#include "rebalance.h" -#include "reflink.h" -#include "replicas.h" -#include "snapshot.h" -#include "super-io.h" -#include "trace.h" - -#include -#include - -const char * const bch2_data_ops_strs[] = { -#define x(t, n, ...) 
[n] = #t, - BCH_DATA_OPS() -#undef x - NULL -}; - -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - -static bool evacuate_bucket_pred(struct bch_fs *, void *, - enum btree_id, struct bkey_s_c, - struct bch_io_opts *, - struct data_update_opts *); - -static noinline void -trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) -{ - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts, - move_pred_fn pred, void *_arg, bool p) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "%ps: %u", pred, p); - - if (pred == evacuate_bucket_pred) { - struct evacuate_bucket_arg *arg = _arg; - prt_printf(&buf, " gen=%u", arg->gen); - } - - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move_pred(c, buf.buf); - printbuf_exit(&buf); -} - -static noinline void -trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) -{ - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, " gen: %i\n", gen); - - trace_io_move_evacuate_bucket(c, buf.buf); - printbuf_exit(&buf); -} - -struct moving_io { - struct list_head read_list; - struct list_head io_list; - struct move_bucket *b; - struct closure cl; - bool read_completed; - - unsigned read_sectors; - unsigned write_sectors; - - struct data_update write; -}; - -static void move_free(struct moving_io *io) -{ - struct moving_context *ctxt = io->write.ctxt; - - if (io->b) - atomic_dec(&io->b->count); - - mutex_lock(&ctxt->lock); - list_del(&io->io_list); - wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); - - if (!io->write.data_opts.scrub) { - bch2_data_update_exit(&io->write); - } else { - bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); - kfree(io->write.bvecs); - } - kfree(io); -} - -static void move_write_done(struct bch_write_op *op) -{ - struct moving_io *io = container_of(op, struct moving_io, write.op); - struct bch_fs *c = op->c; - struct moving_context *ctxt = io->write.ctxt; - - if (op->error) { - if (trace_io_move_write_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_write_op_to_text(&buf, op); - trace_io_move_write_fail(c, buf.buf); - printbuf_exit(&buf); - } - this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); - - ctxt->write_error = true; - } - - atomic_sub(io->write_sectors, &ctxt->write_sectors); - atomic_dec(&ctxt->write_ios); - move_free(io); - closure_put(&ctxt->cl); -} - -static void move_write(struct moving_io *io) -{ - struct bch_fs *c = io->write.op.c; - struct moving_context *ctxt = io->write.ctxt; - struct bch_read_bio *rbio = &io->write.rbio; - - if (ctxt->stats) { - if (rbio->bio.bi_status) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - &ctxt->stats->sectors_error_uncorrected); - else if (rbio->saw_error) - atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, - 
&ctxt->stats->sectors_error_corrected); - } - - /* - * If the extent has been bitrotted, we're going to have to give it a - * new checksum in order to move it - but the poison bit will ensure - * that userspace still gets the appropriate error. - */ - if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && - (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - - rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, - nonce, &rbio->bio); - rbio->ret = 0; - } - - if (unlikely(rbio->ret || io->write.data_opts.scrub)) { - move_free(io); - return; - } - - if (trace_io_move_write_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); - } - - closure_get(&io->write.ctxt->cl); - atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_inc(&io->write.ctxt->write_ios); - - bch2_data_update_read_done(&io->write); -} - -struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -{ - struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); - - return io && io->read_completed ? io : NULL; -} - -static void move_read_endio(struct bio *bio) -{ - struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); - struct moving_context *ctxt = io->write.ctxt; - - atomic_sub(io->read_sectors, &ctxt->read_sectors); - atomic_dec(&ctxt->read_ios); - io->read_completed = true; - - wake_up(&ctxt->wait); - closure_put(&ctxt->cl); -} - -void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) -{ - struct moving_io *io; - - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { - bch2_trans_unlock_long(ctxt->trans); - list_del(&io->read_list); - move_write(io); - } -} - -void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -{ - unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - - move_ctxt_wait_event(ctxt, - !atomic_read(&ctxt->write_sectors) || - atomic_read(&ctxt->write_sectors) != sectors_pending); -} - -void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - bch2_trans_unlock_long(ctxt->trans); - closure_sync(&ctxt->cl); -} - -void bch2_moving_ctxt_exit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - - bch2_moving_ctxt_flush_all(ctxt); - - EBUG_ON(atomic_read(&ctxt->write_sectors)); - EBUG_ON(atomic_read(&ctxt->write_ios)); - EBUG_ON(atomic_read(&ctxt->read_sectors)); - EBUG_ON(atomic_read(&ctxt->read_ios)); - - mutex_lock(&c->moving_context_lock); - list_del(&ctxt->list); - mutex_unlock(&c->moving_context_lock); - - /* - * Generally, releasing a transaction within a transaction restart means - * an unhandled transaction restart: but this can happen legitimately - * within the move code, e.g. 
when bch2_move_ratelimit() tells us to - * exit before we've retried - */ - bch2_trans_begin(ctxt->trans); - bch2_trans_put(ctxt->trans); - memset(ctxt, 0, sizeof(*ctxt)); -} - -void bch2_moving_ctxt_init(struct moving_context *ctxt, - struct bch_fs *c, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->trans = bch2_trans_get(c); - ctxt->fn = (void *) _RET_IP_; - ctxt->rate = rate; - ctxt->stats = stats; - ctxt->wp = wp; - ctxt->wait_on_copygc = wait_on_copygc; - - closure_init_stack(&ctxt->cl); - - mutex_init(&ctxt->lock); - INIT_LIST_HEAD(&ctxt->reads); - INIT_LIST_HEAD(&ctxt->ios); - init_waitqueue_head(&ctxt->wait); - - mutex_lock(&c->moving_context_lock); - list_add(&ctxt->list, &c->moving_context_list); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) -{ - trace_move_data(c, stats); -} - -void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) -{ - memset(stats, 0, sizeof(*stats)); - stats->data_type = BCH_DATA_user; - scnprintf(stats->name, sizeof(stats->name), "%s", name); -} - -int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_io_opts io_opts, - struct data_update_opts data_opts) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - int ret = -ENOMEM; - - if (trace_io_move_enabled()) - trace_io_move2(c, k, &io_opts, &data_opts); - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - - bch2_data_update_opts_normalize(k, &data_opts); - - if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas && - !data_opts.scrub) { - if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; - } - - struct moving_io *io = allocate_dropping_locks(trans, ret, - kzalloc(sizeof(struct moving_io), _gfp)); - if (!io) - goto err; - - if (ret) - goto err_free; - - INIT_LIST_HEAD(&io->io_list); - io->write.ctxt = ctxt; - io->read_sectors = k.k->size; - io->write_sectors = k.k->size; - - if (!data_opts.scrub) { - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - &io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free; - - io->write.op.end_io = move_write_done; - } else { - bch2_bkey_buf_init(&io->write.k); - bch2_bkey_buf_reassemble(&io->write.k, c, k); - - io->write.op.c = c; - io->write.data_opts = data_opts; - - bch2_trans_unlock(trans); - - ret = bch2_data_update_bios_init(&io->write, c, &io_opts); - if (ret) - goto err_free; - } - - io->write.rbio.bio.bi_end_io = move_read_endio; - io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); - - if (ctxt->stats) { - atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - } - - if (bucket_in_flight) { - io->b = bucket_in_flight; - atomic_inc(&io->b->count); - } - - if (trace_io_move_read_enabled()) - trace_io_move_read2(c, k); - - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); - atomic_inc(&ctxt->read_ios); - - list_add_tail(&io->read_list, &ctxt->reads); - list_add_tail(&io->io_list, &ctxt->ios); - mutex_unlock(&ctxt->lock); - - /* - * dropped by move_read_endio() - guards against use after free of - * ctxt 
when doing wakeup - */ - closure_get(&ctxt->cl); - __bch2_read_extent(trans, &io->write.rbio, - io->write.rbio.bio.bi_iter, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - NULL, - BCH_READ_last_fragment, - data_opts.scrub ? data_opts.read_dev : -1); - return 0; -err_free: - kfree(io); -err: - if (bch2_err_matches(ret, EROFS) || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - count_event(c, io_move_start_fail); - - if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, ": "); - prt_str(&buf, bch2_err_str(ret)); - trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); - } - - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - return ret; -} - -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - - if (extent_iter->min_depth) - return opts_ret; - - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - - if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - _ret3 = bch2_inode_unpack(k, &inode); - if (_ret3) - break; - - struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); - io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); - if (ret) - return ERR_PTR(ret); - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { - opts_ret = &i->io_opts; - break; - } -out: - ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); - if (ret) - return ERR_PTR(ret); - return opts_ret; -} - -int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - - *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ - if (!extent_k.k->p.inode) - goto out; - - struct btree_iter inode_iter; - struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); - } - bch2_trans_iter_exit(trans, &inode_iter); - /* seem to be spinning here? 
*/ -out: - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); -} - -int bch2_move_ratelimit(struct moving_context *ctxt) -{ - struct bch_fs *c = ctxt->trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - u64 delay; - - if (ctxt->wait_on_copygc && c->copygc_running) { - bch2_moving_ctxt_flush_all(ctxt); - wait_event_killable(c->copygc_running_wq, - !c->copygc_running || - (is_kthread && kthread_should_stop())); - } - - do { - delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - - if (is_kthread && kthread_should_stop()) - return 1; - - if (delay) - move_ctxt_wait_event_timeout(ctxt, - freezing(current) || - (is_kthread && kthread_should_stop()), - delay); - - if (unlikely(freezing(current))) { - bch2_moving_ctxt_flush_all(ctxt); - try_to_freeze(); - } - } while (delay); - - /* - * XXX: these limits really ought to be per device, SSDs and hard drives - * will want different limits - */ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && - atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && - atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); - - return 0; -} - -/* - * Move requires non extents iterators, and there's also no need for it to - * signal indirect_extent_missing_error: - */ -static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_reflink_p p) -{ - if (unlikely(REFLINK_P_ERROR(p.v))) - return bkey_s_c_null; - - struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); - - bch2_trans_iter_init(trans, iter, - BTREE_ID_reflink, reflink_pos, - BTREE_ITER_not_extents); - - struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); - if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_null; - } - - return k; -} - -int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id, unsigned level) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; - struct bkey_buf sk; - struct btree_iter iter, reflink_iter = {}; - struct bkey_s_c k; - struct data_update_opts data_opts; - /* - * If we're moving a single file, also process reflinked data it points - * to (this includes propagating changed io_opts from the inode to the - * extent): - */ - bool walk_indirect = start.inode == end.inode; - int ret = 0, ret2; - - per_snapshot_io_opts_init(&snapshot_io_opts, c); - bch2_bkey_buf_init(&sk); - - if (ctxt->stats) { - ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->pos = BBPOS(btree_id, start); - } - -retry_root: - bch2_trans_begin(trans); - - if (level == bch2_btree_id_root(c, btree_id)->level + 1) { - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); - ret = PTR_ERR_OR_ZERO(b); - if (ret) - goto root_err; - - if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - k = bkey_i_to_s_c(&b->key); - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, &iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - 
goto root_err; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) - goto out; - - - if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - -root_err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_iter_exit(trans, &iter); - goto retry_root; - } - - goto out; - } - - bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - - if (ctxt->rate) - bch2_ratelimit_reset(ctxt->rate); - - while (!bch2_move_ratelimit(ctxt)) { - struct btree_iter *extent_iter = &iter; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &iter); - if (!k.k) - break; - - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (bkey_gt(bkey_start_pos(k.k), end)) - break; - - if (ctxt->stats) - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (walk_indirect && - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (!k.k) - goto next_nondata; - - /* - * XXX: reflink pointers may point to multiple indirect - * extents, so don't advance past the entire reflink - * pointer - need to fixup iter->k - */ - extent_iter = &reflink_iter; - } - - if (!bkey_extent_is_direct_data(k.k)) - goto next_nondata; - - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, extent_iter, k); - ret = PTR_ERR_OR_ZERO(io_opts); - if (ret) - continue; - - memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) - goto next; - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!level) - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); - else if (!data_opts.scrub) - ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, - k.k->p, data_opts.target, 0); - else - ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); - - if (ret2) { - if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) - continue; - - if (bch2_err_matches(ret2, ENOMEM)) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - - /* XXX signal failure */ - goto next; - } -next: - if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -next_nondata: - if (!bch2_btree_iter_advance(trans, &iter)) - break; - } -out: - bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&sk, c); - per_snapshot_io_opts_exit(&snapshot_io_opts); - - return ret; -} - -int __bch2_move_data(struct moving_context *ctxt, - struct bbpos start, - struct bbpos end, - move_pred_fn pred, void *arg) -{ - struct bch_fs *c = ctxt->trans->c; - enum btree_id id; - int ret = 0; - - for (id = start.btree; - id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - id++) { - ctxt->stats->pos = BBPOS(id, POS_MIN); - - if 
(!btree_type_has_ptrs(id) || - !bch2_btree_id_root(c, id)->b) - continue; - - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, - pred, arg, id, 0); - if (ret) - break; - } - - return ret; -} - -int bch2_move_data(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - int ret = __bch2_move_data(&ctxt, start, end, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - unsigned dev, - u64 bucket_start, - u64 bucket_end, - unsigned data_types, - bool copygc, - move_pred_fn pred, void *arg) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter = {}, bp_iter = {}; - struct bkey_buf sk; - struct bkey_s_c k; - struct bkey_buf last_flushed; - u64 check_mismatch_done = bucket_start; - int ret = 0; - - struct bch_dev *ca = bch2_dev_tryget(c, dev); - if (!ca) - return 0; - - bucket_end = min(bucket_end, ca->mi.nbuckets); - - struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); - struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - bch2_bkey_buf_init(&sk); - - /* - * We're not run in a context that handles transaction restarts: - */ - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "flushing btree write buffer"); - if (ret) - goto err; - - while (!(ret = bch2_move_ratelimit(ctxt))) { - if (is_kthread && kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - k = bch2_btree_iter_peek(trans, &bp_iter); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - - if (!k.k || bkey_gt(k.k->p, bp_end)) - break; - - if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); - } - continue; - } - - if (k.k->type != KEY_TYPE_backpointer) - goto next; - - struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - - if (ctxt->stats) - ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - if (!(data_types & BIT(bp.v->data_type))) - goto next; - - if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) - goto next; - - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; - - if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); - if (ret) { - bch2_trans_iter_exit(trans, &iter); - continue; - } - } - - struct data_update_opts data_opts = {}; - bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); - - if (trace_io_move_pred_enabled()) - trace_io_move_pred2(c, k, &io_opts, &data_opts, - pred, arg, p); 
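/*
 * Illustrative sketch, not part of this patch: the whole move path is
 * parameterized by a move_pred_fn callback - given an extent, the predicate
 * fills in data_update_opts (which replicas to rewrite, drop, or add) and
 * returns whether the extent should be moved at all.  A minimal predicate,
 * mirroring the shape of migrate_pred/evacuate_bucket_pred elsewhere in this
 * file (the function name and the device index passed via @arg are
 * hypothetical):
 */
static bool example_dev_pred(struct bch_fs *c, void *arg,
			     enum btree_id btree, struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned *dev = arg;	/* hypothetical: target device index */
	unsigned i = 0;

	/* select every non-cached replica living on @dev for rewrite: */
	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
		if (ptr->dev == *dev && !ptr->cached)
			data_opts->rewrite_ptrs |= BIT(i);
		i++;
	}

	/* returning false means the extent is skipped entirely */
	return data_opts->rewrite_ptrs != 0;
}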
- - if (!p) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - - if (data_opts.scrub && - !bch2_dev_idx_is_online(c, data_opts.read_dev)) { - bch2_trans_iter_exit(trans, &iter); - ret = bch_err_throw(c, device_offline); - break; - } - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - /* move_extent will drop locks */ - unsigned sectors = bp.v->bucket_len; - - if (!bp.v->level) - ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); - else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, - k.k->p, data_opts.target, 0); - else - ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); -next: - bch2_btree_iter_advance(trans, &bp_iter); - } - - while (check_mismatch_done < bucket_end) - bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, - copygc, &last_flushed); -err: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&sk, c); - bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); - return ret; -} - -int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - if (ctxt.stats) { - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; - } - - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, - data_types, false, pred, arg); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct evacuate_bucket_arg *arg = _arg; - - *data_opts = arg->data_opts; - - unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == arg->bucket.inode && - (arg->gen < 0 || arg->gen == ptr->gen) && - !ptr->cached) - data_opts->rewrite_ptrs |= BIT(i); - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) -{ - struct bch_fs *c = ctxt->trans->c; - struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; - - count_event(c, io_move_evacuate_bucket); - if (trace_io_move_evacuate_bucket_enabled()) - trace_io_move_evacuate_bucket2(c, bucket, gen); - - return __bch2_move_data_phys(ctxt, bucket_in_flight, - bucket.inode, - bucket.offset, - bucket.offset + 1, - ~0, - true, - evacuate_bucket_pred, &arg); -} - -typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, - struct data_update_opts *); - -static int bch2_move_btree(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - move_btree_pred pred, void *arg, - struct bch_move_stats *stats) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - struct 
bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct moving_context ctxt; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - enum btree_id btree; - struct data_update_opts data_opts; - int ret = 0; - - bch2_moving_ctxt_init(&ctxt, c, NULL, stats, - writepoint_ptr(&c->btree_write_point), - true); - trans = ctxt.trans; - - stats->data_type = BCH_DATA_btree; - - for (btree = start.btree; - btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); - btree ++) { - stats->pos = BBPOS(btree, POS_MIN); - - if (!bch2_btree_id_root(c, btree)->b) - continue; - - bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_prefetch); -retry: - ret = 0; - while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && - !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread && kthread_should_stop()) - break; - - if ((cmp_int(btree, end.btree) ?: - bpos_cmp(b->key.k.p, end.pos)) > 0) - break; - - stats->pos = BBPOS(iter.btree_id, iter.pos); - - if (!pred(c, arg, b, &io_opts, &data_opts)) - goto next; - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; -next: - bch2_btree_iter_next_node(trans, &iter); - } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_iter_exit(trans, &iter); - - if (kthread && kthread_should_stop()) - break; - } - - bch_err_fn(c, ret); - bch2_moving_ctxt_exit(&ctxt); - bch2_btree_interior_updates_flush(c); - - return ret; -} - -static bool rereplicate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - - guard(rcu)(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ptr->cached && - (!ca || !ca->mi.durability)) - data_opts->kill_ptrs |= BIT(i); - i++; - } - - if (!data_opts->kill_ptrs && - (!nr_good || nr_good >= replicas)) - return false; - - data_opts->target = 0; - data_opts->extra_replicas = replicas - nr_good; - data_opts->btree_insert_flags = 0; - return true; -} - -static bool migrate_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_ioctl_data *op = arg; - unsigned i = 0; - - data_opts->rewrite_ptrs = 0; - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == op->migrate.dev) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - - return data_opts->rewrite_ptrs != 0; -} - -static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - -/* - * Ancient versions of bcachefs produced packed formats which could represent - * keys that the in memory format cannot represent; this checks for those - * formats so we can get rid of them. 
- */ -static bool bformat_needs_redo(struct bkey_format *f) -{ - for (unsigned i = 0; i < f->nr_fields; i++) - if (bch2_bkey_format_field_overflows(f, i)) - return true; - - return false; -} - -static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - if (b->version_ondisk != c->sb.version || - btree_node_need_rewrite(b) || - bformat_needs_redo(&b->format)) { - data_opts->target = 0; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - return true; - } - - return false; -} - -int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) -{ - int ret; - - ret = bch2_move_btree(c, - BBPOS_MIN, - BBPOS_MAX, - rewrite_old_nodes_pred, c, stats); - if (!ret) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - c->disk_sb.sb->version_min = c->disk_sb.sb->version; - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - - bch_err_fn(c, ret); - return ret; -} - -static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - unsigned durability = bch2_bkey_durability(c, k); - unsigned replicas = bkey_is_btree_ptr(k.k) - ? c->opts.metadata_replicas - : io_opts->data_replicas; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned i = 0; - - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - unsigned d = bch2_extent_ptr_durability(c, &p); - - if (d && durability - d >= replicas) { - data_opts->kill_ptrs |= BIT(i); - durability -= d; - } - - i++; - } - - return data_opts->kill_ptrs != 0; -} - -static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), - io_opts, data_opts); -} - -static bool scrub_pred(struct bch_fs *c, void *_arg, - enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_ioctl_data *arg = _arg; - - if (k.k->type != KEY_TYPE_btree_ptr_v2) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == arg->migrate.dev) { - if (!p.crc.csum_type) - return false; - break; - } - } - - data_opts->scrub = true; - data_opts->read_dev = arg->migrate.dev; - return true; -} - -int bch2_data_job(struct bch_fs *c, - struct bch_move_stats *stats, - struct bch_ioctl_data op) -{ - struct bbpos start = BBPOS(op.start_btree, op.start_pos); - struct bbpos end = BBPOS(op.end_btree, op.end_pos); - int ret = 0; - - if (op.op >= BCH_DATA_OP_NR) - return -EINVAL; - - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); - - switch (op.op) { - case BCH_DATA_OP_scrub: - /* - * prevent tests from spuriously failing, make sure we see all - * btree nodes that need to be repaired - */ - bch2_btree_interior_updates_flush(c); - - ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, - op.scrub.data_types, - NULL, - stats, - writepoint_hashed((unsigned long) current), - false, - scrub_pred, &op) ?: ret; - break; - - case BCH_DATA_OP_rereplicate: - stats->data_type = BCH_DATA_journal; 
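/*
 * Illustrative sketch, not part of this patch: bch2_data_job() is driven
 * from the data-job ioctl path, so a kernel-side caller would exercise the
 * rereplicate case here roughly as below (the helper name is hypothetical;
 * error handling elided).  The case itself flushes journal pins, walks
 * btree nodes, then user data, then garbage-collects the replicas table.
 */
static int example_rereplicate_all(struct bch_fs *c)
{
	struct bch_move_stats stats;
	struct bch_ioctl_data op = {
		.op		= BCH_DATA_OP_rereplicate,
		.start_btree	= 0,
		.start_pos	= POS_MIN,
		.end_btree	= BTREE_ID_NR,
		.end_pos	= POS_MAX,
	};

	/* bch2_data_job() initializes @stats itself via bch2_move_stats_init() */
	return bch2_data_job(c, &stats, op);
}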
- ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, start, end, - rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - rereplicate_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_migrate: - if (op.migrate.dev >= c->sb.nr_devices) - return -EINVAL; - - stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, - ~0, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; - bch2_btree_interior_updates_flush(c); - ret = bch2_replicas_gc2(c) ?: ret; - break; - case BCH_DATA_OP_rewrite_old_nodes: - ret = bch2_scan_old_btree_nodes(c, stats); - break; - case BCH_DATA_OP_drop_extra_replicas: - ret = bch2_move_btree(c, start, end, - drop_extra_replicas_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, NULL, stats, - writepoint_hashed((unsigned long) current), - true, - drop_extra_replicas_pred, c) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - break; - default: - ret = -EINVAL; - } - - bch2_move_stats_exit(stats, c); - return ret; -} - -void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) -{ - prt_printf(out, "%s: data type==", stats->name); - bch2_prt_data_type(out, stats->data_type); - prt_str(out, " pos="); - bch2_bbpos_to_text(out, stats->pos); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); - prt_newline(out); - - prt_printf(out, "bytes moved:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); - prt_newline(out); - - prt_printf(out, "bytes raced:\t"); - prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); - prt_newline(out); - - printbuf_indent_sub(out, 2); -} - -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - bch2_move_stats_to_text(out, ctxt->stats); - printbuf_indent_add(out, 2); - - prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->read_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->read_sectors), - c->opts.move_bytes_in_flight >> 9); - - prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", - atomic_read(&ctxt->write_ios), - c->opts.move_ios_in_flight, - atomic_read(&ctxt->write_sectors), - c->opts.move_bytes_in_flight >> 9); - - printbuf_indent_add(out, 2); - - mutex_lock(&ctxt->lock); - struct moving_io *io; - list_for_each_entry(io, &ctxt->ios, io_list) - bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); - - printbuf_indent_sub(out, 4); -} - -void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct moving_context *ctxt; - - mutex_lock(&c->moving_context_lock); - list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, c, ctxt); - mutex_unlock(&c->moving_context_lock); -} - -void bch2_fs_move_init(struct bch_fs *c) -{ - INIT_LIST_HEAD(&c->moving_context_list); - mutex_init(&c->moving_context_lock); -} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h deleted file mode 100644 
index 86b80499ac55f2..00000000000000 --- a/fs/bcachefs/move.h +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVE_H -#define _BCACHEFS_MOVE_H - -#include "bbpos.h" -#include "bcachefs_ioctl.h" -#include "btree_iter.h" -#include "buckets.h" -#include "data_update.h" -#include "move_types.h" - -struct bch_read_bio; - -struct moving_context { - struct btree_trans *trans; - struct list_head list; - void *fn; - - struct bch_ratelimit *rate; - struct bch_move_stats *stats; - struct write_point_specifier wp; - bool wait_on_copygc; - bool write_error; - - /* For waiting on outstanding reads and writes: */ - struct closure cl; - - struct mutex lock; - struct list_head reads; - struct list_head ios; - - /* in flight sectors: */ - atomic_t read_sectors; - atomic_t write_sectors; - atomic_t read_ios; - atomic_t write_ios; - - wait_queue_head_t wait; -}; - -#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \ -({ \ - int _ret = 0; \ - while (true) { \ - bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - bch2_trans_unlock_long((_ctxt)->trans); \ - _ret = __wait_event_timeout((_ctxt)->wait, \ - bch2_moving_ctxt_next_pending_write(_ctxt) || \ - (cond_finished = (_cond)), _timeout); \ - if (_ret || ( cond_finished)) \ - break; \ - } \ - _ret; \ -}) - -#define move_ctxt_wait_event(_ctxt, _cond) \ -do { \ - bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt); \ - \ - if (_cond) \ - break; \ - bch2_trans_unlock_long((_ctxt)->trans); \ - __wait_event((_ctxt)->wait, \ - bch2_moving_ctxt_next_pending_write(_ctxt) || \ - (cond_finished = (_cond))); \ - if (cond_finished) \ - break; \ -} while (1) - -typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); - -extern const char * const bch2_data_ops_strs[]; - -void bch2_moving_ctxt_exit(struct moving_context *); -void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, - struct bch_ratelimit *, struct bch_move_stats *, - struct write_point_specifier, bool); -struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -void bch2_moving_ctxt_do_pending_writes(struct moving_context *); -void bch2_moving_ctxt_flush_all(struct moving_context *); -void bch2_move_ctxt_wait_for_io(struct moving_context *); -int bch2_move_ratelimit(struct moving_context *); - -/* Inodes in different snapshots may have different IO options: */ -struct snapshot_io_opts_entry { - u32 snapshot; - struct bch_io_opts io_opts; -}; - -struct per_snapshot_io_opts { - u64 cur_inum; - struct bch_io_opts fs_io_opts; - DARRAY(struct snapshot_io_opts_entry) d; -}; - -static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) -{ - memset(io_opts, 0, sizeof(*io_opts)); - io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); -} - -static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) -{ - darray_exit(&io_opts->d); -} - -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, - struct btree_iter *, struct bkey_s_c); - -int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); - -int bch2_move_extent(struct moving_context *, - struct move_bucket *, - struct btree_iter *, - struct bkey_s_c, - struct bch_io_opts, - struct data_update_opts); - -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bpos, - struct 
btree_iter *, struct bkey_s_c); - -int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, - move_pred_fn, void *, enum btree_id, unsigned); -int __bch2_move_data(struct moving_context *, - struct bbpos, - struct bbpos, - move_pred_fn, void *); -int bch2_move_data(struct bch_fs *, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool, - move_pred_fn, void *); - -int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, - struct bch_ratelimit *, struct bch_move_stats *, - struct write_point_specifier, bool, - move_pred_fn, void *); - -int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket *, - struct bpos, int, - struct data_update_opts); -int bch2_data_job(struct bch_fs *, - struct bch_move_stats *, - struct bch_ioctl_data); - -void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); -void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, const char *); - -void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_move_init(struct bch_fs *); - -#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h deleted file mode 100644 index c5c62cd600de1c..00000000000000 --- a/fs/bcachefs/move_types.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVE_TYPES_H -#define _BCACHEFS_MOVE_TYPES_H - -#include "bbpos_types.h" -#include "bcachefs_ioctl.h" - -struct bch_move_stats { - char name[32]; - bool phys; - enum bch_ioctl_data_event_ret ret; - - union { - struct { - enum bch_data_type data_type; - struct bbpos pos; - }; - struct { - unsigned dev; - u64 offset; - }; - }; - - atomic64_t keys_moved; - atomic64_t keys_raced; - atomic64_t sectors_seen; - atomic64_t sectors_moved; - atomic64_t sectors_raced; - atomic64_t sectors_error_corrected; - atomic64_t sectors_error_uncorrected; -}; - -struct move_bucket_key { - struct bpos bucket; - unsigned gen; -}; - -struct move_bucket { - struct move_bucket *next; - struct rhash_head hash; - struct move_bucket_key k; - unsigned sectors; - atomic_t count; -}; - -#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c deleted file mode 100644 index 5e6de91a87630d..00000000000000 --- a/fs/bcachefs/movinggc.c +++ /dev/null @@ -1,476 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Moving/copying garbage collector - * - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "backpointers.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "errcode.h" -#include "error.h" -#include "lru.h" -#include "move.h" -#include "movinggc.h" -#include "trace.h" - -#include -#include -#include -#include -#include - -struct buckets_in_flight { - struct rhashtable *table; - struct move_bucket *first; - struct move_bucket *last; - size_t nr; - size_t sectors; - - DARRAY(struct move_bucket *) to_evacuate; -}; - -static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket, hash), - .key_offset = offsetof(struct move_bucket, k), - .key_len = sizeof(struct move_bucket_key), - .automatic_shrinking = true, -}; - -static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) -{ - if (!list->first) - list->first = b; - else - list->last->next = b; - - list->last = b; - list->nr++; - list->sectors += b->sectors; -} - -static int bch2_bucket_is_movable(struct btree_trans *trans, - struct move_bucket *b, u64 time) -{ - struct bch_fs *c = trans->c; - - if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) - return 0; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p); - if (!ca) - goto out; - - if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) - goto out; - - if (ca->mi.state != BCH_MEMBER_STATE_rw || - !bch2_dev_is_online(ca)) - goto out; - - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - b->k.gen = a->gen; - b->sectors = bch2_bucket_sectors_dirty(*a); - u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - - ret = lru_idx && lru_idx <= time; -out: - bch2_dev_put(ca); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static void move_bucket_free(struct buckets_in_flight *list, - struct move_bucket *b) -{ - int ret = rhashtable_remove_fast(list->table, &b->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(b); -} - -static void move_buckets_wait(struct moving_context *ctxt, - struct buckets_in_flight *list, - bool flush) -{ - struct move_bucket *i; - - while ((i = list->first)) { - if (flush) - move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); - - if (atomic_read(&i->count)) - break; - - list->first = i->next; - if (!list->first) - list->last = NULL; - - list->nr--; - list->sectors -= i->sectors; - - move_bucket_free(list, i); - } - - bch2_trans_unlock_long(ctxt->trans); -} - -static bool bucket_in_flight(struct buckets_in_flight *list, - struct move_bucket_key k) -{ - return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); -} - -static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); - size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; - int ret; - - move_buckets_wait(ctxt, buckets_in_flight, false); - - ret = bch2_btree_write_buffer_tryflush(trans); - if (bch2_err_matches(ret, EROFS)) - return ret; - - if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) - return 
ret; - - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), - lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), - 0, k, ({ - struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret2 = 0; - - saw++; - - ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); - if (ret2 < 0) - goto err; - - if (!ret2) - not_movable++; - else if (bucket_in_flight(buckets_in_flight, b.k)) - in_flight++; - else { - struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); - ret2 = b_i ? 0 : -ENOMEM; - if (ret2) - goto err; - - *b_i = b; - - ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); - if (ret2) { - kfree(b_i); - goto err; - } - - ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, - bch_move_bucket_params); - BUG_ON(ret2); - - sectors += b.sectors; - } - - ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; -err: - ret2; - })); - - pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", - buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); - - return ret < 0 ? ret : 0; -} - -noinline -static int bch2_copygc(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - bool *did_work) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct data_update_opts data_opts = { - .btree_insert_flags = BCH_WATERMARK_copygc, - }; - u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); - u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); - int ret = 0; - - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); - if (ret) - goto err; - - darray_for_each(buckets_in_flight->to_evacuate, i) { - if (kthread_should_stop() || freezing(current)) - break; - - struct move_bucket *b = *i; - *i = NULL; - - move_bucket_in_flight_add(buckets_in_flight, b); - - ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); - if (ret) - goto err; - - *did_work = true; - } -err: - /* no entries in LRU btree found, or got to end: */ - if (bch2_err_matches(ret, ENOENT)) - ret = 0; - - if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "from bch2_move_data()"); - - sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; - sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - - darray_for_each(buckets_in_flight->to_evacuate, i) - if (*i) - move_bucket_free(buckets_in_flight, *i); - darray_exit(&buckets_in_flight->to_evacuate); - return ret; -} - -static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) -{ - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - s64 fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - return max(0LL, fragmented_allowed - fragmented); -} - -/* - * Copygc runs when the amount of fragmented data is above some arbitrary - * threshold: - * - * The threshold at the limit - when the device is full - is the amount of space - * we reserved in bch2_recalc_capacity; we can't have more 
than that amount of - * disk space stranded due to fragmentation and store everything we have - * promised to store. - * - * But we don't want to be running copygc unnecessarily when the device still - * has plenty of free space - rather, we want copygc to smoothly run every so - * often and continually reduce the amount of fragmented space as the device - * fills up. So, we increase the threshold by half the current free space. - */ -u64 bch2_copygc_wait_amount(struct bch_fs *c) -{ - u64 wait = U64_MAX; - - guard(rcu)(); - for_each_rw_member_rcu(c, ca) - wait = min(wait, bch2_copygc_dev_wait_amount(ca)); - return wait; -} - -void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 32); - prt_printf(out, "running:\t%u\n", c->copygc_running); - prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); - prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); - - prt_printf(out, "Currently waiting for:\t"); - prt_human_readable_u64(out, max(0LL, c->copygc_wait - - atomic64_read(&c->io_clock[WRITE].now)) << 9); - prt_newline(out); - - prt_printf(out, "Currently waiting since:\t"); - prt_human_readable_u64(out, max(0LL, - atomic64_read(&c->io_clock[WRITE].now) - - c->copygc_wait_at) << 9); - prt_newline(out); - - bch2_printbuf_make_room(out, 4096); - - struct task_struct *t; - out->atomic++; - scoped_guard(rcu) { - prt_printf(out, "Currently calculated wait:\n"); - for_each_rw_member_rcu(c, ca) { - prt_printf(out, " %s:\t", ca->name); - prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); - prt_newline(out); - } - - t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); - } - --out->atomic; - - if (t) { - bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); - put_task_struct(t); - } -} - -static int bch2_copygc_thread(void *arg) -{ - struct bch_fs *c = arg; - struct moving_context ctxt; - struct bch_move_stats move_stats; - struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight buckets = {}; - u64 last, wait; - - buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); - int ret = !buckets.table - ? -ENOMEM - : rhashtable_init(buckets.table, &bch_move_bucket_params); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) - goto err; - - set_freezable(); - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. 
- */ - kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || - kthread_should_stop()); - - bch2_move_stats_init(&move_stats, "copygc"); - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), - false); - - while (!ret && !kthread_should_stop()) { - bool did_work = false; - - bch2_trans_unlock_long(ctxt.trans); - cond_resched(); - - if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, &buckets, true); - kthread_wait_freezable(c->opts.copygc_enabled || - kthread_should_stop()); - } - - if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, &buckets, true); - __refrigerator(false); - continue; - } - - last = atomic64_read(&clock->now); - wait = bch2_copygc_wait_amount(c); - - if (wait > clock->max_slop) { - c->copygc_wait_at = last; - c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, &buckets, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); - bch2_kthread_io_clock_wait(clock, last + wait, - MAX_SCHEDULE_TIMEOUT); - continue; - } - - c->copygc_wait = 0; - - c->copygc_running = true; - ret = bch2_copygc(&ctxt, &buckets, &did_work); - c->copygc_running = false; - - wake_up(&c->copygc_running_wq); - - if (!wait && !did_work) { - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - move_buckets_wait(&ctxt, &buckets, true); - bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), - MAX_SCHEDULE_TIMEOUT); - } - } - - move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(buckets.table); - bch2_moving_ctxt_exit(&ctxt); - bch2_move_stats_exit(&move_stats, c); -err: - kfree(buckets.table); - return ret; -} - -void bch2_copygc_stop(struct bch_fs *c) -{ - if (c->copygc_thread) { - kthread_stop(c->copygc_thread); - put_task_struct(c->copygc_thread); - } - c->copygc_thread = NULL; -} - -int bch2_copygc_start(struct bch_fs *c) -{ - struct task_struct *t; - int ret; - - if (c->copygc_thread) - return 0; - - if (c->opts.nochanges) - return 0; - - if (bch2_fs_init_fault("copygc_start")) - return -ENOMEM; - - t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - ret = PTR_ERR_OR_ZERO(t); - bch_err_msg(c, ret, "creating copygc thread"); - if (ret) - return ret; - - get_task_struct(t); - - c->copygc_thread = t; - wake_up_process(c->copygc_thread); - - return 0; -} - -void bch2_fs_copygc_init(struct bch_fs *c) -{ - init_waitqueue_head(&c->copygc_running_wq); - c->copygc_running = false; -} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h deleted file mode 100644 index f615910d6f9836..00000000000000 --- a/fs/bcachefs/movinggc.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_MOVINGGC_H -#define _BCACHEFS_MOVINGGC_H - -u64 bch2_copygc_wait_amount(struct bch_fs *); -void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); - -static inline void bch2_copygc_wakeup(struct bch_fs *c) -{ - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->copygc_thread); - if (p) - wake_up_process(p); -} - -void bch2_copygc_stop(struct bch_fs *); -int bch2_copygc_start(struct bch_fs *); -void bch2_fs_copygc_init(struct bch_fs *); - -#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c deleted file mode 100644 index c3f87c59922d1a..00000000000000 --- a/fs/bcachefs/namei.c +++ /dev/null @@ -1,1034 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "btree_update.h" 
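/*
 * Illustrative sketch, not part of this patch: bch2_create_trans() below
 * only builds the btree updates for a create; callers are expected to wrap
 * it in a transaction commit loop.  Assuming the bch2_trans_run()/commit_do()
 * helpers from btree_update.h, a directory create looks roughly like this
 * (the helper name is hypothetical; ACLs and VFS locking elided):
 */
static int example_mkdir(struct bch_fs *c, subvol_inum dir,
			 const struct qstr *name)
{
	struct bch_inode_unpacked dir_u, new_u;

	/* retries on transaction restart, then commits the create */
	return bch2_trans_run(c,
		commit_do(trans, NULL, NULL, 0,
			  bch2_create_trans(trans, dir, &dir_u, &new_u,
					    name, 0, 0, S_IFDIR|0755, 0,
					    NULL, NULL,
					    (subvol_inum) { 0 }, 0)));
}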
-#include "dirent.h" -#include "inode.h" -#include "namei.h" -#include "subvolume.h" -#include "xattr.h" - -#include - -static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) -{ - return (subvol_inum) { - .subvol = inode->bi_parent_subvol ?: inum.subvol, - .inum = inode->bi_dir, - }; -} - -static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) -{ - return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; -} - -int bch2_create_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *new_inode, - const struct qstr *name, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct posix_acl *default_acl, - struct posix_acl *acl, - subvol_inum snapshot_src, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - subvol_inum new_inum = dir; - u64 now = bch2_current_time(c); - u64 cpu = raw_smp_processor_id(); - u64 dir_target; - u32 snapshot; - unsigned dir_type = mode_to_type(mode); - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - goto err; - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); - - if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_unlinked; - - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); - if (ret) - goto err; - - snapshot_src = (subvol_inum) { 0 }; - } else { - /* - * Creating a snapshot - we're not allocating a new inode, but - * we do have to lookup the root inode of the subvolume we're - * snapshotting and update it (in the new snapshot): - */ - - if (!snapshot_src.inum) { - /* Inode wasn't specified, just snapshot: */ - struct bch_subvolume s; - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); - if (ret) - goto err; - - snapshot_src.inum = le64_to_cpu(s.inode); - } - - ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_intent); - if (ret) - goto err; - - if (new_inode->bi_subvol != snapshot_src.subvol) { - /* Not a subvolume root: */ - ret = -EINVAL; - goto err; - } - - /* - * If we're not root, we have to own the subvolume being - * snapshotted: - */ - if (uid && new_inode->bi_uid != uid) { - ret = -EPERM; - goto err; - } - - flags |= BCH_CREATE_SUBVOL; - } - - new_inum.inum = new_inode->bi_inum; - dir_target = new_inode->bi_inum; - - if (flags & BCH_CREATE_SUBVOL) { - u32 new_subvol, dir_snapshot; - - ret = bch2_subvolume_create(trans, new_inode->bi_inum, - dir.subvol, - snapshot_src.subvol, - &new_subvol, &snapshot, - (flags & BCH_CREATE_SNAPSHOT_RO) != 0); - if (ret) - goto err; - - new_inode->bi_parent_subvol = dir.subvol; - new_inode->bi_subvol = new_subvol; - new_inum.subvol = new_subvol; - dir_target = new_subvol; - dir_type = DT_SUBVOL; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); - if (ret) - goto err; - - bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(trans, &dir_iter); - if (ret) - goto err; - } - - if (!(flags & BCH_CREATE_SNAPSHOT)) { - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - default_acl, ACL_TYPE_DEFAULT); - if (ret) - goto err; - } - - if (acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - acl, 
ACL_TYPE_ACCESS); - if (ret) - goto err; - } - } - - if (!(flags & BCH_CREATE_TMPFILE)) { - struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - u64 dir_offset; - - if (is_subdir_for_nlink(new_inode)) - dir_u->bi_nlink++; - dir_u->bi_mtime = dir_u->bi_ctime = now; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates) ?: - bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } - - if (S_ISDIR(mode)) { - ret = bch2_maybe_propagate_has_case_insensitive(trans, - (subvol_inum) { - new_inode->bi_subvol ?: dir.subvol, - new_inode->bi_inum }, - new_inode); - if (ret) - goto err; - } - - if (S_ISDIR(mode) && - !new_inode->bi_subvol) - new_inode->bi_depth = dir_u->bi_depth + 1; - - inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - - ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: - bch2_inode_write(trans, &inode_iter, new_inode); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -int bch2_link_trans(struct btree_trans *trans, - subvol_inum dir, struct bch_inode_unpacked *dir_u, - subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct qstr *name) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - u64 now = bch2_current_time(c); - u64 dir_offset = 0; - int ret; - - if (dir.subvol != inum.subvol) - return -EXDEV; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - inode_u->bi_ctime = now; - ret = bch2_inode_nlink_inc(inode_u); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - if (bch2_reinherit_attrs(inode_u, dir_u)) { - ret = -EXDEV; - goto err; - } - - dir_u->bi_mtime = dir_u->bi_ctime = now; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_create(trans, dir, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum.inum, - &dir_offset, - STR_HASH_must_create); - if (ret) - goto err; - - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - - ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &dir_iter); - bch2_trans_iter_exit(trans, &inode_iter); - return ret; -} - -int bch2_unlink_trans(struct btree_trans *trans, - subvol_inum dir, - struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, - const struct qstr *name, - bool deleting_subvol) -{ - struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter dirent_iter = {}; - struct btree_iter inode_iter = {}; - struct bch_hash_info dir_hash; - subvol_inum inum; - u64 now = bch2_current_time(c); - struct bkey_s_c k; - int ret; - - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); - if (ret) - goto err; - - dir_hash = bch2_hash_info_init(c, dir_u); - - ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_intent); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, inum); - if (ret) - goto err; - } - - if 
(deleting_subvol && !inode_u->bi_subvol) { - ret = bch_err_throw(c, ENOENT_not_subvol); - goto err; - } - - if (inode_u->bi_subvol) { - /* Recursive subvolume destroy not allowed (yet?) */ - ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); - if (ret) - goto err; - } - - if (deleting_subvol || inode_u->bi_subvol) { - ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(trans, &dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * If we're deleting a subvolume, we need to really delete the - * dirent, not just emit a whiteout in the current snapshot: - */ - bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dirent_iter); - if (ret) - goto err; - } else { - bch2_inode_nlink_dec(trans, inode_u); - } - - if (inode_u->bi_dir == dirent_iter.pos.inode && - inode_u->bi_dir_offset == dirent_iter.pos.offset) { - inode_u->bi_dir = 0; - inode_u->bi_dir_offset = 0; - } - - dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash, &dirent_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_inode_write(trans, &dir_iter, dir_u) ?: - bch2_inode_write(trans, &inode_iter, inode_u); -err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dirent_iter); - bch2_trans_iter_exit(trans, &dir_iter); - return ret; -} - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, - struct bch_inode_unpacked *src_u) -{ - u64 src, dst; - unsigned id; - bool ret = false; - - for (id = 0; id < Inode_opt_nr; id++) { - if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) - continue; - - /* Skip attributes that were explicitly set on this inode */ - if (dst_u->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(src_u, id); - dst = bch2_inode_opt_get(dst_u, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(dst_u, id, src); - ret = true; - } - - return ret; -} - -static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) -{ - struct btree_iter iter; - struct bkey_i_subvolume *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached, subvolume); - int ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - s->v.fs_path_parent = cpu_to_le32(new_parent); - bch2_trans_iter_exit(trans, &iter); - return 0; -} - -int bch2_rename_trans(struct btree_trans *trans, - subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, - subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, - struct bch_inode_unpacked *src_inode_u, - struct bch_inode_unpacked *dst_inode_u, - const struct qstr *src_name, - const struct qstr *dst_name, - enum bch_rename_mode mode) -{ - struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = {}; - struct btree_iter dst_dir_iter = {}; - struct btree_iter src_inode_iter = {}; - struct btree_iter dst_inode_iter = {}; - struct bch_hash_info src_hash, dst_hash; - subvol_inum src_inum, dst_inum; - u64 src_offset, dst_offset; - u64 now = bch2_current_time(c); - int ret; - - ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - src_hash = bch2_hash_info_init(c, src_dir_u); - - if (!subvol_inum_eq(dst_dir, src_dir)) { - ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_intent); - if (ret) - goto err; - - dst_hash = 
bch2_hash_info_init(c, dst_dir_u); - } else { - dst_dir_u = src_dir_u; - dst_hash = src_hash; - } - - ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, - src_name, &src_inum, &src_offset, - dst_name, &dst_inum, &dst_offset, - mode); - if (ret) - goto err; - - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_intent); - if (ret) - goto err; - - if (dst_inum.inum) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_intent); - if (ret) - goto err; - } - - if (src_inode_u->bi_subvol && - dst_dir.subvol != src_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); - if (ret) - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - dst_inode_u->bi_subvol && - src_dir.subvol != dst_inode_u->bi_parent_subvol) { - ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); - if (ret) - goto err; - } - - /* Can't move across subvolumes, unless it's a subvolume root: */ - if (src_dir.subvol != dst_dir.subvol && - (!src_inode_u->bi_subvol || - (dst_inum.inum && !dst_inode_u->bi_subvol))) { - ret = -EXDEV; - goto err; - } - - if (src_inode_u->bi_parent_subvol) - src_inode_u->bi_parent_subvol = dst_dir.subvol; - - if ((mode == BCH_RENAME_EXCHANGE) && - dst_inode_u->bi_parent_subvol) - dst_inode_u->bi_parent_subvol = src_dir.subvol; - - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; - - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } - - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } - - if (mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(src_inode_u->bi_mode) != - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -ENOTDIR; - goto err; - } - - if (S_ISDIR(dst_inode_u->bi_mode)) { - ret = bch2_empty_dir_trans(trans, dst_inum); - if (ret) - goto err; - } - } - - if (!subvol_inum_eq(dst_dir, src_dir)) { - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } - - ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: - (mode == BCH_RENAME_EXCHANGE - ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) - : 0); - if (ret) - goto err; - - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } - - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; - - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; - } - - if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { - dst_dir_u->bi_nlink--; - src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; - } - - if (mode == BCH_RENAME_OVERWRITE) - bch2_inode_nlink_dec(trans, dst_inode_u); - - src_dir_u->bi_mtime = now; - src_dir_u->bi_ctime = now; - - if (src_dir.inum != dst_dir.inum) { - dst_dir_u->bi_mtime = now; - dst_dir_u->bi_ctime = now; - } - - src_inode_u->bi_ctime = now; - - if (dst_inum.inum) - dst_inode_u->bi_ctime = now; - - ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir.inum != dst_dir.inum - ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) - : 0) ?: - bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inum.inum - ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) - : 0); -err: - bch2_trans_iter_exit(trans, &dst_inode_iter); - bch2_trans_iter_exit(trans, &src_inode_iter); - bch2_trans_iter_exit(trans, &dst_dir_iter); - bch2_trans_iter_exit(trans, &src_dir_iter); - return ret; -} - -/* inum_to_path */ - -static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - b += n; - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = *((char *) --b); - - printbuf_nul_terminate(out); -} - -static inline void prt_str_reversed(struct printbuf *out, const char *s) -{ - prt_bytes_reversed(out, s, strlen(s)); -} - -static inline void reverse_bytes(void *b, size_t n) -{ - char *e = b + n, *s = b; - - while (s < e) { - --e; - swap(*s, *e); - s++; - } -} - -static int __bch2_inum_to_path(struct btree_trans *trans, - u32 subvol, u64 inum, u32 snapshot, - struct printbuf *path) -{ - unsigned orig_pos = path->pos; - int ret = 0; - DARRAY(subvol_inum) inums = {}; - - if (!snapshot) { - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); - if (ret) - goto disconnected; - } - - while (true) { - subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; - - if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { - prt_str_reversed(path, "(loop)"); - break; - } - - ret = darray_push(&inums, n); - if (ret) - goto err; - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); - if (ret) - goto disconnected; - - if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && - inode.bi_inum == BCACHEFS_ROOT_INO) - break; - - if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); - goto disconnected; - } - - inum = inode.bi_dir; - if (inode.bi_parent_subvol) { - subvol = inode.bi_parent_subvol; - ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); - if (ret) - goto disconnected; - } - - struct btree_iter d_iter; - struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, - BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), - 0, dirent); - ret = bkey_err(d.s_c); - if (ret) - goto disconnected; - - struct qstr dirent_name = bch2_dirent_get_name(d); - - 
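An aside on the helpers above: __bch2_inum_to_path() walks dirent backpointers leaf-to-root, so each path component is emitted byte-reversed into the buffer, and the whole buffer is reversed once at the end to yield a root-first path. A minimal standalone sketch of that technique (hypothetical names, plain libc rather than the printbuf API, not part of this patch):

#include <stdio.h>
#include <string.h>

/* Append s to buf byte-reversed, as prt_bytes_reversed() does. */
static void append_reversed(char *buf, size_t *pos, const char *s)
{
	for (size_t n = strlen(s); n--;)
		buf[(*pos)++] = s[n];
}

static void reverse_buf(char *b, size_t n)
{
	for (size_t i = 0; i < n / 2; i++) {
		char tmp = b[i];
		b[i] = b[n - 1 - i];
		b[n - 1 - i] = tmp;
	}
}

int main(void)
{
	/* Walking dirents upward yields "leaf", then "dir", then "root": */
	const char *components[] = { "leaf", "dir", "root" };
	char path[64];
	size_t pos = 0;

	for (int i = 0; i < 3; i++) {
		append_reversed(path, &pos, components[i]);
		path[pos++] = '/';	/* reversed, this lands before the name */
	}
	reverse_buf(path, pos);
	path[pos] = '\0';
	printf("%s\n", path);	/* prints "/root/dir/leaf" */
	return 0;
}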
prt_bytes_reversed(path, dirent_name.name, dirent_name.len); - - prt_char(path, '/'); - - bch2_trans_iter_exit(trans, &d_iter); - } - - if (orig_pos == path->pos) - prt_char(path, '/'); -out: - ret = path->allocation_failure ? -ENOMEM : 0; - if (ret) - goto err; - - reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); - darray_exit(&inums); - return 0; -err: - darray_exit(&inums); - return ret; -disconnected: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - prt_str_reversed(path, "(disconnected)"); - goto out; -} - -int bch2_inum_to_path(struct btree_trans *trans, - subvol_inum inum, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); -} - -int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, - snapshot_id_list *snapshot_overwrites, - struct printbuf *path) -{ - return __bch2_inum_to_path(trans, 0, inum, snapshot, path); -} - -/* fsck */ - -static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = {}; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!bch2_inode_has_backpointer(target)) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - struct bkey_s_c_dirent bp_dirent = - bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), - 0, dirent); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (!backpointer_exists) { - if (fsck_err(trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - } - } else { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (S_ISDIR(target->bi_mode) || target->bi_subvol) { - /* - * XXX: verify connectivity of the other dirent - * up to the root before removing this one - * - * Additionally, bch2_lookup would need to cope with the - * dirent it found being removed - or should we remove - * the other one, even though the inode points to it? - */ - if (in_fsck) { - if (fsck_err(trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) - ret = bch2_fsck_remove_dirent(trans, d.k->p); - } else { - bch2_fs_inconsistent(c, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? "directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf); - } - - goto out; - } else { - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(!target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } - } - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (ret) - goto err; - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -/* - * BCH_INODE_has_case_insensitive: - * We have to track whether directories have any descendent directory that is - * casefolded - for overlayfs: - */ - -static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) -{ - struct btree_iter iter = {}; - int ret = 0; - - while (true) { - struct bch_inode_unpacked inode; - ret = bch2_inode_peek(trans, &iter, &inode, inum, - BTREE_ITER_intent|BTREE_ITER_with_updates); - if (ret) - break; - - if (inode.bi_flags & BCH_INODE_has_case_insensitive) - break; - - inode.bi_flags |= BCH_INODE_has_case_insensitive; - ret = bch2_inode_write(trans, &iter, &inode); - if (ret) - break; - - bch2_trans_iter_exit(trans, &iter); - if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) - break; - - inum = parent_inum(inum, &inode); - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - if (!bch2_inode_casefold(trans->c, inode)) - return 0; - - inode->bi_flags |= BCH_INODE_has_case_insensitive; - - return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); -} - -int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - 
snapshot_id_list *snapshot_overwrites, - bool *do_update) -{ - struct printbuf buf = PRINTBUF; - bool repairing_parents = false; - int ret = 0; - - if (!S_ISDIR(inode->bi_mode)) { - /* - * Old versions set bi_casefold for non dirs, but that's - * unnecessary and wasteful - */ - if (inode->bi_casefold) { - inode->bi_casefold = 0; - *do_update = true; - } - return 0; - } - - if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) - return 0; - - if (bch2_inode_casefold(trans->c, inode) && - !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", - inode->bi_inum, inode->bi_snapshot); - - ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { - inode->bi_flags |= BCH_INODE_has_case_insensitive; - *do_update = true; - } - } - - if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) - goto out; - - struct bch_inode_unpacked dir = *inode; - u32 snapshot = dir.bi_snapshot; - - while (!(dir.bi_inum == BCACHEFS_ROOT_INO && - dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { - if (dir.bi_parent_subvol) { - ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); - if (ret) - goto err; - - snapshot_overwrites = NULL; - } - - ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); - if (ret) - goto err; - - if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { - prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); - - ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, - snapshot_overwrites, &buf); - if (ret) - goto err; - - if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { - dir.bi_flags |= BCH_INODE_has_case_insensitive; - ret = __bch2_fsck_write_inode(trans, &dir); - if (ret) - goto err; - } - } - - /* - * We only need to check the first parent, unless we find an - * inconsistency - */ - if (!repairing_parents) - break; - } -out: -err: -fsck_err: - printbuf_exit(&buf); - if (ret) - return ret; - - if (repairing_parents) { - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } - - return 0; -} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h deleted file mode 100644 index ae6ebc2d078504..00000000000000 --- a/fs/bcachefs/namei.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NAMEI_H -#define _BCACHEFS_NAMEI_H - -#include "dirent.h" - -struct posix_acl; - -#define BCH_CREATE_TMPFILE (1U << 0) -#define BCH_CREATE_SUBVOL (1U << 1) -#define BCH_CREATE_SNAPSHOT (1U << 2) -#define BCH_CREATE_SNAPSHOT_RO (1U << 3) - -int bch2_create_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, - uid_t, gid_t, umode_t, dev_t, - struct posix_acl *, - struct posix_acl *, - subvol_inum, unsigned); - -int bch2_link_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - const struct qstr *); - -int bch2_unlink_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - const struct qstr *, bool); - -int bch2_rename_trans(struct btree_trans *, - subvol_inum, struct bch_inode_unpacked *, - subvol_inum, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, - 
struct bch_inode_unpacked *, - const struct qstr *, - const struct qstr *, - enum bch_rename_mode); - -bool bch2_reinherit_attrs(struct bch_inode_unpacked *, - struct bch_inode_unpacked *); - -int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, - snapshot_id_list *, struct printbuf *); - -int __bch2_check_dirent_target(struct btree_trans *, - struct btree_iter *, - struct bkey_s_c_dirent, - struct bch_inode_unpacked *, bool); - -static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static inline int bch2_check_dirent_target(struct btree_trans *trans, - struct btree_iter *dirent_iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - bool in_fsck) -{ - if (likely(inode_points_to_dirent(target, d) && - d.v->d_type == inode_d_type(target))) - return 0; - - return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); -} - -int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, - snapshot_id_list *, bool *); - -#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c deleted file mode 100644 index 962218fa68ec01..00000000000000 --- a/fs/bcachefs/nocow_locking.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bkey_methods.h" -#include "nocow_locking.h" -#include "util.h" - -#include - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) - return true; - return false; -} - -#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) - -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - int lock_val = flags ? 1 : -1; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) { - int v = atomic_sub_return(lock_val, &l->l[i]); - - BUG_ON(v && sign(v) != lock_val); - if (!v) - closure_wake_up(&l->wait); - return; - } - - BUG(); -} - -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - int v, lock_val = flags ? 1 : -1; - unsigned i; - - spin_lock(&l->lock); - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (l->b[i] == dev_bucket) - goto got_entry; - - for (i = 0; i < ARRAY_SIZE(l->b); i++) - if (!atomic_read(&l->l[i])) { - l->b[i] = dev_bucket; - goto take_lock; - } -fail: - spin_unlock(&l->lock); - return false; -got_entry: - v = atomic_read(&l->l[i]); - if (lock_val > 0 ? v < 0 : v > 0) - goto fail; -take_lock: - v = atomic_read(&l->l[i]); - /* Overflow? 
*/ - if (v && sign(v + lock_val) != sign(v)) - goto fail; - - atomic_add(lock_val, &l->l[i]); - spin_unlock(&l->lock); - return true; -} - -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) -{ - if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); - u64 start_time = local_clock(); - - __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); - } -} - -void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) - -{ - unsigned i, nr_zero = 0; - struct nocow_lock_bucket *l; - - for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { - unsigned v = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) - v |= atomic_read(&l->l[i]); - - if (!v) { - nr_zero++; - continue; - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); - nr_zero = 0; - - for (i = 0; i < ARRAY_SIZE(l->l); i++) { - int v = atomic_read(&l->l[i]); - if (v) { - bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); - prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v)); - } - } - prt_newline(out); - } - - if (nr_zero) - prt_printf(out, "(%u empty entries)\n", nr_zero); -} - -void bch2_fs_nocow_locking_exit(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) - BUG_ON(atomic_read(&l->l[j])); -} - -void bch2_fs_nocow_locking_init_early(struct bch_fs *c) -{ - struct bucket_nocow_lock_table *t = &c->nocow_locks; - - for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) - spin_lock_init(&l->lock); -} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h deleted file mode 100644 index 48b8a003c0d25a..00000000000000 --- a/fs/bcachefs/nocow_locking.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_H -#define _BCACHEFS_NOCOW_LOCKING_H - -#include "bcachefs.h" -#include "alloc_background.h" -#include "nocow_locking_types.h" - -#include - -static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, - u64 dev_bucket) -{ - unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); - - return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); -} - -#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - -bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, - struct nocow_lock_bucket *, u64, int); - -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); -} - -static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); -} - -void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); - -void 
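The nocow lock table above packs a reader/writer-style lock into a single atomic counter per bucket: positive counts are BUCKET_NOCOW_LOCK_UPDATE holders, negative counts are copy-path holders, and trylock fails on a sign mismatch or on a sign flip from overflow. A reduced sketch of just that counter protocol (hypothetical standalone code using C11 atomics; the real code additionally serializes with l->lock, hashes buckets into a fixed table, and wakes waiters via closures):

#include <stdatomic.h>
#include <stdbool.h>

static int sign(int v)
{
	return (v > 0) - (v < 0);
}

/*
 * lock_val is +1 for an "update" lock, -1 for a "copy" lock;
 * holders with the same sign share the lock, opposite signs conflict.
 */
static bool nocow_trylock(atomic_int *l, int lock_val)
{
	int v = atomic_load(l);

	if (v && sign(v) != sign(lock_val))	/* held by the other side */
		return false;
	if (v && sign(v + lock_val) != sign(v))	/* counter would overflow */
		return false;

	/* Racy as written - the real code holds a spinlock around this: */
	atomic_fetch_add(l, lock_val);
	return true;
}

static void nocow_unlock(atomic_int *l, int lock_val)
{
	atomic_fetch_sub(l, lock_val);	/* wake waiters when this hits 0 */
}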
bch2_fs_nocow_locking_exit(struct bch_fs *); -void bch2_fs_nocow_locking_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h deleted file mode 100644 index bd12bf6779241f..00000000000000 --- a/fs/bcachefs/nocow_locking_types.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H -#define _BCACHEFS_NOCOW_LOCKING_TYPES_H - -#define BUCKET_NOCOW_LOCKS_BITS 10 -#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) - -struct nocow_lock_bucket { - struct closure_waitlist wait; - spinlock_t lock; - u64 b[4]; - atomic_t l[4]; -} __aligned(SMP_CACHE_BYTES); - -struct bucket_nocow_lock_table { - struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; -}; - -#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ - diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c deleted file mode 100644 index b1cf88905b816e..00000000000000 --- a/fs/bcachefs/opts.c +++ /dev/null @@ -1,844 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -#include "bcachefs.h" -#include "compress.h" -#include "disk_groups.h" -#include "error.h" -#include "movinggc.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "super-io.h" -#include "util.h" - -#define x(t, n, ...) [n] = #t, - -const char * const bch2_error_actions[] = { - BCH_ERROR_ACTIONS() - NULL -}; - -const char * const bch2_degraded_actions[] = { - BCH_DEGRADED_ACTIONS() - NULL -}; - -const char * const bch2_fsck_fix_opts[] = { - BCH_FIX_ERRORS_OPTS() - NULL -}; - -const char * const bch2_version_upgrade_opts[] = { - BCH_VERSION_UPGRADE_OPTS() - NULL -}; - -const char * const bch2_sb_features[] = { - BCH_SB_FEATURES() - NULL -}; - -const char * const bch2_sb_compat[] = { - BCH_SB_COMPAT() - NULL -}; - -const char * const __bch2_btree_ids[] = { - BCH_BTREE_IDS() - NULL -}; - -const char * const __bch2_csum_types[] = { - BCH_CSUM_TYPES() - NULL -}; - -const char * const __bch2_csum_opts[] = { - BCH_CSUM_OPTS() - NULL -}; - -const char * const __bch2_compression_types[] = { - BCH_COMPRESSION_TYPES() - NULL -}; - -const char * const bch2_compression_opts[] = { - BCH_COMPRESSION_OPTS() - NULL -}; - -const char * const __bch2_str_hash_types[] = { - BCH_STR_HASH_TYPES() - NULL -}; - -const char * const bch2_str_hash_opts[] = { - BCH_STR_HASH_OPTS() - NULL -}; - -const char * const __bch2_data_types[] = { - BCH_DATA_TYPES() - NULL -}; - -const char * const bch2_member_states[] = { - BCH_MEMBER_STATES() - NULL -}; - -static const char * const __bch2_jset_entry_types[] = { - BCH_JSET_ENTRY_TYPES() - NULL -}; - -static const char * const __bch2_fs_usage_types[] = { - BCH_FS_USAGE_TYPES() - NULL -}; - -#undef x - -static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], - unsigned nr, const char *type, unsigned idx) -{ - if (idx < nr) - prt_str(out, opts[idx]); - else - prt_printf(out, "(unknown %s %u)", type, idx); -} - -#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ -void bch2_prt_##name(struct printbuf *out, type t) \ -{ \ - prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ -} - -PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); -PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); -PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); -PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); -PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); 
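The string tables at the top of opts.c rely on the X-macro idiom: each BCH_*() list macro expands a caller-supplied x() once per entry, so a single list definition yields matching enums and name arrays that cannot drift apart. A minimal illustration of the pattern (hypothetical COLORS() list, not from this patch):

#include <stdio.h>

#define COLORS() \
	x(red, 0) \
	x(green, 1) \
	x(blue, 2)

/* Expand once to build the enum... */
#define x(t, n) COLOR_##t = n,
enum color { COLORS() COLOR_NR };
#undef x

/* ...and again, NULL-terminated, to build the matching name table. */
#define x(t, n) [n] = #t,
static const char * const color_names[] = { COLORS() NULL };
#undef x

int main(void)
{
	printf("%s\n", color_names[COLOR_green]);	/* prints "green" */
	return 0;
}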
-PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); -PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); - -static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, - struct printbuf *err) -{ - if (!val) { - *res = FSCK_FIX_yes; - } else { - int ret = match_string(bch2_fsck_fix_opts, -1, val); - - if (ret < 0 && err) - prt_str(err, "fix_errors: invalid selection"); - if (ret < 0) - return ret; - *res = ret; - } - - return 0; -} - -static void bch2_opt_fix_errors_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) -{ - prt_str(out, bch2_fsck_fix_opts[v]); -} - -#define bch2_opt_fix_errors (struct bch_opt_fn) { \ - .parse = bch2_opt_fix_errors_parse, \ - .to_text = bch2_opt_fix_errors_to_text, \ -} - -const char * const bch2_d_types[BCH_DT_MAX] = { - [DT_UNKNOWN] = "unknown", - [DT_FIFO] = "fifo", - [DT_CHR] = "chr", - [DT_DIR] = "dir", - [DT_BLK] = "blk", - [DT_REG] = "reg", - [DT_LNK] = "lnk", - [DT_SOCK] = "sock", - [DT_WHT] = "whiteout", - [DT_SUBVOL] = "subvol", -}; - -void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -{ -#define x(_name, ...) \ - if (opt_defined(src, _name)) \ - opt_set(*dst, _name, src._name); - - BCH_OPTS() -#undef x -} - -bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opt_defined(*opts, _name); - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - return opts->_name; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -{ - switch (id) { -#define x(_name, ...) \ - case Opt_##_name: \ - opt_set(*opts, _name, v); \ - break; - BCH_OPTS() -#undef x - default: - BUG(); - } -} - -/* dummy option, for options that aren't stored in the superblock */ -typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); -typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); -typedef u64 (*member_opt_get_fn)(const struct bch_member *); -typedef void (*member_opt_set_fn)(struct bch_member *, u64); - -__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; -__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; -__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; -__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; - -#define type_compatible_or_null(_p, _type) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) - -const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ - .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ - .choices = _choices -#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = U64_MAX, \ - .choices = _choices -#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ - .choices = _choices -#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn - -#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ - [Opt_##_name] = { \ - .attr.name = #_name, \ - .attr.mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ - .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ - .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ - .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ - _type \ - }, - - BCH_OPTS() -#undef x -}; - -int bch2_opt_lookup(const char *name) -{ - const struct bch_option *i; - - for (i = bch2_opt_table; - i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); - i++) - if (!strcmp(name, i->attr.name)) - return i - bch2_opt_table; - - return -1; -} - -struct opt_synonym { - const char *s1, *s2; -}; - -static const struct opt_synonym bch2_opt_synonyms[] = { - { "quota", "usrquota" }, -}; - -static int bch2_mount_opt_lookup(const char *name) -{ - const struct opt_synonym *i; - - for (i = bch2_opt_synonyms; - i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); - i++) - if (!strcmp(name, i->s1)) - name = i->s2; - - return bch2_opt_lookup(name); -} - -struct opt_val_synonym { - const char *opt, *v1, *v2; -}; - -static const struct opt_val_synonym bch2_opt_val_synonyms[] = { - { "degraded", "true", "yes" }, - { "degraded", "false", "no" }, - { "degraded", "1", "yes" }, - { "degraded", "0", "no" }, -}; - -static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) -{ - const struct opt_val_synonym *i; - - for (i = bch2_opt_val_synonyms; - i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); - i++) - if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) - return i->v2; - - return val; -} - -int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) -{ - if (v < opt->min) { - if (err) - prt_printf(err, "%s: too small (min %llu)", - opt->attr.name, opt->min); - return -BCH_ERR_ERANGE_option_too_small; - } - - if (opt->max && v >= opt->max) { - if (err) - prt_printf(err, "%s: too big (max %llu)", - opt->attr.name, opt->max); - return -BCH_ERR_ERANGE_option_too_big; - } - - if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { - if (err) - prt_printf(err, "%s: not a multiple of 512", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { - if (err) - prt_printf(err, "%s: must be a power of two", - opt->attr.name); - return -BCH_ERR_opt_parse_error; - } - - if (opt->fn.validate) - return opt->fn.validate(v, err); - - return 0; -} - -int bch2_opt_parse(struct bch_fs *c, - const struct bch_option *opt, - const char *val, u64 *res, - struct printbuf *err) -{ - ssize_t ret; - - if (err) - printbuf_indent_add_nextline(err, 2); - - switch (opt->type) { - case BCH_OPT_BOOL: - if (!val) - val = "1"; - - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } - break; - case BCH_OPT_UINT: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - if (*val != '-') { - ret = opt->flags & OPT_HUMAN_READABLE - ? 
bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); - } else { - prt_printf(err, "%s: must be a non-negative number", opt->attr.name); - return -BCH_ERR_option_negative; - } - - if (ret < 0) { - if (err) - prt_printf(err, "%s: must be a number", - opt->attr.name); - return ret; - } - break; - case BCH_OPT_STR: - if (!val) { - prt_printf(err, "%s: required value", - opt->attr.name); - return -EINVAL; - } - - ret = match_string(opt->choices, -1, val); - if (ret < 0) { - if (err) - prt_printf(err, "%s: invalid selection", - opt->attr.name); - return ret; - } - - *res = ret; - break; - case BCH_OPT_BITFIELD: { - s64 v = bch2_read_flag_list(val, opt->choices); - if (v < 0) - return v; - *res = v; - break; - } - case BCH_OPT_FN: - ret = opt->fn.parse(c, val, res, err); - - if (ret == -BCH_ERR_option_needs_open_fs) - return ret; - - if (ret < 0) { - if (err) - prt_printf(err, "%s: parse error", - opt->attr.name); - return ret; - } - } - - return bch2_opt_validate(opt, *res, err); -} - -void bch2_opt_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_sb *sb, - const struct bch_option *opt, u64 v, - unsigned flags) -{ - if (flags & OPT_SHOW_MOUNT_STYLE) { - if (opt->type == BCH_OPT_BOOL) { - prt_printf(out, "%s%s", - v ? "" : "no", - opt->attr.name); - return; - } - - prt_printf(out, "%s=", opt->attr.name); - } - - switch (opt->type) { - case BCH_OPT_BOOL: - case BCH_OPT_UINT: - if (opt->flags & OPT_HUMAN_READABLE) - prt_human_readable_u64(out, v); - else - prt_printf(out, "%lli", v); - break; - case BCH_OPT_STR: - if (v < opt->min || v >= opt->max) - prt_printf(out, "(invalid option %lli)", v); - else if (flags & OPT_SHOW_FULL_LIST) - prt_string_option(out, opt->choices, v); - else - prt_str(out, opt->choices[v]); - break; - case BCH_OPT_BITFIELD: - prt_bitflags(out, opt->choices, v); - break; - case BCH_OPT_FN: - opt->fn.to_text(out, c, sb, v); - break; - default: - BUG(); - } -} - -void bch2_opts_to_text(struct printbuf *out, - struct bch_opts opts, - struct bch_fs *c, struct bch_sb *sb, - unsigned show_mask, unsigned hide_mask, - unsigned flags) -{ - bool first = true; - - for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - - if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) - continue; - - u64 v = bch2_opt_get_by_id(&opts, i); - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - if (!first) - prt_char(out, ','); - first = false; - - bch2_opt_to_text(out, c, sb, opt, v, flags); - } -} - -int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) -{ - int ret = 0; - - switch (id) { - case Opt_state: - if (ca) - return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); - break; - - case Opt_compression: - case Opt_background_compression: - ret = bch2_check_set_has_compressed_data(c, v); - break; - case Opt_erasure_code: - if (v) - bch2_check_set_feature(c, BCH_FEATURE_ec); - break; - default: - break; - } - - return ret; -} - -int bch2_opts_hooks_pre_set(struct bch_fs *c) -{ - for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); - if (ret) - return ret; - } - - return 0; -} - -void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, - struct bch_opts *new_opts, enum bch_opt_id id) -{ - switch (id) { - case Opt_foreground_target: - if (new_opts->foreground_target && - !new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_compression: - if 
(new_opts->compression && - !new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_target: - if (new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_background_compression: - if (new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; - case Opt_rebalance_enabled: - bch2_rebalance_wakeup(c); - break; - case Opt_copygc_enabled: - bch2_copygc_wakeup(c); - break; - case Opt_discard: - if (!ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) { - struct bch_member *m = - bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_DISCARD(m, c->opts.discard); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - break; - case Opt_version_upgrade: - /* - * XXX: in the future we'll likely want to do compatible - * upgrades at runtime as well, but right now there's nothing - * that does that: - */ - if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) - bch2_sb_upgrade_incompat(c); - break; - default: - break; - } -} - -int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, - const char *name, const char *val) -{ - struct printbuf err = PRINTBUF; - u64 v; - int ret, id; - - id = bch2_mount_opt_lookup(name); - - /* Check for the form "noopt", negation of a boolean opt: */ - if (id < 0 && - !val && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - val = "0"; - } - - /* Unknown options are ignored: */ - if (id < 0) - return 0; - - /* must have a value for synonym lookup - but OPT_FN is weird */ - if (!val && bch2_opt_table[id].type != BCH_OPT_FN) - val = "1"; - - val = bch2_opt_val_synonym_lookup(name, val); - - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; - - if ((id == Opt_usrquota || - id == Opt_grpquota) && - !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs) { - ret = 0; - - if (parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) - ret = -ENOMEM; - } - - goto out; - } - - if (ret < 0) - goto bad_val; - - if (opts) - bch2_opt_set_by_id(opts, id, v); - - ret = 0; -out: - printbuf_exit(&err); - return ret; -bad_opt: - ret = -BCH_ERR_option_name; - goto out; -bad_val: - ret = -BCH_ERR_option_value; - goto out; -} - -int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options, - bool ignore_unknown) -{ - char *copied_opts, *copied_opts_start; - char *opt, *name, *val; - int ret = 0; - - if (!options) - return 0; - - /* - * sys_fsconfig() is now occasionally providing us with option lists - * starting with a comma - weird. 
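One detail worth noting in bch2_parse_one_mount_opt(): when a valueless option name fails lookup and starts with "no", it is retried without the prefix with the value forced to "0", which implements classic no-prefixed boolean mount flags without duplicating every entry in the option table. A stripped-down sketch of that lookup order (hypothetical table and option names, not the bcachefs API):

#include <stdbool.h>
#include <string.h>

struct opt { const char *name; bool val; };

static struct opt opts[] = { { "acl", true }, { "discard", true } };

static struct opt *opt_lookup(const char *name)
{
	for (size_t i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
		if (!strcmp(opts[i].name, name))
			return &opts[i];
	return NULL;
}

/* Accepts "acl", "noacl", "discard", "nodiscard", ... */
static bool parse_bool_opt(const char *name)
{
	bool val = true;
	struct opt *o = opt_lookup(name);

	/* Not found: retry without a leading "no", negating the value. */
	if (!o && !strncmp(name, "no", 2)) {
		o = opt_lookup(name + 2);
		val = false;
	}
	if (!o)
		return false;	/* unknown options are ignored */
	o->val = val;
	return true;
}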
- */ - if (*options == ',') - options++; - - copied_opts = kstrdup(options, GFP_KERNEL); - if (!copied_opts) - return -ENOMEM; - copied_opts_start = copied_opts; - - while ((opt = strsep(&copied_opts, ",")) != NULL) { - if (!*opt) - continue; - - name = strsep(&opt, "="); - val = opt; - - ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret == -BCH_ERR_option_name && ignore_unknown) - ret = 0; - if (ret) { - pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); - break; - } - } - - kfree(copied_opts_start); - return ret; -} - -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (dev_idx < 0) { - v = opt->get_sb(sb); - } else { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return 0; - - struct bch_member m = bch2_sb_member_get(sb, dev_idx); - v = opt->get_member(&m); - } - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - --v; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = 1ULL << v; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v <<= 9; - - return v; -} - -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) -{ - for (unsigned id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); - } - - return 0; -} - -bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, - const struct bch_option *opt, u64 v) -{ - bool changed = false; - - if (opt->flags & OPT_SB_FIELD_SECTORS) - v >>= 9; - - if (opt->flags & OPT_SB_FIELD_ILOG2) - v = ilog2(v); - - if (opt->flags & OPT_SB_FIELD_ONE_BIAS) - v++; - - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { - changed = v != opt->get_sb(sb); - - opt->set_sb(sb, v); - } - - if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { - if (WARN(!bch2_member_exists(sb, dev_idx), - "tried to set device option %s on nonexistent device %i", - opt->attr.name, dev_idx)) - return false; - - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - changed = v != opt->get_member(m); - opt->set_member(m, v); - } - - return changed; -} - -bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, - const struct bch_option *opt, u64 v) -{ - mutex_lock(&c->sb_lock); - bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); - if (changed) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - return changed; -} - -/* io opts: */ - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -{ - struct bch_io_opts opts = { -#define x(_name, _bits) ._name = src._name, - BCH_INODE_OPTS() -#undef x - }; - - bch2_io_opts_fixups(&opts); - return opts; -} - -bool bch2_opt_is_inode_opt(enum bch_opt_id id) -{ - static const enum bch_opt_id inode_opt_list[] = { -#define x(_name, _bits) Opt_##_name, - BCH_INODE_OPTS() -#undef x - }; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) - if (inode_opt_list[i] == id) - return true; - - return false; -} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h deleted file mode 100644 index 63f8e254495cbd..00000000000000 --- a/fs/bcachefs/opts.h +++ /dev/null @@ -1,693 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_OPTS_H -#define _BCACHEFS_OPTS_H - -#include -#include -#include -#include -#include "bcachefs_format.h" - -struct bch_fs; - -extern const char * const bch2_error_actions[]; -extern const char * const bch2_degraded_actions[]; -extern const char * const bch2_fsck_fix_opts[]; -extern const char * const bch2_version_upgrade_opts[]; -extern const char * const bch2_sb_features[]; -extern const char * const bch2_sb_compat[]; -extern const char * const __bch2_btree_ids[]; -extern const char * const __bch2_csum_types[]; -extern const char * const __bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; -extern const char * const bch2_compression_opts[]; -extern const char * const __bch2_str_hash_types[]; -extern const char * const bch2_str_hash_opts[]; -extern const char * const __bch2_data_types[]; -extern const char * const bch2_member_states[]; -extern const char * const bch2_d_types[]; - -void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); -void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); -void bch2_prt_data_type(struct printbuf *, enum bch_data_type); -void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); -void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); -void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); -void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); - -static inline const char *bch2_d_type_str(unsigned d_type) -{ - return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -} - -/* - * Mount options; we also store defaults in the superblock. - * - * Also exposed via sysfs: if an option is writeable, and it's also stored in - * the superblock, changing it via sysfs (currently? might change this) also - * updates the superblock. - * - * We store options as signed integers, where -1 means undefined. This means we - * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only - * apply the options from that struct that are defined. 
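bch2_opt_from_sb() and __bch2_opt_set_sb() above apply up to three reversible encodings so that wide values fit narrow superblock fields: OPT_SB_FIELD_SECTORS stores the value >> 9, OPT_SB_FIELD_ILOG2 stores its log2, and OPT_SB_FIELD_ONE_BIAS stores value + 1 so that 0 can mean unset. A round-trip sketch under those assumptions (hypothetical flag names; the ILOG2 case assumes a nonzero power-of-two value):

#include <stdint.h>

#define SB_FIELD_SECTORS  (1 << 0)	/* field holds value >> 9 */
#define SB_FIELD_ILOG2    (1 << 1)	/* field holds log2(value) */
#define SB_FIELD_ONE_BIAS (1 << 2)	/* field holds value + 1; 0 = unset */

static uint64_t sb_encode(uint64_t v, unsigned flags)
{
	if (flags & SB_FIELD_SECTORS)
		v >>= 9;
	if (flags & SB_FIELD_ILOG2)
		v = 63 - __builtin_clzll(v);	/* ilog2(), v must be nonzero */
	if (flags & SB_FIELD_ONE_BIAS)
		v++;
	return v;
}

static uint64_t sb_decode(uint64_t v, unsigned flags)
{
	/* Inverse transforms, applied in the opposite order: */
	if (flags & SB_FIELD_ONE_BIAS)
		v--;
	if (flags & SB_FIELD_ILOG2)
		v = 1ULL << v;
	if (flags & SB_FIELD_SECTORS)
		v <<= 9;
	return v;
}

/*
 * e.g. a 64 KiB encoded_extent_max stored as ilog2 of its sector count:
 * sb_encode(65536, SB_FIELD_SECTORS|SB_FIELD_ILOG2) == 7,
 * sb_decode(7, SB_FIELD_SECTORS|SB_FIELD_ILOG2) == 65536.
 */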
- */ - -/* When can be set: */ -enum opt_flags { - OPT_FS = BIT(0), /* Filesystem option */ - OPT_DEVICE = BIT(1), /* Device option */ - OPT_INODE = BIT(2), /* Inode option */ - OPT_FORMAT = BIT(3), /* May be specified at format time */ - OPT_MOUNT = BIT(4), /* May be specified at mount time */ - OPT_RUNTIME = BIT(5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = BIT(6), - OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ - OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ - OPT_HIDDEN = BIT(11), -}; - -enum opt_type { - BCH_OPT_BOOL, - BCH_OPT_UINT, - BCH_OPT_STR, - BCH_OPT_BITFIELD, - BCH_OPT_FN, -}; - -struct bch_opt_fn { - int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); - int (*validate)(u64, struct printbuf *); -}; - -/** - * x(name, shortopt, type, in mem type, mode, sb_opt) - * - * @name - name of mount option, sysfs attribute, and struct bch_opts - * member - * - * @mode - when opt may be set - * - * @sb_option - name of corresponding superblock option - * - * @type - one of OPT_BOOL, OPT_UINT, OPT_STR - */ - -/* - * XXX: add fields for - * - default value - * - helptext - */ - -#ifdef __KERNEL__ -#define RATELIMIT_ERRORS_DEFAULT true -#else -#define RATELIMIT_ERRORS_DEFAULT false -#endif - -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCACHEFS_VERBOSE_DEFAULT true -#else -#define BCACHEFS_VERBOSE_DEFAULT false -#endif - -#define BCH_FIX_ERRORS_OPTS() \ - x(exit, 0) \ - x(yes, 1) \ - x(no, 2) \ - x(ask, 3) - -enum fsck_err_opts { -#define x(t, n) FSCK_FIX_##t, - BCH_FIX_ERRORS_OPTS() -#undef x -}; - -#define BCH_OPTS() \ - x(block_size, u16, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 4 << 10, \ - "size", NULL) \ - x(btree_node_size, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ - "size", "Btree node size, default 256k") \ - x(errors, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ - NULL, "Action to take on filesystem error") \ - x(write_error_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 300), \ - BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ - NULL, "Number of consecutive write errors allowed before kicking out a device")\ - x(metadata_replicas, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_WANT, 1, \ - "#", "Number of metadata replicas") \ - x(data_replicas, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_WANT, 1, \ - "#", "Number of data replicas") \ - x(metadata_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(data_replicas_required, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_REQ, 1, \ - "#", NULL) \ - x(encoded_extent_max, u32, \ - OPT_FS|OPT_FORMAT| \ - OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ - OPT_UINT(4096, 2U << 20), \ - BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ - "size", 
"Maximum size of checksummed/compressed extents")\ - x(metadata_checksum, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(data_checksum, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(__bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ - NULL, NULL) \ - x(checksum_err_retry_nr, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, 32), \ - BCH_SB_CSUM_ERR_RETRY_NR, 3, \ - NULL, NULL) \ - x(compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(background_compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_compression), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ - NULL, NULL) \ - x(str_hash, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_opts), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ - NULL, "Hash function for directory entries and xattrs")\ - x(metadata_target, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_METADATA_TARGET, 0, \ - "(target)", "Device or label for metadata writes") \ - x(foreground_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_FOREGROUND_TARGET, 0, \ - "(target)", "Device or label for foreground writes") \ - x(background_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_BACKGROUND_TARGET, 0, \ - "(target)", "Device or label to move data to in the background")\ - x(promote_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_PROMOTE_TARGET, 0, \ - "(target)", "Device or label to promote data to on read") \ - x(erasure_code, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_ERASURE_CODE, false, \ - NULL, "Enable erasure coding (DO NOT USE YET)") \ - x(casefold, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT, \ - OPT_BOOL(), \ - BCH_SB_CASEFOLD, false, \ - NULL, "Dirent lookups are casefolded") \ - x(casefold_disabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Disable casefolding filesystem wide") \ - x(inodes_32bit, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, true, \ - NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers_bits, u8, \ - OPT_FS|OPT_FORMAT, \ - OPT_UINT(0, 8), \ - BCH_SB_SHARD_INUMS_NBITS, 0, \ - NULL, "Shard new inode numbers by CPU id") \ - x(inodes_use_key_cache, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_INODES_USE_KEY_CACHE, true, \ - NULL, "Use the btree key cache for the inodes btree") \ - x(btree_node_mem_ptr_optimization, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(gc_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(5, 21), \ - BCH_SB_GC_RESERVE, 8, \ - "%", "Percentage of disk space to reserve for copygc")\ - x(gc_reserve_bytes, u64, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ - OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, U64_MAX), \ - BCH_SB_GC_RESERVE_BYTES, 0, \ - "%", "Amount of disk space to reserve for copygc\n" \ - "Takes precedence over 
gc_reserve_percent if set")\ - x(root_reserve_percent, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(0, 100), \ - BCH_SB_ROOT_RESERVE, 0, \ - "%", "Percentage of disk space to reserve for superuser")\ - x(wide_macs, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_128_BIT_MACS, false, \ - NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ - x(inline_data, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable inline data extents") \ - x(promote_whole_extents, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ - NULL, "Promote whole extents, instead of just part being read")\ - x(acl, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_POSIX_ACL, true, \ - NULL, "Enable POSIX acls") \ - x(usrquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_USRQUOTA, false, \ - NULL, "Enable user quotas") \ - x(grpquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false, \ - NULL, "Enable group quotas") \ - x(prjquota, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false, \ - NULL, "Enable project quotas") \ - x(degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR(bch2_degraded_actions), \ - BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ - NULL, "Allow mounting in degraded mode") \ - x(no_splitbrain_check, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't kick drives out when splitbrain detected")\ - x(verbose, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ - NULL, "Extra debugging information during mount/recovery")\ - x(journal_flush_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, U32_MAX), \ - BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ - NULL, "Delay in milliseconds before automatic journal commits")\ - x(journal_flush_disabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ - NULL, "Disable journal flush on sync/fsync\n" \ - "If enabled, writes can be lost, but only since the\n"\ - "last journal write (default 1 second)") \ - x(journal_reclaim_delay, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U32_MAX), \ - BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ - NULL, "Delay in milliseconds before automatic journal reclaim")\ - x(move_bytes_in_flight, u32, \ - OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1024, U32_MAX), \ - BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Maximum Amount of IO to keep in flight by the move path")\ - x(move_ios_in_flight, u32, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, 1024), \ - BCH2_NO_SB_OPT, 32, \ - NULL, "Maximum number of IOs to keep in flight by the move path")\ - x(fsck, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Run fsck on mount") \ - x(fsck_memory_usage_percent, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(20, 70), \ - BCH2_NO_SB_OPT, 50, \ - NULL, "Maximum percentage of system ram fsck is allowed to pin")\ - x(fix_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_FN(bch2_opt_fix_errors), \ - BCH2_NO_SB_OPT, FSCK_FIX_exit, \ - NULL, "Fix errors during fsck without asking") \ - x(ratelimit_errors, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ - NULL, "Ratelimit error messages during fsck") \ - x(nochanges, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Super read only mode - no writes at all will be 
issued,\n"\ - "even if we have to replay the journal") \ - x(norecovery, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Exit recovery immediately prior to journal replay")\ - x(journal_rewind, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(0, U64_MAX), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Rewind journal") \ - x(recovery_passes, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to run explicitly") \ - x(recovery_passes_exclude, u64, \ - OPT_FS|OPT_MOUNT, \ - OPT_BITFIELD(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Recovery passes to exclude") \ - x(recovery_pass_last, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_STR_NOLIMIT(bch2_recovery_passes), \ - BCH2_NO_SB_OPT, 0, \ - NULL, "Exit recovery after specified pass") \ - x(retain_recovery_info, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ - x(read_entire_journal, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Read all journal entries, not just dirty ones")\ - x(read_journal_only, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Only read the journal, skip the rest of recovery")\ - x(journal_transaction_names, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ - NULL, "Log transaction function names in journal") \ - x(allocator_stuck_timeout, u16, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(0, U16_MAX), \ - BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ - NULL, "Default timeout in seconds for stuck allocator messages")\ - x(noexcl, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don't open device in exclusive mode") \ - x(direct_io, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Use O_DIRECT (userspace only)") \ - x(sb, u64, \ - OPT_MOUNT, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ - "offset", "Sector offset of superblock") \ - x(read_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nostart, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Don\'t start filesystem, only open devices") \ - x(reconstruct_alloc, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Reconstruct alloc btree") \ - x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_version_upgrade_opts), \ - BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ - NULL, "Set superblock to latest version,\n" \ - "allowing any new features to be used") \ - x(stdio, u64, \ - 0, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Pointer to a struct stdio_redirect") \ - x(project, u8, \ - OPT_INODE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, NULL) \ - x(nocow, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_BOOL(), \ - BCH_SB_NOCOW, false, \ - NULL, "Nocow mode: Writes will be done in place when possible.\n"\ - "Snapshots and reflink will still caused writes to be COW\n"\ - "Implicitly disables data checksumming, compression and encryption")\ - x(nocow_enabled, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable nocow mode: enables runtime locking in\n"\ - "data move path needed if nocow will ever be in use\n")\ - x(copygc_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable 
copygc: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_enabled, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable rebalance: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(rebalance_on_ac_only, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_REBALANCE_AC_ONLY, false, \ - NULL, "Enable rebalance while on mains power only\n") \ - x(auto_snapshot_deletion, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ - "quiet the system when doing performance testing\n")\ - x(no_data_io, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Skip submit_bio() for data reads and writes, " \ - "for performance testing purposes") \ - x(state, u64, \ - OPT_DEVICE|OPT_RUNTIME, \ - OPT_STR(bch2_member_states), \ - BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ - "state", "rw,ro,failed,spare") \ - x(bucket_size, u32, \ - OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ - OPT_UINT(0, S64_MAX), \ - BCH_MEMBER_BUCKET_SIZE, 0, \ - "size", "Specifies the bucket size; must be greater than the btree node size")\ - x(durability, u8, \ - OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH_MEMBER_DURABILITY, 1, \ - "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(data_allowed, u8, \ - OPT_DEVICE, \ - OPT_BITFIELD(__bch2_data_types), \ - BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ - "types", "Allowed data types for this device: journal, btree, and/or user")\ - x(discard, u8, \ - OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_MEMBER_DISCARD, true, \ - NULL, "Enable discard/TRIM support") \ - x(btree_node_prefetch, u8, \ - OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ - " prefetched sequentially") - -struct bch_opts { -#define x(_name, _bits, ...) unsigned _name##_defined:1; - BCH_OPTS() -#undef x - -#define x(_name, _bits, ...) _bits _name; - BCH_OPTS() -#undef x -}; - -struct bch2_opts_parse { - struct bch_opts opts; - - /* to save opts that can't be parsed before the FS is opened: */ - struct printbuf parse_later; -}; - -static const __maybe_unused struct bch_opts bch2_opts_default = { -#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ - ._name##_defined = true, \ - ._name = _default, \ - - BCH_OPTS() -#undef x -}; - -#define opt_defined(_opts, _name) ((_opts)._name##_defined) - -#define opt_get(_opts, _name) \ - (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) - -#define opt_set(_opts, _name, _v) \ -do { \ - (_opts)._name##_defined = true; \ - (_opts)._name = _v; \ -} while (0) - -static inline struct bch_opts bch2_opts_empty(void) -{ - return (struct bch_opts) { 0 }; -} - -void bch2_opts_apply(struct bch_opts *, struct bch_opts); - -enum bch_opt_id { -#define x(_name, ...) 
Opt_##_name, - BCH_OPTS() -#undef x - bch2_opts_nr -}; - -struct bch_fs; -struct printbuf; - -struct bch_option { - struct attribute attr; - enum opt_type type; - enum opt_flags flags; - u64 min, max; - - const char * const *choices; - - struct bch_opt_fn fn; - - const char *hint; - const char *help; - - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); - - u64 (*get_member)(const struct bch_member *); - void (*set_member)(struct bch_member *, u64); - -}; - -extern const struct bch_option bch2_opt_table[]; - -bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); - -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); -int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); - -struct bch_dev; -bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); - -int bch2_opt_lookup(const char *); -int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); -int bch2_opt_parse(struct bch_fs *, const struct bch_option *, - const char *, u64 *, struct printbuf *); - -#define OPT_SHOW_FULL_LIST (1 << 0) -#define OPT_SHOW_MOUNT_STYLE (1 << 1) - -void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, - const struct bch_option *, u64, unsigned); -void bch2_opts_to_text(struct printbuf *, - struct bch_opts, - struct bch_fs *, struct bch_sb *, - unsigned, unsigned, unsigned); - -int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); -int bch2_opts_hooks_pre_set(struct bch_fs *); -void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, - struct bch_opts *, enum bch_opt_id); - -int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, - struct printbuf *, const char *, const char *); -int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *, bool); - -/* inode opts: */ - -struct bch_io_opts { -#define x(_name, _bits) u##_bits _name; - BCH_INODE_OPTS() -#undef x -#define x(_name, _bits) u64 _name##_from_inode:1; - BCH_INODE_OPTS() -#undef x -}; - -static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) -{ - if (!opts->background_target) - opts->background_target = opts->foreground_target; - if (!opts->background_compression) - opts->background_compression = opts->compression; - if (opts->nocow) { - opts->compression = opts->background_compression = 0; - opts->data_checksum = 0; - opts->erasure_code = 0; - } -} - -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -bool bch2_opt_is_inode_opt(enum bch_opt_id); - -#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c deleted file mode 100644 index 3302bbc78a09ba..00000000000000 --- a/fs/bcachefs/printbuf.c +++ /dev/null @@ -1,528 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1+ -/* Copyright (C) 2022 Kent Overstreet */ - -#include -#include -#include -#include -#include -#include - -#include "printbuf.h" - -static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) -{ - return pos - buf->last_newline; -} - -static inline unsigned printbuf_linelen(struct printbuf *buf) -{ - return __printbuf_linelen(buf, buf->pos); -} - -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < 
buf->nr_tabstops - ? buf->_tabstops[buf->cur_tabstop] - : 0; -} - -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) -{ - /* Reserved space for terminating nul: */ - extra += 1; - - if (out->pos + extra <= out->size) - return 0; - - if (!out->heap_allocated) { - out->overflow = true; - return 0; - } - - unsigned new_size = roundup_pow_of_two(out->size + extra); - - /* Sanity check... */ - if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - /* - * Note: output buffer must be freeable with kfree(), it's not required - * that the user use printbuf_exit(). - */ - char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); - - if (!buf) { - out->allocation_failure = true; - out->overflow = true; - return -ENOMEM; - } - - out->buf = buf; - out->size = new_size; - return 0; -} - -static void printbuf_advance_pos(struct printbuf *out, unsigned len) -{ - out->pos += min(len, printbuf_remaining(out)); -} - -static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) -{ - unsigned move = out->pos - pos; - - bch2_printbuf_make_room(out, nr); - - if (pos + nr < out->size) - memmove(out->buf + pos + nr, - out->buf + pos, - min(move, out->size - 1 - pos - nr)); - - if (pos < out->size) - memset(out->buf + pos, ' ', min(nr, out->size - pos)); - - printbuf_advance_pos(out, nr); - printbuf_nul_terminate_reserved(out); -} - -static void __printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - while (true) { - int pad; - unsigned len = out->pos - pos; - char *p = out->buf + pos; - char *n = memscan(p, '\n', len); - if (cur_tabstop(out)) { - n = min(n, (char *) memscan(p, '\r', len)); - n = min(n, (char *) memscan(p, '\t', len)); - } - - pos = n - out->buf; - if (pos == out->pos) - break; - - switch (*n) { - case '\n': - pos++; - out->last_newline = pos; - - printbuf_insert_spaces(out, pos, out->indent); - - pos = min(pos + out->indent, out->pos); - out->last_field = pos; - out->cur_tabstop = 0; - break; - case '\r': - memmove(n, n + 1, out->pos - pos); - --out->pos; - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); - if (pad > 0) { - printbuf_insert_spaces(out, out->last_field, pad); - pos += pad; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - case '\t': - pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; - if (pad > 0) { - *n = ' '; - printbuf_insert_spaces(out, pos, pad - 1); - pos += pad; - } else { - memmove(n, n + 1, out->pos - pos); - --out->pos; - } - - out->last_field = pos; - out->cur_tabstop++; - break; - } - } -} - -static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) -{ - if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) - __printbuf_do_indent(out, pos); -} - -void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -{ - int len; - - do { - va_list args2; - - va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); - va_end(args2); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) 
-{ - va_list args; - int len; - - do { - va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); - va_end(args); - } while (len > printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len)); - - unsigned indent_pos = out->pos; - printbuf_advance_pos(out, len); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be - * null terminated - * @buf: printbuf to terminate - * Returns: Printbuf contents, as a nul terminated C string - */ -const char *bch2_printbuf_str(const struct printbuf *buf) -{ - /* - * If we've written to a printbuf then it's guaranteed to be a null - * terminated string - but if we haven't, then we might not have - * allocated a buffer at all: - */ - return buf->pos - ? buf->buf - : ""; -} - -/** - * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it - * against accidental use. - * @buf: printbuf to exit - */ -void bch2_printbuf_exit(struct printbuf *buf) -{ - if (buf->heap_allocated) { - kfree(buf->buf); - buf->buf = ERR_PTR(-EINTR); /* poison value */ - } -} - -void bch2_printbuf_tabstops_reset(struct printbuf *buf) -{ - buf->nr_tabstops = 0; -} - -void bch2_printbuf_tabstop_pop(struct printbuf *buf) -{ - if (buf->nr_tabstops) - --buf->nr_tabstops; -} - -/* - * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop - * - * @buf: printbuf to control - * @spaces: number of spaces from previous tabpstop - * - * In the future this function may allocate memory if setting more than - * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start - * of line. - */ -int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) -{ - unsigned prev_tabstop = buf->nr_tabstops - ? buf->_tabstops[buf->nr_tabstops - 1] - : 0; - - if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) - return -EINVAL; - - buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; - buf->has_indent_or_tabstops = true; - return 0; -} - -/** - * bch2_printbuf_indent_add() - add to the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces more spaces. - */ -void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - prt_chars(buf, ' ', spaces); - - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_add_nextline() - add to the current indent level for - * subsequent lines - * - * @buf: printbuf to control - * @spaces: number of spaces to add to the current indent level - * - * Subsequent lines - not the current line - will be indented by @spaces more - * spaces. - */ -void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) - spaces = 0; - - buf->indent += spaces; - buf->has_indent_or_tabstops = true; -} - -/** - * bch2_printbuf_indent_sub() - subtract from the current indent level - * - * @buf: printbuf to control - * @spaces: number of spaces to subtract from the current indent level - * - * Subsequent lines, and the current line if the output position is at the start - * of the current line, will be indented by @spaces less spaces. 
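For illustration, a minimal sketch (not part of this patch) of how the indent helpers above compose in a pretty-printer; the printed fields are made up, and the short prt_* names are the wrappers used by the other bcachefs code in this patch::

	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "device:");
	prt_newline(&buf);

	bch2_printbuf_indent_add(&buf, 2);	/* this and following lines indented */
	prt_printf(&buf, "state rw");
	prt_newline(&buf);
	prt_printf(&buf, "durability 1");
	bch2_printbuf_indent_sub(&buf, 2);	/* restore indent for the caller */

	pr_info("%s\n", buf.buf);
	printbuf_exit(&buf);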
- */ -void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) -{ - if (WARN_ON_ONCE(spaces > buf->indent)) - spaces = buf->indent; - - if (buf->last_newline + buf->indent == buf->pos) { - buf->pos -= spaces; - printbuf_nul_terminate(buf); - } - buf->indent -= spaces; - - if (!buf->indent && !buf->nr_tabstops) - buf->has_indent_or_tabstops = false; -} - -void bch2_prt_newline(struct printbuf *buf) -{ - bch2_printbuf_make_room(buf, 1 + buf->indent); - - __prt_char_reserved(buf, '\n'); - - buf->last_newline = buf->pos; - - __prt_chars_reserved(buf, ' ', buf->indent); - - printbuf_nul_terminate_reserved(buf); - - buf->last_field = buf->pos; - buf->cur_tabstop = 0; -} - -void bch2_printbuf_strip_trailing_newline(struct printbuf *out) -{ - for (int p = out->pos - 1; p >= 0; --p) { - if (out->buf[p] == '\n') { - out->pos = p; - break; - } - if (out->buf[p] != ' ') - break; - } - - printbuf_nul_terminate_reserved(out); -} - -static void __prt_tab(struct printbuf *out) -{ - int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); - - prt_chars(out, ' ', spaces); - - out->last_field = out->pos; - out->cur_tabstop++; -} - -/** - * bch2_prt_tab() - Advance printbuf to the next tabstop - * @out: printbuf to control - * - * Advance output to the next tabstop by printing spaces. - */ -void bch2_prt_tab(struct printbuf *out) -{ - if (WARN_ON(!cur_tabstop(out))) - return; - - __prt_tab(out); -} - -static void __prt_tab_rjust(struct printbuf *buf) -{ - int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - if (pad > 0) - printbuf_insert_spaces(buf, buf->last_field, pad); - - buf->last_field = buf->pos; - buf->cur_tabstop++; -} - -/** - * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying - * previous output - * - * @buf: printbuf to control - * - * Advance output to the next tabstop by inserting spaces immediately after the - * previous tabstop, right justifying previously outputted text. 
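A sketch of the tabstop machinery just described, using the \t/\r control characters that prt_printf() routes through the same tab/rjust handling (illustrative only; the labels and widths are made up)::

	struct printbuf buf = PRINTBUF;

	printbuf_tabstop_push(&buf, 16);	/* label column			*/
	printbuf_tabstop_push(&buf, 8);		/* 8-wide value column		*/

	prt_printf(&buf, "capacity\t%u\r\n", 100);	/* \t: advance, \r: right justify */
	prt_printf(&buf, "free\t%u\r\n", 42);

	pr_info("%s", buf.buf);
	printbuf_exit(&buf);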
- */ -void bch2_prt_tab_rjust(struct printbuf *buf) -{ - if (WARN_ON(!cur_tabstop(buf))) - return; - - __prt_tab_rjust(buf); -} - -/** - * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters - * - * @out: output printbuf - * @str: string to print - * @count: number of bytes to print - * - * The following contol characters are handled as so: - * \n: prt_newline newline that obeys current indent level - * \t: prt_tab advance to next tabstop - * \r: prt_tab_rjust advance to next tabstop, with right justification - */ -void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) -{ - unsigned indent_pos = out->pos; - prt_bytes(out, str, count); - printbuf_do_indent(out, indent_pos); -} - -/** - * bch2_prt_human_readable_u64() - Print out a u64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) -{ - bch2_printbuf_make_room(out, 10); - unsigned len = string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); - printbuf_advance_pos(out, len); -} - -/** - * bch2_prt_human_readable_s64() - Print out a s64 in human readable units - * @out: output printbuf - * @v: integer to print - * - * Units of 2^10 (default) or 10^3 are controlled via @out->si_units - */ -void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_human_readable_u64(out, abs(v)); -} - -/** - * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human reabable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_u64(struct printbuf *out, u64 v) -{ - if (out->human_readable_units) - bch2_prt_human_readable_u64(out, v); - else - bch2_prt_printf(out, "%llu", v); -} - -/** - * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options - * @out: output printbuf - * @v: integer to print - * - * Units are either raw (default), or human reabable units (controlled via - * @buf->human_readable_units) - */ -void bch2_prt_units_s64(struct printbuf *out, s64 v) -{ - if (v < 0) - prt_char(out, '-'); - bch2_prt_units_u64(out, abs(v)); -} - -void bch2_prt_string_option(struct printbuf *out, - const char * const list[], - size_t selected) -{ - for (size_t i = 0; list[i]; i++) - bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); -} - -void bch2_prt_bitflags(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs64(flags)) < nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[bit]); - flags ^= BIT_ULL(bit); - } -} - -void bch2_prt_bitflags_vector(struct printbuf *out, - const char * const list[], - unsigned long *v, unsigned nr) -{ - bool first = true; - unsigned i; - - for (i = 0; i < nr; i++) - if (!list[i]) { - nr = i - 1; - break; - } - - for_each_set_bit(i, v, nr) { - if (!first) - bch2_prt_printf(out, ","); - first = false; - bch2_prt_printf(out, "%s", list[i]); - } -} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h deleted file mode 100644 index 8f4e28d440ac3f..00000000000000 --- a/fs/bcachefs/printbuf.h +++ /dev/null @@ -1,298 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ -/* Copyright (C) 2022 Kent Overstreet */ - -#ifndef _BCACHEFS_PRINTBUF_H -#define _BCACHEFS_PRINTBUF_H - -/* - * Printbufs: Simple strings for printing to, with optional heap allocation - * - * This code has provisions for use in userspace, to aid in making other code - * portable between kernelspace and userspace. - * - * Basic example: - * struct printbuf buf = PRINTBUF; - * - * prt_printf(&buf, "foo="); - * foo_to_text(&buf, foo); - * printk("%s", buf.buf); - * printbuf_exit(&buf); - * - * Or - * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) - * - * We can now write pretty printers instead of writing code that dumps - * everything to the kernel log buffer, and then those pretty-printers can be - * used by other code that outputs to kernel log, sysfs, debugfs, etc. - * - * Memory allocation: Outputing to a printbuf may allocate memory. This - * allocation is done with GFP_KERNEL, by default: use the newer - * memalloc_*_(save|restore) functions as needed. - * - * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations - * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. - * - * It's allowed to grab the output buffer and free it later with kfree() instead - * of using printbuf_exit(), if the user just needs a heap allocated string at - * the end. - * - * Memory allocation failures: We don't return errors directly, because on - * memory allocation failure we usually don't want to bail out and unwind - we - * want to print what we've got, on a best-effort basis. But code that does want - * to return -ENOMEM may check printbuf.allocation_failure. - * - * Indenting, tabstops: - * - * To aid is writing multi-line pretty printers spread across multiple - * functions, printbufs track the current indent level. - * - * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent - * level, respectively. - * - * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from - * start of line. Once set, prt_tab() will output spaces up to the next tabstop. - * prt_tab_rjust() will also advance the current line of text up to the next - * tabstop, but it does so by shifting text since the previous tabstop up to the - * next tabstop - right justifying it. - * - * Make sure you use prt_newline() instead of \n in the format string for indent - * level and tabstops to work corretly. - * - * Output units: printbuf->units exists to tell pretty-printers how to output - * numbers: a raw value (e.g. 
directly from a superblock field), as bytes, or as - * human readable bytes. prt_units() obeys it. - */ - -#include -#include - -enum printbuf_si { - PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ - PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ -}; - -#define PRINTBUF_INLINE_TABSTOPS 6 - -struct printbuf { - char *buf; - unsigned size; - unsigned pos; - unsigned last_newline; - unsigned last_field; - unsigned indent; - /* - * If nonzero, allocations will be done with GFP_ATOMIC: - */ - u8 atomic; - bool allocation_failure:1; - bool heap_allocated:1; - bool overflow:1; - enum printbuf_si si_units:1; - bool human_readable_units:1; - bool has_indent_or_tabstops:1; - bool suppress_indent_tabstop_handling:1; - u8 nr_tabstops; - - /* - * Do not modify directly: use printbuf_tabstop_add(), - * printbuf_tabstop_get() - */ - u8 cur_tabstop; - u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; -}; - -int bch2_printbuf_make_room(struct printbuf *, unsigned); -__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); -__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); -const char *bch2_printbuf_str(const struct printbuf *); -void bch2_printbuf_exit(struct printbuf *); - -void bch2_printbuf_tabstops_reset(struct printbuf *); -void bch2_printbuf_tabstop_pop(struct printbuf *); -int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); - -void bch2_printbuf_indent_add(struct printbuf *, unsigned); -void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); -void bch2_printbuf_indent_sub(struct printbuf *, unsigned); - -void bch2_prt_newline(struct printbuf *); -void bch2_printbuf_strip_trailing_newline(struct printbuf *); -void bch2_prt_tab(struct printbuf *); -void bch2_prt_tab_rjust(struct printbuf *); - -void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); -void bch2_prt_human_readable_u64(struct printbuf *, u64); -void bch2_prt_human_readable_s64(struct printbuf *, s64); -void bch2_prt_units_u64(struct printbuf *, u64); -void bch2_prt_units_s64(struct printbuf *, s64); -void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); -void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); -void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], - unsigned long *, unsigned); - -/* Initializer for a heap allocated printbuf: */ -#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) - -/* Initializer a printbuf that points to an external buffer: */ -#define PRINTBUF_EXTERN(_buf, _size) \ -((struct printbuf) { \ - .buf = _buf, \ - .size = _size, \ -}) - -static inline struct printbuf bch2_printbuf_init(void) -{ - return PRINTBUF; -} - -DEFINE_CLASS(printbuf, struct printbuf, - bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) - -/* - * Returns size remaining of output buffer: - */ -static inline unsigned printbuf_remaining_size(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - return out->size - out->pos; -} - -/* - * Returns number of characters we can print to the output buffer - i.e. - * excluding the terminating nul: - */ -static inline unsigned printbuf_remaining(struct printbuf *out) -{ - return out->size ? printbuf_remaining_size(out) - 1 : 0; -} - -static inline unsigned printbuf_written(struct printbuf *out) -{ - return out->size ? 
min(out->pos, out->size - 1) : 0; -} - -static inline void printbuf_nul_terminate_reserved(struct printbuf *out) -{ - if (WARN_ON(out->size && out->pos >= out->size)) - out->pos = out->size - 1; - if (out->size) - out->buf[out->pos] = 0; -} - -static inline void printbuf_nul_terminate(struct printbuf *out) -{ - bch2_printbuf_make_room(out, 1); - printbuf_nul_terminate_reserved(out); -} - -/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ -static inline void __prt_char_reserved(struct printbuf *out, char c) -{ - if (printbuf_remaining(out)) - out->buf[out->pos++] = c; -} - -/* Doesn't nul terminate: */ -static inline void __prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 1); - __prt_char_reserved(out, c); -} - -static inline void prt_char(struct printbuf *out, char c) -{ - bch2_printbuf_make_room(out, 2); - __prt_char_reserved(out, c); - printbuf_nul_terminate_reserved(out); -} - -static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) -{ - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = c; -} - -static inline void prt_chars(struct printbuf *out, char c, unsigned n) -{ - bch2_printbuf_make_room(out, n); - __prt_chars_reserved(out, c, n); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) -{ - bch2_printbuf_make_room(out, n); - - unsigned can_print = min(n, printbuf_remaining(out)); - - for (unsigned i = 0; i < can_print; i++) - out->buf[out->pos++] = ((char *) b)[i]; - - printbuf_nul_terminate(out); -} - -static inline void prt_str(struct printbuf *out, const char *str) -{ - prt_bytes(out, str, strlen(str)); -} - -static inline void prt_str_indented(struct printbuf *out, const char *str) -{ - bch2_prt_bytes_indented(out, str, strlen(str)); -} - -static inline void prt_hex_byte(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_hi(byte)); - __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) -{ - bch2_printbuf_make_room(out, 3); - __prt_char_reserved(out, hex_asc_upper_hi(byte)); - __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate_reserved(out); -} - -static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) -{ - buf->pos = 0; - buf->allocation_failure = 0; - buf->last_newline = 0; - buf->last_field = 0; - buf->indent = 0; - buf->cur_tabstop = 0; -} - -/** - * printbuf_reset - re-use a printbuf without freeing and re-initializing it: - */ -static inline void printbuf_reset(struct printbuf *buf) -{ - printbuf_reset_keep_tabstops(buf); - buf->nr_tabstops = 0; -} - -/** - * printbuf_atomic_inc - mark as entering an atomic section - */ -static inline void printbuf_atomic_inc(struct printbuf *buf) -{ - buf->atomic++; -} - -/** - * printbuf_atomic_inc - mark as leaving an atomic section - */ -static inline void printbuf_atomic_dec(struct printbuf *buf) -{ - buf->atomic--; -} - -#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c deleted file mode 100644 index d09898566abea9..00000000000000 --- a/fs/bcachefs/progress.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bbpos.h" -#include "disk_accounting.h" -#include "progress.h" - -void bch2_progress_init(struct progress_indicator_state *s, - 
struct bch_fs *c, - u64 btree_id_mask) -{ - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc; - disk_accounting_key_init(acc, btree, .id = i); - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -void bch2_progress_update_iter(struct btree_trans *trans, - struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h deleted file mode 100644 index 23fb1811f9436f..00000000000000 --- a/fs/bcachefs/progress.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_PROGRESS_H -#define _BCACHEFS_PROGRESS_H - -/* - * Lame progress indicators - * - * We don't like to use these because they print to the dmesg console, which is - * spammy - we much prefer to be wired up to a userspace programm (e.g. via - * thread_with_file) and have it print the progress indicator. - * - * But some code is old and doesn't support that, or runs in a context where - * that's not yet practical (mount). 
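A sketch of how these entry points are meant to be driven (hypothetical pass name and btree; `c` is an assumed struct bch_fs * in scope, and the iteration idiom matches the for_each_btree_key() usage elsewhere in this patch)::

	struct progress_indicator_state progress;

	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			/* logs "example pass: N%, done x/y nodes, at ..." every 10s */
			bch2_progress_update_iter(trans, &progress, &iter,
						  "example pass");
			0;
		})));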
- */ - -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); -void bch2_progress_update_iter(struct btree_trans *, - struct progress_indicator_state *, - struct btree_iter *, - const char *); - -#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c deleted file mode 100644 index f241efb1fb5070..00000000000000 --- a/fs/bcachefs/quota.c +++ /dev/null @@ -1,892 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "btree_update.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "quota.h" -#include "snapshot.h" -#include "super-io.h" - -static const char * const bch2_quota_types[] = { - "user", - "group", - "project", -}; - -static const char * const bch2_quota_counters[] = { - "space", - "inodes", -}; - -static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - - if (vstruct_bytes(&q->field) < sizeof(*q)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&q->field), sizeof(*q)); - return -BCH_ERR_invalid_sb_quota; - } - - return 0; -} - -static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_quota *q = field_to_type(f, quota); - unsigned qtyp, counter; - - for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { - prt_printf(out, "%s: flags %llx", - bch2_quota_types[qtyp], - le64_to_cpu(q->q[qtyp].flags)); - - for (counter = 0; counter < Q_COUNTERS; counter++) - prt_printf(out, " %s timelimit %u warnlimit %u", - bch2_quota_counters[counter], - le32_to_cpu(q->q[qtyp].c[counter].timelimit), - le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); - - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_quota = { - .validate = bch2_sb_quota_validate, - .to_text = bch2_sb_quota_to_text, -}; - -int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, - c, quota_type_invalid, - "invalid quota type (%llu >= %u)", - k.k->p.inode, QTYP_NR); -fsck_err: - return ret; -} - -void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); - unsigned i; - - for (i = 0; i < Q_COUNTERS; i++) - prt_printf(out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -#include -#include -#include - -static void qc_info_to_text(struct printbuf *out, struct qc_info *i) -{ - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 20); - - prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); - prt_printf(out, "i_flags\t%u\n", i->i_flags); - prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); - prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); - prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); - prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); - prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); - prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); -} - -static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) -{ - printbuf_tabstops_reset(out); - 
printbuf_tabstop_push(out, 20); - - prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); - prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); - prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); - prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit); - prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); - prt_printf(out, "d_space\t%llu\n", q->d_space); - prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); - prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); - prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); - prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); - prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); -} - -static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -{ - qtypes >>= i; - return qtypes ? i + __ffs(qtypes) : QTYP_NR; -} - -#define for_each_set_qtype(_c, _i, _q, _qtypes) \ - for (_i = 0; \ - (_i = __next_qtype(_i, _qtypes), \ - _q = &(_c)->quotas[_i], \ - _i < QTYP_NR); \ - _i++) - -static bool ignore_hardlimit(struct bch_memquota_type *q) -{ - if (capable(CAP_SYS_RESOURCE)) - return true; -#if 0 - struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; - - return capable(CAP_SYS_RESOURCE) && - (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || - !(info->dqi_flags & DQF_ROOT_SQUASH)); -#endif - return false; -} - -enum quota_msg { - SOFTWARN, /* Softlimit reached */ - SOFTLONGWARN, /* Grace time expired */ - HARDWARN, /* Hardlimit reached */ - - HARDBELOW, /* Usage got below inode hardlimit */ - SOFTBELOW, /* Usage got below inode softlimit */ -}; - -static int quota_nl[][Q_COUNTERS] = { - [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, - [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, - [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, - [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, - [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, - - [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, - [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, - [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, - [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, - [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -}; - -struct quota_msgs { - u8 nr; - struct { - u8 qtype; - u8 msg; - } m[QTYP_NR * Q_COUNTERS]; -}; - -static void prepare_msg(unsigned qtype, - enum quota_counters counter, - struct quota_msgs *msgs, - enum quota_msg msg_type) -{ - BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); - - msgs->m[msgs->nr].qtype = qtype; - msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; - msgs->nr++; -} - -static void prepare_warning(struct memquota_counter *qc, - unsigned qtype, - enum quota_counters counter, - struct quota_msgs *msgs, - enum quota_msg msg_type) -{ - if (qc->warning_issued & (1 << msg_type)) - return; - - prepare_msg(qtype, counter, msgs, msg_type); -} - -static void flush_warnings(struct bch_qid qid, - struct super_block *sb, - struct quota_msgs *msgs) -{ - unsigned i; - - for (i = 0; i < msgs->nr; i++) - quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), - sb->s_dev, msgs->m[i].msg); -} - -static int bch2_quota_check_limit(struct bch_fs *c, - unsigned qtype, - struct bch_memquota *mq, - struct quota_msgs *msgs, - enum quota_counters counter, - s64 v, - enum quota_acct_mode mode) -{ - struct bch_memquota_type *q = &c->quotas[qtype]; - struct memquota_counter *qc = &mq->c[counter]; - u64 n = qc->v + v; - - BUG_ON((s64) n < 0); - - if (mode == KEY_TYPE_QUOTA_NOCHECK) - return 0; - - if (v <= 0) { - if (n < qc->hardlimit && - (qc->warning_issued & (1 << HARDWARN))) { - qc->warning_issued &= ~(1 << HARDWARN); - prepare_msg(qtype, counter, 
msgs, HARDBELOW); - } - - if (n < qc->softlimit && - (qc->warning_issued & (1 << SOFTWARN))) { - qc->warning_issued &= ~(1 << SOFTWARN); - prepare_msg(qtype, counter, msgs, SOFTBELOW); - } - - qc->warning_issued = 0; - return 0; - } - - if (qc->hardlimit && - qc->hardlimit < n && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, HARDWARN); - return -EDQUOT; - } - - if (qc->softlimit && - qc->softlimit < n) { - if (qc->timer == 0) { - qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; - prepare_warning(qc, qtype, counter, msgs, SOFTWARN); - } else if (ktime_get_real_seconds() >= qc->timer && - !ignore_hardlimit(q)) { - prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); - return -EDQUOT; - } - } - - return 0; -} - -int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - unsigned qtypes = enabled_qtypes(c); - struct bch_memquota_type *q; - struct bch_memquota *mq[QTYP_NR]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); - if (!mq[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) - mq[i]->c[counter].v += v; -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(qid, c->vfs_sb, &msgs); - - return ret; -} - -static void __bch2_quota_transfer(struct bch_memquota *src_q, - struct bch_memquota *dst_q, - enum quota_counters counter, s64 v) -{ - BUG_ON(v > src_q->c[counter].v); - BUG_ON(v + dst_q->c[counter].v < v); - - src_q->c[counter].v -= v; - dst_q->c[counter].v += v; -} - -int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - struct bch_memquota_type *q; - struct bch_memquota *src_q[3], *dst_q[3]; - struct quota_msgs msgs; - unsigned i; - int ret = 0; - - qtypes &= enabled_qtypes(c); - - memset(&msgs, 0, sizeof(msgs)); - - for_each_set_qtype(c, i, q, qtypes) { - src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); - dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); - if (!src_q[i] || !dst_q[i]) - return -ENOMEM; - } - - for_each_set_qtype(c, i, q, qtypes) - mutex_lock_nested(&q->lock, i); - - for_each_set_qtype(c, i, q, qtypes) { - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, - dst_q[i]->c[Q_SPC].v + space, - mode); - if (ret) - goto err; - - ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, - dst_q[i]->c[Q_INO].v + 1, - mode); - if (ret) - goto err; - } - - for_each_set_qtype(c, i, q, qtypes) { - __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); - __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); - } - -err: - for_each_set_qtype(c, i, q, qtypes) - mutex_unlock(&q->lock); - - flush_warnings(dst, c->vfs_sb, &msgs); - - return ret; -} - -static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, - struct qc_dqblk *qdq) -{ - struct bkey_s_c_quota dq; - struct bch_memquota_type *q; - struct bch_memquota *mq; - unsigned i; - - BUG_ON(k.k->p.inode >= QTYP_NR); - - if (!((1U << k.k->p.inode) & enabled_qtypes(c))) - return 0; - - switch (k.k->type) { - case KEY_TYPE_quota: - dq = bkey_s_c_to_quota(k); - q = 
&c->quotas[k.k->p.inode]; - - mutex_lock(&q->lock); - mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); - if (!mq) { - mutex_unlock(&q->lock); - return -ENOMEM; - } - - for (i = 0; i < Q_COUNTERS; i++) { - mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); - mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); - } - - if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) - mq->c[Q_SPC].timer = qdq->d_spc_timer; - if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) - mq->c[Q_SPC].warns = qdq->d_spc_warns; - if (qdq && qdq->d_fieldmask & QC_INO_TIMER) - mq->c[Q_INO].timer = qdq->d_ino_timer; - if (qdq && qdq->d_fieldmask & QC_INO_WARNS) - mq->c[Q_INO].warns = qdq->d_ino_warns; - - mutex_unlock(&q->lock); - } - - return 0; -} - -void bch2_fs_quota_exit(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - genradix_free(&c->quotas[i].table); -} - -void bch2_fs_quota_init(struct bch_fs *c) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->quotas); i++) - mutex_init(&c->quotas[i].lock); -} - -static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) -{ - struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); - - if (sb_quota) - return sb_quota; - - sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); - if (sb_quota) { - unsigned qtype, qc; - - for (qtype = 0; qtype < QTYP_NR; qtype++) - for (qc = 0; qc < Q_COUNTERS; qc++) - sb_quota->q[qtype].c[qc].timelimit = - cpu_to_le32(7 * 24 * 60 * 60); - } - - return sb_quota; -} - -static void bch2_sb_quota_read(struct bch_fs *c) -{ - struct bch_sb_field_quota *sb_quota; - unsigned i, j; - - sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); - if (!sb_quota) - return; - - for (i = 0; i < QTYP_NR; i++) { - struct bch_memquota_type *q = &c->quotas[i]; - - for (j = 0; j < Q_COUNTERS; j++) { - q->limits[j].timelimit = - le32_to_cpu(sb_quota->q[i].c[j].timelimit); - q->limits[j].warnlimit = - le32_to_cpu(sb_quota->q[i].c[j].warnlimit); - } - } -} - -static int bch2_fs_quota_read_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; - struct bch_snapshot_tree s_t; - u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot); - - int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, tree); - if (ret) - return ret; - - if (!s_t.master_subvol) - goto advance; - - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { - le32_to_cpu(s_t.master_subvol), - k.k->p.offset, - }, &u); - /* - * Inode might be deleted in this snapshot - the easiest way to handle - * that is to just skip it here: - */ - if (bch2_err_matches(ret, ENOENT)) - goto advance; - - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); -advance: - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); - return 0; -} - -int bch2_fs_quota_read(struct bch_fs *c) -{ - - mutex_lock(&c->sb_lock); - struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - mutex_unlock(&c->sb_lock); - return bch_err_throw(c, ENOSPC_sb_quota); - } - - bch2_sb_quota_read(c); - mutex_unlock(&c->sb_lock); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_prefetch, k, - 
__bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - bch2_fs_quota_read_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* Enable/disable/delete quotas for an entire filesystem: */ - -static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - /* Accounting must be enabled at mount time: */ - if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) - return -EINVAL; - - /* Can't enable enforcement without accounting: */ - if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) - return -EINVAL; - - if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) - return -EINVAL; - - if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - mutex_lock(&c->sb_lock); - if (uflags & FS_QUOTA_UDQ_ENFD) - SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_GDQ_ENFD) - SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); - - if (uflags & FS_QUOTA_PDQ_ENFD) - SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; -} - -static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -{ - struct bch_fs *c = sb->s_fs_info; - int ret; - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (uflags & FS_USER_QUOTA) { - if (c->opts.usrquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_USR, 0), - POS(QTYP_USR, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_GROUP_QUOTA) { - if (c->opts.grpquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_GRP, 0), - POS(QTYP_GRP, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - if (uflags & FS_PROJ_QUOTA) { - if (c->opts.prjquota) - return -EINVAL; - - ret = bch2_btree_delete_range(c, BTREE_ID_quotas, - POS(QTYP_PRJ, 0), - POS(QTYP_PRJ, U64_MAX), - 0, NULL); - if (ret) - return ret; - } - - return 0; -} - -/* - * Return quota status information, such as enforcements, quota file inode - * numbers etc. 
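For reference, a sketch of how the in-memory accounting entry points are driven from the I/O paths (hypothetical caller; `c` and the unpacked inode `inode_u` are assumed to be in scope, the names and units come from this patch - Q_SPC is counted in sectors)::

	struct bch_qid qid = bch_qid(&inode_u);	/* uid/gid/project of the inode */

	/* charge 8 sectors and one inode, with limits enforced: */
	int ret = bch2_quota_acct(c, qid, Q_SPC, 8, KEY_TYPE_QUOTA_PREALLOC) ?:
		  bch2_quota_acct(c, qid, Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC);
	if (ret)	/* -EDQUOT when over a hard (or expired soft) limit */
		return ret;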
- */ -static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -{ - struct bch_fs *c = sb->s_fs_info; - unsigned qtypes = enabled_qtypes(c); - unsigned i; - - memset(state, 0, sizeof(*state)); - - for (i = 0; i < QTYP_NR; i++) { - state->s_state[i].flags |= QCI_SYSFILE; - - if (!(qtypes & (1 << i))) - continue; - - state->s_state[i].flags |= QCI_ACCT_ENABLED; - - state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; - state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; - - state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; - state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; - } - - return 0; -} - -/* - * Adjust quota timers & warnings - */ -static int bch2_quota_set_info(struct super_block *sb, int type, - struct qc_info *info) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_sb_field_quota *sb_quota; - int ret = 0; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_info_to_text(&buf, info); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - if (type >= QTYP_NR) - return -EINVAL; - - if (!((1 << type) & enabled_qtypes(c))) - return -ESRCH; - - if (info->i_fieldmask & - ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) - return -EINVAL; - - mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - ret = bch_err_throw(c, ENOSPC_sb_quota); - goto unlock; - } - - if (info->i_fieldmask & QC_SPC_TIMER) - sb_quota->q[type].c[Q_SPC].timelimit = - cpu_to_le32(info->i_spc_timelimit); - - if (info->i_fieldmask & QC_SPC_WARNS) - sb_quota->q[type].c[Q_SPC].warnlimit = - cpu_to_le32(info->i_spc_warnlimit); - - if (info->i_fieldmask & QC_INO_TIMER) - sb_quota->q[type].c[Q_INO].timelimit = - cpu_to_le32(info->i_ino_timelimit); - - if (info->i_fieldmask & QC_INO_WARNS) - sb_quota->q[type].c[Q_INO].warnlimit = - cpu_to_le32(info->i_ino_warnlimit); - - bch2_sb_quota_read(c); - - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); -} - -/* Get/set individual quotas: */ - -static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -{ - dst->d_space = src->c[Q_SPC].v << 9; - dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; - dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; - dst->d_spc_timer = src->c[Q_SPC].timer; - dst->d_spc_warns = src->c[Q_SPC].warns; - - dst->d_ino_count = src->c[Q_INO].v; - dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; - dst->d_ino_softlimit = src->c[Q_INO].softlimit; - dst->d_ino_timer = src->c[Q_INO].timer; - dst->d_ino_warns = src->c[Q_INO].warns; -} - -static int bch2_get_quota(struct super_block *sb, struct kqid kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid.type]; - qid_t qid = from_kqid(&init_user_ns, kqid); - struct bch_memquota *mq; - - memset(qdq, 0, sizeof(*qdq)); - - mutex_lock(&q->lock); - mq = genradix_ptr(&q->table, qid); - if (mq) - __bch2_quota_get(qdq, mq); - mutex_unlock(&q->lock); - - return 0; -} - -static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bch_memquota_type *q = &c->quotas[kqid->type]; - qid_t qid = from_kqid(&init_user_ns, *kqid); - struct genradix_iter iter; - struct bch_memquota *mq; - int ret = 0; - - mutex_lock(&q->lock); - - genradix_for_each_from(&q->table, iter, mq, qid) - if (memcmp(mq, 
page_address(ZERO_PAGE(0)), sizeof(*mq))) { - __bch2_quota_get(qdq, mq); - *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); - goto found; - } - - ret = -ENOENT; -found: - mutex_unlock(&q->lock); - return bch2_err_class(ret); -} - -static int bch2_set_quota_trans(struct btree_trans *trans, - struct bkey_i_quota *new_quota, - struct qc_dqblk *qdq) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_slots|BTREE_ITER_intent); - ret = bkey_err(k); - if (unlikely(ret)) - return ret; - - if (k.k->type == KEY_TYPE_quota) - new_quota->v = *bkey_s_c_to_quota(k).v; - - if (qdq->d_fieldmask & QC_SPC_SOFT) - new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); - if (qdq->d_fieldmask & QC_SPC_HARD) - new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); - - if (qdq->d_fieldmask & QC_INO_SOFT) - new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); - if (qdq->d_fieldmask & QC_INO_HARD) - new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - - ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_set_quota(struct super_block *sb, struct kqid qid, - struct qc_dqblk *qdq) -{ - struct bch_fs *c = sb->s_fs_info; - struct bkey_i_quota new_quota; - int ret; - - if (0) { - struct printbuf buf = PRINTBUF; - - qc_dqblk_to_text(&buf, qdq); - pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); - } - - if (sb->s_flags & SB_RDONLY) - return -EROFS; - - bkey_quota_init(&new_quota.k_i); - new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_set_quota_trans(trans, &new_quota, qdq)) ?: - __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); - - return bch2_err_class(ret); -} - -const struct quotactl_ops bch2_quotactl_operations = { - .quota_enable = bch2_quota_enable, - .quota_disable = bch2_quota_disable, - .rm_xquota = bch2_quota_remove, - - .get_state = bch2_quota_get_state, - .set_info = bch2_quota_set_info, - - .get_dqblk = bch2_get_quota, - .get_nextdqblk = bch2_get_next_quota, - .set_dqblk = bch2_set_quota, -}; - -#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h deleted file mode 100644 index 1551800ff44c91..00000000000000 --- a/fs/bcachefs/quota.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_H -#define _BCACHEFS_QUOTA_H - -#include "inode.h" -#include "quota_types.h" - -extern const struct bch_sb_field_ops bch_sb_field_ops_quota; - -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_validate = bch2_quota_validate, \ - .val_to_text = bch2_quota_to_text, \ - .min_val_size = 32, \ -}) - -static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -{ - return (struct bch_qid) { - .q[QTYP_USR] = u->bi_uid, - .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, - }; -} - -static inline unsigned enabled_qtypes(struct bch_fs *c) -{ - return ((c->opts.usrquota << QTYP_USR)| - (c->opts.grpquota << QTYP_GRP)| - (c->opts.prjquota << QTYP_PRJ)); -} - -#ifdef CONFIG_BCACHEFS_QUOTA - -int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, - s64, enum quota_acct_mode); - -int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64, enum quota_acct_mode); - -void bch2_fs_quota_exit(struct bch_fs *); -void bch2_fs_quota_init(struct bch_fs *); -int bch2_fs_quota_read(struct bch_fs *); - -extern const struct quotactl_ops bch2_quotactl_operations; - -#else - -static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, - enum quota_counters counter, s64 v, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, - struct bch_qid dst, - struct bch_qid src, u64 space, - enum quota_acct_mode mode) -{ - return 0; -} - -static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -static inline void bch2_fs_quota_init(struct bch_fs *c) {} -static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } - -#endif - -#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h deleted file mode 100644 index dc34347ef6c74a..00000000000000 --- a/fs/bcachefs/quota_format.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_FORMAT_H -#define _BCACHEFS_QUOTA_FORMAT_H - -/* KEY_TYPE_quota: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h deleted file mode 100644 index 6a136083d3899d..00000000000000 --- a/fs/bcachefs/quota_types.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_QUOTA_TYPES_H -#define _BCACHEFS_QUOTA_TYPES_H - -#include - -struct bch_qid { - u32 q[QTYP_NR]; -}; - -enum quota_acct_mode { - KEY_TYPE_QUOTA_PREALLOC, - KEY_TYPE_QUOTA_WARN, - KEY_TYPE_QUOTA_NOCHECK, -}; - -struct memquota_counter { - u64 v; - u64 hardlimit; - u64 softlimit; - s64 timer; - int warns; - int warning_issued; -}; - -struct bch_memquota { - struct memquota_counter c[Q_COUNTERS]; -}; - -typedef GENRADIX(struct bch_memquota) bch_memquota_table; - -struct quota_limit { - u32 timelimit; - u32 warnlimit; -}; - -struct bch_memquota_type { - struct quota_limit limits[Q_COUNTERS]; - bch_memquota_table table; - struct mutex lock; -}; - -#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c deleted file mode 100644 index b1438be9d69088..00000000000000 --- a/fs/bcachefs/rcu_pending.c +++ /dev/null @@ -1,666 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) "%s() " fmt "\n", __func__ - -#include -#include -#include -#include -#include 
-#include - -#include "rcu_pending.h" -#include "darray.h" -#include "util.h" - -#define static_array_for_each(_a, _i) \ - for (typeof(&(_a)[0]) _i = _a; \ - _i < (_a) + ARRAY_SIZE(_a); \ - _i++) - -enum rcu_pending_special { - RCU_PENDING_KVFREE = 1, - RCU_PENDING_CALL_RCU = 2, -}; - -#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) -#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) - -#ifdef __KERNEL__ -typedef unsigned long rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l == r; -} -#else -typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; - -static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) -{ - return l.grace_period_id == r.grace_period_id; -} -#endif - -static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? get_state_synchronize_srcu(ssp) - : get_state_synchronize_rcu(); -} - -static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) -{ - return ssp - ? start_poll_synchronize_srcu(ssp) - : start_poll_synchronize_rcu(); -} - -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) -{ - return ssp - ? poll_state_synchronize_srcu(ssp, cookie) - : poll_state_synchronize_rcu(cookie); -} - -static inline void __rcu_barrier(struct srcu_struct *ssp) -{ - return ssp - ? srcu_barrier(ssp) - : rcu_barrier(); -} - -static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func) -{ - if (ssp) - call_srcu(ssp, rhp, func); - else - call_rcu(rhp, func); -} - -struct rcu_pending_seq { - /* - * We're using a radix tree like a vector - we're just pushing elements - * onto the end; we're using a radix tree instead of an actual vector to - * avoid reallocation overhead - */ - GENRADIX(struct rcu_head *) objs; - size_t nr; - struct rcu_head **cursor; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_list { - struct rcu_head *head; - struct rcu_head *tail; - rcu_gp_poll_state_t seq; -}; - -struct rcu_pending_pcpu { - struct rcu_pending *parent; - spinlock_t lock; - int cpu; - - /* - * We can't bound the number of unprocessed gp sequence numbers, and we - * can't efficiently merge radix trees for expired grace periods, so we - * need darray/vector: - */ - DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; - - /* Third entry is for expired objects: */ - struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; - - struct rcu_head cb; - bool cb_armed; - struct work_struct work; -}; - -static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) -{ - if (p->objs.nr) - return true; - - static_array_for_each(p->lists, i) - if (i->head) - return true; - - return false; -} - -static void rcu_pending_list_merge(struct rcu_pending_list *l1, - struct rcu_pending_list *l2) -{ -#ifdef __KERNEL__ - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next = l2->head; -#else - if (!l1->head) - l1->head = l2->head; - else - l1->tail->next.next = (void *) l2->head; -#endif - - l1->tail = l2->tail; - l2->head = l2->tail = NULL; -} - -static void rcu_pending_list_add(struct rcu_pending_list *l, - struct rcu_head *n) -{ -#ifdef __KERNEL__ - if (!l->head) - l->head = n; - else - l->tail->next = n; - l->tail = n; - n->next = NULL; -#else - if (!l->head) - l->head = n; - else - l->tail->next.next = (void *) n; - l->tail = n; - n->next.next = NULL; -#endif -} - 
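The two-tier bookkeeping above (a genradix used as a push-only vector per pending grace-period cookie, plus a fixed set of fallback lists) is all driven through the small API declared in the rcu_pending.h hunk removed further down in this diff. For orientation, here is a minimal, illustrative sketch of the caller's side, assuming a kernel build environment; `struct cached_obj`, `pending_free()` and the `my_cache_*` helpers are hypothetical names, not code from the tree:

#include <linux/container_of.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include "rcu_pending.h"

/* Hypothetical object whose frees we want deferred past a grace period */
struct cached_obj {
	struct rcu_head		rcu;
	/* payload... */
};

static struct rcu_pending my_pending;

/* Process callback: invoked once the object's grace period has elapsed */
static void pending_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct cached_obj, rcu));
}

static int my_cache_init(void)
{
	/* NULL srcu_struct selects the normal RCU flavor */
	return rcu_pending_init(&my_pending, NULL, pending_free);
}

static void my_cache_retire(struct cached_obj *obj)
{
	/* Defer freeing: obj is queued until its grace period completes */
	rcu_pending_enqueue(&my_pending, &obj->rcu);
}

static void my_cache_exit(void)
{
	/* Waits for and processes everything still pending */
	rcu_pending_exit(&my_pending);
}

Per the kernel-doc on rcu_pending_init() below, passing a NULL @process callback instead selects kvfree() on the enqueued objects.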
-static void merge_expired_lists(struct rcu_pending_pcpu *p) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - - for (struct rcu_pending_list *i = p->lists; i < expired; i++) - if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) - rcu_pending_list_merge(expired, i); -} - -#ifndef __KERNEL__ -static inline void kfree_bulk(size_t nr, void ** p) -{ - while (nr--) - kfree(*p); -} -#endif - -static noinline void __process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; - struct rcu_pending_seq objs = {}; - struct rcu_head *list = NULL; - - if (p->objs.nr && - __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { - objs = p->objs.data[0]; - darray_remove_item(&p->objs, p->objs.data); - } - - merge_expired_lists(p); - - list = expired->head; - expired->head = expired->tail = NULL; - - spin_unlock_irqrestore(&p->lock, flags); - - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - for (size_t i = 0; i < objs.nr; ) { - size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); - - kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); - i += nr_this_node; - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - - /* - * low bit of pointer indicates whether rcu_head needs - * to be freed - kvfree_rcu_mightsleep() - */ - BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); - - void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); - bool free_head = ((unsigned long) obj->func) & 1UL; - - kvfree(ptr); - if (free_head) - kfree(obj); - } - - break; - - case RCU_PENDING_CALL_RCU: - for (size_t i = 0; i < objs.nr; i++) { - struct rcu_head *obj = *genradix_ptr(&objs.objs, i); - obj->func(obj); - } - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - obj->func(obj); - } - break; - - default: - for (size_t i = 0; i < objs.nr; i++) - pending->process(pending, *genradix_ptr(&objs.objs, i)); - genradix_free(&objs.objs); - - while (list) { - struct rcu_head *obj = list; -#ifdef __KERNEL__ - list = obj->next; -#else - list = (void *) obj->next.next; -#endif - pending->process(pending, obj); - } - break; - } -} - -static bool process_finished_items(struct rcu_pending *pending, - struct rcu_pending_pcpu *p, - unsigned long flags) -{ - /* - * XXX: we should grab the gp seq once and avoid multiple function - * calls, this is called from __rcu_pending_enqueue() fastpath in - * may_sleep==true mode - */ - if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || - (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || - (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || - p->lists[2].head) { - __process_finished_items(pending, p, flags); - return true; - } - - return false; -} - -static void rcu_pending_work(struct work_struct *work) -{ - struct rcu_pending_pcpu *p = - container_of(work, struct rcu_pending_pcpu, work); - struct rcu_pending *pending = p->parent; - unsigned long flags; - - do { - spin_lock_irqsave(&p->lock, flags); - } while (process_finished_items(pending, p, flags)); - - spin_unlock_irqrestore(&p->lock, flags); -} - -static void rcu_pending_rcu_cb(struct 
rcu_head *rcu) -{ - struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); - - schedule_work_on(p->cpu, &p->work); - - unsigned long flags; - spin_lock_irqsave(&p->lock, flags); - if (__rcu_pending_has_pending(p)) { - spin_unlock_irqrestore(&p->lock, flags); - __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); - } else { - p->cb_armed = false; - spin_unlock_irqrestore(&p->lock, flags); - } -} - -static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) -{ - darray_for_each_reverse(p->objs, objs) - if (rcu_gp_poll_cookie_eq(objs->seq, seq)) - return objs; - - if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) - return NULL; - - return &darray_last(p->objs); -} - -static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, - struct rcu_head *head, void *ptr, - unsigned long *flags) -{ - if (ptr) { - if (!head) { - /* - * kvfree_rcu_mightsleep(): we weren't passed an - * rcu_head, but we need one: use the low bit of the - * pointer to free to flag that the head needs to be - * freed as well: - */ - ptr = (void *)(((unsigned long) ptr)|1UL); - head = kmalloc(sizeof(*head), __GFP_NOWARN); - if (!head) { - spin_unlock_irqrestore(&p->lock, *flags); - head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); - /* - * dropped lock, did GFP_KERNEL allocation, - * check for gp expiration - */ - if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { - kvfree(--ptr); - kfree(head); - spin_lock_irqsave(&p->lock, *flags); - return false; - } - } - } - - head->func = ptr; - } -again: - for (struct rcu_pending_list *i = p->lists; - i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (rcu_gp_poll_cookie_eq(i->seq, seq)) { - rcu_pending_list_add(i, head); - return false; - } - } - - for (struct rcu_pending_list *i = p->lists; - i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (!i->head) { - i->seq = seq; - rcu_pending_list_add(i, head); - return true; - } - } - - merge_expired_lists(p); - goto again; -} - -/* - * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via - * pending->process) once grace period elapses. - * - * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall - * back to a linked list. - * - * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a - * process callback - * - * - If @ptr and @head are both not NULL, we're kvfree_rcu() - * - * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() - * - * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process - * expired items. 
- */ -static __always_inline void -__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, - void *ptr, bool may_sleep) -{ - - struct rcu_pending_pcpu *p; - struct rcu_pending_seq *objs; - struct genradix_node *new_node = NULL; - unsigned long flags; - bool start_gp = false; - - BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - - /* We could technically be scheduled before taking the lock and end up - * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock - * anyways - * - * And we have to do it this way to avoid breaking PREEMPT_RT, which - * redefines how spinlocks work: - */ - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); -restart: - if (may_sleep && - unlikely(process_finished_items(pending, p, flags))) - goto check_expired; - - /* - * In kvfree_rcu() mode, the radix tree is only for slab pointers so - * that we can do kfree_bulk() - vmalloc pointers always use the linked - * list: - */ - if (ptr && unlikely(is_vmalloc_addr(ptr))) - goto list_add; - - objs = get_object_radix(p, seq); - if (unlikely(!objs)) - goto list_add; - - if (unlikely(!objs->cursor)) { - /* - * New radix tree nodes must be added under @p->lock because the - * tree root is in a darray that can be resized (typically, - * genradix supports concurrent unlocked allocation of new - * nodes) - hence preallocation and the retry loop: - */ - objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, - objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); - if (unlikely(!objs->cursor)) { - if (may_sleep) { - spin_unlock_irqrestore(&p->lock, flags); - - gfp_t gfp = GFP_KERNEL; - if (!head) - gfp |= __GFP_NOFAIL; - - new_node = genradix_alloc_node(gfp); - if (!new_node) - may_sleep = false; - goto check_expired; - } -list_add: - start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); - goto start_gp; - } - } - - *objs->cursor++ = ptr ?: head; - /* zero cursor if we hit the end of a radix tree node: */ - if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) - objs->cursor = NULL; - start_gp = !objs->nr; - objs->nr++; -start_gp: - if (unlikely(start_gp)) { - /* - * We only have one callback (ideally, we would have one for - * every outstanding grace period) - so if our callback is - * already in flight, we may still have to start a grace period - * (since we used get_state() above, not start_poll()) - */ - if (!p->cb_armed) { - p->cb_armed = true; - __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - } else { - __start_poll_synchronize_rcu(pending->srcu); - } - } - spin_unlock_irqrestore(&p->lock, flags); -free_node: - if (new_node) - genradix_free_node(new_node); - return; -check_expired: - if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { - switch ((ulong) pending->process) { - case RCU_PENDING_KVFREE: - kvfree(ptr); - break; - case RCU_PENDING_CALL_RCU: - head->func(head); - break; - default: - pending->process(pending, head); - break; - } - goto free_node; - } - - p = raw_cpu_ptr(pending->p); - spin_lock_irqsave(&p->lock, flags); - goto restart; -} - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) -{ - __rcu_pending_enqueue(pending, obj, NULL, true); -} - -static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) -{ - struct rcu_head *ret = NULL; - - spin_lock_irq(&p->lock); - darray_for_each(p->objs, objs) - if (objs->nr) { - ret = *genradix_ptr(&objs->objs, --objs->nr); - objs->cursor = NULL; - 
if (!objs->nr) - genradix_free(&objs->objs); - goto out; - } - - static_array_for_each(p->lists, i) - if (i->head) { - ret = i->head; -#ifdef __KERNEL__ - i->head = ret->next; -#else - i->head = (void *) ret->next.next; -#endif - if (!i->head) - i->tail = NULL; - goto out; - } -out: - spin_unlock_irq(&p->lock); - - return ret; -} - -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) -{ - return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); -} - -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) -{ - struct rcu_head *ret = rcu_pending_dequeue(pending); - - if (ret) - return ret; - - int cpu; - for_each_possible_cpu(cpu) { - ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); - if (ret) - break; - } - return ret; -} - -static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) -{ - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - spin_lock_irq(&p->lock); - if (__rcu_pending_has_pending(p) || p->cb_armed) { - spin_unlock_irq(&p->lock); - return true; - } - spin_unlock_irq(&p->lock); - } - - return false; -} - -void rcu_pending_exit(struct rcu_pending *pending) -{ - int cpu; - - if (!pending->p) - return; - - while (rcu_pending_has_pending_or_armed(pending)) { - __rcu_barrier(pending->srcu); - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - flush_work(&p->work); - } - - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - - static_array_for_each(p->lists, i) - WARN_ON(i->head); - WARN_ON(p->objs.nr); - darray_exit(&p->objs); - } - free_percpu(pending->p); -} - -/** - * rcu_pending_init: - initialize a rcu_pending - * - * @pending: Object to init - * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal - * RCU flavor - * @process: Callback function invoked on objects once their RCU barriers - * have completed; if NULL, kvfree() is used. 
- */ -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process) -{ - pending->p = alloc_percpu(struct rcu_pending_pcpu); - if (!pending->p) - return -ENOMEM; - - int cpu; - for_each_possible_cpu(cpu) { - struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); - p->parent = pending; - p->cpu = cpu; - spin_lock_init(&p->lock); - darray_init(&p->objs); - INIT_WORK(&p->work, rcu_pending_work); - } - - pending->srcu = srcu; - pending->process = process; - - return 0; -} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h deleted file mode 100644 index 71a2f4ddaade48..00000000000000 --- a/fs/bcachefs/rcu_pending.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_RCU_PENDING_H -#define _LINUX_RCU_PENDING_H - -#include - -struct rcu_pending; -typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); - -struct rcu_pending_pcpu; - -struct rcu_pending { - struct rcu_pending_pcpu __percpu *p; - struct srcu_struct *srcu; - rcu_pending_process_fn process; -}; - -void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); -struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); -struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); - -void rcu_pending_exit(struct rcu_pending *pending); -int rcu_pending_init(struct rcu_pending *pending, - struct srcu_struct *srcu, - rcu_pending_process_fn process); - -#endif /* _LINUX_RCU_PENDING_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c deleted file mode 100644 index 1c345b86b1c007..00000000000000 --- a/fs/bcachefs/rebalance.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "btree_iter.h" -#include "btree_update.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_groups.h" -#include "errcode.h" -#include "error.h" -#include "inode.h" -#include "io_write.h" -#include "move.h" -#include "rebalance.h" -#include "subvolume.h" -#include "super-io.h" -#include "trace.h" - -#include -#include -#include - -/* bch_extent_rebalance: */ - -static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) -{ - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); -} - -static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_compression) - return 0; - - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) - return 0; - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, - struct 
bkey_ptrs_c ptrs) -{ - if (!opts->background_target || - !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) - return 0; - - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - guard(rcu)(); - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!opts) - return 0; - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return 0; - - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (opts->background_target) { - guard(rcu)(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_s_c k) -{ - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); - return old == NULL || memcmp(old, &new, sizeof(new)); - } else { - return old != NULL; - } -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) -{ - if (!bkey_extent_is_direct_data(&_k->k)) - return 0; - - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { - if (!old) { - old = bkey_val_end(k); - k.k->u64s += sizeof(*old) / sizeof(u64); - } - - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } - - return 0; -} - -int bch2_get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) -{ - BUG_ON(iter->flags & BTREE_ITER_is_extents); - BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); - - const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v - ? 
bch2_bkey_rebalance_opts(k) : NULL; - if (r) { -#define x(_name) \ - if (r->_name##_from_inode) { \ - io_opts->_name = r->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } - - if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) - return 0; - - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_reassemble(n, k); - - /* On successful transaction commit, @k was invalidated: */ - - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; -} - -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - -static const char * const bch2_rebalance_state_strs[] = { -#define x(t) #t, - BCH_REBALANCE_STATES() - NULL -#undef x -}; - -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_cookie *cookie; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - v = k.k->type == KEY_TYPE_cookie - ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) - : 0; - - cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); - ret = PTR_ERR_OR_ZERO(cookie); - if (ret) - goto err; - - bkey_cookie_init(&cookie->k_i); - cookie->k.p = iter.pos; - cookie->v.cookie = cpu_to_le64(v + 1); - - ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_set_rebalance_needs_scan_trans(trans, inum)); - bch2_rebalance_wakeup(c); - return ret; -} - -int bch2_set_fs_needs_rebalance(struct bch_fs *c) -{ - return bch2_set_rebalance_needs_scan(c, 0); -} - -static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - v = k.k->type == KEY_TYPE_cookie - ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) - : 0; - - if (v == cookie) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, - struct btree_iter *work_iter) -{ - return !kthread_should_stop() - ? 
bch2_btree_iter_peek(trans, work_iter) - : bkey_s_c_null; -} - -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; - - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); -} - -static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, - struct bpos work_pos, - struct btree_iter *extent_iter, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bch_fs *c = trans->c; - - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, - work_pos, - BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); - if (bkey_err(k)) - return k; - - int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - - memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); - return bkey_s_c_null; - } - - if (trace_rebalance_extent_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); - if (p) { - prt_str(&buf, "compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); - if (p) { - prt_str(&buf, "move="); - bch2_target_to_text(&buf, c, io_opts->background_target); - prt_str(&buf, " "); - bch2_prt_u64_base2(&buf, p); - prt_newline(&buf); - } - - trace_rebalance_extent(c, buf.buf); - printbuf_exit(&buf); - } - - return k; -} - -noinline_for_stack -static int do_rebalance_extent(struct moving_context *ctxt, - struct bpos work_pos, - struct btree_iter *extent_iter) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - struct data_update_opts data_opts; - struct bch_io_opts io_opts; - struct bkey_s_c k; - struct bkey_buf sk; - int ret; - - ctxt->stats = &r->work_stats; - r->state = BCH_REBALANCE_working; - - bch2_bkey_buf_init(&sk); - - ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &io_opts, &data_opts)); - if (ret || !k.k) - goto out; - - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); - - /* - * The iterator gets unlocked by __bch2_read_extent - need to - * save a copy of @k elsewhere: - */ - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); - if (ret) { - if (bch2_err_matches(ret, ENOMEM)) { - /* memory allocation failure, wait for some IO to 
finish */ - bch2_move_ctxt_wait_for_io(ctxt); - ret = bch_err_throw(c, transaction_restart_nested); - } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto out; - - /* skip it and continue, XXX signal failure */ - ret = 0; - } -out: - bch2_bkey_buf_exit(&sk, c); - return ret; -} - -static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &trans->c->rebalance; - - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - ctxt->stats = &r->scan_stats; - - if (!inum) { - r->scan_start = BBPOS_MIN; - r->scan_end = BBPOS_MAX; - } else { - r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); - r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); - } - - r->state = BCH_REBALANCE_scanning; - - struct per_snapshot_io_opts snapshot_io_opts; - per_snapshot_io_opts_init(&snapshot_io_opts, c); - - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, - r->scan_start.pos, r->scan_end.pos, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents| - BTREE_ITER_prefetch, k, ({ - ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - - struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, - &snapshot_io_opts, iter.pos, &iter, k); - PTR_ERR_OR_ZERO(io_opts); - })) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - - per_snapshot_io_opts_exit(&snapshot_io_opts); - bch2_move_stats_exit(&r->scan_stats, trans->c); - - /* - * Ensure that the rebalance_work entries we created are seen by the - * next iteration of do_rebalance(), so we don't end up stuck in - * rebalance_wait(): - */ - atomic64_inc(&r->scan_stats.sectors_seen); - bch2_btree_write_buffer_flush_sync(trans); - - return ret; -} - -static void rebalance_wait(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - struct io_clock *clock = &c->io_clock[WRITE]; - u64 now = atomic64_read(&clock->now); - u64 min_member_capacity = bch2_min_rw_member_capacity(c); - - if (min_member_capacity == U64_MAX) - min_member_capacity = 128 * 2048; - - r->wait_iotime_end = now + (min_member_capacity >> 6); - - if (r->state != BCH_REBALANCE_waiting) { - r->wait_iotime_start = now; - r->wait_wallclock_start = ktime_get_real_ns(); - r->state = BCH_REBALANCE_waiting; - } - - bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); -} - -static bool bch2_rebalance_enabled(struct bch_fs *c) -{ - return c->opts.rebalance_enabled && - !(c->opts.rebalance_on_ac_only && - c->rebalance.on_battery); -} - -static int do_rebalance(struct moving_context *ctxt) -{ - struct btree_trans *trans = ctxt->trans; - struct bch_fs *c = trans->c; - struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = {}; - struct bkey_s_c k; - u32 kick = r->kick; - int ret = 0; - - bch2_trans_begin(trans); - - bch2_move_stats_init(&r->work_stats, "rebalance_work"); - bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - - bch2_trans_iter_init(trans, &rebalance_work_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_all_snapshots); - - while (!bch2_move_ratelimit(ctxt)) { - if (!bch2_rebalance_enabled(c)) { - bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(bch2_rebalance_enabled(c) || - kthread_should_stop()); - } - - if (kthread_should_stop()) - break; - - bch2_trans_begin(trans); - - ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); - if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) - continue; - if (ret || !k.k) - break; - - ret = k.k->type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, k.k->p.inode, - le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) - : do_rebalance_extent(ctxt, k.k->p, &extent_iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - bch2_btree_iter_advance(trans, &rebalance_work_iter); - } - - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_work_iter); - bch2_move_stats_exit(&r->scan_stats, c); - - if (!ret && - !kthread_should_stop() && - !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen) && - kick == r->kick) { - bch2_moving_ctxt_flush_all(ctxt); - bch2_trans_unlock_long(trans); - rebalance_wait(c); - } - - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -static int bch2_rebalance_thread(void *arg) -{ - struct bch_fs *c = arg; - struct bch_fs_rebalance *r = &c->rebalance; - struct moving_context ctxt; - - set_freezable(); - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. - */ - kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || - kthread_should_stop()); - - bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, - writepoint_ptr(&c->rebalance_write_point), - true); - - while (!kthread_should_stop() && !do_rebalance(&ctxt)) - ; - - bch2_moving_ctxt_exit(&ctxt); - - return 0; -} - -void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - printbuf_tabstop_push(out, 32); - - struct bch_fs_rebalance *r = &c->rebalance; - - /* print pending work */ - struct disk_accounting_pos acc; - disk_accounting_key_init(acc, rebalance_work); - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - - prt_printf(out, "pending work:\t"); - prt_human_readable_u64(out, v << 9); - prt_printf(out, "\n\n"); - - prt_str(out, bch2_rebalance_state_strs[r->state]); - prt_newline(out); - printbuf_indent_add(out, 2); - - switch (r->state) { - case BCH_REBALANCE_waiting: { - u64 now = atomic64_read(&c->io_clock[WRITE].now); - - prt_printf(out, "io wait duration:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); - prt_newline(out); - - prt_printf(out, "io wait remaining:\t"); - bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); - prt_newline(out); - - prt_printf(out, "duration waited:\t"); - bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); - prt_newline(out); - break; - } - case BCH_REBALANCE_working: - bch2_move_stats_to_text(out, &r->work_stats); - break; - case BCH_REBALANCE_scanning: - bch2_move_stats_to_text(out, &r->scan_stats); - break; - } - prt_newline(out); - - struct task_struct *t; - scoped_guard(rcu) { - t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - } - - if (t) { - bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); - put_task_struct(t); - } - - printbuf_indent_sub(out, 2); -} - -void bch2_rebalance_stop(struct bch_fs *c) -{ - struct task_struct *p; - - c->rebalance.pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&c->rebalance.pd.rate); - - p = rcu_dereference_protected(c->rebalance.thread, 1); - c->rebalance.thread = NULL; - - if (p) { - /* for synchronizing with bch2_rebalance_wakeup() */ - synchronize_rcu(); - - kthread_stop(p); - put_task_struct(p); - } -} - -int bch2_rebalance_start(struct bch_fs *c) -{ - struct 
task_struct *p; - int ret; - - if (c->rebalance.thread) - return 0; - - if (c->opts.nochanges) - return 0; - - p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); - bch_err_msg(c, ret, "creating rebalance thread"); - if (ret) - return ret; - - get_task_struct(p); - rcu_assign_pointer(c->rebalance.thread, p); - wake_up_process(p); - return 0; -} - -#ifdef CONFIG_POWER_SUPPLY -#include - -static int bch2_rebalance_power_notifier(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); - - c->rebalance.on_battery = !power_supply_is_system_supplied(); - bch2_rebalance_wakeup(c); - return NOTIFY_OK; -} -#endif - -void bch2_fs_rebalance_exit(struct bch_fs *c) -{ -#ifdef CONFIG_POWER_SUPPLY - power_supply_unreg_notifier(&c->rebalance.power_notifier); -#endif -} - -int bch2_fs_rebalance_init(struct bch_fs *c) -{ - struct bch_fs_rebalance *r = &c->rebalance; - - bch2_pd_controller_init(&r->pd); - -#ifdef CONFIG_POWER_SUPPLY - r->power_notifier.notifier_call = bch2_rebalance_power_notifier; - int ret = power_supply_reg_notifier(&r->power_notifier); - if (ret) - return ret; - - r->on_battery = !power_supply_is_system_supplied(); -#endif - return 0; -} - -static int check_rebalance_work_one(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct btree_iter *rebalance_iter, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c extent_k, rebalance_k; - struct printbuf buf = PRINTBUF; - - int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: - bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); - if (ret) - return ret; - - if (!extent_k.k && - extent_iter->btree_id == BTREE_ID_reflink && - (!rebalance_k.k || - rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { - bch2_trans_iter_exit(trans, extent_iter); - bch2_trans_iter_init(trans, extent_iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots); - return bch_err_throw(c, transaction_restart_nested); - } - - if (!extent_k.k && !rebalance_k.k) - return 1; - - int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, - rebalance_k.k ? 
rebalance_k.k->p : SPOS_MAX); - - struct bkey deleted; - bkey_init(&deleted); - - if (cmp < 0) { - deleted.p = extent_k.k->p; - rebalance_k.k = &deleted; - } else if (cmp > 0) { - deleted.p = rebalance_k.k->p; - extent_k.k = &deleted; - } - - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; - bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; - - if (should_have_rebalance != have_rebalance) { - ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); - if (ret) - return ret; - - bch2_bkey_val_to_text(&buf, c, extent_k); - } - - if (fsck_err_on(!should_have_rebalance && have_rebalance, - trans, rebalance_work_incorrectly_set, - "rebalance work incorrectly set\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, false); - if (ret) - goto err; - } - - if (fsck_err_on(should_have_rebalance && !have_rebalance, - trans, rebalance_work_incorrectly_unset, - "rebalance work incorrectly unset\n%s", buf.buf)) { - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - extent_k.k->p, true); - if (ret) - goto err; - } - - if (cmp <= 0) - bch2_btree_iter_advance(trans, extent_iter); - if (cmp >= 0) - bch2_btree_iter_advance(trans, rebalance_iter); -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_rebalance_work(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter rebalance_iter, extent_iter; - int ret = 0; - - bch2_trans_iter_init(trans, &extent_iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch); - bch2_trans_iter_init(trans, &rebalance_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_prefetch); - - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - while (!ret) { - bch2_trans_begin(trans); - - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - } - - bch2_bkey_buf_exit(&last_flushed, c); - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_iter); - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; -} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h deleted file mode 100644 index 7a565ea7dbfcc5..00000000000000 --- a/fs/bcachefs/rebalance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_H -#define _BCACHEFS_REBALANCE_H - -#include "compress.h" -#include "disk_groups.h" -#include "opts.h" -#include "rebalance_types.h" - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, - struct bch_io_opts *opts) -{ - struct bch_extent_rebalance r = { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; -}; - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -int bch2_get_update_rebalance_opts(struct btree_trans *, - struct bch_io_opts *, - struct btree_iter *, - struct bkey_s_c); - -int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); -int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); -int bch2_set_fs_needs_rebalance(struct bch_fs *); - -static inline void bch2_rebalance_wakeup(struct bch_fs *c) -{ - c->rebalance.kick++; - guard(rcu)(); - struct task_struct *p = rcu_dereference(c->rebalance.thread); - if (p) - wake_up_process(p); -} - -void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_rebalance_stop(struct bch_fs *); -int bch2_rebalance_start(struct bch_fs *); - -void bch2_fs_rebalance_exit(struct bch_fs *); -int bch2_fs_rebalance_init(struct bch_fs *); - -int bch2_check_rebalance_work(struct bch_fs *); - -#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h deleted file mode 100644 index ff9a1342a22b4c..00000000000000 --- a/fs/bcachefs/rebalance_format.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_FORMAT_H -#define _BCACHEFS_REBALANCE_FORMAT_H - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, - - promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, - - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, - - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, - - unused:3, - type:6; -#endif -}; - -/* subset of BCH_INODE_OPTS */ -#define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ - x(data_replicas) \ - x(promote_target) \ - x(background_target) \ - x(erasure_code) - -#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ - diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h deleted file mode 100644 index c659da149fa3c4..00000000000000 --- a/fs/bcachefs/rebalance_types.h +++ /dev/null @@ -1,41 +0,0 @@ -/* 
SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REBALANCE_TYPES_H -#define _BCACHEFS_REBALANCE_TYPES_H - -#include "bbpos_types.h" -#include "move_types.h" - -#define BCH_REBALANCE_STATES() \ - x(waiting) \ - x(working) \ - x(scanning) - -enum bch_rebalance_states { -#define x(t) BCH_REBALANCE_##t, - BCH_REBALANCE_STATES() -#undef x -}; - -struct bch_fs_rebalance { - struct task_struct __rcu *thread; - u32 kick; - struct bch_pd_controller pd; - - enum bch_rebalance_states state; - u64 wait_iotime_start; - u64 wait_iotime_end; - u64 wait_wallclock_start; - - struct bch_move_stats work_stats; - - struct bbpos scan_start; - struct bbpos scan_end; - struct bch_move_stats scan_stats; - - bool on_battery; -#ifdef CONFIG_POWER_SUPPLY - struct notifier_block power_notifier; -#endif -}; - -#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c deleted file mode 100644 index c94debb12d2fee..00000000000000 --- a/fs/bcachefs/recovery.c +++ /dev/null @@ -1,1306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "bkey_buf.h" -#include "btree_journal_iter.h" -#include "btree_node_scan.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "buckets.h" -#include "dirent.h" -#include "disk_accounting.h" -#include "errcode.h" -#include "error.h" -#include "journal_io.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "logged_ops.h" -#include "move.h" -#include "movinggc.h" -#include "namei.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-downgrade.h" -#include "snapshot.h" -#include "super-io.h" - -#include -#include - -int bch2_btree_lost_data(struct bch_fs *c, - struct printbuf *msg, - enum btree_id btree) -{ - u64 b = BIT_ULL(btree); - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (!(c->sb.btrees_lost_data & b)) { - prt_printf(msg, "flagging btree "); - bch2_btree_id_to_text(msg, btree); - prt_printf(msg, " lost data\n"); - - ext->btrees_lost_data |= cpu_to_le64(b); - } - - /* Once we have runtime self healing for topology errors we won't need this: */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - - /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - -#ifdef CONFIG_BCACHEFS_DEBUG - /* - * These are much more minor, and don't need to be corrected right away, - * but in debug mode we want the next fsck run to be clean: - */ - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; -#endif - - switch (btree) { - case BTREE_ID_alloc: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, 
ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - goto out; - case BTREE_ID_backpointers: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; - goto out; - case BTREE_ID_need_discard: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_freespace: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_bucket_gens: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_lru: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; - goto out; - case BTREE_ID_accounting: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; - goto out; - case BTREE_ID_snapshots: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - default: - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; - ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; - goto out; - } -out: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static void kill_btree(struct bch_fs *c, enum btree_id btree) -{ - bch2_btree_id_root(c, btree)->alive = false; - bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); -} - -/* for -o reconstruct_alloc: */ -void bch2_reconstruct_alloc(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); - - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, 
ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) - if (btree_id_is_alloc(i)) - kill_btree(c, i); -} - -/* - * Btree node pointers have a field to stack a pointer to the in memory btree - * node; we need to zero out this field when reading in btree nodes, or when - * reading in keys from the journal: - */ -static void zero_out_btree_mem_ptr(struct journal_keys *keys) -{ - darray_for_each(*keys, i) - if (i->k->k.type == KEY_TYPE_btree_ptr_v2) - bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; -} - -/* journal replay: */ - -static void replay_now_at(struct journal *j, u64 seq) -{ - BUG_ON(seq < j->replay_journal_seq); - - seq = min(seq, j->replay_journal_seq_end); - - while (j->replay_journal_seq < seq) - bch2_journal_pin_put(j, j->replay_journal_seq++); -} - -static int bch2_journal_replay_accounting_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct bkey u; - struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); - - /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { - ret = 0; - goto out; - } - - struct bkey_i *new = k->k; - if (old.k->type == KEY_TYPE_accounting) { - new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - bch2_accounting_accumulate(bkey_i_to_accounting(new), - bkey_s_c_to_accounting(old)); - } - - trans->journal_res.seq = k->journal_seq; - - ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_journal_replay_key(struct btree_trans *trans, - struct journal_key *k) -{ - struct btree_iter iter; - unsigned iter_flags = - BTREE_ITER_intent| - BTREE_ITER_not_extents; - unsigned update_flags = BTREE_TRIGGER_norun; - int ret; - - if (k->overwritten) - return 0; - - trans->journal_res.seq = k->journal_seq; - - /* - * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to - * keep the key cache coherent with the underlying btree. 
Nothing - * besides the allocator is doing updates yet so we don't need key cache - * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_filter_snapshots, which isn't available until - * the snapshots recovery pass runs. - */ - if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_cached; - else - update_flags |= BTREE_UPDATE_key_cache_reclaim; - - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, k->level, - iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter); - if (ret) - goto out; - - struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level))) { - struct bch_fs *c = trans->c; - - CLASS(printbuf, buf)(); - prt_str(&buf, "btree="); - bch2_btree_id_to_text(&buf, k->btree_id); - prt_printf(&buf, " level=%u ", k->level); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); - - if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| - BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { - bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", - buf.buf); - ret = -EINVAL; - } - - if (!k->allocated) { - bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", - buf.buf); - k->overwritten = true; - goto out; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, - BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_increase_depth(trans, iter.path, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } - - /* Must be checked with btree locked: */ - if (k->overwritten) - goto out; - - if (k->k->k.type == KEY_TYPE_accounting) { - struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto out; - - bkey_copy(n, k->k); - goto out; - } - - ret = bch2_trans_update(trans, &iter, k->k, update_flags); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int journal_sort_seq_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = *((const struct journal_key **)_l); - const struct journal_key *r = *((const struct journal_key **)_r); - - /* - * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last - * - * journal_seq == 0 means that the key comes from early repair, and - * should be inserted last so as to avoid overflowing the journal - */ - return cmp_int(l->journal_seq - 1, r->journal_seq - 1); -} - -int bch2_journal_replay(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - DARRAY(struct journal_key *) keys_sorted = { 0 }; - struct journal *j = &c->journal; - u64 start_seq = c->journal_replay_seq_start; - u64 end_seq = c->journal_replay_seq_end; - struct btree_trans *trans = NULL; - bool immediate_flush = false; - int ret = 0; - - if (keys->nr) { - ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", - keys->nr, start_seq, end_seq); - if (ret) - goto err; - } - - BUG_ON(!atomic_read(&keys->ref)); - - move_gap(keys, keys->nr); - trans = bch2_trans_get(c); - - /* - * Replay accounting keys first: we can't allow the write buffer to - * flush accounting keys until we're done - */ - darray_for_each(*keys, k) { - if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated)) - continue; - - cond_resched(); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| -
BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res| - BCH_WATERMARK_reclaim, - bch2_journal_replay_accounting_key(trans, k)); - if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) - goto err; - - k->overwritten = true; - } - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - /* - * First, attempt to replay keys in sorted order. This is more - * efficient - better locality of btree access - but some might fail if - * that would cause a journal deadlock. - */ - darray_for_each(*keys, k) { - cond_resched(); - - /* - * k->allocated means the key wasn't read in from the journal, - * rather it was from early repair code - */ - if (k->allocated) - immediate_flush = true; - - /* Skip fastpath if we're low on space in the journal */ - ret = c->journal.watermark ? -1 : - commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_journal_reclaim| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), - bch2_journal_replay_key(trans, k)); - BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting); - if (ret) { - ret = darray_push(&keys_sorted, k); - if (ret) - goto err; - } - } - - bch2_trans_unlock_long(trans); - /* - * Now, replay any remaining keys in the order in which they appear in - * the journal, unpinning those journal entries as we go: - */ - sort_nonatomic(keys_sorted.data, keys_sorted.nr, - sizeof(keys_sorted.data[0]), - journal_sort_seq_cmp, NULL); - - darray_for_each(keys_sorted, kp) { - cond_resched(); - - struct journal_key *k = *kp; - - if (k->journal_seq) - replay_now_at(j, k->journal_seq); - else - replay_now_at(j, j->replay_journal_seq_end); - - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_skip_accounting_apply| - (!k->allocated - ? 
BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim - : 0), - bch2_journal_replay_key(trans, k)); - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); - bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); - printbuf_exit(&buf); - goto err; - } - - BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); - } - - /* - * We need to put our btree_trans before calling flush_all_pins(), since - * that will use a btree_trans internally - */ - bch2_trans_put(trans); - trans = NULL; - - if (!c->opts.retain_recovery_info && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) - bch2_journal_keys_put_initial(c); - - replay_now_at(j, j->replay_journal_seq_end); - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - - /* if we did any repair, flush it immediately */ - if (immediate_flush) { - bch2_journal_flush_all_pins(&c->journal); - ret = bch2_journal_meta(&c->journal); - } - - if (keys->nr) - bch2_journal_log_msg(c, "journal replay finished"); -err: - if (trans) - bch2_trans_put(trans); - darray_exit(&keys_sorted); - bch_err_fn(c, ret); - return ret; -} - -/* journal replay early: */ - -static int journal_replay_entry_early(struct bch_fs *c, - struct jset_entry *entry) -{ - int ret = 0; - - switch (entry->type) { - case BCH_JSET_ENTRY_btree_root: { - - if (unlikely(!entry->u64s)) - return 0; - - if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, - c, invalid_btree_id, - "invalid btree id %u (max %u)", - entry->btree_id, BTREE_ID_NR_MAX)) - return 0; - - while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { - ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); - if (ret) - return ret; - } - - struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - r->alive = true; - break; - } - case BCH_JSET_ENTRY_usage: { - struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); - - switch (entry->btree_id) { - case BCH_FS_USAGE_key_version: - atomic64_set(&c->key_version, le64_to_cpu(u->v)); - break; - } - break; - } - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq) + 1); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = bch2_journal_seq_blacklist_add(c, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end) + 1); - break; - } - case BCH_JSET_ENTRY_clock: { - struct jset_entry_clock *clock = - container_of(entry, struct jset_entry_clock, entry); - - atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); - } - } -fsck_err: - return ret; -} - -static int journal_replay_early(struct bch_fs *c, - struct bch_sb_field_clean *clean) -{ - if (clean) { - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - } else { - struct genradix_iter iter; - struct journal_replay *i, **_i; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - vstruct_for_each(&i->j, entry) { - int ret = journal_replay_entry_early(c, entry); - if (ret) - return ret; - } - 
} - } - - return 0; -} - -/* sb clean section: */ - -static int read_btree_roots(struct bch_fs *c) -{ - struct printbuf buf = PRINTBUF; - int ret = 0; - - for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->alive) - continue; - - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, i, r->level); - - if (mustfix_fsck_err_on((ret = r->error), - c, btree_root_bkey_invalid, - "invalid btree root %s", - buf.buf) || - mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), - c, btree_root_read_error, - "error reading btree root %s: %s", - buf.buf, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) - r->error = 0; - ret = 0; - } - } - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - - if (!r->b && !r->error) { - r->alive = false; - r->level = 0; - bch2_btree_root_alloc_fake(c, i, 0); - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static bool check_version_upgrade(struct bch_fs *c) -{ - unsigned latest_version = bcachefs_metadata_version_current; - unsigned latest_compatible = min(latest_version, - bch2_latest_compatible_version(c->sb.version)); - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = 0; - bool ret = false; - - if (old_version < bcachefs_metadata_required_upgrade_below) { - if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || - latest_compatible < bcachefs_metadata_required_upgrade_below) - new_version = latest_version; - else - new_version = latest_compatible; - } else { - switch (c->opts.version_upgrade) { - case BCH_VERSION_UPGRADE_compatible: - new_version = latest_compatible; - break; - case BCH_VERSION_UPGRADE_incompatible: - new_version = latest_version; - break; - case BCH_VERSION_UPGRADE_none: - new_version = min(old_version, latest_version); - break; - } - } - - if (new_version > old_version) { - struct printbuf buf = PRINTBUF; - - if (old_version < bcachefs_metadata_required_upgrade_below) - prt_str(&buf, "Version upgrade required:\n"); - - if (old_version != c->sb.version) { - prt_str(&buf, "Version upgrade from "); - bch2_version_to_text(&buf, c->sb.version_upgrade_complete); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, " incomplete\n"); - } - - prt_printf(&buf, "Doing %s version upgrade from ", - BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) - ? 
"incompatible" : "compatible"); - bch2_version_to_text(&buf, old_version); - prt_str(&buf, " to "); - bch2_version_to_text(&buf, new_version); - prt_newline(&buf); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_upgrade(c, old_version, new_version); - passes = ext->recovery_passes_required[0] & ~passes; - - if (passes) { - prt_str(&buf, " running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (new_version > c->sb.version_incompat_allowed && - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, new_version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - ret = true; - } - - if (ret) - bch2_sb_upgrade(c, new_version, - c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); - - return ret; -} - -int bch2_fs_recovery(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean = NULL; - struct jset *last_journal_entry = NULL; - u64 last_seq = 0, blacklist_seq, journal_seq; - int ret = 0; - - if (c->sb.clean) { - clean = bch2_read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; - - bch_info(c, "recovering from clean shutdown, journal seq %llu", - le64_to_cpu(clean->journal_seq)); - } else { - bch_info(c, "recovering from unclean shutdown"); - } - - if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { - bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); - ret = -EINVAL; - goto err; - } - - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; - } - - if (c->opts.norecovery) { - c->opts.recovery_pass_last = c->opts.recovery_pass_last - ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) - : BCH_RECOVERY_PASS_snapshots_read; - c->opts.nochanges = true; - } - - if (c->opts.nochanges) - c->opts.read_only = true; - - if (c->opts.journal_rewind) { - bch_info(c, "rewinding journal, fsck required"); - c->opts.fsck = true; - } - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; - - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } - - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Version downgrade required:"); - - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; - } - - if (check_version_upgrade(c)) - write_sb = true; - - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { - SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (c->sb.clean) - set_bit(BCH_FS_clean_recovery, &c->flags); - if (c->opts.fsck) - set_bit(BCH_FS_in_fsck, &c->flags); - set_bit(BCH_FS_in_recovery, &c->flags); - - ret = bch2_blacklist_table_initialize(c); - if (ret) { - bch_err(c, "error initializing blacklist table"); - goto err; - } - - bch2_journal_pos_from_member_info_resume(c); - - if (!c->sb.clean || c->opts.retain_recovery_info) { - struct genradix_iter iter; - struct journal_replay **i; - - bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); - if (ret) - goto err; - - /* - * note: cmd_list_journal needs the blacklist table fully up to date so - * it can asterisk ignored journal entries: - */ - if (c->opts.read_journal_only) - goto out; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (!journal_replay_ignore(*i)) { - last_journal_entry = &(*i)->j; - break; - } - - if (mustfix_fsck_err_on(c->sb.clean && - last_journal_entry && - !journal_entry_empty(last_journal_entry), c, - clean_but_journal_not_empty, - "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - } - - if (!last_journal_entry) { - fsck_err_on(!c->sb.clean, c, - dirty_but_no_journal_entries, - "no journal entries 
found"); - if (clean) - goto use_clean; - - genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i) { - last_journal_entry = &(*i)->j; - (*i)->ignore_blacklisted = false; - (*i)->ignore_not_dirty= false; - /* - * This was probably a NO_FLUSH entry, - * so last_seq was garbage - but we know - * we're only using a single journal - * entry, set it here: - */ - (*i)->j.last_seq = (*i)->j.seq; - break; - } - } - - ret = bch2_journal_keys_sort(c); - if (ret) - goto err; - - if (c->sb.clean && last_journal_entry) { - ret = bch2_verify_superblock_clean(c, &clean, - last_journal_entry); - if (ret) - goto err; - } - } else { -use_clean: - if (!clean) { - bch_err(c, "no superblock clean section found"); - ret = bch_err_throw(c, fsck_repair_impossible); - goto err; - - } - blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; - } - - c->journal_replay_seq_start = last_seq; - c->journal_replay_seq_end = blacklist_seq - 1; - - zero_out_btree_mem_ptr(&c->journal_keys); - - ret = journal_replay_early(c, clean); - if (ret) - goto err; - - ret = bch2_fs_resize_on_mount(c); - if (ret) { - up_write(&c->state_lock); - goto err; - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_info(c, "filesystem is an unresized image file, mounting ro"); - c->opts.read_only = true; - } - - if (!c->opts.read_only && - (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - - bch2_reconstruct_alloc(c); - } else if (c->opts.reconstruct_alloc) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - - bch2_reconstruct_alloc(c); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - /* We can't go RW to fix errors without alloc info */ - if (c->opts.fix_errors == FSCK_FIX_yes || - c->opts.fix_errors == FSCK_FIX_ask) - c->opts.fix_errors = FSCK_FIX_no; - if (c->opts.errors == BCH_ON_ERROR_fix_safe) - c->opts.errors = BCH_ON_ERROR_continue; - } - - /* - * After an unclean shutdown, skip then next few journal sequence - * numbers as they may have been referenced by btree writes that - * happened before their corresponding journal writes - those btree - * writes need to be ignored, by skipping and blacklisting the next few - * journal sequence numbers: - */ - if (!c->sb.clean) - journal_seq += JOURNAL_BUF_NR * 4; - - if (blacklist_seq != journal_seq) { - ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", - blacklist_seq, journal_seq) ?: - bch2_journal_seq_blacklist_add(c, - blacklist_seq, journal_seq); - if (ret) { - bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); - goto err; - } - } - - ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", - journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, last_seq, journal_seq); - if (ret) - goto err; - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); - - ret = read_btree_roots(c); - if (ret) - goto err; - - set_bit(BCH_FS_btree_running, &c->flags); - - ret = bch2_sb_set_upgrade_extra(c); - if (ret) - goto err; - - ret = bch2_run_recovery_passes(c, 0); - if (ret) - goto err; - - /* - * Normally set by the appropriate recovery pass: when cleared, this - * indicates we're in early recovery and btree updates should be done by - * being applied 
to the journal replay keys. _Must_ be cleared before - * multithreaded use: - */ - set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_in_fsck, &c->flags); - - /* in case we don't run journal replay, i.e. norecovery mode */ - set_bit(BCH_FS_accounting_replay_done, &c->flags); - - bch2_async_btree_node_rewrites_flush(c); - - /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - /* If we fixed errors, verify that fs is actually clean now: */ - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_errors_fixed, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags) && - !test_bit(BCH_FS_error, &c->flags)) { - bch2_flush_fsck_errs(c); - - bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_errors_fixed, &c->flags); - - ret = bch2_run_recovery_passes(c, - BCH_RECOVERY_PASS_check_alloc_info); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags) || - test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_errors_not_fixed, &c->flags); - } - - set_bit(BCH_FS_errors_fixed, &c->flags); - } - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - - mutex_lock(&c->sb_lock); - ext = bch2_sb_field_get(c->disk_sb.sb, ext); - write_sb = false; - - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - write_sb = true; - } - - if (!test_bit(BCH_FS_error, &c->flags) && - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && - ext->btrees_lost_data) { - ext->btrees_lost_data = 0; - write_sb = true; - } - - if (c->opts.fsck && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); - write_sb = true; - } - - if (bch2_blacklist_entries_gc(c)) - write_sb = true; - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { - struct bch_move_stats stats; - - bch2_move_stats_init(&stats, "recovery"); - - struct printbuf buf = PRINTBUF; - bch2_version_to_text(&buf, c->sb.version_min); - bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); - printbuf_exit(&buf); - - ret = bch2_fs_read_write_early(c) ?: - bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - - ret = 0; -out: - bch2_flush_fsck_errs(c); - - if (!ret && - test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && - !c->opts.nochanges) { - bch2_fs_read_write_early(c); - bch2_delete_dead_snapshots_async(c); - } - - bch_err_fn(c, ret); -final_out: - if (!IS_ERR(clean)) - 
kfree(clean); - return ret; -err: -fsck_err: - { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); - bch2_fs_emergency_read_only2(c, &buf); - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - goto final_out; -} - -int bch2_fs_initialize(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - struct bkey_inode_buf packed_inode; - struct qstr lostfound = QSTR("lost+found"); - struct bch_member *m; - int ret; - - bch_notice(c, "initializing new filesystem"); - set_bit(BCH_FS_new_fs, &c->flags); - - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - - bch2_check_version_downgrade(c); - - if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - bch2_write_super(c); - } - - for_each_member_device(c, ca) { - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); - ca->mi = bch2_mi_to_cpu(m); - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - set_bit(BCH_FS_btree_running, &c->flags); - set_bit(BCH_FS_may_go_rw, &c->flags); - - for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc_fake(c, i, 0); - - ret = bch2_fs_journal_alloc(c); - if (ret) - goto err; - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - ret = bch2_fs_journal_start(&c->journal, 1, 1); - if (ret) - goto err; - - ret = bch2_fs_read_write_early(c); - if (ret) - goto err; - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); - - for_each_member_device(c, ca) { - ret = bch2_dev_usage_init(ca, false); - if (ret) { - bch2_dev_put(ca); - goto err; - } - } - - /* - * Write out the superblock and journal buckets, now that we can do - * btree updates - */ - bch_verbose(c, "marking superblocks"); - ret = bch2_trans_mark_dev_sbs(c); - bch_err_msg(c, ret, "marking superblocks"); - if (ret) - goto err; - - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - ret = bch2_initialize_subvolumes(c); - if (ret) - goto err; - - bch_verbose(c, "reading snapshots table"); - ret = bch2_snapshots_read(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - bch2_inode_pack(&packed_inode, &root_inode); - packed_inode.inode.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "creating root directory"); - if (ret) - goto err; - - bch2_inode_init_early(c, &lostfound_inode); - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_create_trans(trans, - BCACHEFS_ROOT_SUBVOL_INUM, - &root_inode, &lostfound_inode, - &lostfound, - 0, 0, S_IFDIR|0700, 0, - NULL, NULL, (subvol_inum) { 0 }, 0)); - bch_err_msg(c, ret, "creating lost+found"); - if (ret) - goto err; - - c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; - - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - } - - ret = 
bch2_journal_flush(&c->journal); - bch_err_msg(c, ret, "writing first journal entry"); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; - return 0; -err: - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h deleted file mode 100644 index c023f52fc2d6dc..00000000000000 --- a/fs/bcachefs/recovery.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_H -#define _BCACHEFS_RECOVERY_H - -int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); -void bch2_reconstruct_alloc(struct bch_fs *); - -int bch2_journal_replay(struct bch_fs *); - -int bch2_fs_recovery(struct bch_fs *); -int bch2_fs_initialize(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c deleted file mode 100644 index 6a039e0110643f..00000000000000 --- a/fs/bcachefs/recovery_passes.c +++ /dev/null @@ -1,646 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "alloc_background.h" -#include "backpointers.h" -#include "btree_gc.h" -#include "btree_node_scan.h" -#include "disk_accounting.h" -#include "ec.h" -#include "fsck.h" -#include "inode.h" -#include "journal.h" -#include "lru.h" -#include "logged_ops.h" -#include "movinggc.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static const u8 passes_from_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) -{ - return passes_to_stable_map[pass]; -} - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; -} - -static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) -{ - return pass < ARRAY_SIZE(passes_from_stable_map) - ? 
passes_from_stable_map[pass] - : 0; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_from_stable_map[i]); - return ret; -} - -static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_recovery_passes_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_recovery_passes *r = - field_to_type(f, recovery_passes); - unsigned nr = recovery_passes_nr_entries(r); - - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 32); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 16); - - prt_printf(out, "Pass\tLast run\tLast runtime\n"); - - for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { - if (!i->last_run) - continue; - - unsigned idx = i - r->start; - - prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); - - bch2_prt_datetime(out, le64_to_cpu(i->last_run)); - prt_tab(out); - - bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); - - if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - prt_str(out, " (no ratelimit)"); - - prt_newline(out); - } -} - -static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable >= recovery_passes_nr_entries(r)) { - unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); - - r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); - if (!r) { - bch_err(c, "error creating recovery_passes sb section"); - return NULL; - } - } - - return r->start + stable; -} - -static void bch2_sb_recovery_pass_complete(struct bch_fs *c, - enum bch_recovery_pass pass, - s64 start_time) -{ - guard(mutex)(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __clear_bit_le64(bch2_recovery_pass_to_stable(pass), - ext->recovery_passes_required); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e) { - s64 end_time = ktime_get_real_seconds(); - e->last_run = cpu_to_le64(end_time); - e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); - } - - bch2_write_super(c); -} - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - guard(mutex)(&c->sb_lock); - - struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); - if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { - SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); - bch2_write_super(c); - } -} - -static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - bool ret = false; - - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_recovery_passes *r = - bch2_sb_field_get(c->disk_sb.sb, recovery_passes); - - if (stable < recovery_passes_nr_entries(r)) { - struct recovery_pass_entry *i = r->start + stable; - - /* - * Ratelimit if the last runtime was more than 1% of the time - * since we last ran - */ - ret = (u64) le32_to_cpu(i->last_runtime) * 100 > - ktime_get_real_seconds() - le64_to_cpu(i->last_run); -
if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) - ret = false; - } - - return ret; -} - -const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { - .validate = bch2_sb_recovery_passes_validate, - .to_text = bch2_sb_recovery_passes_to_text -}; - -/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ -static int bch2_recovery_pass_empty(struct bch_fs *c) -{ - return 0; -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads) - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (go_rw_in_recovery(c)) { - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { - bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); - bch2_reconstruct_alloc(c); - } - - return bch2_fs_read_write_early(c); - } - return 0; -} - -/* - * Make sure root inode is readable while we're still in recovery and can rewind - * for repair: - */ -static int bch2_lookup_root_inode(struct bch_fs *c) -{ - subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; - struct bch_inode_unpacked inode_u; - struct bch_subvolume subvol; - - return bch2_trans_do(c, - bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -static u64 bch2_recovery_passes_match(unsigned flags) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & flags) - ret |= BIT_ULL(i); - return ret; -} - -u64 bch2_fsck_recovery_passes(void) -{ - return bch2_recovery_passes_match(PASS_FSCK); -} - -static void bch2_run_async_recovery_passes(struct bch_fs *c) -{ - if (down_trylock(&c->recovery.run_lock)) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) - goto unlock; - - if (queue_work(system_long_wq, &c->recovery.work)) - return; - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -unlock: - up(&c->recovery.run_lock); -} - -static bool recovery_pass_needs_set(struct bch_fs *c, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags *flags) -{ - struct bch_fs_recovery *r = &c->recovery; - - /* - * Never run scan_for_btree_nodes persistently: check_topology will run - * it if required - */ - if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - *flags |= RUN_RECOVERY_PASS_nopersistent; - - if ((*flags & RUN_RECOVERY_PASS_ratelimit) && - !bch2_recovery_pass_want_ratelimit(c, pass)) - *flags &= ~RUN_RECOVERY_PASS_ratelimit; - - /* - * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do - * anything if the pass has already run: these mean we need a prior pass - * to run before we continue to repair; we don't expect that pass to fix - * the damage we encountered. - * - * Otherwise, we run run_explicit_recovery_pass when we find damage, so - * it should run again even if it's already run: - */ - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - - if (persistent - ?
!(c->sb.recovery_passes_required & BIT_ULL(pass)) - : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) - return true; - - if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && - (r->passes_ratelimiting & BIT_ULL(pass))) - return true; - - if (rewind) - return true; - - return false; -} - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - lockdep_assert_held(&c->sb_lock); - - bch2_printbuf_make_room(out, 1024); - out->atomic++; - - unsigned long lockflags; - spin_lock_irqsave(&r->lock, lockflags); - - if (!recovery_pass_needs_set(c, pass, &flags)) - goto out; - - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && - r->curr_pass > pass && - !(r->passes_complete & BIT_ULL(pass)); - bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - - if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); - } - - if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { - prt_printf(out, "need recovery pass %s (%u), but already rw\n", - bch2_recovery_passes[pass], pass); - ret = bch_err_throw(c, cannot_rewind_recovery); - goto out; - } - - if (ratelimit) - r->passes_ratelimiting |= BIT_ULL(pass); - else - r->passes_ratelimiting &= ~BIT_ULL(pass); - - if (in_recovery && !ratelimit) { - prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[r->curr_pass], r->curr_pass, - rewind ? " - rewinding" : ""); - - r->passes_to_run |= BIT_ULL(pass); - - if (rewind) { - r->next_pass = pass; - r->passes_complete &= (1ULL << pass) - 1; - ret = bch_err_throw(c, restart_recovery); - } - } else { - prt_printf(out, "scheduling recovery pass %s (%u)%s\n", - bch2_recovery_passes[pass], pass, - ratelimit ?
" - ratelimiting" : ""); - - struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (p->when & PASS_ONLINE) - bch2_run_async_recovery_passes(c); - } -out: - spin_unlock_irqrestore(&r->lock, lockflags); - --out->atomic; - return ret; -} - -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass, - enum bch_run_recovery_pass_flags flags) -{ - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - guard(mutex)(&c->sb_lock); - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret; -} - -/* - * Returns 0 if @pass has run recently, otherwise one of - * -BCH_ERR_restart_recovery - * -BCH_ERR_recovery_pass_will_run - */ -int bch2_require_recovery_pass(struct bch_fs *c, - struct printbuf *out, - enum bch_recovery_pass pass) -{ - if (test_bit(BCH_FS_in_recovery, &c->flags) && - c->recovery.passes_complete & BIT_ULL(pass)) - return 0; - - guard(mutex)(&c->sb_lock); - - if (bch2_recovery_pass_want_ratelimit(c, pass)) - return 0; - - enum bch_run_recovery_pass_flags flags = 0; - int ret = 0; - - if (recovery_pass_needs_set(c, pass, &flags)) { - ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); - bch2_write_super(c); - } - - return ret ?: bch_err_throw(c, recovery_pass_will_run); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - enum bch_run_recovery_pass_flags flags = 0; - - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - mutex_lock(&c->sb_lock); - int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, - RUN_RECOVERY_PASS_nopersistent); - mutex_unlock(&c->sb_lock); - - bch2_print_str(c, KERN_NOTICE, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct bch_fs_recovery *r = &c->recovery; - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - - s64 start_time = ktime_get_real_seconds(); - int ret = p->fn(c); - - r->passes_to_run &= ~BIT_ULL(pass); - - if (ret) { - r->passes_failing |= BIT_ULL(pass); - return ret; - } - - r->passes_failing = 0; - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_sb_recovery_pass_complete(c, pass, start_time); - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, - bool online) -{ - struct bch_fs_recovery *r = &c->recovery; - int ret = 0; - - spin_lock_irq(&r->lock); - - if (online) - orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); - - /* - * A failed recovery pass will be retried after another pass succeeds - - * but not this iteration. - * - * This is because some passes depend on repair done by other passes: we - * may want to retry, but we don't want to loop on failing passes. 
- */ - - orig_passes_to_run &= ~r->passes_failing; - - r->passes_to_run = orig_passes_to_run; - - while (r->passes_to_run) { - unsigned prev_done = r->pass_done; - unsigned pass = __ffs64(r->passes_to_run); - r->curr_pass = pass; - r->next_pass = r->curr_pass + 1; - r->passes_to_run &= ~BIT_ULL(pass); - - spin_unlock_irq(&r->lock); - - int ret2 = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - - spin_lock_irq(&r->lock); - - if (r->next_pass < r->curr_pass) { - /* Rewind: */ - r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); - } else if (!ret2) { - r->pass_done = max(r->pass_done, pass); - r->passes_complete |= BIT_ULL(pass); - } else { - ret = ret2; - } - - if (ret && !online) - break; - - if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { - bch2_copygc_wakeup(c); - bch2_rebalance_wakeup(c); - } - } - - clear_bit(BCH_FS_in_recovery, &c->flags); - spin_unlock_irq(&r->lock); - - return ret; -} - -static void bch2_async_recovery_passes_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); - struct bch_fs_recovery *r = &c->recovery; - - __bch2_run_recovery_passes(c, - c->sb.recovery_passes_required & ~r->passes_ratelimiting, - true); - - up(&r->run_lock); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); -} - -int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) -{ - return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); -} - -int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) -{ - u64 passes = - bch2_recovery_passes_match(PASS_ALWAYS) | - (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | - (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | - c->opts.recovery_passes | - c->sb.recovery_passes_required; - - if (c->opts.recovery_pass_last) - passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; - - /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. 
- */ - c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw); - passes &= ~c->opts.recovery_passes_exclude; - - passes &= ~(BIT_ULL(from) - 1); - - down(&c->recovery.run_lock); - int ret = __bch2_run_recovery_passes(c, passes, false); - up(&c->recovery.run_lock); - - return ret; -} - -static void prt_passes(struct printbuf *out, const char *msg, u64 passes) -{ - prt_printf(out, "%s:\t", msg); - prt_bitflags(out, bch2_recovery_passes, passes); - prt_newline(out); -} - -void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_recovery *r = &c->recovery; - - printbuf_tabstop_push(out, 32); - prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); - prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & - bch2_recovery_passes_match(PASS_ONLINE)); - prt_passes(out, "Complete passes", r->passes_complete); - prt_passes(out, "Failing passes", r->passes_failing); - - if (r->curr_pass) { - prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); - prt_passes(out, "Current passes", r->passes_to_run); - } -} - -void bch2_fs_recovery_passes_init(struct bch_fs *c) -{ - spin_lock_init(&c->recovery.lock); - sema_init(&c->recovery.run_lock, 1); - - INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); -} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h deleted file mode 100644 index 2117f0ce19229a..00000000000000 --- a/fs/bcachefs/recovery_passes.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _BCACHEFS_RECOVERY_PASSES_H -#define _BCACHEFS_RECOVERY_PASSES_H - -extern const char * const bch2_recovery_passes[]; - -extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -u64 bch2_fsck_recovery_passes(void); - -void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); - -enum bch_run_recovery_pass_flags { - RUN_RECOVERY_PASS_nopersistent = BIT(0), - RUN_RECOVERY_PASS_ratelimit = BIT(1), -}; - -static inline bool go_rw_in_recovery(struct bch_fs *c) -{ - return (c->journal_keys.nr || - !c->opts.read_only || - !c->sb.clean || - c->opts.recovery_passes || - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); -} - -int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); - -int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); -int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass, - enum bch_run_recovery_pass_flags); - -int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, - enum bch_recovery_pass); - -int bch2_run_online_recovery_passes(struct bch_fs *, u64); -int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); - -void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); - -void bch2_fs_recovery_passes_init(struct bch_fs *); - -#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h deleted file mode 100644 index b63c20558d3d42..00000000000000 --- a/fs/bcachefs/recovery_passes_format.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H -#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H - -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4) -#define PASS_ALLOC BIT(5) -#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) - -#ifdef CONFIG_BCACHEFS_DEBUG -#define PASS_FSCK_DEBUG BIT(1) -#else -#define PASS_FSCK_DEBUG 0 -#endif - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, 0) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK_ALLOC) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ - x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - BCH_RECOVERY_PASS_NR -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x -}; - -struct recovery_pass_entry { - __le64 last_run; - __le32 last_runtime; - __le32 flags; -}; - -LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) - -struct bch_sb_field_recovery_passes { - struct bch_sb_field field; - struct recovery_pass_entry start[]; -}; - -static inline unsigned -recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) -{ - return r - ? 
((vstruct_end(&r->field) - (void *) &r->start[0]) / - sizeof(struct recovery_pass_entry)) - : 0; -} - -#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h deleted file mode 100644 index aa9526938cc35d..00000000000000 --- a/fs/bcachefs/recovery_passes_types.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define _BCACHEFS_RECOVERY_PASSES_TYPES_H - -struct bch_fs_recovery { - /* - * Two different uses: - * "Has this fsck pass run?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_pass; - enum bch_recovery_pass next_pass; - /* never rewinds version of curr_pass */ - enum bch_recovery_pass pass_done; - u64 passes_to_run; - /* bitmask of recovery passes that we actually ran */ - u64 passes_complete; - u64 passes_failing; - u64 passes_ratelimiting; - spinlock_t lock; - struct semaphore run_lock; - struct work_struct work; -}; - -#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c deleted file mode 100644 index 92b90cfe622b96..00000000000000 --- a/fs/bcachefs/reflink.c +++ /dev/null @@ -1,865 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "bkey_buf.h" -#include "btree_update.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "error.h" -#include "extents.h" -#include "inode.h" -#include "io_misc.h" -#include "io_write.h" -#include "rebalance.h" -#include "reflink.h" -#include "subvolume.h" -#include "super-io.h" - -#include <linux/sched/signal.h> - -static inline bool bkey_extent_is_reflink_data(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_reflink_v: - case KEY_TYPE_indirect_inline_data: - return true; - default: - return false; - } -} - -static inline unsigned bkey_type_to_indirect(const struct bkey *k) -{ - switch (k->type) { - case KEY_TYPE_extent: - return KEY_TYPE_reflink_v; - case KEY_TYPE_inline_data: - return KEY_TYPE_indirect_inline_data; - default: - return 0; - } -} - -/* reflink pointers */ - -int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), - c, reflink_p_front_pad_bad, - "idx < front_pad (%llu < %u)", - REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); -fsck_err: - return ret; -} - -void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - prt_printf(out, "idx %llu front_pad %u back_pad %u", - REFLINK_P_IDX(p.v), - le32_to_cpu(p.v->front_pad), - le32_to_cpu(p.v->back_pad)); - - if (REFLINK_P_ERROR(p.v)) - prt_str(out, " error"); -} - -bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); - - /* - * Disabled for now, the triggers code needs to be reworked for merging - * of reflink pointers to work: - */ - return false; - - if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) - return false; - - if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) - return false; - - bch2_key_resize(l.k, l.k->size + r.k->size); - return true;
-} - -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), - c, reflink_v_pos_bad, - "indirect extent above maximum position 0:%llu", - REFLINK_P_IDX_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, from); -fsck_err: - return ret; -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif - -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - -/* lookup */ - -static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - bool should_commit) -{ - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - SET_REFLINK_P_ERROR(&new->v, false); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - return ret; - - if (!should_commit) - return 0; - - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -} - -static int bch2_indirect_extent_missing_error(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 missing_start, u64 missing_end, - bool should_commit) -{ - if (REFLINK_P_ERROR(p.v)) - return 0; - - struct bch_fs *c = trans->c; - u64 live_start = REFLINK_P_IDX(p.v); - u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; - u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); - u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(missing_start < refd_start); - BUG_ON(missing_end > refd_end); - - struct bpos missing_pos = bkey_start_pos(p.k); - missing_pos.offset += missing_start - live_start; - - prt_printf(&buf, "pointer to missing indirect extent in "); - ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); - if (ret) - goto err; - - prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); - bch2_bkey_val_to_text(&buf, c, p.s_c); - - prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", - missing_start, missing_end); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { - struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - /* - * Is the missing range not actually needed? 
- * - * p.v->idx refers to the data that we actually want, but if the - * indirect extent we point to was bigger, front_pad and back_pad - * indicate the range we took a reference on. - */ - - if (missing_end <= live_start) { - new->v.front_pad = cpu_to_le32(live_start - missing_end); - } else if (missing_start >= live_end) { - new->v.back_pad = cpu_to_le32(missing_start - live_end); - } else { - struct bpos new_start = bkey_start_pos(&new->k); - struct bpos new_end = new->k.p; - - if (missing_start > live_start) - new_start.offset += missing_start - live_start; - if (missing_end < live_end) - new_end.offset -= live_end - missing_end; - - bch2_cut_front(new_start, &new->k_i); - bch2_cut_back(new_end, &new->k_i); - - SET_REFLINK_P_ERROR(&new->v, true); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; - - if (should_commit) - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* - * This is used from the read path, which doesn't expect to have to do a - * transaction commit, and from triggers, which should not be doing a commit: - */ -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, - struct btree_iter *iter, - s64 *offset_into_extent, - struct bkey_s_c_reflink_p p, - bool should_commit, - unsigned iter_flags) -{ - BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); - BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); - - u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; - - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, - POS(0, reflink_offset), iter_flags); - if (bkey_err(k)) - return k; - - if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - u64 missing_end = min(k.k->p.offset, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); - BUG_ON(reflink_offset == missing_end); - - int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - missing_end, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } else if (unlikely(REFLINK_P_ERROR(p.v))) { - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) { - bch2_trans_iter_exit(trans, iter); - return bkey_s_c_err(ret); - } - } - - *offset_into_extent = reflink_offset - bkey_start_offset(k.k); - return k; -} - -/* reflink pointer trigger */ - -static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); - struct btree_iter iter; - struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, - BTREE_ITER_intent| - BTREE_ITER_with_updates); - int ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_refcount_c(k)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = bch_err_throw(c, missing_indirect_extent); - goto next; - } - - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); - if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - log_fsck_err(trans, reflink_refcount_underflow, - "indirect extent 
refcount underflow while marking\n%s", - buf.buf); - goto next; - } - - if (flags & BTREE_TRIGGER_insert) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; - - pad = max_t(s64, le32_to_cpu(v->front_pad), - REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); - - pad = max_t(s64, le32_to_cpu(v->back_pad), - new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); - } - - le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, new, 0); - if (ret) - goto err; -next: - *idx = k.k->p.offset; -err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -} - -static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, u64 *idx, - enum btree_iter_update_trigger_flags flags, - size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - if (flags & BTREE_TRIGGER_gc) - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (flags & BTREE_TRIGGER_check_repair) { - ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret) - goto err; - } - - *idx = next_idx; -err: - printbuf_exit(&buf); - return ret; -} - -static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - int ret = 0; - - u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); - u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); - - if (flags & BTREE_TRIGGER_transactional) { - while (idx < end && !ret) - ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); - } - - if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { - size_t l = 0, r = c->reflink_gc_nr; - - while (l < r) { - size_t m = l + (r - l) / 2; - struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); - } - - return ret; -} - -int bch2_trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) { - struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); -} - -/* indirect extent trigger */ - -static inline void -check_indirect_extent_deleting(struct bkey_s new, - enum btree_iter_update_trigger_flags *flags) -{ - if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { - new.k->type = KEY_TYPE_deleted; - new.k->size = 0; - set_bkey_val_u64s(new.k, 0); - *flags &= 
~BTREE_TRIGGER_insert; - } -} - -int bch2_trigger_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if ((flags & BTREE_TRIGGER_transactional) && - (flags & BTREE_TRIGGER_insert)) - check_indirect_extent_deleting(new, &flags); - - return bch2_trigger_extent(trans, btree_id, level, old, new, flags); -} - -int bch2_trigger_indirect_inline_data(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - check_indirect_extent_deleting(new, &flags); - - return 0; -} - -/* create */ - -static int bch2_make_extent_indirect(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *orig, - bool reflink_p_may_update_opts_field) -{ - struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = {}; - struct bkey_s_c k; - struct bkey_i *r_v; - struct bkey_i_reflink_p *r_p; - __le64 *refcount; - int ret; - - if (orig->k.type == KEY_TYPE_inline_data) - bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - - bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_prev(trans, &reflink_iter); - ret = bkey_err(k); - if (ret) - goto err; - - /* - * XXX: we're assuming that 56 bits will be enough for the life of the - * filesystem: we need to implement wraparound, with a cursor in the - * logged ops btree: - */ - if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) - return -ENOSPC; - - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); - ret = PTR_ERR_OR_ZERO(r_v); - if (ret) - goto err; - - bkey_init(&r_v->k); - r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter.pos; - bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.bversion = orig->k.bversion; - - set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - - refcount = bkey_refcount(bkey_i_to_s(r_v)); - *refcount = 0; - memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - - ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); - if (ret) - goto err; - - /* - * orig is in a bkey_buf which statically allocates 5 64s for the val, - * so we know it will be big enough: - */ - orig->k.type = KEY_TYPE_reflink_p; - r_p = bkey_i_to_reflink_p(orig); - set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); - - /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ -#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); -#else - memset(&r_p->v, 0, sizeof(r_p->v)); -#endif - - SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); - - if (reflink_p_may_update_opts_field) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); - - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_internal_snapshot_node); -err: - bch2_trans_iter_exit(trans, &reflink_iter); - - return ret; -} - -static struct bkey_s_c get_next_src(struct btree_trans *trans, - struct btree_iter *iter, struct bpos end) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { - if (bkey_extent_is_unwritten(k)) - continue; - - if (bkey_extent_is_data(k.k)) - return k; - } - - if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(trans, iter, end); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -} - -s64 bch2_remap_range(struct bch_fs *c, - subvol_inum dst_inum, u64 dst_offset, - subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta, - bool may_change_src_io_path_opts) -{ - struct btree_trans *trans; - struct btree_iter dst_iter, src_iter; - struct bkey_s_c src_k; - struct bkey_buf new_dst, new_src; - struct bpos dst_start = POS(dst_inum.inum, dst_offset); - struct bpos src_start = POS(src_inum.inum, src_offset); - struct bpos dst_end = dst_start, src_end = src_start; - struct bch_io_opts opts; - struct bpos src_want; - u64 dst_done = 0; - u32 dst_snapshot, src_snapshot; - bool reflink_p_may_update_opts_field = - !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); - int ret = 0, ret2 = 0; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return bch_err_throw(c, erofs_no_writes); - - bch2_check_set_feature(c, BCH_FEATURE_reflink); - - dst_end.offset += remap_sectors; - src_end.offset += remap_sectors; - - bch2_bkey_buf_init(&new_dst); - bch2_bkey_buf_init(&new_src); - trans = bch2_trans_get(c); - - ret = bch2_inum_opts_get(trans, src_inum, &opts); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_intent); - bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_intent); - - while ((ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) && - bkey_lt(dst_iter.pos, dst_end)) { - struct disk_reservation disk_res = { 0 }; - - bch2_trans_begin(trans); - - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, - &src_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); - - ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, - &dst_snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); - - if (dst_inum.inum < src_inum.inum) { - /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(trans, &dst_iter); - if (ret) - continue; - } - - dst_done = dst_iter.pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(trans, &src_iter, src_want); - - src_k = get_next_src(trans, &src_iter, src_end); - ret = bkey_err(src_k); - if (ret) - continue; - - if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, - min(dst_end.offset, - dst_iter.pos.offset + - src_iter.pos.offset - src_want.offset), - i_sectors_delta); - continue; - } - - if (src_k.k->type != KEY_TYPE_reflink_p) { - bch2_btree_iter_set_pos_to_extent_start(&src_iter); - - bch2_bkey_buf_reassemble(&new_src, c, src_k); - src_k = bkey_i_to_s_c(new_src.k); - - ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k, - reflink_p_may_update_opts_field); - if (ret) - continue; - - BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); - } - - if (src_k.k->type == KEY_TYPE_reflink_p) { - struct bkey_s_c_reflink_p src_p = - bkey_s_c_to_reflink_p(src_k); - struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(new_dst.k); - - u64 offset = REFLINK_P_IDX(src_p.v) + - (src_want.offset - - bkey_start_offset(src_k.k)); - - SET_REFLINK_P_IDX(&dst_p->v, offset); - - if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts && - REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) - SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); - } else { - BUG(); 
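
Throughout the remap loop above, the source and destination cursors stay in lockstep through plain offset arithmetic: progress is measured at the destination iterator, then translated into the next source offset to read. A simplified sketch of just that bookkeeping, with hypothetical demo_* names and none of the transaction machinery:

        #include <stdint.h>
        #include <stdio.h>

        /* Positions are (inode, sector offset) pairs in the real code;
         * only the offsets matter for the cursor arithmetic shown here. */
        struct demo_remap {
                uint64_t src_start, dst_start;  /* fixed for the remap    */
                uint64_t dst_pos;               /* advances as we write   */
        };

        /* How far the destination cursor has advanced. */
        static uint64_t demo_dst_done(const struct demo_remap *r)
        {
                return r->dst_pos - r->dst_start;
        }

        /* The source offset wanted next: same distance from src_start. */
        static uint64_t demo_src_want(const struct demo_remap *r)
        {
                return r->src_start + demo_dst_done(r);
        }

        int main(void)
        {
                struct demo_remap r = { .src_start = 0, .dst_start = 1000,
                                        .dst_pos = 1024 };

                /* 24 sectors done at the destination -> source offset 24 */
                printf("done %llu, src_want %llu\n",
                       (unsigned long long) demo_dst_done(&r),
                       (unsigned long long) demo_src_want(&r));
                return 0;
        }
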
- } - - new_dst.k->k.p = dst_iter.pos; - bch2_key_resize(&new_dst.k->k, - min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter.pos.offset)); - - ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); - bch2_disk_reservation_put(c, &disk_res); - } - bch2_trans_iter_exit(trans, &dst_iter); - bch2_trans_iter_exit(trans, &src_iter); - - BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); - BUG_ON(bkey_gt(dst_iter.pos, dst_end)); - - dst_done = dst_iter.pos.offset - dst_start.offset; - new_i_size = min(dst_iter.pos.offset << 9, new_i_size); - - do { - struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = {}; - - bch2_trans_begin(trans); - - ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_intent); - - if (!ret2 && - inode_u.bi_size < new_i_size) { - inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - } - - bch2_trans_iter_exit(trans, &inode_iter); - } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -err: - bch2_trans_put(trans); - bch2_bkey_buf_exit(&new_src, c); - bch2_bkey_buf_exit(&new_dst, c); - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); - - return dst_done ?: ret ?: ret2; -} - -/* fsck */ - -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - "%s\n" - "should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = bch_err_throw(c, ENOMEM_gc_reflink_start); - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h deleted 
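
The fsck path above repairs refcounts by comparing what GC counted against what is stored: a disagreement rewrites the key with the counted value, and a counted value of zero means the indirect extent is unreferenced and is deleted outright. A compact model of that three-way decision, using invented names and no btree code:

        #include <stdint.h>
        #include <stdio.h>

        enum demo_fix { DEMO_KEEP, DEMO_REWRITE, DEMO_DELETE };

        /* Mirrors the repair decision: if the stored refcount disagrees
         * with what GC counted, rewrite the key; if GC saw no references
         * at all, the indirect extent is dead and gets deleted. */
        static enum demo_fix demo_reconcile(uint64_t stored, uint64_t counted)
        {
                if (stored == counted)
                        return DEMO_KEEP;
                return counted ? DEMO_REWRITE : DEMO_DELETE;
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       demo_reconcile(2, 2),    /* 0: keep              */
                       demo_reconcile(3, 2),    /* 1: rewrite with 2    */
                       demo_reconcile(1, 0));   /* 2: delete            */
                return 0;
        }
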
file mode 100644 index 1632780bdf181f..00000000000000 --- a/fs/bcachefs/reflink.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_H -#define _BCACHEFS_REFLINK_H - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_validate = bch2_reflink_p_validate, \ - .val_to_text = bch2_reflink_p_to_text, \ - .key_merge = bch2_reflink_p_merge, \ - .trigger = bch2_trigger_reflink_p, \ - .min_val_size = 16, \ -}) - -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_validate = bch2_reflink_v_validate, \ - .val_to_text = bch2_reflink_v_to_text, \ - .swab = bch2_ptr_swab, \ - .trigger = bch2_trigger_reflink_v, \ - .min_val_size = 8, \ -}) - -int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_indirect_inline_data_to_text(struct printbuf *, - struct bch_fs *, struct bkey_s_c); -int bch2_trigger_indirect_inline_data(struct btree_trans *, - enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_validate = bch2_indirect_inline_data_validate, \ - .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trigger_indirect_inline_data, \ - .min_val_size = 8, \ -}) - -static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_c_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_c_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -static inline __le64 *bkey_refcount(struct bkey_s k) -{ - switch (k.k->type) { - case KEY_TYPE_reflink_v: - return &bkey_s_to_reflink_v(k).v->refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_s_to_indirect_inline_data(k).v->refcount; - default: - return NULL; - } -} - -struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, - s64 *, struct bkey_s_c_reflink_p, - bool, unsigned); - -s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *, - bool); - -int bch2_gc_reflink_done(struct bch_fs *); -int bch2_gc_reflink_start(struct bch_fs *); - -#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h deleted file mode 100644 index 92995e4f898e27..00000000000000 --- a/fs/bcachefs/reflink_format.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REFLINK_FORMAT_H -#define _BCACHEFS_REFLINK_FORMAT_H - -struct bch_reflink_p { - struct bch_val v; - __le64 idx_flags; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). 
If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); -LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); -LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, - struct bch_reflink_p, idx_flags, 57, 58); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c deleted file mode 100644 index 8383bd7fdb3fee..00000000000000 --- a/fs/bcachefs/replicas.c +++ /dev/null @@ -1,918 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "buckets.h" -#include "disk_accounting.h" -#include "journal.h" -#include "replicas.h" -#include "super-io.h" - -#include - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, - struct bch_replicas_cpu *); - -/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) -{ - size_t size = (size_t) priv; - return memcmp(l, r, size); -} - -/* Replicas tracking - in memory: */ - -static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(!e->nr_devs); - BUG_ON(e->nr_required > 1 && - e->nr_required >= e->nr_devs); - - for (unsigned i = 0; i + 1 < e->nr_devs; i++) - BUG_ON(e->devs[i] >= e->devs[i + 1]); -#endif -} - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) -{ - bubble_sort(e->devs, e->nr_devs, u8_cmp); -} - -static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -{ - eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); -} - -static void bch2_replicas_entry_v0_to_text(struct printbuf *out, - struct bch_replicas_entry_v0 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u [", e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? " %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry_v1 *e) -{ - bch2_prt_data_type(out, e->data_type); - - prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (unsigned i = 0; i < e->nr_devs; i++) - prt_printf(out, i ? 
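
The idx_flags word in bch_reflink_p, per the LE64_BITMASK definitions above, packs a 56-bit index together with two single-bit flags. A plain-C sketch of the same packing with shifts and masks; the kernel macros additionally handle little-endian storage, which is omitted here, and the DEMO_* names are invented:

        #include <assert.h>
        #include <stdint.h>

        /* idx_flags layout, per the bitmask definitions above:
         * bits 0-55: index, bit 56: error, bit 57: may_update_options */
        #define DEMO_IDX_BITS   56
        #define DEMO_IDX_MASK   ((UINT64_C(1) << DEMO_IDX_BITS) - 1)
        #define DEMO_ERROR_BIT  (UINT64_C(1) << 56)
        #define DEMO_OPTS_BIT   (UINT64_C(1) << 57)

        static uint64_t demo_get_idx(uint64_t idx_flags)
        {
                return idx_flags & DEMO_IDX_MASK;
        }

        static uint64_t demo_set_error(uint64_t idx_flags, int error)
        {
                return error ? idx_flags |  DEMO_ERROR_BIT
                             : idx_flags & ~DEMO_ERROR_BIT;
        }

        int main(void)
        {
                uint64_t v = 12345;     /* index only, no flags set */

                v = demo_set_error(v, 1);
                assert(demo_get_idx(v) == 12345);
                assert(v & DEMO_ERROR_BIT);
                assert(!(v & DEMO_OPTS_BIT));
                return 0;
        }
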
" %u" : "%u", e->devs[i]); - prt_printf(out, "]"); -} - -static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_member_exists(sb, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; -} - -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_fs *c, - struct printbuf *err) -{ - if (!r->nr_devs) { - prt_printf(err, "no devices in entry "); - goto bad; - } - - if (r->nr_required > 1 && - r->nr_required >= r->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - goto bad; - } - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] != BCH_SB_MEMBER_INVALID && - !bch2_dev_exists(c, r->devs[i])) { - prt_printf(err, "invalid device %u in entry ", r->devs[i]); - goto bad; - } - - return 0; -bad: - bch2_replicas_entry_to_text(err, r); - return bch_err_throw(c, invalid_replicas_entry); -} - -void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) -{ - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_cpu_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } -} - -static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - r->nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - - if (!p.has_ec) - replicas_entry_add_dev(r, p.ptr.dev); - else - r->nr_required = 0; - } -} - -static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry_v1 *r) -{ - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - const struct bch_extent_ptr *ptr; - - r->nr_required = s.v->nr_blocks - s.v->nr_redundant; - - for (ptr = s.v->ptrs; - ptr < s.v->ptrs + s.v->nr_blocks; - ptr++) - replicas_entry_add_dev(r, ptr->dev); -} - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, - struct bkey_s_c k) -{ - e->nr_devs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_btree; - extent_to_replicas(k, e); - break; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_user; - extent_to_replicas(k, e); - break; - case KEY_TYPE_stripe: - e->data_type = BCH_DATA_parity; - stripe_to_replicas(k, e); - break; - } - - bch2_replicas_entry_sort(e); -} - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, - enum bch_data_type data_type, - struct bch_devs_list devs) -{ - BUG_ON(!data_type || - data_type == BCH_DATA_sb || - data_type >= BCH_DATA_NR); - - e->data_type = data_type; - e->nr_devs = 0; - e->nr_required = 1; - - darray_for_each(devs, i) - replicas_entry_add_dev(e, *i); - - bch2_replicas_entry_sort(e); -} - -static struct bch_replicas_cpu -cpu_replicas_add_entry(struct bch_fs *c, - struct bch_replicas_cpu *old, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new = { - .nr = old->nr + 1, - .entry_size = max_t(unsigned, old->entry_size, - 
replicas_entry_bytes(new_entry)), - }; - - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); - if (!new.entries) - return new; - - for (unsigned i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(&new, i), - cpu_replicas_entry(old, i), - old->entry_size); - - memcpy(cpu_replicas_entry(&new, old->nr), - new_entry, - replicas_entry_bytes(new_entry)); - - bch2_cpu_replicas_sort(&new); - return new; -} - -static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - int idx, entry_size = replicas_entry_bytes(search); - - if (unlikely(entry_size > r->entry_size)) - return -1; - -#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) - idx = eytzinger0_find(r->entries, r->nr, r->entry_size, - entry_cmp, search); -#undef entry_cmp - - return idx < r->nr ? idx : -1; -} - -int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - bch2_replicas_entry_sort(search); - - return __replicas_entry_idx(&c->replicas, search); -} - -static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry_v1 *search) -{ - return __replicas_entry_idx(r, search) >= 0; -} - -bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - verify_replicas_entry(search); - - return !search->nr_devs || - (__replicas_has_entry(&c->replicas, search) && - (likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search))); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry_v1 *search) -{ - percpu_down_read(&c->mark_lock); - bool ret = bch2_replicas_marked_locked(c, search); - percpu_up_read(&c->mark_lock); - - return ret; -} - -noinline -static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry_v1 *new_entry) -{ - struct bch_replicas_cpu new_r, new_gc; - int ret = 0; - - verify_replicas_entry(new_entry); - - memset(&new_r, 0, sizeof(new_r)); - memset(&new_gc, 0, sizeof(new_gc)); - - mutex_lock(&c->sb_lock); - - if (c->replicas_gc.entries && - !__replicas_has_entry(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); - if (!new_gc.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - } - - if (!__replicas_has_entry(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); - if (!new_r.entries) { - ret = bch_err_throw(c, ENOMEM_cpu_replicas); - goto err; - } - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); - if (ret) - goto err; - } - - if (!new_r.entries && - !new_gc.entries) - goto out; - - /* allocations done, now commit: */ - - if (new_r.entries) - bch2_write_super(c); - - /* don't update in memory replicas until changes are persistent */ - percpu_down_write(&c->mark_lock); - if (new_r.entries) - swap(c->replicas, new_r); - if (new_gc.entries) - swap(new_gc, c->replicas_gc); - percpu_up_write(&c->mark_lock); -out: - mutex_unlock(&c->sb_lock); - - kfree(new_r.entries); - kfree(new_gc.entries); - - return ret; -err: - bch_err_msg(c, ret, "adding replicas entry"); - goto out; -} - -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) -{ - return likely(bch2_replicas_marked(c, r)) - ? 
0 : bch2_mark_replicas_slowpath(c, r); -} - -/* - * Old replicas_gc mechanism: only used for journal replicas entries now, should - * die at some point: - */ - -int bch2_replicas_gc_end(struct bch_fs *c, int ret) -{ - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - ret = ret ?: - bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (!ret) - swap(c->replicas, c->replicas_gc); - - kfree(c->replicas_gc.entries); - c->replicas_gc.entries = NULL; - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -{ - struct bch_replicas_entry_v1 *e; - unsigned i = 0; - - lockdep_assert_held(&c->replicas_gc_lock); - - mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc.entries); - - c->replicas_gc.nr = 0; - c->replicas_gc.entry_size = 0; - - for_each_cpu_replicas_entry(&c->replicas, e) { - /* Preserve unknown data types */ - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) { - c->replicas_gc.nr++; - c->replicas_gc.entry_size = - max_t(unsigned, c->replicas_gc.entry_size, - replicas_entry_bytes(e)); - } - } - - c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, - c->replicas_gc.entry_size, - GFP_KERNEL); - if (!c->replicas_gc.entries) { - mutex_unlock(&c->sb_lock); - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - for_each_cpu_replicas_entry(&c->replicas, e) - if (e->data_type >= BCH_DATA_NR || - !((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(&c->replicas_gc, i++), - e, c->replicas_gc.entry_size); - - bch2_cpu_replicas_sort(&c->replicas_gc); - mutex_unlock(&c->sb_lock); - - return 0; -} - -/* - * New much simpler mechanism for clearing out unneeded replicas entries - drop - * replicas entries that have 0 sectors used. - * - * However, we don't track sector counts for journal usage, so this doesn't drop - * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism - * is retained for that. 
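
Put differently, the newer mechanism is a straight filter over the table: keep journal entries unconditionally (no sector counts are tracked for them), keep anything still referenced by accounting, drop the rest. A toy version of that filter, with an in_use flag standing in for the accounting lookup and all names invented:

        #include <stdio.h>

        /* Toy replicas table entry; the real table stores fixed-size
         * entries back to back. */
        struct demo_entry {
                unsigned char data_type;        /* 1 == journal here */
                unsigned char nr_devs;
                unsigned char devs[4];
                int           in_use;           /* "nonzero sectors" */
        };

        #define DEMO_JOURNAL 1

        /* Mirrors the gc2 filter: journal entries survive regardless,
         * everything else is dropped when no longer in use. Compacts
         * in place and returns the new entry count. */
        static unsigned demo_gc(struct demo_entry *e, unsigned nr)
        {
                unsigned dst = 0;

                for (unsigned i = 0; i < nr; i++)
                        if (e[i].data_type == DEMO_JOURNAL || e[i].in_use)
                                e[dst++] = e[i];
                return dst;
        }

        int main(void)
        {
                struct demo_entry tbl[] = {
                        { .data_type = DEMO_JOURNAL, .in_use = 0 },
                        { .data_type = 2,            .in_use = 1 },
                        { .data_type = 2,            .in_use = 0 },
                };

                printf("kept %u of 3\n", demo_gc(tbl, 3)); /* kept 2 of 3 */
                return 0;
        }
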
- */ -int bch2_replicas_gc2(struct bch_fs *c) -{ - struct bch_replicas_cpu new = { 0 }; - unsigned nr; - int ret = 0; - - bch2_accounting_mem_gc(c); -retry: - nr = READ_ONCE(c->replicas.nr); - new.entry_size = READ_ONCE(c->replicas.entry_size); - new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); - if (!new.entries) { - bch_err(c, "error allocating c->replicas_gc"); - return bch_err_throw(c, ENOMEM_replicas_gc); - } - - mutex_lock(&c->sb_lock); - percpu_down_write(&c->mark_lock); - - if (nr != c->replicas.nr || - new.entry_size != c->replicas.entry_size) { - percpu_up_write(&c->mark_lock); - mutex_unlock(&c->sb_lock); - kfree(new.entries); - goto retry; - } - - for (unsigned i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(&c->replicas, i); - - struct disk_accounting_pos k = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - - unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), - "embedded variable length struct"); - - struct bpos p = disk_accounting_pos_to_bpos(&k); - - struct bch_accounting_mem *acc = &c->accounting; - bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p) >= acc->k.nr; - - if (e->data_type == BCH_DATA_journal || !kill) - memcpy(cpu_replicas_entry(&new, new.nr++), - e, new.entry_size); - } - - bch2_cpu_replicas_sort(&new); - - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - - if (!ret) - swap(c->replicas, new); - - kfree(new.entries); - - percpu_up_write(&c->mark_lock); - - if (!ret) - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - - return ret; -} - -/* Replicas tracking - superblock: */ - -static int -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v1 *e, *dst; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - dst = cpu_replicas_entry(cpu_r, idx++); - memcpy(dst, e, replicas_entry_bytes(e)); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -static int -__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, - struct bch_replicas_cpu *cpu_r) -{ - struct bch_replicas_entry_v0 *e; - unsigned nr = 0, entry_size = 0, idx = 0; - - for_each_replicas_entry(sb_r, e) { - entry_size = max_t(unsigned, entry_size, - replicas_entry_bytes(e)); - nr++; - } - - entry_size += sizeof(struct bch_replicas_entry_v1) - - sizeof(struct bch_replicas_entry_v0); - - cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); - if (!cpu_r->entries) - return -BCH_ERR_ENOMEM_cpu_replicas; - - cpu_r->nr = nr; - cpu_r->entry_size = entry_size; - - for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry_v1 *dst = - cpu_replicas_entry(cpu_r, idx++); - - dst->data_type = e->data_type; - dst->nr_devs = e->nr_devs; - dst->nr_required = 1; - memcpy(dst->devs, e->devs, e->nr_devs); - bch2_replicas_entry_sort(dst); - } - - return 0; -} - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -{ - struct bch_sb_field_replicas *sb_v1; - struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu new_r = { 0, 0, NULL }; - int ret = 0; - - if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) - ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); - else if 
((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (ret) - return ret; - - bch2_cpu_replicas_sort(&new_r); - - percpu_down_write(&c->mark_lock); - swap(c->replicas, new_r); - percpu_up_write(&c->mark_lock); - - kfree(new_r.entries); - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas_v0 *sb_r; - struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry_v1 *src; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) - bytes += replicas_entry_bytes(src) - 1; - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - dst->data_type = src->data_type; - dst->nr_devs = src->nr_devs; - memcpy(dst->devs, src->devs, src->nr_devs); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) -{ - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry_v1 *dst, *src; - bool need_v1 = false; - size_t bytes; - - bytes = sizeof(struct bch_sb_field_replicas); - - for_each_cpu_replicas_entry(r, src) { - bytes += replicas_entry_bytes(src); - if (src->nr_required != 1) - need_v1 = true; - } - - if (!need_v1) - return bch2_cpu_replicas_to_sb_replicas_v0(c, r); - - sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, - DIV_ROUND_UP(bytes, sizeof(u64))); - if (!sb_r) - return bch_err_throw(c, ENOSPC_sb_replicas); - - bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); - sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - dst = sb_r->entries; - for_each_cpu_replicas_entry(r, src) { - memcpy(dst, src, replicas_entry_bytes(src)); - - dst = replicas_entry_next(dst); - - BUG_ON((void *) dst > vstruct_end(&sb_r->field)); - } - - return 0; -} - -static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, - struct bch_sb *sb, - struct printbuf *err) -{ - unsigned i; - - sort_r(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL, - (void *)(size_t)cpu_r->entry_size); - - for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry_v1 *e = - cpu_replicas_entry(cpu_r, i); - - int ret = bch2_replicas_entry_sb_validate(e, sb, err); - if (ret) - return ret; - - if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry_v1 *n = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - - if (!memcmp(e, n, cpu_r->entry_size)) { - prt_printf(err, "duplicate replicas entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - } - } - - return 0; -} - -static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = 
bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry_v1 *e; - bool first = true; - - for_each_replicas_entry(r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_replicas_validate, - .to_text = bch2_sb_replicas_to_text, -}; - -static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_cpu cpu_r; - int ret; - - ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); - if (ret) - return ret; - - ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); - kfree(cpu_r.entries); - return ret; -} - -static void bch2_sb_replicas_v0_to_text(struct printbuf *out, - struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_replicas_entry_v0 *e; - bool first = true; - - for_each_replicas_entry(sb_r, e) { - if (!first) - prt_printf(out, " "); - first = false; - - bch2_replicas_entry_v0_to_text(out, e); - } - prt_newline(out); -} - -const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { - .validate = bch2_sb_replicas_v0_validate, - .to_text = bch2_sb_replicas_v0_to_text, -}; - -/* Query replicas: */ - -bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, - unsigned flags, bool print) -{ - struct bch_replicas_entry_v1 *e; - bool ret = true; - - percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned nr_online = 0, nr_failed = 0, dflags = 0; - bool metadata = e->data_type < BCH_DATA_user; - - if (e->data_type == BCH_DATA_cached) - continue; - - scoped_guard(rcu) - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } - - nr_online += test_bit(e->devs[i], devs.d); - - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - - if (nr_online + nr_failed == e->nr_devs) - continue; - - if (nr_online < e->nr_required) - dflags |= metadata - ? BCH_FORCE_IF_METADATA_LOST - : BCH_FORCE_IF_DATA_LOST; - - if (nr_online < e->nr_devs) - dflags |= metadata - ? 
BCH_FORCE_IF_METADATA_DEGRADED - : BCH_FORCE_IF_DATA_DEGRADED; - - if (dflags & ~flags) { - if (print) { - struct printbuf buf = PRINTBUF; - - bch2_replicas_entry_to_text(&buf, e); - bch_err(c, "insufficient devices online (%u) for replicas entry %s", - nr_online, buf.buf); - printbuf_exit(&buf); - } - ret = false; - break; - } - - } - percpu_up_read(&c->mark_lock); - - return ret; -} - -unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) -{ - struct bch_sb_field_replicas *replicas; - struct bch_sb_field_replicas_v0 *replicas_v0; - unsigned data_has = 0; - - replicas = bch2_sb_field_get(sb, replicas); - replicas_v0 = bch2_sb_field_get(sb, replicas_v0); - - if (replicas) { - struct bch_replicas_entry_v1 *r; - - for_each_replicas_entry(replicas, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - - } else if (replicas_v0) { - struct bch_replicas_entry_v0 *r; - - for_each_replicas_entry_v0(replicas_v0, r) { - if (r->data_type >= sizeof(data_has) * 8) - continue; - - for (unsigned i = 0; i < r->nr_devs; i++) - if (r->devs[i] == dev) - data_has |= 1 << r->data_type; - } - } - - - return data_has; -} - -unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -{ - mutex_lock(&c->sb_lock); - unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_replicas_exit(struct bch_fs *c) -{ - kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); -} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h deleted file mode 100644 index 5aba2c1ce1331a..00000000000000 --- a/fs/bcachefs/replicas.h +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_H -#define _BCACHEFS_REPLICAS_H - -#include "bkey.h" -#include "eytzinger.h" -#include "replicas_types.h" - -void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); -void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry_v1 *); -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_fs *, struct printbuf *); -void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); - -static inline struct bch_replicas_entry_v1 * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -{ - return (void *) r->entries + r->entry_size * i; -} - -int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, - enum bch_data_type, - struct bch_devs_list); - -bool bch2_replicas_marked_locked(struct bch_fs *, - struct bch_replicas_entry_v1 *); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); -int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry_v1 *); - -void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); - -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, - unsigned dev) -{ - e->data_type = BCH_DATA_cached; - e->nr_devs = 1; - e->nr_required = 1; - e->devs[0] = dev; -} - -bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, - unsigned, bool); - -unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); -unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); - -int bch2_replicas_gc_end(struct bch_fs *, int); -int bch2_replicas_gc_start(struct bch_fs *, unsigned); -int bch2_replicas_gc2(struct bch_fs *); - -#define for_each_cpu_replicas_entry(_r, _i) \ 
- for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - -/* iterate over superblock replicas - used by userspace tools: */ - -#define replicas_entry_next(_i) \ - ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) - -#define for_each_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -#define for_each_replicas_entry_v0(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ - (_i) = replicas_entry_next(_i)) - -int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; - -void bch2_fs_replicas_exit(struct bch_fs *); - -#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h deleted file mode 100644 index b7eff904acdb71..00000000000000 --- a/fs/bcachefs/replicas_format.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_FORMAT_H -#define _BCACHEFS_REPLICAS_FORMAT_H - -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[] __counted_by(nr_devs); -} __packed; - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -#define replicas_entry_add_dev(e, d) ({ \ - (e)->nr_devs++; \ - (e)->devs[(e)->nr_devs - 1] = (d); \ -}) - -#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h deleted file mode 100644 index fed71c861fe76f..00000000000000 --- a/fs/bcachefs/replicas_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_REPLICAS_TYPES_H -#define _BCACHEFS_REPLICAS_TYPES_H - -struct bch_replicas_cpu { - unsigned nr; - unsigned entry_size; - struct bch_replicas_entry_v1 *entries; -}; - -#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c deleted file mode 100644 index 59c8770e4a0e94..00000000000000 --- a/fs/bcachefs/sb-clean.c +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" -#include "error.h" -#include "journal_io.h" -#include "replicas.h" -#include "sb-clean.h" -#include "super-io.h" - -/* - * BCH_SB_FIELD_clean: - * - * Btree roots, and a few other things, are recovered from the journal after an - * unclean shutdown - but after a clean shutdown, to avoid having to read the - * journal, we can store them in the superblock. 
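
The for_each_replicas_entry macros a little further up walk variable-length entries laid end to end, advancing by each entry's byte size until the field's end or a zero data_type terminator. A self-contained sketch of that walk over a raw buffer, modeled loosely on the two-byte v0 entry header, with invented demo_* names:

        #include <stdint.h>
        #include <stdio.h>

        /* Variable-length entry, as in the on-disk replicas list:
         * a small header followed by nr_devs device indices. */
        struct demo_entry {
                uint8_t data_type;
                uint8_t nr_devs;
                uint8_t devs[];
        };

        #define demo_entry_bytes(e)     (2 + (e)->nr_devs)
        #define demo_entry_next(e) \
                ((struct demo_entry *) ((uint8_t *) (e) + demo_entry_bytes(e)))

        int main(void)
        {
                /* Two entries packed end to end: type 1 on devs {0,2},
                 * type 2 on dev {1}; a zero data_type ends the walk. */
                uint8_t buf[] = { 1, 2, 0, 2,  2, 1, 1,  0 };
                struct demo_entry *e = (struct demo_entry *) buf;
                uint8_t *end = buf + sizeof(buf);

                for (; (uint8_t *) e < end && e->data_type;
                     e = demo_entry_next(e))
                        printf("type %u on %u devices\n",
                               (unsigned) e->data_type,
                               (unsigned) e->nr_devs);
                return 0;
        }
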
- * - * bch_sb_field_clean simply contains a list of journal entries, stored exactly - * as they would be in the journal: - */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, - int write) -{ - struct bkey_validate_context from = { - .flags = write, - .from = BKEY_VALIDATE_superblock, - }; - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (vstruct_end(entry) > vstruct_end(&clean->field)) { - bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu", - le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s), - (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field)); - bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun); - return -BCH_ERR_fsck_repair_unimplemented; - } - - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - from); - if (ret) - return ret; - } - - return 0; -} - -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -int bch2_verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - sb_clean_journal_seq_mismatch, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - sb_clean_btree_root_mismatch, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); - - if (fsck_err_on(!sb_clean, c, - sb_clean_missing, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = 
false; - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_invalid_sb_clean); - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - kfree(clean); - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (unsigned i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - for (struct jset_entry *entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { - prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, entry->type); - prt_str(err, " overruns end of section"); - return -BCH_ERR_invalid_sb_clean; - } - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); - prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) - break; - - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << 
BCH_COMPAT_alloc_metadata); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); - - u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - - sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); - if (!sb_clean) { - bch_err(c, "error resizing superblock while setting filesystem clean"); - goto out; - } - - sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); - - /* Trying to catch outstanding bug: */ - BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); - - entry = sb_clean->start; - bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, 0); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - - memset(entry, 0, - vstruct_end(&sb_clean->field) - (void *) entry); - - /* - * this should be in the write path, and we should be validating every - * superblock section: - */ - ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); - if (ret) { - bch_err(c, "error writing marking filesystem clean: validate error"); - goto out; - } - - bch2_journal_pos_from_member_info_set(c); - - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h deleted file mode 100644 index 71caef2812398c..00000000000000 --- a/fs/bcachefs/sb-clean.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_CLEAN_H -#define _BCACHEFS_SB_CLEAN_H - -int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); -int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, - struct jset *); -struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); -void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); - -extern const struct bch_sb_field_ops bch_sb_field_ops_clean; - -int bch2_fs_mark_dirty(struct bch_fs *); -void bch2_fs_mark_clean(struct bch_fs *); - -#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c deleted file mode 100644 index 2b4b8445d418b3..00000000000000 --- a/fs/bcachefs/sb-counters.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bcachefs.h" -#include "super-io.h" -#include "sb-counters.h" - -/* BCH_SB_FIELD_counters */ - -static const u8 counters_to_stable_map[] = { -#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, - BCH_PERSISTENT_COUNTERS() -#undef x -}; - -const char * const bch2_counter_names[] = { -#define x(t, n, ...) 
(#t), - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; - -static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) -{ - if (!ctrs) - return 0; - - return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -} - -static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - return 0; -} - -static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - prt_printf(out, "%s \t%llu\n", - bch2_counter_names[i], - le64_to_cpu(ctrs->d[stable])); - } -} - -int bch2_sb_counters_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) - c->counters_on_mount[i] = 0; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) { - u64 v = le64_to_cpu(ctrs->d[stable]); - percpu_u64_set(&c->counters[i], v); - c->counters_on_mount[i] = v; - } - } - - return 0; -} - -int bch2_sb_counters_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - struct bch_sb_field_counters *ret; - unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - - if (nr < BCH_COUNTER_NR) { - ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - if (ret) { - ctrs = ret; - nr = bch2_sb_counter_nr_entries(ctrs); - } - } - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - if (stable < nr) - ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); - } - - return 0; -} - -void bch2_fs_counters_exit(struct bch_fs *c) -{ - free_percpu(c->counters); -} - -int bch2_fs_counters_init(struct bch_fs *c) -{ - c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); - if (!c->counters) - return -BCH_ERR_ENOMEM_fs_counters_init; - - return bch2_sb_counters_to_cpu(c); -} - -const struct bch_sb_field_ops bch_sb_field_ops_counters = { - .validate = bch2_sb_counters_validate, - .to_text = bch2_sb_counters_to_text, -}; - -#ifndef NO_BCACHEFS_CHARDEV -long bch2_ioctl_query_counters(struct bch_fs *c, - struct bch_ioctl_query_counters __user *user_arg) -{ - struct bch_ioctl_query_counters arg; - int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); - if (ret) - return ret; - - if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || - arg.pad) - return -EINVAL; - - arg.nr = min(arg.nr, BCH_COUNTER_NR); - ret = put_user(arg.nr, &user_arg->nr); - if (ret) - return ret; - - for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { - unsigned stable = counters_to_stable_map[i]; - - if (stable < arg.nr) { - u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) - ? 
percpu_u64_get(&c->counters[i]) - : c->counters_on_mount[i]; - - ret = put_user(v, &user_arg->d[stable]); - if (ret) - return ret; - } - } - - return 0; -} -#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h deleted file mode 100644 index a4329ad8dd1baa..00000000000000 --- a/fs/bcachefs/sb-counters.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_H -#define _BCACHEFS_SB_COUNTERS_H - -#include "bcachefs.h" -#include "super-io.h" - -int bch2_sb_counters_to_cpu(struct bch_fs *); -int bch2_sb_counters_from_cpu(struct bch_fs *); - -void bch2_fs_counters_exit(struct bch_fs *); -int bch2_fs_counters_init(struct bch_fs *); - -extern const char * const bch2_counter_names[]; -extern const struct bch_sb_field_ops bch_sb_field_ops_counters; - -long bch2_ioctl_query_counters(struct bch_fs *, - struct bch_ioctl_query_counters __user *); - -#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h deleted file mode 100644 index b868702a431a5e..00000000000000 --- a/fs/bcachefs/sb-counters_format.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H -#define _BCACHEFS_SB_COUNTERS_FORMAT_H - -enum counters_flags { - TYPE_COUNTER = BIT(0), /* event counters */ - TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ -}; - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0, TYPE_SECTORS) \ - x(io_read_inline, 80, TYPE_SECTORS) \ - x(io_read_hole, 81, TYPE_SECTORS) \ - x(io_read_promote, 30, TYPE_COUNTER) \ - x(io_read_bounce, 31, TYPE_COUNTER) \ - x(io_read_split, 33, TYPE_COUNTER) \ - x(io_read_reuse_race, 34, TYPE_COUNTER) \ - x(io_read_retry, 32, TYPE_COUNTER) \ - x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ - x(io_write, 1, TYPE_SECTORS) \ - x(io_move, 2, TYPE_SECTORS) \ - x(io_move_read, 35, TYPE_SECTORS) \ - x(io_move_write, 36, TYPE_SECTORS) \ - x(io_move_finish, 37, TYPE_SECTORS) \ - x(io_move_fail, 38, TYPE_COUNTER) \ - x(io_move_write_fail, 82, TYPE_COUNTER) \ - x(io_move_start_fail, 39, TYPE_COUNTER) \ - x(io_move_created_rebalance, 83, TYPE_COUNTER) \ - x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ - x(bucket_invalidate, 3, TYPE_COUNTER) \ - x(bucket_discard, 4, TYPE_COUNTER) \ - x(bucket_discard_fast, 79, TYPE_COUNTER) \ - x(bucket_alloc, 5, TYPE_COUNTER) \ - x(bucket_alloc_fail, 6, TYPE_COUNTER) \ - x(btree_cache_scan, 7, TYPE_COUNTER) \ - x(btree_cache_reap, 8, TYPE_COUNTER) \ - x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ - x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ - x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ - x(btree_node_write, 13, TYPE_COUNTER) \ - x(btree_node_read, 14, TYPE_COUNTER) \ - x(btree_node_compact, 15, TYPE_COUNTER) \ - x(btree_node_merge, 16, TYPE_COUNTER) \ - x(btree_node_split, 17, TYPE_COUNTER) \ - x(btree_node_rewrite, 18, TYPE_COUNTER) \ - x(btree_node_alloc, 19, TYPE_COUNTER) \ - x(btree_node_free, 20, TYPE_COUNTER) \ - x(btree_node_set_root, 21, TYPE_COUNTER) \ - x(btree_path_relock_fail, 22, TYPE_COUNTER) \ - x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ - x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ - x(journal_entry_full, 25, TYPE_COUNTER) \ - x(journal_full, 26, TYPE_COUNTER) \ - x(journal_reclaim_finish, 27, TYPE_COUNTER) \ - x(journal_reclaim_start, 28, TYPE_COUNTER) \ - x(journal_write, 29, TYPE_COUNTER) \ - x(copygc, 40, TYPE_COUNTER) \ - x(copygc_wait, 41, TYPE_COUNTER) \ - 
x(gc_gens_end, 42, TYPE_COUNTER) \ - x(gc_gens_start, 43, TYPE_COUNTER) \ - x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ - x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ - x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ - x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ - x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ - x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ - x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ - x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ - x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ - x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ - x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ - x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ - x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ - x(trans_restart_relock, 57, TYPE_COUNTER) \ - x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ - x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ - x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ - x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ - x(trans_restart_relock_path, 62, TYPE_COUNTER) \ - x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ - x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ - x(trans_restart_traverse, 65, TYPE_COUNTER) \ - x(trans_restart_upgrade, 66, TYPE_COUNTER) \ - x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ - x(trans_restart_injected, 69, TYPE_COUNTER) \ - x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ - x(trans_traverse_all, 71, TYPE_COUNTER) \ - x(transaction_commit, 72, TYPE_COUNTER) \ - x(write_super, 73, TYPE_COUNTER) \ - x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ - x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ - x(trans_restart_split_race, 76, TYPE_COUNTER) \ - x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ - x(write_buffer_flush_sync, 78, TYPE_COUNTER) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -enum bch_persistent_counters_stable { -#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_STABLE_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - -#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c deleted file mode 100644 index 1506d05e06654b..00000000000000 --- a/fs/bcachefs/sb-downgrade.c +++ /dev/null @@ -1,457 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Superblock section that contains a list of recovery passes to run when - * downgrading past a given version - */ - -#include "bcachefs.h" -#include "darray.h" -#include "recovery_passes.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "super-io.h" - -#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) - -/* - * Upgrade, downgrade tables - run certain recovery passes, fix certain errors - * - * x(version, recovery_passes, errors...) 
- */ -#define UPGRADE_TABLE() \ - x(snapshot_2, \ - RECOVERY_PASS_ALL_FSCK, \ - BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \ - BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \ - x(backpointers, \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_skiplists, \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ - BCH_FSCK_ERR_snapshot_bad_depth, \ - BCH_FSCK_ERR_snapshot_bad_skiplist) \ - x(deleted_inodes, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ - x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ - x(subvolume_fs_parent, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ - x(btree_subvolume_children, \ - BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) \ - x(mi_btree_bitmap, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_bkey_version_in_future, \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ - BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(disk_accounting_inum, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) \ - x(inode_has_child_snapshots, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(cached_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(stripe_backpointers, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(inode_has_case_insensitive, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ - BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) - -#define DOWNGRADE_TABLE() \ - x(bucket_stripe_sectors, \ - 0) \ - x(disk_accounting_v2, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - 
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(disk_accounting_v3, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_dev_usage_buckets_wrong, \ - BCH_FSCK_ERR_dev_usage_sectors_wrong, \ - BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_fs_usage_hidden_wrong, \ - BCH_FSCK_ERR_fs_usage_btree_wrong, \ - BCH_FSCK_ERR_fs_usage_data_wrong, \ - BCH_FSCK_ERR_fs_usage_cached_wrong, \ - BCH_FSCK_ERR_fs_usage_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ - BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ - BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) \ - x(rebalance_work_acct_fix, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(backpointer_bucket_gen, \ - BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ - BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) \ - x(disk_accounting_big_endian, \ - BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch, \ - BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) - -struct upgrade_downgrade_entry { - u64 recovery_passes; - u16 version; - u16 nr_errors; - const u16 *errors; -}; - -#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; -UPGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry upgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ - .errors = upgrade_##ver##_errors, \ -}, -UPGRADE_TABLE() -#undef x -}; - -static int have_stripes(struct bch_fs *c) -{ - if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) - return 0; - - return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); -} - -int bch2_sb_set_upgrade_extra(struct bch_fs *c) -{ - unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; - unsigned new_version = c->sb.version; - bool write_sb = false; - int ret = 0; - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && - new_version >= bcachefs_metadata_version_bucket_stripe_sectors && - (ret = have_stripes(c) > 0)) { - __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent); - write_sb = true; - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret < 0 ? 
ret : 0; -} - -void bch2_sb_set_upgrade(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for (const struct upgrade_downgrade_entry *i = upgrade_table; - i < upgrade_table + ARRAY_SIZE(upgrade_table); - i++) - if (i->version > old_version && i->version <= new_version) { - u64 passes = i->recovery_passes; - - if (passes & RECOVERY_PASS_ALL_FSCK) - passes |= bch2_fsck_recovery_passes(); - passes &= ~RECOVERY_PASS_ALL_FSCK; - - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(passes)); - - for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++) - __set_bit_le64(*e, ext->errors_silent); - } -} - -#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; -DOWNGRADE_TABLE() -#undef x - -static const struct upgrade_downgrade_entry downgrade_table[] = { -#define x(ver, passes, ...) { \ - .recovery_passes = passes, \ - .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ - .errors = downgrade_##ver##_errors, \ -}, -DOWNGRADE_TABLE() -#undef x -}; - -static int downgrade_table_extra(struct bch_fs *c, darray_char *table) -{ - unsigned dst_offset = table->nr; - struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - int ret = 0; - - unsigned nr_errors = le16_to_cpu(dst->nr_errors); - - switch (le16_to_cpu(dst->version)) { - case bcachefs_metadata_version_bucket_stripe_sectors: - if (have_stripes(c)) { - bytes += sizeof(dst->errors[0]) * 2; - - ret = darray_make_room(table, bytes); - if (ret) - return ret; - - dst = (void *) &table->data[dst_offset]; - dst->nr_errors = cpu_to_le16(nr_errors + 1); - - /* open coded __set_bit_le64, as dst is packed and - * dst->recovery_passes is misaligned */ - unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; - dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64)); - - dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong); - } - break; - } - - return ret; -} - -static inline const struct bch_sb_field_downgrade_entry * -downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) -{ - return (void *) &e->errors[le16_to_cpu(e->nr_errors)]; -} - -#define for_each_downgrade_entry(_d, _i) \ - for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ - (void *) _i < vstruct_end(&(_d)->field) && \ - (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \ - (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \ - _i = downgrade_entry_next_c(_i)) - -static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - for (const struct bch_sb_field_downgrade_entry *i = e->entries; - (void *) i < vstruct_end(&e->field); - i = downgrade_entry_next_c(i)) { - /* - * Careful: sb_field_downgrade_entry is only 2 byte aligned, but - * section sizes are 8 byte aligned - an empty entry spanning - * the end of the section is allowed (and ignored): - */ - if ((void *) &i->errors[0] > vstruct_end(&e->field)) - break; - - if (flags & BCH_VALIDATE_write && - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { - prt_printf(err, "downgrade entry overruns end of superblock section"); - return 
-BCH_ERR_invalid_sb_downgrade; - } - - if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { - prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", - BCH_VERSION_MAJOR(le16_to_cpu(i->version)), - BCH_VERSION_MAJOR(le16_to_cpu(sb->version))); - return -BCH_ERR_invalid_sb_downgrade; - } - } - - return 0; -} - -static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for_each_downgrade_entry(e, i) { - prt_str(out, "version:\t"); - bch2_version_to_text(out, le16_to_cpu(i->version)); - prt_newline(out); - - prt_str(out, "recovery passes:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); - prt_newline(out); - - prt_str(out, "errors:\t"); - bool first = true; - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - if (!first) - prt_char(out, ','); - first = false; - bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); - } - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { - .validate = bch2_sb_downgrade_validate, - .to_text = bch2_sb_downgrade_to_text, -}; - -int bch2_sb_downgrade_update(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_btree_running, &c->flags)) - return 0; - - darray_char table = {}; - int ret = 0; - - for (const struct upgrade_downgrade_entry *src = downgrade_table; - src < downgrade_table + ARRAY_SIZE(downgrade_table); - src++) { - if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - continue; - - if (src->version < c->sb.version_incompat) - continue; - - struct bch_sb_field_downgrade_entry *dst; - unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; - - ret = darray_make_room(&table, bytes); - if (ret) - goto out; - - dst = (void *) &darray_top(table); - dst->version = cpu_to_le16(src->version); - dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes)); - dst->recovery_passes[1] = 0; - dst->nr_errors = cpu_to_le16(src->nr_errors); - for (unsigned i = 0; i < src->nr_errors; i++) - dst->errors[i] = cpu_to_le16(src->errors[i]); - - ret = downgrade_table_extra(c, &table); - if (ret) - goto out; - - if (!dst->recovery_passes[0] && - !dst->recovery_passes[1] && - !dst->nr_errors) - continue; - - table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); - } - - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - - unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64)); - - if (d && le32_to_cpu(d->field.u64s) > sb_u64s) - goto out; - - d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); - if (!d) { - ret = bch_err_throw(c, ENOSPC_sb_downgrade); - goto out; - } - - memcpy(d->entries, table.data, table.nr); - memset_u64s_tail(d->entries, 0, table.nr); -out: - darray_exit(&table); - return ret; -} - -void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor) -{ - struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); - if (!d) - return; - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - - for_each_downgrade_entry(d, i) { - unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version)); - if (new_minor < minor && minor <= old_minor) { - ext->recovery_passes_required[0] |= 
i->recovery_passes[0]; - ext->recovery_passes_required[1] |= i->recovery_passes[1]; - - for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { - unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_FSCK_ERR_MAX) - __set_bit(e, c->sb.errors_silent); - if (e < sizeof(ext->errors_silent) * 8) - __set_bit_le64(e, ext->errors_silent); - } - } - } -} diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h deleted file mode 100644 index 095b7cc9bb4735..00000000000000 --- a/fs/bcachefs/sb-downgrade.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_H -#define _BCACHEFS_SB_DOWNGRADE_H - -extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; - -int bch2_sb_downgrade_update(struct bch_fs *); -void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); -int bch2_sb_set_upgrade_extra(struct bch_fs *); -void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); - -#endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h deleted file mode 100644 index cffd932be3eca0..00000000000000 --- a/fs/bcachefs/sb-downgrade_format.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H -#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H - -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - -#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c deleted file mode 100644 index 48853efdc105d5..00000000000000 --- a/fs/bcachefs/sb-errors.c +++ /dev/null @@ -1,198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "sb-errors.h" -#include "super-io.h" - -const char * const bch2_sb_error_strs[] = { -#define x(t, n, ...) 
[n] = #t, - BCH_SB_ERRS() -#undef x -}; - -void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) -{ - if (id < BCH_FSCK_ERR_MAX) - prt_str(out, bch2_sb_error_strs[id]); - else - prt_printf(out, "(unknown error %u)", id); -} - -static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) -{ - return bch2_sb_field_nr_entries(e); -} - -static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) -{ - return (sizeof(struct bch_sb_field_errors) + - sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); -} - -static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - for (i = 0; i < nr; i++) { - if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { - prt_printf(err, "entry with count 0 (id "); - bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_printf(err, ")"); - return -BCH_ERR_invalid_sb_errors; - } - - if (i + 1 < nr && - BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= - BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { - prt_printf(err, "entries out of order"); - return -BCH_ERR_invalid_sb_errors; - } - } - - return 0; -} - -static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); - - if (out->nr_tabstops <= 1) - printbuf_tabstop_push(out, 16); - - for (i = 0; i < nr; i++) { - bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); - prt_tab(out); - prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); - prt_tab(out); - bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); - prt_newline(out); - } -} - -const struct bch_sb_field_ops bch_sb_field_ops_errors = { - .validate = bch2_sb_errors_validate, - .to_text = bch2_sb_errors_to_text, -}; - -void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) -{ - if (out->nr_tabstops < 1) - printbuf_tabstop_push(out, 48); - if (out->nr_tabstops < 2) - printbuf_tabstop_push(out, 8); - if (out->nr_tabstops < 3) - printbuf_tabstop_push(out, 16); - - guard(mutex)(&c->fsck_error_counts_lock); - - bch_sb_errors_cpu *e = &c->fsck_error_counts; - darray_for_each(*e, i) { - bch2_sb_error_id_to_text(out, i->id); - prt_tab(out); - prt_u64(out, i->nr); - prt_tab(out); - bch2_prt_datetime(out, i->last_error_time); - prt_newline(out); - } -} - -void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) -{ - bch_sb_errors_cpu *e = &c->fsck_error_counts; - struct bch_sb_error_entry_cpu n = { - .id = err, - .nr = 1, - .last_error_time = ktime_get_real_seconds() - }; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - for (i = 0; i < e->nr; i++) { - if (err == e->data[i].id) { - e->data[i].nr++; - e->data[i].last_error_time = n.last_error_time; - goto out; - } - if (err < e->data[i].id) - break; - } - - if (darray_make_room(e, 1)) - goto out; - - darray_insert_item(e, i, n); -out: - mutex_unlock(&c->fsck_error_counts_lock); -} - -void bch2_sb_errors_from_cpu(struct bch_fs *c) -{ - bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst; - unsigned i; - - mutex_lock(&c->fsck_error_counts_lock); - - dst = bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); - - if (!dst) - goto err; - - for (i = 0; i < src->nr; i++) { - 
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); - SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); - dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); - } - -err: - mutex_unlock(&c->fsck_error_counts_lock); -} - -static int bch2_sb_errors_to_cpu(struct bch_fs *c) -{ - struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); - bch_sb_errors_cpu *dst = &c->fsck_error_counts; - unsigned i, nr = bch2_sb_field_errors_nr_entries(src); - int ret; - - if (!nr) - return 0; - - mutex_lock(&c->fsck_error_counts_lock); - ret = darray_make_room(dst, nr); - if (ret) - goto err; - - dst->nr = nr; - - for (i = 0; i < nr; i++) { - dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); - dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); - dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); - } -err: - mutex_unlock(&c->fsck_error_counts_lock); - - return ret; -} - -void bch2_fs_sb_errors_exit(struct bch_fs *c) -{ - darray_exit(&c->fsck_error_counts); -} - -void bch2_fs_sb_errors_init_early(struct bch_fs *c) -{ - mutex_init(&c->fsck_error_counts_lock); - darray_init(&c->fsck_error_counts); -} - -int bch2_fs_sb_errors_init(struct bch_fs *c) -{ - return bch2_sb_errors_to_cpu(c); -} diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h deleted file mode 100644 index e86267264692d1..00000000000000 --- a/fs/bcachefs/sb-errors.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_H -#define _BCACHEFS_SB_ERRORS_H - -#include "sb-errors_types.h" - -extern const char * const bch2_sb_error_strs[]; - -void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); -void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); - -extern const struct bch_sb_field_ops bch_sb_field_ops_errors; - -void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); - -void bch2_sb_errors_from_cpu(struct bch_fs *); - -void bch2_fs_sb_errors_exit(struct bch_fs *); -void bch2_fs_sb_errors_init_early(struct bch_fs *); -int bch2_fs_sb_errors_init(struct bch_fs *); - -#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h deleted file mode 100644 index d154b7651d2875..00000000000000 --- a/fs/bcachefs/sb-errors_format.h +++ /dev/null @@ -1,353 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H -#define _BCACHEFS_SB_ERRORS_FORMAT_H - -enum bch_fsck_flags { - FSCK_CAN_FIX = BIT(0), - FSCK_CAN_IGNORE = BIT(1), - FSCK_AUTOFIX = BIT(2), - FSCK_ERR_NO_LOG = BIT(3), -}; - -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0, 0) \ - x(dirty_but_no_journal_entries, 1, 0) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ - x(sb_clean_journal_seq_mismatch, 3, 0) \ - x(sb_clean_btree_root_mismatch, 4, 0) \ - x(sb_clean_missing, 5, 0) \ - x(jset_unsupported_version, 6, 0) \ - x(jset_unknown_csum, 7, 0) \ - x(jset_last_seq_newer_than_seq, 8, 0) \ - x(jset_past_bucket_end, 9, 0) \ - x(jset_seq_blacklisted, 10, 0) \ - x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ - x(journal_entry_past_jset_end, 13, 0) \ - x(journal_entry_replicas_data_mismatch, 14, 0) \ - x(journal_entry_bkey_u64s_0, 15, 0) \ - x(journal_entry_bkey_past_end, 16, 0) \ - x(journal_entry_bkey_bad_format, 17, 0) \ - x(journal_entry_bkey_invalid, 18, 0) \ - x(journal_entry_btree_root_bad_size, 19, 0) \ - x(journal_entry_blacklist_bad_size, 20, 0) \ - 
x(journal_entry_blacklist_v2_bad_size, 21, 0) \ - x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ - x(journal_entry_usage_bad_size, 23, 0) \ - x(journal_entry_data_usage_bad_size, 24, 0) \ - x(journal_entry_clock_bad_size, 25, 0) \ - x(journal_entry_clock_bad_rw, 26, 0) \ - x(journal_entry_dev_usage_bad_size, 27, 0) \ - x(journal_entry_dev_usage_bad_dev, 28, 0) \ - x(journal_entry_dev_usage_bad_pad, 29, 0) \ - x(btree_node_unreadable, 30, 0) \ - x(btree_node_fault_injected, 31, 0) \ - x(btree_node_bad_magic, 32, 0) \ - x(btree_node_bad_seq, 33, 0) \ - x(btree_node_unsupported_version, 34, 0) \ - x(btree_node_bset_older_than_sb_min, 35, 0) \ - x(btree_node_bset_newer_than_sb, 36, 0) \ - x(btree_node_data_missing, 37, FSCK_AUTOFIX) \ - x(btree_node_bset_after_end, 38, 0) \ - x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ - x(btree_node_replicas_data_mismatch, 40, 0) \ - x(bset_unknown_csum, 41, 0) \ - x(bset_bad_csum, 42, 0) \ - x(bset_past_end_of_btree_node, 43, 0) \ - x(bset_wrong_sector_offset, 44, 0) \ - x(bset_empty, 45, 0) \ - x(bset_bad_seq, 46, 0) \ - x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ - x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ - x(btree_node_bad_btree, 49, 0) \ - x(btree_node_bad_level, 50, 0) \ - x(btree_node_bad_min_key, 51, 0) \ - x(btree_node_bad_max_key, 52, 0) \ - x(btree_node_bad_format, 53, 0) \ - x(btree_node_bkey_past_bset_end, 54, 0) \ - x(btree_node_bkey_bad_format, 55, 0) \ - x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ - x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ - x(btree_root_read_error, 59, FSCK_AUTOFIX) \ - x(btree_root_bad_min_key, 60, 0) \ - x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ - x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ - x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ - x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ - x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ - x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ - x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ - x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ - x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ - x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ - x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ - x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ - x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ - x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ - x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ - x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ - x(bkey_version_in_future, 80, 0) \ - x(bkey_u64s_too_small, 81, 0) \ - x(bkey_invalid_type_for_btree, 82, 0) \ - x(bkey_extent_size_zero, 83, 0) \ - x(bkey_extent_size_greater_than_offset, 84, 0) \ - x(bkey_size_nonzero, 85, 0) \ - x(bkey_snapshot_nonzero, 86, 0) \ - x(bkey_snapshot_zero, 87, 0) \ - x(bkey_at_pos_max, 88, 0) \ - x(bkey_before_start_of_btree_node, 89, 0) \ - x(bkey_after_end_of_btree_node, 90, 0) \ - x(bkey_val_size_nonzero, 91, 0) \ - x(bkey_val_size_too_small, 92, 0) \ - x(alloc_v1_val_size_bad, 93, 0) \ - x(alloc_v2_unpack_error, 94, 0) \ - x(alloc_v3_unpack_error, 95, 0) \ - x(alloc_v4_val_size_bad, 96, 0) \ - x(alloc_v4_backpointers_start_bad, 97, 0) \ - x(alloc_key_data_type_bad, 98, 0) \ - x(alloc_key_empty_but_have_data, 99, 0) \ - x(alloc_key_dirty_sectors_0, 100, 0) \ - x(alloc_key_data_type_inconsistency, 101, 0) \ - 
x(alloc_key_to_missing_dev_bucket, 102, 0) \ - x(alloc_key_cached_inconsistency, 103, 0) \ - x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \ - x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \ - x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ - x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ - x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ - x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ - x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ - x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ - x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ - x(bucket_sector_count_overflow, 112, 0) \ - x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ - x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ - x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ - x(bucket_gens_val_size_bad, 117, 0) \ - x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ - x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \ - x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ - x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ - x(backpointer_bucket_offset_wrong, 125, 0) \ - x(backpointer_level_bad, 294, 0) \ - x(backpointer_dev_bad, 297, 0) \ - x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ - x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ - x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ - x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ - x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ - x(lru_entry_bad, 131, FSCK_AUTOFIX) \ - x(btree_ptr_val_too_big, 132, 0) \ - x(btree_ptr_v2_val_too_big, 133, 0) \ - x(btree_ptr_has_non_ptr, 134, 0) \ - x(extent_ptrs_invalid_entry, 135, 0) \ - x(extent_ptrs_no_ptrs, 136, 0) \ - x(extent_ptrs_too_many_ptrs, 137, 0) \ - x(extent_ptrs_redundant_crc, 138, 0) \ - x(extent_ptrs_redundant_stripe, 139, 0) \ - x(extent_ptrs_unwritten, 140, 0) \ - x(extent_ptrs_written_and_unwritten, 141, 0) \ - x(ptr_to_invalid_device, 142, 0) \ - x(ptr_to_duplicate_device, 143, 0) \ - x(ptr_after_last_bucket, 144, 0) \ - x(ptr_before_first_bucket, 145, 0) \ - x(ptr_spans_multiple_buckets, 146, 0) \ - x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \ - x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \ - x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ - x(ptr_to_missing_stripe, 150, 0) \ - x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ - x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ - x(ptr_bucket_data_type_mismatch, 155, 0) \ - x(ptr_cached_and_erasure_coded, 156, 0) \ - x(ptr_crc_uncompressed_size_too_small, 157, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ - x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ - x(ptr_crc_csum_type_unknown, 158, 0) \ - x(ptr_crc_compression_type_unknown, 159, 0) \ - x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_nonce_mismatch, 162, 0) \ - x(ptr_stripe_redundant, 163, 0) \ - x(extent_flags_not_at_start, 306, 0) \ - x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ - x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ - x(reflink_refcount_underflow, 293, 0) \ - x(stripe_pos_bad, 167, 0) \ - x(stripe_val_size_bad, 168, 0) \ - x(stripe_csum_granularity_bad, 290, 0) \ - x(stripe_sector_count_wrong, 169, 0) \ 
- x(snapshot_tree_pos_bad, 170, 0) \ - x(snapshot_tree_to_missing_snapshot, 171, 0) \ - x(snapshot_tree_to_missing_subvol, 172, 0) \ - x(snapshot_tree_to_wrong_subvol, 173, 0) \ - x(snapshot_tree_to_snapshot_subvol, 174, 0) \ - x(snapshot_pos_bad, 175, 0) \ - x(snapshot_parent_bad, 176, 0) \ - x(snapshot_children_not_normalized, 177, 0) \ - x(snapshot_child_duplicate, 178, 0) \ - x(snapshot_child_bad, 179, 0) \ - x(snapshot_skiplist_not_normalized, 180, 0) \ - x(snapshot_skiplist_bad, 181, 0) \ - x(snapshot_should_not_have_subvol, 182, 0) \ - x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \ - x(snapshot_bad_depth, 184, 0) \ - x(snapshot_bad_skiplist, 185, 0) \ - x(subvol_pos_bad, 186, 0) \ - x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \ - x(subvol_to_missing_root, 188, 0) \ - x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ - x(bkey_in_missing_snapshot, 190, 0) \ - x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ - x(inode_pos_inode_nonzero, 191, 0) \ - x(inode_pos_blockdev_range, 192, 0) \ - x(inode_alloc_cursor_inode_bad, 301, 0) \ - x(inode_unpack_error, 193, 0) \ - x(inode_str_hash_invalid, 194, 0) \ - x(inode_v3_fields_start_bad, 195, 0) \ - x(inode_snapshot_mismatch, 196, 0) \ - x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ - x(inode_unlinked_but_clean, 197, 0) \ - x(inode_unlinked_but_nlink_nonzero, 198, 0) \ - x(inode_unlinked_and_not_open, 281, 0) \ - x(inode_unlinked_but_has_dirent, 285, 0) \ - x(inode_checksum_type_invalid, 199, 0) \ - x(inode_compression_type_invalid, 200, 0) \ - x(inode_subvol_root_but_not_dir, 201, 0) \ - x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ - x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ - x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ - x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ - x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ - x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ - x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ - x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ - x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ - x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ - x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ - x(inode_unreachable, 210, FSCK_AUTOFIX) \ - x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ - x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ - x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ - x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ - x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ - x(vfs_bad_inode_rm, 320, 0) \ - x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ - x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ - x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ - x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ - x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ - x(extent_overlapping, 215, 0) \ - x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ - x(key_in_wrong_inode_type, 217, 0) \ - x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ - x(dirent_empty_name, 219, 0) \ - x(dirent_val_too_big, 220, 0) \ - x(dirent_name_too_long, 221, 0) \ - x(dirent_name_embedded_nul, 222, 0) \ - x(dirent_name_dot_or_dotdot, 223, 0) \ - x(dirent_name_has_slash, 224, 0) \ - x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ - x(inode_bi_parent_wrong, 226, 0) \ - x(dirent_in_missing_dir_inode, 227, 0) \ - x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ - 
x(dirent_to_overwritten_inode, 302, 0) \ - x(dirent_to_missing_subvol, 230, 0) \ - x(dirent_to_itself, 231, 0) \ - x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ - x(quota_type_invalid, 232, 0) \ - x(xattr_val_size_too_small, 233, 0) \ - x(xattr_val_size_too_big, 234, 0) \ - x(xattr_invalid_type, 235, 0) \ - x(xattr_name_invalid_chars, 236, 0) \ - x(xattr_in_missing_inode, 237, 0) \ - x(root_subvol_missing, 238, 0) \ - x(root_dir_missing, 239, 0) \ - x(root_inode_not_dir, 240, 0) \ - x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ - x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ - x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ - x(reflink_p_front_pad_bad, 245, 0) \ - x(journal_entry_dup_same_device, 246, 0) \ - x(inode_bi_subvol_missing, 247, 0) \ - x(inode_bi_subvol_wrong, 248, 0) \ - x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ - x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ - x(inode_bi_parent_nonzero, 251, 0) \ - x(dirent_to_missing_parent_subvol, 252, 0) \ - x(dirent_not_visible_in_parent_subvol, 253, 0) \ - x(subvol_fs_path_parent_wrong, 254, 0) \ - x(subvol_root_fs_path_parent_nonzero, 255, 0) \ - x(subvol_children_not_set, 256, 0) \ - x(subvol_children_bad, 257, 0) \ - x(subvol_loop, 258, 0) \ - x(subvol_unreachable, 259, FSCK_AUTOFIX) \ - x(btree_node_bkey_bad_u64s, 260, 0) \ - x(btree_node_topology_empty_interior_node, 261, 0) \ - x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ - x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ - x(sb_clean_entry_overrun, 267, 0) \ - x(btree_ptr_v2_written_0, 268, 0) \ - x(subvol_snapshot_bad, 269, 0) \ - x(subvol_inode_bad, 270, 0) \ - x(subvol_missing, 308, FSCK_AUTOFIX) \ - x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ - x(accounting_mismatch, 272, FSCK_AUTOFIX) \ - x(accounting_replicas_not_marked, 273, 0) \ - x(accounting_to_invalid_device, 289, 0) \ - x(invalid_btree_id, 274, FSCK_AUTOFIX) \ - x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ - x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ - x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ - x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ - x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ - x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ - x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(dirent_cf_name_too_big, 304, 0) \ - x(dirent_stray_data_after_cf_name, 305, 0) \ - x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ - x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 321, 0) - -enum bch_sb_error_id { -#define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - -#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h deleted file mode 100644 index 40325239c3b0f2..00000000000000 --- a/fs/bcachefs/sb-errors_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_ERRORS_TYPES_H -#define _BCACHEFS_SB_ERRORS_TYPES_H - -#include "darray.h" - -struct bch_sb_error_entry_cpu { - u64 id:16, - nr:48; - u64 last_error_time; -}; - -typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; - -#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c deleted file mode 100644 index 6245e342a8a851..00000000000000 --- a/fs/bcachefs/sb-members.c +++ /dev/null @@ -1,606 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "disk_groups.h" -#include "error.h" -#include "opts.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-members.h" -#include "super-io.h" - -int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); - bch2_bkey_val_to_text(&buf, c, k); - - bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) -{ - if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); -} - -void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) -{ - bch2_fs_inconsistent(ca->fs, - "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", - bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); -} - -#define x(t, n, ...) 
[n] = #t, -static const char * const bch2_iops_measurements[] = { - BCH_IOPS_MEASUREMENTS() - NULL -}; - -char * const bch2_member_error_strs[] = { - BCH_MEMBER_ERROR_TYPES() - NULL -}; -#undef x - -/* Code for bch_sb_field_members_v1: */ - -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) -{ - return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); -} - -static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) -{ - struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); - return ret; -} - -static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) -{ - return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); -} - -static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) -{ - struct bch_member ret, *p = members_v1_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); - return ret; -} - -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) -{ - struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); - if (mi2) - return members_v2_get(mi2, i); - struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); - return members_v1_get(mi1, i); -} - -static int sb_members_v2_resize_entries(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { - unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * - c->disk_sb.sb->nr_devices), 8); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return bch_err_throw(c, ENOSPC_sb_members_v2); - - for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { - void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); - memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); - memset(dst + le16_to_cpu(mi->member_bytes), - 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); - } - mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); - } - return 0; -} - -int bch2_sb_members_v2_init(struct bch_fs *c) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { - mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, - DIV_ROUND_UP(sizeof(*mi2) + - sizeof(struct bch_member) * c->sb.nr_devices, - sizeof(u64))); - mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); - memcpy(&mi2->_members[0], &mi1->_members[0], - BCH_MEMBER_V1_BYTES * c->sb.nr_devices); - memset(&mi2->pad[0], 0, sizeof(mi2->pad)); - mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); - } - - return sb_members_v2_resize_entries(c); -} - -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) -{ - struct bch_sb_field_members_v1 *mi1; - struct bch_sb_field_members_v2 *mi2; - - if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { - bch2_sb_field_resize(disk_sb, members_v1, 0); - return 0; - } - - mi1 = bch2_sb_field_resize(disk_sb, members_v1, - DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * - disk_sb->sb->nr_devices, sizeof(u64))); - if (!mi1) - return -BCH_ERR_ENOSPC_sb_members; - - mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); - - for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) - memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, 
i), BCH_MEMBER_V1_BYTES); - - return 0; -} - -static int validate_member(struct printbuf *err, - struct bch_member m, - struct bch_sb *sb, - int i) -{ - if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %u)", - i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m.nbuckets) - - le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - - if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { - prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); - return -BCH_ERR_invalid_sb_members; - } - - if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && - sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { - prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); - return -BCH_ERR_invalid_sb_members; - } - - return 0; -} - -static void member_to_text(struct printbuf *out, - struct bch_member m, - struct bch_sb_field_disk_groups *gi, - struct bch_sb *sb, - int i) -{ - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m.bucket_size); - u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - - if (!bch2_member_alive(&m)) - return; - - prt_printf(out, "Device:\t%u\n", i); - - printbuf_indent_add(out, 2); - - prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) - bch2_disk_path_to_text_sb(out, sb, - BCH_MEMBER_GROUP(&m) - 1); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "UUID:\t"); - pr_uuid(out, m.uuid.b); - prt_newline(out); - - prt_printf(out, "Size:\t"); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - - for (unsigned i = 0; i < BCH_IOPS_NR; i++) - prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - - prt_printf(out, "Bucket size:\t"); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); - prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - - prt_printf(out, "Last mount:\t"); - if (m.last_mount) - bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - - prt_printf(out, "State:\t%s\n", - BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(&m)] - : "unknown"); - - prt_printf(out, "Data allowed:\t"); - if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:\t"); - if (data_have) - prt_bitflags(out, __bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap blocksize:\t"); - if (m.btree_bitmap_shift < 64) - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); - else - prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); - prt_newline(out); - - prt_printf(out, "Btree allocated bitmap:\t"); - bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); - prt_newline(out); - - prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); - - prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); - prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); - - printbuf_indent_sub(out, 2); -} - -static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - unsigned i; - - if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v1_get(mi, i); - - int ret = validate_member(err, m, sb, i); - if (ret) - return ret; - } - - return 0; -} - -static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v1_get(mi, i), gi, sb, i); -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { - .validate = bch2_sb_members_v1_validate, - .to_text = bch2_sb_members_v1_to_text, -}; - -static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - - if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { - prt_printf(out, "field ends before start of entries"); - return; - } - - if (!le16_to_cpu(mi->member_bytes)) { - prt_printf(out, "member_bytes 0"); - return; - } - - unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); - if (nr != sb->nr_devices) - prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); - - /* - * We call to_text() on superblock sections that haven't passed - * validate, so we can't trust sb->nr_devices. 
- */ - - for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) - member_to_text(out, members_v2_get(mi, i), gi, sb, i); -} - -static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - - (void *) mi; - - if (mi_bytes > vstruct_bytes(&mi->field)) { - prt_printf(err, "section too small (%zu > %zu)", - mi_bytes, vstruct_bytes(&mi->field)); - return -BCH_ERR_invalid_sb_members; - } - - for (unsigned i = 0; i < sb->nr_devices; i++) { - int ret = validate_member(err, members_v2_get(mi, i), sb, i); - if (ret) - return ret; - } - - return 0; -} - -const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { - .validate = bch2_sb_members_v2_validate, - .to_text = bch2_sb_members_v2_to_text, -}; - -void bch2_sb_members_from_cpu(struct bch_fs *c) -{ - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - - guard(rcu)(); - for_each_member_device_rcu(c, ca, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - - for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) - m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); - } -} - -void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member m; - - mutex_lock(&ca->fs->sb_lock); - m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&ca->fs->sb_lock); - - printbuf_tabstop_push(out, 12); - - prt_str(out, "IO errors since filesystem creation"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); - printbuf_indent_sub(out, 2); - - prt_str(out, "IO errors since "); - bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); - prt_str(out, " ago"); - prt_newline(out); - - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], - atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - printbuf_indent_sub(out, 2); -} - -void bch2_dev_errors_reset(struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_member *m; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) - m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} - -/* - * Per member "range has btree nodes" bitmap: - * - * This is so that if we ever have to run the btree node scan to repair we don't - * have to scan full devices: - */ - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) -{ - guard(rcu)(); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (ca && - !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) - return false; - } - return true; -} - -static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, - u64 start, unsigned sectors) -{ - struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); - u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); - - u64 end = start + sectors; - - int 
resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); - if (resize > 0) { - u64 new_bitmap = 0; - - for (unsigned i = 0; i < 64; i++) - if (bitmap & BIT_ULL(i)) - new_bitmap |= BIT_ULL(i >> resize); - bitmap = new_bitmap; - m->btree_bitmap_shift += resize; - } - - BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); - BUG_ON(end > 64ULL << m->btree_bitmap_shift); - - for (unsigned bit = start >> m->btree_bitmap_shift; - (u64) bit << m->btree_bitmap_shift < end; - bit++) - bitmap |= BIT_ULL(bit); - - m->btree_allocated_bitmap = cpu_to_le64(bitmap); -} - -void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) -{ - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) - continue; - - __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); - } -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *sb) -{ - unsigned nr = 0; - - for (unsigned i = 0; i < sb->nr_devices; i++) - nr += bch2_member_exists((struct bch_sb *) sb, i); - return nr; -} - -int bch2_sb_member_alloc(struct bch_fs *c) -{ - unsigned dev_idx = c->sb.nr_devices; - struct bch_sb_field_members_v2 *mi; - unsigned nr_devices; - unsigned u64s; - int best = -1; - u64 best_last_mount = 0; - unsigned nr_deleted = 0; - - if (dev_idx < BCH_SB_MEMBERS_MAX) - goto have_slot; - - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { - /* eventually BCH_SB_MEMBERS_MAX will be raised */ - if (dev_idx == BCH_SB_MEMBER_INVALID) - continue; - - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - - nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); - - if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); - if (best < 0 || last_mount < best_last_mount) { - best = dev_idx; - best_last_mount = last_mount; - } - } - if (best >= 0) { - dev_idx = best; - goto have_slot; - } - - if (nr_deleted) - bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", - nr_deleted); - - return -BCH_ERR_ENOSPC_sb_members; -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) - return -BCH_ERR_ENOSPC_sb_members; - - c->disk_sb.sb->nr_devices = nr_devices; - return dev_idx; -} - -void bch2_sb_members_clean_deleted(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - bool write_sb = false; - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); - - if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { - memset(&m->uuid, 0, sizeof(m->uuid)); - write_sb = true; - } - } - - if (write_sb) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); -} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h deleted file mode 100644 index 8d8a8a857648c5..00000000000000 --- a/fs/bcachefs/sb-members.h +++ /dev/null @@ -1,377 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_H -#define _BCACHEFS_SB_MEMBERS_H - -#include "darray.h" -#include "bkey_types.h" -#include "enumerated_ref.h" - -extern char * const bch2_member_error_strs[]; - -static inline struct bch_member * 
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) -{ - return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -} - -int bch2_sb_members_v2_init(struct bch_fs *c); -int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); -struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); -struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !enumerated_ref_is_zero(&ca->io_ref[READ]); -} - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); - -static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - return ca && bch2_dev_is_online(ca); -} - -static inline bool bch2_dev_is_healthy(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - darray_for_each(devs, i) - if (*i == dev) - return true; - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - darray_for_each(*devs, i) - if (*i == dev) { - darray_remove_item(devs, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); - devs->data[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; -} - -static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((idx = mask - ? find_next_bit(mask->d, c->sb.nr_devices, idx) - : idx) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[idx], - lockdep_is_held(&c->state_lock)))) - idx++; - - return ca; -} - -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, - const struct bch_devs_mask *mask) -{ - return __bch2_next_dev_idx(c, ca ? 
ca->dev_idx + 1 : 0, mask); -} - -#define for_each_member_device_rcu(_c, _ca, _mask) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = __bch2_next_dev((_c), _ca, (_mask)));) - -#define for_each_online_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) - -#define for_each_rw_member_rcu(_c, _ca) \ - for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) - -static inline void bch2_dev_get(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); -#else - percpu_ref_get(&ca->ref); -#endif -} - -static inline void __bch2_dev_put(struct bch_dev *ca) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - long r = atomic_long_dec_return(&ca->ref); - if (r < (long) !ca->dying) - panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); - ca->last_put = _THIS_IP_; - if (!r) - complete(&ca->ref_completion); -#else - percpu_ref_put(&ca->ref); -#endif -} - -static inline void bch2_dev_put(struct bch_dev *ca) -{ - if (ca) - __bch2_dev_put(ca); -} - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) -{ - guard(rcu)(); - bch2_dev_put(ca); - if ((ca = __bch2_next_dev(c, ca, NULL))) - bch2_dev_get(ca); - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define __for_each_member_device(_c, _ca) \ - for (; (_ca = bch2_get_next_dev(_c, _ca));) - -#define for_each_member_device(_c, _ca) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_dev(_c, _ca));) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - struct bch_dev *ca, - unsigned state_mask, - int rw, unsigned ref_idx) -{ - guard(rcu)(); - if (ca) - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - - while ((ca = __bch2_next_dev(c, ca, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) - ; - - return ca; -} - -#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ - for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) - -#define for_each_online_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, ~0, READ, ref_idx) - -#define for_each_rw_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) - -#define for_each_readable_member(c, ca, ref_idx) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) - -static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) -{ - return dev < c->sb.nr_devices && c->devs[dev]; -} - -static inline bool bucket_valid(const struct bch_dev *ca, u64 b) -{ - return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; -} - -static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_check(c->devs[dev], 1); -} - -static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) -{ - EBUG_ON(!bch2_dev_exists(c, dev)); - - return rcu_dereference_protected(c->devs[dev], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) -{ - return c && dev < c->sb.nr_devices - ? 
rcu_dereference(c->devs[dev]) - : NULL; -} - -int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_dev_missing_atomic(struct bch_fs *, unsigned); - -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) -{ - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); - if (ca) - bch2_dev_get(ca); - return ca; -} - -static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (unlikely(!ca)) - bch2_dev_missing_atomic(c, dev); - return ca; -} - -static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -void bch2_dev_bucket_missing(struct bch_dev *, u64); - -static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) -{ - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); - if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { - bch2_dev_bucket_missing(ca, bucket.offset); - bch2_dev_put(ca); - ca = NULL; - } - return ca; -} - -static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget_noerror(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) -{ - if (ca && ca->dev_idx == dev_idx) - return ca; - bch2_dev_put(ca); - return bch2_dev_tryget(c, dev_idx); -} - -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, - int rw, unsigned ref_idx) -{ - might_sleep(); - - guard(rcu)(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - return NULL; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return ca; - - enumerated_ref_put(&ca->io_ref[rw], ref_idx); - return NULL; -} - -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; -extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; - -static inline bool bch2_member_alive(struct bch_member *m) -{ - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && - !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); -} - -static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) -{ - if (dev < sb->nr_devices) { - struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_alive(&m); - } - return false; -} - -unsigned bch2_sb_nr_devices(const struct bch_sb *); - -static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - - le16_to_cpu(mi->first_bucket), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .group = BCH_MEMBER_GROUP(mi), - .state = BCH_MEMBER_STATE(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), - .durability = BCH_MEMBER_DURABILITY(mi) - ? 
BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), - .valid = bch2_member_alive(mi), - .btree_bitmap_shift = mi->btree_bitmap_shift, - .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), - }; -} - -void bch2_sb_members_from_cpu(struct bch_fs *); - -void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); -void bch2_dev_errors_reset(struct bch_dev *); - -static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) -{ - u64 end = start + sectors; - - if (end > 64ULL << ca->mi.btree_bitmap_shift) - return false; - - for (unsigned bit = start >> ca->mi.btree_bitmap_shift; - (u64) bit << ca->mi.btree_bitmap_shift < end; - bit++) - if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) - return false; - return true; -} - -bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); -void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); - -int bch2_sb_member_alloc(struct bch_fs *); -void bch2_sb_members_clean_deleted(struct bch_fs *); - -#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h deleted file mode 100644 index fb72ad730518f7..00000000000000 --- a/fs/bcachefs/sb-members_format.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H -#define _BCACHEFS_SB_MEMBERS_FORMAT_H - -/* - * We refer to members with bitmasks in various places - but we need to get rid - * of this limit: - */ -#define BCH_SB_MEMBERS_MAX 64 - -/* - * Sentinel value - indicates a device that does not exist - */ -#define BCH_SB_MEMBER_INVALID 255 - -#define BCH_SB_MEMBER_DELETED_UUID \ - UUID_INIT(0xffffffff, 0xffff, 0xffff, \ - 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * btree_allocated_bitmap can represent sector addresses of a u64: it itself has - * 64 elements, so 64 - ilog2(64) - */ -#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocations are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct 
bch_member, bucket_size, 0, 16) -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) -LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, - struct bch_member, flags, 31, 32) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h deleted file mode 100644 index d6443e18687299..00000000000000 --- a/fs/bcachefs/sb-members_types.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H -#define _BCACHEFS_SB_MEMBERS_TYPES_H - -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u64 nbuckets_minus_first; - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 resize_on_mount; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - -#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h deleted file mode 100644 index c4b3d8d3f4149c..00000000000000 --- a/fs/bcachefs/seqmutex.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SEQMUTEX_H -#define _BCACHEFS_SEQMUTEX_H - -#include - -struct seqmutex { - struct mutex lock; - u32 seq; -}; - -#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) - -static inline bool seqmutex_trylock(struct seqmutex *lock) -{ - return mutex_trylock(&lock->lock); -} - -static inline void seqmutex_lock(struct seqmutex *lock) -{ - mutex_lock(&lock->lock); - lock->seq++; -} - -static inline u32 seqmutex_unlock(struct seqmutex *lock) -{ - u32 seq = lock->seq; - mutex_unlock(&lock->lock); - return seq; -} - -static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) -{ - if (lock->seq != seq || !mutex_trylock(&lock->lock)) - return false; - - if (lock->seq != seq) { - mutex_unlock(&lock->lock); - return false; - } - - return true; -} - -#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c deleted file mode 100644 index a1cc44e66c7ed5..00000000000000 --- a/fs/bcachefs/siphash.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: BSD-3-Clause -/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ - -/*- - * Copyright (c) 2013 Andre Oppermann - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d - * are the number of compression rounds and the number of finalization rounds. - * A compression round is identical to a finalization round and this round - * function is called SipRound. Given a 128-bit key k and a (possibly empty) - * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). - * - * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, - * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, - * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa - * https://131002.net/siphash/siphash.pdf - * https://131002.net/siphash/ - */ - -#include -#include -#include -#include - -#include "siphash.h" - -static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -{ - while (rounds--) { - ctx->v[0] += ctx->v[1]; - ctx->v[2] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 13); - ctx->v[3] = rol64(ctx->v[3], 16); - - ctx->v[1] ^= ctx->v[0]; - ctx->v[3] ^= ctx->v[2]; - ctx->v[0] = rol64(ctx->v[0], 32); - - ctx->v[2] += ctx->v[1]; - ctx->v[0] += ctx->v[3]; - ctx->v[1] = rol64(ctx->v[1], 17); - ctx->v[3] = rol64(ctx->v[3], 21); - - ctx->v[1] ^= ctx->v[2]; - ctx->v[3] ^= ctx->v[0]; - ctx->v[2] = rol64(ctx->v[2], 32); - } -} - -static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -{ - u64 m = get_unaligned_le64(ptr); - - ctx->v[3] ^= m; - SipHash_Rounds(ctx, rounds); - ctx->v[0] ^= m; -} - -void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -{ - u64 k0, k1; - - k0 = le64_to_cpu(key->k0); - k1 = le64_to_cpu(key->k1); - - ctx->v[0] = 0x736f6d6570736575ULL ^ k0; - ctx->v[1] = 0x646f72616e646f6dULL ^ k1; - ctx->v[2] = 0x6c7967656e657261ULL ^ k0; - ctx->v[3] = 0x7465646279746573ULL ^ k1; - - memset(ctx->buf, 0, sizeof(ctx->buf)); - ctx->bytes = 0; -} - -void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, - const void *src, size_t len) -{ - const u8 *ptr = src; - size_t left, used; - - if (len == 0) - return; - - used = ctx->bytes % sizeof(ctx->buf); - ctx->bytes += len; - - if (used > 0) { - left = sizeof(ctx->buf) - used; - - if (len >= left) { - memcpy(&ctx->buf[used], ptr, left); - SipHash_CRounds(ctx, ctx->buf, rc); - len -= left; - ptr += left; - } else { - memcpy(&ctx->buf[used], ptr, len); - return; - } - } - - while (len >= sizeof(ctx->buf)) { - SipHash_CRounds(ctx, ptr, rc); - len -= sizeof(ctx->buf); - ptr += sizeof(ctx->buf); - } - - if (len > 0) - memcpy(&ctx->buf[used], ptr, len); -} - -void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - - r = SipHash_End(ctx, rc, rf); - - *((__le64 *) dst) = cpu_to_le64(r); -} - -u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -{ - u64 r; - size_t left, used; - - used = ctx->bytes % sizeof(ctx->buf); - left = sizeof(ctx->buf) - used; - memset(&ctx->buf[used], 0, left - 1); - ctx->buf[7] = ctx->bytes; - - SipHash_CRounds(ctx, ctx->buf, rc); - ctx->v[2] ^= 0xff; - SipHash_Rounds(ctx, rf); - - r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); - memset(ctx, 0, sizeof(*ctx)); - return r; -} - -u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - SipHash_Init(&ctx, key); - SipHash_Update(&ctx, rc, rf, src, len); - return SipHash_End(&ctx, rc, rf); -} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h deleted file mode 100644 index 3dfaf34a43b284..00000000000000 --- a/fs/bcachefs/siphash.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause */ -/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -/*- - * Copyright (c) 2013 Andre Oppermann - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) - * optimized for speed on short messages returning a 64bit hash/digest value. - * - * The number of rounds is defined during the initialization: - * SipHash24_Init() for the fast and resonable strong version - * SipHash48_Init() for the strong version (half as fast) - * - * struct SIPHASH_CTX ctx; - * SipHash24_Init(&ctx); - * SipHash_SetKey(&ctx, "16bytes long key"); - * SipHash_Update(&ctx, pointer_to_string, length_of_string); - * SipHash_Final(output, &ctx); - */ - -#ifndef _SIPHASH_H_ -#define _SIPHASH_H_ - -#include - -#define SIPHASH_BLOCK_LENGTH 8 -#define SIPHASH_KEY_LENGTH 16 -#define SIPHASH_DIGEST_LENGTH 8 - -typedef struct _SIPHASH_CTX { - u64 v[4]; - u8 buf[SIPHASH_BLOCK_LENGTH]; - u32 bytes; -} SIPHASH_CTX; - -typedef struct { - __le64 k0; - __le64 k1; -} SIPHASH_KEY; - -void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -u64 SipHash_End(SIPHASH_CTX *, int, int); -void SipHash_Final(void *, SIPHASH_CTX *, int, int); -u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); - -#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) - -#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) - -#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c deleted file mode 100644 index 538c324f4765dc..00000000000000 --- a/fs/bcachefs/six.c +++ /dev/null @@ -1,878 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "six.h" - -#ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) do {} while (0) -#endif - -#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -#define six_release(l, ip) 
lock_release(l, ip) - -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); - -#define SIX_LOCK_HELD_read_OFFSET 0 -#define SIX_LOCK_HELD_read ~(~0U << 26) -#define SIX_LOCK_HELD_intent (1U << 26) -#define SIX_LOCK_HELD_write (1U << 27) -#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -#define SIX_LOCK_NOSPIN (1U << 31) - -struct six_lock_vals { - /* Value we add to the lock in order to take the lock: */ - u32 lock_val; - - /* If the lock has this value (used as a mask), taking the lock fails: */ - u32 lock_fail; - - /* Mask that indicates lock is held for this type: */ - u32 held_mask; - - /* Waitlist we wakeup when releasing the lock: */ - enum six_lock_type unlock_wakeup; -}; - -static const struct six_lock_vals l[] = { - [SIX_LOCK_read] = { - .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, - .lock_fail = SIX_LOCK_HELD_write, - .held_mask = SIX_LOCK_HELD_read, - .unlock_wakeup = SIX_LOCK_write, - }, - [SIX_LOCK_intent] = { - .lock_val = SIX_LOCK_HELD_intent, - .lock_fail = SIX_LOCK_HELD_intent, - .held_mask = SIX_LOCK_HELD_intent, - .unlock_wakeup = SIX_LOCK_intent, - }, - [SIX_LOCK_write] = { - .lock_val = SIX_LOCK_HELD_write, - .lock_fail = SIX_LOCK_HELD_read, - .held_mask = SIX_LOCK_HELD_write, - .unlock_wakeup = SIX_LOCK_read, - }, -}; - -static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -{ - if ((atomic_read(&lock->state) & mask) != mask) - atomic_or(mask, &lock->state); -} - -static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -{ - if (atomic_read(&lock->state) & mask) - atomic_and(~mask, &lock->state); -} - -static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - u32 old, struct task_struct *owner) -{ - if (type != SIX_LOCK_intent) - return; - - if (!(old & SIX_LOCK_HELD_intent)) { - EBUG_ON(lock->owner); - lock->owner = owner; - } else { - EBUG_ON(lock->owner != current); - } -} - -static inline unsigned pcpu_read_count(struct six_lock *lock) -{ - unsigned read_count = 0; - int cpu; - - for_each_possible_cpu(cpu) - read_count += *per_cpu_ptr(lock->readers, cpu); - return read_count; -} - -/* - * __do_six_trylock() - main trylock routine - * - * Returns 1 on success, 0 on failure - * - * In percpu reader mode, a failed trylock may cause a spurious trylock failure - * for another thread taking the competing lock type, and we may have to do a - * wakeup: when a wakeup is required, we return -1 - wakeup_type. - */ -static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, - struct task_struct *task, bool try) -{ - int ret; - u32 old; - - EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && - (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); - - /* - * Percpu reader mode: - * - * The basic idea behind this algorithm is that you can implement a lock - * between two threads without any atomics, just memory barriers: - * - * For two threads you'll need two variables, one variable for "thread a - * has the lock" and another for "thread b has the lock". - * - * To take the lock, a thread sets its variable indicating that it holds - * the lock, then issues a full memory barrier, then reads from the - * other thread's variable to check if the other thread thinks it has - * the lock. If we raced, we back off and retry/sleep. 
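That two-variable handshake is the heart of the percpu read path below: the reader's per-CPU count plays the role of "thread a has the lock" and SIX_LOCK_HELD_write plays "thread b has the lock". A userspace sketch of just the store / full-fence / load sequence using C11 atomics — toy_trylock_a and the two flags are illustrative stand-ins, not the kernel API:

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic bool a_holds, b_holds;   /* one flag per side */

    /* Thread A's trylock; thread B runs the mirror image on b_holds/a_holds */
    static bool toy_trylock_a(void)
    {
            atomic_store_explicit(&a_holds, true, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() */
            if (atomic_load_explicit(&b_holds, memory_order_relaxed)) {
                    /* raced: back off, as the comment above describes */
                    atomic_store_explicit(&a_holds, false, memory_order_relaxed);
                    return false;
            }
            return true;    /* unlock by storing false to a_holds */
    }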
- * - * Failure to take the lock may cause a spurious trylock failure in - * another thread, because we temporarily set the lock to indicate that - * we held it. This would be a problem for a thread in six_lock(), when - * it is calling trylock after adding itself to the waitlist and - * prior to sleeping. - * - * Therefore, if we fail to get the lock, and there were waiters of the - * type we conflict with, we will have to issue a wakeup. - * - * Since we may be called under wait_lock (and by the wakeup code - * itself), we return that the wakeup has to be done instead of doing it - * here. - */ - if (type == SIX_LOCK_read && lock->readers) { - preempt_disable(); - this_cpu_inc(*lock->readers); /* signal that we own lock */ - - smp_mb(); - - old = atomic_read(&lock->state); - ret = !(old & l[type].lock_fail); - - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); - - if (!ret) { - smp_mb(); - if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) - ret = -1 - SIX_LOCK_write; - } - } else if (type == SIX_LOCK_write && lock->readers) { - if (try) - atomic_add(SIX_LOCK_HELD_write, &lock->state); - - /* - * Make sure atomic_add happens before pcpu_read_count and - * six_set_bitmask in slow path happens before pcpu_read_count. - * - * Paired with the smp_mb() in read lock fast path (per-cpu mode) - * and the one before atomic_read in read unlock path. - */ - smp_mb(); - ret = !pcpu_read_count(lock); - - if (try && !ret) { - old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); - if (old & SIX_LOCK_WAITING_read) - ret = -1 - SIX_LOCK_read; - } - } else { - old = atomic_read(&lock->state); - do { - ret = !(old & l[type].lock_fail); - if (!ret || (type == SIX_LOCK_write && !try)) { - smp_mb(); - break; - } - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); - - EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); - } - - if (ret > 0) - six_set_owner(lock, type, old, task); - - EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && - (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); - - return ret; -} - -static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -{ - struct six_lock_waiter *w, *next; - struct task_struct *task; - bool saw_one; - int ret; -again: - ret = 0; - saw_one = false; - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, &lock->wait_list, list) { - if (w->lock_want != lock_type) - continue; - - if (saw_one && lock_type != SIX_LOCK_read) - goto unlock; - saw_one = true; - - ret = __do_six_trylock(lock, lock_type, w->task, false); - if (ret <= 0) - goto unlock; - - /* - * Similar to percpu_rwsem_wake_function(), we need to guard - * against the wakee noticing w->lock_acquired, returning, and - * then exiting before we do the wakeup: - */ - task = get_task_struct(w->task); - __list_del(w->list.prev, w->list.next); - /* - * The release barrier here ensures the ordering of the - * __list_del before setting w->lock_acquired; @w is on the - * stack of the thread doing the waiting and will be reused - * after it sees w->lock_acquired with no other locking: - * pairs with smp_load_acquire() in six_lock_slowpath() - */ - smp_store_release(&w->lock_acquired, true); - wake_up_process(task); - put_task_struct(task); - } - - six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -unlock: - raw_spin_unlock(&lock->wait_lock); - - if (ret < 0) { - lock_type = -ret - 1; - goto again; - } -} - -__always_inline -static void six_lock_wakeup(struct six_lock *lock, u32 state, - enum 
six_lock_type lock_type) -{ - if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) - return; - - if (!(state & (SIX_LOCK_WAITING_read << lock_type))) - return; - - __six_lock_wakeup(lock, lock_type); -} - -__always_inline -static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -{ - int ret; - - ret = __do_six_trylock(lock, type, current, try); - if (ret < 0) - __six_lock_wakeup(lock, -ret - 1); - - return ret > 0; -} - -/** - * six_trylock_ip - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - if (!do_six_trylock(lock, type, true)) - return false; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); - return true; -} -EXPORT_SYMBOL_GPL(six_trylock_ip); - -/** - * six_relock_ip - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: true on success, false on failure. - */ -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip) -{ - if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) - return false; - - if (six_lock_seq(lock) != seq) { - six_unlock_ip(lock, type, ip); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(six_relock_ip); - -#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN - -static inline bool six_owner_running(struct six_lock *lock) -{ - /* - * When there's no owner, we might have preempted between the owner - * acquiring the lock and setting the owner field. If we're an RT task - * that will live-lock because we won't let the owner complete. - */ - guard(rcu)(); - struct task_struct *owner = READ_ONCE(lock->owner); - return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); -} - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - unsigned loop = 0; - u64 end_time; - - if (type == SIX_LOCK_write) - return false; - - if (lock->wait_list.next != &wait->list) - return false; - - if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) - return false; - - preempt_disable(); - end_time = sched_clock() + 10 * NSEC_PER_USEC; - - while (!need_resched() && six_owner_running(lock)) { - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) { - preempt_enable(); - return true; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - break; - } - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. 
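six_optimistic_spin() above caps its spinning with a sched_clock() deadline checked every 16 iterations, rather than a fixed iteration budget. A userspace approximation of that bounded-spin shape — now_ns(), try_fn and the budget are stand-ins (clock_gettime() in place of sched_clock()), not the kernel code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    static uint64_t now_ns(void)        /* stand-in for sched_clock() */
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t) ts.tv_sec * 1000000000ull + (uint64_t) ts.tv_nsec;
    }

    /* Spin on try_fn() for at most budget_ns; check the clock every 16 loops */
    static bool spin_bounded(bool (*try_fn)(void *), void *arg, uint64_t budget_ns)
    {
            uint64_t end = now_ns() + budget_ns;
            unsigned loop = 0;

            while (!try_fn(arg)) {
                    if (!(++loop & 0xf) && now_ns() > end)
                            return false;   /* give up; caller falls back to sleeping */
                    __asm__ volatile("" ::: "memory");      /* poor man's cpu_relax() */
            }
            return true;
    }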
- */ - cpu_relax(); - } - - preempt_enable(); - return false; -} - -#else /* CONFIG_LOCK_SPIN_ON_OWNER */ - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait, - enum six_lock_type type) -{ - return false; -} - -#endif - -noinline -static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret = 0; - - if (type == SIX_LOCK_write) { - EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } - - trace_contention_begin(lock, 0); - lock_contended(&lock->dep_map, ip); - - wait->task = current; - wait->lock_want = type; - wait->lock_acquired = false; - - raw_spin_lock(&lock->wait_lock); - six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); - /* - * Retry taking the lock after taking waitlist lock, in case we raced - * with an unlock: - */ - ret = __do_six_trylock(lock, type, current, false); - if (ret <= 0) { - wait->start_time = local_clock(); - - if (!list_empty(&lock->wait_list)) { - struct six_lock_waiter *last = - list_last_entry(&lock->wait_list, - struct six_lock_waiter, list); - - if (time_before_eq64(wait->start_time, last->start_time)) - wait->start_time = last->start_time + 1; - } - - list_add_tail(&wait->list, &lock->wait_list); - } - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(ret > 0)) { - ret = 0; - goto out; - } - - if (unlikely(ret < 0)) { - __six_lock_wakeup(lock, -ret - 1); - ret = 0; - } - - if (six_optimistic_spin(lock, wait, type)) - goto out; - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - /* - * Ensures that writes to the waitlist entry happen after we see - * wait->lock_acquired: pairs with the smp_store_release in - * __six_lock_wakeup - */ - if (smp_load_acquire(&wait->lock_acquired)) - break; - - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (unlikely(ret)) { - bool acquired; - - /* - * If should_sleep_fn() returns an error, we are - * required to return that error even if we already - * acquired the lock - should_sleep_fn() might have - * modified external state (e.g. when the deadlock cycle - * detector in bcachefs issued a transaction restart) - */ - raw_spin_lock(&lock->wait_lock); - acquired = wait->lock_acquired; - if (!acquired) - list_del(&wait->list); - raw_spin_unlock(&lock->wait_lock); - - if (unlikely(acquired)) { - do_six_unlock_type(lock, type); - } else if (type == SIX_LOCK_write) { - six_clear_bitmask(lock, SIX_LOCK_HELD_write); - six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); - } - break; - } - - schedule(); - } - - __set_current_state(TASK_RUNNING); -out: - trace_contention_end(lock, 0); - - return ret; -} - -/** - * six_lock_ip_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * This is the most general six_lock() variant, with parameters to support full - * cycle detection for deadlock avoidance. 
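In six_lock_slowpath() above, each new waiter's start_time is bumped past the tail of the list, so the timestamps on any given waitlist are strictly ascending and can serve as a stable traversal cursor. A compact sketch of that invariant, with hypothetical types:

    #include <stdint.h>

    struct waiter {
            uint64_t start_time;
    };

    /* Give @w a timestamp strictly greater than the current tail's, so the
     * waitlist's timestamps stay strictly ascending even if the clock repeats: */
    static void waiter_assign_time(struct waiter *w, const struct waiter *tail,
                                   uint64_t clock_now)
    {
            w->start_time = clock_now;
            if (tail && w->start_time <= tail->start_time)
                    w->start_time = tail->start_time + 1;
    }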
- * - * The code calling this function must implement tracking of held locks, and the - * @wait object should be embedded into the struct that tracks held locks - - * which must also be accessible in a thread-safe way. - * - * @should_sleep_fn should invoke the cycle detector; it should walk each - * lock's waiters, and for each waiter recursively walk their held locks. - * - * When this function must block, @wait will be added to @lock's waitlist before - * calling trylock, and before calling @should_sleep_fn, and @wait will not be - * removed from the lock waitlist until the lock has been successfully acquired, - * or we abort. - * - * @wait.start_time will be monotonically increasing for any given waitlist, and - * thus may be used as a loop cursor. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - int ret; - - wait->start_time = 0; - - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - - ret = do_six_trylock(lock, type, true) ? 0 - : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); - - if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - if (!ret) - lock_acquired(&lock->dep_map, ip); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_ip_waiter); - -__always_inline -static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - u32 state; - - if (type == SIX_LOCK_intent) - lock->owner = NULL; - - if (type == SIX_LOCK_read && - lock->readers) { - smp_mb(); /* unlock barrier */ - this_cpu_dec(*lock->readers); - smp_mb(); /* between unlocking and checking for waiters */ - state = atomic_read(&lock->state); - } else { - u32 v = l[type].lock_val; - - if (type != SIX_LOCK_read) - v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; - - EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); - state = atomic_sub_return_release(v, &lock->state); - } - - six_lock_wakeup(lock, state, l[type].unlock_wakeup); -} - -/** - * six_unlock_ip - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * When a lock is held multiple times (because six_lock_increment() was used), - * this decrements the 'lock held' counter by one. 
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1 - * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0 - */ -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -{ - EBUG_ON(type == SIX_LOCK_write && - !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - EBUG_ON((type == SIX_LOCK_write || - type == SIX_LOCK_intent) && - lock->owner != current); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map, ip); - - if (type == SIX_LOCK_intent && - lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - - if (type == SIX_LOCK_write && - lock->write_lock_recurse) { - --lock->write_lock_recurse; - return; - } - - if (type == SIX_LOCK_write) - lock->seq++; - - do_six_unlock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_unlock_ip); - -/** - * six_lock_downgrade - convert an intent lock to a read lock - * @lock: lock to downgrade - * - * @lock will have read count incremented and intent count decremented - */ -void six_lock_downgrade(struct six_lock *lock) -{ - six_lock_increment(lock, SIX_LOCK_read); - six_unlock_intent(lock); -} -EXPORT_SYMBOL_GPL(six_lock_downgrade); - -/** - * six_lock_tryupgrade - attempt to convert read lock to an intent lock - * @lock: lock to upgrade - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_lock_tryupgrade(struct six_lock *lock) -{ - u32 old = atomic_read(&lock->state), new; - - do { - new = old; - - if (new & SIX_LOCK_HELD_intent) - return false; - - if (!lock->readers) { - EBUG_ON(!(new & SIX_LOCK_HELD_read)); - new -= l[SIX_LOCK_read].lock_val; - } - - new |= SIX_LOCK_HELD_intent; - } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); - - if (lock->readers) - this_cpu_dec(*lock->readers); - - six_set_owner(lock, SIX_LOCK_intent, old, current); - - return true; -} -EXPORT_SYMBOL_GPL(six_lock_tryupgrade); - -/** - * six_trylock_convert - attempt to convert a held lock from one type to another - * @lock: lock to convert - * @from: SIX_LOCK_read or SIX_LOCK_intent - * @to: SIX_LOCK_read or SIX_LOCK_intent - * - * On success, @lock will have intent count incremented and read count - * decremented - * - * Return: true on success, false on failure - */ -bool six_trylock_convert(struct six_lock *lock, - enum six_lock_type from, - enum six_lock_type to) -{ - EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); - - if (to == from) - return true; - - if (to == SIX_LOCK_read) { - six_lock_downgrade(lock); - return true; - } else { - return six_lock_tryupgrade(lock); - } -} -EXPORT_SYMBOL_GPL(six_trylock_convert); - -/** - * six_lock_increment - increase held lock count on a lock that is already held - * @lock: lock to increment - * @type: SIX_LOCK_read or SIX_LOCK_intent - * - * @lock must already be held, with a lock type that is greater than or equal to - * @type - * - * A corresponding six_unlock_type() call will be required for @lock to be fully - * unlocked. 
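The recursion counters that six_unlock_ip() consults above can be pictured with a stripped-down model: re-acquisitions only bump a counter, and a real state change happens only once the counter has drained. A hedged sketch, not the kernel structure:

    #include <stdbool.h>

    /* Stripped-down model of the intent recursion counter */
    struct toy_lock {
            unsigned intent_recurse;        /* extra holds beyond the first */
            bool     held_intent;           /* the real state bit */
    };

    static void toy_intent_increment(struct toy_lock *l)    /* cf. six_lock_increment() */
    {
            l->intent_recurse++;
    }

    static void toy_intent_unlock(struct toy_lock *l)       /* cf. six_unlock_ip() */
    {
            if (l->intent_recurse) {
                    --l->intent_recurse;    /* still held by this thread */
                    return;
            }
            l->held_intent = false;         /* real unlock; wakeups would go here */
    }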
- */ -void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -{ - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); - - /* XXX: assert already locked, and that we don't overflow: */ - - switch (type) { - case SIX_LOCK_read: - if (lock->readers) { - this_cpu_inc(*lock->readers); - } else { - EBUG_ON(!(atomic_read(&lock->state) & - (SIX_LOCK_HELD_read| - SIX_LOCK_HELD_intent))); - atomic_add(l[type].lock_val, &lock->state); - } - break; - case SIX_LOCK_write: - lock->write_lock_recurse++; - fallthrough; - case SIX_LOCK_intent: - EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); - lock->intent_lock_recurse++; - break; - } -} -EXPORT_SYMBOL_GPL(six_lock_increment); - -/** - * six_lock_wakeup_all - wake up all waiters on @lock - * @lock: lock to wake up waiters for - * - * Waking up waiters will cause them to re-run should_sleep_fn, which may then - * abort the lock operation. - * - * This function is never needed in a bug-free program; it's only useful in - * debug code, e.g. to determine if a cycle detector is at fault. - */ -void six_lock_wakeup_all(struct six_lock *lock) -{ - u32 state = atomic_read(&lock->state); - struct six_lock_waiter *w; - - six_lock_wakeup(lock, state, SIX_LOCK_read); - six_lock_wakeup(lock, state, SIX_LOCK_intent); - six_lock_wakeup(lock, state, SIX_LOCK_write); - - raw_spin_lock(&lock->wait_lock); - list_for_each_entry(w, &lock->wait_list, list) - wake_up_process(w->task); - raw_spin_unlock(&lock->wait_lock); -} -EXPORT_SYMBOL_GPL(six_lock_wakeup_all); - -/** - * six_lock_counts - return held lock counts, for each lock type - * @lock: lock to return counters for - * - * Return: the number of times a lock is held for read, intent and write. - */ -struct six_lock_count six_lock_counts(struct six_lock *lock) -{ - struct six_lock_count ret; - - ret.n[SIX_LOCK_read] = !lock->readers - ? atomic_read(&lock->state) & SIX_LOCK_HELD_read - : pcpu_read_count(lock); - ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + - lock->intent_lock_recurse; - ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); - - return ret; -} -EXPORT_SYMBOL_GPL(six_lock_counts); - -/** - * six_lock_readers_add - directly manipulate reader count of a lock - * @lock: lock to add/subtract readers for - * @nr: reader count to add/subtract - * - * When an upper layer is implementing lock reentrancy, we may have both read - * and intent locks on the same lock. - * - * When we need to take a write lock, the read locks will cause self-deadlock, - * because six locks themselves do not track which read locks are held by the - * current thread and which are held by a different thread - they do no - * per-thread tracking of held locks. - * - * The upper layer that is tracking held locks may however, if trylock() has - * failed, count up its own read locks, subtract them, take the write lock, and - * then re-add them. - * - * As in any other situation when taking a write lock, @lock must be held for - * intent one (or more) times, so @lock will never be left unlocked. 
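The write-lock dance that the six_lock_readers_add() comment describes looks roughly like the following when spelled out — a sketch assuming the caller already holds the lock for intent and tracks nr_read_held itself:

    /* Sketch only: shed this thread's read holds, take the write lock, restore.
     * Assumes @lock is already held for intent, and that @nr_read_held is the
     * upper layer's own count of read holds on this lock (six locks do no
     * per-thread tracking of readers themselves). */
    static void write_lock_with_own_readers(struct six_lock *lock, int nr_read_held)
    {
            six_lock_readers_add(lock, -nr_read_held);      /* our readers vanish */
            six_lock_write(lock);                           /* cannot self-deadlock now */
            /* ... modify the structure @lock protects ... */
            six_unlock_write(lock);
            six_lock_readers_add(lock, nr_read_held);       /* restore our read holds */
    }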
- */ -void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (lock->readers) { - this_cpu_add(*lock->readers, nr); - } else { - EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); - /* reader count starts at bit 0 */ - atomic_add(nr, &lock->state); - } -} -EXPORT_SYMBOL_GPL(six_lock_readers_add); - -/** - * six_lock_exit - release resources held by a lock prior to freeing - * @lock: lock to exit - * - * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is - * required to free the percpu read counts. - */ -void six_lock_exit(struct six_lock *lock) -{ - WARN_ON(lock->readers && pcpu_read_count(lock)); - WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); - - free_percpu(lock->readers); - lock->readers = NULL; -} -EXPORT_SYMBOL_GPL(six_lock_exit); - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags, - gfp_t gfp) -{ - atomic_set(&lock->state, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - - /* - * Don't assume that we have real percpu variables available in - * userspace: - */ -#ifdef __KERNEL__ - if (flags & SIX_LOCK_INIT_PCPU) { - /* - * We don't return an error here on memory allocation failure - * since percpu is an optimization, and locks will work with the - * same semantics in non-percpu mode: callers can check for - * failure if they wish by checking lock->readers, but generally - * will not want to treat it as an error. - */ - lock->readers = alloc_percpu_gfp(unsigned, gfp); - } -#endif -} -EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h deleted file mode 100644 index 59b851cf8bacc4..00000000000000 --- a/fs/bcachefs/six.h +++ /dev/null @@ -1,388 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _LINUX_SIX_H -#define _LINUX_SIX_H - -/** - * DOC: SIX locks overview - * - * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores - * but with an additional state: read/shared, intent, exclusive/write - * - * The purpose of the intent state is to allow for greater concurrency on tree - * structures without deadlocking. In general, a read can't be upgraded to a - * write lock without deadlocking, so an operation that updates multiple nodes - * will have to take write locks for the full duration of the operation. - * - * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at the start of the operation, - * and then take write locks only for the actual update to each individual - * node, without deadlocking. 
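A short usage sketch of the initialization path above: percpu reader mode is requested with SIX_LOCK_INIT_PCPU, and because __six_lock_init() may allocate lock->readers, it must be paired with six_lock_exit() before the lock's memory is freed. struct foo and its helpers are hypothetical:

    struct foo {
            struct six_lock lock;
    };

    static void foo_init(struct foo *f)
    {
            /* Percpu readers are an optimization: if the percpu allocation
             * fails, the lock transparently works in plain atomic mode. */
            six_lock_init(&f->lock, SIX_LOCK_INIT_PCPU, GFP_KERNEL);
    }

    static void foo_free(struct foo *f)
    {
            six_lock_exit(&f->lock);    /* frees lock->readers, if allocated */
            /* now safe to free *f */
    }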
- * - * Example usage: - * six_lock_read(&foo->lock); - * six_unlock_read(&foo->lock); - * - * An intent lock must be held before taking a write lock: - * six_lock_intent(&foo->lock); - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); - * six_unlock_intent(&foo->lock); - * - * Other operations: - * six_trylock_read() - * six_trylock_intent() - * six_trylock_write() - * - * six_lock_downgrade() convert from intent to read - * six_lock_tryupgrade() attempt to convert from read to intent, may fail - * - * There are also interfaces that take the lock type as an enum: - * - * six_lock_type(&foo->lock, SIX_LOCK_read); - * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) - * six_lock_type(&foo->lock, SIX_LOCK_write); - * six_unlock_type(&foo->lock, SIX_LOCK_write); - * six_unlock_type(&foo->lock, SIX_LOCK_intent); - * - * Lock sequence numbers - unlock(), relock(): - * - * Locks embed sequence numbers, which are incremented on write lock/unlock. - * This allows locks to be dropped and then retaken iff the state they protect - * hasn't changed; this makes it much easier to avoid holding locks while e.g. - * doing IO or allocating memory. - * - * Example usage: - * six_lock_read(&foo->lock); - * u32 seq = six_lock_seq(&foo->lock); - * six_unlock_read(&foo->lock); - * - * some_operation_that_may_block(); - * - * if (six_relock_read(&foo->lock, seq)) { ... } - * - * If the relock operation succeeds, it is as if the lock was never unlocked. - * - * Reentrancy: - * - * Six locks are not by themselves reentrant, but have counters for both the - * read and intent states that can be used to provide reentrancy by an upper - * layer that tracks held locks. If a lock is known to already be held in the - * read or intent state, six_lock_increment() can be used to bump the "lock - * held in this state" counter, increasing the number of unlock calls that - * will be required to fully unlock it. - * - * Example usage: - * six_lock_read(&foo->lock); - * six_lock_increment(&foo->lock, SIX_LOCK_read); - * six_unlock_read(&foo->lock); - * six_unlock_read(&foo->lock); - * foo->lock is now fully unlocked. - * - * Since the intent state supersedes read, it's legal to increment the read - * counter when holding an intent lock, but not the reverse. - * - * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) - * is not legal. - * - * should_sleep_fn: - * - * There is a six_lock() variant that takes a function pointer that is called - * immediately prior to schedule() when blocking, and may return an error to - * abort. - * - * One possible use for this feature is when objects being locked are part of - * a cache and may be reused, and lock ordering is based on a property of the - * object that will change when the object is reused - i.e. logical key order. - * - * If looking up an object in the cache may race with object reuse, and lock - * ordering is required to prevent deadlock, object reuse may change the - * correct lock order for that object and cause a deadlock. should_sleep_fn - * can be used to check if the object is still the object we want and avoid - * this deadlock. - * - * Wait list entry interface: - * - * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a - * wait list entry. By embedding six_lock_waiter into another object, and by - * traversing lock waitlists, it is then possible for an upper layer to - * implement full cycle detection for deadlock avoidance.
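- *
- * A hypothetical upper layer might embed the waiter like so (editor's
- * sketch; "trans_waiter" and "held_locks" are illustrative names, not
- * types from this file):
- *
- *    struct trans_waiter {
- *            struct six_lock_waiter  w;
- *            struct list_head        held_locks;
- *    };
- *
- * A cycle detector walking a lock's wait_list can then use container_of()
- * to get from each six_lock_waiter back to that thread's held locks.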
- * - * should_sleep_fn should be used for invoking the cycle detector, walking the - * graph of held locks to check for a deadlock. The upper layer must track - * held locks for each thread, and each thread's held locks must be reachable - * from its six_lock_waiter object. - * - * six_lock_waiter() will add the wait object to the waitlist before re-trying - * to take the lock and before calling should_sleep_fn, and the wait object - * will not be removed from the waitlist until either the lock has been - * successfully acquired, or the attempt was aborted because should_sleep_fn - * returned an error. - * - * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will - * have timestamps in strictly ascending order - this is so the timestamp can - * be used as a cursor for lock graph traversal. - */ - -#include <linux/lockdep.h> -#include <linux/sched.h> -#include <linux/types.h> - -enum six_lock_type { - SIX_LOCK_read, - SIX_LOCK_intent, - SIX_LOCK_write, -}; - -struct six_lock { - atomic_t state; - u32 seq; - unsigned intent_lock_recurse; - unsigned write_lock_recurse; - struct task_struct *owner; - unsigned __percpu *readers; - raw_spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; - enum six_lock_type lock_want; - bool lock_acquired; - u64 start_time; -}; - -typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); - -void six_lock_exit(struct six_lock *lock); - -enum six_lock_init_flags { - SIX_LOCK_INIT_PCPU = 1U << 0, -}; - -void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags, - gfp_t gfp); - -/** - * six_lock_init - initialize a six lock - * @lock: lock to initialize - * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU - * @gfp: allocation flags for the percpu reader counts, used when - * SIX_LOCK_INIT_PCPU is set - */ -#define six_lock_init(lock, flags, gfp) \ -do { \ - static struct lock_class_key __key; \ - \ - __six_lock_init((lock), #lock, &__key, flags, gfp); \ -} while (0) - -/** - * six_lock_seq - obtain current lock sequence number - * @lock: six_lock to obtain sequence number for - * - * @lock should be held for read or intent, and not write - * - * By saving the lock sequence number, we can unlock @lock and then (typically - * after some blocking operation) attempt to relock it: the relock will succeed - * if the sequence number hasn't changed, meaning no write locks have been taken - * and state corresponding to what @lock protects is still valid. - */ -static inline u32 six_lock_seq(const struct six_lock *lock) -{ - return lock->seq; -} - -bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); - -/** - * six_trylock_type - attempt to take a six lock without blocking - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * - * Return: true on success, false on failure.
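- *
- * One common pattern (editor's sketch; "foo" is hypothetical) is to try the
- * non-blocking variant first and fall back to the blocking one:
- *
- *    if (!six_trylock_type(&foo->lock, SIX_LOCK_intent))
- *            ret = six_lock_type(&foo->lock, SIX_LOCK_intent, NULL, NULL);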
- */ -static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - return six_trylock_ip(lock, type, _THIS_IP_); -} - -int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip); - -/** - * six_lock_waiter - take a lock, with full waitlist interface - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @wait: pointer to wait object, which will be added to lock's waitlist - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * - * This is a convenience wrapper around six_lock_ip_waiter(), see that function - * for full documentation. - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); -} - -/** - * six_lock_ip - take a six lock - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct six_lock_waiter wait; - - return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); -} - -/** - * six_lock_type - take a six lock - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @should_sleep_fn: callback run after adding to waitlist, immediately prior - * to scheduling - * @p: passed through to @should_sleep_fn - * - * Return: 0 on success, or the return code from @should_sleep_fn on failure. - */ -static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - struct six_lock_waiter wait; - - return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); -} - -bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, - unsigned seq, unsigned long ip); - -/** - * six_relock_type - attempt to re-take a lock that was held previously - * @lock: lock to take - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * @seq: lock sequence number obtained from six_lock_seq() while lock was - * held previously - * - * Return: true on success, false on failure. - */ -static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - return six_relock_ip(lock, type, seq, _THIS_IP_); -} - -void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); - -/** - * six_unlock_type - drop a six lock - * @lock: lock to unlock - * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write - * - * When a lock is held multiple times (because six_lock_increment() was used), - * this decrements the 'lock held' counter by one.
- * - * For example: - * six_lock_read(&foo->lock); read count 1 - * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 - * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1 - * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0 - */ -static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - six_unlock_ip(lock, type, _THIS_IP_); -} - -#define __SIX_LOCK(type) \ -static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} \ - \ -static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p,\ - unsigned long ip) \ -{ \ - return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -} \ - \ -static inline int six_lock_ip_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p, \ - unsigned long ip) \ -{ \ - return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -} \ - \ -static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -} \ - \ -static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -{ \ - return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -} \ - \ -static inline int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn fn, void *p)\ -{ \ - return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -} \ - \ -static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -} \ - \ -static inline void six_unlock_##type(struct six_lock *lock) \ -{ \ - six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -} - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) -#undef __SIX_LOCK - -void six_lock_downgrade(struct six_lock *); -bool six_lock_tryupgrade(struct six_lock *); -bool six_trylock_convert(struct six_lock *, enum six_lock_type, - enum six_lock_type); - -void six_lock_increment(struct six_lock *, enum six_lock_type); - -void six_lock_wakeup_all(struct six_lock *); - -struct six_lock_count { - unsigned n[3]; -}; - -struct six_lock_count six_lock_counts(struct six_lock *); -void six_lock_readers_add(struct six_lock *, int); - -#endif /* _LINUX_SIX_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c deleted file mode 100644 index 4c43d2a2c1f5bb..00000000000000 --- a/fs/bcachefs/snapshot.c +++ /dev/null @@ -1,2043 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "bbpos.h" -#include "bkey_buf.h" -#include "btree_cache.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "buckets.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "recovery_passes.h" -#include "snapshot.h" - -#include <linux/random.h> - -/* - * Snapshot trees: - * - * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they - * exist to provide a stable identifier for the whole lifetime of a snapshot - * tree.
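- *
- * For example (editor's note, grounded in bch2_snapshot_node_delete() below):
- * when the root snapshot of a tree is deleted, the tree's root_snapshot field
- * is repointed at the surviving child, but the snapshot_tree key itself - and
- * hence the tree's ID - stays the same.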
- */ - -void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); - - prt_printf(out, "subvol %u root snapshot %u", - le32_to_cpu(t.v->master_subvol), - le32_to_cpu(t.v->root_snapshot)); -} - -int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), - c, snapshot_tree_pos_bad, - "bad pos"); -fsck_err: - return ret; -} - -int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot_tree *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_with_updates, snapshot_tree, s); - - if (bch2_err_matches(ret, ENOENT)) - ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); - return ret; -} - -struct bkey_i_snapshot_tree * -__bch2_snapshot_tree_create(struct btree_trans *trans) -{ - struct btree_iter iter; - int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_snapshot_trees, POS(0, U32_MAX)); - struct bkey_i_snapshot_tree *s_t; - - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); - if (ret) - return ERR_PTR(ret); - - s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - bch2_trans_iter_exit(trans, &iter); - return ret ? ERR_PTR(ret) : s_t; -} - -static int bch2_snapshot_tree_create(struct btree_trans *trans, - u32 root_id, u32 subvol_id, u32 *tree_id) -{ - struct bkey_i_snapshot_tree *n_tree = - __bch2_snapshot_tree_create(trans); - - if (IS_ERR(n_tree)) - return PTR_ERR(n_tree); - - n_tree->v.master_subvol = cpu_to_le32(subvol_id); - n_tree->v.root_snapshot = cpu_to_le32(root_id); - *tree_id = n_tree->k.p.offset; - return 0; -} - -/* Snapshot nodes: */ - -static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) -{ - while (id && id < ancestor) { - const struct snapshot_t *s = __snapshot_t(t, id); - id = s ? s->parent : 0; - } - return id == ancestor; -} - -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - guard(rcu)(); - return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); -} - -static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; - - if (s->skip[2] <= ancestor) - return s->skip[2]; - if (s->skip[1] <= ancestor) - return s->skip[1]; - if (s->skip[0] <= ancestor) - return s->skip[0]; - return s->parent; -} - -static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) -{ - const struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return false; - - return test_bit(ancestor - id - 1, s->is_ancestor); -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - u32 orig_id = id; -#endif - - guard(rcu)(); - struct snapshot_table *t = rcu_dereference(c->snapshots); - - if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) - return __bch2_snapshot_is_ancestor_early(t, id, ancestor); - - if (likely(ancestor >= IS_ANCESTOR_BITMAP)) - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); - - bool ret = id && id < ancestor - ? 
test_ancestor_bitmap(t, id, ancestor) - : id == ancestor; - - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); - return ret; -} - -static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *new, *old; - - size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); - size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - - if (unlikely(new_bytes > INT_MAX)) - return NULL; - - new = kvzalloc(new_bytes, GFP_KERNEL); - if (!new) - return NULL; - - new->nr = new_size; - - old = rcu_dereference_protected(c->snapshots, true); - if (old) - memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); - - rcu_assign_pointer(c->snapshots, new); - kvfree_rcu(old, rcu); - - return &rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock))->s[idx]; -} - -static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -{ - size_t idx = U32_MAX - id; - struct snapshot_table *table = - rcu_dereference_protected(c->snapshots, - lockdep_is_held(&c->snapshot_table_lock)); - - lockdep_assert_held(&c->snapshot_table_lock); - - if (likely(table && idx < table->nr)) - return &table->s[idx]; - - return __snapshot_t_mut(c, id); -} - -void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - prt_str(out, "subvol "); - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) - prt_str(out, "will_delete "); - if (BCH_SNAPSHOT_DELETED(s.v)) - prt_str(out, "deleted "); - - prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", - le32_to_cpu(s.v->parent), - le32_to_cpu(s.v->children[0]), - le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol), - le32_to_cpu(s.v->tree)); - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) - prt_printf(out, " depth %u skiplist %u %u %u", - le32_to_cpu(s.v->depth), - le32_to_cpu(s.v->skip[0]), - le32_to_cpu(s.v->skip[1]), - le32_to_cpu(s.v->skip[2])); -} - -int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_snapshot s; - u32 i, id; - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), - c, snapshot_pos_bad, - "bad pos"); - - s = bkey_s_c_to_snapshot(k); - - id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, - c, snapshot_parent_bad, - "bad parent node (%u <= %llu)", - id, k.k->p.offset); - - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), - c, snapshot_children_not_normalized, - "children not normalized"); - - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], - c, snapshot_child_duplicate, - "duplicate child nodes"); - - for (i = 0; i < 2; i++) { - id = le32_to_cpu(s.v->children[i]); - - bkey_fsck_err_on(id >= k.k->p.offset, - c, snapshot_child_bad, - "bad child node (%u >= %llu)", - id, k.k->p.offset); - } - - if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), - c, snapshot_skiplist_not_normalized, - "skiplist not normalized"); - - for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { - id = le32_to_cpu(s.v->skip[i]); - - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), - c, snapshot_skiplist_bad, - "bad skiplist node %u", id); - } - } -fsck_err: - return ret; -} - 
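-/*
- * Editor's note, sketching how the ancestor check above resolves (the real
- * IS_ANCESTOR_BITMAP constant is defined in snapshot.h):
- *
- *	bch2_snapshot_is_ancestor(c, id, ancestor)
- *	  -> hop up the skiplist via get_ancestor_below() until id is within
- *	     IS_ANCESTOR_BITMAP of ancestor, then
- *	  -> test_bit(ancestor - id - 1, s->is_ancestor) answers in O(1),
- *	     using the per-node bitmap maintained by __bch2_mark_snapshot().
- */
-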
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - int ret = snapshot_t_mut(c, id) - ? 0 - : bch_err_throw(c, ENOMEM_mark_snapshot); - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -static int __bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct snapshot_t *t; - u32 id = new.k->p.offset; - int ret = 0; - - mutex_lock(&c->snapshot_table_lock); - - t = snapshot_t_mut(c, id); - if (!t) { - ret = bch_err_throw(c, ENOMEM_mark_snapshot); - goto err; - } - - if (new.k->type == KEY_TYPE_snapshot) { - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - - t->state = !BCH_SNAPSHOT_DELETED(s.v) - ? SNAPSHOT_ID_live - : SNAPSHOT_ID_deleted; - t->parent = le32_to_cpu(s.v->parent); - t->children[0] = le32_to_cpu(s.v->children[0]); - t->children[1] = le32_to_cpu(s.v->children[1]); - t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; - t->tree = le32_to_cpu(s.v->tree); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { - t->depth = le32_to_cpu(s.v->depth); - t->skip[0] = le32_to_cpu(s.v->skip[0]); - t->skip[1] = le32_to_cpu(s.v->skip[1]); - t->skip[2] = le32_to_cpu(s.v->skip[2]); - } else { - t->depth = 0; - t->skip[0] = 0; - t->skip[1] = 0; - t->skip[2] = 0; - } - - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); - - if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) - bch2_delete_dead_snapshots_async(c); - } - } else { - memset(t, 0, sizeof(*t)); - } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; -} - -int bch2_mark_snapshot(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); -} - -int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s) -{ - return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_with_updates, snapshot, s); -} - -/* fsck: */ - -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -{ - return snapshot_t(c, id)->children[child]; -} - -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 0); -} - -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_child(c, id, 1); -} - -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -{ - u32 n, parent; - - n = bch2_snapshot_left_child(c, id); - if (n) - return n; - - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); - if (n && n != id) - return n; - id = parent; - } - - return 0; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, - snapshot_id_list *skip) -{ - guard(rcu)(); - u32 id, subvol = 0, s; -retry: - id = snapshot_root; - while (id && bch2_snapshot_exists(c, id)) { - if (!(skip && snapshot_list_has_id(skip, id))) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; - } - id = bch2_snapshot_tree_next(c, id); - if (id == snapshot_root) - break; - } - - if (!subvol && skip) { - skip = 
NULL; - goto retry; - } - - return subvol; -} - -static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, - u32 snapshot_root, u32 *subvol_id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type != KEY_TYPE_subvolume) - continue; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) - continue; - if (!BCH_SUBVOLUME_SNAP(s.v)) { - *subvol_id = s.k->p.offset; - found = true; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *u; - - *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); - - u = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - SET_BCH_SUBVOLUME_SNAP(&u->v, false); - } - - return ret; -} - -static int check_snapshot_tree(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - struct btree_iter snapshot_iter = {}; - u32 root_id; - int ret; - - if (k.k->type != KEY_TYPE_snapshot_tree) - return 0; - - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); - - struct bkey_s_c_snapshot snapshot_k = - bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, - POS(0, root_id), 0, snapshot); - ret = bkey_err(snapshot_k); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (!ret) - bkey_val_copy(&s, snapshot_k); - - if (fsck_err_on(ret || - root_id != bch2_snapshot_root(c, root_id) || - st.k->p.offset != le32_to_cpu(s.tree), - trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n%s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), - prt_newline(&buf), - ret - ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) - : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), - buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } - - if (!st.v->master_subvol) - goto out; - - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, - trans, snapshot_tree_to_missing_subvol, - "snapshot tree points to missing subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(!bch2_snapshot_is_ancestor(c, - le32_to_cpu(subvol.snapshot), - root_id), - trans, snapshot_tree_to_wrong_subvol, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), - trans, snapshot_tree_to_snapshot_subvol, - "snapshot tree points to snapshot subvolume:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { - struct bkey_i_snapshot_tree *u; - u32 subvol_id; - - ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); - bch_err_fn(c, ret); - - if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ - ret = 0; - goto err; - } - - if (ret) - goto err; - - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.master_subvol = cpu_to_le32(subvol_id); - st = snapshot_tree_i_to_s_c(u); - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &snapshot_iter); - printbuf_exit(&buf); - return ret; -} - -/* - * For each snapshot_tree, make sure it points to the root of a snapshot tree - * and that snapshot entry points back to it, or delete it. - * - * And, make sure it points to a subvolume within that snapshot tree, or correct - * it to point to the oldest subvolume within that snapshot tree. - */ -int bch2_check_snapshot_trees(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_tree(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -/* - * Look up snapshot tree for @tree_id and find root, - * make sure @snap_id is a descendent: - */ -static int snapshot_tree_ptr_good(struct btree_trans *trans, - u32 snap_id, u32 tree_id) -{ - struct bch_snapshot_tree s_t; - int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - - if (bch2_err_matches(ret, ENOENT)) - return 0; - if (ret) - return ret; - - return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -} - -u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) -{ - if (!id) - return 0; - - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s->parent - ? 
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) - : id; -} - -static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) -{ - unsigned i; - - for (i = 0; i < 3; i++) - if (!s.parent) { - if (s.skip[i]) - return false; - } else { - if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) - return false; - } - - return true; -} - -/* - * snapshot_tree pointer was incorrect: look up root snapshot node, make sure - * its snapshot_tree pointer is correct (allocate new one if necessary), then - * update this node's pointer to root node's pointer: - */ -static int snapshot_tree_ptr_repair(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bch_snapshot *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; - struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; - - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_with_updates, snapshot); - ret = bkey_err(root); - if (ret) - goto err; - - tree_id = le32_to_cpu(root.v->tree); - - ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { - u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u) ?: - bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_oldest_subvol(c, root_id, NULL), - &tree_id); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - if (k.k->p.offset == root_id) - *s = u->v; - } - - if (k.k->p.offset != root_id) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.tree = cpu_to_le32(tree_id); - *s = u->v; - } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; -} - -static int check_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct bch_snapshot v; - struct bkey_i_snapshot *u; - u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); - u32 real_depth; - struct printbuf buf = PRINTBUF; - u32 i, id; - int ret = 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); - - if (BCH_SNAPSHOT_DELETED(&s)) - return 0; - - id = le32_to_cpu(s.parent); - if (id) { - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (le32_to_cpu(v.children[0]) != k.k->p.offset && - le32_to_cpu(v.children[1]) != k.k->p.offset) { - bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - for (i = 0; i < 2 && s.children[i]; i++) { - id = le32_to_cpu(s.children[i]); - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot node %llu has nonexistent child %u", - k.k->p.offset, id); - if (ret) - goto err; - - if (le32_to_cpu(v.parent) != k.k->p.offset) { - bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), k.k->p.offset); - ret = -EINVAL; - goto err; - } - } - - bool should_have_subvol = 
BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_WILL_DELETE(&s); - - if (should_have_subvol) { - id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, false, &subvol); - if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (ret) - goto err; - - if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { - bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - k.k->p.offset); - ret = -EINVAL; - goto err; - } - } else { - if (fsck_err_on(s.subvol, - trans, snapshot_should_not_have_subvol, - "snapshot should not point to subvol:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.subvol = 0; - s = u->v; - } - } - - ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_to_bad_snapshot_tree, - "snapshot points to missing/incorrect tree:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = snapshot_tree_ptr_repair(trans, iter, k, &s); - if (ret) - goto err; - } - ret = 0; - - real_depth = bch2_snapshot_depth(c, parent_id); - - if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, - trans, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n%s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - u->v.depth = cpu_to_le32(real_depth); - s = u->v; - } - - ret = snapshot_skiplist_good(trans, k.k->p.offset, s); - if (ret < 0) - goto err; - - if (fsck_err_on(!ret, - trans, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) - u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); - - bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); - s = u->v; - } - ret = 0; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int bch2_check_snapshots(struct bch_fs *c) -{ - /* - * We iterate backwards as checking/fixing the depth field requires that - * the parent's depth already be correct: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_snapshot_exists(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - - /* Do we need to reconstruct the snapshot_tree entry as well? 
*/ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u32 tree_id = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_snapshot_tree && - le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { - tree_id = k.k->p.offset; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - return ret; - - if (!tree_id) { - ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); - if (ret) - return ret; - } - - struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); - ret = PTR_ERR_OR_ZERO(snapshot); - if (ret) - return ret; - - bkey_snapshot_init(&snapshot->k_i); - snapshot->k.p = POS(0, id); - snapshot->v.tree = cpu_to_le32(tree_id); - snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - - for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, - 0, k, ret) { - if (k.k->type == KEY_TYPE_subvolume && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { - snapshot->v.subvol = cpu_to_le32(k.k->p.offset); - SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return bch2_snapshot_table_make_room(c, id) ?: - bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); -} - -/* Figure out which snapshot nodes belong in the same tree: */ -struct snapshot_tree_reconstruct { - enum btree_id btree; - struct bpos cur_pos; - snapshot_id_list cur_ids; - DARRAY(snapshot_id_list) trees; -}; - -static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) -{ - darray_for_each(r->trees, i) - darray_exit(i); - darray_exit(&r->trees); - darray_exit(&r->cur_ids); -} - -static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - return r->btree == BTREE_ID_inodes - ? 
r->cur_pos.offset == pos.offset - : r->cur_pos.inode == pos.inode; -} - -static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) -{ - return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; -} - -static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) -{ - bool first = true; - darray_for_each(*s, i) { - if (!first) - prt_char(out, ' '); - first = false; - prt_printf(out, "%u", *i); - } -} - -static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) -{ - if (r->cur_ids.nr) { - darray_for_each(r->trees, i) - if (snapshot_id_lists_have_common(i, &r->cur_ids)) { - int ret = snapshot_list_merge(c, i, &r->cur_ids); - if (ret) - return ret; - goto out; - } - darray_push(&r->trees, r->cur_ids); - darray_init(&r->cur_ids); - } -out: - r->cur_ids.nr = 0; - return 0; -} - -static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) -{ - if (!same_snapshot(r, pos)) - snapshot_tree_reconstruct_next(c, r); - r->cur_pos = pos; - return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); -} - -int bch2_reconstruct_snapshots(struct bch_fs *c) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; - struct snapshot_tree_reconstruct r = {}; - int ret = 0; - - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - if (btree_type_has_snapshots(btree)) { - r.btree = btree; - - ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ - get_snapshot_trees(c, &r, k.k->p); - })); - if (ret) - goto err; - - snapshot_tree_reconstruct_next(c, &r); - } - } - - darray_for_each(r.trees, t) { - printbuf_reset(&buf); - snapshot_id_list_to_text(&buf, t); - - darray_for_each(*t, id) { - if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, - trans, snapshot_node_missing, - "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { - if (t->nr > 1) { - bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_exists(trans, *id)); - if (ret) - goto err; - } - } - } -fsck_err: -err: - bch2_trans_put(trans); - snapshot_tree_reconstruct_exit(&r); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -int __bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - - /* Snapshot was definitively deleted, this error is marked autofix */ - if (fsck_err_on(state == SNAPSHOT_ID_deleted, - trans, bkey_in_deleted_snapshot, - "key in deleted snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - - if (state == SNAPSHOT_ID_empty) { - /* - * Snapshot missing: we should have caught this with btree_lost_data and - * kicked off reconstruct_snapshots, so if we end up here we have no - * idea what happened. 
- * - * Do not delete unless we know that subvolumes and snapshots - * are consistent: - * - * XXX: - * - * We could be smarter here, and instead of using the generic - * recovery pass ratelimiting, track if there have been any - * changes to the snapshots or inodes btrees since those passes - * last ran. - */ - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; - - if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) - ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - - unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); - - if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; - } - } -fsck_err: - printbuf_exit(&buf); - return ret; -} - -int __bch2_get_snapshot_overwrites(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), - BTREE_ITER_all_snapshots, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || - snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) - continue; - - ret = snapshot_list_add(c, s, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - if (ret) - darray_exit(s); - - return ret; -} - -/* - * Mark a snapshot as deleted, for future cleanup: - */ -int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? 
*/ - if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) -{ - if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) - swap(s->children[0], s->children[1]); -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = {}; - struct btree_iter c_iter = {}; - struct btree_iter tree_iter = {}; - u32 parent_id, child_id; - unsigned i; - int ret = 0; - - struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = PTR_ERR_OR_ZERO(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); - BUG_ON(s->v.children[1]); - - parent_id = le32_to_cpu(s->v.parent); - child_id = le32_to_cpu(s->v.children[0]); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - if (unlikely(ret)) - goto err; - - /* find entry in parent->children for node being deleted */ - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (bch2_fs_inconsistent_on(i == 2, c, - "snapshot %u missing child pointer to %u", - parent_id, id)) - goto err; - - parent->v.children[i] = cpu_to_le32(child_id); - - normalize_snapshot_child_pointers(&parent->v); - } - - if (child_id) { - struct bkey_i_snapshot *child; - - child = bch2_bkey_get_mut_typed(trans, &c_iter, - BTREE_ID_snapshots, POS(0, child_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(child); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", child_id); - if (unlikely(ret)) - goto err; - - child->v.parent = cpu_to_le32(parent_id); - - if (!child->v.parent) { - child->v.skip[0] = 0; - child->v.skip[1] = 0; - child->v.skip[2] = 0; - } - } - - if (!parent_id) { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s->v.children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s->v.children[0]) { - s_t->v.root_snapshot = s->v.children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - s->v.parent = 0; - s->v.children[0] = 0; - s->v.children[1] = 0; - s->v.subvol = 0; - s->v.tree = 0; - s->v.depth = 0; - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - s->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s->k, 0); - } -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &c_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, 
u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(trans, &iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !k.k->p.offset) { - ret = bch_err_throw(c, ENOSPC_snapshot_create); - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); - n->v.btime.hi = 0; - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __bch2_snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? 
bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -/* - * If we have an unlinked inode in an internal snapshot node, and the inode - * really has been deleted in all child snapshots, how does this get cleaned up? - * - * first there is the problem of how keys that have been overwritten in all - * child snapshots get deleted (unimplemented?), but inodes may perhaps be - * special? - * - * also: unlinked inode in internal snapshot appears to not be getting deleted - * correctly if inode doesn't exist in leaf snapshots - * - * solution: - * - * for a key in an interior snapshot node that needs work to be done that - * requires it to be mutated: iterate over all descendent leaf nodes and copy - * that key to snapshot leaf nodes, where we can mutate it - */ - -static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) -{ - struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); - return i ? i->live_child : 0; -} - -static unsigned __live_child(struct snapshot_table *t, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) -{ - struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) - if (s->children[i] && - !snapshot_list_has_id(delete_leaves, s->children[i]) && - !interior_delete_has_id(delete_interior, s->children[i])) - return s->children[i]; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { - u32 live_child = s->children[i] - ? __live_child(t, s->children[i], delete_leaves, delete_interior) - : 0; - if (live_child) - return live_child; - } - - return 0; -} - -static unsigned live_child(struct bch_fs *c, u32 id) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - guard(rcu)(); - return __live_child(rcu_dereference(c->snapshots), id, - &d->delete_leaves, &d->delete_interior); -} - -static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) -{ - return snapshot_list_has_id(&d->delete_leaves, id) || - interior_delete_has_id(&d->delete_interior, id) != 0; -} - -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct snapshot_delete *d = &trans->c->snapshot_delete; - - if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); - if (live_child) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - new->k.p.snapshot = live_child; - - struct btree_iter dst_iter; - struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, - iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); - ret = bkey_err(dst_k); - if (ret) - return ret; - - ret = (bkey_deleted(dst_k.k) - ? bch2_trans_update(trans, &dst_iter, new, - BTREE_UPDATE_internal_snapshot_node) - : 0) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; - } - - return 0; -} - -static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - u64 inum = iter->btree_id != BTREE_ID_inodes - ? 
iter->pos.inode - : iter->pos.offset; - - if (*prev_inum == inum) - return false; - - *prev_inum = inum; - - bool ret = !snapshot_list_has_id(&d->deleting_from_trees, - bch2_snapshot_tree(c, iter->pos.snapshot)); - if (unlikely(ret)) { - struct bpos pos = iter->pos; - pos.snapshot = 0; - if (iter->btree_id != BTREE_ID_inodes) - pos.offset = U64_MAX; - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); - } - - return ret; -} - -static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - - for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - - d->pos.pos = POS_MIN; - - if (!btree_type_has_snapshots(d->pos.btree)) - continue; - - int ret = for_each_btree_key_commit(trans, iter, - d->pos.btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - - if (ret) - return ret; - } - - return 0; -} - -static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, - struct bpos start, struct bpos end) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - - d->pos.btree = btree; - d->pos.pos = POS_MIN; - - int ret = for_each_btree_key_max_commit(trans, iter, - btree, start, end, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.pos = iter.pos; - delete_dead_snapshots_process_key(trans, &iter, k); - })); - - bch2_disk_reservation_put(c, &res); - return ret; -} - -static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct disk_reservation res = { 0 }; - u64 prev_inum = 0; - int ret = 0; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - - while (1) { - struct bkey_s_c k; - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - if (ret) - break; - - if (!k.k) - break; - - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - if (snapshot_id_dying(d, k.k->p.snapshot)) { - struct bpos start = POS(k.k->p.offset, 0); - struct bpos end = POS(k.k->p.offset, U64_MAX); - - ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: - delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); - if (ret) - break; - - bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); - } else { - bch2_btree_iter_advance(trans, &iter); - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) - goto err; - - prev_inum = 0; - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_inodes, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - d->pos.btree = iter.btree_id; - d->pos.pos = iter.pos; - - if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) - continue; - - delete_dead_snapshots_process_key(trans, &iter, k); - })); -err: - bch2_disk_reservation_put(c, &res); - 
return ret; -} - -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bch_fs *c = trans->c; - struct snapshot_delete *d = &c->snapshot_delete; - struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - unsigned live_children = 0; - int ret = 0; - - if (BCH_SNAPSHOT_SUBVOL(s.v)) - return 0; - - if (BCH_SNAPSHOT_DELETED(s.v)) - return 0; - - mutex_lock(&d->progress_lock); - for (unsigned i = 0; i < 2; i++) { - u32 child = le32_to_cpu(s.v->children[i]); - - live_children += child && - !snapshot_list_has_id(&d->delete_leaves, child); - } - - u32 tree = bch2_snapshot_tree(c, s.k->p.offset); - - if (live_children == 0) { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); - } else if (live_children == 1) { - struct snapshot_interior_delete n = { - .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset), - }; - - if (!n.live_child) { - bch_err(c, "error finding live child of snapshot %u", n.id); - ret = -EINVAL; - } else { - ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: - darray_push(&d->delete_interior, n); - } - } - mutex_unlock(&d->progress_lock); - - return ret; -} - -static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - interior_delete_list *skip) -{ - guard(rcu)(); - while (interior_delete_has_id(skip, id)) - id = __bch2_snapshot_parent(c, id); - - while (n--) { - do { - id = __bch2_snapshot_parent(c, id); - } while (interior_delete_has_id(skip, id)); - } - - return id; -} - -static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k, - interior_delete_list *deleted) -{ - struct bch_fs *c = trans->c; - u32 nr_deleted_ancestors = 0; - struct bkey_i_snapshot *s; - int ret; - - if (!bch2_snapshot_exists(c, k.k->p.offset)) - return 0; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - if (interior_delete_has_id(deleted, k.k->p.offset)) - return 0; - - s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); - - if (!nr_deleted_ancestors) - return 0; - - le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); - - if (!s->v.depth) { - s->v.skip[0] = 0; - s->v.skip[1] = 0; - s->v.skip[2] = 0; - } else { - u32 depth = le32_to_cpu(s->v.depth); - u32 parent = bch2_snapshot_parent(c, s->k.p.offset); - - for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { - u32 id = le32_to_cpu(s->v.skip[j]); - - if (interior_delete_has_id(deleted, id)) { - id = bch2_snapshot_nth_parent_skip(c, - parent, - depth > 1 - ? 
get_random_u32_below(depth - 1) - : 0, - deleted); - s->v.skip[j] = cpu_to_le32(id); - } - } - - bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); - } - - return bch2_trans_update(trans, iter, &s->k_i, 0); -} - -static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) -{ - prt_printf(out, "deleting from trees"); - darray_for_each(d->deleting_from_trees, i) - prt_printf(out, " %u", *i); - - prt_printf(out, "deleting leaves"); - darray_for_each(d->delete_leaves, i) - prt_printf(out, " %u", *i); - prt_newline(out); - - prt_printf(out, "interior"); - darray_for_each(d->delete_interior, i) - prt_printf(out, " %u->%u", i->id, i->live_child); - prt_newline(out); -} - -int __bch2_delete_dead_snapshots(struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - int ret = 0; - - if (!mutex_trylock(&d->lock)) - return 0; - - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) - goto out_unlock; - - struct btree_trans *trans = bch2_trans_get(c); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - d->running = true; - d->pos = BBPOS_MIN; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; - - if (!d->delete_leaves.nr && !d->delete_interior.nr) - goto err; - - { - struct printbuf buf = PRINTBUF; - bch2_snapshot_delete_nodes_to_text(&buf, d); - - ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); - printbuf_exit(&buf); - if (ret) - goto err; - } - - ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) - ? 
delete_dead_snapshot_keys_v2(trans) - : delete_dead_snapshot_keys_v1(trans); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - - darray_for_each(d->delete_leaves, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err; - } - - /* - * Fixing children of deleted snapshots can't be done completely - * atomically, if we crash between here and when we delete the interior - * nodes some depth fields will be off: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); - if (ret) - goto err; - - darray_for_each(d->delete_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, i->id)); - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting snapshot %u", i->id); - if (ret) - goto err; - } -err: - mutex_lock(&d->progress_lock); - darray_exit(&d->deleting_from_trees); - darray_exit(&d->delete_interior); - darray_exit(&d->delete_leaves); - d->running = false; - mutex_unlock(&d->progress_lock); - bch2_trans_put(trans); - - bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); -out_unlock: - mutex_unlock(&d->lock); - if (!bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); - return ret; -} - -int bch2_delete_dead_snapshots(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return 0; - - return __bch2_delete_dead_snapshots(c); -} - -void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); - - set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); - - bch2_delete_dead_snapshots(c); - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (!c->opts.auto_snapshot_deletion) - return; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) - return; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (!queue_work(system_long_wq, &c->snapshot_delete.work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct snapshot_delete *d = &c->snapshot_delete; - - if (!d->running) { - prt_str(out, "(not running)"); - return; - } - - mutex_lock(&d->progress_lock); - bch2_snapshot_delete_nodes_to_text(out, d); - - bch2_bbpos_to_text(out, d->pos); - mutex_unlock(&d->progress_lock); -} - -int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots, - k, ret) { - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) -{ - /* If there's one child, it's redundant and keys will be moved to the child */ - return !!snap.v->children[0] + !!snap.v->children[1] == 1; -} - -static int 
bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || - interior_snapshot_needs_delete(snap)) - set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - - return 0; -} - -int bch2_snapshots_read(struct bch_fs *c) -{ - /* - * Initializing the is_ancestor bitmaps requires ancestors to already be - * initialized - so mark in reverse: - */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, - POS_MAX, 0, k, - __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_check_snapshot_needs_deletion(trans, k))); - bch_err_fn(c, ret); - - /* - * It's important that we check if we need to reconstruct snapshots - * before going RW, so we mark that pass as required in the superblock - - * otherwise, we could end up deleting keys with missing snapshot nodes - * instead - */ - BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && - test_bit(BCH_FS_may_go_rw, &c->flags)); - - return ret; -} - -void bch2_fs_snapshots_exit(struct bch_fs *c) -{ - kvfree(rcu_dereference_protected(c->snapshots, true)); -} - -void bch2_fs_snapshots_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); - mutex_init(&c->snapshot_delete.lock); - mutex_init(&c->snapshot_delete.progress_lock); - mutex_init(&c->snapshots_unlinked_lock); -} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h deleted file mode 100644 index 6766bf673ed92c..00000000000000 --- a/fs/bcachefs/snapshot.h +++ /dev/null @@ -1,275 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_H -#define _BCACHEFS_SNAPSHOT_H - -void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); - -#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_tree_validate, \ - .val_to_text = bch2_snapshot_tree_to_text, \ - .min_val_size = 8, \ -}) - -struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); - -int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); - -void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_validate = bch2_snapshot_validate, \ - .val_to_text = bch2_snapshot_to_text, \ - .trigger = bch2_mark_snapshot, \ - .min_val_size = 24, \ -}) - -static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -{ - u32 idx = U32_MAX - id; - - return likely(t && idx < t->nr) - ? &t->s[idx] - : NULL; -} - -static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -{ - return __snapshot_t(rcu_dereference(c->snapshots), id); -} - -static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->tree : 0; -} - -static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? 
s->parent : 0; -} - -static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent_early(c, id); -} - -static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - if (!s) - return 0; - - u32 parent = s->parent; - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - parent && - s->depth != snapshot_t(c, parent)->depth + 1) - panic("id %u depth=%u parent %u depth=%u\n", - id, snapshot_t(c, id)->depth, - parent, snapshot_t(c, parent)->depth); - - return parent; -} - -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_parent(c, id); -} - -static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -{ - guard(rcu)(); - while (n--) - id = __bch2_snapshot_parent(c, id); - return id; -} - -u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); -u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); - -static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - - u32 parent; - while ((parent = __bch2_snapshot_parent(c, id))) - id = parent; - return id; -} - -static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->state : SNAPSHOT_ID_empty; -} - -static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - return __bch2_snapshot_id_state(c, id); -} - -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) -{ - return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; -} - -static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; -} - -static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - int ret = bch2_snapshot_is_internal_node(c, id); - if (ret < 0) - return ret; - return !ret; -} - -static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -{ - guard(rcu)(); - return parent ? snapshot_t(c, parent)->depth + 1 : 0; -} - -bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); - -static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -{ - return id == ancestor - ? true - : __bch2_snapshot_is_ancestor(c, id, ancestor); -} - -static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -{ - guard(rcu)(); - const struct snapshot_t *t = snapshot_t(c, id); - return t && (t->children[0]|t->children[1]) != 0; -} - -static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -{ - return darray_find(*s, id) != NULL; -} - -static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - darray_for_each(*s, i) - if (bch2_snapshot_is_ancestor(c, id, *i)) - return true; - return false; -} - -static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - BUG_ON(snapshot_list_has_id(s, id)); - int ret = darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) -{ - int ret = snapshot_list_has_id(s, id) - ? 
0 - : darray_push(s, id); - if (ret) - bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); - return ret; -} - -static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) -{ - darray_for_each(*src, i) { - int ret = snapshot_list_add_nodup(c, dst, *i); - if (ret) - return ret; - } - - return 0; -} - -int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, - struct bch_snapshot *s); -int bch2_snapshot_get_subvol(struct btree_trans *, u32, - struct bch_subvolume *); - -/* only exported for tests: */ -int bch2_snapshot_node_create(struct btree_trans *, u32, - u32 *, u32 *, unsigned); - -int bch2_check_snapshot_trees(struct bch_fs *); -int bch2_check_snapshots(struct bch_fs *); -int bch2_reconstruct_snapshots(struct bch_fs *); - -int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); - -static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) - ? 0 - : __bch2_check_key_has_snapshot(trans, iter, k); -} - -int __bch2_get_snapshot_overwrites(struct btree_trans *, - enum btree_id, struct bpos, - snapshot_id_list *); - -/* - * Get a list of snapshot IDs that have overwritten a given key: - */ -static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, - enum btree_id btree, struct bpos pos, - snapshot_id_list *s) -{ - darray_init(s); - - return bch2_snapshot_has_children(trans->c, pos.snapshot) - ? __bch2_get_snapshot_overwrites(trans, btree, pos, s) - : 0; - -} - -int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); - -int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); - -static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) - return 0; - - return __bch2_key_has_snapshot_overwrites(trans, id, pos); -} - -int __bch2_delete_dead_snapshots(struct bch_fs *); -int bch2_delete_dead_snapshots(struct bch_fs *); -void bch2_delete_dead_snapshots_work(struct work_struct *); -void bch2_delete_dead_snapshots_async(struct bch_fs *); -void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); - -int bch2_snapshots_read(struct bch_fs *); -void bch2_fs_snapshots_exit(struct bch_fs *); -void bch2_fs_snapshots_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h deleted file mode 100644 index 9bccae1f3590ad..00000000000000 --- a/fs/bcachefs/snapshot_format.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H -#define _BCACHEFS_SNAPSHOT_FORMAT_H - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; - bch_le128 btime; -}; - -LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us a persistent identifier for each tree of - * bch_snapshot nodes, and allows
us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - -#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h deleted file mode 100644 index 0ab698f13e5c6a..00000000000000 --- a/fs/bcachefs/snapshot_types.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SNAPSHOT_TYPES_H -#define _BCACHEFS_SNAPSHOT_TYPES_H - -#include "bbpos_types.h" -#include "darray.h" -#include "subvolume_types.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - enum snapshot_id_state { - SNAPSHOT_ID_empty, - SNAPSHOT_ID_live, - SNAPSHOT_ID_deleted, - } state; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - -struct snapshot_delete { - struct mutex lock; - struct work_struct work; - - struct mutex progress_lock; - snapshot_id_list deleting_from_trees; - snapshot_id_list delete_leaves; - interior_delete_list delete_interior; - - bool running; - struct bbpos pos; -}; - -#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c deleted file mode 100644 index 3e9f59226bdf2b..00000000000000 --- a/fs/bcachefs/str_hash.c +++ /dev/null @@ -1,400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_cache.h" -#include "btree_update.h" -#include "dirent.h" -#include "fsck.h" -#include "str_hash.h" -#include "subvolume.h" - -static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - struct bch_subvolume subvol; - int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), - false, &subvol); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - -static int bch2_fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); - ret = PTR_ERR_OR_ZERO(renamed_buf); - if (ret) - return ret; - - for (unsigned i = 0; i < 1000; i++) { - new->k.u64s = BKEY_U64s_MAX; - - struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, - sprintf(renamed_buf, "%.*s.fsck_renamed-%u", - 
old_name.len, old_name.name, i)); - - ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); - if (ret) - return ret; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create); - if (ret && !bch2_err_matches(ret, EEXIST)) - break; - if (!ret) { - if (bpos_lt(new->k.p, old.k->p)) - *updated_before_k_pos = true; - break; - } - } - - ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); - bch_err_fn(c, ret); - return ret; -} - -static noinline int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -/* - * str_hash lookups across snapshots break in wild ways if hash_info in - * different snapshot versions doesn't match - so if we find one mismatch, check - * them all - */ -int bch2_repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - bool need_commit = false; - int ret = 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, - POS(0, snapshot_root->bi_inum), - BTREE_ITER_all_snapshots, k, ret) { - if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) - break; - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - break; - - if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && - INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - - BUG_ON(hash1.type != hash2.type || - memcmp(&hash1.siphash_key, - &hash2.siphash_key, - sizeof(hash1.siphash_key))); -#endif - continue; - } - - printbuf_reset(&buf); - prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", - snapshot_root->bi_inum, - inode.bi_snapshot, - snapshot_root->bi_snapshot); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); - prt_printf(&buf, " %llx\n", inode.bi_hash_seed); - - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); - - if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { - inode.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - break; - need_commit = true; - } - } - - if (ret) - goto err; - - if (!need_commit) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", - snapshot_root->bi_inum); - - prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); - bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); - 
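/* and the root's hash seed, completing the diagnostic before we give up with fsck_repair_unimplemented below: */ -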
prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); -#if 0 - prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); - bch2_prt_str_hash_type(&buf, hash_info->type); - prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); -#endif - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, fsck_repair_unimplemented); - goto err; - } - - ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; -err: -fsck_err: - printbuf_exit(&buf); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * All versions of the same inode in different snapshots must have the same hash - * seed/type: verify that the hash info we're using matches the root - */ -static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, - struct bch_hash_info *hash_info) -{ - struct bch_inode_unpacked snapshot_root; - int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); - if (ret) - return ret; - - struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); - if (hash_info->type != hash_root.type || - memcmp(&hash_info->siphash_key, - &hash_root.siphash_key, - sizeof(hash_root.siphash_key))) - ret = bch2_repair_inode_hash_info(trans, &snapshot_root); - - return ret; -} - -/* Put a str_hash key in its proper location, checking for duplicates */ -int bch2_str_hash_repair_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k, - struct btree_iter *dup_iter, struct bkey_s_c dup_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - bool free_snapshots_seen = false; - int ret = 0; - - if (!s) { - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto out; - - s->pos = k_iter->pos; - darray_init(&s->ids); - - ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); - if (ret) - goto out; - - free_snapshots_seen = true; - } - - if (!dup_k.k) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, - (subvol_inum) { 0, new->k.p.inode }, - new->k.p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(dup_k); - if (ret) - goto out; - if (dup_k.k) - goto duplicate_entries; - - if (bpos_lt(new->k.p, k.k->p)) - *updated_before_k_pos = true; - - ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, - k_iter->pos, new->k.p) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; - } else { -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, dup_k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); - break; - case 2: - ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, - bkey_s_c_to_dirent(k), - updated_before_k_pos) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_ITER_with_updates); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_commit; - } -out: -fsck_err: - bch2_trans_iter_exit(trans, dup_iter); - printbuf_exit(&buf); - if (free_snapshots_seen) - darray_exit(&s->ids); - return ret; -} - -int __bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - int ret = 0; - - u64 hash = desc->hash_bkey(hash_info, hash_k); - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc->btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots| - BTREE_ITER_with_updates, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) { - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, - hash_info) ?: - bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, k, updated_before_k_pos); - break; - } - - if (bkey_deleted(k.k)) - goto bad_hash; - } - bch2_trans_iter_exit(trans, &iter); -out: -fsck_err: - printbuf_exit(&buf); - return ret; -bad_hash: - bch2_trans_iter_exit(trans, &iter); - /* - * Before doing any repair, check hash_info itself: - */ - ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); - if (ret) - goto out; - - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: should be at %llu\n%s", - hash, - (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) - ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, - k_iter, hash_k, - &iter, bkey_s_c_null, - updated_before_k_pos); - goto out; -} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h deleted file mode 100644 index 8979ac2d7a3bed..00000000000000 --- a/fs/bcachefs/str_hash.h +++ /dev/null @@ -1,431 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_STR_HASH_H -#define _BCACHEFS_STR_HASH_H - -#include "btree_iter.h" -#include "btree_update.h" -#include "checksum.h" -#include "error.h" -#include "inode.h" -#include "siphash.h" -#include "subvolume.h" -#include "super.h" - -#include -#include - -static inline enum bch_str_hash_type -bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -{ - switch (opt) { - case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_crc32c; - case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_crc64; - case BCH_STR_HASH_OPT_siphash: - return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? 
BCH_STR_HASH_siphash - : BCH_STR_HASH_siphash_old; - default: - BUG(); - } -} - -struct bch_hash_info { - u32 inum_snapshot; - u8 type; - struct unicode_map *cf_encoding; - /* - * For crc32 or crc64 string hashes the first key value of - * the siphash_key (k0) is used as the key. - */ - SIPHASH_KEY siphash_key; -}; - -static inline struct bch_hash_info -bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -{ - struct bch_hash_info info = { - .inum_snapshot = bi->bi_snapshot, - .type = INODE_STR_HASH(bi), - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, - .siphash_key = { .k0 = bi->bi_hash_seed } - }; - - if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - u8 digest[SHA256_DIGEST_SIZE]; - - sha256((const u8 *)&bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); - memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - } - - return info; -} - -struct bch_str_hash_ctx { - union { - u32 crc32c; - u64 crc64; - SIPHASH_CTX siphash; - }; -}; - -static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, - sizeof(info->siphash_key.k0)); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Init(&ctx->siphash, &info->siphash_key); - break; - default: - BUG(); - } -} - -static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info, - const void *data, size_t len) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - ctx->crc32c = crc32c(ctx->crc32c, data, len); - break; - case BCH_STR_HASH_crc64: - ctx->crc64 = crc64_be(ctx->crc64, data, len); - break; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - SipHash24_Update(&ctx->siphash, data, len); - break; - default: - BUG(); - } -} - -static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, - const struct bch_hash_info *info) -{ - switch (info->type) { - case BCH_STR_HASH_crc32c: - return ctx->crc32c; - case BCH_STR_HASH_crc64: - return ctx->crc64 >> 1; - case BCH_STR_HASH_siphash_old: - case BCH_STR_HASH_siphash: - return SipHash24_End(&ctx->siphash) >> 1; - default: - BUG(); - } -} - -struct bch_hash_desc { - enum btree_id btree_id; - u8 key_type; - - u64 (*hash_key)(const struct bch_hash_info *, const void *); - u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); - bool (*cmp_key)(struct bkey_s_c, const void *); - bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); - bool (*is_visible)(subvol_inum inum, struct bkey_s_c); -}; - -static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) -{ - return k.k->type == desc.key_type && - (!desc.is_visible || - !inum.inum || - desc.is_visible(inum, k)); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags, - u32 snapshot) -{ - struct bkey_s_c k; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_key(k, key)) - return k; - 
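/* a visible key that doesn't match is a hash collision - fall through and keep probing the next slot */ -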
} else if (k.k->type == KEY_TYPE_hash_whiteout) { - ; - } else { - /* hole, not found */ - break; - } - } - bch2_trans_iter_exit(trans, iter); - - return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); -} - -static __always_inline struct bkey_s_c -bch2_hash_lookup(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key, - enum btree_iter_update_trigger_flags flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); -} - -static __always_inline int -bch2_hash_hole(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct bkey_s_c k; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), - POS(inum.inum, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) - if (!is_visible_key(desc, inum, k)) - return 0; - bch2_trans_iter_exit(trans, iter); - - return ret ?: -BCH_ERR_ENOSPC_str_hash_create; -} - -static __always_inline -int bch2_hash_needs_whiteout(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *start) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_copy_iter(trans, &iter, start); - - bch2_btree_iter_advance(trans, &iter); - - for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { - if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_hash_whiteout) - break; - - if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) { - ret = 1; - break; - } - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static __always_inline -struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter slot = {}; - struct bkey_s_c k; - bool found = false; - int ret; - - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(insert->k.p.inode, - desc.hash_bkey(info, bkey_i_to_s_c(insert)), - snapshot), - POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { - if (is_visible_key(desc, inum, k)) { - if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) - goto found; - - /* hash collision: */ - continue; - } - - if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(trans, &slot, iter); - - if (k.k->type != KEY_TYPE_hash_whiteout) - goto not_found; - } - - if (!ret) - ret = bch_err_throw(c, ENOSPC_str_hash_create); -out: - bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, iter); - return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; -found: - found = true; -not_found: - if (found && (flags & STR_HASH_must_create)) { - bch2_trans_iter_exit(trans, &slot); - return k; - } else if (!found && (flags & STR_HASH_must_replace)) { - ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); - } else { - if (!found && slot.path) - swap(*iter, slot); - - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, flags); - } - - goto out; -} - -static __always_inline -int bch2_hash_set_in_snapshot(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, u32 snapshot, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, - snapshot, insert, flags); - int ret = bkey_err(k); - if (ret) - return ret; - if (k.k) { - bch2_trans_iter_exit(trans, &iter); - return bch_err_throw(trans->c, EEXIST_str_hash_set); - } - - return 0; -} - -static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, - struct bkey_i *insert, - enum btree_iter_update_trigger_flags flags) -{ - insert->k.p.inode = inum.inum; - - u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, flags); -} - -static __always_inline -int bch2_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - enum btree_iter_update_trigger_flags flags) -{ - struct bkey_i *delete; - int ret; - - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - ret = PTR_ERR_OR_ZERO(delete); - if (ret) - return ret; - - ret = bch2_hash_needs_whiteout(trans, desc, info, iter); - if (ret < 0) - return ret; - - bkey_init(&delete->k); - delete->k.p = iter->pos; - delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - - return bch2_trans_update(trans, iter, delete, flags); -} - -static __always_inline -int bch2_hash_delete(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info *info, - subvol_inum inum, const void *key) -{ - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_intent); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); - -struct snapshots_seen; -int bch2_str_hash_repair_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - struct btree_iter *, struct bkey_s_c, - bool *); - -int __bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc *, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c, - bool *); - -static inline int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc *desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k, - bool *updated_before_k_pos) -{ - if (hash_k.k->type != desc->key_type) - return 0; - - if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) - return 0; - - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, - updated_before_k_pos); -} - -#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c deleted file mode 100644 index 020587449123b1..00000000000000 --- a/fs/bcachefs/subvolume.c +++ /dev/null @@ -1,752 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "recovery_passes.h" -#include "snapshot.h" -#include "subvolume.h" - -#include - -static int bch2_subvolume_delete(struct btree_trans *, u32); - -static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) -{ - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "missing subvolume %u", subvolid); - bool print = bch2_count_fsck_err(c, subvol_missing, &buf); - - int ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_inodes, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - return ret; -} - -static struct bpos subvolume_children_pos(struct bkey_s_c k) -{ - if (k.k->type != KEY_TYPE_subvolume) - return POS_MIN; - - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - if (!s.v->fs_path_parent) - return POS_MIN; - return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); -} - -static int check_subvol(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_subvolume subvol; - struct btree_iter subvol_children_iter = {}; - struct bch_snapshot snapshot; - struct printbuf buf = PRINTBUF; - unsigned snapid; - int ret = 0; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - subvol = bkey_s_c_to_subvolume(k); - snapid = le32_to_cpu(subvol.v->snapshot); - ret = bch2_snapshot_lookup(trans, snapid, &snapshot); - - if (bch2_err_matches(ret, ENOENT)) - return bch2_run_print_explicit_recovery_pass(c, - 
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; - if (ret) - return ret; - - if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; - } - - if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && - subvol.v->fs_path_parent, - trans, subvol_root_fs_path_parent_nonzero, - "root subvolume has nonzero fs_path_parent\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.fs_path_parent = 0; - } - - if (subvol.v->fs_path_parent) { - struct bpos pos = subvolume_children_pos(k); - - struct bkey_s_c subvol_children_k = - bch2_bkey_get_iter(trans, &subvol_children_iter, - BTREE_ID_subvolume_children, pos, 0); - ret = bkey_err(subvol_children_k); - if (ret) - goto err; - - if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, - trans, subvol_children_not_set, - "subvolume not set in subvolume_children btree at %llu:%llu\n%s", - pos.inode, pos.offset, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); - if (ret) - goto err; - } - } - - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_nowarn_trans(trans, - (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, - &inode); - if (!ret) { - if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - trans, subvol_root_wrong_bi_subvol, - "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", - inode.bi_inum, inode.bi_snapshot, - inode.bi_subvol, subvol.k->p.offset)) { - inode.bi_subvol = subvol.k->p.offset; - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(trans, subvol_to_missing_root, - "subvolume %llu points to missing subvolume root %llu:%u", - k.k->p.offset, le64_to_cpu(subvol.v->inode), - le32_to_cpu(subvol.v->snapshot))) { - /* - * Recreate - any contents that are still disconnected - * will then get reattached under lost+found - */ - bch2_inode_init_early(c, &inode); - bch2_inode_init_late(c, &inode, bch2_current_time(c), - 0, 0, S_IFDIR|0700, 0, NULL); - inode.bi_inum = le64_to_cpu(subvol.v->inode); - inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); - inode.bi_subvol = k.k->p.offset; - inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); - ret = __bch2_fsck_write_inode(trans, &inode); - if (ret) - goto err; - } - } else { - goto err; - } - - if (!BCH_SUBVOLUME_SNAP(subvol.v)) { - u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); - - struct bch_snapshot_tree st; - ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "%s: snapshot tree %u not found", __func__, snapshot_tree); - - if (ret) - goto err; - - if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, - trans, subvol_not_master_and_not_snapshot, - "subvolume %llu is not set as snapshot but is not master subvolume", - k.k->p.offset)) { - struct bkey_i_subvolume *s = - bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - 
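/* repair: if it's not the master subvolume of its snapshot tree, it must be a snapshot */ -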
SET_BCH_SUBVOLUME_SNAP(&s->v, true); - } - } -err: -fsck_err: - bch2_trans_iter_exit(trans, &subvol_children_iter); - printbuf_exit(&buf); - return ret; -} - -int bch2_check_subvols(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} - -static int check_subvol_child(struct btree_trans *trans, - struct btree_iter *child_iter, - struct bkey_s_c child_k) -{ - struct bch_subvolume s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), - 0, subvolume, &s); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (fsck_err_on(ret || - le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, - trans, subvol_children_bad, - "incorrect entry in subvolume_children btree %llu:%llu", - child_k.k->p.inode, child_k.k->p.offset)) { - ret = bch2_btree_delete_at(trans, child_iter, 0); - if (ret) - goto err; - } -err: -fsck_err: - return ret; -} - -int bch2_check_subvol_children(struct bch_fs *c) -{ - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_child(trans, &iter, k))); - bch_err_fn(c, ret); - return 0; -} - -/* Subvolumes: */ - -int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); - int ret = 0; - - bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), - c, subvol_pos_bad, - "invalid pos"); - - bkey_fsck_err_on(!subvol.v->snapshot, - c, subvol_snapshot_bad, - "invalid snapshot"); - - bkey_fsck_err_on(!subvol.v->inode, - c, subvol_inode_bad, - "invalid inode"); -fsck_err: - return ret; -} - -void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - - prt_printf(out, "root %llu snapshot id %u", - le64_to_cpu(s.v->inode), - le32_to_cpu(s.v->snapshot)); - - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { - prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); - prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); - } - - if (BCH_SUBVOLUME_RO(s.v)) - prt_printf(out, " ro"); - if (BCH_SUBVOLUME_SNAP(s.v)) - prt_printf(out, " snapshot"); - if (BCH_SUBVOLUME_UNLINKED(s.v)) - prt_printf(out, " unlinked"); -} - -static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) -{ - return !bpos_eq(pos, POS_MIN) - ? 
bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) - : 0; -} - -int bch2_subvolume_trigger(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - enum btree_iter_update_trigger_flags flags) -{ - if (flags & BTREE_TRIGGER_transactional) { - struct bpos children_pos_old = subvolume_children_pos(old); - struct bpos children_pos_new = subvolume_children_pos(new.s_c); - - if (!bpos_eq(children_pos_old, children_pos_new)) { - int ret = subvolume_children_mod(trans, children_pos_old, false) ?: - subvolume_children_mod(trans, children_pos_new, true); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) -{ - struct btree_iter iter; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); - bch2_trans_iter_exit(trans, &iter); - - return bkey_err(k) ?: k.k && k.k->p.inode == subvol - ? -BCH_ERR_ENOTEMPTY_subvol_not_empty - : 0; -} - -static __always_inline int -bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_cached| - BTREE_ITER_with_updates, subvolume, s); - if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) - ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; - return ret; -} - -int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, - bool inconsistent_if_not_found, - struct bch_subvolume *s) -{ - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); -} - -int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) -{ - struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); - if (ret) - return ret; - - if (BCH_SUBVOLUME_RO(&s)) - return -EROFS; - return 0; -} - -int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) -{ - return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); -} - -int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, - struct bch_subvolume *subvol) -{ - struct bch_snapshot snap; - - return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); -} - -int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid, bool warn) -{ - struct btree_iter iter; - struct bkey_s_c_subvolume subvol; - int ret; - - subvol = bch2_bkey_get_iter_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached|BTREE_ITER_with_updates, - subvolume); - ret = bkey_err(subvol); - - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - - if (likely(!ret)) - *snapid = le32_to_cpu(subvol.v->snapshot); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, - u32 *snapid) -{ - return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true); -} - -static int bch2_subvolume_reparent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - u32 old_parent, u32 new_parent) -{ - struct bkey_i_subvolume *s; - int ret; - - if (k.k->type != KEY_TYPE_subvolume) - return 0; - - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) - return 0; - - s = 
bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - return ret; - - s->v.creation_parent = cpu_to_le32(new_parent); - return 0; -} - -/* - * Separate from the snapshot tree in the snapshots btree, we record the tree - * structure of how snapshot subvolumes were created - the parent subvolume of - * each snapshot subvolume. - * - * When a subvolume is deleted, we scan for child subvolumes and reparent them, - * to avoid dangling references: - */ -static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) -{ - struct bch_subvolume s; - - return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.creation_parent))); -} - -/* - * Delete subvolume, mark snapshot ID as deleted, queue up snapshot - * deletion/cleanup: - */ -static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) -{ - struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; - - struct bkey_s_c_subvolume subvol = - bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached|BTREE_ITER_intent, - subvolume); - int ret = bkey_err(subvol); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - if (ret) - goto err; - - u32 snapid = le32_to_cpu(subvol.v->snapshot); - - struct bkey_s_c_snapshot snapshot = - bch2_bkey_get_iter_typed(trans, &snapshot_iter, - BTREE_ID_snapshots, POS(0, snapid), - 0, snapshot); - ret = bkey_err(snapshot); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing snapshot %u", snapid); - if (ret) - goto err; - - u32 treeid = le32_to_cpu(snapshot.v->tree); - - struct bkey_s_c_snapshot_tree snapshot_tree = - bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, - BTREE_ID_snapshot_trees, POS(0, treeid), - 0, snapshot_tree); - ret = bkey_err(snapshot_tree); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing snapshot tree %u", treeid); - if (ret) - goto err; - - if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { - struct bkey_i_snapshot_tree *snapshot_tree_mut = - bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, - &snapshot_tree.s_c, - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); - if (ret) - goto err; - - snapshot_tree_mut->v.master_subvol = 0; - } - - ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: - bch2_snapshot_node_set_deleted(trans, snapid); -err: - bch2_trans_iter_exit(trans, &snapshot_tree_iter); - bch2_trans_iter_exit(trans, &snapshot_iter); - bch2_trans_iter_exit(trans, &subvol_iter); - return ret; -} - -static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) -{ - int ret = bch2_subvolumes_reparent(trans, subvolid) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_subvolume_delete(trans, subvolid)); - - bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols); - return ret; -} - -static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - snapshot_wait_for_pagecache_and_delete_work); - int ret = 0; - - while (!ret) { - mutex_lock(&c->snapshots_unlinked_lock); - snapshot_id_list s = c->snapshots_unlinked; - 
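/* steal the entire list under the lock: entries queued while we're evicting inodes and deleting are picked up on the next loop iteration */ -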
darray_init(&c->snapshots_unlinked); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (!s.nr) - break; - - bch2_evict_subvolume_inodes(c, &s); - - darray_for_each(s, id) { - ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - bch_err_msg(c, ret, "deleting subvolume %u", *id); - if (ret) - break; - } - - darray_exit(&s); - } - - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); -} - -struct subvolume_unlink_hook { - struct btree_trans_commit_hook h; - u32 subvol; -}; - -static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *_h) -{ - struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); - struct bch_fs *c = trans->c; - int ret = 0; - - mutex_lock(&c->snapshots_unlinked_lock); - if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) - ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); - mutex_unlock(&c->snapshots_unlinked_lock); - - if (ret) - return ret; - - if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) - return -EROFS; - - if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); - return 0; -} - -int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) -{ - struct btree_iter iter; - struct bkey_i_subvolume *n; - struct subvolume_unlink_hook *h; - int ret = 0; - - h = bch2_trans_kmalloc(trans, sizeof(*h)); - ret = PTR_ERR_OR_ZERO(h); - if (ret) - return ret; - - h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; - h->subvol = subvolid; - bch2_trans_commit_hook(trans, &h->h); - - n = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(n); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; - if (unlikely(ret)) - return ret; - - SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); - n->v.fs_path_parent = 0; - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_subvolume_create(struct btree_trans *trans, u64 inode, - u32 parent_subvolid, - u32 src_subvolid, - u32 *new_subvolid, - u32 *new_snapshotid, - bool ro) -{ - struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = {}; - struct bkey_i_subvolume *new_subvol = NULL; - struct bkey_i_subvolume *src_subvol = NULL; - u32 parent = 0, new_nodes[2], snapshot_subvols[2]; - int ret = 0; - - ret = bch2_bkey_get_empty_slot(trans, &dst_iter, - BTREE_ID_subvolumes, POS(0, U32_MAX)); - if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = bch_err_throw(c, ENOSPC_subvolume_create); - if (ret) - return ret; - - snapshot_subvols[0] = dst_iter.pos.offset; - snapshot_subvols[1] = src_subvolid; - - if (src_subvolid) { - /* Creating a snapshot: */ - - src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, - BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_cached, subvolume); - ret = PTR_ERR_OR_ZERO(src_subvol); - if (bch2_err_matches(ret, ENOENT)) - ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; - if (unlikely(ret)) - goto err; - - parent = le32_to_cpu(src_subvol->v.snapshot); - } - - ret = bch2_snapshot_node_create(trans, parent, new_nodes, - snapshot_subvols, - src_subvolid ? 
2 : 1); - if (ret) - goto err; - - if (src_subvolid) { - src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); - ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); - if (ret) - goto err; - } - - new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); - ret = PTR_ERR_OR_ZERO(new_subvol); - if (ret) - goto err; - - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); - new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; - - SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); - SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - - *new_subvolid = new_subvol->k.p.offset; - *new_snapshotid = new_nodes[0]; -err: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; -} - -int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - struct bch_fs *c = trans->c; - bch_err(c, "root inode not found"); - ret = bch_err_throw(c, ENOENT_inode); - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -void bch2_fs_subvolumes_init_early(struct bch_fs *c) -{ - INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, - bch2_subvolume_wait_for_pagecache_and_delete); -} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h deleted file mode 100644 index 075f55e25c7048..00000000000000 --- a/fs/bcachefs/subvolume.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_H -#define _BCACHEFS_SUBVOLUME_H - -#include "darray.h" -#include 
"subvolume_types.h" - -int bch2_check_subvols(struct bch_fs *); -int bch2_check_subvol_children(struct bch_fs *); - -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, - enum btree_iter_update_trigger_flags); - -#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_validate = bch2_subvolume_validate, \ - .val_to_text = bch2_subvolume_to_text, \ - .trigger = bch2_subvolume_trigger, \ - .min_val_size = 16, \ -}) - -int bch2_subvol_has_children(struct btree_trans *, u32); -int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, struct bch_subvolume *); -int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, - u32 *, bool); -int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); - -int bch2_subvol_is_ro_trans(struct btree_trans *, u32); -int bch2_subvol_is_ro(struct bch_fs *, u32); - -static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u32 subvolid, unsigned flags) -{ - u32 snapshot; - int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); - if (ret) - return bkey_s_c_err(ret); - - bch2_btree_iter_set_snapshot(trans, iter, snapshot); - return bch2_btree_iter_peek_max_type(trans, iter, end, flags); -} - -#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do) \ -({ \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ - _end, _subvolid, (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - -#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ - _start, _end, _subvolid, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ - _end, _subvolid, _flags, _k, _do); \ -}) - -int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); - -int bch2_initialize_subvolumes(struct bch_fs *); -int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); - -void bch2_fs_subvolumes_init_early(struct bch_fs *); - -#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h deleted file mode 100644 index e029df7ba89f52..00000000000000 --- a/fs/bcachefs/subvolume_format.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H -#define _BCACHEFS_SUBVOLUME_FORMAT_H - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - * - * This is _not_ necessarily the subvolume of the directory containing - * this subvolume: - */ - __le32 creation_parent; - __le32 fs_path_parent; - 
bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h deleted file mode 100644 index 9d634b906dcda3..00000000000000 --- a/fs/bcachefs/subvolume_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUBVOLUME_TYPES_H -#define _BCACHEFS_SUBVOLUME_TYPES_H - -typedef struct { - /* we can't have padding in this struct: */ - u64 subvol; - u64 inum; -} subvol_inum; - -#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c deleted file mode 100644 index 6c2e1d647403f2..00000000000000 --- a/fs/bcachefs/super-io.c +++ /dev/null @@ -1,1562 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "checksum.h" -#include "disk_groups.h" -#include "ec.h" -#include "error.h" -#include "journal.h" -#include "journal_sb.h" -#include "journal_seq_blacklist.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "quota.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-downgrade.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "super-io.h" -#include "super.h" -#include "trace.h" -#include "vstructs.h" - -#include -#include -#include - -struct bch2_metadata_version { - u16 version; - const char *name; -}; - -static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v) { \ - .version = v, \ - .name = #n, \ -}, - BCH_METADATA_VERSIONS() -#undef x -}; - -void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) -{ - const char *str = "(unknown version)"; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version == v) { - str = bch2_metadata_versions[i].name; - break; - } - - prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); -} - -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) -{ - if (!BCH_VERSION_MAJOR(v)) - return v; - - for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) - if (bch2_metadata_versions[i].version > v && - BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == - BCH_VERSION_MAJOR(v)) - v = bch2_metadata_versions[i].version; - - return v; -} - -int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) -{ - int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed) - ? 
0 - : -BCH_ERR_may_not_use_incompat_feature; - - mutex_lock(&c->sb_lock); - if (!ret) { - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - bch2_write_super(c); - } else { - darray_for_each(c->incompat_versions_requested, i) - if (version == *i) - goto out; - - darray_push(&c->incompat_versions_requested, version); - struct printbuf buf = PRINTBUF; - prt_str(&buf, "requested incompat feature "); - bch2_version_to_text(&buf, version); - prt_str(&buf, " currently not enabled, allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_printf(&buf, "\n set version_upgrade=incompat to enable"); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - } - -out: - mutex_unlock(&c->sb_lock); - - return ret; -} - -const char * const bch2_sb_fields[] = { -#define x(name, nr) #name, - BCH_SB_FIELDS() -#undef x - NULL -}; - -static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - enum bch_validate_flags, struct printbuf *); - -struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, - enum bch_sb_field_type type) -{ - /* XXX: need locking around superblock to access optional fields */ - - vstruct_for_each(sb, f) - if (le32_to_cpu(f->type) == type) - return f; - return NULL; -} - -static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, - struct bch_sb_field *f, - unsigned u64s) -{ - unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; - unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; - - BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); - - if (!f && !u64s) { - /* nothing to do: */ - } else if (!f) { - f = vstruct_last(sb->sb); - memset(f, 0, sizeof(u64) * u64s); - f->u64s = cpu_to_le32(u64s); - f->type = 0; - } else { - void *src, *dst; - - src = vstruct_end(f); - - if (u64s) { - f->u64s = cpu_to_le32(u64s); - dst = vstruct_end(f); - } else { - dst = f; - } - - memmove(dst, src, vstruct_end(sb->sb) - src); - - if (dst > src) - memset(src, 0, dst - src); - } - - sb->sb->u64s = cpu_to_le32(sb_u64s); - - return u64s ?
f : NULL; -} - -void bch2_sb_field_delete(struct bch_sb_handle *sb, - enum bch_sb_field_type type) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - - if (f) - __bch2_sb_field_resize(sb, f, 0); -} - -/* Superblock realloc/free: */ - -void bch2_free_super(struct bch_sb_handle *sb) -{ - kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - bdev_fput(sb->s_bdev_file); - kfree(sb->holder); - kfree(sb->sb_name); - - kfree(sb->sb); - memset(sb, 0, sizeof(*sb)); -} - -int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -{ - size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); - size_t new_buffer_size; - struct bch_sb *new_sb; - struct bio *bio; - - if (sb->bdev) - new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); - - new_buffer_size = roundup_pow_of_two(new_bytes); - - if (sb->sb && sb->buffer_size >= new_buffer_size) - return 0; - - if (sb->sb && sb->have_layout) { - u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; - - if (new_bytes > max_bytes) { - struct printbuf buf = PRINTBUF; - - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); - pr_err("%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_ENOSPC_sb; - } - } - - if (sb->buffer_size >= new_buffer_size && sb->sb) - return 0; - - if (dynamic_fault("bcachefs:add:super_realloc")) - return -BCH_ERR_ENOMEM_sb_realloc_injected; - - new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); - if (!new_sb) - return -BCH_ERR_ENOMEM_sb_buf_realloc; - - sb->sb = new_sb; - - if (sb->have_bio) { - unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); - - bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!bio) - return -BCH_ERR_ENOMEM_sb_bio_realloc; - - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); - - kfree(sb->bio); - sb->bio = bio; - } - - sb->buffer_size = new_buffer_size; - - return 0; -} - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; - ssize_t d = -old_u64s + u64s; - - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) - return NULL; - - if (sb->fs_sb) { - struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); - - lockdep_assert_held(&c->sb_lock); - - /* XXX: we're not checking that offline devices have enough space */ - - for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { - struct bch_sb_handle *dev_sb = &ca->disk_sb; - - if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); - return NULL; - } - } - } - - f = bch2_sb_field_get_id(sb->sb, type); - f = __bch2_sb_field_resize(sb, f, u64s); - if (f) - f->type = cpu_to_le32(type); - return f; -} - -struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb, - enum bch_sb_field_type type, - unsigned u64s) -{ - struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); - - if (!f || le32_to_cpu(f->u64s) < u64s) - f = bch2_sb_field_resize_id(sb, type, u64s); - return f; -} - -/* Superblock validate: */ - -static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) -{ - u64 offset, prev_offset, max_sectors; - unsigned i; - - BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); - - if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && - !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { - prt_printf(out, "Not a bcachefs superblock layout"); - return -BCH_ERR_invalid_sb_layout; - } - - if (layout->layout_type != 0) { - prt_printf(out, "Invalid superblock layout type %u", - layout->layout_type); - return -BCH_ERR_invalid_sb_layout_type; - } - - if (!layout->nr_superblocks) { - prt_printf(out, "Invalid superblock layout: no superblocks"); - return -BCH_ERR_invalid_sb_layout_nr_superblocks; - } - - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { - prt_printf(out, "Invalid superblock layout: too many superblocks"); - return -BCH_ERR_invalid_sb_layout_nr_superblocks; - } - - if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { - prt_printf(out, "Invalid superblock layout: max_size_bits too high"); - return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; - } - - max_sectors = 1 << layout->sb_max_size_bits; - - prev_offset = le64_to_cpu(layout->sb_offset[0]); - - for (i = 1; i < layout->nr_superblocks; i++) { - offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset < prev_offset + max_sectors) { - prt_printf(out, "Invalid superblock layout: superblocks overlap\n" - " (sb %u ends at %llu next starts at %llu", - i - 1, prev_offset + max_sectors, offset); - return -BCH_ERR_invalid_sb_layout_superblocks_overlap; - } - prev_offset = offset; - } - - return 0; -} - -static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) -{ - u16 version = le16_to_cpu(sb->version); - u16 version_min = le16_to_cpu(sb->version_min); - - if (!bch2_version_compatible(version)) { - prt_str(out, "Unsupported superblock version "); - bch2_version_to_text(out, version); - prt_str(out, " (min "); - bch2_version_to_text(out, bcachefs_metadata_version_min); - prt_str(out, ", max "); - bch2_version_to_text(out, bcachefs_metadata_version_current); - prt_str(out, ")"); - return -BCH_ERR_invalid_sb_version; - } - - if (!bch2_version_compatible(version_min)) { - prt_str(out, "Unsupported superblock version_min "); - bch2_version_to_text(out, version_min); - prt_str(out, " (min "); - bch2_version_to_text(out, bcachefs_metadata_version_min); - prt_str(out, ", max "); - bch2_version_to_text(out,
bcachefs_metadata_version_current); - prt_str(out, ")"); - return -BCH_ERR_invalid_sb_version; - } - - if (version_min > version) { - prt_str(out, "Bad minimum version "); - bch2_version_to_text(out, version_min); - prt_str(out, ", greater than version field "); - bch2_version_to_text(out, version); - return -BCH_ERR_invalid_sb_version; - } - - return 0; -} - -int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, - enum bch_validate_flags flags, struct printbuf *out) -{ - enum bch_opt_id opt_id; - int ret; - - ret = bch2_sb_compatible(sb, out); - if (ret) - return ret; - - u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); - unsigned incompat_bit = 0; - if (incompat) - incompat_bit = __ffs64(incompat); - else if (sb->features[1]) - incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); - - if (incompat_bit) { - prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", - incompat_bit, - bch2_sb_features[BCH_FEATURE_NR - 1], - BCH_FEATURE_NR - 1); - return -BCH_ERR_invalid_sb_features; - } - - if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || - BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_str(out, "Filesystem has incompatible version "); - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_str(out, ", current version "); - bch2_version_to_text(out, bcachefs_metadata_version_current); - return -BCH_ERR_invalid_sb_features; - } - - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { - prt_printf(out, "Bad user UUID (got zeroes)"); - return -BCH_ERR_invalid_sb_uuid; - } - - if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { - prt_printf(out, "Bad internal UUID (got zeroes)"); - return -BCH_ERR_invalid_sb_uuid; - } - - if (!(flags & BCH_VALIDATE_write) && - le64_to_cpu(sb->offset) != read_offset) { - prt_printf(out, "Bad sb offset (got %llu, read from %llu)", - le64_to_cpu(sb->offset), read_offset); - return -BCH_ERR_invalid_sb_offset; - } - - if (!sb->nr_devices || - sb->nr_devices > BCH_SB_MEMBERS_MAX) { - prt_printf(out, "Bad number of member devices %u (max %u)", - sb->nr_devices, BCH_SB_MEMBERS_MAX); - return -BCH_ERR_invalid_sb_too_many_members; - } - - if (sb->dev_idx >= sb->nr_devices) { - prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", - sb->dev_idx, sb->nr_devices); - return -BCH_ERR_invalid_sb_dev_idx; - } - - if (!sb->time_precision || - le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { - prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", - le32_to_cpu(sb->time_precision), NSEC_PER_SEC); - return -BCH_ERR_invalid_sb_time_precision; - } - - /* old versions didn't know to downgrade this field */ - if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); - - if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { - prt_printf(out, "Invalid version_incompat "); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); - prt_str(out, " > incompat_allowed "); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); - if (flags & BCH_VALIDATE_write) - return -BCH_ERR_invalid_sb_version; - else - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); - } - - if (sb->nr_devices > 1) - SET_BCH_SB_MULTI_DEVICE(sb, true); - - if (!flags) { - /* - * Been seeing a bug where these are getting inexplicably - * zeroed, so we're now validating them, but we have to be - * careful not to prevent people's 
filesystems from mounting: - */ - if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); - - if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && - !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) - SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) - SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); - - if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) - SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && - !BCH_SB_CSUM_ERR_RETRY_NR(sb)) - SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); - } - -#ifdef __KERNEL__ - if (!BCH_SB_SHARD_INUMS_NBITS(sb)) - SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); -#endif - - for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = bch2_opt_table + opt_id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, opt_id, -1); - - prt_printf(out, "Invalid option "); - ret = bch2_opt_validate(opt, v, out); - if (ret) - return ret; - - printbuf_reset(out); - } - } - - /* validate layout */ - ret = validate_sb_layout(&sb->layout, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (!f->u64s) { - prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - - if (vstruct_next(f) > vstruct_last(sb)) { - prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", - le32_to_cpu(f->type)); - return -BCH_ERR_invalid_sb_field_size; - } - } - - struct bch_sb_field *mi = - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: - bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); - - /* members must be validated first: */ - if (!mi) { - prt_printf(out, "Invalid superblock: member info area missing"); - return -BCH_ERR_invalid_sb_members_missing; - } - - ret = bch2_sb_field_validate(sb, mi, flags, out); - if (ret) - return ret; - - vstruct_for_each(sb, f) { - if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) - continue; - - ret = bch2_sb_field_validate(sb, f, flags, out); - if (ret) - return ret; - } - - if ((flags & BCH_VALIDATE_write) && - bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { - prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", - le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), - le64_to_cpu(sb->seq)); - return -BCH_ERR_invalid_sb_members_missing; - } - - return 0; -} - -/* device open: */ - -static unsigned long le_ulong_to_cpu(unsigned long v) -{ - return sizeof(unsigned long) == 8 - ? 
le64_to_cpu(v) - : le32_to_cpu(v); -} - -static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr) -{ - BUG_ON(nr & (BITS_PER_TYPE(long) - 1)); - - for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++) - dst[i] = le_ulong_to_cpu(src[i]); -} - -static void bch2_sb_update(struct bch_fs *c) -{ - struct bch_sb *src = c->disk_sb.sb; - - lockdep_assert_held(&c->sb_lock); - - c->sb.uuid = src->uuid; - c->sb.user_uuid = src->user_uuid; - c->sb.version = le16_to_cpu(src->version); - c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); - c->sb.version_incompat_allowed - = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); - c->sb.version_min = le16_to_cpu(src->version_min); - c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); - c->sb.nr_devices = src->nr_devices; - c->sb.clean = BCH_SB_CLEAN(src); - c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - - c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); - c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; - - /* XXX this is wrong, we need a 96 or 128 bit integer type */ - c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), - c->sb.nsec_per_time_unit); - c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); - - c->sb.features = le64_to_cpu(src->features[0]); - c->sb.compat = le64_to_cpu(src->compat[0]); - c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); - - memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) { - c->sb.recovery_passes_required = - bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - - le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, - sizeof(c->sb.errors_silent) * 8); - c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); - } - - for_each_member_device(c, ca) { - struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); - ca->mi = bch2_mi_to_cpu(&m); - } -} - -static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) -{ - struct bch_sb_field *src_f, *dst_f; - struct bch_sb *dst = dst_handle->sb; - unsigned i; - - dst->version = src->version; - dst->version_min = src->version_min; - dst->seq = src->seq; - dst->uuid = src->uuid; - dst->user_uuid = src->user_uuid; - memcpy(dst->label, src->label, sizeof(dst->label)); - - dst->block_size = src->block_size; - dst->nr_devices = src->nr_devices; - - dst->time_base_lo = src->time_base_lo; - dst->time_base_hi = src->time_base_hi; - dst->time_precision = src->time_precision; - dst->write_time = src->write_time; - - memcpy(dst->flags, src->flags, sizeof(dst->flags)); - memcpy(dst->features, src->features, sizeof(dst->features)); - memcpy(dst->compat, src->compat, sizeof(dst->compat)); - - for (i = 0; i < BCH_SB_FIELD_NR; i++) { - int d; - - if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) - continue; - - src_f = bch2_sb_field_get_id(src, i); - dst_f = bch2_sb_field_get_id(dst, i); - - d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - - (dst_f ? le32_to_cpu(dst_f->u64s) : 0); - if (d > 0) { - int ret = bch2_sb_realloc(dst_handle, - le32_to_cpu(dst_handle->sb->u64s) + d); - - if (ret) - return ret; - - dst = dst_handle->sb; - dst_f = bch2_sb_field_get_id(dst, i); - } - - dst_f = __bch2_sb_field_resize(dst_handle, dst_f, - src_f ? 
le32_to_cpu(src_f->u64s) : 0); - - if (src_f) - memcpy(dst_f, src_f, vstruct_bytes(src_f)); - } - - return 0; -} - -int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -{ - int ret; - - lockdep_assert_held(&c->sb_lock); - - ret = bch2_sb_realloc(&c->disk_sb, 0) ?: - __copy_super(&c->disk_sb, src) ?: - bch2_sb_replicas_to_cpu_replicas(c) ?: - bch2_sb_disk_groups_to_cpu(c); - if (ret) - return ret; - - bch2_sb_update(c); - return 0; -} - -int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -{ - return __copy_super(&ca->disk_sb, c->disk_sb.sb); -} - -/* read superblock: */ - -static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) -{ - size_t bytes; - int ret; -reread: - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = offset; - bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(err, "IO error: %i", ret); - return ret; - } - - if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && - !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_str(err, "Not a bcachefs superblock (got magic "); - pr_uuid(err, sb->sb->magic.b); - prt_str(err, ")"); - return -BCH_ERR_invalid_sb_magic; - } - - ret = bch2_sb_compatible(sb->sb, err); - if (ret) - return ret; - - bytes = vstruct_bytes(sb->sb); - - u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); - if (bytes > sb_size) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", - bytes, sb_size); - return -BCH_ERR_invalid_sb_too_big; - } - - if (bytes > sb->buffer_size) { - ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); - if (ret) - return ret; - goto reread; - } - - enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR || - bch2_csum_type_is_encryption(csum_type)) { - prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -BCH_ERR_invalid_sb_csum_type; - } - - /* XXX: verify MACs */ - struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) { - bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); - return -BCH_ERR_invalid_sb_csum; - } - - sb->seq = le64_to_cpu(sb->sb->seq); - - return 0; -} - -static int __bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb, bool ignore_notbchfs_msg) -{ - u64 offset = opt_get(*opts, sb); - struct bch_sb_layout layout; - struct printbuf err = PRINTBUF; - struct printbuf err2 = PRINTBUF; - __le64 *i; - int ret; -#ifndef __KERNEL__ -retry: -#endif - memset(sb, 0, sizeof(*sb)); - sb->mode = BLK_OPEN_READ; - sb->have_bio = true; - sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); - if (!sb->holder) - return -ENOMEM; - - sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) { - ret = -ENOMEM; - prt_printf(&err, "error allocating memory for sb_name"); - goto err; - } - -#ifndef __KERNEL__ - if (opt_get(*opts, direct_io) == false) - sb->mode |= BLK_OPEN_BUFFERED; -#endif - - if (!opt_get(*opts, noexcl)) - sb->mode |= BLK_OPEN_EXCL; - - if (!opt_get(*opts, nochanges)) - sb->mode |= BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->s_bdev_file) && - PTR_ERR(sb->s_bdev_file) == -EACCES && - opt_get(*opts, read_only)) { - sb->mode &= ~BLK_OPEN_WRITE; - - sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->s_bdev_file)) - 
opt_set(*opts, nochanges, true); - } - - if (IS_ERR(sb->s_bdev_file)) { - ret = PTR_ERR(sb->s_bdev_file); - prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); - goto err; - } - sb->bdev = file_bdev(sb->s_bdev_file); - - ret = bch2_sb_realloc(sb, 0); - if (ret) { - prt_printf(&err, "error allocating memory for superblock"); - goto err; - } - - if (bch2_fs_init_fault("read_super")) { - prt_printf(&err, "dynamic fault"); - ret = -EFAULT; - goto err; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - - if (opt_defined(*opts, sb)) - goto err; - - prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", - path, err.buf); - if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - bch2_print_opts(opts, KERN_INFO "%s", err2.buf); - else - bch2_print_opts(opts, KERN_ERR "%s", err2.buf); - - printbuf_exit(&err2); - printbuf_reset(&err); - - /* - * Error reading primary superblock - read location of backup - * superblocks: - */ - bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; - /* - * use sb buffer to read layout, since sb buffer is page aligned but - * layout won't be: - */ - bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - - ret = submit_bio_wait(sb->bio); - if (ret) { - prt_printf(&err, "IO error: %i", ret); - goto err; - } - - memcpy(&layout, sb->sb, sizeof(layout)); - ret = validate_sb_layout(&layout, &err); - if (ret) - goto err; - - for (i = layout.sb_offset; - i < layout.sb_offset + layout.nr_superblocks; i++) { - offset = le64_to_cpu(*i); - - if (offset == opt_get(*opts, sb)) { - ret = -BCH_ERR_invalid; - continue; - } - - ret = read_one_super(sb, offset, &err); - if (!ret) - goto got_super; - } - - goto err; - -got_super: - if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev) && - opt_get(*opts, direct_io)) { -#ifndef __KERNEL__ - opt_set(*opts, direct_io, false); - bch2_free_super(sb); - goto retry; -#endif - prt_printf(&err, "block size (%u) smaller than device block size (%u)", - le16_to_cpu(sb->sb->block_size) << 9, - bdev_logical_block_size(sb->bdev)); - ret = -BCH_ERR_block_size_too_small; - goto err; - } - - sb->have_layout = true; - - ret = bch2_sb_validate(sb->sb, offset, 0, &err); - if (ret) { - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); - goto err_no_print; - } -out: - printbuf_exit(&err); - return ret; -err: - bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); -err_no_print: - bch2_free_super(sb); - goto out; -} - -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, false); -} - -/* provide a silenced version for mount.bcachefs */ - -int bch2_read_super_silent(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) -{ - return __bch2_read_super(path, opts, sb, true); -} - -/* write superblock: */ - -static void write_super_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - - bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); - - /* XXX: return errors directly */ - - if (bio->bi_status) { - bch_err_dev_ratelimited(ca, "superblock %s error: %s", - str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status)); - ca->sb_write_error = 1; - } - - closure_put(&ca->fs->sb_write); - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); -} - -static void 
read_back_super(struct bch_fs *c, struct bch_dev *ca) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -{ - struct bch_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - sb->offset = sb->layout.sb_offset[idx]; - - SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); - sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), - null_nonce(), sb); - - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - bch2_bio_map(bio, sb, - roundup((size_t) vstruct_bytes(sb), - bdev_logical_block_size(ca->disk_sb.bdev))); - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], - bio_sectors(bio)); - - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - closure_bio_submit(bio, &c->sb_write); -} - -int bch2_write_super(struct bch_fs *c) -{ - struct closure *cl = &c->sb_write; - struct printbuf err = PRINTBUF; - unsigned sb = 0, nr_wrote; - struct bch_devs_mask sb_written; - bool wrote, can_mount_without_written, can_mount_with_written; - unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; - DARRAY(struct bch_dev *) online_devices = {}; - int ret = 0; - - trace_and_count(c, write_super, c, _RET_IP_); - - if (c->opts.degraded == BCH_DEGRADED_very) - degraded_flags |= BCH_FORCE_IF_LOST; - - lockdep_assert_held(&c->sb_lock); - - closure_init_stack(cl); - memset(&sb_written, 0, sizeof(sb_written)); - - /* - * Note: we do writes to RO devices here, and we might want to change - * that in the future. 
- * - * For now, we expect to be able to call write_super() when we're not - * yet RW: - */ - for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { - ret = darray_push(&online_devices, ca); - if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - goto out; - } - enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); - } - - /* Make sure we're using the new magic numbers: */ - c->disk_sb.sb->magic = BCHFS_MAGIC; - c->disk_sb.sb->layout.magic = BCHFS_MAGIC; - - le64_add_cpu(&c->disk_sb.sb->seq, 1); - - struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - darray_for_each(online_devices, ca) - __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq; - c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); - - if (test_bit(BCH_FS_error, &c->flags)) - SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_topology_error, &c->flags)) - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); - - SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); - - bch2_sb_counters_from_cpu(c); - bch2_sb_members_from_cpu(c); - bch2_sb_members_cpy_v2_v1(&c->disk_sb); - bch2_sb_errors_from_cpu(c); - bch2_sb_downgrade_update(c); - - darray_for_each(online_devices, ca) - bch2_sb_from_fs(c, (*ca)); - - darray_for_each(online_devices, ca) { - printbuf_reset(&err); - - ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); - if (ret) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); - goto out; - } - } - - if (c->opts.nochanges) - goto out; - - /* - * Defer writing the superblock until filesystem initialization is - * complete - don't write out a partly initialized superblock: - */ - if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) - goto out; - - if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { - struct printbuf buf = PRINTBUF; - prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); - bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); - prt_str(&buf, " > "); - bch2_version_to_text(&buf, bcachefs_metadata_version_current); - prt_str(&buf, ")"); - bch2_fs_fatal_error(c, ": %s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, sb_not_downgraded); - goto out; - } - - darray_for_each(online_devices, ca) { - __set_bit((*ca)->dev_idx, sb_written.d); - (*ca)->sb_write_error = 0; - } - - darray_for_each(online_devices, ca) - read_back_super(c, *ca); - closure_sync(cl); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - - if (ca->sb_write_error) - continue; - - if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock write was silently dropped! 
(seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - - if (c->opts.errors != BCH_ON_ERROR_continue && - c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = bch_err_throw(c, erofs_sb_err); - bch2_fs_fatal_error(c, "%s", buf.buf); - } else { - bch_err(c, "%s", buf.buf); - } - - printbuf_exit(&buf); - } - - if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - struct printbuf buf = PRINTBUF; - prt_char(&buf, ' '); - prt_bdevname(&buf, ca->disk_sb.bdev); - prt_printf(&buf, - ": Superblock modified by another process (seq %llu expected %llu)", - le64_to_cpu(ca->sb_read_scratch->seq), - ca->disk_sb.seq); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = bch_err_throw(c, erofs_sb_err); - } - } - - if (ret) - goto out; - - do { - wrote = false; - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (!ca->sb_write_error && - sb < ca->disk_sb.sb->layout.nr_superblocks) { - write_one_super(c, ca, sb); - wrote = true; - } - } - closure_sync(cl); - sb++; - } while (wrote); - - darray_for_each(online_devices, cap) { - struct bch_dev *ca = *cap; - if (ca->sb_write_error) - __clear_bit(ca->dev_idx, sb_written.d); - else - ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); - } - - nr_wrote = dev_mask_nr(&sb_written); - - can_mount_with_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) - sb_written.d[i] = ~sb_written.d[i]; - - can_mount_without_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, false); - - /* - * If we would be able to mount _without_ the devices we successfully - * wrote superblocks to, we weren't able to write to enough devices: - * - * Exception: if we can mount without the successes because we haven't - * written anything (new filesystem), we continue if we'd be able to - * mount with the devices we did successfully write to: - */ - if (bch2_fs_fatal_err_on(!nr_wrote || - !can_mount_with_written || - (can_mount_without_written && - !can_mount_with_written), c, - ": Unable to write superblock to sufficient devices (from %ps)", - (void *) _RET_IP_)) - ret = bch_err_throw(c, erofs_sb_err); -out: - /* Make new options visible after they're persistent: */ - bch2_sb_update(c); - darray_for_each(online_devices, ca) - enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); - darray_exit(&online_devices); - printbuf_exit(&err); - return ret; -} - -void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - mutex_lock(&c->sb_lock); - if (!(c->sb.features & (1ULL << feat))) { - c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); - - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} - -/* Downgrade if superblock is at a higher version than currently supported: */ -bool bch2_check_version_downgrade(struct bch_fs *c) -{ - bool ret = bcachefs_metadata_version_current < c->sb.version; - - lockdep_assert_held(&c->sb_lock); - - /* - * Downgrade, if superblock is at a higher version than currently - * supported: - * - * c->sb will be checked before we write the superblock, so update it as - * well: - */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > 
bcachefs_metadata_version_current) - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) - c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); - return ret; -} - -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) -{ - lockdep_assert_held(&c->sb_lock); - - if (BCH_VERSION_MAJOR(new_version) > - BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) - bch2_sb_field_resize(&c->disk_sb, downgrade, 0); - - c->disk_sb.sb->version = cpu_to_le16(new_version); - - if (incompat) { - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - } -} - -void bch2_sb_upgrade_incompat(struct bch_fs *c) -{ - mutex_lock(&c->sb_lock); - if (c->sb.version == c->sb.version_incompat_allowed) - goto unlock; - - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Now allowing incompatible features up to "); - bch2_version_to_text(&buf, c->sb.version); - prt_str(&buf, ", previously allowed up to "); - bch2_version_to_text(&buf, c->sb.version_incompat_allowed); - prt_newline(&buf); - - bch_notice(c, "%s", buf.buf); - printbuf_exit(&buf); - - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); - bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - if (vstruct_bytes(f) < 88) { - prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); - return -BCH_ERR_invalid_sb_ext; - } - - return 0; -} - -static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_ext *e = field_to_type(f, ext); - - prt_printf(out, "Recovery passes required:\t"); - prt_bitflags(out, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); - prt_newline(out); - - unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL); - if (errors_silent) { - le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - - prt_printf(out, "Errors to silently fix:\t"); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, - min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); - prt_newline(out); - - kfree(errors_silent); - } - - prt_printf(out, "Btrees with missing data:\t"); - prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); - prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_ext = { - .validate = bch2_sb_ext_validate, - .to_text = bch2_sb_ext_to_text, -}; - -static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -#define x(f, nr) \ - [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, - BCH_SB_FIELDS() -#undef x -}; - -static const struct bch_sb_field_ops bch2_sb_field_null_ops; - -static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) -{ - return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) - ? 
bch2_sb_field_ops[type] - : &bch2_sb_field_null_ops; -} - -static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - enum bch_validate_flags flags, struct printbuf *err) -{ - unsigned type = le32_to_cpu(f->type); - struct printbuf field_err = PRINTBUF; - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - int ret; - - ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; - if (ret) { - prt_printf(err, "Invalid superblock section %s: %s", - bch2_sb_fields[type], field_err.buf); - prt_newline(err); - bch2_sb_field_to_text(err, sb, f); - } - - printbuf_exit(&field_err); - return ret; -} - -void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); - - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); - - if (ops->to_text) - ops->to_text(out, sb, f); -} - -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - unsigned type = le32_to_cpu(f->type); - - if (type < BCH_SB_FIELD_NR) - prt_printf(out, "%s", bch2_sb_fields[type]); - else - prt_printf(out, "(unknown field %u)", type); - - prt_printf(out, " (size %zu):", vstruct_bytes(f)); - prt_newline(out); - - __bch2_sb_field_to_text(out, sb, f); -} - -void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) -{ - unsigned i; - - prt_printf(out, "Type: %u", l->layout_type); - prt_newline(out); - - prt_str(out, "Superblock max size: "); - prt_units_u64(out, 512 << l->sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); - prt_newline(out); - - prt_str(out, "Offsets: "); - for (i = 0; i < l->nr_superblocks; i++) { - if (i) - prt_str(out, ", "); - prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); - } - prt_newline(out); -} - -void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, - bool print_layout, unsigned fields) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 44); - - prt_printf(out, "External UUID:\t"); - pr_uuid(out, sb->user_uuid.b); - prt_newline(out); - - prt_printf(out, "Internal UUID:\t"); - pr_uuid(out, sb->uuid.b); - prt_newline(out); - - prt_printf(out, "Magic number:\t"); - pr_uuid(out, sb->magic.b); - prt_newline(out); - - prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - - prt_printf(out, "Label:\t"); - if (!strlen(sb->label)) - prt_printf(out, "(none)"); - else - prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); - prt_newline(out); - - prt_printf(out, "Version:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version)); - prt_newline(out); - - prt_printf(out, "Incompatible features allowed:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); - prt_newline(out); - - prt_printf(out, "Incompatible features in use:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); - prt_newline(out); - - prt_printf(out, "Version upgrade complete:\t"); - bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); - prt_newline(out); - - prt_printf(out, "Oldest version on disk:\t"); - bch2_version_to_text(out, le16_to_cpu(sb->version_min)); - prt_newline(out); - - prt_printf(out, "Created:\t"); - if (sb->time_base_lo) - bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); - else - prt_printf(out, "(not set)"); - prt_newline(out); - - prt_printf(out, "Sequence number:\t"); - prt_printf(out, "%llu", le64_to_cpu(sb->seq)); - prt_newline(out); - - 
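/*
 * Editor's note -- illustrative only, not part of the original patch: with
 * the 44-column tabstop pushed at the top of bch2_sb_to_text(), the
 * "label:\tvalue" pairs printed here render as an aligned table, roughly
 * (values invented for illustration):
 *
 *	External UUID:                              01234567-...
 *	Version:                                    1.13: (unknown version)
 *	Sequence number:                            42
 */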
prt_printf(out, "Time of last write:\t"); - bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); - prt_newline(out); - - prt_printf(out, "Superblock size:\t"); - prt_units_u64(out, vstruct_bytes(sb)); - prt_str(out, "/"); - prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); - prt_newline(out); - - prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); - - prt_printf(out, "Sections:\t"); - u64 fields_have = 0; - vstruct_for_each(sb, f) - fields_have |= 1 << le32_to_cpu(f->type); - prt_bitflags(out, bch2_sb_fields, fields_have); - prt_newline(out); - - prt_printf(out, "Features:\t"); - prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); - prt_newline(out); - - prt_printf(out, "Compat features:\t"); - prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); - prt_newline(out); - - prt_newline(out); - prt_printf(out, "Options:"); - prt_newline(out); - printbuf_indent_add(out, 2); - { - enum bch_opt_id id; - - for (id = 0; id < bch2_opts_nr; id++) { - const struct bch_option *opt = bch2_opt_table + id; - - if (opt->get_sb) { - u64 v = bch2_opt_from_sb(sb, id, -1); - - prt_printf(out, "%s:\t", opt->attr.name); - bch2_opt_to_text(out, NULL, sb, opt, v, - OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); - prt_newline(out); - } - } - } - - printbuf_indent_sub(out, 2); - - if (print_layout) { - prt_newline(out); - prt_printf(out, "layout:"); - prt_newline(out); - printbuf_indent_add(out, 2); - bch2_sb_layout_to_text(out, &sb->layout); - printbuf_indent_sub(out, 2); - } - - vstruct_for_each(sb, f) - if (fields & (1 << le32_to_cpu(f->type))) { - prt_newline(out); - bch2_sb_field_to_text(out, sb, f); - } -} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h deleted file mode 100644 index a3b7a90f2533db..00000000000000 --- a/fs/bcachefs/super-io.h +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_IO_H -#define _BCACHEFS_SUPER_IO_H - -#include "extents.h" -#include "eytzinger.h" -#include "super_types.h" -#include "super.h" -#include "sb-members.h" - -#include - -#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 - -static inline bool bch2_version_compatible(u16 version) -{ - return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && - version >= bcachefs_metadata_version_min; -} - -void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); -enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); - -int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); - -static inline int bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) -{ - return likely(version <= c->sb.version_incompat) - ? 
0 - : bch2_set_version_incompat(c, version); -} - -static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) -{ - return le32_to_cpu(f->u64s) * sizeof(u64); -} - -#define field_to_type(_f, _name) \ - container_of_or_null(_f, struct bch_sb_field_##_name, field) - -struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); -#define bch2_sb_field_get(_sb, _name) \ - field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) - -struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_resize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *, - enum bch_sb_field_type, unsigned); -#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \ - field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) - -#define bch2_sb_field_nr_entries(_f) \ - (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \ - sizeof(_f->entries[0])) \ - : 0) - -void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); - -extern const char * const bch2_sb_fields[]; - -struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, - enum bch_validate_flags, struct printbuf *); - void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); -}; - -static inline __le64 bch2_sb_magic(struct bch_fs *c) -{ - __le64 ret; - - memcpy(&ret, &c->sb.uuid, sizeof(ret)); - return ret; -} - -static inline __u64 jset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -} - -static inline __u64 bset_magic(struct bch_fs *c) -{ - return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -} - -int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); - -void bch2_free_super(struct bch_sb_handle *); -int bch2_sb_realloc(struct bch_sb_handle *, unsigned); - -int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); - -int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); -int bch2_write_super(struct bch_fs *); -void __bch2_check_set_feature(struct bch_fs *, unsigned); - -static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -{ - if (!(c->sb.features & (1ULL << feat))) - __bch2_check_set_feature(c, feat); -} - -bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); -void bch2_sb_upgrade_incompat(struct bch_fs *); - -void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); -void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); -void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); - -#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c deleted file mode 100644 index c46b1053a02c90..00000000000000 --- a/fs/bcachefs/super.c +++ /dev/null @@ -1,2547 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcachefs setup/teardown code, and some metadata io - read a superblock and - * figure out what to do with it. - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. 
- */ - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "async_objs.h" -#include "backpointers.h" -#include "bkey_sort.h" -#include "btree_cache.h" -#include "btree_gc.h" -#include "btree_journal_iter.h" -#include "btree_key_cache.h" -#include "btree_node_scan.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "btree_write_buffer.h" -#include "buckets_waiting_for_journal.h" -#include "chardev.h" -#include "checksum.h" -#include "clock.h" -#include "compress.h" -#include "debug.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "errcode.h" -#include "error.h" -#include "fs.h" -#include "fs-io.h" -#include "fs-io-buffered.h" -#include "fs-io-direct.h" -#include "fsck.h" -#include "inode.h" -#include "io_read.h" -#include "io_write.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "journal_seq_blacklist.h" -#include "move.h" -#include "migrate.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "quota.h" -#include "rebalance.h" -#include "recovery.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-clean.h" -#include "sb-counters.h" -#include "sb-errors.h" -#include "sb-members.h" -#include "snapshot.h" -#include "subvolume.h" -#include "super.h" -#include "super-io.h" -#include "sysfs.h" -#include "thread_with_file.h" -#include "trace.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet "); -MODULE_DESCRIPTION("bcachefs filesystem"); - -typedef DARRAY(struct bch_sb_handle) bch_sb_handles; - -#define x(n) #n, -const char * const bch2_fs_flag_strs[] = { - BCH_FS_FLAGS() - NULL -}; - -const char * const bch2_write_refs[] = { - BCH_WRITE_REFS() - NULL -}; - -const char * const bch2_dev_read_refs[] = { - BCH_DEV_READ_REFS() - NULL -}; - -const char * const bch2_dev_write_refs[] = { - BCH_DEV_WRITE_REFS() - NULL -}; -#undef x - -static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str) -{ -#ifdef __KERNEL__ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - if (unlikely(stdio)) { - bch2_stdio_redirect_printf(stdio, true, "%s", str); - return; - } -#endif - bch2_print_string_as_lines(KERN_ERR, str); -} - -void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str); -} - -__printf(2, 0) -static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) -{ -#ifdef __KERNEL__ - if (unlikely(stdio)) { - if (fmt[0] == KERN_SOH[0]) - fmt += 2; - - bch2_stdio_redirect_vprintf(stdio, true, fmt, args); - return; - } -#endif - vprintk(fmt, args); -} - -void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) -{ - struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
-{ - struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); - - va_list args; - va_start(args, fmt); - bch2_print_maybe_redirect(stdio, fmt, args); - va_end(args); -} - -#define KTYPE(type) \ -static const struct attribute_group type ## _group = { \ - .attrs = type ## _files \ -}; \ - \ -static const struct attribute_group *type ## _groups[] = { \ - &type ## _group, \ - NULL \ -}; \ - \ -static const struct kobj_type type ## _ktype = { \ - .release = type ## _release, \ - .sysfs_ops = &type ## _sysfs_ops, \ - .default_groups = type ## _groups \ -} - -static void bch2_fs_release(struct kobject *); -static void bch2_dev_release(struct kobject *); -static void bch2_fs_counters_release(struct kobject *k) -{ -} - -static void bch2_fs_internal_release(struct kobject *k) -{ -} - -static void bch2_fs_opts_dir_release(struct kobject *k) -{ -} - -static void bch2_fs_time_stats_release(struct kobject *k) -{ -} - -KTYPE(bch2_fs); -KTYPE(bch2_fs_counters); -KTYPE(bch2_fs_internal); -KTYPE(bch2_fs_opts_dir); -KTYPE(bch2_fs_time_stats); -KTYPE(bch2_dev); - -static struct kset *bcachefs_kset; -static LIST_HEAD(bch_fs_list); -static DEFINE_MUTEX(bch_fs_list_lock); - -DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); - -static void bch2_dev_unlink(struct bch_dev *); -static void bch2_dev_free(struct bch_dev *); -static int bch2_dev_alloc(struct bch_fs *, unsigned); -static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -static void bch2_dev_io_ref_stop(struct bch_dev *, int); -static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); - -struct bch_fs *bch2_dev_to_fs(dev_t dev) -{ - guard(mutex)(&bch_fs_list_lock); - guard(rcu)(); - - struct bch_fs *c; - list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(c, ca, NULL) - if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { - closure_get(&c->cl); - return c; - } - return NULL; -} - -static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - lockdep_assert_held(&bch_fs_list_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) - return c; - - return NULL; -} - -struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) -{ - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(uuid); - if (c) - closure_get(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return c; -} - -/* Filesystem RO/RW: */ - -/* - * For startup/shutdown of RW stuff, the dependencies are: - * - * - foreground writes depend on copygc and rebalance (to free up space) - * - * - copygc and rebalance depend on mark and sweep gc (they actually probably - * don't because they either reserve ahead of time or don't block if - * allocations fail, but allocations can require mark and sweep gc to run - * because of generation number wraparound) - * - * - all of the above depends on the allocator threads - * - * - allocator depends on the journal (when it rewrites prios and gens) - */ - -static void __bch2_fs_read_only(struct bch_fs *c) -{ - unsigned clean_passes = 0; - u64 seq = 0; - - bch2_fs_ec_stop(c); - bch2_open_buckets_stop(c, NULL, true); - bch2_rebalance_stop(c); - bch2_copygc_stop(c); - bch2_fs_ec_flush(c); - - bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", - journal_cur_seq(&c->journal)); - - do { - clean_passes++; - - if (bch2_btree_interior_updates_flush(c) || - bch2_btree_write_buffer_flush_going_ro(c) || - bch2_journal_flush_all_pins(&c->journal) || - bch2_btree_flush_all_writes(c) || - seq != 
atomic64_read(&c->journal.seq)) { - seq = atomic64_read(&c->journal.seq); - clean_passes = 0; - } - } while (clean_passes < 2); - - bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", - journal_cur_seq(&c->journal)); - - if (test_bit(JOURNAL_replay_done, &c->journal.flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags)) - set_bit(BCH_FS_clean_shutdown, &c->flags); - - bch2_fs_journal_stop(&c->journal); - - bch_info(c, "%sclean shutdown complete, journal seq %llu", - test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", - c->journal.seq_ondisk); - - /* - * After stopping journal: - */ - for_each_member_device(c, ca) { - bch2_dev_io_ref_stop(ca, WRITE); - bch2_dev_allocator_remove(c, ca); - } -} - -static void bch2_writes_disabled(struct enumerated_ref *writes) -{ - struct bch_fs *c = container_of(writes, struct bch_fs, writes); - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -} - -void bch2_fs_read_only(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags)) { - bch2_journal_reclaim_stop(&c->journal); - return; - } - - BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); - - bch_verbose(c, "going read-only"); - - /* - * Block new foreground-end write operations from starting - any new - * writes will return -EROFS: - */ - set_bit(BCH_FS_going_ro, &c->flags); - enumerated_ref_stop_async(&c->writes); - - /* - * If we're not doing an emergency shutdown, we want to wait on - * outstanding writes to complete so they don't see spurious errors due - * to shutting down the allocator: - * - * If we are doing an emergency shutdown outstanding writes may - * hang until we shutdown the allocator so we don't want to wait - * on outstanding writes before shutting everything down - but - * we do need to wait on them before returning and signalling - * that going RO is complete: - */ - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags) || - test_bit(BCH_FS_emergency_ro, &c->flags)); - - bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); - if (writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - __bch2_fs_read_only(c); - - wait_event(bch2_read_only_wait, - test_bit(BCH_FS_write_disable_complete, &c->flags)); - - if (!writes_disabled) - bch_verbose(c, "finished waiting for writes to stop"); - - clear_bit(BCH_FS_write_disable_complete, &c->flags); - clear_bit(BCH_FS_going_ro, &c->flags); - clear_bit(BCH_FS_rw, &c->flags); - - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_error, &c->flags) && - !test_bit(BCH_FS_emergency_ro, &c->flags) && - test_bit(BCH_FS_started, &c->flags) && - test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { - BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); - BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.inc.keys.nr); - BUG_ON(c->btree_write_buffer.flushing.keys.nr); - bch2_verify_accounting_clean(c); - - bch_verbose(c, "marking filesystem clean"); - bch2_fs_mark_clean(c); - } else { - /* Make sure error counts/counters are persisted */ - mutex_lock(&c->sb_lock); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - bch_verbose(c, "done going read-only, filesystem not clean"); - } -} - -static void bch2_fs_read_only_work(struct work_struct *work) -{ - struct bch_fs *c = - container_of(work, struct bch_fs, read_only_work); - - 
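The flush loop in __bch2_fs_read_only() above only counts a pass as clean when no subsystem reported work and the journal sequence number did not move; any progress resets the count, so shutdown requires two consecutive quiet passes before it trusts that the filesystem is idle. A minimal userspace model of that convergence pattern follows — the flush hooks are hypothetical stand-ins, not the bcachefs functions:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the per-subsystem flush hooks: each returns
 * true if it found (and flushed) work, false if it was already clean. */
static bool flush_interior_updates(void) { static int n = 3; return n-- > 0; }
static bool flush_journal_pins(void)     { static int n = 1; return n-- > 0; }

static void quiesce(void)
{
	unsigned clean_passes = 0;

	do {
		clean_passes++;
		/* Any pass that finds work restarts the count, because
		 * flushing one subsystem can queue work for another. */
		if (flush_interior_updates() ||
		    flush_journal_pins())
			clean_passes = 0;
	} while (clean_passes < 2);

	printf("quiesced after draining all work\n");
}

int main(void)
{
	quiesce();
	return 0;
}

Requiring two clean passes rather than one covers exactly the cross-feeding case the comment in the real loop is guarding against.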
down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); -} - -static void bch2_fs_read_only_async(struct bch_fs *c) -{ - queue_work(system_long_wq, &c->read_only_work); -} - -bool bch2_fs_emergency_read_only(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, - bool locked) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - if (!locked) - bch2_journal_halt(&c->journal); - else - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - wake_up(&bch2_read_only_wait); - - if (ret) - prt_printf(out, "emergency read only at seq %llu\n", - journal_cur_seq(&c->journal)); - - return ret; -} - -bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) -{ - return __bch2_fs_emergency_read_only2(c, out, false); -} - -bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) -{ - bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - - bch2_journal_halt_locked(&c->journal); - bch2_fs_read_only_async(c); - - wake_up(&bch2_read_only_wait); - return ret; -} - -static int __bch2_fs_read_write(struct bch_fs *c, bool early) -{ - int ret; - - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - - if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return bch_err_throw(c, erofs_no_alloc_info); - - if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { - bch_err(c, "cannot go rw, unfixed btree errors"); - return bch_err_throw(c, erofs_unfixed_errors); - } - - if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { - bch_err(c, "cannot go rw, filesystem is an unresized image file"); - return bch_err_throw(c, erofs_filesystem_full); - } - - if (test_bit(BCH_FS_rw, &c->flags)) - return 0; - - bch_info(c, "going read-write"); - - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - - ret = bch2_sb_members_v2_init(c); - if (ret) - goto err; - - clear_bit(BCH_FS_clean_shutdown, &c->flags); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - bch2_dev_allocator_add(c, ca); - enumerated_ref_start(&ca->io_ref[WRITE]); - } - - bch2_recalc_capacity(c); - - /* - * First journal write must be a flush write: after a clean shutdown we - * don't read the journal, so the first journal write may end up - * overwriting whatever was there previously, and there must always be - * at least one non-flush write in the journal or recovery will fail: - */ - spin_lock(&c->journal.lock); - set_bit(JOURNAL_need_flush_write, &c->journal.flags); - set_bit(JOURNAL_running, &c->journal.flags); - bch2_journal_space_available(&c->journal); - spin_unlock(&c->journal.lock); - - ret = bch2_fs_mark_dirty(c); - if (ret) - goto err; - - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - - set_bit(BCH_FS_rw, &c->flags); - set_bit(BCH_FS_was_rw, &c->flags); - - enumerated_ref_start(&c->writes); - - ret = bch2_copygc_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting copygc thread"); - goto err; - } - - ret = bch2_rebalance_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting rebalance thread"); - goto err; - } - - bch2_do_discards(c); - bch2_do_invalidates(c); - bch2_do_stripe_deletes(c); - bch2_do_pending_node_rewrites(c); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags)) - bch2_fs_read_only(c); - else - __bch2_fs_read_only(c); - 
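bch2_fs_emergency_read_only() above returns whether this caller performed the transition: test_and_set_bit() atomically sets BCH_FS_emergency_ro and reports the prior value, so exactly one of any number of racing callers sees true and gets to report the event. A standalone C11 sketch of that claim-once idiom, assuming nothing from the kernel:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag emergency_ro = ATOMIC_FLAG_INIT;

/* Mirrors !test_and_set_bit(BCH_FS_emergency_ro, &c->flags): returns true
 * only for the single caller that actually performed the transition. */
static bool claim_emergency_ro(void)
{
	return !atomic_flag_test_and_set(&emergency_ro);
}

int main(void)
{
	printf("first caller:  %d\n", claim_emergency_ro());	/* 1 */
	printf("second caller: %d\n", claim_emergency_ro());	/* 0 */
	return 0;
}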
return ret; -} - -int bch2_fs_read_write(struct bch_fs *c) -{ - if (c->opts.recovery_pass_last && - c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return bch_err_throw(c, erofs_norecovery); - - if (c->opts.nochanges) - return bch_err_throw(c, erofs_nochanges); - - if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - return bch_err_throw(c, erofs_no_alloc_info); - - return __bch2_fs_read_write(c, false); -} - -int bch2_fs_read_write_early(struct bch_fs *c) -{ - down_write(&c->state_lock); - int ret = __bch2_fs_read_write(c, true); - up_write(&c->state_lock); - - return ret; -} - -/* Filesystem startup/shutdown: */ - -static void __bch2_fs_free(struct bch_fs *c) -{ - for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_exit(&c->times[i]); - -#ifdef CONFIG_UNICODE - utf8_unload(c->cf_encoding); -#endif - - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - bch2_free_pending_node_rewrites(c); - bch2_free_fsck_errs(c); - bch2_fs_vfs_exit(c); - bch2_fs_snapshots_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_rebalance_exit(c); - bch2_fs_quota_exit(c); - bch2_fs_nocow_locking_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_fs_fs_io_direct_exit(c); - bch2_fs_fs_io_buffered_exit(c); - bch2_fs_fsio_exit(c); - bch2_fs_io_write_exit(c); - bch2_fs_io_read_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_counters_exit(c); - bch2_fs_compress_exit(c); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_write_buffer_exit(c); - bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_iter_exit(c); - bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_cache_exit(c); - bch2_fs_accounting_exit(c); - bch2_fs_async_obj_exit(c); - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - - BUG_ON(atomic_read(&c->journal_keys.ref)); - percpu_free_rwsem(&c->mark_lock); - if (c->online_reserved) { - u64 v = percpu_u64_get(c->online_reserved); - WARN(v, "online_reserved not 0 at shutdown: %lli", v); - free_percpu(c->online_reserved); - } - - darray_exit(&c->incompat_versions_requested); - darray_exit(&c->btree_roots_extra); - free_percpu(c->pcpu); - free_percpu(c->usage); - mempool_exit(&c->large_bkey_pool); - mempool_exit(&c->btree_bounce_pool); - bioset_exit(&c->btree_bio); - mempool_exit(&c->fill_iter); - enumerated_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->disk_groups, 1)); - kfree(c->journal_seq_blacklist_table); - - if (c->write_ref_wq) - destroy_workqueue(c->write_ref_wq); - if (c->btree_write_submit_wq) - destroy_workqueue(c->btree_write_submit_wq); - if (c->btree_read_complete_wq) - destroy_workqueue(c->btree_read_complete_wq); - if (c->copygc_wq) - destroy_workqueue(c->copygc_wq); - if (c->btree_write_complete_wq) - destroy_workqueue(c->btree_write_complete_wq); - if (c->btree_update_wq) - destroy_workqueue(c->btree_update_wq); - - bch2_free_super(&c->disk_sb); - kvfree(c); - module_put(THIS_MODULE); -} - -static void bch2_fs_release(struct kobject *kobj) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - __bch2_fs_free(c); -} - -void __bch2_fs_stop(struct bch_fs *c) -{ - bch_verbose(c, "shutting down"); - - set_bit(BCH_FS_stopping, &c->flags); - - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], 
true); - if (ca) - bch2_dev_io_ref_stop(ca, READ); - } - - for_each_member_device(c, ca) - bch2_dev_unlink(ca); - - if (c->kobj.state_in_sysfs) - kobject_del(&c->kobj); - - bch2_fs_debug_exit(c); - bch2_fs_chardev_exit(c); - - bch2_ro_ref_put(c); - wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); - - kobject_put(&c->counters_kobj); - kobject_put(&c->time_stats); - kobject_put(&c->opts_dir); - kobject_put(&c->internal); - - /* btree prefetch might have kicked off reads in the background: */ - bch2_btree_flush_all_reads(c); - - for_each_member_device(c, ca) - cancel_work_sync(&ca->io_error_work); - - cancel_work_sync(&c->read_only_work); -} - -void bch2_fs_free(struct bch_fs *c) -{ - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - - for (unsigned i = 0; i < c->sb.nr_devices; i++) { - struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); - - if (ca) { - EBUG_ON(atomic_long_read(&ca->ref) != 1); - bch2_dev_io_ref_stop(ca, READ); - bch2_free_super(&ca->disk_sb); - bch2_dev_free(ca); - } - } - - bch_verbose(c, "shutdown complete"); - - kobject_put(&c->kobj); -} - -void bch2_fs_stop(struct bch_fs *c) -{ - __bch2_fs_stop(c); - bch2_fs_free(c); -} - -static int bch2_fs_online(struct bch_fs *c) -{ - int ret = 0; - - lockdep_assert_held(&bch_fs_list_lock); - - if (c->sb.multi_device && - __bch2_uuid_to_fs(c->sb.uuid)) { - bch_err(c, "filesystem UUID already open"); - return bch_err_throw(c, filesystem_uuid_already_open); - } - - ret = bch2_fs_chardev_init(c); - if (ret) { - bch_err(c, "error creating character device"); - return ret; - } - - bch2_fs_debug_init(c); - - ret = (c->sb.multi_device - ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) - : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: - kobject_add(&c->internal, &c->kobj, "internal") ?: - kobject_add(&c->opts_dir, &c->kobj, "options") ?: -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT - kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: -#endif - kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); - if (ret) { - bch_err(c, "error creating sysfs objects"); - return ret; - } - - down_write(&c->state_lock); - - for_each_member_device(c, ca) { - ret = bch2_dev_sysfs_online(c, ca); - if (ret) { - bch_err(c, "error creating sysfs objects"); - bch2_dev_put(ca); - goto err; - } - } - - BUG_ON(!list_empty(&c->list)); - list_add(&c->list, &bch_fs_list); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_init_rw(struct bch_fs *c) -{ - if (test_bit(BCH_FS_rw_init_done, &c->flags)) - return 0; - - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0))) - return bch_err_throw(c, ENOMEM_fs_other_alloc); - - int ret = bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal); - if (ret) - return ret; - - set_bit(BCH_FS_rw_init_done, 
&c->flags); - return 0; -} - -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, - bch_sb_handles *sbs) -{ - struct bch_fs *c; - struct printbuf name = PRINTBUF; - unsigned i, iter_size; - int ret = 0; - - c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) { - c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); - goto out; - } - - c->stdio = (void *)(unsigned long) opts->stdio; - - __module_get(THIS_MODULE); - - closure_init(&c->cl, NULL); - - c->kobj.kset = bcachefs_kset; - kobject_init(&c->kobj, &bch2_fs_ktype); - kobject_init(&c->internal, &bch2_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); - kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); - - c->minor = -1; - c->disk_sb.fs_sb = true; - - init_rwsem(&c->state_lock); - mutex_init(&c->sb_lock); - mutex_init(&c->replicas_gc_lock); - mutex_init(&c->btree_root_lock); - INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); - - refcount_set(&c->ro_ref, 1); - init_waitqueue_head(&c->ro_ref_wait); - - for (i = 0; i < BCH_TIME_STAT_NR; i++) - bch2_time_stats_init(&c->times[i]); - - bch2_fs_allocator_background_init(c); - bch2_fs_allocator_foreground_init(c); - bch2_fs_btree_cache_init_early(&c->btree_cache); - bch2_fs_btree_gc_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_write_buffer_init_early(c); - bch2_fs_copygc_init(c); - bch2_fs_ec_init_early(c); - bch2_fs_journal_init_early(&c->journal); - bch2_fs_journal_keys_init(c); - bch2_fs_move_init(c); - bch2_fs_nocow_locking_init_early(c); - bch2_fs_quota_init(c); - bch2_fs_recovery_passes_init(c); - bch2_fs_sb_errors_init_early(c); - bch2_fs_snapshots_init_early(c); - bch2_fs_subvolumes_init_early(c); - - INIT_LIST_HEAD(&c->list); - - mutex_init(&c->bio_bounce_pages_lock); - mutex_init(&c->snapshot_table_lock); - init_rwsem(&c->snapshot_create_lock); - - spin_lock_init(&c->btree_write_error_lock); - - INIT_LIST_HEAD(&c->journal_iters); - - INIT_LIST_HEAD(&c->fsck_error_msgs); - mutex_init(&c->fsck_error_msgs_lock); - - seqcount_init(&c->usage_lock); - - sema_init(&c->io_in_flight, 128); - - INIT_LIST_HEAD(&c->vfs_inodes_list); - mutex_init(&c->vfs_inodes_lock); - - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; - c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - - mutex_init(&c->sectors_available_lock); - - ret = percpu_init_rwsem(&c->mark_lock); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - ret = bch2_sb_to_fs(c, sb); - mutex_unlock(&c->sb_lock); - - if (ret) - goto err; - - /* Compat: */ - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) - SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - - if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && - !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) - SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); - - c->opts = bch2_opts_default; - ret = bch2_opts_from_sb(&c->opts, sb); - if (ret) - goto err; - - bch2_opts_apply(&c->opts, *opts); - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - c->opts.block_size > PAGE_SIZE) { - bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); - ret = -EINVAL; - goto err; - } - - c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; - if (c->opts.inodes_use_key_cache) - 
c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; - c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; - - c->block_bits = ilog2(block_sectors(c)); - c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - - if (bch2_fs_init_fault("fs_alloc")) { - bch_err(c, "fs_alloc fault injected"); - ret = -EFAULT; - goto err; - } - - if (c->sb.multi_device) - pr_uuid(&name, c->sb.user_uuid.b); - else - prt_bdevname(&name, sbs->data[0].bdev); - - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - - iter_size = sizeof(struct sort_iter) + - (btree_blocks(c) + 1) * 2 * - sizeof(struct sort_iter_set); - - if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, - bch2_writes_disabled) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->btree_bio, 1, - max(offsetof(struct btree_read_bio, bio), - offsetof(struct btree_write_bio, wbio.bio)), - BIOSET_NEED_BVECS) || - !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || - !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = bch_err_throw(c, ENOMEM_fs_other_alloc); - goto err; - } - - ret = - bch2_fs_async_obj_init(c) ?: - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_compress_init(c) ?: - bch2_fs_counters_init(c) ?: - bch2_fs_ec_init(c) ?: - bch2_fs_encryption_init(c) ?: - bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_direct_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_rebalance_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_fs_vfs_init(c); - if (ret) - goto err; - - if (go_rw_in_recovery(c)) { - /* - * start workqueues/kworkers early - kthread creation checks for - * pending signals, which is _very_ annoying - */ - ret = bch2_fs_init_rw(c); - if (ret) - goto err; - } - -#ifdef CONFIG_UNICODE - if (bch2_fs_casefold_enabled(c)) { - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; - } - } -#else - if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { - printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); - ret = -EINVAL; - goto err; - } -#endif - - for (i = 0; i < c->sb.nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - ret = bch2_dev_alloc(c, i); - if (ret) - goto err; - } - - bch2_journal_entry_res_resize(&c->journal, - &c->btree_root_journal_res, - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); - bch2_journal_entry_res_resize(&c->journal, - &c->clock_journal_res, - (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); - - mutex_lock(&bch_fs_list_lock); - ret = bch2_fs_online(c); - mutex_unlock(&bch_fs_list_lock); - - if (ret) - goto err; -out: - return c; -err: - bch2_fs_free(c); - c = ERR_PTR(ret); - goto out; -} - -noinline_for_stack -static void print_mount_opts(struct bch_fs *c) -{ - enum bch_opt_id i; - CLASS(printbuf, p)(); - bch2_log_msg_start(c, &p); - - prt_str(&p, "starting version "); - bch2_version_to_text(&p, c->sb.version); - - bool first = true; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); - - if (!(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - prt_str(&p, first ? " opts=" : ","); - first = false; - bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); - } - - if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\nallowing incompatible features above "); - bch2_version_to_text(&p, c->sb.version_incompat_allowed); - } - - if (c->opts.verbose) { - prt_printf(&p, "\nfeatures: "); - prt_bitflags(&p, bch2_sb_features, c->sb.features); - } - - if (c->sb.multi_device) { - prt_printf(&p, "\nwith devices"); - for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { - prt_char(&p, ' '); - prt_str(&p, ca->name); - } - } - - bch2_print_str(c, KERN_INFO, p.buf); -} - -static bool bch2_fs_may_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned flags = 0; - - switch (c->opts.degraded) { - case BCH_DEGRADED_very: - flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - break; - case BCH_DEGRADED_yes: - flags |= BCH_FORCE_IF_DEGRADED; - break; - default: - mutex_lock(&c->sb_lock); - for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - - ca = bch2_dev_locked(c, i); - - if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); - return false; - } - } - mutex_unlock(&c->sb_lock); - break; - } - - return bch2_have_enough_devs(c, c->online_devs, flags, true); -} - -int bch2_fs_start(struct bch_fs *c) -{ - time64_t now = ktime_get_real_seconds(); - int ret = 0; - - print_mount_opts(c); - - if (c->cf_encoding) - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - - if (!bch2_fs_may_start(c)) - return bch_err_throw(c, insufficient_devices_to_start); - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - - BUG_ON(test_bit(BCH_FS_started, &c->flags)); - - if 
(!bch2_sb_field_get_minsize(&c->disk_sb, ext, - sizeof(struct bch_sb_field_ext) / sizeof(u64))) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - ret = bch_err_throw(c, ENOSPC_sb); - goto err; - } - - ret = bch2_sb_members_v2_init(c); - if (ret) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - goto err; - } - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); - - /* - * Don't write superblock yet: recovery might have to downgrade - */ - mutex_unlock(&c->sb_lock); - - scoped_guard(rcu) - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - up_write(&c->state_lock); - - c->recovery_task = current; - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) - ? bch2_fs_recovery(c) - : bch2_fs_initialize(c); - c->recovery_task = NULL; - - if (ret) - goto err; - - ret = bch2_opts_hooks_pre_set(c); - if (ret) - goto err; - - if (bch2_fs_init_fault("fs_start")) { - ret = bch_err_throw(c, injected_fs_start); - goto err; - } - - set_bit(BCH_FS_started, &c->flags); - wake_up(&c->ro_ref_wait); - - down_write(&c->state_lock); - if (c->opts.read_only) - bch2_fs_read_only(c); - else if (!test_bit(BCH_FS_rw, &c->flags)) - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - -err: - if (ret) - bch_err_msg(c, ret, "starting filesystem"); - else - bch_verbose(c, "done starting filesystem"); - return ret; -} - -static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -{ - struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); - - if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return bch_err_throw(c, mismatched_block_size); - - if (le16_to_cpu(m.bucket_size) < - BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return bch_err_throw(c, bucket_size_too_small); - - return 0; -} - -static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb, - struct bch_opts *opts) -{ - if (fs == sb) - return 0; - - if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) - return -BCH_ERR_device_not_a_member_of_filesystem; - - if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) - return -BCH_ERR_device_has_been_removed; - - if (fs->sb->block_size != sb->sb->block_size) - return -BCH_ERR_mismatched_block_size; - - if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || - le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) - return 0; - - if (fs->sb->seq == sb->sb->seq && - fs->sb->write_time != sb->sb->write_time) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); - prt_newline(&buf); - - prt_bdevname(&buf, sb->bdev); - prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); - prt_newline(&buf); - - if (!opts->no_splitbrain_check) - prt_printf(&buf, "Not using older sb"); - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); - u64 seq_from_fs = le64_to_cpu(m.seq); - u64 seq_from_member = le64_to_cpu(sb->sb->seq); - - if (seq_from_fs 
&& seq_from_fs < seq_from_member) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "Split brain detected between "); - prt_bdevname(&buf, sb->bdev); - prt_str(&buf, " and "); - prt_bdevname(&buf, fs->bdev); - prt_char(&buf, ':'); - prt_newline(&buf); - - prt_bdevname(&buf, fs->bdev); - prt_str(&buf, " believes seq of "); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " to be %llu, but ", seq_from_fs); - prt_bdevname(&buf, sb->bdev); - prt_printf(&buf, " has %llu\n", seq_from_member); - - if (!opts->no_splitbrain_check) { - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); - } - - pr_err("%s", buf.buf); - printbuf_exit(&buf); - - if (!opts->no_splitbrain_check) - return -BCH_ERR_device_splitbrain; - } - - return 0; -} - -/* Device startup/shutdown: */ - -static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) -{ - if (rw == READ) - clear_bit(ca->dev_idx, ca->fs->online_devs.d); - - if (!enumerated_ref_is_zero(&ca->io_ref[rw])) - enumerated_ref_stop(&ca->io_ref[rw], - rw == READ - ? bch2_dev_read_refs - : bch2_dev_write_refs); -} - -static void bch2_dev_release(struct kobject *kobj) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - - kfree(ca); -} - -static void bch2_dev_free(struct bch_dev *ca) -{ - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - - cancel_work_sync(&ca->io_error_work); - - bch2_dev_unlink(ca); - - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - - bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); - bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); - - bch2_free_super(&ca->disk_sb); - bch2_dev_allocator_background_exit(ca); - bch2_dev_journal_exit(ca); - - free_percpu(ca->io_done); - bch2_dev_buckets_free(ca); - kfree(ca->sb_read_scratch); - - bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); - bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - - enumerated_ref_exit(&ca->io_ref[WRITE]); - enumerated_ref_exit(&ca->io_ref[READ]); -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_exit(&ca->ref); -#endif - kobject_put(&ca->kobj); -} - -static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -{ - - lockdep_assert_held(&c->state_lock); - - if (enumerated_ref_is_zero(&ca->io_ref[READ])) - return; - - __bch2_dev_read_only(c, ca); - - bch2_dev_io_ref_stop(ca, READ); - - bch2_dev_unlink(ca); - - bch2_free_super(&ca->disk_sb); - bch2_dev_journal_exit(ca); -} - -#ifndef CONFIG_BCACHEFS_DEBUG -static void bch2_dev_ref_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - - complete(&ca->ref_completion); -} -#endif - -static void bch2_dev_unlink(struct bch_dev *ca) -{ - struct kobject *b; - - /* - * This is racy w.r.t. the underlying block device being hot-removed, - * which removes it from sysfs. 
- * - * It'd be lovely if we had a way to handle this race, but the sysfs - * code doesn't appear to provide a good method and block/holder.c is - * susceptible as well: - */ - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev && - (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { - sysfs_remove_link(b, "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } -} - -static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -{ - int ret; - - if (!c->kobj.state_in_sysfs) - return 0; - - if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: - bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); - if (ret) - return ret; - } - - if (ca->disk_sb.bdev) { - struct kobject *block = bdev_kobj(ca->disk_sb.bdev); - - ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); - if (ret) - return ret; - - ret = sysfs_create_link(&ca->kobj, block, "block"); - if (ret) - return ret; - } - - return 0; -} - -static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, - struct bch_member *member) -{ - struct bch_dev *ca; - unsigned i; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - return NULL; - - kobject_init(&ca->kobj, &bch2_dev_ktype); - init_completion(&ca->ref_completion); - - INIT_WORK(&ca->io_error_work, bch2_io_error_work); - - bch2_time_stats_quantiles_init(&ca->io_latency[READ]); - bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); - - ca->mi = bch2_mi_to_cpu(member); - - for (i = 0; i < ARRAY_SIZE(member->errors); i++) - atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); - - ca->uuid = member->uuid; - - ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / btree_sectors(c)); - -#ifndef CONFIG_BCACHEFS_DEBUG - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) - goto err; -#else - atomic_long_set(&ca->ref, 1); -#endif - - mutex_init(&ca->bucket_backpointer_mismatch.lock); - mutex_init(&ca->bucket_backpointer_empty.lock); - - bch2_dev_allocator_background_init(ca); - - if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || - enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || - !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || - bch2_dev_buckets_alloc(c, ca) || - !(ca->io_done = alloc_percpu(*ca->io_done))) - goto err; - - return ca; -err: - bch2_dev_free(ca); - return NULL; -} - -static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, - unsigned dev_idx) -{ - ca->dev_idx = dev_idx; - __set_bit(ca->dev_idx, ca->self.d); - - if (!ca->name[0]) - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); - - ca->fs = c; - rcu_assign_pointer(c->devs[ca->dev_idx], ca); - - if (bch2_dev_sysfs_online(c, ca)) - pr_warn("error creating sysfs objects"); -} - -static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -{ - struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - struct bch_dev *ca = NULL; - - if (bch2_fs_init_fault("dev_alloc")) - goto err; - - ca = __bch2_dev_alloc(c, &member); - if (!ca) - goto err; - - ca->fs = c; - - bch2_dev_attach(c, ca, dev_idx); - return 0; -err: - return bch_err_throw(c, ENOMEM_dev_alloc); -} - -static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -{ - unsigned ret; - - if (bch2_dev_is_online(ca)) { - bch_err(ca, "already have device online in slot %u", - sb->sb->dev_idx); - return bch_err_throw(ca->fs, device_already_online); - } - - if (get_capacity(sb->bdev->bd_disk) < - ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, 
"cannot online: device too small"); - return bch_err_throw(ca->fs, device_size_too_small); - } - - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); - - ret = bch2_dev_journal_init(ca, sb->sb); - if (ret) - return ret; - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, sb->bdev); - strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); - - /* Commit: */ - ca->disk_sb = *sb; - memset(sb, 0, sizeof(*sb)); - - /* - * Stash pointer to the filesystem for blk_holder_ops - note that once - * attached to a filesystem, we will always close the block device - * before tearing down the filesystem object. - */ - ca->disk_sb.holder->c = ca->fs; - - ca->dev = ca->disk_sb.bdev->bd_dev; - - enumerated_ref_start(&ca->io_ref[READ]); - - return 0; -} - -static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -{ - struct bch_dev *ca; - int ret; - - lockdep_assert_held(&c->state_lock); - - if (le64_to_cpu(sb->sb->seq) > - le64_to_cpu(c->disk_sb.sb->seq)) - bch2_sb_to_fs(c, sb->sb); - - BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - - ca = bch2_dev_locked(c, sb->sb->dev_idx); - - ret = __bch2_dev_attach_bdev(ca, sb); - if (ret) - return ret; - - set_bit(ca->dev_idx, c->online_devs.d); - - bch2_dev_sysfs_online(c, ca); - - bch2_rebalance_wakeup(c); - return 0; -} - -/* Device management: */ - -/* - * Note: this function is also used by the error paths - when a particular - * device sees an error, we call it to determine whether we can just set the - * device RO, or - if this function returns false - we'll set the whole - * filesystem RO: - * - * XXX: maybe we should be more explicit about whether we're changing state - * because we got an error or what have you? - */ -bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_devs_mask new_online_devs; - int nr_rw = 0, required; - - lockdep_assert_held(&c->state_lock); - - switch (new_state) { - case BCH_MEMBER_STATE_rw: - return true; - case BCH_MEMBER_STATE_ro: - if (ca->mi.state != BCH_MEMBER_STATE_rw) - return true; - - /* do we have enough devices to write to? */ - for_each_member_device(c, ca2) - if (ca2 != ca) - nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; - - required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) - ? c->opts.metadata_replicas - : metadata_replicas_required(c), - !(flags & BCH_FORCE_IF_DATA_DEGRADED) - ? c->opts.data_replicas - : data_replicas_required(c)); - - return nr_rw >= required; - case BCH_MEMBER_STATE_failed: - case BCH_MEMBER_STATE_spare: - if (ca->mi.state != BCH_MEMBER_STATE_rw && - ca->mi.state != BCH_MEMBER_STATE_ro) - return true; - - /* do we have enough devices to read from? 
*/ - new_online_devs = c->online_devs; - __clear_bit(ca->dev_idx, new_online_devs.d); - - return bch2_have_enough_devs(c, new_online_devs, flags, false); - default: - BUG(); - } -} - -static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -{ - bch2_dev_io_ref_stop(ca, WRITE); - - /* - * The allocator thread itself allocates btree nodes, so stop it first: - */ - bch2_dev_allocator_remove(c, ca); - bch2_recalc_capacity(c); - bch2_dev_journal_stop(&c->journal, ca); -} - -static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -{ - lockdep_assert_held(&c->state_lock); - - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); - - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - - if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) - enumerated_ref_start(&ca->io_ref[WRITE]); - - bch2_dev_do_discards(ca); -} - -int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - struct bch_member *m; - int ret = 0; - - if (ca->mi.state == new_state) - return 0; - - if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return bch_err_throw(c, device_state_not_allowed); - - if (new_state != BCH_MEMBER_STATE_rw) - __bch2_dev_read_only(c, ca); - - bch_notice(ca, "%s", bch2_member_states[new_state]); - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_STATE(m, new_state); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (new_state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - bch2_rebalance_wakeup(c); - - return ret; -} - -int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, - enum bch_member_state new_state, int flags) -{ - int ret; - - down_write(&c->state_lock); - ret = __bch2_dev_set_state(c, ca, new_state, flags); - up_write(&c->state_lock); - - return ret; -} - -/* Device add/removal: */ - -int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - struct bch_member *m; - unsigned dev_idx = ca->dev_idx, data; - bool fast_device_removal = !bch2_request_incompat_feature(c, - bcachefs_metadata_version_fast_device_removal); - int ret; - - down_write(&c->state_lock); - - /* - * We consume a reference to ca->ref, regardless of whether we succeed - * or fail: - */ - bch2_dev_put(ca); - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot remove without losing data"); - ret = bch_err_throw(c, device_state_not_allowed); - goto err; - } - - __bch2_dev_read_only(c, ca); - - ret = fast_device_removal - ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) - : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags)); - if (ret) - goto err; - - /* Check if device still has data before blowing away alloc info */ - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (!data_type_is_empty(i) && - !data_type_is_hidden(i) && - usage.buckets[i]) { - bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", - __bch2_data_types[i], usage.buckets[i]); - ret = -EBUSY; - goto err; - } - - ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); - if (ret) - goto err; - - /* - * We need to flush the entire journal to get rid of keys that reference - * the device being removed before removing the superblock entry - */ - bch2_journal_flush_all_pins(&c->journal); - - /* - * this is really just needed for the bch2_replicas_gc_(start|end) - * calls, and could be cleaned up: - */ - ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); - if (ret) - goto err; - - ret = bch2_journal_flush(&c->journal); - bch_err_msg(ca, ret, "bch2_journal_flush()"); - if (ret) - goto err; - - ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "bch2_replicas_gc2()"); - if (ret) - goto err; - - data = bch2_dev_has_data(c, ca); - if (data) { - struct printbuf data_has = PRINTBUF; - - prt_bitflags(&data_has, __bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); - printbuf_exit(&data_has); - ret = -EBUSY; - goto err; - } - - __bch2_dev_offline(c, ca); - - mutex_lock(&c->sb_lock); - rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - mutex_unlock(&c->sb_lock); - -#ifndef CONFIG_BCACHEFS_DEBUG - percpu_ref_kill(&ca->ref); -#else - ca->dying = true; - bch2_dev_put(ca); -#endif - wait_for_completion(&ca->ref_completion); - - bch2_dev_free(ca); - - /* - * Free this device's slot in the bch_member array - all pointers to - * this device must be gone: - */ - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - - if (fast_device_removal) - m->uuid = BCH_SB_MEMBER_DELETED_UUID; - else - memset(&m->uuid, 0, sizeof(m->uuid)); - - bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags) && - ca->mi.state == BCH_MEMBER_STATE_rw && - !enumerated_ref_is_zero(&ca->io_ref[READ])) - __bch2_dev_read_write(c, ca); - up_write(&c->state_lock); - return ret; -} - -/* Add new device to running filesystem: */ -int bch2_dev_add(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = {}; - struct bch_dev *ca = NULL; - struct printbuf errbuf = PRINTBUF; - struct printbuf label = PRINTBUF; - int ret = 0; - - ret = bch2_read_super(path, &opts, &sb); - bch_err_msg(c, ret, "reading super"); - if (ret) - goto err; - - struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); - if (label.allocation_failure) { - ret = -ENOMEM; - goto err; - } - } - - if (list_empty(&c->list)) { - mutex_lock(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) - ret = bch_err_throw(c, filesystem_uuid_already_open); - else - list_add(&c->list, &bch_fs_list); - mutex_unlock(&bch_fs_list_lock); - - if (ret) { - bch_err(c, "filesystem UUID already open"); - goto err; - } 
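Before bch2_dev_remove() above blows away the allocation info, it walks the per-data-type bucket counts and bails out with -EBUSY if any user-visible type still owns buckets. A self-contained sketch of that check follows; the data-type names and the hidden-type policy here are illustrative assumptions, not the kernel's data_type_is_empty()/data_type_is_hidden() definitions:

#include <stdbool.h>
#include <stdio.h>

enum data_type { DT_FREE, DT_SB, DT_JOURNAL, DT_BTREE, DT_USER, DT_NR };

static const char *dt_name[DT_NR] = { "free", "sb", "journal", "btree", "user" };

/* Assumed policy: superblock and journal buckets are expected to remain
 * on the device until the very end of removal, so they don't block it. */
static bool type_is_hidden(enum data_type t)
{
	return t == DT_SB || t == DT_JOURNAL;
}

static int check_dev_empty(const unsigned long long buckets[DT_NR])
{
	for (int i = 0; i < DT_NR; i++)
		if (i != DT_FREE && !type_is_hidden(i) && buckets[i]) {
			fprintf(stderr, "still has data (%s, %llu buckets)\n",
				dt_name[i], buckets[i]);
			return -1;	/* -EBUSY in the kernel code */
		}
	return 0;
}

int main(void)
{
	/* free, sb, journal, btree, user */
	unsigned long long usage[DT_NR] = { 100, 1, 8, 0, 3 };

	return check_dev_empty(usage) ? 1 : 0;	/* fails: user data remains */
}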
- } - - ret = bch2_dev_may_add(sb.sb, c); - if (ret) - goto err; - - ca = __bch2_dev_alloc(c, &dev_mi); - if (!ca) { - ret = -ENOMEM; - goto err; - } - - ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) - goto err; - - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); - SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); - - ret = bch2_sb_from_fs(c, ca); - bch_err_msg(c, ret, "setting up new superblock"); - if (ret) - goto err_unlock; - - if (dynamic_fault("bcachefs:add:no_slot")) - goto err_unlock; - - ret = bch2_sb_member_alloc(c); - if (ret < 0) { - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - } - unsigned dev_idx = ret; - ret = 0; - - /* success: */ - - dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); - *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; - - ca->disk_sb.sb->dev_idx = dev_idx; - bch2_dev_attach(c, ca, dev_idx); - - if (BCH_MEMBER_GROUP(&dev_mi)) { - ret = __bch2_dev_group_set(c, ca, label.buf); - bch_err_msg(c, ret, "creating new label"); - if (ret) - goto err_unlock; - } - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; - - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; - } - - /* - * We just changed the superblock UUID, invalidate cache and send a - * uevent to update /dev/disk/by-uuid - */ - invalidate_bdev(ca->disk_sb.bdev); - - char uuid_str[37]; - snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); - - char *envp[] = { - "CHANGE=uuid", - uuid_str, - NULL, - }; - kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); - - up_write(&c->state_lock); -out: - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; - -err_unlock: - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); -err: - if (ca) - bch2_dev_free(ca); - bch2_free_super(&sb); - goto out; -err_late: - up_write(&c->state_lock); - ca = NULL; - goto err; -} - -/* Hot add existing device to running filesystem: */ -int bch2_dev_online(struct bch_fs *c, const char *path) -{ - struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb = { NULL }; - struct bch_dev *ca; - unsigned dev_idx; - int ret; - - down_write(&c->state_lock); - - ret = bch2_read_super(path, &opts, &sb); - if (ret) { - up_write(&c->state_lock); - return ret; - } - - dev_idx = sb.sb->dev_idx; - - ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); - bch_err_msg(c, ret, "bringing %s online", path); - if (ret) - goto err; - - ret = bch2_dev_attach_bdev(c, &sb); - if (ret) - goto err; - - ca = bch2_dev_locked(c, dev_idx); - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); - if (ret) - goto err; - - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); - - if (!ca->mi.freespace_initialized) { - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err; - } - - if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca, false); - 
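After bch2_dev_add() above rewrites the member superblock, it pokes udev with kobject_uevent_env() so the /dev/disk/by-uuid symlinks get regenerated. A sketch of that notification in isolation (kernel context, so not independently buildable; notify_uuid_change is a hypothetical helper name, and "CHANGE=uuid" is this driver's convention matched by udev rules rather than anything the kernel interprets):

#include <linux/blkdev.h>
#include <linux/kobject.h>
#include <linux/uuid.h>

static void notify_uuid_change(struct block_device *bdev, const uuid_t *uuid)
{
	char uuid_str[37];
	char *envp[] = { "CHANGE=uuid", uuid_str, NULL };

	/* %pUb is the kernel's printf extension for big-endian UUIDs */
	snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", uuid);
	kobject_uevent_env(&bdev->bd_device.kobj, KOBJ_CHANGE, envp);
}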
bch_err_msg(ca, ret, "allocating journal"); - if (ret) - goto err; - } - - mutex_lock(&c->sb_lock); - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(ktime_get_real_seconds()); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - up_write(&c->state_lock); - return 0; -err: - up_write(&c->state_lock); - bch2_free_super(&sb); - return ret; -} - -int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -{ - down_write(&c->state_lock); - - if (!bch2_dev_is_online(ca)) { - bch_err(ca, "Already offline"); - up_write(&c->state_lock); - return 0; - } - - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { - bch_err(ca, "Cannot offline required disk"); - up_write(&c->state_lock); - return bch_err_throw(c, device_state_not_allowed); - } - - __bch2_dev_offline(c, ca); - - up_write(&c->state_lock); - return 0; -} - -static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) -{ - struct bch_fs *c = ca->fs; - u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; - - return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); -} - -int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -{ - struct bch_member *m; - u64 old_nbuckets; - int ret = 0; - - down_write(&c->state_lock); - old_nbuckets = ca->mi.nbuckets; - - if (nbuckets < ca->mi.nbuckets) { - bch_err(ca, "Cannot shrink yet"); - ret = -EINVAL; - goto err; - } - - if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { - bch_err(ca, "New device size too big (%llu greater than max %u)", - nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = bch_err_throw(c, device_size_too_big); - goto err; - } - - if (bch2_dev_is_online(ca) && - get_capacity(ca->disk_sb.bdev->bd_disk) < - ca->mi.bucket_size * nbuckets) { - bch_err(ca, "New size larger than device"); - ret = bch_err_throw(c, device_size_too_small); - goto err; - } - - ret = bch2_dev_buckets_resize(c, ca, nbuckets); - bch_err_msg(ca, ret, "resizing buckets"); - if (ret) - goto err; - - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - if (ret) - goto err; - - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(nbuckets); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); - if (ret) - goto err; - } - - bch2_recalc_capacity(c); -err: - up_write(&c->state_lock); - return ret; -} - -int bch2_fs_resize_on_mount(struct bch_fs *c) -{ - for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { - u64 old_nbuckets = ca->mi.nbuckets; - u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), - ca->mi.bucket_size); - - if (ca->mi.resize_on_mount && - new_nbuckets > ca->mi.nbuckets) { - bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); - int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); - bch_err_fn(ca, ret); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - - mutex_lock(&c->sb_lock); - struct bch_member *m = - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(new_nbuckets); - SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); - - c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); - 
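bch2_fs_resize_on_mount() above derives the new bucket count straight from the block layer: get_capacity() reports the disk size in 512-byte sectors, mi.bucket_size is also in sectors, so new_nbuckets is a plain integer division. A worked example of the arithmetic, with arbitrarily chosen sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t capacity_sectors = 2097152;	/* 1 GiB / 512 bytes per sector */
	uint64_t bucket_size      = 1024;	/* 512 KiB buckets, in sectors */
	uint64_t nbuckets         = capacity_sectors / bucket_size;

	printf("%llu buckets\n", (unsigned long long)nbuckets);	/* 2048 */
	return 0;
}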
bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - if (ca->mi.freespace_initialized) { - ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); - if (ret) { - enumerated_ref_put(&ca->io_ref[READ], - BCH_DEV_READ_REF_fs_resize_on_mount); - up_write(&c->state_lock); - return ret; - } - } - } - } - return 0; -} - -/* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) -{ - if (!strncmp(name, "/dev/", strlen("/dev/"))) - name += strlen("/dev/"); - - for_each_member_device(c, ca) - if (!strcmp(name, ca->name)) - return ca; - return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -} - -/* blk_holder_ops: */ - -static struct bch_fs *bdev_get_fs(struct block_device *bdev) - __releases(&bdev->bd_holder_lock) -{ - struct bch_sb_handle_holder *holder = bdev->bd_holder; - struct bch_fs *c = holder->c; - - if (c && !bch2_ro_ref_tryget(c)) - c = NULL; - - mutex_unlock(&bdev->bd_holder_lock); - - if (c) - wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); - return c; -} - -/* returns with ref on ca->ref */ -static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) -{ - for_each_member_device(c, ca) - if (ca->disk_sb.bdev == bdev) - return ca; - return NULL; -} - -static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - } - - down_write(&c->state_lock); - struct bch_dev *ca = bdev_to_bch_dev(c, bdev); - if (!ca) - goto unlock; - - bool dev = bch2_dev_state_allowed(c, ca, - BCH_MEMBER_STATE_failed, - BCH_FORCE_IF_DEGRADED); - - if (!dev && sb) { - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - } - - struct printbuf buf = PRINTBUF; - __bch2_log_msg_start(ca->name, &buf); - - prt_printf(&buf, "offline from block layer"); - - if (dev) { - __bch2_dev_offline(c, ca); - } else { - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only2(c, &buf); - } - - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_dev_put(ca); -unlock: - if (sb) - up_read(&sb->s_umount); - up_write(&c->state_lock); - bch2_ro_ref_put(c); -} - -static void bch2_fs_bdev_sync(struct block_device *bdev) -{ - struct bch_fs *c = bdev_get_fs(bdev); - if (!c) - return; - - struct super_block *sb = c->vfs_sb; - if (sb) { - /* - * Not necessary, c->ro_ref guards against the filesystem being - * unmounted - we only take this to avoid a warning in - * sync_filesystem: - */ - down_read(&sb->s_umount); - sync_filesystem(sb); - up_read(&sb->s_umount); - } - - bch2_ro_ref_put(c); -} - -const struct blk_holder_ops bch2_sb_handle_bdev_ops = { - .mark_dead = bch2_fs_bdev_mark_dead, - .sync = bch2_fs_bdev_sync, -}; - -/* Filesystem open: */ - -static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) -{ - return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: - cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); -} - -struct bch_fs *bch2_fs_open(darray_const_str *devices, - struct bch_opts *opts) -{ - bch_sb_handles sbs = {}; - struct bch_fs *c = NULL; - struct bch_sb_handle *best = NULL; - struct printbuf errbuf = PRINTBUF; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); - - if (!devices->nr) { - ret = -EINVAL; - goto err; - } - - ret = 
darray_make_room(&sbs, devices->nr); - if (ret) - goto err; - - darray_for_each(*devices, i) { - struct bch_sb_handle sb = { NULL }; - - ret = bch2_read_super(*i, opts, &sb); - if (ret) - goto err; - - BUG_ON(darray_push(&sbs, sb)); - } - - if (opts->nochanges && !opts->read_only) { - ret = bch_err_throw(c, erofs_nochanges); - goto err_print; - } - - darray_for_each(sbs, sb) - if (!best || sb_cmp(sb->sb, best->sb) > 0) - best = sb; - - darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, opts); - - if (ret == -BCH_ERR_device_has_been_removed || - ret == -BCH_ERR_device_splitbrain) { - bch2_free_super(sb); - darray_remove_item(&sbs, sb); - best -= best > sb; - ret = 0; - continue; - } - - if (ret) - goto err_print; - } - - c = bch2_fs_alloc(best->sb, opts, &sbs); - ret = PTR_ERR_OR_ZERO(c); - if (ret) - goto err; - - down_write(&c->state_lock); - darray_for_each(sbs, sb) { - ret = bch2_dev_attach_bdev(c, sb); - if (ret) { - up_write(&c->state_lock); - goto err; - } - } - up_write(&c->state_lock); - - if (!c->opts.nostart) { - ret = bch2_fs_start(c); - if (ret) - goto err; - } -out: - darray_for_each(sbs, sb) - bch2_free_super(sb); - darray_exit(&sbs); - printbuf_exit(&errbuf); - module_put(THIS_MODULE); - return c; -err_print: - pr_err("bch_fs_open err opening %s: %s", - devices->data[0], bch2_err_str(ret)); -err: - if (!IS_ERR_OR_NULL(c)) - bch2_fs_stop(c); - c = ERR_PTR(ret); - goto out; -} - -/* Global interfaces/init */ - -static void bcachefs_exit(void) -{ - bch2_debug_exit(); - bch2_vfs_exit(); - bch2_chardev_exit(); - bch2_btree_key_cache_exit(); - if (bcachefs_kset) - kset_unregister(bcachefs_kset); -} - -static int __init bcachefs_init(void) -{ - bch2_bkey_pack_test(); - - if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || - bch2_btree_key_cache_init() || - bch2_chardev_init() || - bch2_vfs_init() || - bch2_debug_init()) - goto err; - - return 0; -err: - bcachefs_exit(); - return -ENOMEM; -} - -#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); -BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - -static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) -{ - /* Match bool exactly, by re-using it. */ - struct static_key *key = kp->arg; - struct kernel_param boolkp = *kp; - bool v; - int ret; - - boolkp.arg = &v; - - ret = param_set_bool(val, &boolkp); - if (ret) - return ret; - if (v) - static_key_enable(key); - else - static_key_disable(key); - return 0; -} - -static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) -{ - struct static_key *key = kp->arg; - return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'Y' : 'N'); -} - -static const struct kernel_param_ops bch2_param_ops_static_key_t = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = bch2_param_set_static_key_t, - .get = bch2_param_get_static_key_t, -}; - -#define BCH_DEBUG_PARAM(name, description) \ - module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ - __MODULE_PARM_TYPE(name, "static_key_t"); \ - MODULE_PARM_DESC(name, description); -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -__maybe_unused -static unsigned bch2_metadata_version = bcachefs_metadata_version_current; -module_param_named(version, bch2_metadata_version, uint, 0444); - -module_exit(bcachefs_exit); -module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h deleted file mode 100644 index e90bab9afe78bd..00000000000000 --- a/fs/bcachefs/super.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_H -#define _BCACHEFS_SUPER_H - -#include "extents.h" - -#include "bcachefs_ioctl.h" - -#include - -extern const char * const bch2_fs_flag_strs[]; -extern const char * const bch2_write_refs[]; -extern const char * const bch2_dev_read_refs[]; -extern const char * const bch2_dev_write_refs[]; - -struct bch_fs *bch2_dev_to_fs(dev_t); -struct bch_fs *bch2_uuid_to_fs(__uuid_t); - -bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); -int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, - enum bch_member_state, int); - -int bch2_dev_fail(struct bch_dev *, int); -int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -int bch2_dev_add(struct bch_fs *, const char *); -int bch2_dev_online(struct bch_fs *, const char *); -int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); - -bool bch2_fs_emergency_read_only(struct bch_fs *); -bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); - -bool bch2_fs_emergency_read_only_locked(struct bch_fs *); -void bch2_fs_read_only(struct bch_fs *); - -int bch2_fs_read_write(struct bch_fs *); -int bch2_fs_read_write_early(struct bch_fs *); - -int bch2_fs_resize_on_mount(struct bch_fs *); - -void __bch2_fs_stop(struct bch_fs *); -void bch2_fs_free(struct bch_fs *); -void bch2_fs_stop(struct bch_fs *); - -int bch2_fs_init_rw(struct bch_fs *); -int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); - -extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; - -#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h deleted file mode 100644 index 3a899f799d1d18..00000000000000 --- a/fs/bcachefs/super_types.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SUPER_TYPES_H -#define _BCACHEFS_SUPER_TYPES_H - -struct bch_fs; - -struct bch_sb_handle_holder { - struct bch_fs *c; -}; - -struct bch_sb_handle { - struct bch_sb *sb; - struct file *s_bdev_file; - struct block_device *bdev; - char *sb_name; - struct bio *bio; - struct bch_sb_handle_holder *holder; - size_t buffer_size; - blk_mode_t mode; - unsigned have_layout:1; - unsigned have_bio:1; - unsigned fs_sb:1; - u64 seq; -}; - -struct bch_devs_mask { - unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -}; - -struct bch_devs_list { - u8 nr; - u8 data[BCH_BKEY_PTRS_MAX]; -}; - -#endif /* 
_BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c deleted file mode 100644 index 05848375cea2a1..00000000000000 --- a/fs/bcachefs/sysfs.c +++ /dev/null @@ -1,914 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bcache sysfs interfaces - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#ifndef NO_BCACHEFS_SYSFS - -#include "bcachefs.h" -#include "alloc_background.h" -#include "alloc_foreground.h" -#include "sysfs.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "buckets.h" -#include "clock.h" -#include "compress.h" -#include "disk_accounting.h" -#include "disk_groups.h" -#include "ec.h" -#include "enumerated_ref.h" -#include "error.h" -#include "inode.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" -#include "nocow_locking.h" -#include "opts.h" -#include "rebalance.h" -#include "recovery_passes.h" -#include "replicas.h" -#include "sb-errors.h" -#include "super-io.h" -#include "tests.h" - -#include -#include -#include - -#include "util.h" - -#define SYSFS_OPS(type) \ -const struct sysfs_ops type ## _sysfs_ops = { \ - .show = type ## _show, \ - .store = type ## _store \ -} - -#define SHOW(fn) \ -static ssize_t fn ## _to_text(struct printbuf *, \ - struct kobject *, struct attribute *); \ - \ -static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ - char *buf) \ -{ \ - struct printbuf out = PRINTBUF; \ - ssize_t ret = fn ## _to_text(&out, kobj, attr); \ - \ - if (out.pos && out.buf[out.pos - 1] != '\n') \ - prt_newline(&out); \ - \ - if (!ret && out.allocation_failure) \ - ret = -ENOMEM; \ - \ - if (!ret) { \ - ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ - memcpy(buf, out.buf, ret); \ - } \ - printbuf_exit(&out); \ - return bch2_err_class(ret); \ -} \ - \ -static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ - struct attribute *attr) - -#define STORE(fn) \ -static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ - const char *, size_t); \ - \ -static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) \ -{ \ - return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ -} \ - \ -static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ - const char *buf, size_t size) - -#define __sysfs_attribute(_name, _mode) \ - static struct attribute sysfs_##_name = \ - { .name = #_name, .mode = _mode } - -#define write_attribute(n) __sysfs_attribute(n, 0200) -#define read_attribute(n) __sysfs_attribute(n, 0444) -#define rw_attribute(n) __sysfs_attribute(n, 0644) - -#define sysfs_printf(file, fmt, ...) 
\ -do { \ - if (attr == &sysfs_ ## file) \ - prt_printf(out, fmt "\n", __VA_ARGS__); \ -} while (0) - -#define sysfs_print(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - snprint(out, var); \ -} while (0) - -#define sysfs_hprint(file, val) \ -do { \ - if (attr == &sysfs_ ## file) \ - prt_human_readable_s64(out, val); \ -} while (0) - -#define sysfs_strtoul(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe(buf, var) ?: (ssize_t) size; \ -} while (0) - -#define sysfs_strtoul_clamp(file, var, min, max) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ -} while (0) - -#define strtoul_or_return(cp) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -write_attribute(trigger_gc); -write_attribute(trigger_discards); -write_attribute(trigger_invalidates); -write_attribute(trigger_journal_commit); -write_attribute(trigger_journal_flush); -write_attribute(trigger_journal_writes); -write_attribute(trigger_btree_cache_shrink); -write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_btree_updates); -write_attribute(trigger_freelist_wakeup); -write_attribute(trigger_recalc_capacity); -write_attribute(trigger_delete_dead_snapshots); -write_attribute(trigger_emergency_read_only); -read_attribute(gc_gens_pos); - -read_attribute(uuid); -read_attribute(minor); -read_attribute(flags); -read_attribute(first_bucket); -read_attribute(nbuckets); -read_attribute(io_done); -read_attribute(io_errors); -write_attribute(io_errors_reset); - -read_attribute(io_latency_read); -read_attribute(io_latency_write); -read_attribute(io_latency_stats_read); -read_attribute(io_latency_stats_write); -read_attribute(congested); - -read_attribute(btree_write_stats); - -read_attribute(btree_cache_size); -read_attribute(compression_stats); -read_attribute(errors); -read_attribute(journal_debug); -read_attribute(btree_cache); -read_attribute(btree_key_cache); -read_attribute(btree_reserve_cache); -read_attribute(open_buckets); -read_attribute(open_buckets_partial); -read_attribute(nocow_lock_table); - -read_attribute(read_refs); -read_attribute(write_refs); - -read_attribute(internal_uuid); -read_attribute(disk_groups); - -read_attribute(has_data); -read_attribute(alloc_debug); -read_attribute(usage_base); - -#define x(t, n, ...) 
read_attribute(t); -BCH_PERSISTENT_COUNTERS() -#undef x - -rw_attribute(label); - -read_attribute(copy_gc_wait); - -sysfs_pd_controller_attribute(rebalance); -read_attribute(rebalance_status); -read_attribute(snapshot_delete_status); -read_attribute(recovery_status); - -read_attribute(new_stripes); - -read_attribute(io_timers_read); -read_attribute(io_timers_write); - -read_attribute(moving_ctxts); - -#ifdef CONFIG_BCACHEFS_TESTS -write_attribute(perf_test); -#endif /* CONFIG_BCACHEFS_TESTS */ - -#define x(_name) \ - static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0644 }; - BCH_TIME_STATS() -#undef x - -static size_t bch2_btree_cache_size(struct bch_fs *c) -{ - struct btree_cache *bc = &c->btree_cache; - size_t ret = 0; - struct btree *b; - - mutex_lock(&bc->lock); - list_for_each_entry(b, &bc->live[0].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->live[1].list, list) - ret += btree_buf_bytes(b); - list_for_each_entry(b, &bc->freeable, list) - ret += btree_buf_bytes(b); - mutex_unlock(&bc->lock); - return ret; -} - -static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -{ - prt_str(out, "type"); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 24); - prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); - - for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { - struct disk_accounting_pos a; - disk_accounting_key_init(a, compression, .type = i); - struct bpos p = disk_accounting_pos_to_bpos(&a); - u64 v[3]; - bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); - - u64 nr_extents = v[0]; - u64 sectors_uncompressed = v[1]; - u64 sectors_compressed = v[2]; - - bch2_prt_compression_type(out, i); - prt_tab(out); - - prt_human_readable_u64(out, sectors_compressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, sectors_uncompressed << 9); - prt_tab_rjust(out); - - prt_human_readable_u64(out, nr_extents - ? 
div64_u64(sectors_uncompressed << 9, nr_extents) - : 0); - prt_tab_rjust(out); - prt_newline(out); - } - - return 0; -} - -static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_btree_id_to_text(out, c->gc_gens_btree); - prt_printf(out, ": "); - bch2_bpos_to_text(out, c->gc_gens_pos); - prt_printf(out, "\n"); -} - -static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_usage_base b = {}; - - acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64)); - - prt_printf(out, "hidden:\t\t%llu\n", b.hidden); - prt_printf(out, "btree:\t\t%llu\n", b.btree); - prt_printf(out, "data:\t\t%llu\n", b.data); - prt_printf(out, "cached:\t%llu\n", b.cached); - prt_printf(out, "reserved:\t\t%llu\n", b.reserved); - prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); -} - -SHOW(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_print(minor, c->minor); - sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - - if (attr == &sysfs_flags) - prt_bitflags(out, bch2_fs_flag_strs, c->flags); - - sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - - if (attr == &sysfs_btree_write_stats) - bch2_btree_write_stats_to_text(out, c); - - if (attr == &sysfs_gc_gens_pos) - bch2_gc_gens_pos_to_text(out, c); - - sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - - if (attr == &sysfs_copy_gc_wait) - bch2_copygc_wait_to_text(out, c); - - if (attr == &sysfs_rebalance_status) - bch2_rebalance_status_to_text(out, c); - - if (attr == &sysfs_snapshot_delete_status) - bch2_snapshot_delete_status_to_text(out, c); - - if (attr == &sysfs_recovery_status) - bch2_recovery_pass_status_to_text(out, c); - - /* Debugging: */ - - if (attr == &sysfs_journal_debug) - bch2_journal_debug_to_text(out, &c->journal); - - if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, &c->btree_cache); - - if (attr == &sysfs_btree_key_cache) - bch2_btree_key_cache_to_text(out, &c->btree_key_cache); - - if (attr == &sysfs_btree_reserve_cache) - bch2_btree_reserve_cache_to_text(out, c); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, NULL); - - if (attr == &sysfs_open_buckets_partial) - bch2_open_buckets_partial_to_text(out, c); - - if (attr == &sysfs_compression_stats) - bch2_compression_stats_to_text(out, c); - - if (attr == &sysfs_errors) - bch2_fs_errors_to_text(out, c); - - if (attr == &sysfs_new_stripes) - bch2_new_stripes_to_text(out, c); - - if (attr == &sysfs_io_timers_read) - bch2_io_timers_to_text(out, &c->io_clock[READ]); - - if (attr == &sysfs_io_timers_write) - bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - - if (attr == &sysfs_moving_ctxts) - bch2_fs_moving_ctxts_to_text(out, c); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &c->writes, bch2_write_refs); - - if (attr == &sysfs_nocow_lock_table) - bch2_nocow_locks_to_text(out, &c->nocow_locks); - - if (attr == &sysfs_disk_groups) - bch2_disk_groups_to_text(out, c); - - if (attr == &sysfs_alloc_debug) - bch2_fs_alloc_debug_to_text(out, c); - - if (attr == &sysfs_usage_base) - bch2_fs_usage_base_to_text(out, c); - - return 0; -} - -STORE(bch2_fs) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - - /* Debugging: */ - - if (!test_bit(BCH_FS_started, &c->flags)) - return -EPERM; - - /* Debugging: */ - - if (attr == &sysfs_trigger_btree_updates) - queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); - - if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) - return -EROFS; - - if (attr == &sysfs_trigger_btree_cache_shrink) { - struct btree_cache *bc = &c->btree_cache; - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); - } - - if (attr == &sysfs_trigger_btree_key_cache_shrink) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); - } - - if (attr == &sysfs_trigger_gc) - bch2_gc_gens(c); - - if (attr == &sysfs_trigger_discards) - bch2_do_discards(c); - - if (attr == &sysfs_trigger_invalidates) - bch2_do_invalidates(c); - - if (attr == &sysfs_trigger_journal_commit) - bch2_journal_flush(&c->journal); - - if (attr == &sysfs_trigger_journal_flush) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_meta(&c->journal); - } - - if (attr == &sysfs_trigger_journal_writes) - bch2_journal_do_writes(&c->journal); - - if (attr == &sysfs_trigger_freelist_wakeup) - closure_wake_up(&c->freelist_wait); - - if (attr == &sysfs_trigger_recalc_capacity) { - down_read(&c->state_lock); - bch2_recalc_capacity(c); - up_read(&c->state_lock); - } - - if (attr == &sysfs_trigger_delete_dead_snapshots) - __bch2_delete_dead_snapshots(c); - - if (attr == &sysfs_trigger_emergency_read_only) { - struct printbuf buf = PRINTBUF; - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "shutdown by sysfs\n"); - bch2_fs_emergency_read_only2(c, &buf); - bch2_print_str(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - -#ifdef CONFIG_BCACHEFS_TESTS - if (attr == &sysfs_perf_test) { - char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; - char *test = strsep(&p, " \t\n"); - char *nr_str = strsep(&p, " \t\n"); - char *threads_str = strsep(&p, " \t\n"); - unsigned threads; - u64 nr; - int ret = -EINVAL; - - if (threads_str && - !(ret = kstrtouint(threads_str, 10, &threads)) && - !(ret = bch2_strtoull_h(nr_str, &nr))) - ret = bch2_btree_perf_test(c, test, nr, threads); - kfree(tmp); - - if (ret) - size = ret; - } -#endif - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return size; -} -SYSFS_OPS(bch2_fs); - -struct attribute *bch2_fs_files[] = { - &sysfs_minor, - &sysfs_btree_cache_size, - &sysfs_btree_write_stats, - - &sysfs_rebalance_status, - &sysfs_snapshot_delete_status, - &sysfs_recovery_status, - - &sysfs_compression_stats, - &sysfs_errors, - -#ifdef CONFIG_BCACHEFS_TESTS - &sysfs_perf_test, -#endif - NULL -}; - -/* counters dir */ - -SHOW(bch2_fs_counters) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); - u64 counter = 0; - u64 counter_since_mount = 0; - - printbuf_tabstop_push(out, 32); - - #define x(t, n, f, ...) \ - if (attr == &sysfs_##t) { \ - counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ - counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - if (f & TYPE_SECTORS) { \ - counter <<= 9; \ - counter_since_mount <<= 9; \ - } \ - \ - prt_printf(out, "since mount:\t"); \ - (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ - prt_human_readable_u64(out, counter_since_mount); \ - prt_newline(out); \ - \ - prt_printf(out, "since filesystem creation:\t"); \ - (f & TYPE_COUNTER) ? 
prt_u64(out, counter) : \ - prt_human_readable_u64(out, counter); \ - prt_newline(out); \ - } - BCH_PERSISTENT_COUNTERS() - #undef x - return 0; -} - -STORE(bch2_fs_counters) { - return 0; -} - -SYSFS_OPS(bch2_fs_counters); - -struct attribute *bch2_fs_counters_files[] = { -#define x(t, ...) \ - &sysfs_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - NULL -}; -/* internal dir - just a wrapper */ - -SHOW(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_to_text(out, &c->kobj, attr); -} - -STORE(bch2_fs_internal) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, internal); - - return bch2_fs_store(&c->kobj, attr, buf, size); -} -SYSFS_OPS(bch2_fs_internal); - -struct attribute *bch2_fs_internal_files[] = { - &sysfs_flags, - &sysfs_journal_debug, - &sysfs_btree_cache, - &sysfs_btree_key_cache, - &sysfs_btree_reserve_cache, - &sysfs_new_stripes, - &sysfs_open_buckets, - &sysfs_open_buckets_partial, - &sysfs_write_refs, - &sysfs_nocow_lock_table, - &sysfs_io_timers_read, - &sysfs_io_timers_write, - - &sysfs_trigger_gc, - &sysfs_trigger_discards, - &sysfs_trigger_invalidates, - &sysfs_trigger_journal_commit, - &sysfs_trigger_journal_flush, - &sysfs_trigger_journal_writes, - &sysfs_trigger_btree_cache_shrink, - &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_btree_updates, - &sysfs_trigger_freelist_wakeup, - &sysfs_trigger_recalc_capacity, - &sysfs_trigger_delete_dead_snapshots, - &sysfs_trigger_emergency_read_only, - - &sysfs_gc_gens_pos, - - &sysfs_copy_gc_wait, - - sysfs_pd_controller_files(rebalance), - - &sysfs_moving_ctxts, - - &sysfs_internal_uuid, - - &sysfs_disk_groups, - &sysfs_alloc_debug, - &sysfs_usage_base, - NULL -}; - -/* options */ - -static ssize_t sysfs_opt_show(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - struct printbuf *out) -{ - const struct bch_option *opt = bch2_opt_table + id; - u64 v; - - if (opt->flags & OPT_FS) { - v = bch2_opt_get_by_id(&c->opts, id); - } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { - v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); - } else { - return -EINVAL; - } - - bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); - prt_char(out, '\n'); - return 0; -} - -static ssize_t sysfs_opt_store(struct bch_fs *c, - struct bch_dev *ca, - enum bch_opt_id id, - const char *buf, size_t size) -{ - const struct bch_option *opt = bch2_opt_table + id; - int ret = 0; - - /* - * We don't need to take c->writes for correctness, but it eliminates an - * unsightly error message in the dmesg log when we're RO: - */ - if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) - return -EROFS; - - char *tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto err; - } - - u64 v; - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_hook_pre_set(c, ca, id, v); - kfree(tmp); - - if (ret < 0) - goto err; - - bool is_sb = opt->get_sb || opt->get_member; - bool changed = false; - - if (is_sb) { - changed = bch2_opt_set_sb(c, ca, opt, v); - } else if (!ca) { - changed = bch2_opt_get_by_id(&c->opts, id) != v; - } else { - /* device options that aren't superblock options aren't - * supported */ - BUG(); - } - - if (!ca) - bch2_opt_set_by_id(&c->opts, id, v); - - if (changed) - bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); - - ret = size; -err: - enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); - return ret; -} - -SHOW(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = 
bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_show(c, NULL, id, out); -} - -STORE(bch2_fs_opts_dir) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - int id = bch2_opt_lookup(attr->name); - if (id < 0) - return 0; - - return sysfs_opt_store(c, NULL, id, buf, size); -} -SYSFS_OPS(bch2_fs_opts_dir); - -struct attribute *bch2_fs_opts_dir_files[] = { NULL }; - -int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ - for (const struct bch_option *i = bch2_opt_table; - i < bch2_opt_table + bch2_opts_nr; - i++) { - if (i->flags & OPT_HIDDEN) - continue; - if (!(i->flags & type)) - continue; - - int ret = sysfs_create_file(kobj, &i->attr); - if (ret) - return ret; - } - - return 0; -} - -/* time stats */ - -SHOW(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - - return 0; -} - -STORE(bch2_fs_time_stats) -{ - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - bch2_time_stats_reset(&c->times[BCH_TIME_##name]); - BCH_TIME_STATS() -#undef x - return size; -} -SYSFS_OPS(bch2_fs_time_stats); - -struct attribute *bch2_fs_time_stats_files[] = { -#define x(name) \ - &sysfs_time_stat_##name, - BCH_TIME_STATS() -#undef x - NULL -}; - -static const char * const bch2_rw[] = { - "read", - "write", - NULL -}; - -static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) -{ - int rw, i; - - for (rw = 0; rw < 2; rw++) { - prt_printf(out, "%s:\n", bch2_rw[rw]); - - for (i = 1; i < BCH_DATA_NR; i++) - prt_printf(out, "%-12s:%12llu\n", - bch2_data_type_str(i), - percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); - } -} - -SHOW(bch2_dev) -{ - struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - sysfs_printf(uuid, "%pU\n", ca->uuid.b); - - sysfs_print(first_bucket, ca->mi.first_bucket); - sysfs_print(nbuckets, ca->mi.nbuckets); - - if (attr == &sysfs_label) { - if (ca->mi.group) - bch2_disk_path_to_text(out, c, ca->mi.group - 1); - prt_char(out, '\n'); - } - - if (attr == &sysfs_has_data) { - prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); - prt_char(out, '\n'); - } - - if (attr == &sysfs_io_done) - dev_io_done_to_text(out, ca); - - if (attr == &sysfs_io_errors) - bch2_dev_io_errors_to_text(out, ca); - - sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); - sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - - if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); - - if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); - - sysfs_printf(congested, "%u%%", - clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) - * 100 / CONGESTED_MAX); - - if (attr == &sysfs_alloc_debug) - bch2_dev_alloc_debug_to_text(out, ca); - - if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c, ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_show(c, ca, opt_id, out); - - if (attr == &sysfs_read_refs) - enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); - - if (attr == &sysfs_write_refs) - enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); - - return 0; -} - -STORE(bch2_dev) -{ - struct bch_dev *ca = 
container_of(kobj, struct bch_dev, kobj); - struct bch_fs *c = ca->fs; - - if (attr == &sysfs_label) { - char *tmp; - int ret; - - tmp = kstrdup(buf, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - ret = bch2_dev_group_set(c, ca, strim(tmp)); - kfree(tmp); - if (ret) - return ret; - } - - if (attr == &sysfs_io_errors_reset) - bch2_dev_errors_reset(ca); - - int opt_id = bch2_opt_lookup(attr->name); - if (opt_id >= 0) - return sysfs_opt_store(c, ca, opt_id, buf, size); - - return size; -} -SYSFS_OPS(bch2_dev); - -struct attribute *bch2_dev_files[] = { - &sysfs_uuid, - &sysfs_first_bucket, - &sysfs_nbuckets, - - /* settings: */ - &sysfs_label, - - &sysfs_has_data, - &sysfs_io_done, - &sysfs_io_errors, - &sysfs_io_errors_reset, - - &sysfs_io_latency_read, - &sysfs_io_latency_write, - &sysfs_io_latency_stats_read, - &sysfs_io_latency_stats_write, - &sysfs_congested, - - /* debug: */ - &sysfs_alloc_debug, - &sysfs_open_buckets, - - &sysfs_read_refs, - &sysfs_write_refs, - NULL -}; - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h deleted file mode 100644 index 303e0433c702c6..00000000000000 --- a/fs/bcachefs/sysfs.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_SYSFS_H_ -#define _BCACHEFS_SYSFS_H_ - -#include - -#ifndef NO_BCACHEFS_SYSFS - -struct attribute; -struct sysfs_ops; - -extern struct attribute *bch2_fs_files[]; -extern struct attribute *bch2_fs_counters_files[]; -extern struct attribute *bch2_fs_internal_files[]; -extern struct attribute *bch2_fs_opts_dir_files[]; -extern struct attribute *bch2_fs_time_stats_files[]; -extern struct attribute *bch2_dev_files[]; - -extern const struct sysfs_ops bch2_fs_sysfs_ops; -extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; -extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern const struct sysfs_ops bch2_dev_sysfs_ops; - -int bch2_opts_create_sysfs_files(struct kobject *, unsigned); - -#else - -static struct attribute *bch2_fs_files[] = {}; -static struct attribute *bch2_fs_counters_files[] = {}; -static struct attribute *bch2_fs_internal_files[] = {}; -static struct attribute *bch2_fs_opts_dir_files[] = {}; -static struct attribute *bch2_fs_time_stats_files[] = {}; -static struct attribute *bch2_dev_files[] = {}; - -static const struct sysfs_ops bch2_fs_sysfs_ops; -static const struct sysfs_ops bch2_fs_counters_sysfs_ops; -static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -static const struct sysfs_ops bch2_dev_sysfs_ops; - -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) -{ return 0; } - -#endif /* NO_BCACHEFS_SYSFS */ - -#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c deleted file mode 100644 index 782a05fe7656b5..00000000000000 --- a/fs/bcachefs/tests.c +++ /dev/null @@ -1,891 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_TESTS - -#include "bcachefs.h" -#include "btree_update.h" -#include "journal_reclaim.h" -#include "snapshot.h" -#include "tests.h" - -#include "linux/kthread.h" -#include "linux/random.h" - -static void delete_test_keys(struct bch_fs *c) -{ - int ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); - - ret = 
bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); - BUG_ON(ret); -} - -/* unit tests */ - -static int test_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - pr_info("deleting once"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (first)"); - if (ret) - goto err; - - pr_info("deleting twice"); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error (second)"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_delete_written(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k.p.snapshot = U32_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, &k.k_i, 0)); - bch_err_msg(c, ret, "update error"); - if (ret) - goto err; - - bch2_trans_unlock(trans); - bch2_journal_flush_all_pins(&c->journal); - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, 0)); - bch_err_msg(c, ret, "delete error"); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_iterate(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != --i); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test extents"); - - for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 8; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - 
pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating backwards"); - - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, ({ - BUG_ON(k.k->p.offset != i); - i = bkey_start_offset(k.k); - 0; - }))); - bch_err_msg(c, ret, "error iterating backwards"); - if (ret) - return ret; - - BUG_ON(i); - return 0; -} - -static int test_iterate_slots(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i++) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i * 2; - ck.k.p.snapshot = U32_MAX; - - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr * 2); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i >= nr * 2) - break; - - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); - - i++; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) -{ - u64 i; - int ret = 0; - - delete_test_keys(c); - - pr_info("inserting test keys"); - - for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie ck; - - bkey_cookie_init(&ck.k_i); - ck.k.p.offset = i + 16; - ck.k.p.snapshot = U32_MAX; - ck.k.size = 8; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0); - bch_err_msg(c, ret, "insert error"); - if (ret) - return ret; - } - - pr_info("iterating forwards"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards"); - if (ret) - return ret; - - BUG_ON(i != nr); - - pr_info("iterating forwards by slots"); - i = 0; - - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); - - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - }))); - bch_err_msg(c, ret, "error iterating forwards by slots"); - return ret; -} - -/* - * XXX: we really want to make sure we've got a btree with depth > 0 for these - * tests - */ -static int test_peek_end(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, 
&iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -static int test_peek_end_extents(struct bch_fs *c, u64 nr) -{ - delete_test_keys(c); - - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - BUG_ON(k.k); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return 0; -} - -/* extent unit tests */ - -static u64 test_version; - -static int insert_test_extent(struct bch_fs *c, - u64 start, u64 end) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.offset = end; - k.k_i.k.p.snapshot = U32_MAX; - k.k_i.k.size = end - start; - k.k_i.k.bversion.lo = test_version++; - - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __test_extent_overwrite(struct bch_fs *c, - u64 e1_start, u64 e1_end, - u64 e2_start, u64 e2_end) -{ - int ret; - - ret = insert_test_extent(c, e1_start, e1_end) ?: - insert_test_extent(c, e2_start, e2_end); - - delete_test_keys(c); - return ret; -} - -static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 0, 32) ?: - __test_extent_overwrite(c, 8, 64, 0, 32); -} - -static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 64) ?: - __test_extent_overwrite(c, 0, 64, 32, 72); -} - -static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 0, 64, 32, 40); -} - -static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) -{ - return __test_extent_overwrite(c, 32, 64, 0, 64) ?: - __test_extent_overwrite(c, 32, 64, 0, 128) ?: - __test_extent_overwrite(c, 32, 64, 32, 64) ?: - __test_extent_overwrite(c, 32, 64, 32, 128); -} - -static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) -{ - struct bkey_i_cookie k; - int ret; - - bkey_cookie_init(&k.k_i); - k.k_i.k.p.inode = inum; - k.k_i.k.p.offset = start + len; - k.k_i.k.p.snapshot = snapid; - k.k_i.k.size = len; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_internal_snapshot_node)); - bch_err_fn(c, ret); - return ret; -} - -static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) -{ - return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ - insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ - insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: - insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: - insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); -} - -/* snapshot unit tests */ - -/* Test skipping over keys in unrelated snapshots: */ -static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) -{ - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - 
struct bkey_i_cookie cookie; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); - - BUG_ON(k.k->p.snapshot != U32_MAX); - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int test_snapshots(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie cookie; - u32 snapids[2]; - u32 snapid_subvols[2] = { 1, 1 }; - int ret; - - bkey_cookie_init(&cookie.k_i); - cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); - if (ret) - return ret; - - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(trans, U32_MAX, - snapids, - snapid_subvols, - 2)); - if (ret) - return ret; - - if (snapids[0] > snapids[1]) - swap(snapids[0], snapids[1]); - - ret = test_snapshot_filter(c, snapids[0], snapids[1]); - bch_err_msg(c, ret, "from test_snapshot_filter"); - return ret; -} - -/* perf tests */ - -static u64 test_rand(void) -{ - u64 v; - - get_random_bytes(&v, sizeof(v)); - return v; -} - -static int rand_insert(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k; - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - bkey_cookie_init(&k.k_i); - k.k.p.offset = test_rand(); - k.k.p.snapshot = U32_MAX; - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_insert_multi(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct bkey_i_cookie k[8]; - int ret = 0; - unsigned j; - u64 i; - - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { - for (j = 0; j < ARRAY_SIZE(k); j++) { - bkey_cookie_init(&k[j].k_i); - k[j].k.p.offset = test_rand(); - k[j].k.p.snapshot = U32_MAX; - } - - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int rand_lookup(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u64 i; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - ret = bkey_err(k); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int rand_mixed_trans(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i_cookie *cookie, - u64 i, u64 pos) -{ - struct bkey_s_c k; - int ret; - - bch2_btree_iter_set_pos(trans, iter, SPOS(0, 
pos, U32_MAX)); - - k = bch2_btree_iter_peek(trans, iter); - ret = bkey_err(k); - bch_err_msg(trans->c, ret, "lookup error"); - if (ret) - return ret; - - if (!(i & 3) && k.k) { - bkey_cookie_init(&cookie->k_i); - cookie->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); - } - - return ret; -} - -static int rand_mixed(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie cookie; - int ret = 0; - u64 i, rand; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - rand = test_rand(); - ret = commit_do(trans, NULL, NULL, 0, - rand_mixed_trans(trans, &iter, &cookie, i, rand)); - if (ret) - break; - } - - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; -} - -static int __do_delete(struct btree_trans *trans, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k) - goto err; - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int rand_delete(struct bch_fs *c, u64 nr) -{ - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - u64 i; - - for (i = 0; i < nr; i++) { - struct bpos pos = SPOS(0, test_rand(), U32_MAX); - - ret = commit_do(trans, NULL, NULL, 0, - __do_delete(trans, pos)); - if (ret) - break; - } - - bch2_trans_put(trans); - return ret; -} - -static int seq_insert(struct bch_fs *c, u64 nr) -{ - struct bkey_i_cookie insert; - - bkey_cookie_init(&insert.k_i); - - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - if (iter.pos.offset >= nr) - break; - insert.k.p = iter.pos; - bch2_trans_update(trans, &iter, &insert.k_i, 0); - }))); -} - -static int seq_lookup(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, - 0)); -} - -static int seq_overwrite(struct bch_fs *c, u64 nr) -{ - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - BTREE_ITER_intent, k, - NULL, NULL, 0, ({ - struct bkey_i_cookie u; - - bkey_reassemble(&u.k_i, k); - bch2_trans_update(trans, &iter, &u.k_i, 0); - }))); -} - -static int seq_delete(struct bch_fs *c, u64 nr) -{ - return bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), - POS(0, U64_MAX), - 0, NULL); -} - -typedef int (*perf_test_fn)(struct bch_fs *, u64); - -struct test_job { - struct bch_fs *c; - u64 nr; - unsigned nr_threads; - perf_test_fn fn; - - atomic_t ready; - wait_queue_head_t ready_wait; - - atomic_t done; - struct completion done_completion; - - u64 start; - u64 finish; - int ret; -}; - -static int btree_perf_test_thread(void *data) -{ - struct test_job *j = data; - int ret; - - if (atomic_dec_and_test(&j->ready)) { - wake_up(&j->ready_wait); - j->start = sched_clock(); - } else { - wait_event(j->ready_wait, !atomic_read(&j->ready)); - } - - ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); - if (ret) { - bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); - j->ret = ret; - } - - if (atomic_dec_and_test(&j->done)) { - j->finish = sched_clock(); - 
complete(&j->done_completion); - } - - return 0; -} - -int bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) -{ - struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; - char name_buf[20]; - struct printbuf nr_buf = PRINTBUF; - struct printbuf per_sec_buf = PRINTBUF; - unsigned i; - u64 time; - - if (nr == 0 || nr_threads == 0) { - pr_err("nr of iterations or threads is not allowed to be 0"); - return -EINVAL; - } - - atomic_set(&j.ready, nr_threads); - init_waitqueue_head(&j.ready_wait); - - atomic_set(&j.done, nr_threads); - init_completion(&j.done_completion); - -#define perf_test(_test) \ - if (!strcmp(testname, #_test)) j.fn = _test - - perf_test(rand_insert); - perf_test(rand_insert_multi); - perf_test(rand_lookup); - perf_test(rand_mixed); - perf_test(rand_delete); - - perf_test(seq_insert); - perf_test(seq_lookup); - perf_test(seq_overwrite); - perf_test(seq_delete); - - /* a unit test, not a perf test: */ - perf_test(test_delete); - perf_test(test_delete_written); - perf_test(test_iterate); - perf_test(test_iterate_extents); - perf_test(test_iterate_slots); - perf_test(test_iterate_slots_extents); - perf_test(test_peek_end); - perf_test(test_peek_end_extents); - - perf_test(test_extent_overwrite_front); - perf_test(test_extent_overwrite_back); - perf_test(test_extent_overwrite_middle); - perf_test(test_extent_overwrite_all); - perf_test(test_extent_create_overlapping); - - perf_test(test_snapshots); - - if (!j.fn) { - pr_err("unknown test %s", testname); - return -EINVAL; - } - - //pr_info("running test %s:", testname); - - if (nr_threads == 1) - btree_perf_test_thread(&j); - else - for (i = 0; i < nr_threads; i++) - kthread_run(btree_perf_test_thread, &j, - "bcachefs perf test[%u]", i); - - while (wait_for_completion_interruptible(&j.done_completion)) - ; - - time = j.finish - j.start; - - scnprintf(name_buf, sizeof(name_buf), "%s:", testname); - prt_human_readable_u64(&nr_buf, nr); - prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); - printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", - name_buf, nr_buf.buf, nr_threads, - div_u64(time, NSEC_PER_SEC), - div_u64(time * nr_threads, nr), - per_sec_buf.buf); - printbuf_exit(&per_sec_buf); - printbuf_exit(&nr_buf); - return j.ret; -} - -#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h deleted file mode 100644 index c73b18aea7e01d..00000000000000 --- a/fs/bcachefs/tests.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TEST_H -#define _BCACHEFS_TEST_H - -struct bch_fs; - -#ifdef CONFIG_BCACHEFS_TESTS - -int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); - -#else - -#endif /* CONFIG_BCACHEFS_TESTS */ - -#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c deleted file mode 100644 index 314a24d15d4e7c..00000000000000 --- a/fs/bcachefs/thread_with_file.c +++ /dev/null @@ -1,494 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef NO_BCACHEFS_FS - -#include "bcachefs.h" -#include "thread_with_file.h" - -#include -#include -#include -#include -#include -#include - -void bch2_thread_with_file_exit(struct thread_with_file *thr) -{ - if (thr->task) { - kthread_stop(thr->task); - put_task_struct(thr->task); - } -} - -int bch2_run_thread_with_file(struct thread_with_file *thr, - const struct file_operations *fops, - int (*fn)(void *)) -{ - struct file *file = NULL; - 
int ret, fd = -1; - unsigned fd_flags = O_CLOEXEC; - - if (fops->read && fops->write) - fd_flags |= O_RDWR; - else if (fops->read) - fd_flags |= O_RDONLY; - else if (fops->write) - fd_flags |= O_WRONLY; - - char name[TASK_COMM_LEN]; - get_task_comm(name, current); - - thr->ret = 0; - thr->task = kthread_create(fn, thr, "%s", name); - ret = PTR_ERR_OR_ZERO(thr->task); - if (ret) - return ret; - - ret = get_unused_fd_flags(fd_flags); - if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile(name, fops, thr, fd_flags); - ret = PTR_ERR_OR_ZERO(file); - if (ret) - goto err; - - get_task_struct(thr->task); - wake_up_process(thr->task); - fd_install(fd, file); - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (thr->task) - kthread_stop(thr->task); - return ret; -} - -/* stdio_redirect */ - -static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen) -{ - return stdio->input.buf.nr > seen || stdio->done; -} - -static bool stdio_redirect_has_input(struct stdio_redirect *stdio) -{ - return stdio_redirect_has_more_input(stdio, 0); -} - -static bool stdio_redirect_has_output(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr || stdio->done; -} - -#define STDIO_REDIRECT_BUFSIZE 4096 - -static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) -{ - return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) -{ - return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; -} - -static void stdio_buf_init(struct stdio_buf *buf) -{ - spin_lock_init(&buf->lock); - init_waitqueue_head(&buf->wait); - darray_init(&buf->buf); -} - -/* thread_with_stdio */ - -static void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input.wait); - wake_up(&thr->stdio.output.wait); -} - -static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.output; - size_t copied = 0, b; - int ret = 0; - - if (!(file->f_flags & O_NONBLOCK)) { - ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); - if (ret) - return ret; - } else if (!stdio_redirect_has_output(&thr->stdio)) - return -EAGAIN; - - while (len && buf->buf.nr) { - if (fault_in_writeable(ubuf, len) == len) { - ret = -EFAULT; - break; - } - - spin_lock_irq(&buf->lock); - b = min_t(size_t, len, buf->buf.nr); - - if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { - ubuf += b; - len -= b; - copied += b; - buf->buf.nr -= b; - memmove(buf->buf.data, - buf->buf.data + b, - buf->buf.nr); - } - spin_unlock_irq(&buf->lock); - } - - return copied ?: ret; -} - -static int thread_with_stdio_release(struct inode *inode, struct file *file) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - thread_with_stdio_done(thr); - bch2_thread_with_file_exit(&thr->thr); - darray_exit(&thr->stdio.input.buf); - darray_exit(&thr->stdio.output.buf); - thr->ops->exit(thr); - return 0; -} - -static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, - size_t len, loff_t *ppos) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - struct stdio_buf *buf = &thr->stdio.input; - size_t copied = 0; - ssize_t ret = 0; - 
- while (len) { - if (thr->thr.done) { - ret = -EPIPE; - break; - } - - size_t b = len - fault_in_readable(ubuf, len); - if (!b) { - ret = -EFAULT; - break; - } - - spin_lock(&buf->lock); - size_t makeroom = b; - if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) - makeroom = min_t(ssize_t, makeroom, - max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, - 0)); - darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); - - b = min(len, darray_room(buf->buf)); - - if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { - buf->buf.nr += b; - ubuf += b; - len -= b; - copied += b; - } - spin_unlock(&buf->lock); - - if (b) { - wake_up(&buf->wait); - } else { - if ((file->f_flags & O_NONBLOCK)) { - ret = -EAGAIN; - break; - } - - ret = wait_event_interruptible(buf->wait, - stdio_redirect_has_input_space(&thr->stdio)); - if (ret) - break; - } - } - - return copied ?: ret; -} - -static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - poll_wait(file, &thr->stdio.input.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (stdio_redirect_has_input_space(&thr->stdio)) - mask |= EPOLLOUT; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - poll_wait(file, &thr->stdio.output.wait, wait); - - __poll_t mask = 0; - - if (stdio_redirect_has_output(&thr->stdio)) - mask |= EPOLLIN; - if (thr->thr.done) - mask |= EPOLLHUP|EPOLLERR; - return mask; -} - -static int thread_with_stdio_flush(struct file *file, fl_owner_t id) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - return thr->thr.ret; -} - -static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) -{ - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - - if (thr->ops->unlocked_ioctl) - return thr->ops->unlocked_ioctl(thr, cmd, p); - return -ENOTTY; -} - -static const struct file_operations thread_with_stdio_fops = { - .read = thread_with_stdio_read, - .write = thread_with_stdio_write, - .poll = thread_with_stdio_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static const struct file_operations thread_with_stdout_fops = { - .read = thread_with_stdio_read, - .poll = thread_with_stdout_poll, - .flush = thread_with_stdio_flush, - .release = thread_with_stdio_release, - .unlocked_ioctl = thread_with_stdio_ioctl, -}; - -static int thread_with_stdio_fn(void *arg) -{ - struct thread_with_stdio *thr = arg; - - thr->thr.ret = thr->ops->fn(thr); - - thread_with_stdio_done(thr); - return 0; -} - -void bch2_thread_with_stdio_init(struct thread_with_stdio *thr, - const struct thread_with_stdio_ops *ops) -{ - stdio_buf_init(&thr->stdio.input); - stdio_buf_init(&thr->stdio.output); - thr->ops = ops; -} - -int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr) -{ - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); -} - -int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - const struct 
-
-static int thread_with_stdio_fn(void *arg)
-{
-    struct thread_with_stdio *thr = arg;
-
-    thr->thr.ret = thr->ops->fn(thr);
-
-    thread_with_stdio_done(thr);
-    return 0;
-}
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
-                 const struct thread_with_stdio_ops *ops)
-{
-    stdio_buf_init(&thr->stdio.input);
-    stdio_buf_init(&thr->stdio.output);
-    thr->ops = ops;
-}
-
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
-{
-    return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
-                   const struct thread_with_stdio_ops *ops)
-{
-    bch2_thread_with_stdio_init(thr, ops);
-
-    return __bch2_run_thread_with_stdio(thr);
-}
-
-int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
-                const struct thread_with_stdio_ops *ops)
-{
-    stdio_buf_init(&thr->stdio.input);
-    stdio_buf_init(&thr->stdio.output);
-    thr->ops = ops;
-
-    return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
-}
-EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
-{
-    struct stdio_buf *buf = &stdio->input;
-
-    /*
-     * we're waiting on user input (or for the file descriptor to be
-     * closed), don't want a hung task warning:
-     */
-    do {
-        wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
-                   sysctl_hung_task_timeout_secs * HZ / 2);
-    } while (!stdio_redirect_has_input(stdio));
-
-    if (stdio->done)
-        return -1;
-
-    spin_lock(&buf->lock);
-    int ret = min(len, buf->buf.nr);
-    buf->buf.nr -= ret;
-    memcpy(ubuf, buf->buf.data, ret);
-    memmove(buf->buf.data,
-        buf->buf.data + ret,
-        buf->buf.nr);
-    spin_unlock(&buf->lock);
-
-    wake_up(&buf->wait);
-    return ret;
-}
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
-                     darray_char *line,
-                     unsigned long timeout)
-{
-    unsigned long until = jiffies + timeout, t;
-    struct stdio_buf *buf = &stdio->input;
-    size_t seen = 0;
-again:
-    t = timeout != MAX_SCHEDULE_TIMEOUT
-        ? max_t(long, until - jiffies, 0)
-        : timeout;
-
-    t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
-
-    wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
-
-    if (stdio->done)
-        return -1;
-
-    spin_lock(&buf->lock);
-    seen = buf->buf.nr;
-    char *n = memchr(buf->buf.data, '\n', seen);
-
-    if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
-        spin_unlock(&buf->lock);
-        return -ETIME;
-    }
-
-    if (!n) {
-        buf->waiting_for_line = true;
-        spin_unlock(&buf->lock);
-        goto again;
-    }
-
-    size_t b = n + 1 - buf->buf.data;
-    if (b > line->size) {
-        spin_unlock(&buf->lock);
-        int ret = darray_resize(line, b);
-        if (ret)
-            return ret;
-        seen = 0;
-        goto again;
-    }
-
-    buf->buf.nr -= b;
-    memcpy(line->data, buf->buf.data, b);
-    memmove(buf->buf.data,
-        buf->buf.data + b,
-        buf->buf.nr);
-    line->nr = b;
-
-    buf->waiting_for_line = false;
-    spin_unlock(&buf->lock);
-
-    wake_up(&buf->wait);
-    return 0;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
-{
-    return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
-}
-
-__printf(3, 0)
-static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
-{
-    ssize_t ret;
-
-    do {
-        va_list args2;
-        size_t len;
-
-        va_copy(args2, args);
-        len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
-        va_end(args2);
-
-        if (len + 1 <= darray_room(*out)) {
-            out->nr += len;
-            return len;
-        }
-
-        ret = darray_make_room_gfp(out, len + 1, gfp);
-    } while (ret == 0);
-
-    return ret;
-}
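/*
 * Editor's note (illustration, not part of the patch): bch2_darray_vprintf()
 * above uses the classic vsnprintf() idiom - format into whatever room is
 * left, and if the returned length didn't fit, grow to len + 1 (for the NUL)
 * and retry. The same idiom in plain userspace C, with malloc() standing in
 * for darray_make_room_gfp():
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *xvasprintf(const char *fmt, va_list args)
{
    va_list args2;

    va_copy(args2, args);
    int len = vsnprintf(NULL, 0, fmt, args2);   /* measure only */
    va_end(args2);
    if (len < 0)
        return NULL;

    char *buf = malloc(len + 1);                /* + 1 for the NUL */
    if (buf)
        vsnprintf(buf, len + 1, fmt, args);     /* format for real */
    return buf;
}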
-
-ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
-                    const char *fmt, va_list args)
-{
-    struct stdio_buf *buf = &stdio->output;
-    unsigned long flags;
-    ssize_t ret;
-again:
-    if (stdio->done)
-        return -EPIPE;
-
-    spin_lock_irqsave(&buf->lock, flags);
-    ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
-    spin_unlock_irqrestore(&buf->lock, flags);
-
-    if (ret < 0) {
-        if (nonblocking)
-            return -EAGAIN;
-
-        ret = wait_event_interruptible(buf->wait,
-                stdio_redirect_has_output_space(stdio));
-        if (ret)
-            return ret;
-        goto again;
-    }
-
-    wake_up(&buf->wait);
-    return ret;
-}
-
-ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
-                   const char *fmt, ...)
-{
-    va_list args;
-    ssize_t ret;
-
-    va_start(args, fmt);
-    ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
-    va_end(args);
-
-    return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
deleted file mode 100644
index 72497b9219113b..00000000000000
--- a/fs/bcachefs/thread_with_file.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-/*
- * Thread with file: Run a kthread and connect it to a file descriptor, so that
- * it can be interacted with via fd read/write methods and closing the file
- * descriptor stops the kthread.
- *
- * We have two different APIs:
- *
- * thread_with_file, the low level version.
- * You get to define the full file_operations, including your release function,
- * which means that you must call bch2_thread_with_file_exit() from your
- * .release method
- *
- * thread_with_stdio, the higher level version
- * This implements full piping of input and output, including .poll.
- *
- * Notes on behaviour:
- * - kthread shutdown behaves like writing or reading from a pipe that has been
- *   closed
- * - Input and output buffers are 4096 bytes, although buffers may in some
- *   situations slightly exceed that limit so as to avoid chopping off a
- *   message in the middle in nonblocking mode.
- * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
- *   should be fine but might change in future revisions.
- * - Output buffer may grow past 4096 bytes to deal with messages that are
- *   bigger than 4096 bytes
- * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
- *   drop entire messages.
- *
- * To write, use stdio_redirect_printf()
- * To read, use stdio_redirect_read() or stdio_redirect_readline()
- */
-
-struct task_struct;
-
-struct thread_with_file {
-    struct task_struct  *task;
-    int                 ret;
-    bool                done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
-                  const struct file_operations *,
-                  int (*fn)(void *));
-
-struct thread_with_stdio;
-
-struct thread_with_stdio_ops {
-    void (*exit)(struct thread_with_stdio *);
-    int (*fn)(struct thread_with_stdio *);
-    long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
-};
-
-struct thread_with_stdio {
-    struct thread_with_file thr;
-    struct stdio_redirect   stdio;
-    const struct thread_with_stdio_ops *ops;
-};
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *,
-                 const struct thread_with_stdio_ops *);
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
-                   const struct thread_with_stdio_ops *);
-int bch2_run_thread_with_stdout(struct thread_with_stdio *,
-                const struct thread_with_stdio_ops *);
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
-
-__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
-__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
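/*
 * Editor's sketch, not from this patch: a minimal consumer of the
 * thread_with_stdio API declared above. Everything here - my_cmd_thread,
 * my_cmd_fn(), the line-echo loop - is hypothetical; only the bch2_* calls
 * and the ops hooks come from the header (error unwind elided for brevity):
 */
struct my_cmd_thread {
    struct thread_with_stdio    thr;
};

static void my_cmd_exit(struct thread_with_stdio *thr)
{
    kfree(container_of(thr, struct my_cmd_thread, thr));
}

static int my_cmd_fn(struct thread_with_stdio *thr)
{
    darray_char line = {};

    /* readline returns < 0 once the fd is closed (pipe-like EOF) */
    while (bch2_stdio_redirect_readline(&thr->stdio, &line) >= 0)
        bch2_stdio_redirect_printf(&thr->stdio, false,
                       "got %zu bytes\n", line.nr);
    darray_exit(&line);
    return 0;
}

static const struct thread_with_stdio_ops my_cmd_ops = {
    .exit   = my_cmd_exit,
    .fn     = my_cmd_fn,
};

/* returns an fd on success; callers get poll/read/write on the thread */
static int my_cmd_open(void)
{
    struct my_cmd_thread *t = kzalloc(sizeof(*t), GFP_KERNEL);

    if (!t)
        return -ENOMEM;
    return bch2_run_thread_with_stdio(&t->thr, &my_cmd_ops);
}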
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
deleted file mode 100644
index f4d484d44f6334..00000000000000
--- a/fs/bcachefs/thread_with_file_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-#include "darray.h"
-
-struct stdio_buf {
-    spinlock_t          lock;
-    wait_queue_head_t   wait;
-    darray_char         buf;
-    bool                waiting_for_line;
-};
-
-struct stdio_redirect {
-    struct stdio_buf    input;
-    struct stdio_buf    output;
-    bool                done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
deleted file mode 100644
index 2c34fe4be91202..00000000000000
--- a/fs/bcachefs/time_stats.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-
-#include "eytzinger.h"
-#include "time_stats.h"
-
-/* disable automatic switching to percpu mode */
-#define TIME_STATS_NONPCPU  ((unsigned long) 1)
-
-static const struct time_unit time_units[] = {
-    { "ns",     1                },
-    { "us",     NSEC_PER_USEC    },
-    { "ms",     NSEC_PER_MSEC    },
-    { "s",      NSEC_PER_SEC     },
-    { "m",      (u64) NSEC_PER_SEC * 60},
-    { "h",      (u64) NSEC_PER_SEC * 3600},
-    { "d",      (u64) NSEC_PER_SEC * 3600 * 24},
-    { "w",      (u64) NSEC_PER_SEC * 3600 * 24 * 7},
-    { "y",      (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
-    { "eon",    U64_MAX          },
-};
-
-const struct time_unit *bch2_pick_time_units(u64 ns)
-{
-    const struct time_unit *u;
-
-    for (u = time_units;
-         u + 1 < time_units + ARRAY_SIZE(time_units) &&
-         ns >= u[1].nsecs << 1;
-         u++)
-        ;
-
-    return u;
-}
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
-    unsigned i = 0;
-
-    while (i < ARRAY_SIZE(q->entries)) {
-        struct quantile_entry *e = q->entries + i;
-
-        if (unlikely(!e->step)) {
-            e->m = v;
-            e->step = max_t(unsigned, v / 2, 1024);
-        } else if (e->m > v) {
-            e->m = e->m >= e->step
-                ? e->m - e->step
-                : 0;
-        } else if (e->m < v) {
-            e->m = e->m + e->step > e->m
-                ? e->m + e->step
-                : U32_MAX;
-        }
-
-        if ((e->m > v ? e->m - v : v - e->m) < e->step)
-            e->step = max_t(unsigned, e->step / 2, 1);
-
-        if (v >= e->m)
-            break;
-
-        i = eytzinger0_child(i, v > e->m);
-    }
-}
-
-static inline void time_stats_update_one(struct bch2_time_stats *stats,
-                     u64 start, u64 end)
-{
-    u64 duration, freq;
-    bool initted = stats->last_event != 0;
-
-    if (time_after64(end, start)) {
-        struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
-        duration = end - start;
-        mean_and_variance_update(&stats->duration_stats, duration);
-        mean_and_variance_weighted_update(&stats->duration_stats_weighted,
-                duration, initted, TIME_STATS_MV_WEIGHT);
-        stats->max_duration = max(stats->max_duration, duration);
-        stats->min_duration = min(stats->min_duration, duration);
-        stats->total_duration += duration;
-
-        if (quantiles)
-            quantiles_update(quantiles, duration);
-    }
-
-    if (stats->last_event && time_after64(end, stats->last_event)) {
-        freq = end - stats->last_event;
-        mean_and_variance_update(&stats->freq_stats, freq);
-        mean_and_variance_weighted_update(&stats->freq_stats_weighted,
-                freq, initted, TIME_STATS_MV_WEIGHT);
-        stats->max_freq = max(stats->max_freq, freq);
-        stats->min_freq = min(stats->min_freq, freq);
-    }
-
-    stats->last_event = end;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-                    struct time_stat_buffer *b)
-{
-    for (struct time_stat_buffer_entry *i = b->entries;
-         i < b->entries + ARRAY_SIZE(b->entries);
-         i++)
-        time_stats_update_one(stats, i->start, i->end);
-    b->nr = 0;
-}
-
-static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
-                         struct time_stat_buffer *b)
-{
-    unsigned long flags;
-
-    spin_lock_irqsave(&stats->lock, flags);
-    __bch2_time_stats_clear_buffer(stats, b);
-    spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
-    unsigned long flags;
-
-    if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
-        spin_lock_irqsave(&stats->lock, flags);
-        time_stats_update_one(stats, start, end);
-
-        if (!stats->buffer &&
-            mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
-            stats->duration_stats.n > 1024)
-            stats->buffer =
-                alloc_percpu_gfp(struct time_stat_buffer,
-                         GFP_ATOMIC);
-        spin_unlock_irqrestore(&stats->lock, flags);
-    } else {
-        struct time_stat_buffer *b;
-
-        preempt_disable();
-        b = this_cpu_ptr(stats->buffer);
-
-        BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-        b->entries[b->nr++] = (struct time_stat_buffer_entry) {
-            .start  = start,
-            .end    = end
-        };
-
-        if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-            time_stats_clear_buffer(stats, b);
-        preempt_enable();
-    }
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *stats)
-{
-    spin_lock_irq(&stats->lock);
-    unsigned offset = offsetof(struct bch2_time_stats, min_duration);
-    memset((void *) stats + offset, 0, sizeof(*stats) - offset);
-
-    if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
-        int cpu;
-        for_each_possible_cpu(cpu)
-            per_cpu_ptr(stats->buffer, cpu)->nr = 0;
-    }
-    spin_unlock_irq(&stats->lock);
-}
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
-    if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
-        free_percpu(stats->buffer);
-    stats->buffer = NULL;
-}
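/*
 * Editor's note (illustration only): __bch2_time_stats_update() above takes
 * the spinlock for every event until the event rate gets high (weighted mean
 * inter-event time < 32ns over 1024+ samples), then switches to buffered
 * percpu mode; TIME_STATS_NONPCPU reuses a small pointer value as a sentinel
 * meaning "never switch". The same amortization pattern in userspace C, with
 * a per-thread batch flushed under one lock:
 */
#include <pthread.h>

#define BATCH   32

static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long nr_events, sum_ns;

static _Thread_local struct {
    unsigned            nr;
    unsigned long long  ns[BATCH];
} batch;

static void stats_flush(void)
{
    pthread_mutex_lock(&stats_lock);
    for (unsigned i = 0; i < batch.nr; i++) {
        nr_events++;
        sum_ns += batch.ns[i];
    }
    pthread_mutex_unlock(&stats_lock);
    batch.nr = 0;
}

/* one lock acquisition per BATCH events instead of one per event */
static void stats_update(unsigned long long duration_ns)
{
    batch.ns[batch.nr++] = duration_ns;
    if (batch.nr == BATCH)
        stats_flush();
}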
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
-    memset(stats, 0, sizeof(*stats));
-    stats->min_duration = U64_MAX;
-    stats->min_freq = U64_MAX;
-    spin_lock_init(&stats->lock);
-}
-
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
-{
-    bch2_time_stats_init(stats);
-    stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
-}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
deleted file mode 100644
index eddb0985bab4bc..00000000000000
--- a/fs/bcachefs/time_stats.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * bch2_time_stats - collect statistics on events that have a duration, with nicely
- * formatted textual output on demand
- *
- * - percpu buffering of event collection: cheap enough to shotgun
- *   everywhere without worrying about overhead
- *
- * tracks:
- *  - number of events
- *  - maximum event duration ever seen
- *  - sum of all event durations
- *  - average event duration, standard and weighted
- *  - standard deviation of event durations, standard and weighted
- * and analogous statistics for the frequency of events
- *
- * We provide both mean and weighted mean (exponentially weighted), and standard
- * deviation and weighted standard deviation, to give an efficient-to-compute
- * view of current behaviour versus average behaviour - "did this event source
- * just become wonky, or is this typical?".
- *
- * Particularly useful for tracking down latency issues.
- */
-#ifndef _BCACHEFS_TIME_STATS_H
-#define _BCACHEFS_TIME_STATS_H
-
-#include <linux/sched/clock.h>
-#include <linux/spinlock_types.h>
-#include <linux/string.h>
-
-#include "mean_and_variance.h"
-
-struct time_unit {
-    const char  *name;
-    u64         nsecs;
-};
-
-/*
- * given a nanosecond value, pick the preferred time units for printing:
- */
-const struct time_unit *bch2_pick_time_units(u64 ns);
-
-/*
- * quantiles - do not use:
- *
- * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
- * use in new code.
- */
-
-#define NR_QUANTILES    15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST  eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST   eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
-    struct quantile_entry {
-        u64 m;
-        u64 step;
-    } entries[NR_QUANTILES];
-};
-
-struct time_stat_buffer {
-    unsigned    nr;
-    struct time_stat_buffer_entry {
-        u64 start;
-        u64 end;
-    } entries[31];
-};
-
-struct bch2_time_stats {
-    spinlock_t  lock;
-    bool        have_quantiles;
-    struct time_stat_buffer __percpu *buffer;
-    /* all fields are in nanoseconds */
-    u64         min_duration;
-    u64         max_duration;
-    u64         total_duration;
-    u64         max_freq;
-    u64         min_freq;
-    u64         last_event;
-    u64         last_event_start;
-
-    struct mean_and_variance    duration_stats;
-    struct mean_and_variance    freq_stats;
-
-/* default weight for weighted mean and variance calculations */
-#define TIME_STATS_MV_WEIGHT    8
-
-    struct mean_and_variance_weighted duration_stats_weighted;
-    struct mean_and_variance_weighted freq_stats_weighted;
-};
-
-struct bch2_time_stats_quantiles {
-    struct bch2_time_stats  stats;
-    struct quantiles        quantiles;
-};
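/*
 * Editor's note (worked example, not from the patch): the "weighted" stats
 * above are exponentially weighted with TIME_STATS_MV_WEIGHT = 8, i.e. each
 * update moves the weighted mean roughly 1/8th of the way toward the new
 * sample, so recent behaviour dominates. A toy comparison of a plain running
 * mean against such an EWMA across a workload regime change:
 */
#include <stdio.h>

int main(void)
{
    double mean = 0, ewma = 10;
    int n = 0;

    for (int i = 0; i < 20; i++) {
        double v = i < 4 ? 10 : 100;    /* latencies jump from 10 to 100 */
        mean += (v - mean) / ++n;       /* plain running mean */
        ewma += (v - ewma) / 8;         /* EWMA, weight 8 */
    }
    /* prints mean 82.0 ewma 89.4 - the EWMA tracks the new regime faster */
    printf("mean %.1f ewma %.1f\n", mean, ewma);
    return 0;
}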
-
-static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
-{
-    return stats->have_quantiles
-        ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
-        : NULL;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-/**
- * bch2_time_stats_update - collect a new event being tracked
- *
- * @stats - bch2_time_stats to update
- * @start - start time of event, recorded with local_clock()
- *
- * The end duration of the event will be the current time
- */
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
-    __bch2_time_stats_update(stats, start, local_clock());
-}
-
-/**
- * track_event_change - track state change events
- *
- * @stats - bch2_time_stats to update
- * @v - new state, true or false
- *
- * Use this when tracking time stats for state changes, i.e. resource X becoming
- * blocked/unblocked.
- */
-static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
-{
-    if (v != !!stats->last_event_start) {
-        if (!v) {
-            bch2_time_stats_update(stats, stats->last_event_start);
-            stats->last_event_start = 0;
-        } else {
-            stats->last_event_start = local_clock() ?: 1;
-            return true;
-        }
-    }
-
-    return false;
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
-
-static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
-{
-    bch2_time_stats_exit(&statq->stats);
-}
-static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
-{
-    bch2_time_stats_init(&statq->stats);
-    statq->stats.have_quantiles = true;
-    memset(&statq->quantiles, 0, sizeof(statq->quantiles));
-}
-
-#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
deleted file mode 100644
index dfad1d06633ddb..00000000000000
--- a/fs/bcachefs/trace.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update_interior.h"
-#include "keylist.h"
-#include "move_types.h"
-#include "opts.h"
-#include "six.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
deleted file mode 100644
index 9c5a9c551f03d7..00000000000000
--- a/fs/bcachefs/trace.h
+++ /dev/null
@@ -1,1883 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-
-#include <linux/tracepoint.h>
-
-#define TRACE_BPOS_entries(name)            \
-    __field(u64,    name##_inode    )       \
-    __field(u64,    name##_offset   )       \
-    __field(u32,    name##_snapshot )
-
-#define TRACE_BPOS_assign(dst, src)         \
-    __entry->dst##_inode    = (src).inode;  \
-    __entry->dst##_offset   = (src).offset; \
-    __entry->dst##_snapshot = (src).snapshot
-
-DECLARE_EVENT_CLASS(bpos,
-    TP_PROTO(const struct bpos *p),
-    TP_ARGS(p),
-
-    TP_STRUCT__entry(
-        TRACE_BPOS_entries(p)
-    ),
-
-    TP_fast_assign(
-        TRACE_BPOS_assign(p, *p);
-    ),
-
-    TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-);
-
-DECLARE_EVENT_CLASS(fs_str,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __string(str,   str )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = c->dev;
-        __assign_str(str);
-    ),
-
-    TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str,
-    TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
-    TP_ARGS(trans, caller_ip, str),
-
-    TP_STRUCT__entry(
-        __field(dev_t,          dev )
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __string(str,           str )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = trans->c->dev;
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip = caller_ip;
-        __assign_str(str);
-    ),
-
-    TP_printk("%d,%d %s %pS %s",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str_nocaller,
-    TP_PROTO(struct btree_trans *trans, const char *str),
-    TP_ARGS(trans, str),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __array(char,   trans_fn, 32 )
-        __string(str,   str )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = trans->c->dev;
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __assign_str(str);
-    ),
-
-    TP_printk("%d,%d %s %s",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->trans_fn, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(btree_node_nofs,
-    TP_PROTO(struct bch_fs *c, struct btree *b),
-    TP_ARGS(c, b),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u8,     level )
-        __field(u8,     btree_id )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        __entry->dev        = c->dev;
-        __entry->level      = b->c.level;
-        __entry->btree_id   = b->c.btree_id;
-        TRACE_BPOS_assign(pos, b->key.k.p);
-    ),
-
-    TP_printk("%d,%d %u %s %llu:%llu:%u",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->level,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_node,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __array(char,   trans_fn, 32 )
-        __field(u8,     level )
-        __field(u8,     btree_id )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        __entry->dev = trans->c->dev;
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->level      = b->c.level;
-        __entry->btree_id   = b->c.btree_id;
-        TRACE_BPOS_assign(pos, b->key.k.p);
-    ),
-
-    TP_printk("%d,%d %s %u %s %llu:%llu:%u",
-          MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
-          __entry->level,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
-    TP_PROTO(struct bch_fs *c),
-    TP_ARGS(c),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = c->dev;
-    ),
-
-    TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
-DECLARE_EVENT_CLASS(btree_trans,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __array(char,   trans_fn, 32 )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = trans->c->dev;
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-    ),
-
-    TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
-);
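/*
 * Editor's note (hypothetical, mirrors the classes above): one
 * DECLARE_EVENT_CLASS() emits the struct/assign/print boilerplate once, and
 * each DEFINE_EVENT() stamps out a named tracepoint sharing it. The event
 * name below is made up; trace_<name>() and trace_<name>_enabled() are the
 * helpers the tracepoint machinery generates:
 */
DEFINE_EVENT(fs_str, my_example_event,
    TP_PROTO(struct bch_fs *c, const char *str),
    TP_ARGS(c, str)
);

static inline void my_example_call_site(struct bch_fs *c)
{
    /* _enabled() lets callers skip argument setup when tracing is off */
    if (trace_my_example_event_enabled())
        trace_my_example_event(c, "something interesting happened");
}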
-
-DECLARE_EVENT_CLASS(bio,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio),
-
-    TP_STRUCT__entry(
-        __field(dev_t,          dev )
-        __field(sector_t,       sector )
-        __field(unsigned int,   nr_sector )
-        __array(char,           rwbs, 6 )
-    ),
-
-    TP_fast_assign(
-        __entry->dev        = bio->bi_bdev ? bio_dev(bio) : 0;
-        __entry->sector     = bio->bi_iter.bi_sector;
-        __entry->nr_sector  = bio->bi_iter.bi_size >> 9;
-        blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
-    ),
-
-    TP_printk("%d,%d %s %llu + %u",
-          MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-          (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* errors */
-
-TRACE_EVENT(error_throw,
-    TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
-    TP_ARGS(c, bch_err, ip),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(int,    err )
-        __array(char,   err_str, 32 )
-        __array(char,   ip, 32 )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = c->dev;
-        __entry->err = bch_err;
-        strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
-        snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
-    ),
-
-    TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->ip, __entry->err_str)
-);
-
-TRACE_EVENT(error_downcast,
-    TP_PROTO(int bch_err, int std_err, unsigned long ip),
-    TP_ARGS(bch_err, std_err, ip),
-
-    TP_STRUCT__entry(
-        __array(char,   bch_err, 32 )
-        __array(char,   std_err, 32 )
-        __array(char,   ip, 32 )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
-        strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
-        snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
-    ),
-
-    TP_printk("%s ret %s -> %s %s", __entry->ip,
-          __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
-/* disk_accounting.c */
-
-TRACE_EVENT(accounting_mem_insert,
-    TP_PROTO(struct bch_fs *c, const char *acc),
-    TP_ARGS(c, acc),
-
-    TP_STRUCT__entry(
-        __field(dev_t,      dev )
-        __field(unsigned,   new_nr )
-        __string(acc,       acc )
-    ),
-
-    TP_fast_assign(
-        __entry->dev    = c->dev;
-        __entry->new_nr = c->accounting.k.nr;
-        __assign_str(acc);
-    ),
-
-    TP_printk("%d,%d entries %u added %s",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->new_nr,
-          __get_str(acc))
-);
-
-/* fs.c: */
-TRACE_EVENT(bch2_sync_fs,
-    TP_PROTO(struct super_block *sb, int wait),
-
-    TP_ARGS(sb, wait),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(int,    wait )
-
-    ),
-
-    TP_fast_assign(
-        __entry->dev    = sb->s_dev;
-        __entry->wait   = wait;
-    ),
-
-    TP_printk("dev %d,%d wait %d",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->wait)
-);
-
-/* fs-io.c: */
-TRACE_EVENT(bch2_fsync,
-    TP_PROTO(struct file *file, int datasync),
-
-    TP_ARGS(file, datasync),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(ino_t,  ino )
-        __field(ino_t,  parent )
-        __field(int,    datasync )
-    ),
-
-    TP_fast_assign(
-        struct dentry *dentry = file->f_path.dentry;
-
-        __entry->dev        = dentry->d_sb->s_dev;
-        __entry->ino        = d_inode(dentry)->i_ino;
-        __entry->parent     = d_inode(dentry->d_parent)->i_ino;
-        __entry->datasync   = datasync;
-    ),
-
-    TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          (unsigned long) __entry->ino,
-          (unsigned long) __entry->parent, __entry->datasync)
-);
-
-/* super-io.c: */
-TRACE_EVENT(write_super,
-    TP_PROTO(struct bch_fs *c, unsigned long ip),
-    TP_ARGS(c, ip),
-
-    TP_STRUCT__entry(
-        __field(dev_t,          dev )
-        __field(unsigned long,  ip )
-    ),
-
-    TP_fast_assign(
-        __entry->dev    = c->dev;
-        __entry->ip     = ip;
-    ),
-
-    TP_printk("%d,%d for %pS",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          (void *) __entry->ip)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, io_read_promote,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-TRACE_EVENT(io_read_nopromote,
-    TP_PROTO(struct bch_fs *c, int ret),
-    TP_ARGS(c, ret),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __array(char,   ret, 32 )
-    ),
-
-    TP_fast_assign(
-        __entry->dev = c->dev;
-        strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
-    ),
-
-    TP_printk("%d,%d ret %s",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->ret)
-);
-
-DEFINE_EVENT(bio, io_read_bounce,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_split,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_retry,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_reuse_race,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_fail_and_poison,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-/* ec.c */
-
-TRACE_EVENT(stripe_create,
-    TP_PROTO(struct bch_fs *c, u64 idx, int ret),
-    TP_ARGS(c, idx, ret),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    idx )
-        __field(int,    ret )
-    ),
-
-    TP_fast_assign(
-        __entry->dev    = c->dev;
-        __entry->idx    = idx;
-        __entry->ret    = ret;
-    ),
-
-    TP_printk("%d,%d idx %llu ret %i",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->idx,
-          __entry->ret)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
-    TP_PROTO(struct bch_fs *c),
-    TP_ARGS(c)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_full,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_close,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(bio, journal_write,
-    TP_PROTO(struct bio *bio),
-    TP_ARGS(bio)
-);
-
-TRACE_EVENT(journal_reclaim_start,
-    TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
-         u64 min_nr, u64 min_key_cache,
-         u64 btree_cache_dirty, u64 btree_cache_total,
-         u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-    TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
-        btree_cache_dirty, btree_cache_total,
-        btree_key_cache_dirty, btree_key_cache_total),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(bool,   direct )
-        __field(bool,   kicked )
-        __field(u64,    min_nr )
-        __field(u64,    min_key_cache )
-        __field(u64,    btree_cache_dirty )
-        __field(u64,    btree_cache_total )
-        __field(u64,    btree_key_cache_dirty )
-        __field(u64,    btree_key_cache_total )
-    ),
-
-    TP_fast_assign(
-        __entry->dev                    = c->dev;
-        __entry->direct                 = direct;
-        __entry->kicked                 = kicked;
-        __entry->min_nr                 = min_nr;
-        __entry->min_key_cache          = min_key_cache;
-        __entry->btree_cache_dirty      = btree_cache_dirty;
-        __entry->btree_cache_total     = btree_cache_total;
-        __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
-        __entry->btree_key_cache_total  = btree_key_cache_total;
-    ),
-
-    TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->direct,
-          __entry->kicked,
-          __entry->min_nr,
-          __entry->min_key_cache,
-          __entry->btree_cache_dirty,
-          __entry->btree_cache_total,
-          __entry->btree_key_cache_dirty,
-          __entry->btree_key_cache_total)
-);
-
-TRACE_EVENT(journal_reclaim_finish,
-    TP_PROTO(struct bch_fs *c, u64 nr_flushed),
-    TP_ARGS(c, nr_flushed),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    nr_flushed )
-    ),
-
-    TP_fast_assign(
-        __entry->dev        = c->dev;
-        __entry->nr_flushed = nr_flushed;
-    ),
-
-    TP_printk("%d,%d flushed %llu",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->nr_flushed)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
-    TP_PROTO(const struct bpos *p),
-    TP_ARGS(p)
-);
-
-/* Btree cache: */
-
-TRACE_EVENT(btree_cache_scan,
-    TP_PROTO(long nr_to_scan, long can_free, long ret),
-    TP_ARGS(nr_to_scan, can_free, ret),
-
-    TP_STRUCT__entry(
-        __field(long,   nr_to_scan )
-        __field(long,   can_free )
-        __field(long,   ret )
-    ),
-
-    TP_fast_assign(
-        __entry->nr_to_scan = nr_to_scan;
-        __entry->can_free   = can_free;
-        __entry->ret        = ret;
-    ),
-
-    TP_printk("scanned for %li nodes, can free %li, ret %li",
-          __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
-    TP_PROTO(struct bch_fs *c, struct btree *b),
-    TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans)
-);
-
-/* Btree */
-
-DEFINE_EVENT(btree_node, btree_node_read,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_node_write,
-    TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
-    TP_ARGS(b, bytes, sectors),
-
-    TP_STRUCT__entry(
-        __field(enum btree_node_type,   type)
-        __field(unsigned,               bytes )
-        __field(unsigned,               sectors )
-    ),
-
-    TP_fast_assign(
-        __entry->type       = btree_node_type(b);
-        __entry->bytes      = bytes;
-        __entry->sectors    = sectors;
-    ),
-
-    TP_printk("bkey type %u bytes %u sectors %u",
-          __entry->type, __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
-    TP_PROTO(const char *trans_fn,
-         unsigned long caller_ip,
-         size_t required,
-         int ret),
-    TP_ARGS(trans_fn, caller_ip, required, ret),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(size_t,         required )
-        __array(char,           ret, 32 )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->required   = required;
-        strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
-    ),
-
-    TP_printk("%s %pS required %zu ret %s",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          __entry->required,
-          __entry->ret)
-);
-
-DEFINE_EVENT(btree_node, btree_node_compact,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_merge,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_split,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_rewrite,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_set_root,
-    TP_PROTO(struct btree_trans *trans, struct btree *b),
-    TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_path_relock_fail,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path,
-         unsigned level),
-    TP_ARGS(trans, caller_ip, path, level),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(u8,             btree_id )
-        __field(u8,             level )
-        __field(u8,             path_idx)
-        TRACE_BPOS_entries(pos)
-        __array(char,           node, 24 )
-        __field(u8,             self_read_count )
-        __field(u8,             self_intent_count)
-        __field(u8,             read_count )
-        __field(u8,             intent_count )
-        __field(u32,            iter_lock_seq )
-        __field(u32,            node_lock_seq )
-    ),
-
-    TP_fast_assign(
-        struct btree *b = btree_path_node(path, level);
-        struct six_lock_count c;
-
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->btree_id   = path->btree_id;
-        __entry->level      = level;
-        __entry->path_idx   = path - trans->paths;
-        TRACE_BPOS_assign(pos, path->pos);
-
-        c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
-        __entry->self_read_count    = c.n[SIX_LOCK_read];
-        __entry->self_intent_count  = c.n[SIX_LOCK_intent];
-
-        if (IS_ERR(b)) {
-            strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
-        } else {
-            c = six_lock_counts(&path->l[level].b->c.lock);
-            __entry->read_count     = c.n[SIX_LOCK_read];
-            __entry->intent_count   = c.n[SIX_LOCK_intent];
-            scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
-        }
-        __entry->iter_lock_seq  = path->l[level].lock_seq;
-        __entry->node_lock_seq  = is_btree_node(path, level)
-            ? six_lock_seq(&path->l[level].b->c.lock)
-            : 0;
-    ),
-
-    TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          __entry->path_idx,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot,
-          __entry->level,
-          __entry->node,
-          __entry->self_read_count,
-          __entry->self_intent_count,
-          __entry->read_count,
-          __entry->intent_count,
-          __entry->iter_lock_seq,
-          __entry->node_lock_seq)
-);
-
-TRACE_EVENT(btree_path_upgrade_fail,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path,
-         unsigned level),
-    TP_ARGS(trans, caller_ip, path, level),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(u8,             btree_id )
-        __field(u8,             level )
-        __field(u8,             path_idx)
-        TRACE_BPOS_entries(pos)
-        __field(u8,             locked )
-        __field(u8,             self_read_count )
-        __field(u8,             self_intent_count)
-        __field(u8,             read_count )
-        __field(u8,             intent_count )
-        __field(u32,            iter_lock_seq )
-        __field(u32,            node_lock_seq )
-    ),
-
-    TP_fast_assign(
-        struct six_lock_count c;
-
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->btree_id   = path->btree_id;
-        __entry->level      = level;
-        __entry->path_idx   = path - trans->paths;
-        TRACE_BPOS_assign(pos, path->pos);
-        __entry->locked     = btree_node_locked(path, level);
-
-        c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
-        __entry->self_read_count    = c.n[SIX_LOCK_read];
-        __entry->self_intent_count  = c.n[SIX_LOCK_intent];
-        c = six_lock_counts(&path->l[level].b->c.lock);
-        __entry->read_count     = c.n[SIX_LOCK_read];
-        __entry->intent_count   = c.n[SIX_LOCK_intent];
-        __entry->iter_lock_seq  = path->l[level].lock_seq;
-        __entry->node_lock_seq  = is_btree_node(path, level)
-            ? six_lock_seq(&path->l[level].b->c.lock)
-            : 0;
-    ),
-
-    TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          __entry->path_idx,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot,
-          __entry->level,
-          __entry->locked,
-          __entry->self_read_count,
-          __entry->self_intent_count,
-          __entry->read_count,
-          __entry->intent_count,
-          __entry->iter_lock_seq,
-          __entry->node_lock_seq)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(bch_fs, gc_gens_start,
-    TP_PROTO(struct bch_fs *c),
-    TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_gens_end,
-    TP_PROTO(struct bch_fs *c),
-    TP_ARGS(c)
-);
-
-/* Allocator */
-
-DEFINE_EVENT(fs_str, bucket_alloc,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, bucket_alloc_fail,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DECLARE_EVENT_CLASS(discard_buckets_class,
-    TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-         u64 need_journal_commit, u64 discarded, const char *err),
-    TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    seen )
-        __field(u64,    open )
-        __field(u64,    need_journal_commit )
-        __field(u64,    discarded )
-        __array(char,   err, 16 )
-    ),
-
-    TP_fast_assign(
-        __entry->dev                    = c->dev;
-        __entry->seen                   = seen;
-        __entry->open                   = open;
-        __entry->need_journal_commit    = need_journal_commit;
-        __entry->discarded              = discarded;
-        strscpy(__entry->err, err, sizeof(__entry->err));
-    ),
-
-    TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->seen,
-          __entry->open,
-          __entry->need_journal_commit,
-          __entry->discarded,
-          __entry->err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets,
-    TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-         u64 need_journal_commit, u64 discarded, const char *err),
-    TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
-    TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-         u64 need_journal_commit, u64 discarded, const char *err),
-    TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-TRACE_EVENT(bucket_invalidate,
-    TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
-    TP_ARGS(c, dev, bucket, sectors),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u32,    dev_idx )
-        __field(u32,    sectors )
-        __field(u64,    bucket )
-    ),
-
-    TP_fast_assign(
-        __entry->dev        = c->dev;
-        __entry->dev_idx    = dev;
-        __entry->sectors    = sectors;
-        __entry->bucket     = bucket;
-    ),
-
-    TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->dev_idx, __entry->bucket,
-          __entry->sectors)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(fs_str, io_move,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_read,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_finish,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_fail,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write_fail,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_start_fail,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-TRACE_EVENT(move_data,
-    TP_PROTO(struct bch_fs *c,
-         struct bch_move_stats *stats),
-    TP_ARGS(c, stats),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    keys_moved )
-        __field(u64,    keys_raced )
-        __field(u64,    sectors_seen )
-        __field(u64,    sectors_moved )
-        __field(u64,    sectors_raced )
-    ),
-
-    TP_fast_assign(
-        __entry->dev            = c->dev;
-        __entry->keys_moved     = atomic64_read(&stats->keys_moved);
-        __entry->keys_raced     = atomic64_read(&stats->keys_raced);
-        __entry->sectors_seen   = atomic64_read(&stats->sectors_seen);
-        __entry->sectors_moved  = atomic64_read(&stats->sectors_moved);
-        __entry->sectors_raced  = atomic64_read(&stats->sectors_raced);
-    ),
-
-    TP_printk("%d,%d keys moved %llu raced %llu "
-          "sectors seen %llu moved %llu raced %llu",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->keys_moved,
-          __entry->keys_raced,
-          __entry->sectors_seen,
-          __entry->sectors_moved,
-          __entry->sectors_raced)
-);
-
-TRACE_EVENT(copygc,
-    TP_PROTO(struct bch_fs *c,
-         u64 buckets,
-         u64 sectors_seen,
-         u64 sectors_moved),
-    TP_ARGS(c, buckets, sectors_seen, sectors_moved),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    buckets )
-        __field(u64,    sectors_seen )
-        __field(u64,    sectors_moved )
-    ),
-
-    TP_fast_assign(
-        __entry->dev            = c->dev;
-        __entry->buckets        = buckets;
-        __entry->sectors_seen   = sectors_seen;
-        __entry->sectors_moved  = sectors_moved;
-    ),
-
-    TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->buckets,
-          __entry->sectors_seen,
-          __entry->sectors_moved)
-);
-
-TRACE_EVENT(copygc_wait,
-    TP_PROTO(struct bch_fs *c,
-         u64 wait_amount, u64 until),
-    TP_ARGS(c, wait_amount, until),
-
-    TP_STRUCT__entry(
-        __field(dev_t,  dev )
-        __field(u64,    wait_amount )
-        __field(u64,    until )
-    ),
-
-    TP_fast_assign(
-        __entry->dev            = c->dev;
-        __entry->wait_amount    = wait_amount;
-        __entry->until          = until;
-    ),
-
-    TP_printk("%d,%u waiting for %llu sectors until %llu",
-          MAJOR(__entry->dev), MINOR(__entry->dev),
-          __entry->wait_amount, __entry->until)
-);
-
-/* btree transactions: */
-
-DECLARE_EVENT_CLASS(transaction_event,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip = caller_ip;
-    ),
-
-    TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, transaction_commit,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_injected,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_split_race,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree *b),
-    TP_ARGS(trans, caller_ip, b),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(u8,             level )
-        __field(u16,            written )
-        __field(u16,            blocks )
-        __field(u16,            u64s_remaining )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip      = caller_ip;
-        __entry->level          = b->c.level;
-        __entry->written        = b->written;
-        __entry->blocks         = btree_blocks(trans->c);
-        __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
-    ),
-
-    TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
-          __entry->trans_fn, (void *) __entry->caller_ip,
-          __entry->level,
-          __entry->written, __entry->blocks,
-          __entry->u64s_remaining)
-);
-
-TRACE_EVENT(trans_blocked_journal_reclaim,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-
-        __field(unsigned long,  key_cache_nr_keys )
-        __field(unsigned long,  key_cache_nr_dirty )
-        __field(long,           must_wait )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip          = caller_ip;
-        __entry->key_cache_nr_keys  = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
-        __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
-        __entry->must_wait          = __bch2_btree_key_cache_must_wait(trans->c);
-    ),
-
-    TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
-          __entry->trans_fn, (void *) __entry->caller_ip,
-          __entry->key_cache_nr_keys,
-          __entry->key_cache_nr_dirty,
-          __entry->must_wait)
-);
-
-#if 0
-/* todo: bring back dynamic fault injection */
-DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-#endif
-
-DEFINE_EVENT(transaction_event, trans_traverse_all,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         const char *paths),
-    TP_ARGS(trans, caller_ip, paths)
-);
-
-DECLARE_EVENT_CLASS(transaction_restart_iter,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(u8,             btree_id )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->btree_id   = path->btree_id;
-        TRACE_BPOS_assign(pos, path->pos)
-    ),
-
-    TP_printk("%s %pS btree %s pos %llu:%llu:%u",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(fs_str, trans_restart_upgrade,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_relock,
-    TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
-    TP_ARGS(trans, caller_ip, str)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path),
-    TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
-    TP_PROTO(struct btree_trans *trans,
-         const char *cycle),
-    TP_ARGS(trans, cycle)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_would_deadlock_write,
-    TP_PROTO(struct btree_trans *trans),
-    TP_ARGS(trans),
-
-    TP_STRUCT__entry(
-        __array(char,   trans_fn, 32 )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-    ),
-
-    TP_printk("%s", __entry->trans_fn)
-);
-
-TRACE_EVENT(trans_restart_mem_realloced,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         unsigned long bytes),
-    TP_ARGS(trans, caller_ip, bytes),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(unsigned long,  bytes )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->bytes      = bytes;
-    ),
-
-    TP_printk("%s %pS bytes %lu",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          __entry->bytes)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(path_downgrade,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_path *path,
-         unsigned old_locks_want),
-    TP_ARGS(trans, caller_ip, path, old_locks_want),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(unsigned,       old_locks_want )
-        __field(unsigned,       new_locks_want )
-        __field(unsigned,       btree )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip      = caller_ip;
-        __entry->old_locks_want = old_locks_want;
-        __entry->new_locks_want = path->locks_want;
-        __entry->btree          = path->btree_id;
-        TRACE_BPOS_assign(pos, path->pos);
-    ),
-
-    TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          __entry->old_locks_want,
-          __entry->new_locks_want,
-          bch2_btree_id_str(__entry->btree),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot)
-);
-
-TRACE_EVENT(key_cache_fill,
-    TP_PROTO(struct btree_trans *trans, const char *key),
-    TP_ARGS(trans, key),
-
-    TP_STRUCT__entry(
-        __array(char,   trans_fn, 32 )
-        __string(key,   key )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __assign_str(key);
-    ),
-
-    TP_printk("%s %s", __entry->trans_fn, __get_str(key))
-);
-
-TRACE_EVENT(write_buffer_flush,
-    TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
-    TP_ARGS(trans, nr, skipped, fast, size),
-
-    TP_STRUCT__entry(
-        __field(size_t, nr )
-        __field(size_t, skipped )
-        __field(size_t, fast )
-        __field(size_t, size )
-    ),
-
-    TP_fast_assign(
-        __entry->nr         = nr;
-        __entry->skipped    = skipped;
-        __entry->fast       = fast;
-        __entry->size       = size;
-    ),
-
-    TP_printk("%zu/%zu skipped %zu fast %zu",
-          __entry->nr, __entry->size, __entry->skipped, __entry->fast)
-);
-
-TRACE_EVENT(write_buffer_flush_sync,
-    TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
-    TP_ARGS(trans, caller_ip),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip = caller_ip;
-    ),
-
-    TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-TRACE_EVENT(write_buffer_flush_slowpath,
-    TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
-    TP_ARGS(trans, slowpath, total),
-
-    TP_STRUCT__entry(
-        __field(size_t, slowpath )
-        __field(size_t, total )
-    ),
-
-    TP_fast_assign(
-        __entry->slowpath   = slowpath;
-        __entry->total      = total;
-    ),
-
-    TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
-);
-
-TRACE_EVENT(write_buffer_maybe_flush,
-    TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
-    TP_ARGS(trans, caller_ip, key),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __string(key,           key )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip = caller_ip;
-        __assign_str(key);
-    ),
-
-    TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
-);
-
-DEFINE_EVENT(fs_str, rebalance_extent,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, data_update,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_pred,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, extent_trim_atomic,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_slot,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, __btree_iter_peek,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_max,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
-    TP_PROTO(struct bch_fs *c, const char *str),
-    TP_ARGS(c, str)
-);
-
-#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
-
-TRACE_EVENT(update_by_path,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path,
-         struct btree_insert_entry *i, bool overwrite),
-    TP_ARGS(trans, path, i, overwrite),
-
-    TP_STRUCT__entry(
-        __array(char,               trans_fn, 32 )
-        __field(btree_path_idx_t,   path_idx )
-        __field(u8,                 btree_id )
-        TRACE_BPOS_entries(pos)
-        __field(u8,                 overwrite )
-        __field(btree_path_idx_t,   update_idx )
-        __field(btree_path_idx_t,   nr_updates )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->path_idx   = path - trans->paths;
-        __entry->btree_id   = path->btree_id;
-        TRACE_BPOS_assign(pos, path->pos);
-        __entry->overwrite  = overwrite;
-        __entry->update_idx = i - trans->updates;
-        __entry->nr_updates = trans->nr_updates;
-    ),
-
-    TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
-          __entry->trans_fn,
-          __entry->path_idx,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot,
-          __entry->overwrite,
-          __entry->update_idx,
-          __entry->nr_updates)
-);
-
-TRACE_EVENT(btree_path_lock,
-    TP_PROTO(struct btree_trans *trans,
-         unsigned long caller_ip,
-         struct btree_bkey_cached_common *b),
-    TP_ARGS(trans, caller_ip, b),
-
-    TP_STRUCT__entry(
-        __array(char,           trans_fn, 32 )
-        __field(unsigned long,  caller_ip )
-        __field(u8,             btree_id )
-        __field(u8,             level )
-        __array(char,           node, 24 )
-        __field(u32,            lock_seq )
-    ),
-
-    TP_fast_assign(
-        strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-        __entry->caller_ip  = caller_ip;
-        __entry->btree_id   = b->btree_id;
-        __entry->level      = b->level;
-
-        scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
-        __entry->lock_seq = six_lock_seq(&b->lock);
-    ),
-
-    TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
-          __entry->trans_fn,
-          (void *) __entry->caller_ip,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->level,
-          __entry->node,
-          __entry->lock_seq)
-);
-
-DECLARE_EVENT_CLASS(btree_path_ev,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-    TP_ARGS(trans, path),
-
-    TP_STRUCT__entry(
-        __field(u16,    idx )
-        __field(u8,     ref )
-        __field(u8,     btree_id )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        __entry->idx        = path - trans->paths;
-        __entry->ref        = path->ref;
-        __entry->btree_id   = path->btree_id;
-        TRACE_BPOS_assign(pos, path->pos);
-    ),
-
-    TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
-          __entry->idx, __entry->ref,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-    TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-    TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-    TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_alloc,
-    TP_PROTO(struct btree_trans *trans, struct btree_path *path),
-    TP_ARGS(trans, path),
-
-    TP_STRUCT__entry(
-        __field(btree_path_idx_t,   idx )
-        __field(u8,                 locks_want )
-        __field(u8,                 btree_id )
-        TRACE_BPOS_entries(pos)
-    ),
-
-    TP_fast_assign(
-        __entry->idx        = path - trans->paths;
-        __entry->locks_want = path->locks_want;
-        __entry->btree_id   = path->btree_id;
-        TRACE_BPOS_assign(pos, path->pos);
-    ),
-
-    TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
-          __entry->idx,
-          bch2_btree_id_str(__entry->btree_id),
-          __entry->locks_want,
-          __entry->pos_inode,
-          __entry->pos_offset,
-          __entry->pos_snapshot)
-);
__field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, locks_want ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->locks_want = path->locks_want; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->locks_want, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, new_idx ) - __field(u8, btree_id ) - __field(u8, ref ) - __field(u8, preserve ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->new_idx = new - trans->paths; - __entry->btree_id = path->btree_id; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - TRACE_BPOS_assign(pos, path->pos); - ), - - TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->new_idx) -); - -DEFINE_EVENT(btree_path_clone, btree_path_clone, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DEFINE_EVENT(btree_path_clone, btree_path_save_pos, - TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), - TP_ARGS(trans, path, new) -); - -DECLARE_EVENT_CLASS(btree_path_traverse, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, should_be_locked ) - __field(u8, btree_id ) - __field(u8, level ) - TRACE_BPOS_entries(pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - __entry->level = path->level; - TRACE_BPOS_assign(pos, path->pos); - - __entry->locks_want = path->locks_want; - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, 
sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->trans_fn, - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, - __entry->locks_want, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path), - TP_ARGS(trans, path) -); - -DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, - TP_PROTO(struct btree_trans *trans, struct btree_path *path), - TP_ARGS(trans, path) -); - -TRACE_EVENT(btree_path_set_pos, - TP_PROTO(struct btree_trans *trans, - struct btree_path *path, - struct bpos *new_pos), - TP_ARGS(trans, path, new_pos), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, ref ) - __field(u8, preserve ) - __field(u8, btree_id ) - TRACE_BPOS_entries(old_pos) - TRACE_BPOS_entries(new_pos) - __field(u8, locks_want ) - __field(u8, nodes_locked ) - __array(char, node0, 24 ) - __array(char, node1, 24 ) - __array(char, node2, 24 ) - __array(char, node3, 24 ) - ), - - TP_fast_assign( - __entry->idx = path - trans->paths; - __entry->ref = path->ref; - __entry->preserve = path->preserve; - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(old_pos, path->pos); - TRACE_BPOS_assign(new_pos, *new_pos); - - __entry->nodes_locked = path->nodes_locked; - struct btree *b = path->l[0].b; - if (IS_ERR(b)) - strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); - b = path->l[1].b; - if (IS_ERR(b)) - strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); - b = path->l[2].b; - if (IS_ERR(b)) - strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); - b = path->l[3].b; - if (IS_ERR(b)) - strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); - else - scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); - ), - - TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" - "locks %u %u %u %u node %s %s %s %s", - __entry->idx, - __entry->ref, - __entry->preserve, - bch2_btree_id_str(__entry->btree_id), - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - (__entry->nodes_locked >> 6) & 3, - (__entry->nodes_locked >> 4) & 3, - (__entry->nodes_locked >> 2) & 3, - (__entry->nodes_locked >> 0) & 3, - __entry->node3, - __entry->node2, - __entry->node1, - __entry->node0) -); - -TRACE_EVENT(btree_path_free, - TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), - TP_ARGS(trans, path, dup), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - __field(u8, preserve ) - __field(u8, 
should_be_locked) - __field(s8, dup ) - __field(u8, dup_locked ) - ), - - TP_fast_assign( - __entry->idx = path; - __entry->preserve = trans->paths[path].preserve; - __entry->should_be_locked = trans->paths[path].should_be_locked; - __entry->dup = dup ? dup - trans->paths : -1; - __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; - ), - - TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, - __entry->preserve ? 'P' : ' ', - __entry->should_be_locked ? 'S' : ' ', - __entry->dup, - __entry->dup_locked) -); - -#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ -#ifndef _TRACE_BCACHEFS_H - -static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct btree_insert_entry *i, bool overwrite) {} -static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} -static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} -static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} -static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} -static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} - -#endif -#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ - -#define _TRACE_BCACHEFS_H -#endif /* _TRACE_BCACHEFS_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../fs/bcachefs - -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -#include diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c deleted file mode 100644 index 9764c2e6a91026..00000000000000 --- a/fs/bcachefs/two_state_shared_lock.c +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "two_state_shared_lock.h" - -void __bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); -} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h deleted file mode 100644 index 7f647846b511fb..00000000000000 --- a/fs/bcachefs/two_state_shared_lock.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_TWO_STATE_LOCK_H -#define _BCACHEFS_TWO_STATE_LOCK_H - -#include -#include -#include - -#include "util.h" - -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -typedef struct { - atomic_long_t v; - wait_queue_head_t wait; -} two_state_lock_t; - -static inline void two_state_lock_init(two_state_lock_t *lock) -{ - 
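The tracepoint header removed above follows the kernel's standard TRACE_EVENT pattern, and its fs_str event class takes an already-formatted string. A sketch of a hypothetical call site, assuming the trace_<name>_enabled() helpers that the tracepoint machinery generates for every event and bcachefs's printbuf API, so the formatting cost is only paid while the event is live:

    /* Hypothetical call site for one of the fs_str events above. */
    static void note_rebalance_extent(struct bch_fs *c, struct bkey_s_c k)
    {
        if (trace_rebalance_extent_enabled()) {
            struct printbuf buf = PRINTBUF;

            bch2_bkey_val_to_text(&buf, c, k);
            trace_rebalance_extent(c, buf.buf);
            printbuf_exit(&buf);
        }
    }
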
atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - - EBUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - long old; - - old = atomic_long_read(&lock->v); - do { - if (i > 0 ? old < 0 : old > 0) - return false; - } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i)); - - return true; -} - -void __bch2_two_state_lock(two_state_lock_t *, int); - -static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - if (!bch2_two_state_trylock(lock, s)) - __bch2_two_state_lock(lock, s); -} - -#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c deleted file mode 100644 index df9a6071fe186b..00000000000000 --- a/fs/bcachefs/util.c +++ /dev/null @@ -1,1047 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * random utility code, for bcache but in theory not specific to bcache - * - * Copyright 2010, 2011 Kent Overstreet - * Copyright 2012 Google, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eytzinger.h" -#include "mean_and_variance.h" -#include "util.h" - -static const char si_units[] = "?kMGTPEZY"; - -/* string_get_size units: */ -static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" -}; -static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -}; - -static int parse_u64(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0; - - if (!isdigit(*cp)) - return -EINVAL; - - do { - if (v > U64_MAX / 10) - return -ERANGE; - v *= 10; - if (v > U64_MAX - (*cp - '0')) - return -ERANGE; - v += *cp - '0'; - cp++; - } while (isdigit(*cp)); - - *res = v; - return cp - start; -} - -static int bch2_pow(u64 n, u64 p, u64 *res) -{ - *res = 1; - - while (p--) { - if (*res > div64_u64(U64_MAX, n)) - return -ERANGE; - *res *= n; - } - return 0; -} - -static int parse_unit_suffix(const char *cp, u64 *res) -{ - const char *start = cp; - u64 base = 1024; - unsigned u; - int ret; - - if (*cp == ' ') - cp++; - - for (u = 1; u < strlen(si_units); u++) - if (*cp == si_units[u]) { - cp++; - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_2); u++) - if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { - cp += strlen(units_2[u]); - goto got_unit; - } - - for (u = 0; u < ARRAY_SIZE(units_10); u++) - if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { - cp += strlen(units_10[u]); - base = 1000; - goto got_unit; - } - - *res = 1; - return 0; -got_unit: - ret = bch2_pow(base, u, res); - if (ret) - return ret; - - return cp - start; -} - -#define parse_or_ret(cp, _f) \ -do { \ - int _ret = _f; \ - if (_ret < 0) \ - return _ret; \ - cp += _ret; \ -} while (0) - -static int __bch2_strtou64_h(const char *cp, u64 *res) -{ - const char *start = cp; - u64 v = 0, b, f_n = 0, f_d = 1; - int ret; - - parse_or_ret(cp, parse_u64(cp, &v)); - - if (*cp == '.') { - cp++; - ret = parse_u64(cp, &f_n); - if (ret < 0) - return ret; - cp += ret; - - ret = bch2_pow(10, ret, &f_d); - if (ret) - return ret; - } - - parse_or_ret(cp, parse_unit_suffix(cp, &b)); - - if (v > div64_u64(U64_MAX, b)) - return -ERANGE; - v *= b; - - if (f_n > div64_u64(U64_MAX, 
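The two-state lock deleted above generalizes an rwsem's read side: holders of the same state share the lock, holders of the opposite state exclude each other, and the sign of the atomic counter records which side currently holds it. A minimal usage sketch (the two worker functions are hypothetical; two_state_lock_init() is assumed to run at setup):

    static two_state_lock_t lock;

    static void state0_work(void)
    {
        bch2_two_state_lock(&lock, 0);      /* counter goes negative */
        /* any number of state-0 holders run here concurrently */
        bch2_two_state_unlock(&lock, 0);
    }

    static void state1_work(void)
    {
        bch2_two_state_lock(&lock, 1);      /* counter goes positive;
                                               waits out state-0 holders */
        bch2_two_state_unlock(&lock, 1);
    }
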
b)) - return -ERANGE; - - f_n = div64_u64(f_n * b, f_d); - if (v + f_n < v) - return -ERANGE; - v += f_n; - - *res = v; - return cp - start; -} - -static int __bch2_strtoh(const char *cp, u64 *res, - u64 t_max, bool t_signed) -{ - bool positive = *cp != '-'; - u64 v = 0; - - if (*cp == '+' || *cp == '-') - cp++; - - parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); - - if (*cp == '\n') - cp++; - if (*cp) - return -EINVAL; - - if (positive) { - if (v > t_max) - return -ERANGE; - } else { - if (v && !t_signed) - return -ERANGE; - - if (v > t_max + 1) - return -ERANGE; - v = -v; - } - - *res = v; - return 0; -} - -#define STRTO_H(name, type) \ -int bch2_ ## name ## _h(const char *cp, type *res) \ -{ \ - u64 v = 0; \ - int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ - ANYSINT_MAX(type) != ((type) ~0ULL)); \ - *res = v; \ - return ret; \ -} - -STRTO_H(strtoint, int) -STRTO_H(strtouint, unsigned int) -STRTO_H(strtoll, long long) -STRTO_H(strtoull, unsigned long long) -STRTO_H(strtou64, u64) - -u64 bch2_read_flag_list(const char *opt, const char * const list[]) -{ - u64 ret = 0; - char *p, *s, *d = kstrdup(opt, GFP_KERNEL); - - if (!d) - return -ENOMEM; - - s = strim(d); - - while ((p = strsep(&s, ",;"))) { - int flag = match_string(list, -1, p); - - if (flag < 0) { - ret = -1; - break; - } - - ret |= BIT_ULL(flag); - } - - kfree(d); - - return ret; -} - -bool bch2_is_zero(const void *_p, size_t n) -{ - const char *p = _p; - size_t i; - - for (i = 0; i < n; i++) - if (p[i]) - return false; - return true; -} - -void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) -{ - while (nr_bits) - prt_char(out, '0' + ((v >> --nr_bits) & 1)); -} - -void bch2_prt_u64_base2(struct printbuf *out, u64 v) -{ - bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); -} - -static bool string_is_spaces(const char *str) -{ - while (*str) { - if (*str != ' ') - return false; - str++; - } - return true; -} - -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - bool locked = false; - const char *p; - - if (!lines) { - printk("%s (null)\n", prefix); - return; - } - - locked = console_trylock(); - - while (*lines) { - p = strchrnul(lines, '\n'); - if (!*p && string_is_spaces(lines)) - break; - - printk("%s%.*s\n", prefix, (int) (p - lines), lines); - if (!*p) - break; - lines = p + 1; - } - if (locked) - console_unlock(); -} - -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, - gfp_t gfp) -{ -#ifdef CONFIG_STACKTRACE - unsigned nr_entries = 0; - - stack->nr = 0; - int ret = darray_make_room_gfp(stack, 32, gfp); - if (ret) - return ret; - - if (!down_read_trylock(&task->signal->exec_update_lock)) - return -1; - - do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); - } while (nr_entries == stack->size && - !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); - - stack->nr = nr_entries; - up_read(&task->signal->exec_update_lock); - - return ret; -#else - return 0; -#endif -} - -void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) -{ - darray_for_each(*stack, i) { - prt_printf(out, "[<0>] %pB", (void *) *i); - prt_newline(out); - } -} - -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) -{ - bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); - - bch2_prt_backtrace(out, &stack); - darray_exit(&stack); - return ret; -} - -#ifndef __KERNEL__ -#include -void bch2_prt_datetime(struct printbuf *out, 
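The strtoh family above parses human-readable sizes: an integer, an optional decimal fraction, and an optional unit suffix, all multiplied out with overflow checks. A few inputs worked through the parser as written (return value is 0 on success, negative errno on failure):

    u64 v;

    bch2_strtou64_h("512", &v);     /* v = 512                          */
    bch2_strtou64_h("64k", &v);     /* bare SI letters are base 1024:
                                       v = 64 << 10 = 65536             */
    bch2_strtou64_h("1KiB", &v);    /* v = 1024                         */
    bch2_strtou64_h("1.5k", &v);    /* fraction is scaled by the unit:
                                       v = 1024 + (5 * 1024) / 10
                                         = 1536                         */
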
time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - strim(buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); -} - -static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = bch2_pick_time_units(ns); - - prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); -} - -static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) -{ - prt_printf(out, "%s\t", name); - bch2_pr_time_units_aligned(out, ns); - prt_newline(out); -} - -#define TABSTOP_SIZE 12 - -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) -{ - struct quantiles *quantiles = time_stats_to_quantiles(stats); - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - /* - * avoid divide by zero - */ - if (stats->freq_stats.n) { - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } - - printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:\t%llu\n", stats->duration_stats.n); - printbuf_tabstop_pop(out); - - printbuf_tabstops_reset(out); - - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - printbuf_tabstop_push(out, 0); - printbuf_tabstop_push(out, TABSTOP_SIZE + 2); - - prt_printf(out, "\tsince mount\r\trecent\r\n"); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, out->indent + 20); - printbuf_tabstop_push(out, TABSTOP_SIZE); - printbuf_tabstop_push(out, 2); - printbuf_tabstop_push(out, TABSTOP_SIZE); - - prt_printf(out, "duration of events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_duration); - pr_name_and_units(out, "max:", stats->max_duration); - pr_name_and_units(out, "total:", stats->total_duration); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, d_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, d_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - prt_printf(out, "time between events\n"); - printbuf_indent_add(out, 2); - - pr_name_and_units(out, "min:", stats->min_freq); - pr_name_and_units(out, "max:", stats->max_freq); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, f_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, f_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, 
mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - - printbuf_indent_sub(out, 2); - prt_newline(out); - - printbuf_tabstops_reset(out); - - if (quantiles) { - int i = eytzinger0_first(NR_QUANTILES); - const struct time_unit *u = - bch2_pick_time_units(quantiles->entries[i].m); - u64 last_q = 0; - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(j, NR_QUANTILES) { - bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - - u64 q = max(quantiles->entries[j].m, last_q); - prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; - } - } -} - -/* ratelimit: */ - -/** - * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * @d: the struct bch_ratelimit to update - * Returns: the amount of time to delay by, in jiffies - */ -u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -{ - u64 now = local_clock(); - - return time_after64(d->next, now) - ? nsecs_to_jiffies(d->next - now) - : 0; -} - -/** - * bch2_ratelimit_increment() - increment @d by the amount of work done - * @d: the struct bch_ratelimit to update - * @done: the amount of work done, in arbitrary units - */ -void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -{ - u64 now = local_clock(); - - d->next += div_u64(done * NSEC_PER_SEC, d->rate); - - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; - - if (time_after64(now - NSEC_PER_SEC * 2, d->next)) - d->next = now - NSEC_PER_SEC * 2; -} - -/* pd controller: */ - -/* - * Updates pd_controller. Attempts to scale inputed values to units per second. - * @target: desired value - * @actual: current value - * - * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing - * it makes actual go down. 
- */ -void bch2_pd_controller_update(struct bch_pd_controller *pd, - s64 target, s64 actual, int sign) -{ - s64 proportional, derivative, change; - - unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; - - if (seconds_since_update == 0) - return; - - pd->last_update = jiffies; - - proportional = actual - target; - proportional *= seconds_since_update; - proportional = div_s64(proportional, pd->p_term_inverse); - - derivative = actual - pd->last_actual; - derivative = div_s64(derivative, seconds_since_update); - derivative = ewma_add(pd->smoothed_derivative, derivative, - (pd->d_term / seconds_since_update) ?: 1); - derivative = derivative * pd->d_term; - derivative = div_s64(derivative, pd->p_term_inverse); - - change = proportional + derivative; - - /* Don't increase rate if not keeping up */ - if (change > 0 && - pd->backpressure && - time_after64(local_clock(), - pd->rate.next + NSEC_PER_MSEC)) - change = 0; - - change *= (sign * -1); - - pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, - 1, UINT_MAX); - - pd->last_actual = actual; - pd->last_derivative = derivative; - pd->last_proportional = proportional; - pd->last_change = change; - pd->last_target = target; -} - -void bch2_pd_controller_init(struct bch_pd_controller *pd) -{ - pd->rate.rate = 1024; - pd->last_update = jiffies; - pd->p_term_inverse = 6000; - pd->d_term = 30; - pd->d_smooth = pd->d_term; - pd->backpressure = 1; -} - -void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) -{ - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); - - prt_printf(out, "rate:\t"); - prt_human_readable_s64(out, pd->rate.rate); - prt_newline(out); - - prt_printf(out, "target:\t"); - prt_human_readable_u64(out, pd->last_target); - prt_newline(out); - - prt_printf(out, "actual:\t"); - prt_human_readable_u64(out, pd->last_actual); - prt_newline(out); - - prt_printf(out, "proportional:\t"); - prt_human_readable_s64(out, pd->last_proportional); - prt_newline(out); - - prt_printf(out, "derivative:\t"); - prt_human_readable_s64(out, pd->last_derivative); - prt_newline(out); - - prt_printf(out, "change:\t"); - prt_human_readable_s64(out, pd->last_change); - prt_newline(out); - - prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); -} - -/* misc: */ - -void bch2_bio_map(struct bio *bio, void *base, size_t size) -{ - while (size) { - struct page *page = is_vmalloc_addr(base) - ? 
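The PD controller and ratelimit above are designed to be paired: a periodic bch2_pd_controller_update() steers rate.rate toward holding the measured value at the target, while the worker converts that rate into sleep time. A sketch of the intended loop, with do_some_work() hypothetical; sign is -1 here on the assumption that doing more work drives the measured value down:

    static int example_worker(void *arg)
    {
        struct bch_pd_controller *pd = arg;

        while (!kthread_should_stop()) {
            u64 done = do_some_work();          /* hypothetical */

            bch2_ratelimit_increment(&pd->rate, done);

            u64 delay = bch2_ratelimit_delay(&pd->rate);
            if (delay)
                schedule_timeout_interruptible(delay);
        }
        return 0;
    }

    /* elsewhere, periodically: */
    bch2_pd_controller_update(pd, target, actual, -1);
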
vmalloc_to_page(base) - : virt_to_page(base); - unsigned offset = offset_in_page(base); - unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - - BUG_ON(!bio_add_page(bio, page, len, offset)); - size -= len; - base += len; - } -} - -int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -{ - while (size) { - struct page *page = alloc_pages(gfp_mask, 0); - unsigned len = min_t(size_t, PAGE_SIZE, size); - - if (!page) - return -ENOMEM; - - if (unlikely(!bio_add_page(bio, page, len, 0))) { - __free_page(page); - break; - } - - size -= len; - } - - return 0; -} - -u64 bch2_get_random_u64_below(u64 ceil) -{ - if (ceil <= U32_MAX) - return __get_random_u32_below(ceil); - - /* this is the same (clever) algorithm as in __get_random_u32_below() */ - u64 rand = get_random_u64(); - u64 mult = ceil * rand; - - if (unlikely(mult < ceil)) { - u64 bound; - div64_u64_rem(-ceil, ceil, &bound); - while (unlikely(mult < bound)) { - rand = get_random_u64(); - mult = ceil * rand; - } - } - - return mul_u64_u64_shr(ceil, rand, 64); -} - -void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, dst, iter, dst_iter) { - void *dstp = kmap_local_page(bv.bv_page); - - memcpy(dstp + bv.bv_offset, src, bv.bv_len); - kunmap_local(dstp); - - src += bv.bv_len; - } -} - -void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -{ - struct bio_vec bv; - struct bvec_iter iter; - - __bio_for_each_segment(bv, src, iter, src_iter) { - void *srcp = kmap_local_page(bv.bv_page); - - memcpy(dst, srcp + bv.bv_offset, bv.bv_len); - kunmap_local(srcp); - - dst += bv.bv_len; - } -} - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *bio) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); - - bio_for_each_segment(bv, bio, iter) { - unsigned u64s = bv.bv_len / sizeof(u64); - - if (offset < u64s) { - u64 *segment = bvec_kmap_local(&bv); - segment[offset] = get_random_u64(); - kunmap_local(segment); - return; - } - offset -= u64s; - } -} -#endif - -void bch2_bio_to_text(struct printbuf *out, struct bio *bio) -{ - prt_printf(out, "bi_remaining:\t%u\n", - atomic_read(&bio->__bi_remaining)); - prt_printf(out, "bi_end_io:\t%ps\n", - bio->bi_end_io); - prt_printf(out, "bi_status:\t%u\n", - bio->bi_status); -} - -#if 0 -void eytzinger1_test(void) -{ - unsigned inorder, size; - - pr_info("1 based eytzinger test:\n"); - - for (size = 2; - size < 65536; - size++) { - unsigned extra = eytzinger1_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 1; - eytzinger1_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); - BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger1_last(size) && - eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); - - inorder++; - } - BUG_ON(inorder - 1 != size); - } -} - -void eytzinger0_test(void) -{ - - unsigned inorder, size; - - pr_info("0 based eytzinger test:\n"); - - for (size = 1; - size < 65536; - size++) { - unsigned extra = eytzinger0_extra(size); - - if (!(size % 4096)) - pr_info("tree size %u\n", size); - - inorder = 0; - eytzinger0_for_each(eytz, size) { - BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); - BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); - BUG_ON(eytz != eytzinger0_last(size) && - eytzinger0_prev(eytzinger0_next(eytz, size), 
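bch2_get_random_u64_below() above is the 64-bit version of Lemire's multiply-shift technique: the high half of ceil * rand is uniform in [0, ceil) once low halves below 2^64 mod ceil are rejected. The 32-bit analogue, written out as a sketch (ceil must be nonzero):

    static u32 bounded_u32_sketch(u32 ceil)
    {
        u64 mult = (u64) ceil * get_random_u32();

        /* only compute 2^32 % ceil if rejection is even possible */
        if ((u32) mult < ceil) {
            u32 bound = -ceil % ceil;           /* == 2^32 % ceil */

            while ((u32) mult < bound)
                mult = (u64) ceil * get_random_u32();
        }
        return mult >> 32;                      /* uniform in [0, ceil) */
    }
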
size) != eytz); - - inorder++; - } - BUG_ON(inorder != size); - - inorder = size - 1; - eytzinger0_for_each_prev(eytz, size) { - BUG_ON(eytz != eytzinger0_first(size) && - eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); - - inorder--; - } - BUG_ON(inorder != -1); - } -} - -static inline int cmp_u16(const void *_l, const void *_r) -{ - const u16 *l = _l, *r = _r; - - return (*l > *r) - (*r > *l); -} - -static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] > search) { - bad = true; - } else { - s = eytzinger0_next(r, nr); - bad = s >= 0 && test_array[s] <= search; - } - } else { - s = eytzinger0_last(nr); - bad = s >= 0 && test_array[s] <= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each_prev(j, nr) { - if (test_array[j] <= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_le(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_gt(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] <= search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] > search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] > search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] > search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_gt(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) -{ - int r, s; - bool bad; - - r = eytzinger0_find_ge(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - if (r >= 0) { - if (test_array[r] < search) { - bad = true; - } else { - s = eytzinger0_prev(r, nr); - bad = s >= 0 && test_array[s] >= search; - } - } else { - s = eytzinger0_first(nr); - bad = s >= 0 && test_array[s] >= search; - } - - if (bad) { - s = -1; - eytzinger0_for_each(j, nr) { - if (test_array[j] >= search) { - s = j; - break; - } - } - - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find_ge(%12u) = %3i should be %3i\n", - search, r, s); - BUG(); - } -} - -static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) -{ - unsigned r; - int s; - bool bad; - - r = eytzinger0_find(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - - if (r < nr) { - bad = test_array[r] != search; - } else { - s = eytzinger0_find_le(test_array, nr, - sizeof(test_array[0]), - cmp_u16, &search); - bad = s >= 0 && test_array[s] == search; - } - - if (bad) { - eytzinger0_for_each(j, nr) - pr_info("[%3u] = %12u\n", j, test_array[j]); - pr_info("find(%12u) = %3i is incorrect\n", - search, r); - BUG(); - } -} - -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -{ - eytzinger0_find_test_le(test_array, nr, search); - eytzinger0_find_test_gt(test_array, nr, search); - eytzinger0_find_test_ge(test_array, nr, search); - eytzinger0_find_test_eq(test_array, nr, search); -} - -void eytzinger0_find_test(void) -{ - unsigned i, nr, allocated = 1 << 12; - u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), 
GFP_KERNEL); - - for (nr = 1; nr < allocated; nr++) { - u16 prev = 0; - - pr_info("testing %u elems\n", nr); - - get_random_bytes(test_array, nr * sizeof(test_array[0])); - eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); - - /* verify array is sorted correctly: */ - eytzinger0_for_each(j, nr) { - BUG_ON(test_array[j] < prev); - prev = test_array[j]; - } - - for (i = 0; i < U16_MAX; i += 1 << 12) - eytzinger0_find_test_val(test_array, nr, i); - - for (i = 0; i < nr; i++) { - eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); - eytzinger0_find_test_val(test_array, nr, test_array[i]); - eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); - } - } - - kfree(test_array); -} -#endif - -/* - * Accumulate percpu counters onto one cpu's copy - only valid when access - * against any percpu counter is guarded against - */ -u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -{ - u64 *ret; - int cpu; - - /* access to pcpu vars has to be blocked by other locking */ - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); - - for_each_possible_cpu(cpu) { - u64 *i = per_cpu_ptr(p, cpu); - - if (i != ret) { - acc_u64s(ret, i, nr); - memset(i, 0, nr * sizeof(u64)); - } - } - - return ret; -} - -void bch2_darray_str_exit(darray_const_str *d) -{ - darray_for_each(*d, i) - kfree(*i); - darray_exit(d); -} - -int bch2_split_devs(const char *_dev_name, darray_const_str *ret) -{ - darray_init(ret); - - char *dev_name, *s, *orig; - - dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return -ENOMEM; - - while ((s = strsep(&dev_name, ":"))) { - char *p = kstrdup(s, GFP_KERNEL); - if (!p) - goto err; - - if (darray_push(ret, p)) { - kfree(p); - goto err; - } - } - - kfree(orig); - return 0; -err: - bch2_darray_str_exit(ret); - kfree(orig); - return -ENOMEM; -} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h deleted file mode 100644 index 6488f098d1407e..00000000000000 --- a/fs/bcachefs/util.h +++ /dev/null @@ -1,782 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_UTIL_H -#define _BCACHEFS_UTIL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mean_and_variance.h" - -#include "darray.h" -#include "time_stats.h" - -struct closure; - -#ifdef CONFIG_BCACHEFS_DEBUG -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define CPU_BIG_ENDIAN 0 -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define CPU_BIG_ENDIAN 1 -#endif - -/* type hackery */ - -#define type_is_exact(_val, _type) \ - __builtin_types_compatible_p(typeof(_val), _type) - -#define type_is(_val, _type) \ - (__builtin_types_compatible_p(typeof(_val), _type) || \ - __builtin_types_compatible_p(typeof(_val), const _type)) - -/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -static inline size_t buf_pages(void *p, size_t len) -{ - return DIV_ROUND_UP(len + - ((unsigned long) p & (PAGE_SIZE - 1)), - PAGE_SIZE); -} - -static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) -{ - void *p = unlikely(n >= INT_MAX) - ? vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, n); - return p; -} -#define bch2_kvmalloc(...) 
alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) - -#define init_heap(heap, _size, gfp) \ -({ \ - (heap)->nr = 0; \ - (heap)->size = (_size); \ - (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ - (gfp)); \ -}) - -#define free_heap(heap) \ -do { \ - kvfree((heap)->data); \ - (heap)->data = NULL; \ -} while (0) - -#define ANYSINT_MAX(t) \ - ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) - -#include "printbuf.h" - -#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) -#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) -#define printbuf_str(_buf) bch2_printbuf_str(_buf) -#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) - -#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) -#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) -#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) - -#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) -#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) -#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) - -#define prt_newline(_out) bch2_prt_newline(_out) -#define prt_tab(_out) bch2_prt_tab(_out) -#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) - -#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) -#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) -#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) -#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) -#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) -#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) -#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) -#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) -#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__) - -void bch2_pr_time_units(struct printbuf *, u64); -void bch2_prt_datetime(struct printbuf *, time64_t); - -#ifdef __KERNEL__ -static inline void uuid_unparse_lower(u8 *uuid, char *out) -{ - sprintf(out, "%pUb", uuid); -} -#else -#include -#endif - -static inline void pr_uuid(struct printbuf *out, u8 *uuid) -{ - char uuid_str[40]; - - uuid_unparse_lower(uuid, uuid_str); - prt_printf(out, "%s", uuid_str); -} - -int bch2_strtoint_h(const char *, int *); -int bch2_strtouint_h(const char *, unsigned int *); -int bch2_strtoll_h(const char *, long long *); -int bch2_strtoull_h(const char *, unsigned long long *); -int bch2_strtou64_h(const char *, u64 *); - -static inline int bch2_strtol_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtoint_h(cp, (int *) res); -#else - return bch2_strtoll_h(cp, (long long *) res); -#endif -} - -static inline int bch2_strtoul_h(const char *cp, long *res) -{ -#if BITS_PER_LONG == 32 - return bch2_strtouint_h(cp, (unsigned int *) res); -#else - return bch2_strtoull_h(cp, (unsigned long long *) res); -#endif -} - -#define strtoi_h(cp, res) \ - ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ - : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ - : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ - : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ - : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ - : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ - : -EINVAL) - -#define strtoul_safe(cp, var) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = _v; \ - _r; \ -}) - -#define strtoul_safe_clamp(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r) \ - var = clamp_t(typeof(var), _v, min, max); \ - _r; \ -}) - -#define strtoul_safe_restrict(cp, var, min, max) \ -({ \ - unsigned long _v; \ - int _r = kstrtoul(cp, 10, &_v); \ - if (!_r && _v >= min && _v <= max) \ - var = _v; \ - else \ - _r = -EINVAL; \ - _r; \ -}) - -#define snprint(out, var) \ - prt_printf(out, \ - type_is(var, int) ? "%i\n" \ - : type_is(var, unsigned) ? "%u\n" \ - : type_is(var, long) ? "%li\n" \ - : type_is(var, unsigned long) ? "%lu\n" \ - : type_is(var, s64) ? "%lli\n" \ - : type_is(var, u64) ? "%llu\n" \ - : type_is(var, char *) ? "%s\n" \ - : "%i\n", var) - -bool bch2_is_zero(const void *, size_t); - -u64 bch2_read_flag_list(const char *, const char * const[]); - -void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); -void bch2_prt_u64_base2(struct printbuf *, u64); - -void bch2_print_string_as_lines(const char *, const char *); - -typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); -void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); - -static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) -{ -#ifdef __KERNEL__ - prt_printf(out, "%pg", bdev); -#else - prt_str(out, bdev->name); -#endif -} - -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); - -#define ewma_add(ewma, val, weight) \ -({ \ - typeof(ewma) _ewma = (ewma); \ - typeof(weight) _weight = (weight); \ - \ - (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -}) - -struct bch_ratelimit { - /* Next time we want to do some work, in nanoseconds */ - u64 next; - - /* - * Rate at which we want to do work, in units per nanosecond - * The units here correspond to the units passed to - * bch2_ratelimit_increment() - */ - unsigned rate; -}; - -static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -{ - d->next = local_clock(); -} - -u64 bch2_ratelimit_delay(struct bch_ratelimit *); -void bch2_ratelimit_increment(struct bch_ratelimit *, u64); - -struct bch_pd_controller { - struct bch_ratelimit rate; - unsigned long last_update; - - s64 last_actual; - s64 smoothed_derivative; - - unsigned p_term_inverse; - unsigned d_smooth; - unsigned d_term; - - /* for exporting to sysfs (no effect on behavior) */ - s64 last_derivative; - s64 last_proportional; - s64 last_change; - s64 last_target; - - /* - * If true, the rate will not increase if bch2_ratelimit_delay() - * is not being called often enough. 
- */ - bool backpressure; -}; - -void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -void bch2_pd_controller_init(struct bch_pd_controller *); -void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); - -#define sysfs_pd_controller_attribute(name) \ - rw_attribute(name##_rate); \ - rw_attribute(name##_rate_bytes); \ - rw_attribute(name##_rate_d_term); \ - rw_attribute(name##_rate_p_term_inverse); \ - read_attribute(name##_rate_debug) - -#define sysfs_pd_controller_files(name) \ - &sysfs_##name##_rate, \ - &sysfs_##name##_rate_bytes, \ - &sysfs_##name##_rate_d_term, \ - &sysfs_##name##_rate_p_term_inverse, \ - &sysfs_##name##_rate_debug - -#define sysfs_pd_controller_show(name, var) \ -do { \ - sysfs_hprint(name##_rate, (var)->rate.rate); \ - sysfs_print(name##_rate_bytes, (var)->rate.rate); \ - sysfs_print(name##_rate_d_term, (var)->d_term); \ - sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ - \ - if (attr == &sysfs_##name##_rate_debug) \ - bch2_pd_controller_debug_to_text(out, var); \ -} while (0) - -#define sysfs_pd_controller_store(name, var) \ -do { \ - sysfs_strtoul_clamp(name##_rate, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul_clamp(name##_rate_bytes, \ - (var)->rate.rate, 1, UINT_MAX); \ - sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ - sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ - (var)->p_term_inverse, 1, INT_MAX); \ -} while (0) - -#define container_of_or_null(ptr, type, member) \ -({ \ - typeof(ptr) _ptr = ptr; \ - _ptr ? container_of(_ptr, type, member) : NULL; \ -}) - -static inline struct list_head *list_pop(struct list_head *head) -{ - if (list_empty(head)) - return NULL; - - struct list_head *ret = head->next; - list_del_init(ret); - return ret; -} - -#define list_pop_entry(head, type, member) \ - container_of_or_null(list_pop(head), type, member) - -/* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -{ - unsigned fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; - - return x; -} - -void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); - -#define closure_bio_submit(bio, cl) \ -do { \ - closure_get(cl); \ - submit_bio(bio); \ -} while (0) - -#define kthread_wait(cond) \ -({ \ - int _ret = 0; \ - \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -#define kthread_wait_freezable(cond) \ -({ \ - int _ret = 0; \ - while (1) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (kthread_should_stop()) { \ - _ret = -1; \ - break; \ - } \ - \ - if (cond) \ - break; \ - \ - schedule(); \ - try_to_freeze(); \ - } \ - set_current_state(TASK_RUNNING); \ - _ret; \ -}) - -u64 bch2_get_random_u64_below(u64); - -void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -void memcpy_from_bio(void *, struct bio *, struct bvec_iter); - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_corrupt_bio(struct bio *); - -static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) -{ - if (ratio && !get_random_u32_below(ratio)) - bch2_corrupt_bio(bio); -} -#else -#define bch2_maybe_corrupt_bio(...) 
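fract_exp_two() above approximates 2^(x / 2^fract_bits) by linear interpolation between adjacent powers of two. Tracing it with fract_bits = 3 and x = 0b10101, which encodes the exponent 2 + 5/8:

    static unsigned fract_exp_two_example(void)
    {
        unsigned x = 0x15, fract_bits = 3;
        unsigned fract = x & ~(~0U << fract_bits);  /* 5            */

        x >>= fract_bits;                           /* 2            */
        x = 1U << x;                                /* 2^2 = 4      */
        x += (x * fract) >> fract_bits;             /* 4 + 20/8 = 6 */

        return x;       /* the exact value would be 2^2.625 ~= 6.17 */
    }
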
do {} while (0) -#endif - -void bch2_bio_to_text(struct printbuf *, struct bio *); - -static inline void memcpy_u64s_small(void *dst, const void *src, - unsigned u64s) -{ - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -} - -static inline void __memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("rep ; movsq" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - u64 *d = dst; - const u64 *s = src; - - while (u64s--) - *d++ = *s++; -#endif -} - -static inline void memcpy_u64s(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(!(dst >= src + u64s * sizeof(u64) || - dst + u64s * sizeof(u64) <= src)); - - __memcpy_u64s(dst, src, u64s); -} - -static inline void __memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - __memcpy_u64s(dst, src, u64s); -} - -static inline void memmove_u64s_down(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down(dst, src, u64s); -} - -static inline void __memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - memcpy_u64s_small(dst, src, u64s); -} - -static inline void memmove_u64s_down_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst > src); - - __memmove_u64s_down_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up_small(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s; - u64 *src = (u64 *) _src + u64s; - - while (u64s--) - *--dst = *--src; -} - -static inline void memmove_u64s_up_small(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up_small(dst, src, u64s); -} - -static inline void __memmove_u64s_up(void *_dst, const void *_src, - unsigned u64s) -{ - u64 *dst = (u64 *) _dst + u64s - 1; - u64 *src = (u64 *) _src + u64s - 1; - -#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) - long d0, d1, d2; - - asm volatile("std ;\n" - "rep ; movsq\n" - "cld ;\n" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (u64s), "1" (dst), "2" (src) - : "memory"); -#else - while (u64s--) - *dst-- = *src--; -#endif -} - -static inline void memmove_u64s_up(void *dst, const void *src, - unsigned u64s) -{ - EBUG_ON(dst < src); - - __memmove_u64s_up(dst, src, u64s); -} - -static inline void memmove_u64s(void *dst, const void *src, - unsigned u64s) -{ - if (dst < src) - __memmove_u64s_down(dst, src, u64s); - else - __memmove_u64s_up(dst, src, u64s); -} - -/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
*/ -static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -{ - unsigned rem = round_up(bytes, sizeof(u64)) - bytes; - - memset(s + bytes, c, rem); -} - -/* just the memmove, doesn't update @_nr */ -#define __array_insert_item(_array, _nr, _pos) \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))) - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - __array_insert_item(_array, _nr, _pos); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - -static inline void __move_gap(void *array, size_t element_size, - size_t nr, size_t size, - size_t old_gap, size_t new_gap) -{ - size_t gap_end = old_gap + size - nr; - - if (new_gap < old_gap) { - size_t move = old_gap - new_gap; - - memmove(array + element_size * (gap_end - move), - array + element_size * (old_gap - move), - element_size * move); - } else if (new_gap > old_gap) { - size_t move = new_gap - old_gap; - - memmove(array + element_size * old_gap, - array + element_size * gap_end, - element_size * move); - } -} - -/* Move the gap in a gap buffer: */ -#define move_gap(_d, _new_gap) \ -do { \ - BUG_ON(_new_gap > (_d)->nr); \ - BUG_ON((_d)->gap > (_d)->nr); \ - \ - __move_gap((_d)->data, sizeof((_d)->data[0]), \ - (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ - (_d)->gap = _new_gap; \ -} while (0) - -#define bubble_sort(_base, _nr, _cmp) \ -do { \ - ssize_t _i, _last; \ - bool _swapped = true; \ - \ - for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\ - _swapped = false; \ - for (_i = 0; _i < _last; _i++) \ - if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ - swap((_base)[_i], (_base)[_i + 1]); \ - _swapped = true; \ - } \ - } \ -} while (0) - -#define per_cpu_sum(_p) \ -({ \ - TYPEOF_UNQUAL(*_p) _ret = 0; \ - \ - int cpu; \ - for_each_possible_cpu(cpu) \ - _ret += *per_cpu_ptr(_p, cpu); \ - _ret; \ -}) - -static inline u64 percpu_u64_get(u64 __percpu *src) -{ - return per_cpu_sum(src); -} - -static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(dst, cpu) = 0; - this_cpu_write(*dst, src); -} - -static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -{ - for (unsigned i = 0; i < nr; i++) - acc[i] += src[i]; -} - -static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, - unsigned nr) -{ - int cpu; - - for_each_possible_cpu(cpu) - acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -} - -static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -{ - int cpu; - - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(p, cpu), c, bytes); -} - -u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); - -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - -static inline int cmp_le32(__le32 l, __le32 r) -{ - return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -} - -#include - -static inline bool qstr_eq(const struct qstr l, const struct qstr r) -{ - return l.len == r.len && !memcmp(l.name, r.name, l.len); -} - -void bch2_darray_str_exit(darray_const_str *); -int bch2_split_devs(const char *, darray_const_str *); - -#ifdef __KERNEL__ - -__must_check -static inline int copy_to_user_errcode(void __user *to, const void *from, 
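move_gap() above implements the classic gap-buffer operation: the unused slots sit at the edit point, so repositioning the gap is a single memmove of only the elements between the old and new gap positions. For example, with nr = 5 elements in a size = 8 buffer:

    [A B . . . C D E]    gap = 2, gap length = size - nr = 3

    move_gap(d, 4) copies C and D left across the gap:

    [A B C D . . . E]    gap = 4

Inserts and deletes at the gap are then O(1) until the gap has to move again.
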
unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - -__must_check -static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) -{ - return copy_from_user(to, from, n) ? -EFAULT : 0; -} - -#endif - -static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) -{ - if (v) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} - -static inline void __set_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline void __clear_bit_le64(size_t bit, __le64 *addr) -{ - addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); -} - -static inline bool test_bit_le64(size_t bit, __le64 *addr) -{ - return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; -} - -static inline void memcpy_swab(void *_dst, void *_src, size_t len) -{ - u8 *dst = _dst + len; - u8 *src = _src; - - while (len--) - *--dst = *src++; -} - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c deleted file mode 100644 index 6620ecae26af3a..00000000000000 --- a/fs/bcachefs/varint.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - -#ifdef CONFIG_VALGRIND -#include -#endif - -#include "errcode.h" -#include "varint.h" - -/** - * bch2_varint_encode - encode a variable length integer - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - */ -int bch2_varint_encode(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - __le64 v_le; - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0 << (bytes - 1)); - v_le = cpu_to_le64(v); - memcpy(out, &v_le, bytes); - } else { - *out++ = 255; - bytes = 9; - put_unaligned_le64(v, out); - } - - return bytes; -} - -/** - * bch2_varint_decode - encode a variable length integer - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - */ -int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) -{ - unsigned bytes = likely(in < end) - ? ffz(*in & 255) + 1 - : 1; - u64 v; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - __le64 v_le = 0; - - memcpy(&v_le, in, bytes); - v = le64_to_cpu(v_le); - v >>= bytes; - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} - -/** - * bch2_varint_encode_fast - fast version of bch2_varint_encode - * @out: destination to encode to - * @v: unsigned integer to encode - * Returns: size in bytes of the encoded integer - at most 9 bytes - * - * This version assumes it's always safe to write 8 bytes to @out, even if the - * encoded integer would be smaller. 
- */ -int bch2_varint_encode_fast(u8 *out, u64 v) -{ - unsigned bits = fls64(v|1); - unsigned bytes = DIV_ROUND_UP(bits, 7); - - if (likely(bytes < 9)) { - v <<= bytes; - v |= ~(~0U << (bytes - 1)); - } else { - *out++ = 255; - bytes = 9; - } - - put_unaligned_le64(v, out); - return bytes; -} - -/** - * bch2_varint_decode_fast - fast version of bch2_varint_decode - * @in: varint to decode - * @end: end of buffer to decode from - * @out: on success, decoded integer - * Returns: size in bytes of the decoded integer - or -1 on failure (would - * have read past the end of the buffer) - * - * This version assumes that it is safe to read at most 8 bytes past the end of - * @end (we still return an error if the varint extends past @end). - */ -int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) -{ -#ifdef CONFIG_VALGRIND - VALGRIND_MAKE_MEM_DEFINED(in, 8); -#endif - u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(*in) + 1; - - if (unlikely(in + bytes > end)) - return -BCH_ERR_varint_decode_error; - - if (likely(bytes < 9)) { - v >>= bytes; - v &= ~(~0ULL << (7 * bytes)); - } else { - v = get_unaligned_le64(++in); - } - - *out = v; - return bytes; -} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h deleted file mode 100644 index 92a182fb3d7aed..00000000000000 --- a/fs/bcachefs/varint.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_VARINT_H -#define _BCACHEFS_VARINT_H - -int bch2_varint_encode(u8 *, u64); -int bch2_varint_decode(const u8 *, const u8 *, u64 *); - -int bch2_varint_encode_fast(u8 *, u64); -int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); - -#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h deleted file mode 100644 index 2ad338e282da82..00000000000000 --- a/fs/bcachefs/vstructs.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _VSTRUCTS_H -#define _VSTRUCTS_H - -#include "util.h" - -/* - * NOTE: we can't differentiate between __le64 and u64 with type_is - this - * assumes u64 is little endian: - */ -#define __vstruct_u64s(_s) \ -({ \ - ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ - : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ - : type_is((_s)->u64s, u16) ? 
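The varint codec deleted above stores the encoded length in the low bits of the first byte: bytes - 1 one-bits terminated by a zero, with the value shifted above that marker (and a 0xff first byte flagging the full nine-byte form for values needing more than 56 bits). Two worked encodings, round-tripped through the functions above:

    /*
     *   v = 100: fls64(100) = 7 bits -> 1 byte
     *            (100 << 1) | 0b0  = 0xc8
     *   v = 300: fls64(300) = 9 bits -> 2 bytes
     *            (300 << 2) | 0b01 = 0x4b1 -> bytes b1 04 (LE)
     */
    u8 buf[9];
    u64 out;
    int n = bch2_varint_encode(buf, 300);    /* n = 2, buf = { 0xb1, 0x04 } */

    bch2_varint_decode(buf, buf + n, &out);  /* ffz(0xb1) + 1 = 2 bytes,
                                                out = 0x4b1 >> 2 = 300 */
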
le16_to_cpu((__force __le16) (_s)->u64s) \ - : ((__force u8) ((_s)->u64s))); \ -}) - -#define __vstruct_bytes(_type, _u64s) \ -({ \ - BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ - \ - (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -}) - -#define vstruct_bytes(_s) \ - __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) - -#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ - (round_up(__vstruct_bytes(_type, _u64s), \ - 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) - -#define vstruct_blocks(_s, _sector_block_bits) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) - -#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ - __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ - __vstruct_u64s(_s) + (_u64s)) - -#define vstruct_sectors(_s, _sector_block_bits) \ - (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) - -#define vstruct_next(_s) \ - ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) -#define vstruct_end(_s) \ - ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) - -#define vstruct_for_each(_s, _i) \ - for (typeof(&(_s)->start[0]) _i = (_s)->start; \ - _i < vstruct_last(_s); \ - _i = vstruct_next(_i)) - -#define vstruct_for_each_safe(_s, _i) \ - for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ - _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ - _i = _next) - -#define vstruct_idx(_s, _idx) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) - -#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c deleted file mode 100644 index 627f153798c679..00000000000000 --- a/fs/bcachefs/xattr.c +++ /dev/null @@ -1,642 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "acl.h" -#include "bkey_methods.h" -#include "btree_update.h" -#include "extents.h" -#include "fs.h" -#include "rebalance.h" -#include "str_hash.h" -#include "xattr.h" - -#include -#include -#include - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); - -static u64 bch2_xattr_hash(const struct bch_hash_info *info, - const struct xattr_search_key *key) -{ - struct bch_str_hash_ctx ctx; - - bch2_str_hash_init(&ctx, info); - bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); - bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); - - return bch2_str_hash_end(&ctx, info); -} - -static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -{ - return bch2_xattr_hash(info, key); -} - -static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -{ - struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); - - return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); -} - -static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - const struct xattr_search_key *r = _r; - - return l.v->x_type != r->type || - l.v->x_name_len != r->name.len || - memcmp(l.v->x_name_and_value, r->name.name, r->name.len); -} - -static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -{ - struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); - struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); - - return l.v->x_type != r.v->x_type || - l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); -} - -const struct bch_hash_desc bch2_xattr_hash_desc = { - .btree_id = 
BTREE_ID_xattrs, - .key_type = KEY_TYPE_xattr, - .hash_key = xattr_hash_key, - .hash_bkey = xattr_hash_bkey, - .cmp_key = xattr_cmp_key, - .cmp_bkey = xattr_cmp_bkey, -}; - -int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - struct bkey_validate_context from) -{ - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len)); - int ret = 0; - - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, - c, xattr_val_size_too_small, - "value too small (%zu < %u)", - bkey_val_u64s(k.k), val_u64s); - - /* XXX why +4 ? */ - val_u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4); - - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, - c, xattr_val_size_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), val_u64s); - - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), - c, xattr_invalid_type, - "invalid type (%u)", xattr.v->x_type); - - bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), - c, xattr_name_invalid_chars, - "xattr name has invalid characters"); -fsck_err: - return ret; -} - -void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (handler && handler->prefix) - prt_printf(out, "%s", handler->prefix); - else if (handler) - prt_printf(out, "(type %u)", xattr.v->x_type); - else - prt_printf(out, "(unknown type %u)", xattr.v->x_type); - - unsigned name_len = xattr.v->x_name_len; - unsigned val_len = le16_to_cpu(xattr.v->x_val_len); - unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name_and_value); - - val_len = min_t(int, val_len, max_name_val_bytes - name_len); - name_len = min(name_len, max_name_val_bytes); - - prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name_and_value, - val_len, (char *) xattr_val(xattr.v)); - - if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || - xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { - prt_char(out, ' '); - bch2_acl_to_text(out, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - } -} - -static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) -{ - struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); - struct btree_iter iter; - struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); - int ret = bkey_err(k); - if (ret) - return ret; - - struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - ret = le16_to_cpu(xattr.v->x_val_len); - if (buffer) { - if (ret > size) - ret = -ERANGE; - else - memcpy(buffer, xattr_val(xattr.v), ret); - } - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, - struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, - const char *name, const void *value, size_t size, - int type, int flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - int ret; - - ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); - if (ret) - return ret; - - /* - * Besides the ctime update, extents, dirents and xattrs 
updates require - * that an inode update also happens - to ensure that if a key exists in - * one of those btrees with a given snapshot ID an inode is also present - */ - inode_u->bi_ctime = bch2_current_time(c); - - ret = bch2_inode_write(trans, &inode_iter, inode_u); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret) - return ret; - - if (value) { - struct bkey_i_xattr *xattr; - unsigned namelen = strlen(name); - unsigned u64s = BKEY_U64s + - xattr_val_u64s(namelen, size); - - if (u64s > U8_MAX) - return -ERANGE; - - xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(xattr)) - return PTR_ERR(xattr); - - bkey_xattr_init(&xattr->k_i); - xattr->k.u64s = u64s; - xattr->v.x_type = type; - xattr->v.x_name_len = namelen; - xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name_and_value, name, namelen); - memcpy(xattr_val(&xattr->v), value, size); - - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inum, &xattr->k_i, - (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| - (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0)); - } else { - struct xattr_search_key search = - X_SEARCH(type, name, strlen(name)); - - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, - hash_info, inum, &search); - } - - if (bch2_err_matches(ret, ENOENT)) - ret = flags & XATTR_REPLACE ? -ENODATA : 0; - - return ret; -} - -struct xattr_buf { - char *buf; - size_t len; - size_t used; -}; - -static int __bch2_xattr_emit(const char *prefix, - const char *name, size_t name_len, - struct xattr_buf *buf) -{ - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + name_len + 1; - - if (buf->buf) { - if (buf->used + total_len > buf->len) - return -ERANGE; - - memcpy(buf->buf + buf->used, prefix, prefix_len); - memcpy(buf->buf + buf->used + prefix_len, - name, name_len); - buf->buf[buf->used + prefix_len + name_len] = '\0'; - } - - buf->used += total_len; - return 0; -} - -static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) -{ - const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); - - if (!xattr_handler_can_list(handler, dentry)) - return NULL; - - return xattr_prefix(handler); -} - -static int bch2_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - struct xattr_buf *buf) -{ - const char *prefix; - - prefix = bch2_xattr_prefix(xattr->x_type, dentry); - if (!prefix) - return 0; - - return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); -} - -static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_unpacked *inode, - struct xattr_buf *buf, - bool all) -{ - const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; - unsigned id; - int ret = 0; - u64 v; - - for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(inode, id); - if (!v) - continue; - - if (!all && - !(inode->bi_fields_set & (1 << id))) - continue; - - ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], - strlen(bch2_inode_opts[id]), buf); - if (ret) - break; - } - - return ret; -} - -ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct bch_fs *c = dentry->d_sb->s_fs_info; - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 offset = 0, inum = inode->ei_inode.bi_inum; - - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, - POS(inum, offset), - POS(inum, U64_MAX), - inode->ei_inum.subvol, 0, k, ({ - if (k.k->type != KEY_TYPE_xattr) - continue; - - bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - }))) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: - bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - - return ret ? bch2_err_class(ret) : buf.used; -} - -static int bch2_xattr_get_handler(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, - bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); - - if (ret < 0 && bch2_err_matches(ret, ENOENT)) - ret = -ENODATA; - - return bch2_err_class(ret); -} - -static int bch2_xattr_set_handler(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct bch_inode_unpacked inode_u; - int ret; - - ret = bch2_trans_run(c, - commit_do(trans, NULL, NULL, 0, - bch2_xattr_set(trans, inode_inum(inode), &inode_u, - &hash, name, value, size, - handler->flags, flags)) ?: - (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); - - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_USER, -}; - -static bool bch2_xattr_trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN); -} - -static const struct xattr_handler bch_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = bch2_xattr_trusted_list, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -}; - -static const struct xattr_handler bch_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = bch2_xattr_get_handler, - .set = bch2_xattr_set_handler, - .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -}; - -#ifndef NO_BCACHEFS_FS - -static int opt_to_inode_opt(int id) -{ - switch (id) { -#define x(name, ...) 
\ - case Opt_##name: return Inode_opt_##name; - BCH_INODE_OPTS() -#undef x - default: - return -1; - } -} - -static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size, - bool all) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_opts opts = - bch2_inode_opts_to_opts(&inode->ei_inode); - const struct bch_option *opt; - int id, inode_opt_id; - struct printbuf out = PRINTBUF; - int ret; - u64 v; - - id = bch2_opt_lookup(name); - if (id < 0 || !bch2_opt_is_inode_opt(id)) - return -EINVAL; - - inode_opt_id = opt_to_inode_opt(id); - if (inode_opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + id; - - if (!bch2_opt_defined_by_id(&opts, id)) - return -ENODATA; - - if (!all && - !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) - return -ENODATA; - - v = bch2_opt_get_by_id(&opts, id); - bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); - - ret = out.pos; - - if (out.allocation_failure) { - ret = -ENOMEM; - } else if (buffer) { - if (out.pos > size) - ret = -ERANGE; - else - memcpy(buffer, out.buf, out.pos); - } - - printbuf_exit(&out); - return ret; -} - -static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, false); -} - -struct inode_opt_set { - int id; - u64 v; - bool defined; -}; - -static int inode_opt_set_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_opt_set *s = p; - - if (s->id == Inode_opt_casefold) { - int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); - if (ret) - return ret; - } - - if (s->defined) - bi->bi_fields_set |= 1U << s->id; - else - bi->bi_fields_set &= ~(1U << s->id); - - bch2_inode_opt_set(bi, s->id, s->v); - - return 0; -} - -static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - struct bch_inode_info *inode = to_bch_ei(vinode); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - const struct bch_option *opt; - char *buf; - struct inode_opt_set s; - int opt_id, inode_opt_id, ret; - - opt_id = bch2_opt_lookup(name); - if (opt_id < 0) - return -EINVAL; - - opt = bch2_opt_table + opt_id; - - inode_opt_id = opt_to_inode_opt(opt_id); - if (inode_opt_id < 0) - return -EINVAL; - - s.id = inode_opt_id; - - if (value) { - u64 v = 0; - - buf = kmalloc(size + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - memcpy(buf, value, size); - buf[size] = '\0'; - - ret = bch2_opt_parse(c, opt, buf, &v, NULL); - kfree(buf); - - if (ret < 0) - goto err_class_exit; - - ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); - if (ret < 0) - goto err_class_exit; - - s.v = v + 1; - s.defined = true; - } else { - /* - * Check if this option was set on the parent - if so, switched - * back to inheriting from the parent: - * - * rename() also has to deal with keeping inherited options up - * to date - see bch2_reinherit_attrs() - */ - spin_lock(&dentry->d_lock); - if (!IS_ROOT(dentry)) { - struct bch_inode_info *dir = - to_bch_ei(d_inode(dentry->d_parent)); - - s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); - } else { - s.v = 0; - } - spin_unlock(&dentry->d_lock); - - 
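-		/*
-		 * Editorial note on the removed code (not part of the original
-		 * file): with no value supplied the xattr is being deleted, so
-		 * the option reverts to the value inherited from the parent
-		 * looked up above, and the "explicitly set" bit is cleared via
-		 * s.defined below (see inode_opt_set_fn()).
-		 */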
s.defined = false; - } - - mutex_lock(&inode->ei_update_lock); - if (inode_opt_id == Inode_opt_project) { - /* - * inode fields accessible via the xattr interface are stored - * with a +1 bias, so that 0 means unset: - */ - ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); - if (ret) - goto err; - } - - ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -err: - mutex_unlock(&inode->ei_update_lock); -err_class_exit: - return bch2_err_class(ret); -} - -static const struct xattr_handler bch_xattr_bcachefs_handler = { - .prefix = "bcachefs.", - .get = bch2_xattr_bcachefs_get, - .set = bch2_xattr_bcachefs_set, -}; - -static int bch2_xattr_bcachefs_get_effective( - const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) -{ - return __bch2_xattr_bcachefs_get(handler, dentry, vinode, - name, buffer, size, true); -} - -/* Noop - xattrs in the bcachefs_effective namespace are inherited */ -static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *vinode, - const char *name, const void *value, - size_t size, int flags) -{ - return 0; -} - -static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { - .prefix = "bcachefs_effective.", - .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set_effective, -}; - -#endif /* NO_BCACHEFS_FS */ - -const struct xattr_handler * const bch2_xattr_handlers[] = { - &bch_xattr_user_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, -#ifndef NO_BCACHEFS_FS - &bch_xattr_bcachefs_handler, - &bch_xattr_bcachefs_effective_handler, -#endif - NULL -}; - -static const struct xattr_handler *bch_xattr_handler_map[] = { - [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = - &nop_posix_acl_access, - [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &nop_posix_acl_default, - [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, - [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -}; - -static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -{ - return type < ARRAY_SIZE(bch_xattr_handler_map) - ? 
bch_xattr_handler_map[type] - : NULL; -} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h deleted file mode 100644 index 1139bf345f7093..00000000000000 --- a/fs/bcachefs/xattr.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_H -#define _BCACHEFS_XATTR_H - -#include "str_hash.h" - -extern const struct bch_hash_desc bch2_xattr_hash_desc; - -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, - struct bkey_validate_context); -void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - -#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_validate = bch2_xattr_validate, \ - .val_to_text = bch2_xattr_to_text, \ - .min_val_size = 8, \ -}) - -static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -{ - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + - name_len + val_len, sizeof(u64)); -} - -#define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) - -struct xattr_search_key { - u8 type; - struct qstr name; -}; - -#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ - { .type = _type, .name = QSTR_INIT(_name, _len) }) - -struct dentry; -struct xattr_handler; -struct bch_hash_info; -struct bch_inode_info; - -/* Exported for cmd_migrate.c in tools: */ -int bch2_xattr_set(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *, const struct bch_hash_info *, - const char *, const void *, size_t, int, int); - -ssize_t bch2_xattr_list(struct dentry *, char *, size_t); - -extern const struct xattr_handler * const bch2_xattr_handlers[]; - -#endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h deleted file mode 100644 index 4121b78d9a92a5..00000000000000 --- a/fs/bcachefs/xattr_format.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_XATTR_FORMAT_H -#define _BCACHEFS_XATTR_FORMAT_H - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - /* - * x_name contains the name and value counted by - * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) previously caused a false positive - * detection of an out of bounds write. - */ - __u8 x_name_and_value[]; -} __packed __aligned(8); - -#endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 264fba0d44bdf4..e4653bb99946b1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) +static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + mm->saved_e_flags = flags; +#endif +} + +static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + flags = mm->saved_e_flags; +#endif + return flags; +} + /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. 
bss), otherwise this @@ -1290,6 +1305,8 @@ static int load_elf_binary(struct linux_binprm *bprm) mm->end_data = end_data; mm->start_stack = bprm->p; + elf_coredump_set_mm_eflags(mm, elf_ex->e_flags); + /** * DOC: "brk" handling * @@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; + u16 machine; + u32 flags; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) @@ -1831,30 +1850,37 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; } - /* - * Initialize the ELF file header. - */ - fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags); + machine = view->e_machine; + flags = view->e_flags; #else view = NULL; info->thread_notes = 2; - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + machine = ELF_ARCH; + flags = ELF_CORE_EFLAGS; #endif + /* + * Override ELF e_flags with value taken from process, + * if arch needs that. + */ + flags = elf_coredump_get_mm_eflags(dump_task->mm, flags); + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, machine, flags); + /* * Allocate a structure for each thread. */ - info->thread = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_KERNEL); + info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes), + GFP_KERNEL); if (unlikely(!info->thread)) return 0; info->thread->task = dump_task; for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), + t = kzalloc(struct_size(t, notes, info->thread_notes), GFP_KERNEL); if (unlikely(!t)) return 0; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ea95c90c847489..4438637c8900cd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS + select REF_TRACKER if STACKTRACE_SUPPORT help Enable run-time debugging support for the btrfs filesystem. @@ -117,14 +118,3 @@ config BTRFS_EXPERIMENTAL - large folio support If unsure, say N. - -config BTRFS_FS_REF_VERIFY - bool "Btrfs with the ref verify tool compiled in" - depends on BTRFS_FS - default n - help - Enable run-time extent reference verification instrumentation. This - is meant to be used by btrfs developers for tracking down extent - reference problems or verifying they didn't break something. - - If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 2d5f0482678b82..743d7677b175df 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 861c7d92c437aa..1248aa2535d306 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1, * gives us all the type checking. 
* * The extent buffer pages stored in the array folios may not form a contiguous - * phyusical range, but the API functions assume the linear offset to the range + * physical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. */ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6a450be293b1cd..2ab550a1e715a7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_pref(ref); return PTR_ERR(eb); } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_pref(ref); free_extent_buffer(eb); return -EIO; @@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - if (type == BTRFS_REF_TYPE_INVALID) + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; offset = btrfs_extent_inline_ref_offset(leaf, iref); @@ -1422,7 +1422,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1614,7 +1614,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ret = PTR_ERR(eb); goto out; } - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); ret = -EIO; goto out; @@ -1652,7 +1652,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, * case. */ ASSERT(eie); - if (!eie) { + if (unlikely(!eie)) { ret = -EUCLEAN; goto out; } @@ -1690,7 +1690,7 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, * @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are * added to the ulist at @ctx->refs, and that ulist is allocated by this * function. The caller should free the ulist with free_leaf_list() if - * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is * enough. * * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. @@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr, *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, BTRFS_REF_TYPE_ANY); - if (*out_type == BTRFS_REF_TYPE_INVALID) + if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. 
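Editorial note, not part of the patch: every "ret == 0" branch annotated with unlikely() above guards the same convention. These lookups set the key offset to (u64)-1, the maximum value, which can never appear in an on-disk item, so an exact match from btrfs_search_slot() can only mean a corrupted tree. A minimal sketch of the pattern, assuming the usual btrfs_search_slot() return convention (< 0 error, 0 exact match, > 0 positioned just past the key); the key type varies by caller:

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;	/* or BTRFS_METADATA_ITEM_KEY */
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;			/* I/O error or -ENOMEM */
	if (unlikely(ret == 0))
		return -EUCLEAN;		/* offset -1 must never exist on disk */
	/* ret > 0: cursor is just past the key; step back to the real item. */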
@@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) ret = -EUCLEAN; goto release; } - if (path->slots[0] == 0) { + if (unlikely(path->slots[0] == 0)) { DEBUG_WARN(); ret = -EUCLEAN; goto release; @@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, if (ret < 0) goto out; /* No extra backref? This means the tree block is corrupted */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, ((unsigned long)iter->cur_ptr); type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_BLOCK); - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, } /* Sanity check, we shouldn't have any unchecked nodes */ - if (!upper->checked) { + if (unlikely(!upper->checked)) { DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 34b0193a181c88..25d51c2460703b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx { * It's very common to have several file extent items that point to the * same extent (bytenr) but with different offsets and lengths. This * typically happens for COW writes, partial writes into prealloc - * extents, NOCOW writes after snapshoting a root, hole punching or + * extents, NOCOW writes after snapshotting a root, hole punching or * reflinking within the same file (less common perhaps). * So keep a small cache with the lookup results for the extent pointed * by the last few file extent items. This cache is checked, with a @@ -414,7 +414,7 @@ struct btrfs_backref_cache { /* * Whether this cache is for relocation * - * Reloction backref cache require more info for reloc root compared + * Relocation backref cache require more info for reloc root compared * to generic backref cache. 
*/ bool is_reloc; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 50b5fc1c06d7cc..21df48e6c4fa20 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); return bbio; } @@ -166,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, int mirror = repair_bbio->mirror_num; if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -203,18 +204,21 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - struct bio_vec *bv, + phys_addr_t paddr, struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; + const u32 foff = offset_in_folio(folio, paddr); const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; + ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -237,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); repair_bbio = btrfs_bio(repair_bio); btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); @@ -258,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddr; u32 offset = 0; /* Read-repair requires the inode field to be set by the submitter. */ @@ -275,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. 
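 *
 * Editorial note (not part of the patch): the loop below was converted
 * from bio_vec iteration to block-granular physical addresses.
 * btrfs_bio_for_each_block() is assumed to advance @iter one fs block
 * (fs_info->sectorsize) at a time, handing each block's phys_addr_t to
 * btrfs_data_csum_ok() and, on checksum failure, to repair_one_sector(),
 * which maps it back to a folio via page_folio(phys_to_page(paddr)).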
*/ bbio->bio.bi_status = BLK_STS_OK; - while (iter->bi_size) { - struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter); - - bv.bv_len = min(bv.bv_len, sectorsize); - if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv)) - fbio = repair_one_sector(bbio, offset, &bv, fbio); - - bio_advance_iter_single(&bbio->bio, iter, sectorsize); + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { + if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) + fbio = repair_one_sector(bbio, offset, paddr, fbio); offset += sectorsize; } - if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); @@ -780,11 +779,38 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) return true; } +static void assert_bbio_alignment(struct btrfs_bio *bbio) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_fs_info *fs_info = bbio->fs_info; + struct bio_vec bvec; + struct bvec_iter iter; + const u32 blocksize = fs_info->sectorsize; + + /* Metadata has no extra bs > ps alignment requirement. */ + if (!is_data_bbio(bbio)) + return; + + bio_for_each_bvec(bvec, &bbio->bio, iter) + ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && + IS_ALIGNED(bvec.bv_len, blocksize), + "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), + bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, + bbio->bio.bi_iter.bi_size, iter.bi_idx, + bvec.bv_offset, + bvec.bv_len); +#endif +} + void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); + assert_bbio_alignment(bbio); + while (!btrfs_submit_chunk(bbio, mirror_num)) ; } @@ -823,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (ret < 0) goto out_counter_dec; - if (!smap.dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + if (unlikely(!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) { ret = -EIO; goto out_counter_dec; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index dc2eb43b70970b..00883aea55d70f 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -82,6 +82,8 @@ struct btrfs_bio { /* Save the first error status of split bio. */ blk_status_t status; + /* Use the commit root to look up csums (data read bio only). */ + bool csum_search_commit_root; /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c02..5322ef2ae015e8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * data in this block group. That check should be done by relocation routine, * not this function. 
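 *
 * Editorial note (not part of the patch): the hunk below only narrows
 * the @force parameter from int to bool; as far as this hunk shows, the
 * behaviour of inc_block_group_ro() is otherwise unchanged.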
*/ -static int inc_block_group_ro(struct btrfs_block_group *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; @@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); - return bg1->used > bg2->used; + /* + * Some other task may be updating the ->used field concurrently, but it + * is not serious if we get a stale value or load/store tearing issues, + * as sorting the list of block groups to reclaim is not critical and an + * occasional imperfect order is ok. So silence KCSAN and avoid the + * overhead of locking or any other synchronization. + */ + return data_race(bg1->used > bg2->used); } static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) @@ -1964,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) * called, which is where we will transfer a reserved extent's * size from the "reserved" counter to the "used" counter - this * happens when running delayed references. When we relocate the - * chunk below, relocation first flushes dellaloc, waits for + * chunk below, relocation first flushes delalloc, waits for * ordered extent completion (which is where we create delayed * references for data extents) and commits the current * transaction (which runs delayed references), and only after @@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) - queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work); spin_unlock(&fs_info->unused_bgs_lock); } @@ -2064,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key return -ENOENT; } - if (map->start != key->objectid || map->chunk_len != key->offset) { + if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", key->objectid, key->offset, map->start, map->chunk_len); @@ -2077,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, @@ -2238,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return ret; /* Shouldn't have super stripes in sequential zones */ - if (zoned && nr) { + if (unlikely(zoned && nr)) { kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", @@ -2329,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) break; bg = btrfs_lookup_block_group(fs_info, map->start); - if (!bg) { + if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); @@ -2337,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) btrfs_free_chunk_map(map); break; } - if (bg->start != map->start || bg->length != map->chunk_len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (unlikely(bg->start != map->start || bg->length != map->chunk_len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", map->start, map->chunk_len, @@ -2832,7 +2839,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) * space or none at all (due to no need to COW, extent buffers * were already COWed in the current transaction and still * unwritten, tree heights lower than the maximum possible - * height, etc). For data we generally reserve the axact amount + * height, etc). For data we generally reserve the exact amount * of space we are going to allocate later, the exception is * when using compression, as we must reserve space based on the * uncompressed data size, because the compression is only done @@ -3241,7 +3248,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { /* * So theoretically we could recover from this, simply set the * super cache generation to 0 so we know to invalidate the @@ -3988,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans struct btrfs_space_info *sys_space_info; sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); - if (!sys_space_info) { + if (unlikely(!sys_space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -4002,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a8bb8429c96635..9172104a5889ec 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -63,7 +63,7 @@ enum btrfs_discard_state { * CHUNK_ALLOC_FORCE means it must try to allocate one * * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from - * find_free_extent() that also activaes the zone + * find_free_extent() that also activates the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0387b9f43a5292..af373d50a901f2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -338,6 +338,11 @@ struct btrfs_inode { struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + struct inode vfs_inode; }; @@ -532,9 +537,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) /* We only allow BITS_PER_LONGS blocks for each bitmap. 
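 *
 * Editorial note (not part of the patch): the removed expression derived
 * the maximum folio order from that bitmap width, i.e.
 * ilog2((BITS_PER_LONG << sectorsize_bits) >> PAGE_SHIFT); with 4K blocks,
 * 4K pages and BITS_PER_LONG == 64 that is ilog2(64) = 6. The replacement
 * is assumed to precompute an equivalent [block_min_order, block_max_order]
 * range in fs_info, which also covers block size > page size setups where
 * the minimum order is nonzero.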
*/ #ifdef CONFIG_BTRFS_EXPERIMENTAL - mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, - ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) - >> PAGE_SHIFT))); + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, + inode->root->fs_info->block_min_order, + inode->root->fs_info->block_max_order); #endif } @@ -542,10 +547,12 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected); +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest); +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv); + u32 bio_offset, phys_addr_t paddr); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -558,7 +565,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); + const struct fscrypt_str *name, bool add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d09d622016ef54..bacad18357b338 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, + struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_folios(ws, mapping, start, folios, + return zlib_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_folios(ws, mapping, start, folios, + return lzo_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_folios(ws, mapping, start, folios, + return zstd_compress_folios(ws, inode, start, folios, out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: @@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct folio *btrfs_alloc_compr_folio(void) +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) { struct folio *folio = NULL; + /* For bs > ps cases, no cached folio pool for now. 
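 *
 * Editorial note (not part of the patch): the pool below only caches
 * order-0 folios, so when block_min_order is nonzero (block size larger
 * than page size) the allocation skips the pool and falls through to
 * folio_alloc() with that order, yielding a folio that matches the
 * filesystem block size.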
*/ + if (fs_info->block_min_order) + goto alloc; + spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { folio = list_first_entry(&compr_pool.list, struct folio, lru); @@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void) if (folio) return folio; - return folio_alloc(GFP_NOFS, 0); +alloc: + return folio_alloc(GFP_NOFS, fs_info->block_min_order); } void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; + /* The folio is from bs > ps fs, no cached pool for now. */ + if (folio_order(folio)) + goto free; + spin_lock(&compr_pool.lock); if (compr_pool.count > compr_pool.thresh) { do_free = true; @@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio) if (!do_free) return; +free: ASSERT(folio_ref_count(folio) == 1); folio_put(folio); } @@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb->bbio.fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + struct folio *folio; int ret; - u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + u32 len = min_t(u32, cb->compressed_len - offset, + btrfs_min_folio_size(fs_info)); + folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit. */ - ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; } @@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (fs_info->sectorsize < PAGE_SIZE) return 0; + /* For bs > ps cases, we don't support readahead for compressed folios for now. */ + if (fs_info->block_min_order) + return 0; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { @@ -602,17 +619,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->compressed_len = compressed_len; cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; + cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; btrfs_free_extent_map(em); - cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { status = BLK_STS_RESOURCE; goto out_free_bio; } - ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, + cb->compressed_folios); if (ret) { status = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -687,8 +706,6 @@ struct heuristic_ws { struct list_head list; }; -static struct workspace_manager heuristic_wsm; - static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -701,7 +718,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(void) +static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info) { struct heuristic_ws *ws; @@ -728,11 +745,9 @@ static struct list_head *alloc_heuristic_ws(void) return ERR_PTR(-ENOMEM); } -const struct btrfs_compress_op btrfs_heuristic_compress = { - .workspace_manager = &heuristic_wsm, -}; +const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 }; -static const struct btrfs_compress_op * const btrfs_compress_op[] = { +static const struct 
btrfs_compress_levels * const btrfs_compress_levels[] = { /* The heuristic is represented as compression type 0 */ &btrfs_heuristic_compress, &btrfs_zlib_compress, @@ -740,13 +755,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, int level) +static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); - case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); - case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -772,44 +787,58 @@ static void free_workspace(int type, struct list_head *ws) } } -static void btrfs_init_workspace_manager(int type) +static int alloc_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) { - struct workspace_manager *wsm; + struct workspace_manager *gwsm; struct list_head *workspace; - wsm = btrfs_compress_op[type]->workspace_manager; - INIT_LIST_HEAD(&wsm->idle_ws); - spin_lock_init(&wsm->ws_lock); - atomic_set(&wsm->total_ws, 0); - init_waitqueue_head(&wsm->ws_wait); + ASSERT(fs_info->compr_wsm[type] == NULL); + gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL); + if (!gwsm) + return -ENOMEM; + + INIT_LIST_HEAD(&gwsm->idle_ws); + spin_lock_init(&gwsm->ws_lock); + atomic_set(&gwsm->total_ws, 0); + init_waitqueue_head(&gwsm->ws_wait); + fs_info->compr_wsm[type] = gwsm; /* * Preallocate one workspace for each compression type so we can * guarantee forward progress in the worst case */ - workspace = alloc_workspace(type, 0); + workspace = alloc_workspace(fs_info, type, 0); if (IS_ERR(workspace)) { - btrfs_warn(NULL, - "cannot preallocate compression workspace, will try later"); + btrfs_warn(fs_info, + "cannot preallocate compression workspace for %s, will try later", + btrfs_compress_type2str(type)); } else { - atomic_set(&wsm->total_ws, 1); - wsm->free_ws = 1; - list_add(workspace, &wsm->idle_ws); + atomic_set(&gwsm->total_ws, 1); + gwsm->free_ws = 1; + list_add(workspace, &gwsm->idle_ws); } + return 0; } -static void btrfs_cleanup_workspace_manager(int type) +static void free_workspace_manager(struct btrfs_fs_info *fs_info, + enum btrfs_compression_type type) { - struct workspace_manager *wsman; struct list_head *ws; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; - wsman = btrfs_compress_op[type]->workspace_manager; - while (!list_empty(&wsman->idle_ws)) { - ws = wsman->idle_ws.next; + /* ZSTD uses its own workspace manager, should not enter here. */ + ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES); + if (!gwsm) + return; + fs_info->compr_wsm[type] = NULL; + while (!list_empty(&gwsm->idle_ws)) { + ws = gwsm->idle_ws.next; list_del(ws); free_workspace(type, ws); - atomic_dec(&wsman->total_ws); + atomic_dec(&gwsm->total_ws); } + kfree(gwsm); } /* @@ -818,9 +847,9 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors.
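Editorial note, not part of the patch: after this change the manager is looked up as fs_info->compr_wsm[type] instead of through the removed global btrfs_compress_op[type]->workspace_manager, so btrfs_get_workspace() below gains an fs_info argument; the ASSERT(wsm) it adds relies on alloc_workspace_manager() having populated the slot, presumably at mount time via btrfs_alloc_compress_wsm().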
*/ -struct list_head *btrfs_get_workspace(int type, int level) +struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { - struct workspace_manager *wsm; + struct workspace_manager *wsm = fs_info->compr_wsm[type]; struct list_head *workspace; int cpus = num_online_cpus(); unsigned nofs_flag; @@ -830,7 +859,7 @@ struct list_head *btrfs_get_workspace(int type, int level) wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; + ASSERT(wsm); idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -866,7 +895,7 @@ struct list_head *btrfs_get_workspace(int type, int level) * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = alloc_workspace(type, level); + workspace = alloc_workspace(fs_info, type, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -889,7 +918,7 @@ struct list_head *btrfs_get_workspace(int type, int level) /* no burst */ 1); if (__ratelimit(&_rs)) - btrfs_warn(NULL, + btrfs_warn(fs_info, "no compression workspaces, low memory, retrying"); } goto again; @@ -897,13 +926,13 @@ struct list_head *btrfs_get_workspace(int type, int level) return workspace; } -static struct list_head *get_workspace(int type, int level) +static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); - case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); - case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level); default: /* * This can't happen, the type is validated several times @@ -917,21 +946,21 @@ static struct list_head *get_workspace(int type, int level) * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -void btrfs_put_workspace(int type, struct list_head *ws) +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { - struct workspace_manager *wsm; + struct workspace_manager *gwsm = fs_info->compr_wsm[type]; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; - wsm = btrfs_compress_op[type]->workspace_manager; - idle_ws = &wsm->idle_ws; - ws_lock = &wsm->ws_lock; - total_ws = &wsm->total_ws; - ws_wait = &wsm->ws_wait; - free_ws = &wsm->free_ws; + ASSERT(gwsm); + idle_ws = &gwsm->idle_ws; + ws_lock = &gwsm->ws_lock; + total_ws = &gwsm->total_ws; + ws_wait = &gwsm->ws_wait; + free_ws = &gwsm->free_ws; spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -948,13 +977,13 @@ void btrfs_put_workspace(int type, struct list_head *ws) cond_wake_up(ws_wait); } -static void put_workspace(int type, struct list_head *ws) +static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws) { switch (type) { - case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); - case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + case BTRFS_COMPRESS_NONE: return 
btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws); + case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws); default: /* * This can't happen, the type is validated several times @@ -970,12 +999,12 @@ static void put_workspace(int type, struct list_head *ws) */ static int btrfs_compress_set_level(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; if (level == 0) - level = ops->default_level; + level = levels->default_level; else - level = clamp(level, ops->min_level, ops->max_level); + level = clamp(level, levels->min_level, levels->max_level); return level; } @@ -985,9 +1014,9 @@ static int btrfs_compress_set_level(unsigned int type, int level) */ bool btrfs_compress_level_valid(unsigned int type, int level) { - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + const struct btrfs_compress_levels *levels = btrfs_compress_levels[type]; - return ops->min_level <= level && level <= ops->max_level; + return levels->min_level <= level && level <= levels->max_level; } /* Wrapper around find_get_page(), with extra error message. */ @@ -1022,44 +1051,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * - compression algo are 0-3 * - the level are bits 4-7 * - * @out_pages is an in/out parameter, holds maximum number of pages to allocate - * and returns number of actually allocated pages + * @out_folios is an in/out parameter, holds maximum number of folios to allocate + * and returns number of actually allocated folios * * @total_in is used to return the number of bytes actually read. It * may be smaller than the input length if we had to exit early because we - * ran out of room in the pages array or because we cross the + * ran out of room in the folios array or because we cross the * max_out threshold. * * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; level = btrfs_compress_set_level(type, level); - workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, folios, + workspace = get_workspace(fs_info, type, level); + ret = compression_compress_pages(type, workspace, inode, start, folios, out_folios, total_in, total_out); /* The total read-in bytes should be no larger than the input. 
*/ ASSERT(*total_in <= orig_len); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } static int btrfs_decompress_bio(struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct list_head *workspace; int ret; int type = cb->compress_type; - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress_bio(workspace, cb); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); if (!ret) zero_fill_bio(&cb->orig_bbio->bio); @@ -1080,20 +1111,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, int ret; /* - * The full destination page range should not exceed the page size. + * The full destination folio range should not exceed the folio size. * And the @destlen should not exceed sectorsize, as this is only called for * inline file extents, which should not exceed sectorsize. */ - ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize); - workspace = get_workspace(type, 0); + workspace = get_workspace(fs_info, type, 0); ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); - put_workspace(type, workspace); + put_workspace(fs_info, type, workspace); return ret; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info) +{ + int ret; + + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + if (ret < 0) + goto error; + ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); + if (ret < 0) + goto error; + ret = zstd_alloc_workspace_manager(fs_info); + if (ret < 0) + goto error; + return 0; +error: + btrfs_free_compress_wsm(fs_info); + return ret; +} + +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info) +{ + free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE); + free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB); + free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO); + zstd_free_workspace_manager(fs_info); +} + int __init btrfs_init_compress(void) { if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, @@ -1105,11 +1166,6 @@ int __init btrfs_init_compress(void) if (!compr_pool.shrinker) return -ENOMEM; - btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_init_workspace_manager(); - spin_lock_init(&compr_pool.lock); INIT_LIST_HEAD(&compr_pool.list); compr_pool.count = 0; @@ -1130,10 +1186,6 @@ void __cold btrfs_exit_compress(void) btrfs_compr_pool_scan(NULL, NULL); shrinker_free(compr_pool.shrinker); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); - btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); - zstd_cleanup_workspace_manager(); bioset_exit(&btrfs_compressed_bioset); } @@ -1256,7 +1308,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, #define ENTROPY_LVL_HIGH (80) /* - * For increasead precision in shannon_entropy calculation, + * For increased precision in shannon_entropy calculation, * let's do pow(n, M) to save more digits after comma: * * - maximum int bit length is 64 @@ -1542,7 +1594,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, */ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) { - struct list_head *ws_list = 
get_workspace(0, 0); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct list_head *ws_list = get_workspace(fs_info, 0, 0); struct heuristic_ws *ws; u32 i; u8 byte; @@ -1611,30 +1664,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) } out: - put_workspace(0, ws_list); + put_workspace(fs_info, 0, ws_list); return ret; } /* - * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level. Negative level - * numbers are allowed. + * Convert the compression suffix (eg. after "zlib" starting with ":") to level. + * + * If the resulting level exceeds the algo's supported levels, it will be clamped. + * + * Return <0 if no valid string can be found. + * Return 0 if everything is fine. */ -int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) { int level = 0; int ret; - if (!type) + if (!type) { + *level_ret = btrfs_compress_set_level(type, level); return 0; + } if (str[0] == ':') { ret = kstrtoint(str + 1, 10, &level); if (ret) - level = 0; + return ret; } - level = btrfs_compress_set_level(type, level); - - return level; + *level_ret = btrfs_compress_set_level(type, level); + return 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1b38e707bbd985..eba188a9e3bb58 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,6 +75,11 @@ struct compressed_bio { struct btrfs_bio bbio; }; +static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) +{ + return cb->bbio.fs_info; +} + /* @range_end must be exclusive. */ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { @@ -84,11 +89,14 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 return min(range_end, folio_end(folio)) - cur; } +int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); +void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info); + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); bool btrfs_compress_level_valid(unsigned int type, int level); -int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -102,21 +110,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); -struct folio *btrfs_alloc_compr_folio(void); +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); void btrfs_free_compr_folio(struct folio *folio); -enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_ZSTD = 3, - BTRFS_NR_COMPRESS_TYPES = 4, - - BTRFS_DEFRAG_DONT_COMPRESS, -}; - struct workspace_manager { struct list_head idle_ws; spinlock_t ws_lock; @@ -128,11 +126,10 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, int level); -void btrfs_put_workspace(int type, struct list_head *ws); +struct list_head 
*btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level); +void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws); -struct btrfs_compress_op { - struct workspace_manager *workspace_manager; +struct btrfs_compress_levels { /* Maximum level supported by the compression algorithm */ int min_level; int max_level; @@ -142,10 +139,10 @@ struct btrfs_compress_op { /* The heuristic workspaces are managed via the 0th workspace manager */ #define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES -extern const struct btrfs_compress_op btrfs_heuristic_compress; -extern const struct btrfs_compress_op btrfs_zlib_compress; -extern const struct btrfs_compress_op btrfs_lzo_compress; -extern const struct btrfs_compress_op btrfs_zstd_compress; +extern const struct btrfs_compress_levels btrfs_heuristic_compress; +extern const struct btrfs_compress_levels btrfs_zlib_compress; +extern const struct btrfs_compress_levels btrfs_lzo_compress; +extern const struct btrfs_compress_levels btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); @@ -155,39 +152,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret); -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level); void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(void); +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(int level); +int zstd_alloc_workspace_manager(struct btrfs_fs_info 
*fs_info); +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info); +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(int level); -void zstd_put_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level); +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 74e6d7f3d2660e..561658aca018b4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, - int data_size, int extend); + int data_size, bool extend); static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty); + struct extent_buffer *src, bool empty); static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); @@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } else { ret = btrfs_inc_ref(trans, root, cow, 0); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } if (ret) { @@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != btrfs_header_generation(parent)); ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, parent_start, last_ref); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } @@ -613,15 +613,12 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, return 
ret; } -static inline int should_cow_block(const struct btrfs_trans_handle *trans, - const struct btrfs_root *root, - const struct extent_buffer *buf) +static inline bool should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) - return 0; - - /* Ensure we can see the FORCE_COW bit */ - smp_mb__before_atomic(); + return false; /* * We do not need to cow a block if @@ -634,13 +631,25 @@ static inline int should_cow_block(const struct btrfs_trans_handle *trans, * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) - return 0; - return 1; + + if (btrfs_header_generation(buf) != trans->transid) + return true; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) + return true; + + /* Ensure we can see the FORCE_COW bit. */ + smp_mb__before_atomic(); + if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return true; + + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return false; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return true; + + return false; } /* @@ -844,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, &check); if (IS_ERR(eb)) return eb; - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return ERR_PTR(-EIO); } @@ -913,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); @@ -935,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1010,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right, 0, 1); free_extent_buffer_stale(right); right = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1019,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1071,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1081,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1186,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 
0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(left); free_extent_buffer(left); btrfs_abort_transaction(trans, ret); @@ -1246,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_tree_unlock(right); free_extent_buffer(right); btrfs_abort_transaction(trans, ret); @@ -1484,13 +1493,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, &check)) { + if (unlikely(btrfs_verify_level_key(tmp, &check))) { ret = -EUCLEAN; goto out; } @@ -1571,7 +1580,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) { + if (unlikely(!extent_buffer_uptodate(tmp))) { ret = -EIO; goto out; } @@ -1752,7 +1761,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case. */ - if (!extent_buffer_uptodate(b)) { + if (unlikely(!extent_buffer_uptodate(b))) { if (root_lock) btrfs_tree_unlock_rw(b, root_lock); free_extent_buffer(b); @@ -2260,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = btrfs_get_old_root(root, time_seq); - if (!b) { + if (unlikely(!b)) { ret = -EIO; goto done; } @@ -2686,7 +2695,7 @@ static bool check_sibling_keys(const struct extent_buffer *left, */ static int push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, - struct extent_buffer *src, int empty) + struct extent_buffer *src, bool empty) { struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; @@ -2722,13 +2731,13 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); /* dst is the left eb, src is the middle eb */ - if (check_sibling_keys(dst, src)) { + if (unlikely(check_sibling_keys(dst, src))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2796,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, push_items = max_push; /* dst is the right eb, src is the middle eb */ - if (check_sibling_keys(src, dst)) { + if (unlikely(check_sibling_keys(src, dst))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; @@ -2813,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, push_items); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2883,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_clear_buffer_dirty(trans, c); ret2 = 
btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); btrfs_tree_unlock(c); free_extent_buffer(c); @@ -2928,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2941,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3017,7 +3026,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(split); free_extent_buffer(split); btrfs_abort_transaction(trans, ret); @@ -3086,7 +3095,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) int ret; ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_crit(fs_info, "leaf free space ret %d, leaf data size %lu, used %d nritems %d", ret, @@ -3102,7 +3111,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf) */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_path *path, - int data_size, int empty, + int data_size, bool empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) @@ -3239,7 +3248,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, - int empty, u32 min_slot) + bool empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -3278,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); @@ -3316,7 +3325,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, + bool empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { @@ -3494,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - if (check_sibling_keys(left, right)) { + if (unlikely(check_sibling_keys(left, right))) { ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; @@ -3642,7 +3651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, - int extend) + bool extend) { struct btrfs_disk_key disk_key; struct extent_buffer *l; @@ -4075,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, new_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4108,7 +4117,7 @@ void btrfs_extend_item(struct 
btrfs_trans_handle *trans, old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); - if (slot >= nritems) { + if (unlikely(slot >= nritems)) { btrfs_print_leaf(leaf); btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); @@ -4135,7 +4144,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4183,7 +4192,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, data_end = leaf_data_end(leaf); total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); - if (btrfs_leaf_free_space(leaf) < total_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(leaf)); @@ -4193,7 +4202,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); - if (old_data < data_end) { + if (unlikely(old_data < data_end)) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "item at slot %d with data offset %u beyond data end of leaf %u", @@ -4232,7 +4241,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(leaf) < 0) { + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { btrfs_print_leaf(leaf); BUG(); } @@ -4374,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4387,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 738179a5e17060..7b277934f66f92 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) } /* - * Pick the defragable inode that we want, if it doesn't exist, we will get the + * Pick the defraggable inode that we want, if it doesn't exist, we will get the * next one. 
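* ("Next" here means the entry with the smallest (root, ino) key greater than the one asked for.)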
*/ static struct inode_defrag *btrfs_pick_defrag_inode( @@ -924,7 +924,7 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-EIO); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143b2..41e37f7f67cc01 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); + btrfs_delayed_node_ref_tracker_dir_init(delayed_node); delayed_node->ins_root = RB_ROOT_CACHED; delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); @@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node( } static struct btrfs_delayed_node *btrfs_get_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); @@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS); return node; } @@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); BUG_ON(btrfs_inode->delayed_node != node); xa_unlock(&root->delayed_nodes); return node; @@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( */ if (refcount_inc_not_zero(&node->refs)) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, + GFP_ATOMIC); btrfs_inode->delayed_node = node; } else { node = NULL; @@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * Return the delayed node, or error pointer on failure. */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct btrfs_inode *btrfs_inode) + struct btrfs_inode *btrfs_inode, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; struct btrfs_root *root = btrfs_inode->root; @@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( void *ptr; again: - node = btrfs_get_delayed_node(btrfs_inode); + node = btrfs_get_delayed_node(btrfs_inode, tracker); if (node) return node; @@ -144,12 +152,10 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed. */ - refcount_set(&node->refs, 2); - /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); if (ret == -ENOMEM) { + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); return ERR_PTR(-ENOMEM); } @@ -158,6 +164,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( if (ptr) { /* Somebody inserted it, go back and read it. 
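* (The node we just allocated is freed below before the lookup is retried.)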
*/ xa_unlock(&root->delayed_nodes); + btrfs_delayed_node_ref_tracker_dir_exit(node); kmem_cache_free(delayed_node_cache, node); node = NULL; goto again; @@ -166,6 +173,12 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( ASSERT(xa_err(ptr) != -EINVAL); ASSERT(xa_err(ptr) != -ENOMEM); ASSERT(ptr == NULL); + + /* Cached in the inode and can be accessed. */ + refcount_set(&node->refs, 2); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC); + btrfs_inode->delayed_node = node; xa_unlock(&root->delayed_nodes); @@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); refcount_inc(&node->refs); /* inserted into list */ + btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker, + GFP_ATOMIC); root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; + btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker); refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) @@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); node = list_first_entry_or_null(&delayed_root->node_list, struct btrfs_delayed_node, n_list); - if (node) + if (node) { refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); + } spin_unlock(&delayed_root->lock); return node; } static struct btrfs_delayed_node *btrfs_next_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; struct list_head *p; @@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( next = list_entry(p, struct btrfs_delayed_node, n_list); refcount_inc(&next->refs); + btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC); out: spin_unlock(&delayed_root->lock); @@ -257,7 +278,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( static void __btrfs_release_delayed_node( struct btrfs_delayed_node *delayed_node, - int mod) + int mod, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_root *delayed_root; @@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); + btrfs_delayed_node_ref_tracker_free(delayed_node, tracker); if (refcount_dec_and_test(&delayed_node->refs)) { struct btrfs_root *root = delayed_node->root; @@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. 
*/ ASSERT(refcount_read(&delayed_node->refs) == 0); + btrfs_delayed_node_ref_tracker_dir_exit(delayed_node); kmem_cache_free(delayed_node_cache, delayed_node); } } -static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) +static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 0); + __btrfs_release_delayed_node(node, 0, tracker); } static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( - struct btrfs_delayed_root *delayed_root) + struct btrfs_delayed_root *delayed_root, + struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; @@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( if (node) { list_del_init(&node->p_list); refcount_inc(&node->refs); + btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } spin_unlock(&delayed_root->lock); @@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( } static inline void btrfs_release_prepared_delayed_node( - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) { - __btrfs_release_delayed_node(node, 1); + __btrfs_release_delayed_node(node, 1, tracker); } static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, @@ -711,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, u32 *ins_sizes; int i = 0; - ins_data = kmalloc(batch.nr * sizeof(u32) + - batch.nr * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(batch.nr, + sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; @@ -1011,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * transaction, because we could leave the inode with the * improper counts behind. */ - if (ret != -ENOENT) + if (unlikely(ret != -ENOENT)) btrfs_abort_transaction(trans, ret); goto out; } @@ -1039,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto err_out; } @@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; @@ -1143,17 +1171,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; - curr_node = btrfs_first_delayed_node(delayed_root); + curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); /* * See the comment below about releasing path before releasing * node. 
If the commit of delayed items was successful the path @@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) * point to locked extent buffers (a leaf at the very least). */ ASSERT(path->nodes[0] == NULL); - btrfs_release_delayed_node(prev_node); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } /* @@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) btrfs_free_path(path); if (curr_node) - btrfs_release_delayed_node(curr_node); + btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node = + btrfs_get_delayed_node(inode, &delayed_node_tracker); BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (!delayed_node->count) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); path = btrfs_alloc_path(); if (!path) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOMEM; } @@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); trans->block_rsv = block_rsv; return ret; @@ -1230,18 +1261,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_trans_handle *trans; - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *delayed_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return 0; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } mutex_unlock(&delayed_node->mutex); @@ -1275,7 +1308,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode) return; inode->delayed_node = NULL; - btrfs_release_delayed_node(delayed_node); + + btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker); } struct btrfs_async_delayed_work { @@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; 
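/*
 * An illustrative sketch of the new reference-tracking contract (not
 * part of the patch itself): every helper that hands out a delayed
 * node reference also records a btrfs_ref_tracker, and the release
 * side must pass the same tracker back so a leaked reference can be
 * attributed to its taker:
 *
 *	struct btrfs_ref_tracker tracker;
 *	struct btrfs_delayed_node *node;
 *
 *	node = btrfs_get_delayed_node(inode, &tracker);
 *	if (!node)
 *		return;
 *	...
 *	btrfs_release_delayed_node(node, &tracker);
 *
 * Without CONFIG_BTRFS_DEBUG the btrfs_delayed_node_ref_tracker_*()
 * wrappers are empty stubs, and even with it they only record stacks
 * when the REF_TRACKER option is set (see the helpers added to
 * delayed-inode.h below).
 */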
+ struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int total_done = 0; @@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) BTRFS_DELAYED_BACKGROUND / 2) break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + delayed_node = btrfs_first_prepared_delayed_node(delayed_root, + &delayed_node_tracker); if (!delayed_node) break; @@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; continue; } @@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) btrfs_btree_balance_dirty_nodelay(root->fs_info); btrfs_release_path(path); - btrfs_release_prepared_delayed_node(delayed_node); + btrfs_release_prepared_delayed_node(delayed_node, + &delayed_node_tracker); total_done++; } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) @@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node *node; - if (WARN_ON(node)) + node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); + if (WARN_ON(node)) { + btrfs_delayed_node_ref_tracker_free(node, + &delayed_node_tracker); refcount_dec(&node->refs); + } } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; bool reserve_leaf_space; u32 data_len; int ret; - delayed_node = btrfs_get_or_create_delayed_node(dir); + delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_unlock(&delayed_node->mutex); release_node: - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, u64 index) { struct btrfs_delayed_node *node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_item *item; int ret; - node = btrfs_get_or_create_delayed_node(dir); + node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker); if (IS_ERR(node)) return PTR_ERR(node); @@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_unlock(&node->mutex); end: - btrfs_release_delayed_node(node); + btrfs_release_delayed_node(node, &delayed_node_tracker); return ret; } int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) { - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_ref_tracker delayed_node_tracker; + struct btrfs_delayed_node 
*delayed_node; + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; @@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) * is updated now. So we needn't lock the delayed node. */ if (!delayed_node->index_cnt) { - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -EINVAL; } inode->index_cnt = delayed_node->index_cnt; - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return false; @@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ + btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker); refcount_dec(&delayed_node->refs); return true; @@ -1843,19 +1892,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return -ENOENT; mutex_lock(&delayed_node->mutex); if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return -ENOENT; } @@ -1864,8 +1913,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); @@ -1895,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -1904,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1926,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, atomic_inc(&root->fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - 
btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return ret; } @@ -1934,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; /* * we don't do delayed inode updates during log recovery because it @@ -1943,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return -EAGAIN; - delayed_node = btrfs_get_or_create_delayed_node(inode); + delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); @@ -1970,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) atomic_inc(&fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; } @@ -2014,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; + struct btrfs_ref_tracker delayed_node_tracker; - delayed_node = btrfs_get_delayed_node(inode); + delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!delayed_node) return; __btrfs_kill_delayed_node(delayed_node); - btrfs_release_delayed_node(delayed_node); + btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); } void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; + struct btrfs_ref_tracker delayed_node_trackers[8]; while (1) { struct btrfs_delayed_node *node; @@ -2045,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) * about to be removed from the tree in the loop below */ if (refcount_inc_not_zero(&node->refs)) { + btrfs_delayed_node_ref_tracker_alloc(node, + &delayed_node_trackers[count], + GFP_ATOMIC); delayed_nodes[count] = node; count++; } @@ -2056,7 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); - btrfs_release_delayed_node(delayed_nodes[i]); + btrfs_release_delayed_node(delayed_nodes[i], + &delayed_node_trackers[i]); + btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]); } } } @@ -2064,14 +2120,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_node *curr_node, *prev_node; + struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root); + curr_node = btrfs_first_delayed_node(fs_info->delayed_root, + &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); - btrfs_release_delayed_node(prev_node); + prev_delayed_node_tracker = curr_delayed_node_tracker; + curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker); + btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker); } } @@ -2081,8 +2140,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; + struct btrfs_ref_tracker delayed_node_tracker; - node = 
btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2140,6 +2200,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, * delete delayed items. */ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } @@ -2150,8 +2211,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; struct btrfs_delayed_item *next; + struct btrfs_ref_tracker delayed_node_tracker; - node = btrfs_get_delayed_node(inode); + node = btrfs_get_delayed_node(inode, &delayed_node_tracker); if (!node) return; @@ -2183,5 +2245,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode, * delete delayed items. */ ASSERT(refcount_read(&node->refs) > 1); + btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); refcount_dec(&node->refs); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index e6e763ad2d421f..0d949edc0caf16 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "ctree.h" struct btrfs_disk_key; @@ -44,6 +45,22 @@ struct btrfs_delayed_root { wait_queue_head_t wait; }; +struct btrfs_ref_tracker_dir { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker_dir dir; +#else + struct {} tracker; +#endif +}; + +struct btrfs_ref_tracker { +#ifdef CONFIG_BTRFS_DEBUG + struct ref_tracker *tracker; +#else + struct {} tracker; +#endif +}; + #define BTRFS_DELAYED_NODE_IN_LIST 0 #define BTRFS_DELAYED_NODE_INODE_DIRTY 1 #define BTRFS_DELAYED_NODE_DEL_IREF 2 @@ -78,6 +95,12 @@ struct btrfs_delayed_node { * actual number of leaves we end up using. Protected by @mutex. */ u32 index_item_leaves; + /* Track all references to this delayed node. */ + struct btrfs_ref_tracker_dir ref_dir; + /* Track delayed node reference stored in node list. */ + struct btrfs_ref_tracker node_list_tracker; + /* Track delayed node reference stored in inode cache. 
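+ * (i.e. the reference held by btrfs_inode::delayed_node.)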
*/ + struct btrfs_ref_tracker inode_cache_tracker; }; struct btrfs_delayed_item { @@ -169,4 +192,74 @@ void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); +#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16 +#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16 + +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_init(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT, + "delayed_node"); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_exit(&node->ref_dir.dir); +} + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return; + + ref_tracker_dir_print(&node->ref_dir.dir, + BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT); +} + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp); +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER)) + return 0; + + return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker); +} +#else +static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { } + +static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { } + +static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker, + gfp_t gfp) +{ + return 0; +} + +static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node, + struct btrfs_ref_tracker *tracker) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca382c5b186f47..481802efaa1436 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * Initialize the structure which represents a modification to a an extent. + * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. 
* @@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif @@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (!testing && pin_bytes) { + if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - if (!testing) + if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 552ec4fa645d4b..5ce94053214452 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -276,10 +276,6 @@ struct btrfs_ref { */ bool skip_qgroup; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - /* Through which root is this modification. */ - u64 real_root; -#endif u64 bytenr; u64 num_bytes; u64 owning_root; @@ -296,6 +292,11 @@ struct btrfs_ref { struct btrfs_data_ref data_ref; struct btrfs_tree_ref tree_ref; }; + +#ifdef CONFIG_BTRFS_DEBUG + /* Through which root is this modification. */ + u64 real_root; +#endif }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4675bcd5f92efb..a4eaef60549eed 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -98,7 +98,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "found replace target device without a valid replace item"); return -EUCLEAN; @@ -158,7 +158,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * We don't have an active replace item but if there is a * replace target, fail the mount. 
*/ - if (btrfs_find_device(fs_info->fs_devices, &args)) { + if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) { btrfs_err(fs_info, "replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; @@ -177,8 +177,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) * allow 'btrfs dev replace_cancel' if src/tgt device is * missing */ - if (!dev_replace->srcdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -186,8 +185,7 @@ "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", src_devid); } - if (!dev_replace->tgtdev && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index fe9a4bd7e6e683..802d4dbe5b3817 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; + /* + * For bs > ps support, we heavily rely on large folios to make sure no + * block will cross large folio boundaries. + * + * But memory provided by direct IO is only virtually contiguous, not + * physically contiguous, and will break btrfs' large folio requirement. + * + * So for bs > ps support, all direct IOs should fall back to buffered ones. + */ + if (fs_info->sectorsize > PAGE_SIZE) + return -EINVAL; + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0e6..9247a58894decc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place.
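check_direct_IO() above now refuses any direct I/O when the block size is larger than the page size, because user buffers are only virtually contiguous and cannot satisfy the large-folio layout that bs > ps depends on. A sketch of the caller-side consequence, with the control flow condensed and the fallback wiring assumed rather than quoted from the patch::

    if (check_direct_IO(fs_info, from, pos) < 0) {
            /* Misaligned iter, or bs > ps: take the buffered path instead. */
            return btrfs_buffered_write(iocb, from);
    }
    /* Block-aligned and bs <= ps: the direct path is allowed. */
    return btrfs_dio_write(iocb, from, 0);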
*/ -int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) { if (!extent_buffer_uptodate(eb)) return 0; @@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, ASSERT(check); found_start = btrfs_header_bytenr(eb); - if (found_start != eb->start) { + if (unlikely(found_start != eb->start)) { btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } - if (check_tree_block_fsid(eb)) { + if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); - if (found_level >= BTRFS_MAX_LEVEL) { + if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); @@ -404,13 +404,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); - if (!ignore_csum) { + if (unlikely(!ignore_csum)) { ret = -EUCLEAN; goto out; } } - if (found_level != check->level) { + if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); @@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, u64 objectid, gfp_t flags) { struct btrfs_root *root; - bool dummy = btrfs_is_testing(fs_info); root = kzalloc(sizeof(*root), flags); if (!root) @@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; - if (!dummy) { + if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, @@ -1047,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { ret = -EIO; goto fail; } @@ -1056,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!btrfs_is_testing(fs_info) && - btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && - btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && - btrfs_root_id(root) != btrfs_header_owner(root->node)) { + if (unlikely(!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node))) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", btrfs_root_id(root), root->node->start, @@ -1248,6 +1247,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) if (fs_info->fs_devices) btrfs_close_devices(fs_info->fs_devices); + btrfs_free_compress_wsm(fs_info); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -1958,7 +1958,7 @@ static int 
btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); @@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); - if (fs_devices->rw_devices == 0) { + if (unlikely(fs_devices->rw_devices == 0)) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_put_root(log_tree_root); return ret; } - if (!extent_buffer_uptodate(log_tree_root->node)) { + if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; @@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); + btrfs_put_root(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - btrfs_put_root(log_tree_root); return ret; } @@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, const u32 sectorsize = btrfs_super_sectorsize(sb); u32 sys_array_size = btrfs_super_sys_array_size(sb); - if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) { btrfs_err(fs_info, "system chunk array too big %u > %u", sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); return -EUCLEAN; @@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); len = sizeof(*disk_key); - if (cur + len > sys_array_size) + if (unlikely(cur + len > sys_array_size)) goto short_read; cur += len; btrfs_disk_key_to_cpu(&key, disk_key); - if (key.type != BTRFS_CHUNK_ITEM_KEY) { + if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", key.type, cur); @@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, } chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); num_stripes = btrfs_stack_chunk_num_stripes(chunk); - if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)) goto short_read; type = btrfs_stack_chunk_type(chunk); - if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) { btrfs_err(fs_info, "invalid chunk type %llu in sys_array at offset %u", type, cur); @@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } - /* - * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. - * - * For 4K page sized systems with non-debug builds, all 3 matches (4K). - * For 4K page sized systems with debug builds, there are two block sizes - * supported. (4K and 2K) - * - * We can support 16K sectorsize with 64K page size without problem, - * but such sectorsize/pagesize combination doesn't make much sense. - * 4K will be our future standard, PAGE_SIZE is supported from the very - * beginning. 
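The btrfs_replay_log() hunk above moves btrfs_put_root() in front of the error check so the log tree root reference is dropped exactly once on both the success and failure paths; previously the success path relied on btrfs_recover_log_trees() freeing it. The resulting shape, condensed::

    ret = btrfs_recover_log_trees(log_tree_root);
    /* Drop our reference unconditionally, both branches below need it gone. */
    btrfs_put_root(log_tree_root);
    if (ret) {
            btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree");
            return ret;
    }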
- */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && - sectorsize != PAGE_SIZE && - sectorsize != BTRFS_MIN_BLOCKSIZE)) { + if (!btrfs_supported_blocksize(sectorsize)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2619,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { + if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } - if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", @@ -2655,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev root->node = NULL; return ret; } - if (!extent_buffer_uptodate(root->node)) { + if (unlikely(!extent_buffer_uptodate(root->node))) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -3256,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) } /* - * Subpage runtime limitation on v1 cache. + * Subpage/bs > ps runtime limitation on v1 cache. * - * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } + if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_err(fs_info, + "RAID56 is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -3396,10 +3388,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT); + fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; + if (fs_info->sectorsize > PAGE_SIZE) + btrfs_warn(fs_info, + "support for block size %u with page size %zu is experimental, some features may be missing", + fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. 
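The two fields added in open_ctree() above pin down the folio order window for bs > ps: block_min_order is the smallest folio order that can hold one block, and block_max_order caps a folio at BITS_PER_LONG blocks, presumably so per-folio block bitmaps keep fitting in a single unsigned long. Worked numbers for the experimental 16K-block-size-on-4K-pages case (an assumed configuration, shown only to make the arithmetic concrete)::

    /* sectorsize = 16K, PAGE_SIZE = 4K (PAGE_SHIFT = 12), BITS_PER_LONG = 64 */
    block_min_order = ilog2(round_up(SZ_16K, SZ_4K) >> PAGE_SHIFT);
                   /* = ilog2(16K / 4K) = ilog2(4) = 2, i.e. 16K folios minimum */
    block_max_order = ilog2((BITS_PER_LONG << ilog2(SZ_16K)) >> PAGE_SHIFT);
                   /* = ilog2((64 * 16K) / 4K) = ilog2(256) = 8, i.e. 1M folios at most */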
@@ -3421,6 +3419,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + ret = btrfs_alloc_compress_wsm(fs_info); + if (ret) + goto fail_sb_buffer; ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3468,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_dev->bdev) { + if (unlikely(!fs_devices->latest_dev->bdev)) { btrfs_err(fs_info, "failed to read devices"); ret = -EIO; goto fail_tree_roots; @@ -3962,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * Checks last_flush_error of disks in order to determine the device * state. */ - if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) return -EIO; return 0; @@ -4064,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); - if (ret < 0) { + if (unlikely(ret < 0)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); @@ -4075,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (ret) total_errors++; } - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4100,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - if (total_errors > max_errors) { + if (unlikely(total_errors > max_errors)) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); @@ -4880,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 864a55a96226e7..57920f2c6fe4ef 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 7fc8a3200b4005..d062ac521051b8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset of -1 found, there would have to exist an * inode with such number or a root with such id. 
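Several hunks in this series, including btrfs_init_root_free_objectid() and btrfs_get_parent() above, now treat an exact match on a search key with offset (u64)-1 as corruption: no legitimate item can carry that offset, so btrfs_search_slot() returning 0 means the tree is damaged, not that the lookup succeeded. The shared shape of the check, reduced to a sketch::

    key.offset = (u64)-1;

    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
            return ret;
    if (unlikely(ret == 0)) {
            /*
             * An exact match on offset -1 cannot exist on a valid
             * filesystem, so report corruption instead of success.
             */
            return -EUCLEAN;
    }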
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 66361325f6dcea..bb2ca1c9c7b026 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1237,7 +1237,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, state = next_search_state(inserted_state, end); /* * If there's a next state, whether contiguous or not, we don't - * need to unlock and start search agian. If it's not contiguous + * need to unlock and start search again. If it's not contiguous * we will end up here and try to allocate a prealloc state and insert. */ if (state) @@ -1664,7 +1664,7 @@ void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, */ u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, + u32 bits, bool contig, struct extent_state **cached_state) { struct extent_state *state = NULL; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 36facca379738b..6f07b965e8da52 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -163,7 +163,7 @@ void __cold btrfs_extent_state_free_cachep(void); u64 btrfs_count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, + u64 max_bytes, u32 bits, bool contig, struct extent_state **cached_state); void btrfs_free_extent_state(struct extent_state *state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97d517cdf2df75..dc4ca98c37800a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -325,7 +325,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -879,7 +879,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr += btrfs_extent_inline_ref_size(type); continue; } - if (type == BTRFS_REF_TYPE_INVALID) { + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) { ret = -EUCLEAN; goto out; } @@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * We're adding refs to a tree block we already own, this * should not happen at all. */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", @@ -2157,7 +2157,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. 
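btrfs_count_range_bits() above switches its contig parameter from int to bool, matching how callers actually use it: true asks for the length of the first contiguous run of the given bits starting at *start, false sums every matching range up to max_bytes. A hedged usage sketch (the inode and range here are illustrative)::

    u64 start = 0;
    u64 bytes;

    /* Sum all delalloc bytes in the first 1MiB, contiguous or not. */
    bytes = btrfs_count_range_bits(&inode->io_tree, &start, SZ_1M - 1,
                                   SZ_1M, EXTENT_DELALLOC, false, NULL);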
@@ -2457,7 +2457,7 @@ int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + bool full_backref, bool inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; @@ -2543,15 +2543,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, true); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, bool full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, false); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) @@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); - if (cache == NULL) { + if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. */ ret = -EUCLEAN; goto out; @@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - if (!is_data && refs_to_drop != 1) { + if (unlikely(!is_data && refs_to_drop != 1)) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); @@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (!found_extent) { - if (iref) { + if (unlikely(iref)) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); @@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - if (item_size < sizeof(*ei) + 
sizeof(*bi)) { + if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) { abort_and_dump(trans, path, "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, @@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { + if (unlikely(refs < refs_to_drop)) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); @@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - if (!found_extent) { + if (unlikely(!found_extent)) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); @@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* In this branch refs == 1 */ if (found_extent) { - if (is_data && refs_to_drop != - extent_data_ref_count(path, iref)) { + if (unlikely(is_data && refs_to_drop != + extent_data_ref_count(path, iref))) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), @@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } if (iref) { - if (path->slots[0] != extent_slot) { + if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, @@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ - if (path->slots[0] != extent_slot + 1) { + if (unlikely(path->slots[0] != extent_slot + 1)) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); @@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); @@ -4315,12 +4316,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* - * No lock is OK here because avail is monotinically + * No lock is OK here because avail is monotonically * decreasing, and this is just a hint. 
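A large share of this series wraps should-never-happen error branches in unlikely(), which both documents intent and keeps the common path straight-line for the compiler; the annotated branches are consistently the ones that end in a transaction abort or a corruption error, not ordinary recoverable failures. Reduced to its core, using the refs underflow check from __btrfs_free_extent() above::

    refs = btrfs_extent_refs(leaf, ei);
    if (unlikely(refs < refs_to_drop)) {
            /* Corruption path: cold by definition, so hint the compiler. */
            abort_and_dump(trans, path,
    "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
                           refs_to_drop, refs, bytenr, path->slots[0]);
            ret = -EUCLEAN;
            goto out;
    }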
*/ u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; @@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - return prepare_allocation_zoned(fs_info, ffe_ctl); + return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); default: BUG(); } @@ -5061,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - if (check_eb_lock_owner(buf)) { + if (unlikely(check_eb_lock_owner(buf))) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -5470,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5582,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - if (btrfs_buffer_uptodate(next, generation, 0)) + if (btrfs_buffer_uptodate(next, generation, false)) return 0; check.level = level - 1; @@ -5611,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current - * reference, skipping it if we dropped it from a previous incompleted drop, or + * reference, skipping it if we dropped it from a previous uncompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -5636,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); - if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { + if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } @@ -5758,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* * We have to walk down into this node, and if we're currently at the - * DROP_REFERNCE stage and this block is shared then we need to switch + * DROP_REFERENCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. 
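prepare_allocation_zoned() above now receives the target space_info and ignores active block groups that belong to a different one, so a hint for, say, a data allocation can no longer point at a metadata block group. The amended hint loop, condensed from the hunk (the closing unlock is implied by the surrounding code)::

    struct btrfs_block_group *block_group;

    spin_lock(&fs_info->zone_active_bgs_lock);
    list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
            /* Unlocked read is fine: avail only shrinks, and this is a hint. */
            u64 avail = block_group->zone_capacity - block_group->alloc_offset;

            if (block_group_bits(block_group, ffe_ctl->flags) &&
                block_group->space_info == space_info &&
                avail >= ffe_ctl->num_bytes) {
                    ffe_ctl->hint_byte = block_group->start;
                    break;
            }
    }
    spin_unlock(&fs_info->zone_active_bgs_lock);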
*/ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { @@ -5772,7 +5774,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, level--; ASSERT(level == btrfs_header_level(next)); - if (level != btrfs_header_level(next)) { + if (unlikely(level != btrfs_header_level(next))) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; @@ -5883,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, } } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -5908,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (btrfs_root_id(root) != btrfs_header_owner(eb)) + else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb))) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (btrfs_root_id(root) != - btrfs_header_owner(path->nodes[level + 1])) + else if (unlikely(btrfs_root_id(root) != + btrfs_header_owner(path->nodes[level + 1]))) goto owner_mismatch; } @@ -6049,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * - * If called with for_reloc == 0, may exit early with -EAGAIN + * If called with for_reloc set, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; @@ -6178,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) while (1) { ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); break; } @@ -6211,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6247,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -6255,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 82d3a82dc712a4..e970ac42a871ad 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct 
extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, bool full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); @@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); +int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a01..c123a3ef154ae5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,26 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; + /* + * For data read bios, we attempt to optimize csum lookups if the extent + * generation is older than the current one. To make this possible, we + * need to track the maximum generation of an extent in a bio_ctrl to + * make the decision when submitting the bio. + * + * The pattern between do_readpage(), submit_one_bio() and + * submit_extent_folio() is quite subtle, so tracking this is tricky. + * + * As we process extent E, we might submit a bio with existing built up + * extents before adding E to a new bio, or we might just add E to the + * bio. As a result, E's generation could apply to the current bio or + * to the next one, so we need to be careful to update the bio_ctrl's + * generation with E's only when we are sure E is added to bio_ctrl->bbio + * in submit_extent_folio(). + * + * See the comment in btrfs_lookup_bio_sums() for more detail on the + * need for this optimization. + */ + u64 generation; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; @@ -111,8 +131,46 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; +/* + * Helper to set the csum search commit root option for a bio_ctrl's bbio + * before submitting the bio. + * + * Only for use by submit_one_bio(). 
+ */ +static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) +{ + struct btrfs_bio *bbio = bio_ctrl->bbio; + + ASSERT(bbio); + + if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) + return; + + bio_ctrl->bbio->csum_search_commit_root = + (bio_ctrl->generation && + bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); +} + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; @@ -123,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); + bio_set_csum_search_commit_root(bio_ctrl); + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -131,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; + /* + * We used the generation to decide whether to look up csums in the + * commit_root or not when we called bio_set_csum_search_commit_root() + * above. Now, reset the generation for the next bio. + */ + bio_ctrl->generation = 0; } /* @@ -327,6 +393,13 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; + + /* + * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can + * return early without handling any dirty ranges. + */ + ASSERT(max_bytes >= fs_info->sectorsize); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); if (!found || delalloc_end <= *start || delalloc_start > orig_end) { @@ -352,18 +425,19 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the folioss after the folios that has start */ + /* step two, lock all the folios after the folio that has start */ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the folios are gone, lets avoid looping by - * shortening the size of the delalloc range we're searching + /* + * Some of the folios are gone, let's avoid looping by + * shortening the size of the delalloc range we're searching. */ btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_SIZE; + max_bytes = fs_info->sectorsize; loops = 1; goto again; } else { @@ -552,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Populate every free slot in a provided array with folios using GFP_NOFS.
* * @nr_folios: number of folios to allocate + * @order: the order of the folios to be allocated * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * @@ -559,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed */ -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array) { for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue; - folio_array[i] = folio_alloc(GFP_NOFS, 0); + folio_array[i] = folio_alloc(GFP_NOFS, order); if (!folio_array[i]) goto error; } @@ -573,6 +649,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array) for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) folio_put(folio_array[i]); + folio_array[i] = NULL; } return -ENOMEM; } @@ -701,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode, * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one + * @read_em_generation: generation of the extent_map we are submitting + * (only used for read) * * The will either add the page into the existing @bio_ctrl->bbio, or allocate a * new one in @bio_ctrl->bbio. - * The mirror number for this IO should already be initizlied in + * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. */ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, u64 disk_bytenr, struct folio *folio, - size_t size, unsigned long pg_offset) + size_t size, unsigned long pg_offset, + u64 read_em_generation) { struct btrfs_inode *inode = folio_to_inode(folio); loff_t file_offset = folio_pos(folio) + pg_offset; @@ -740,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + /* + * Now that the folio is definitely added to the bio, include its + * generation in the max generation calculation. + */ + bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) @@ -909,7 +994,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -942,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; u64 block_start; + u64 em_gen; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { @@ -1019,13 +1105,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). 
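Pulling the read-side pieces above together: submit_extent_folio() folds each extent map's generation into bio_ctrl->generation only once the block is definitely part of the current bio, submit_one_bio() converts the accumulated maximum into the csum_search_commit_root flag, and the field is reset afterwards so the next bio starts clean. The lifecycle, condensed from the hunks (the helper call is inlined here)::

    /* submit_extent_folio(): block is now guaranteed to be in this bio. */
    bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);

    /* submit_one_bio(): only old-generation data reads take the commit root.
     * fs_info comes from bbio->inode->root->fs_info. */
    bbio->csum_search_commit_root =
            (bio_ctrl->generation &&
             bio_ctrl->generation < btrfs_get_fs_generation(fs_info));

    /* ... bio handed off ... reset for whatever gets built next. */
    bio_ctrl->generation = 0;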
*/ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; + em_gen = em->generation; btrfs_free_extent_map(em); em = NULL; @@ -1049,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, - pg_offset); + pg_offset, em_gen); } return 0; } @@ -1238,12 +1324,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -1580,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(folio_test_writeback(folio)); submit_extent_folio(bio_ctrl, disk_bytenr, folio, - sectorsize, filepos - folio_pos(folio)); + sectorsize, filepos - folio_pos(folio), 0); return 0; } @@ -1601,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; - bool error = false; + int found_error = 0; const u64 folio_start = folio_pos(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; @@ -1665,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, */ btrfs_mark_ordered_io_finished(inode, folio, cur, fs_info->sectorsize, false); - error = true; + if (!found_error) + found_error = ret; continue; } submitted_io = true; @@ -1682,11 +1772,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case. */ - if (!submitted_io && !error) { + if (!submitted_io && !found_error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } - return ret; + return found_error; } /* @@ -2147,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * @fs_info: The fs_info for this file system. * @start: The offset of the range to start waiting on writeback. * @end: The end of the range, inclusive. This is meant to be used in - * conjuction with wait_marked_extents, so this will usually be + * conjunction with wait_marked_extents, so this will usually be * the_next_eb->start - 1. */ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, @@ -2417,7 +2507,7 @@ static int extent_write_cache_pages(struct address_space *mapping, * In above case, [32K, 96K) is asynchronously submitted * for compression, and [124K, 128K) needs to be written back. 
* - * If we didn't wait wrtiteback for page 64K, [128K, 128K) + * If we didn't wait writeback for page 64K, [128K, 128K) * won't be submitted as the page still has writeback flag * and will be skipped in the next check. * @@ -2583,7 +2673,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2591,12 +2682,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -2901,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb) { const int num_folios = num_extent_folios(eb); - /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ for (int i = 0; i < num_folios; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); @@ -3148,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, */ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { - if (!IS_ALIGNED(start, fs_info->sectorsize)) { + const u32 nodesize = fs_info->nodesize; + + if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { btrfs_err(fs_info, "bad tree block start %llu", start); return true; } - if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { + if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (fs_info->nodesize >= PAGE_SIZE && - !PAGE_ALIGNED(start)) { + if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", - start, fs_info->nodesize); + start, nodesize); return true; } - if (!IS_ALIGNED(start, fs_info->nodesize) && - !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + if (unlikely(!IS_ALIGNED(start, nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", - start, fs_info->nodesize); + start, nodesize); } return false; } @@ -3789,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, return ret; wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) return -EIO; return 0; } @@ -4465,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) return; - if (btrfs_buffer_uptodate(eb, gen, 1)) { + if (btrfs_buffer_uptodate(eb, gen, true)) { free_extent_buffer(eb); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 61130786b9a3ad..5fcbfe44218c44 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -366,7 
+366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail); -int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); +int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, + struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 57f52585a6dde9..7e38c23a0c1cb6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, - int modified) + bool modified) { refcount_inc(&em->refs); @@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode, * taken, or a reference dropped if the merge attempt was successful. */ static int add_extent_mapping(struct btrfs_inode *inode, - struct extent_map *em, int modified) + struct extent_map *em, bool modified) { struct extent_map_tree *tree = &inode->extent_tree; struct btrfs_root *root = inode->root; @@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, } static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) + u64 start, u64 len, bool strict) { struct extent_map *em; struct rb_node *rb_node; @@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, true); } /* @@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - return lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, false); } /* @@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, - int modified) + bool modified) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, em->len = end - start; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) em->offset += start_diff; - return add_extent_mapping(inode, em, 0); + return add_extent_mapping(inode, em, false); } /* @@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (em->disk_bytenr == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(inode, em, 0); + ret = add_extent_mapping(inode, em, false); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. 
It is also possible that * an overlapping map exists in the tree @@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, start, len); - if (!em) { + if (unlikely(!em)) { ret = -EIO; goto out_unlock; } @@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(inode, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, true); /* * Now we only have an extent_map at: @@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(inode, split_mid, 1); + add_extent_mapping(inode, split_mid, true); /* Once for us */ btrfs_free_extent_map(em); @@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->em_shrinker_work); + queue_work(system_dfl_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 7935586a9dbd0f..f2eaaef8422bf3 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, if (cache_end > offset) { if (offset == cache->offset) { /* - * We cached a dealloc range (found in the io tree) for + * We cached a delalloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c09fbc257634ab..a42e6d54e7cd74 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } + /* + * If we are searching for a csum of an extent from a past + * transaction, we can search in the commit root and reduce + * lock contention on the csum tree extent buffers. + * + * This is important because that lock is an rwsem which gets + * pretty heavy write load under memory pressure and sustained + * csum overwrites, unlike the commit_root_sem. (Memory pressure + * makes us writeback the nodes multiple times per transaction, + * which makes us cow them each time, taking the write lock.) + * + * Due to how rwsem is implemented, there is a possible + * priority inversion where the readers holding the lock don't + * get scheduled (say they're in a cgroup stuck in heavy reclaim) + * which then blocks writers, including transaction commit. By + * using a semaphore with fewer writers (only a commit switching + * the roots), we make this issue less likely. + * + * Note that we don't rely on btrfs_search_slot to lock the + * commit root csum. We call search_slot multiple times, which would + * create a potential race where a commit comes in between searches + * while we are not holding the commit_root_sem, and we get csums + * from across transactions. 
+ */ + if (bbio->csum_search_commit_root) { + path->search_commit_root = 1; + path->skip_locking = 1; + down_read(&fs_info->commit_root_sem); + } + while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; @@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } + if (bbio->csum_search_commit_root) + up_read(&fs_info->commit_root_sem); return ret; } @@ -743,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - char *data; - struct bvec_iter iter; - struct bio_vec bvec; + struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; int index; - unsigned int blockcount; - int i; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -767,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) shash->tfm = fs_info->csum_shash; - bio_for_each_segment(bvec, bio, iter) { - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, - bvec.bv_len + fs_info->sectorsize - - 1); - - for (i = 0; i < blockcount; i++) { - data = bvec_kmap_local(&bvec); - crypto_shash_digest(shash, - data + (i * fs_info->sectorsize), - fs_info->sectorsize, - sums->sums + index); - kunmap_local(data); - index += fs_info->csum_size; - } - + btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { + btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); + index += fs_info->csum_size; } bbio->sums = sums; @@ -993,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - if (ret && ret != -EAGAIN) { + if (unlikely(ret && ret != -EAGAIN)) { btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 204674934795cb..7efd1f8a19121f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -327,7 +327,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -426,7 +426,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -443,7 +443,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -587,21 +587,20 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ino || - key.type != BTRFS_EXTENT_DATA_KEY) { + if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { + if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (key.offset > start || extent_end < end) { + if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -676,7 
+675,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_release_path(path); goto again; } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -704,7 +703,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -712,7 +711,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (split == start) { key.offset = start; } else { - if (start != key.offset) { + if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -744,7 +743,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -762,7 +761,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -783,7 +782,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -815,7 +814,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 if (ret) return ret; folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); return -EIO; } @@ -970,7 +969,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. - * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * -EAGAIN If we can't do a nocow write because snapshotting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. @@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * got EOPNOTSUPP via prealloc then we messed up and * need to abort. 
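A note on the pervasive if (ret) -> if (unlikely(ret)) conversions in these hunks: unlikely() only feeds __builtin_expect(), letting the compiler lay the btrfs_abort_transaction() path out of line; it never changes behaviour. A self-contained illustration, with likely()/unlikely() defined as in include/linux/compiler.h and do_step() as a stand-in:

	#include <stdio.h>

	#define likely(x)	__builtin_expect(!!(x), 1)
	#define unlikely(x)	__builtin_expect(!!(x), 0)

	static int do_step(int fail) { return fail ? -5 /* -EIO */ : 0; }

	int main(void)
	{
		int ret = do_step(0);

		/* The error path is laid out cold; the fall-through stays hot. */
		if (unlikely(ret)) {
			fprintf(stderr, "aborting transaction: %d\n", ret);
			return 1;
		}
		printf("ok\n");
		return 0;
	}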
*/ - if (ret && - (ret != -EOPNOTSUPP || - (extent_info && extent_info->is_new_extent))) + if (unlikely(ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent)))) btrfs_abort_transaction(trans, ret); break; } @@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the @@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so @@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); - if (ret) { + if (unlikely(ret)) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_trans; } @@ -3345,7 +3344,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from - * the tree once they complete. With the extent maps, we mau have them + * the tree once they complete. With the extent maps, we may have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d8d1570a5c948..ab873bd6719209 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty - * of cache left then go ahead an dadd them, no sense in adding + * of cache left then go ahead and add them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { @@ -3829,7 +3829,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, /* * If we break out of trimming a bitmap prematurely, we should reset the - * trimming bit. In a rather contrieved case, it's possible to race here so + * trimming bit. 
In a rather contrived case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. * * start = start of bitmap @@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index eba7f22ae49c67..dad0b492a66351 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { DEBUG_WARN(); return -EIO; } - if (p->slots[0] == 0) { + if (unlikely(p->slots[0] == 0)) { DEBUG_WARN("no previous slot found"); return -EIO; } @@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -320,7 +320,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (!bitmap) { + if (unlikely(!bitmap)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -465,7 +465,7 @@ int 
btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, return 0; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); - if (!block_group) { + if (unlikely(!block_group)) { DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); @@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto out_clear; } ret = btrfs_global_root_insert(free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_put_root(free_space_root); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out_clear; @@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_del_root(trans, &free_space_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); btrfs_put_root(free_space_root); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); ret = clear_free_space_tree(trans, free_space_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) goto next; ret = populate_free_space_tree(trans, block_group); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, if (!path) { path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { btrfs_abort_transaction(trans, -ENOMEM); return -ENOMEM; } @@ -1430,7 +1430,7 @@ static 
int __add_block_group_free_space(struct btrfs_trans_handle *trans, } ret = add_new_free_space_info(trans, block_group, path); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, @@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, extent_count++; } - if (extent_count != expected_extent_count) { + if (unlikely(extent_count != expected_extent_count)) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index b2bb86f8d7cf0c..feb0a2faa8379b 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -54,6 +54,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void) return ARRAY_SIZE(btrfs_csums); } +/* + * We support the following block sizes for all systems: + * + * - 4K + * This is the most common block size. For PAGE SIZE > 4K cases the subpage + * mode is used. + * + * - PAGE_SIZE + * The straightforward block size to support. + * + * And extra support for the following block sizes based on the kernel config: + * + * - MIN_BLOCKSIZE + * This is either 4K (regular builds) or 2K (debug builds) + * This allows testing subpage routines on x86_64. + */ +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) +{ + /* @blocksize should be validated first. */ + ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE && + blocksize <= BTRFS_MAX_BLOCKSIZE); + + if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE) + return true; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * For bs > ps support it's done by specifying a minimal folio order + * for filemap, thus implying large data folios. + * For HIGHMEM systems, we can not always access the content of a (large) + * folio in one go, but go through them page by page. + * + * A lot of features don't implement a proper PAGE sized loop for large + * folios, this includes: + * + * - compression + * - verity + * - encoded write + * + * Considering HIGHMEM is such a pain to deal with and it's going + * to be deprecated eventually, just reject HIGHMEM && bs > ps cases. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) + return false; + return true; +#endif + return false; +} + /* * Start exclusive operation @type, return true on success. 
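To make the btrfs_supported_blocksize() rules above concrete: on a plain 4K-page kernel the always-supported set collapses to just 4K (2K as BTRFS_MIN_BLOCKSIZE on debug builds), while CONFIG_BTRFS_EXPERIMENTAL additionally admits larger power-of-two block sizes up to BTRFS_MAX_BLOCKSIZE from the fs.h hunk below, except on HIGHMEM where bs > ps stays rejected. A hedged userspace restatement of that decision table; the config macros here are stand-ins for the real kernel options:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE		4096u
	#define BTRFS_MIN_BLOCKSIZE	4096u	/* 2K on CONFIG_BTRFS_DEBUG builds */
	#define EXPERIMENTAL		1	/* CONFIG_BTRFS_EXPERIMENTAL */
	#define HIGHMEM			0	/* CONFIG_HIGHMEM */

	/* Caller is assumed to have validated: power of two, within [MIN, MAX]. */
	static bool supported_blocksize(uint32_t bs)
	{
		if (bs == PAGE_SIZE || bs == 4096u || bs == BTRFS_MIN_BLOCKSIZE)
			return true;
		if (EXPERIMENTAL) {
			/* bs > ps needs large folios, which HIGHMEM cannot map whole. */
			if (HIGHMEM && bs > PAGE_SIZE)
				return false;
			return true;
		}
		return false;
	}

	int main(void)
	{
		const uint32_t sizes[] = { 4096, 8192, 16384, 65536 };

		for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
			printf("%5u: %s\n", sizes[i],
			       supported_blocksize(sizes[i]) ? "yes" : "no");
		return 0;
	}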
*/ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8cc07cc70b1283..814bbc9417d2a2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -59,6 +59,8 @@ struct btrfs_space_info; #define BTRFS_MIN_BLOCKSIZE (SZ_4K) #endif +#define BTRFS_MAX_BLOCKSIZE (SZ_64K) + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -102,6 +104,8 @@ enum { BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, + /* Track if log replay has failed. */ + BTRFS_FS_STATE_LOG_REPLAY_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace @@ -243,6 +247,7 @@ enum { BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), + BTRFS_MOUNT_REF_TRACKER = (1ULL << 33), }; /* @@ -280,7 +285,7 @@ enum { #ifdef CONFIG_BTRFS_EXPERIMENTAL /* - * Features under developmen like Extent tree v2 support is enabled + * Features under development like Extent tree v2 support is enabled * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -303,6 +308,16 @@ enum { #define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) +enum btrfs_compression_type { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, +}; + struct btrfs_dev_replace { /* See #define above */ u64 replace_state; @@ -505,6 +520,9 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; + /* Compress related structures. */ + void *compr_wsm[BTRFS_NR_COMPRESS_TYPES]; + int compress_type; int compress_level; u32 commit_interval; @@ -809,6 +827,8 @@ struct btrfs_fs_info { u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; + u32 block_min_order; + u32 block_max_order; u32 csum_size; u32 csums_per_leaf; u32 stripesize; @@ -878,12 +898,10 @@ struct btrfs_fs_info { struct lockdep_map btrfs_trans_pending_ordered_map; struct lockdep_map btrfs_ordered_extent_map; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG spinlock_t ref_verify_lock; struct rb_root block_tree; -#endif -#ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct list_head allocated_roots; @@ -905,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } +/* Return the minimal folio size of the fs. 
*/ +static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) +{ + return 1U << (PAGE_SHIFT + fs_info->block_min_order); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -997,6 +1021,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs return folio_size(folio) >> fs_info->sectorsize_bits; } +bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, @@ -1107,9 +1132,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) #define EXPORT_FOR_TESTS -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)); } void btrfs_test_destroy_inode(struct inode *inode); @@ -1118,9 +1143,9 @@ void btrfs_test_destroy_inode(struct inode *inode); #define EXPORT_FOR_TESTS static -static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { - return 0; + return false; } #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f06cf701ae5ae0..1bd73b80f9fac8 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, name); - if (!extref) { + if (unlikely(!extref)) { btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; } @@ -627,7 +627,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (control->clear_extent_range) { ret = btrfs_inode_clear_file_extent_range(control->inode, clear_start, clear_len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -666,7 +666,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, control->ino, extent_offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -684,7 +684,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -720,7 +720,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret2; ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret2) { + if (unlikely(ret2)) { btrfs_abort_transaction(trans, ret2); ret = ret2; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7b2..ced87c9e468277 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -72,6 +72,9 @@ #include "raid-stripe-tree.h" #include "fiemap.h" +#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) +#define COW_FILE_RANGE_NO_INLINE (1UL << 1) + struct btrfs_iget_args { u64 ino; struct btrfs_root *root; @@ -367,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * Unock inode i_rwsem. + * Unlock inode i_rwsem. 
* * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -631,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -639,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -649,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); - if (ret && ret != -ENOSPC) { + if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { @@ -851,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -861,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned int poff; + unsigned int loff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -899,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work) actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; - nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); + nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); /* * we don't want to send crud past the end of i_size through @@ -956,18 +961,18 @@ static void compress_file_range(struct btrfs_work *work) /* Compression level is applied here. */ ret = btrfs_compress_folios(compress_type, compress_level, - mapping, start, folios, &nr_folios, &total_in, + inode, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* - * Zero the tail end of the last page, as we might be sending it down + * Zero the tail end of the last folio, as we might be sending it down * to disk. */ - poff = offset_in_page(total_compressed); - if (poff) - folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); + loff = (total_compressed & (min_folio_size - 1)); + if (loff) + folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); /* * Try to create an inline extent. @@ -1245,18 +1250,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. 
* - * When this function fails, it unlocks all pages except @locked_folio. + * When this function fails, it unlocks all folios except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_folio and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_folio is - * the only page handled anyway). + * unlocks all folios including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single block, so locked_folio is + * the only folio handled anyway). * - * When this function succeed and creates a normal extent, the page locking + * When this function succeeds and creates a normal extent, the folio locking * status depends on the passed in flags: * - * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_folio are unlocked. + * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. + * - Else all folios except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are cleaned up. @@ -1264,7 +1269,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, u64 *done_offset, - bool keep_locked, bool no_inline) + unsigned long flags) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1292,7 +1297,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (!no_inline) { + if (!(flags & COW_FILE_RANGE_NO_INLINE)) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); @@ -1320,7 +1325,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage. */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; /* @@ -1531,10 +1536,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, + start, cur_alloc_size, ret); return ret; } @@ -1687,7 +1693,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, while (start <= end) { ret = cow_file_range(inode, locked_folio, start, end, - &done_offset, true, false); + &done_offset, COW_FILE_RANGE_KEEP_LOCKED); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_folio, @@ -1768,9 +1774,15 @@ static int fallback_to_cow(struct btrfs_inode *inode, * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. + * + * And here we do not unlock the folio after a successful run.
+ * The folios will be unlocked after everything is finished, or by error handling. + * + * This is to ensure error handling won't need to clear dirty/ordered flags without + * a locked folio, which can race with writeback. */ - ret = cow_file_range(inode, locked_folio, start, end, NULL, false, - true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } @@ -1913,61 +1925,14 @@ static int can_nocow_file_extent(struct btrfs_path *path, return ret < 0 ? ret : can_nocow; } -/* - * Cleanup the dirty folios which will never be submitted due to error. - * - * When running a delalloc range, we may need to split the ranges (due to - * fragmentation or NOCOW). If we hit an error in the later part, we will error - * out and previously successfully executed range will never be submitted, thus - * we have to cleanup those folios by clearing their dirty flag, starting and - * finishing the writeback. - */ -static void cleanup_dirty_folios(struct btrfs_inode *inode, - struct folio *locked_folio, - u64 start, u64 end, int error) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct address_space *mapping = inode->vfs_inode.i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; - pgoff_t end_index = end >> PAGE_SHIFT; - u32 len; - - ASSERT(end + 1 - start < U32_MAX); - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(end + 1, fs_info->sectorsize)); - len = end + 1 - start; - - /* - * Handle the locked folio first. - * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. - */ - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - - for (pgoff_t index = start_index; index <= end_index; index++) { - struct folio *folio; - - /* Already handled at the beginning. */ - if (index == locked_folio->index) - continue; - folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); - /* Cache already dropped, no need to do any cleanup. 
*/ - if (IS_ERR(folio)) - continue; - btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); - folio_unlock(folio); - folio_put(folio); - } - mapping_set_error(mapping, error); -} - static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args, u64 file_pos, bool is_prealloc) { struct btrfs_ordered_extent *ordered; - u64 len = nocow_args->file_extent.num_bytes; - u64 end = file_pos + len - 1; + const u64 len = nocow_args->file_extent.num_bytes; + const u64 end = file_pos + len - 1; int ret = 0; btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); @@ -1978,8 +1943,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(em); + ret = PTR_ERR(em); + goto error; } btrfs_free_extent_map(em); } @@ -1991,8 +1956,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); - btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); - return PTR_ERR(ordered); + ret = PTR_ERR(ordered); + goto error; } if (btrfs_is_data_reloc_root(inode->root)) @@ -2004,23 +1969,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); + if (ret < 0) + goto error; extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - /* - * On error, we need to cleanup the ordered extents we created. - * - * We do not clear the folio Dirty flags because they are set and - * cleaered by the caller. - */ - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, file_pos, len); + PAGE_SET_ORDERED); + return ret; + +error: + btrfs_cleanup_ordered_extents(inode, file_pos, len); + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_err(inode->root->fs_info, + "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + file_pos, len, ret); return ret; } /* - * when nowcow writeback call back. This checks for snapshots or COW copies + * When nocow writeback calls back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing @@ -2037,13 +2009,23 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling. + * + * The same for nocow_end, it's to avoid double cleaning up the range + * already cleaned by nocow_one_range(). */ u64 cow_end = 0; + u64 nocow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; + /* The range that has ordered extent(s). */ + u64 oe_cleanup_start; + u64 oe_cleanup_len = 0; + /* The range that is untouched. 
*/ + u64 untouched_start; + u64 untouched_len = 0; /* * Normally on a zoned device we're only doing COW writes, but in case @@ -2207,8 +2189,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, &nocow_args, cur_offset, extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (ret < 0) + if (ret < 0) { + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; goto error; + } cur_offset = extent_end; } btrfs_release_path(path); @@ -2225,86 +2209,105 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, cow_start = (u64)-1; } - btrfs_free_path(path); - return 0; - -error: /* - * There are several error cases: - * - * 1) Failed without falling back to COW - * start cur_offset end - * |/////////////| | - * - * In this case, cow_start should be (u64)-1. + * Everything is finished without an error, can unlock the folios now. * - * For range [start, cur_offset) the folios are already unlocked (except - * @locked_folio), EXTENT_DELALLOC already removed. - * Need to clear the dirty flags and finish the ordered extents. - * - * 2) Failed with error before calling fallback_to_cow() - * - * start cow_start end - * |/////////////| | - * - * In this case, only @cow_start is set, @cur_offset is between - * [cow_start, end) - * - * It's mostly the same as case 1), just replace @cur_offset with - * @cow_start. - * - * 3) Failed with error from fallback_to_cow() - * - * start cow_start cow_end end - * |/////////////|-----------| | - * - * In this case, both @cow_start and @cow_end is set. - * - * For range [start, cow_start) it's the same as case 1). - * But for range [cow_start, cow_end), all the cleanup is handled by - * cow_file_range(), we should not touch anything in that range. - * - * So for all above cases, if @cow_start is set, cleanup ordered extents - * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). + * No need to touch the io tree range nor set folio ordered flag, as + * fallback_to_cow() and nocow_one_range() have already handled them. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; + extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); - if (cur_offset > start) { - btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); - cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); - } + btrfs_free_path(path); + return 0; - /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to @cow_end + 1 to skip the COW range, as - * cow_file_range() will do the proper cleanup at error. - */ - if (cow_end) - cur_offset = cow_end + 1; +error: + if (cow_start == (u64)-1) { + /* + * case a) + * start cur_offset end + * | OE cleanup | Untouched | + * + * We finished a fallback_to_cow() or nocow_one_range() call, + * but failed to check the next range. + * + * or + * start cur_offset nocow_end end + * | OE cleanup | Skip | Untouched | + * + * nocow_one_range() failed, the range [cur_offset, nocow_end] is + * already cleaned up. + */ + oe_cleanup_start = start; + oe_cleanup_len = cur_offset - start; + if (nocow_end) + untouched_start = nocow_end + 1; + else + untouched_start = cur_offset; + untouched_len = end + 1 - untouched_start; + } else if (cow_start != (u64)-1 && cow_end == 0) { + /* + * case b) + * start cow_start cur_offset end + * | OE cleanup | Untouched | + * + * We got a range that needs COW, but failed before we hit the next NOCOW range, + * thus [cow_start, cur_offset) doesn't yet have any OE.
+ */ + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_start; + untouched_len = end + 1 - untouched_start; + } else { + /* + * case c) + * start cow_start cow_end end + * | OE cleanup | Skip | Untouched | + * + * fallback_to_cow() failed, and fallback_to_cow() will do the + * cleanup for its range, we shouldn't touch the range + * [cow_start, cow_end]. + */ + ASSERT(cow_start != (u64)-1 && cow_end != 0); + oe_cleanup_start = start; + oe_cleanup_len = cow_start - start; + untouched_start = cow_end + 1; + untouched_len = end + 1 - untouched_start; + } + + if (oe_cleanup_len) { + const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; + btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); + extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, + locked_folio, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + } - /* - * We need to lock the extent here because we're clearing DELALLOC and - * we're not locked at this point. - */ - if (cur_offset < end) { + if (untouched_len) { struct extent_state *cached = NULL; + const u64 untouched_end = untouched_start + untouched_len - 1; - btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); - extent_clear_unlock_delalloc(inode, cur_offset, end, + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); + extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); - btrfs_err_rl(fs_info, - "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", - __func__, btrfs_root_id(inode->root), - btrfs_ino(inode), start, end + 1 - start, ret); + btrfs_err(fs_info, +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", + __func__, btrfs_root_id(inode->root), btrfs_ino(inode), + start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, + untouched_start, untouched_len, ret); return ret; } @@ -2349,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, - false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); return ret; } @@ -2986,7 +2988,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. - * The remaining of the range will be processed when clearning the + * The remaining of the range will be processed when clearing the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
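The three error layouts above reduce to two derived ranges: [oe_cleanup_start, +oe_cleanup_len) already has ordered extents and needs full OE cleanup, while [untouched_start, +untouched_len) was never started and only needs its delalloc/qgroup state cleared; anything in between was already cleaned by fallback_to_cow() or nocow_one_range() and must be skipped. A runnable arithmetic check of case c), with illustrative numbers (fallback_to_cow() assumed to have failed on [64K, 128K)):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const uint64_t start = 0, end = (1 << 20) - 1;	/* delalloc [0, 1M) */
		const uint64_t cow_start = 64 << 10;		/* COW range began here */
		const uint64_t cow_end = (128 << 10) - 1;	/* ... and failed here */

		/* case c): OE cleanup before the COW range, untouched after it. */
		uint64_t oe_cleanup_start = start;
		uint64_t oe_cleanup_len = cow_start - start;
		uint64_t untouched_start = cow_end + 1;
		uint64_t untouched_len = end + 1 - untouched_start;

		printf("OE cleanup: [%llu, %llu)\n",
		       (unsigned long long)oe_cleanup_start,
		       (unsigned long long)(oe_cleanup_start + oe_cleanup_len));
		printf("untouched:  [%llu, %llu)\n",
		       (unsigned long long)untouched_start,
		       (unsigned long long)(untouched_start + untouched_len));
		/* [cow_start, cow_end] is skipped: its cleanup already happened. */
		return 0;
	}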
*/ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { @@ -3102,14 +3104,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); - if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { ret = -EIO; goto out; } - if (btrfs_is_zoned(fs_info)) - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + if (ret) + goto out; if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3147,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3155,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ ASSERT(list_empty(&ordered_extent->list)); - if (!list_empty(&ordered_extent->list)) { + if (unlikely(!list_empty(&ordered_extent->list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3163,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); } @@ -3190,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3221,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); - if (ret) { /* -ENOMEM or corruption */ + if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } @@ -3327,21 +3330,47 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } +void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, + u8 *dest) +{ + struct folio *folio = page_folio(phys_to_page(paddr)); + const u32 blocksize = fs_info->sectorsize; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + /* The full block must be inside the folio. 
*/ + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + + if (folio_test_partial_kmap(folio)) { + size_t cur = paddr; + + crypto_shash_init(shash); + while (cur < paddr + blocksize) { + void *kaddr; + size_t len = min(paddr + blocksize - cur, + PAGE_SIZE - offset_in_page(cur)); + + kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); + crypto_shash_update(shash, kaddr, len); + kunmap_local(kaddr); + cur += len; + } + crypto_shash_final(shash, dest); + } else { + crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); + } +} /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. * * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, - const u8 * const csum_expected) +int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, + const u8 * const csum_expected) { - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - - shash->tfm = fs_info->csum_shash; - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - - if (memcmp(csum, csum_expected, fs_info->csum_size)) + btrfs_calculate_block_csum(fs_info, paddr, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } @@ -3360,17 +3389,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, struct bio_vec *bv) + u32 bio_offset, phys_addr_t paddr) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 blocksize = fs_info->sectorsize; + struct folio *folio; u64 file_offset = bbio->file_offset + bio_offset; - u64 end = file_offset + bv->bv_len - 1; + u64 end = file_offset + blocksize - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - void *kaddr; - - ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; @@ -3386,12 +3414,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - kaddr = bvec_kmap_local(bv); - if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { - kunmap_local(kaddr); + if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) goto zeroit; - } - kunmap_local(kaddr); return true; zeroit: @@ -3399,7 +3423,9 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memzero_bvec(bv); + folio = page_folio(phys_to_page(paddr)); + ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); + folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); return false; } @@ -3513,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -3885,10 +3911,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3920,8 +3942,6 @@ static int 
btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3953,6 +3973,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any @@ -4263,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); - if (ret) { + if (unlikely(ret)) { btrfs_crit(fs_info, "failed to delete reference to %.*s, root %llu inode %llu parent %llu", name->len, name->name, btrfs_root_id(root), ino, dir_ino); @@ -4275,7 +4300,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -4430,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4461,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_del_root_ref(trans, objectid, btrfs_root_id(root), dir_ino, &index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -4526,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
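One detail of the btrfs_calculate_block_csum() hunk above deserves a closer look: when folio_test_partial_kmap() is true (HIGHMEM), a block that is physically contiguous can still cross page boundaries, so it is hashed in chunks via init/update/final rather than one digest call, each chunk being min(bytes left in block, bytes left in the current page). The same chunking, sketched standalone with a toy hash standing in for crypto_shash:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE 4096u

	/* Toy stand-ins for crypto_shash_{init,update,final}. */
	struct hash_ctx { uint32_t acc; };
	static void hash_init(struct hash_ctx *c) { c->acc = 0; }
	static void hash_update(struct hash_ctx *c, const uint8_t *p, size_t n)
	{
		while (n--)
			c->acc = c->acc * 31 + *p++;
	}
	static uint32_t hash_final(const struct hash_ctx *c) { return c->acc; }

	/* Hash one block that may span pages: chunk at every page edge. */
	static uint32_t hash_block(const uint8_t *base, uintptr_t paddr, size_t blocksize)
	{
		struct hash_ctx c;
		uintptr_t cur = paddr;

		hash_init(&c);
		while (cur < paddr + blocksize) {
			size_t len = paddr + blocksize - cur;
			size_t in_page = PAGE_SIZE - (cur % PAGE_SIZE);

			if (len > in_page)
				len = in_page;	/* never cross the page edge */
			hash_update(&c, base + (cur - paddr), len);
			cur += len;
		}
		return hash_final(&c);
	}

	int main(void)
	{
		static uint8_t block[16384];

		memset(block, 0xab, sizeof(block));
		/* Pretend the block starts 512 bytes into a page. */
		printf("csum=%08x\n", hash_block(block, 512, sizeof(block)));
		return 0;
	}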
@@ -4557,7 +4582,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; @@ -4640,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4660,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, btrfs_root_id(dest)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4668,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4677,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(dest)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4817,7 +4842,7 @@ static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -4905,7 +4930,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e goto out; /* - * Skip the truncatioin if the range in the target block is already aligned. + * Skip the truncation if the range in the target block is already aligned. * The seemingly complex check will also handle the same block case. 
*/ if (in_head_block && !IS_ALIGNED(start, blocksize)) @@ -4961,7 +4986,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e folio_put(folio); goto again; } - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto out_unlock; } @@ -5081,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -5601,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); - if (location->type != BTRFS_INODE_ITEM_KEY && - location->type != BTRFS_ROOT_ITEM_KEY) { + if (unlikely(location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", @@ -5696,7 +5721,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); @@ -5883,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ - if (btrfs_inode_type(inode) != di_type) { + if (unlikely(btrfs_inode_type(inode) != di_type)) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->vfs_inode.i_mode, btrfs_inode_type(inode), @@ -6470,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; @@ -6477,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); - btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6524,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 
1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6601,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6621,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; } @@ -6659,7 +6694,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index) + const struct fscrypt_str *name, bool add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6692,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; - else if (ret) { + else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -6848,7 +6883,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* Link added now we update the inode item with the new link count. */ inc_nlink(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -6859,7 +6894,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * open(2) O_TMPFILE flag. */ ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -7067,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ - if (!S_ISREG(inode->vfs_inode.i_mode)) { + if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", @@ -7144,7 +7179,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, insert: ret = 0; btrfs_release_path(path); - if (em->start > start || btrfs_extent_map_end(em) <= start) { + if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) { btrfs_err(fs_info, "bad extent! 
em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -7964,7 +7999,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) @@ -7972,6 +8007,9 @@ static void init_once(void *foo) struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } void __cold btrfs_destroy_cachep(void) @@ -8173,7 +8211,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { - if (need_abort) + if (unlikely(need_abort)) btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8221,7 +8259,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8229,12 +8267,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8243,7 +8281,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8251,12 +8289,12 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8264,14 +8302,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8512,7 +8550,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8520,12 +8558,12 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8536,7 +8574,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); 
- if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8545,7 +8583,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8553,7 +8591,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8562,7 +8600,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -8576,7 +8614,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { @@ -8870,7 +8908,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); discard_new_inode(inode); @@ -8882,7 +8920,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); @@ -9095,7 +9133,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); @@ -9263,7 +9301,7 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { - if (ret > 0) { + if (unlikely(ret > 0)) { /* The extent item disappeared? */ return -EIO; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbfab..a454b5ba209750 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) { + if (unlikely(ret && ret != -ENODATA)) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); - if (ret2 < 0) + if (unlikely(ret2 < 0)) btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; @@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* ... and new_root is owned by new_inode_args.inode now. 
*/ ret = btrfs_record_root_in_trans(trans, new_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_record_new_subvolume(trans, BTRFS_I(dir)); ret = btrfs_create_new_inode(trans, &new_inode_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent, /* * Force new buffered writes to reserve space even when NOCOW is - * possible. This is to avoid later writeback (running dealloc) to + * possible. This is to avoid later writeback (running delalloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); @@ -1251,7 +1251,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, ret = btrfs_next_item(root, path); if (ret < 0) { goto out; - } else if (ret > 0) { + } else if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret && ret != -ENOENT) { + if (unlikely(ret && ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (ret < 0 && ret != -EEXIST) { + if (unlikely(ret < 0 && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; @@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4509,6 +4513,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + if (!(file->f_mode & FMODE_WRITE)) 
{ ret = -EBADF; goto out_acct; @@ -4780,14 +4790,14 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; - struct file *file; - struct btrfs_inode *inode; - struct btrfs_fs_info *fs_info; - struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; @@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - file = cmd->file; - inode = BTRFS_I(file->f_inode); - fs_info = inode->root->fs_info; - io_tree = &inode->io_tree; + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4933,9 +4944,10 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; - struct file *file; ssize_t ret; void __user *sqe_addr; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); @@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } - file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { @@ -5223,13 +5238,13 @@ long btrfs_ioctl(struct file *file, unsigned int case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, false); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 0); + return btrfs_ioctl_snap_create_v2(file, argp, false); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, true); case BTRFS_IOC_SUBVOL_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 1); + return btrfs_ioctl_snap_create_v2(file, argp, true); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a3e6d9616e60bf..0035851d72b00f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) atomic_inc(&lock->readers); /* - * Ensure the pending reader count is perceieved BEFORE this reader + * Ensure the pending reader count is perceived BEFORE this reader * goes to sleep in case of active writers. This guarantees new writers * won't be allowed and that the current reader will be woken up when * the last active writer finishes its jobs. 
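The ordering rule in the btrfs_drew_read_lock() comment above is a general pattern: each side publishes its own counter with a full barrier before sampling the other side's counter, so at least one of the two is guaranteed to observe the conflict. Below is a minimal userspace sketch of that pairing using C11 seq_cst atomics in place of the kernel's barriers; the demo_* names are made up for illustration and are not the btrfs API:

#include <stdatomic.h>
#include <stdbool.h>

struct demo_drew_lock {
	atomic_int readers;
	atomic_int writers;
};

static void demo_drew_read_lock(struct demo_drew_lock *lock)
{
	/*
	 * A seq_cst RMW acts as a full barrier: our reader increment is
	 * globally visible before we sample lock->writers below.
	 */
	atomic_fetch_add(&lock->readers, 1);

	while (atomic_load(&lock->writers) > 0)
		;	/* the real code sleeps on a waitqueue instead */
}

static bool demo_drew_try_write_lock(struct demo_drew_lock *lock)
{
	/* Mirror image: publish the writer before checking for readers. */
	atomic_fetch_add(&lock->writers, 1);
	if (atomic_load(&lock->readers) > 0) {
		/* A reader got in first, back off and let it run. */
		atomic_fetch_sub(&lock->writers, 1);
		return false;
	}
	return true;
}

Because each increment is a full barrier, a reader and a writer can never both miss each other's count, which is exactly the guarantee the comment relies on.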
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index af29df98ac1454..a4673e7d95d705 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -74,7 +74,7 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NEW_ROOT, /* - * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over * the limit. As of this writing we're limited to 8, and we're * definitely using 8, hence this check to keep us from messing up in diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d403641889caf3..4758f66da449c0 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -58,9 +58,6 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ -#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) -#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) - struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -68,7 +65,14 @@ struct workspace { struct list_head list; }; -static struct workspace_manager wsm; +static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} +static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info) +{ + return lzo1x_worst_compress(fs_info->sectorsize); +} void lzo_free_workspace(struct list_head *ws) { @@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(void) +struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info) { struct workspace *workspace; @@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf) * * Will allocate new pages when needed. 
*/ -static int copy_compressed_data_to_page(char *compressed_data, +static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, + char *compressed_data, size_t compressed_size, struct folio **out_folios, unsigned long max_nr_folio, - u32 *cur_out, - const u32 sectorsize) + u32 *cur_out) { + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 sector_bytes_left; u32 orig_out; struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; /* @@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } - kaddr = kmap_local_folio(cur_folio, 0); - write_compress_length(kaddr + offset_in_page(*cur_out), - compressed_size); + kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); + write_compress_length(kaddr, compressed_size); *cur_out += LZO_LEN; orig_out = *cur_out; @@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_folio) + if ((*cur_out >> min_folio_shift) >= max_nr_folio) return -E2BIG; - cur_folio = out_folios[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out >> min_folio_shift]; /* Allocate a new page */ if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(); + cur_folio = btrfs_alloc_compr_folio(fs_info); if (!cur_folio) return -ENOMEM; - out_folios[*cur_out / PAGE_SIZE] = cur_folio; + out_folios[*cur_out >> min_folio_shift] = cur_folio; } kaddr = kmap_local_folio(cur_folio, 0); - memcpy(kaddr + offset_in_page(*cur_out), + memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), compressed_data + *cur_out - orig_out, copy_len); *cur_out += copy_len; @@ -209,12 +214,15 @@ static int copy_compressed_data_to_page(char *compressed_data, return 0; } -int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, +int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + struct address_space *mapping = inode->vfs_inode.i_mapping; struct folio *folio_in = NULL; char *sizes_ptr; const unsigned long max_nr_folio = *out_folios; @@ -263,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, folios, max_nr_folio, - &cur_out, sectorsize); + &cur_out); if (ret < 0) goto out; @@ -280,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - /* Check if we have reached page boundary */ - if 
(PAGE_ALIGNED(cur_in)) { + /* Check if we have reached folio boundary. */ + if (IS_ALIGNED(cur_in, min_folio_size)) { folio_put(folio_in); folio_in = NULL; } @@ -298,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, out: if (folio_in) folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); + *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); return ret; } @@ -310,15 +318,16 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, static void copy_compressed_segment(struct compressed_bio *cb, char *dest, u32 len, u32 *cur_in) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio; - u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), - orig_in + len - *cur_in); + struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; + u32 copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); - cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, offset_in_folio(cur_folio, *cur_in), copy_len); @@ -332,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -378,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); - seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; - if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) { + if (unlikely(seg_len > workspace_cbuf_length(fs_info))) { struct btrfs_inode *inode = cb->bbio.inode; /* @@ -445,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; - size_t max_segment_len = WORKSPACE_BUF_LENGTH; + size_t max_segment_len = workspace_buf_length(fs_info); int ret = 0; - if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; in_len = read_compress_length(data_in); - if (in_len != srclen) + if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (in_len != srclen - LZO_LEN * 2) { + if (unlikely(in_len != srclen - LZO_LEN * 2)) { ret = -EUCLEAN; goto out; } @@ -487,8 +497,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_lzo_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_lzo_compress = { .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 363fd28c026880..a0cf8effe008e1 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -18,6 
+18,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O', [BTRFS_FS_STATE_DEV_REPLACING] = 'R', [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 022ebc89af8550..4416c165644fa4 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -3,7 +3,6 @@ #ifndef BTRFS_MESSAGES_H #define BTRFS_MESSAGES_H -#include #include #include #include diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index ff5eac84d819d8..60f9b000d644bb 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. @@ -20,6 +21,54 @@ name = (1U << __ ## name ## _BIT), \ __ ## name ## _SEQ = __ ## name ## _BIT +static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) +{ + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + return bvec_phys(&bv); +} + +/* + * Iterate bio using btrfs block size. + * + * This will handle large folio and highmem. + * + * @paddr: Physical memory address of each iteration + * @bio: The bio to iterate + * @iter: The bvec_iter (pointer) to use. + * @blocksize: The blocksize to iterate. + * + * This requires all folios in the bio to cover at least one block. + */ +#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \ + for (; (iter)->bi_size && \ + (paddr = bio_iter_phys((bio), (iter)), 1); \ + bio_advance_iter_single((bio), (iter), (blocksize))) + +/* Initialize a bvec_iter to the size of the specified bio. */ +static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +{ + struct bio_vec *bvec; + u32 bio_size = 0; + int i; + + bio_for_each_bvec_all(bvec, bio, i) + bio_size += bvec->bv_len; + + return (struct bvec_iter) { + .bi_sector = 0, + .bi_size = bio_size, + .bi_idx = 0, + .bi_bvec_done = 0, + }; +} + +#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \ + for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \ + (iter).bi_size && \ + (paddr = bio_iter_phys((bio), &(iter)), 1); \ + bio_advance_iter_single((bio), &(iter), (blocksize))) + static inline void cond_wake_up(struct wait_queue_head *wq) { /* diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 74e38da9bd39cd..62b993fae54ff3 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -6,12 +6,19 @@ #include "messages.h" #include "ctree.h" #include "disk-io.h" +#include "file-item.h" #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" #include "volumes.h" #include "raid-stripe-tree.h" +/* + * Large enough buffer size for the stringification of any key type yet short + * enough to use the stack and avoid allocations. 
+ */ +#define KEY_TYPE_BUF_SIZE 32 + struct root_name_map { u64 id; const char *name; @@ -227,21 +234,209 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) #endif } +static void print_timespec(const struct extent_buffer *eb, + struct btrfs_timespec *timespec, + const char *prefix, const char *suffix) +{ + const u64 secs = btrfs_timespec_sec(eb, timespec); + const u32 nsecs = btrfs_timespec_nsec(eb, timespec); + + pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix); +} + +static void print_inode_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + + pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n", + btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii), + btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii)); + pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n", + btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii), + btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii), + btrfs_inode_gid(eb, ii)); + pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n", + btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii), + btrfs_inode_flags(eb, ii)); + print_timespec(eb, &ii->atime, "\t\tatime ", "\n"); + print_timespec(eb, &ii->ctime, "\t\tctime ", "\n"); + print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n"); + print_timespec(eb, &ii->otime, "\t\totime ", "\n"); +} + +static void print_dir_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item); + u32 cur = 0; + + while (cur < size) { + const u32 name_len = btrfs_dir_name_len(eb, di); + const u32 data_len = btrfs_dir_data_len(eb, di); + const u32 len = sizeof(*di) + name_len + data_len; + struct btrfs_key location; + + btrfs_dir_item_key_to_cpu(eb, di, &location); + pr_info("\t\tlocation key (%llu %u %llu) type %d\n", + location.objectid, location.type, location.offset, + btrfs_dir_ftype(eb, di)); + pr_info("\t\ttransid %llu data_len %u name_len %u\n", + btrfs_dir_transid(eb, di), data_len, name_len); + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + } +} + +static void print_inode_ref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref); + u32 cur = 0; + + while (cur < size) { + const u64 index = btrfs_inode_ref_index(eb, ref); + const u32 name_len = btrfs_inode_ref_name_len(eb, ref); + const u32 len = sizeof(*ref) + name_len; + + pr_info("\t\tindex %llu name_len %u\n", index, name_len); + ref = (struct btrfs_inode_ref *)((char *)ref + len); + cur += len; + } +} + +static void print_inode_extref_item(const struct extent_buffer *eb, int i) +{ + const u32 size = btrfs_item_size(eb, i); + struct btrfs_inode_extref *extref; + u32 cur = 0; + + extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref); + while (cur < size) { + const u64 index = btrfs_inode_extref_index(eb, extref); + const u32 name_len = btrfs_inode_extref_name_len(eb, extref); + const u64 parent = btrfs_inode_extref_parent(eb, extref); + const u32 len = sizeof(*extref) + name_len; + + pr_info("\t\tindex %llu parent %llu name_len %u\n", + index, parent, name_len); + extref = (struct btrfs_inode_extref *)((char *)extref + len); + cur += len; + } +} + +static void print_dir_log_index_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_dir_log_item *dlog; + + dlog = btrfs_item_ptr(eb, i, struct 
btrfs_dir_log_item); + pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog)); +} + +static void print_extent_csum(const struct extent_buffer *eb, int i) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 size = btrfs_item_size(eb, i); + const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize; + struct btrfs_key key; + + btrfs_item_key_to_cpu(eb, &key, i); + pr_info("\t\trange start %llu end %llu length %u\n", + key.offset, key.offset + csum_bytes, csum_bytes); +} + +static void print_file_extent_item(const struct extent_buffer *eb, int i) +{ + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + pr_info("\t\tgeneration %llu type %hhu\n", + btrfs_file_extent_generation(eb, fi), + btrfs_file_extent_type(eb, fi)); + + if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) { + pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n", + btrfs_file_extent_inline_item_len(eb, i), + btrfs_file_extent_ram_bytes(eb, fi), + btrfs_file_extent_compression(eb, fi)); + return; + } + + pr_info("\t\textent data disk bytenr %llu nr %llu\n", + btrfs_file_extent_disk_bytenr(eb, fi), + btrfs_file_extent_disk_num_bytes(eb, fi)); + pr_info("\t\textent data offset %llu nr %llu ram %llu\n", + btrfs_file_extent_offset(eb, fi), + btrfs_file_extent_num_bytes(eb, fi), + btrfs_file_extent_ram_bytes(eb, fi)); + pr_info("\t\textent compression %hhu\n", + btrfs_file_extent_compression(eb, fi)); +} + +static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size) +{ + static const char *key_to_str[256] = { + [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM", + [BTRFS_INODE_REF_KEY] = "INODE_REF", + [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF", + [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM", + [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX", + [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM", + [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX", + [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM", + [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM", + [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM", + [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM", + [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM", + [BTRFS_ROOT_REF_KEY] = "ROOT_REF", + [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF", + [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM", + [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM", + [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF", + [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF", + [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF", + [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF", + [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF", + [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM", + [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA", + [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM", + [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO", + [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT", + [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP", + [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM", + [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM", + [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT", + [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM", + [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE", + [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM", + [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS", + [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION", + [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO", + [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT", + [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM", + [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL", + [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL", + [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE", + }; + + if 
(key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) + scnprintf(buf, buf_size, "UNTYPED"); + else if (key_to_str[key->type]) + scnprintf(buf, buf_size, key_to_str[key->type]); + else + scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); +} + void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; u32 type, nr; struct btrfs_root_item *ri; - struct btrfs_dir_item *di; - struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; - struct btrfs_file_extent_item *fi; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; struct btrfs_dev_extent *dev_extent; struct btrfs_key key; - struct btrfs_key found_key; if (!l) return; @@ -255,25 +450,35 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { + char key_buf[KEY_TYPE_BUF_SIZE]; + btrfs_item_key_to_cpu(l, &key, i); type = key.type; - pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n", - i, key.objectid, type, key.offset, + key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE); + + pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n", + i, key.objectid, key_buf, key.offset, btrfs_item_offset(l, i), btrfs_item_size(l, i)); switch (type) { case BTRFS_INODE_ITEM_KEY: - ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - pr_info("\t\tinode generation %llu size %llu mode %o\n", - btrfs_inode_generation(l, ii), - btrfs_inode_size(l, ii), - btrfs_inode_mode(l, ii)); + print_inode_item(l, i); + break; + case BTRFS_INODE_REF_KEY: + print_inode_ref_item(l, i); + break; + case BTRFS_INODE_EXTREF_KEY: + print_inode_extref_item(l, i); break; case BTRFS_DIR_ITEM_KEY: - di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu flags %u\n", - found_key.objectid, - btrfs_dir_flags(l, di)); + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + print_dir_item(l, i); + break; + case BTRFS_DIR_LOG_INDEX_KEY: + print_dir_log_index_item(l, i); + break; + case BTRFS_EXTENT_CSUM_KEY: + print_extent_csum(l, i); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); @@ -303,24 +508,7 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_shared_data_ref_count(l, sref)); break; case BTRFS_EXTENT_DATA_KEY: - fi = btrfs_item_ptr(l, i, - struct btrfs_file_extent_item); - pr_info("\t\tgeneration %llu type %hhu\n", - btrfs_file_extent_generation(l, fi), - btrfs_file_extent_type(l, fi)); - if (btrfs_file_extent_type(l, fi) == - BTRFS_FILE_EXTENT_INLINE) { - pr_info("\t\tinline extent data size %llu\n", - btrfs_file_extent_ram_bytes(l, fi)); - break; - } - pr_info("\t\textent data disk bytenr %llu nr %llu\n", - btrfs_file_extent_disk_bytenr(l, fi), - btrfs_file_extent_disk_num_bytes(l, fi)); - pr_info("\t\textent data offset %llu nr %llu ram %llu\n", - btrfs_file_extent_offset(l, fi), - btrfs_file_extent_num_bytes(l, fi), - btrfs_file_extent_ram_bytes(l, fi)); + print_file_extent_item(l, i); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ccaa9a3cf1ce37..1175b8192cd7de 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_root; @@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info 
*fs_info, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); if (ret > 0) goto out_add_root; - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, /* We should not have a stray @prealloc pointer. */ ASSERT(prealloc == NULL); prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); - if (!prealloc) { + if (unlikely(!prealloc)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, ret = add_qgroup_item(trans, quota_root, found_key.offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } ret = btrfs_search_slot_for_read(tree_root, &found_key, path, 1, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, } } ret = btrfs_next_item(tree_root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, out_add_root: btrfs_release_path(path); ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1190,7 +1190,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_root(trans, &quota_root->root_key); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); @@ -2424,9 +2426,9 @@ static int
qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || - root_level < cur_level) { + if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || + root_level < cur_level)) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", __func__, cur_level, root_level); @@ -2442,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, * dst_path->nodes[root_level] must be initialized before * calling this function. */ - if (cur_level == root_level) { + if (unlikely(cur_level == root_level)) { btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", __func__, root_level, root_level, cur_level); @@ -2528,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return 0; /* Wrong parameter order */ - if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { + if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, btrfs_header_generation(src_eb), @@ -2536,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, return -EUCLEAN; } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) { ret = -EIO; goto out; } @@ -2727,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head) */ static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups, - u64 seq, int update_old) + u64 seq, bool update_old) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -4708,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > - btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot))) { btrfs_err_rl(fs_info, "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", __func__, @@ -4841,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, reloc_eb = NULL; goto free_out; } - if (!extent_buffer_uptodate(reloc_eb)) { + if (unlikely(!extent_buffer_uptodate(reloc_eb))) { ret = -EIO; goto free_out; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cab0b291088c67..cc6f6095cc9fd0 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *stripe_root = fs_info->stripe_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; u64 found_start; @@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, struct btrfs_stripe_extent *stripe_extent, const size_t item_size) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); 
struct extent_buffer *leaf; int ret; int slot; @@ -288,7 +287,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_free_path(path); return ret; } @@ -306,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); - if (!stripe_extent) { + if (unlikely(!stripe_extent)) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; @@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, struct btrfs_stripe_extent *stripe_extent; struct btrfs_key stripe_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; const u64 end = logical + *length; int num_stripes; @@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); if (ret < 0) - goto free_path; + return ret; if (ret) { if (path->slots[0] != 0) path->slots[0]--; @@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, stripe->physical, devid); - ret = 0; - goto free_path; + return 0; } /* If we're here, we haven't found the requested devid in the stripe. */ @@ -474,8 +471,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); } -free_path: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3ff2bedfb3a4c9..0135dceb7baaa0 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* Check if we have reached tolerance early.
*/ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; } @@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; struct bvec_iter iter = bio->bi_iter; + phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - while (iter.bi_size) { + btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { unsigned int index = (offset >> sectorsize_bits); struct sector_ptr *sector = &rbio->bio_sectors[index]; - struct bio_vec bv = bio_iter_iovec(bio, iter); sector->has_paddr = true; - sector->paddr = bvec_phys(&bv); - bio_advance_iter_single(bio, &iter, sectorsize); + sector->paddr = paddr; offset += sectorsize; } } @@ -1511,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + const u32 blocksize = rbio->bioc->fs_info->sectorsize; + phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct sector_ptr *sector; - phys_addr_t paddr = bvec_phys(bvec); + btrfs_bio_for_each_block_all(paddr, bio, blocksize) { + struct sector_ptr *sector = find_stripe_sector(rbio, paddr); - for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { - sector = find_stripe_sector(rbio, paddr + off); - ASSERT(sector); - if (sector) - sector->uptodate = 1; - } + ASSERT(sector); + if (sector) + sector->uptodate = 1; } } @@ -1573,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1584,27 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - bio_for_each_segment_all(bvec, bio, iter_all) { - void *kaddr; - - kaddr = bvec_kmap_local(bvec); - for (u32 off = 0; off < bvec->bv_len; - off += fs_info->sectorsize, total_sector_nr++) { - u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + - total_sector_nr * fs_info->csum_size; - int ret; + btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + int ret; - /* No csum for this sector, skip to the next sector. */ - if (!test_bit(total_sector_nr, rbio->csum_bitmap)) - continue; + /* No csum for this sector, skip to the next sector. 
*/ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - ret = btrfs_check_sector_csum(fs_info, kaddr + off, - csum_buf, expected_csum); - if (ret < 0) - set_bit(total_sector_nr, rbio->error_bitmap); - } - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, paddr, + csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + total_sector_nr++; } } @@ -1802,7 +1788,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1824,9 +1809,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - kaddr = kmap_local_sector(sector); - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); - kunmap_local(kaddr); + ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); return ret; } @@ -1864,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, if (!found_errors) return 0; - if (found_errors > rbio->bioc->max_errors) + if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; /* @@ -2416,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2705,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; goto out; } @@ -2729,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) { + if (unlikely(dfail > rbio->bioc->max_errors - 1)) { ret = -EIO; goto out; } @@ -2746,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) { + if (unlikely(failp != rbio->scrubp)) { ret = -EIO; goto out; } @@ -2837,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); - if (found_errors > rbio->bioc->max_errors) { + if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; } @@ -2861,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * - * Unfortunately here we have to do page copy, other than reusing the pages. + * Unfortunately here we have to do folio copy, rather than reusing the pages. * This is due to the fact rbio has its own page management for its cache.
*/ -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical) +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical) { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; - const int page_index = offset_in_full_stripe >> PAGE_SHIFT; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + unsigned int findex = 0; + unsigned int foffset = 0; int ret; + /* We shouldn't hit RAID56 for bs > ps cases for now. */ + ASSERT(fs_info->sectorsize <= PAGE_SIZE); + /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do @@ -2890,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); - for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { - struct page *dst = rbio->stripe_pages[page_nr + page_index]; - struct page *src = data_pages[page_nr]; + for (unsigned int cur_off = offset_in_full_stripe; + cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; + cur_off += PAGE_SIZE) { + const unsigned int pindex = cur_off >> PAGE_SHIFT; + void *kaddr; + + kaddr = kmap_local_page(rbio->stripe_pages[pindex]); + memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); + kunmap_local(kaddr); - memcpy_page(dst, 0, src, 0, PAGE_SIZE); - for (int sector_nr = sectors_per_page * page_index; - sector_nr < sectors_per_page * (page_index + 1); - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + foffset += PAGE_SIZE; + ASSERT(foffset <= folio_size(data_folios[findex])); + if (foffset == folio_size(data_folios[findex])) { + findex++; + foffset = 0; + } } + for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; + sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0d7b4c2fb6ae80..84c4d1d29c7a88 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, - struct page **data_pages, u64 data_logical); +void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, + struct folio **data_folios, u64 data_logical); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 3871c3a6c743b5..de4cb0f3fbd046 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; @@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + extent_root = 
btrfs_extent_root(fs_info, 0); + /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */ + if (IS_ERR(extent_root)) { + btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - extent_root = btrfs_extent_root(fs_info, 0); eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; @@ -1014,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 559bd25a2b7ab1..1ce544d53cc569 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -12,7 +12,7 @@ struct btrfs_fs_info; struct btrfs_ref; -#ifdef CONFIG_BTRFS_FS_REF_VERIFY +#ifdef CONFIG_BTRFS_DEBUG #include @@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) { } -#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* CONFIG_BTRFS_DEBUG */ #endif diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ce25ab7f0e9965..5465a5eae9b2d1 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, u64 endoff, const u64 destoff, const u64 olen, - int no_time_update) + bool no_time_update) { int ret; @@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, } ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; @@ -268,12 +268,12 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -285,7 +285,7 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { @@ -337,10 +337,10 @@ static int clone_copy_inline_extent(struct btrfs_inode *inode, */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff, int no_time_update) + const u64 destoff, bool no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; @@ -611,7 +611,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, } out: - btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7256f6748c8f92..8dd8de6b9fb89e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -821,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, u64 bytenr, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct 
btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; int ret; @@ -834,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -849,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)); - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = -EINVAL; - goto out; - } + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) + return -EINVAL; *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -974,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -988,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1199,7 +1192,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1212,7 +1205,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1226,7 +1219,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(src); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1240,7 +1233,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.ref_root = btrfs_root_id(dest); btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); break; } @@ -1490,7 +1483,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) * ->reloc_root. If it fails however we must * drop the ref ourselves. 
*/ - ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); + ret2 = btrfs_drop_snapshot(reloc_root, false, true); if (ret2 < 0) { btrfs_put_root(reloc_root); if (!ret) @@ -1500,7 +1493,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) btrfs_put_root(root); } else { /* Orphan reloc tree, just clean it up */ - ret2 = btrfs_drop_snapshot(root, 0, 1); + ret2 = btrfs_drop_snapshot(root, false, true); if (ret2 < 0) { btrfs_put_root(root); if (!ret) @@ -1791,7 +1784,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); if (!err) err = ret; @@ -1960,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } - if (root->reloc_root != reloc_root) { + if (unlikely(root->reloc_root != reloc_root)) { DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", @@ -2031,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOENT); - if (next->new_bytenr) { + if (unlikely(next->new_bytenr)) { /* * We just created the reloc root, so we shouldn't have * ->new_bytenr set yet. If it is then we have multiple roots @@ -2090,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) * This can occur if we have incomplete extent refs leading all * the way up a particular path, in this case return -EUCLEAN. */ - if (!root) + if (unlikely(!root)) return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ @@ -2277,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { - if (bytenr != node->bytenr) { + if (unlikely(bytenr != node->bytenr)) { btrfs_err(root->fs_info, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, @@ -2332,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - if (ret) + if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } next: @@ -2454,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); return -EIO; } @@ -2519,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. 
*/ ASSERT(node->new_bytenr == 0); - if (node->new_bytenr) { + if (unlikely(node->new_bytenr)) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2839,7 +2832,7 @@ static int relocate_one_folio(struct reloc_control *rc, if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { ret = -EIO; goto release_folio; } @@ -3158,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -3186,7 +3179,7 @@ static int __add_tree_block(struct reloc_control *rc, path->skip_locking = 1; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && skinny) { if (path->slots[0]) { @@ -3213,14 +3206,10 @@ static int __add_tree_block(struct reloc_control *rc, "tree block extent item (%llu) is not found in extent tree", bytenr); WARN_ON(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; + return add_tree_block(rc, &key, path, blocks); } static int delete_block_group_cache(struct btrfs_block_group *block_group, @@ -3510,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; u64 flags; int ret; @@ -3679,14 +3668,13 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (ret < 0 && !err) err = ret; btrfs_free_block_rsv(fs_info, rc->block_rsv); - btrfs_free_path(path); return err; } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *item; struct extent_buffer *leaf; int ret; @@ -3697,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_inode(trans, root, path, objectid); if (ret) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -3707,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); -out: - btrfs_free_path(path); - return ret; + return 0; } static void delete_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -3738,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans, out: if (ret) btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e22e6b06927ab3..d07eab70f759d9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, * Key with offset -1 found, there would have to exist a root * with such id, but this is out of the valid range. 
*/ - if (ret == 0) { + if (unlikely(ret == 0)) { ret = -EUCLEAN; goto out; } @@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *item) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *l; int ret; int slot; @@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; @@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } l = path->nodes[0]; slot = path->slots[0]; @@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); -out: - btrfs_free_path(path); return ret; } @@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; int err = 0; @@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - btrfs_free_path(path); return err; } @@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key) { struct btrfs_root *root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) - goto out; - if (ret != 0) { + return ret; + if (unlikely(ret > 0)) /* The root must exist but we did not find it by the key. 
*/ - ret = -EUCLEAN; - goto out; - } + return -EUCLEAN; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; @@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - goto out; + return ret; } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], @@ -369,18 +361,16 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name->len) || - memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { - ret = -ENOENT; - goto out; - } + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) + return -ENOENT; + *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out; + return ret; } else { - ret = -ENOENT; - goto out; + return -ENOENT; } if (key.type == BTRFS_ROOT_BACKREF_KEY) { @@ -391,8 +381,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, goto again; } -out: - btrfs_free_path(path); return ret; } @@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root_ref *ref; struct extent_buffer *leaf; unsigned long ptr; @@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name->len); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - btrfs_free_path(path); return ret; } @@ -455,7 +442,6 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, goto again; } - btrfs_free_path(path); return 0; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6776e6ab8d1080..4691d0bdb2e86c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -113,7 +113,7 @@ enum { /* Which blocks are covered by extent items. */ scrub_bitmap_nr_has_extent = 0, - /* Which blocks are meteadata. */ + /* Which blocks are metadata. */ scrub_bitmap_nr_is_metadata, /* @@ -130,7 +130,7 @@ enum { scrub_bitmap_nr_last, }; -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) +#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. 
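The SCRUB_STRIPE_PAGES to SCRUB_STRIPE_MAX_FOLIOS rename above sizes the folio array for the worst case of order-0 folios; with a larger minimum folio order (fs_info->block_min_order), the same 64K stripe is covered by fewer, larger folios, and the lookup helpers later in this patch locate a block by shifting with min_folio_shift = PAGE_SHIFT + block_min_order. A standalone sketch of that index math follows; locate_block() and the simplified constants are hypothetical, for illustration only, not kernel code:

#include <assert.h>
#include <stdio.h>

/* Illustrative values only; BTRFS_STRIPE_LEN is 64K in the kernel. */
#define PAGE_SHIFT       12
#define BTRFS_STRIPE_LEN (64 * 1024)

/*
 * Model of the folio indexing used by the scrub stripe helpers: each
 * folio covers PAGE_SIZE << block_min_order bytes, so a byte offset in
 * the stripe maps to folio index offset >> min_folio_shift, with the
 * remainder as the offset inside that folio.
 */
static void locate_block(unsigned int offset, unsigned int block_min_order,
			 unsigned int *findex, unsigned int *foffset)
{
	const unsigned int min_folio_shift = PAGE_SHIFT + block_min_order;

	*findex = offset >> min_folio_shift;
	*foffset = offset & ((1U << min_folio_shift) - 1);
}

int main(void)
{
	unsigned int findex, foffset;

	/* With order-0 (4K) folios, a 64K stripe spans 16 of them. */
	locate_block(BTRFS_STRIPE_LEN - 1, 0, &findex, &foffset);
	assert(findex == 15);

	/* With order-2 (16K) folios, the same stripe spans only 4. */
	locate_block(BTRFS_STRIPE_LEN - 1, 2, &findex, &foffset);
	assert(findex == 3);

	printf("findex=%u foffset=%u\n", findex, foffset);
	return 0;
}

This is why the array is dimensioned with BTRFS_STRIPE_LEN / PAGE_SIZE as a maximum and why init_scrub_stripe() asserts that BTRFS_STRIPE_LEN >> min_folio_shift fits within it.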
@@ -139,7 +139,7 @@ struct scrub_stripe { struct scrub_ctx *sctx; struct btrfs_block_group *bg; - struct page *pages[SCRUB_STRIPE_PAGES]; + struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; struct scrub_sector_verification *sectors; struct btrfs_device *dev; @@ -206,7 +206,7 @@ struct scrub_ctx { ktime_t throttle_deadline; u64 throttle_sent; - int is_dev_replace; + bool is_dev_replace; u64 write_pointer; struct mutex wr_lock; @@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) if (!stripe) return; - for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { - if (stripe->pages[i]) - __free_page(stripe->pages[i]); - stripe->pages[i] = NULL; + for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { + if (stripe->folios[i]) + folio_put(stripe->folios[i]); + stripe->folios[i] = NULL; } kfree(stripe->sectors); kfree(stripe->csums); @@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe) static int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe) { + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; int ret; memset(stripe, 0, sizeof(*stripe)); @@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false); + ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); + ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, + fs_info->block_min_order, stripe->folios); if (ret < 0) goto error; @@ -446,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) } static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( - struct btrfs_fs_info *fs_info, int is_dev_replace) + struct btrfs_fs_info *fs_info, bool is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -585,7 +588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * bool is_super, u64 logical, u64 physical) { struct btrfs_fs_info *fs_info = dev->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key found_key; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -612,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); if (ret < 0) - goto out; + return; swarn.extent_item_size = found_key.offset; @@ -658,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } - -out: - btrfs_free_path(path); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -687,13 +687,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); - const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; - /* stripe->pages[] is allocated by us and no highmem is allowed. */ - ASSERT(page); - ASSERT(!PageHighMem(page)); - return page_address(page) + offset_in_page(offset); + /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + return folio_address(folio) + offset_in_folio(folio, offset); +} + +static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + u32 offset = (sector_nr << fs_info->sectorsize_bits); + const struct folio *folio = stripe->folios[offset >> min_folio_shift]; + + /* stripe->folios[] is allocated by us and no highmem is allowed. */ + ASSERT(folio); + ASSERT(!folio_test_partial_kmap(folio)); + /* And the range must be contained inside the folio. */ + ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); + return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -788,7 +805,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; @@ -833,7 +850,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) return; } - ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); + ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); if (ret < 0) { scrub_bitmap_set_bit_csum_error(stripe, sector_nr); scrub_bitmap_set_bit_error(stripe, sector_nr); @@ -1369,8 +1386,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * Slice is divided into intervals when the IO is submitted, adjust by * bwlimit and maximum of 64 intervals. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); + div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); /* Start new epoch, set deadline */ now = ktime_get(); @@ -1513,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - if (ret == 0) { + if (unlikely(ret == 0)) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. @@ -1859,6 +1875,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; @@ -1871,7 +1888,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, scrub_read_endio, stripe); bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; @@ -1970,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) * metadata, we should immediately abort. 
*/ for (int i = 0; i < nr_stripes; i++) { - if (stripe_has_metadata_error(&sctx->stripes[i])) { + if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { ret = -EIO; goto out; } @@ -2164,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, * As we may hit an empty data stripe while it's missing. */ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); - if (!bitmap_empty(&error, stripe->nr_sectors)) { + if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { btrfs_err(fs_info, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, @@ -2202,7 +2219,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - raid56_parity_cache_data_pages(rbio, stripe->pages, + raid56_parity_cache_data_folios(rbio, stripe->folios, full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); } raid56_parity_submit_scrub_rbio(rbio); @@ -2586,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->dev_root; u64 chunk_offset; @@ -2858,8 +2875,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (sctx->is_dev_replace && - atomic64_read(&dev_replace->num_write_errors) > 0) { + if (unlikely(sctx->is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0)) { ret = -EIO; break; } @@ -2872,8 +2889,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_release_path(path); } - btrfs_free_path(path); - return ret; } @@ -2889,13 +2904,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); - if (ret != 0) { + if (unlikely(ret != 0)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } - if (btrfs_super_generation(sb) != generation) { + if (unlikely(btrfs_super_generation(sb) != generation)) { btrfs_err_rl(fs_info, "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, @@ -3013,7 +3028,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace) + bool readonly, bool is_dev_replace) { struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; @@ -3065,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } mutex_lock(&fs_info->scrub_lock); - if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { + if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h index f0df597b75c7c7..aa68b6ebaf555c 100644 --- a/fs/btrfs/scrub.h +++ b/fs/btrfs/scrub.h @@ -11,7 +11,7 @@ struct btrfs_scrub_progress; int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct 
btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); + bool readonly, bool is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7664025a5af431..9230e5066fc6b7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) ret = kernel_write(filp, buf + pos, len - pos, off); if (ret < 0) return ret; - if (ret == 0) + if (unlikely(ret == 0)) return -EIO; pos += ret; } @@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, struct btrfs_inode_info *info) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_item *ii; struct btrfs_key key; @@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, if (ret) { if (ret > 0) ret = -ENOENT; - goto out; + return ret; } if (!info) - goto out; + return 0; ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino, */ info->fileattr = btrfs_inode_flags(path->nodes[0], ii); -out: - btrfs_free_path(path); - return ret; + return 0; } static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) @@ -973,13 +971,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, int resolve, + struct btrfs_key *found_key, bool resolve, iterate_inode_ref_t iterate, void *ctx) { struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; - struct btrfs_path *tmp_path; + BTRFS_PATH_AUTO_FREE(tmp_path); struct fs_path *p; u32 cur = 0; u32 total; @@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } out: - btrfs_free_path(tmp_path); fs_path_free(p); return ret; } @@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root, { int ret; struct btrfs_key key, found_key; - struct btrfs_path *p; + BTRFS_PATH_AUTO_FREE(p); p = alloc_path_for_send(); if (!p) @@ -1238,28 +1235,20 @@ static int get_inode_path(struct btrfs_root *root, ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = 1; - goto out; - } + return ret; + if (ret) + return 1; + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; - ret = iterate_inode_ref(root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path); if (ret < 0) - goto out; - ret = 0; - -out: - btrfs_free_path(p); - return ret; + return ret; + return 0; } struct backref_ctx { @@ -1389,7 +1378,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, struct backref_ctx *bctx = ctx; struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; - const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + const u64 key = leaf_bytenr >> fs_info->nodesize_bits; struct btrfs_lru_cache_entry *raw_entry; struct 
backref_cache_entry *entry; @@ -1444,7 +1433,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, if (!new_entry) return; - new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits; new_entry->entry.gen = 0; new_entry->num_roots = 0; ULIST_ITER_INIT(&uiter); @@ -1716,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root, struct fs_path *dest) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_file_extent_item *ei; u8 type; @@ -1733,21 +1722,20 @@ static int read_symlink(struct btrfs_root *root, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { + return ret; + if (unlikely(ret)) { /* * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash - * happened in between or the subvol was snapshoted in between). + * happened in between or the subvol was snapshotted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink. */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", ino, btrfs_root_id(root)); - ret = -EIO; - goto out; + return -EIO; } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1758,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", ino, btrfs_root_id(root), type); - goto out; + return ret; } compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) { @@ -1766,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root, btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d", ino, btrfs_root_id(root), compression); - goto out; + return ret; } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); } /* @@ -1787,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx, u64 ino, u64 gen, struct fs_path *dest) { - int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; char tmp[64]; int len; @@ -1811,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1823,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx, if (!sctx->parent_root) { /* unique */ - ret = 0; break; } @@ -1831,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx, path, BTRFS_FIRST_FREE_OBJECTID, &tmp_name, 0); btrfs_release_path(path); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (IS_ERR(di)) + return PTR_ERR(di); + if (di) { /* not unique, try again */ idx++; @@ -1844,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, len); - -out: - btrfs_free_path(path); - return ret; + return fs_path_add(dest, tmp, len); } enum inode_state { @@ -1960,7 +1936,7 @@ static int 
lookup_dir_item_inode(struct btrfs_root *root, int ret = 0; struct btrfs_dir_item *di; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); @@ -1968,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); - if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto out; - } + if (IS_ERR_OR_NULL(di)) + return di ? PTR_ERR(di) : -ENOENT; + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.type == BTRFS_ROOT_ITEM_KEY) { - ret = -ENOENT; - goto out; - } + if (key.type == BTRFS_ROOT_ITEM_KEY) + return -ENOENT; + *found_inode = key.objectid; -out: - btrfs_free_path(path); return ret; } @@ -1994,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int len; u64 parent_dir; @@ -2008,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); if (ret || found_key.objectid != ino || (found_key.type != BTRFS_INODE_REF_KEY && - found_key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = -ENOENT; - goto out; - } + found_key.type != BTRFS_INODE_EXTREF_KEY)) + return -ENOENT; if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -2038,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); } if (ret < 0) - goto out; + return ret; btrfs_release_path(path); if (dir_gen) { ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) - goto out; + return ret; } *dir = parent_dir; -out: - btrfs_free_path(path); return ret; } @@ -2486,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx) int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_root *parent_root = sctx->parent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -2498,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); - if (!name) { - btrfs_free_path(path); + if (!name) return -ENOMEM; - } key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; @@ -2564,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx) tlv_put_failure: out: - btrfs_free_path(path); kfree(name); return ret; } @@ -2715,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; struct btrfs_key key; int slot; @@ -2759,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: free_path_for_command(sctx, p); - btrfs_free_path(path); return ret; } @@ -2769,7 +2733,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) * processing an inode that is a directory and it just got renamed, and existing * entries in the cache may refer to inodes that have the directory in their * full path - in which case we would generate outdated paths (pre-rename) - * for the inodes that the cache 
entries point to. Instead of prunning the + * for the inodes that the cache entries point to. Instead of pruning the * cache when inserting, do it after we finish processing each inode at * finish_inode_if_needed(). */ @@ -2930,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; @@ -2970,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -3750,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key di_key; struct btrfs_dir_item *di; @@ -3771,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); - if (!di) { - ret = 0; - goto out; - } + if (!di) + return 0; /* * di_key.objectid has the number of the inode that has a dentry in the * parent directory with the same name that sctx->cur_ino is being @@ -3793,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, * that it happens after that other inode is renamed. */ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); - if (di_key.type != BTRFS_INODE_ITEM_KEY) { - ret = 0; - goto out; - } + if (di_key.type != BTRFS_INODE_ITEM_KEY) + return 0; ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) - goto out; + return ret; ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; - goto out; + return ret; } /* Different inode, no need to delay the rename of sctx->cur_ino */ - if (right_gen != left_gen) { - ret = 0; - goto out; - } + if (right_gen != left_gen) + return 0; wdm = get_waiting_dir_move(sctx, di_key.objectid); if (wdm && !wdm->orphanized) { @@ -3826,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, if (!ret) ret = 1; } -out: - btrfs_free_path(path); return ret; } @@ -3877,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root, bool free_fs_path = false; int ret = 0; int iter_ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; if (!fs_path) { @@ -3945,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root, ret = iter_ret; out: - btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); return ret; @@ -4756,8 +4708,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4768,9 +4720,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, - sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); 
if (ret < 0) return ret; @@ -4781,12 +4732,12 @@ static int record_changed_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, record_new_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, + false, record_new_ref_if_needed, sctx); if (ret < 0) return ret; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, + false, record_deleted_ref_if_needed, sctx); if (ret < 0) return ret; @@ -4803,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx, int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; iterate_inode_ref_t cb; @@ -4822,8 +4773,7 @@ } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); - ret = -EINVAL; - goto out; + return -EINVAL; } key.objectid = sctx->cmp_key->objectid; @@ -4835,15 +4785,14 @@ found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); + ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx); if (ret < 0) - goto out; + return ret; } /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } + if (iter_ret < 0) + return iter_ret; + btrfs_release_path(path); /* @@ -4851,10 +4800,7 @@ * re-creating this inode and will be rename'ing it into place once we * rename the parent directory. */ - ret = process_recorded_refs(sctx, &pending_move); -out: - btrfs_free_path(path); - return ret; + return process_recorded_refs(sctx, &pending_move); } static int send_set_xattr(struct send_ctx *sctx, @@ -5080,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -5108,7 +5054,6 @@ if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } @@ -5254,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (!folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); - if (!folio_test_uptodate(folio)) { + if (unlikely(!folio_test_uptodate(folio))) { folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", @@ -5656,7 +5601,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + /* + * Do not go through encoded read for bs > ps cases. + * + * Encoded send uses vmalloc'ed pages as its buffer, so we cannot + * ensure that every folio is large enough to contain a block.
+ */ + if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && + (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); @@ -5766,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; @@ -5804,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx) strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - btrfs_free_path(path); return ret; } @@ -5812,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct clone_root *clone_root, const u64 disk_byte, u64 data_offset, u64 offset, u64 len) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret; struct btrfs_inode_info info; @@ -5848,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) - goto out; + return ret; clone_src_i_size = info.size; /* @@ -5878,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, key.offset = clone_root->offset; ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == clone_root->ino && @@ -5899,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); if (ret < 0) - goto out; + return ret; else if (ret > 0) break; continue; @@ -5936,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, hole_len); if (ret < 0) - goto out; + return ret; len -= hole_len; if (len == 0) @@ -6007,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, slen, clone_root); if (ret < 0) - goto out; + return ret; } ret = send_extent_data(sctx, dst_path, offset + slen, @@ -6041,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, } if (ret < 0) - goto out; + return ret; len -= clone_len; if (len == 0) @@ -6072,8 +6023,6 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; -out: - btrfs_free_path(path); return ret; } @@ -6162,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, { int ret = 0; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; struct btrfs_key found_key; @@ -6188,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - if (left_type != BTRFS_FILE_EXTENT_REG) { - ret = 0; - goto out; - } + if (left_type != BTRFS_FILE_EXTENT_REG) + return 0; + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); left_len = btrfs_file_extent_num_bytes(eb, ei); left_offset = btrfs_file_extent_offset(eb, ei); @@ -6223,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, 
key.offset = ekey->offset; ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + if (ret) + return 0; /* * Handle special case where the right side has no extents at all. @@ -6236,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { + found_key.type != key.type) /* If we're a hole then just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We're now on 2a, 2b or 7. @@ -6250,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG && - right_type != BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + right_type != BTRFS_FILE_EXTENT_INLINE) + return 0; if (right_type == BTRFS_FILE_EXTENT_INLINE) { right_len = btrfs_file_extent_ram_bytes(eb, ei); @@ -6266,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len <= ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) /* If we're a hole just pretend nothing changed */ - ret = (left_disknr) ? 0 : 1; - goto out; - } + return (left_disknr ? 0 : 1); /* * We just wanted to see if when we have an inline extent, what @@ -6280,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * compressed extent representing data with a size matching * the page size (currently the same as sector size). */ - if (right_type == BTRFS_FILE_EXTENT_INLINE) { - ret = 0; - goto out; - } + if (right_type == BTRFS_FILE_EXTENT_INLINE) + return 0; right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); @@ -6303,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx, */ if (left_disknr != right_disknr || left_offset_fixed != right_offset || - left_gen != right_gen) { - ret = 0; - goto out; - } + left_gen != right_gen) + return 0; /* * Go to the next extent. 
*/ ret = btrfs_next_item(sctx->parent_root, path); if (ret < 0) - goto out; + return ret; if (!ret) { eb = path->nodes[0]; slot = path->slots[0]; @@ -6324,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx, key.offset += right_len; break; } - if (found_key.offset != key.offset + right_len) { - ret = 0; - goto out; - } + if (found_key.offset != key.offset + right_len) + return 0; + key = found_key; } @@ -6340,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx, else ret = 0; - -out: - btrfs_free_path(path); return ret; } static int get_last_extent(struct send_ctx *sctx, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = sctx->send_root; struct btrfs_key key; int ret; @@ -6364,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) key.offset = offset; ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; ret = 0; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) - goto out; + return ret; sctx->cur_inode_last_extent = btrfs_file_extent_end(path); -out: - btrfs_free_path(path); return ret; } @@ -6380,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, const u64 start, const u64 end) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = sctx->parent_root; u64 search_start = start; @@ -6395,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -6408,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6431,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx, search_start = extent_end; goto next; } - ret = 0; - goto out; + return 0; next: path->slots[0]++; } - ret = 1; -out: - btrfs_free_path(path); - return ret; + return 1; } static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, @@ -6547,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx) int ret = 0; int iter_ret = 0; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; @@ -6574,11 +6500,10 @@ static int process_all_extents(struct send_ctx *sctx) if (iter_ret < 0) ret = iter_ret; - btrfs_free_path(path); return ret; } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, +static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { @@ -6601,7 +6526,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, return ret; } -static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) { int ret = 0; struct btrfs_inode_info info; @@ -7036,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "reference"); return -EIO; } @@ -7064,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx, { 
int ret = 0; - if (sctx->cur_ino != sctx->cmp_key->objectid) { + if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) { inconsistent_snapshot_error(sctx, result, "xattr"); return -EIO; } @@ -7304,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx, */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); ASSERT(ret <= 0); - if (ret > 0) { + if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", @@ -7324,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx) struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_fs_info *fs_info = send_root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = alloc_path_for_send(); if (!path) @@ -7341,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) - goto out; + return ret; if (ret) goto out_finish; @@ -7351,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx) ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -7370,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx) btrfs_release_path(path); ret = search_key_again(sctx, send_root, path, &key); if (ret < 0) - goto out; + return ret; } else { up_read(&fs_info->commit_root_sem); } ret = btrfs_next_item(send_root, path); if (ret < 0) - goto out; + return ret; if (ret) { ret = 0; break; @@ -7385,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx) } out_finish: - ret = finish_inode_if_needed(sctx, 1); - -out: - btrfs_free_path(path); - return ret; + return finish_inode_if_needed(sctx, 1); } static int replace_node_with_clone(struct btrfs_path *path, int level) @@ -7644,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; + BTRFS_PATH_AUTO_FREE(left_path); + BTRFS_PATH_AUTO_FREE(right_path); struct btrfs_key left_key; struct btrfs_key right_key; char *tmp_buf = NULL; @@ -7918,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, out_unlock: up_read(&fs_info->commit_root_sem); out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); kvfree(tmp_buf); return ret; } @@ -7986,7 +7905,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx) } /* - * Make sure any existing dellaloc is flushed for any root used by a send + * Make sure any existing delalloc is flushed for any root used by a send * operation so that we do not miss any data and we do not race with writeback * finishing and changing a tree while send is using the tree. This could * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0481c693ac2eaf..97452fb5d29b02 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, /* * On the zoned mode, we always allocate one zone as one chunk. 
- * Returning non-zone size alingned bytes here will result in + * Returning non-zone size aligned bytes here will result in * less pressure for the async metadata reclaim process, and it * will over-commit too much leading to ENOSPC. Align down to the * zone size to avoid that. @@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * turned into error mode due to a transaction abort when flushing space * above, in that case fail with the abort error instead of returning * success to the caller if we can steal from the global rsv - this is - * just to have caller fail immeditelly instead of later when trying to + * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ if (BTRFS_FS_ERROR(fs_info)) { @@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, async_work); + queue_work(system_dfl_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, + queue_work(system_dfl_wq, &fs_info->preempt_reclaim_work); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb4f97833dc34a..5ca8d4db67220c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -690,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ - "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ blocks_per_folio, &bitmap); \ } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ee0710eb13fd0a..ad0552db7c7dcb 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -13,7 +13,7 @@ struct address_space; struct folio; /* - * Extra info for subpapge bitmap. + * Extra info for subpage bitmap. * * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. 
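The subpage.h comment corrected above describes packing the per-block uptodate/dirty/writeback/ordered bitmaps into one flat bitmap per folio. A rough standalone model of that packing, where the bit for (group, block) lands at group * blocks_per_folio + block; the names and helpers below are made up for illustration and are not the kernel's btrfs_subpage API:

#include <assert.h>
#include <limits.h>

/* One flat bit array holds all four per-block status groups back to back. */
enum subpage_group { SG_UPTODATE, SG_DIRTY, SG_WRITEBACK, SG_ORDERED, SG_COUNT };

#define BLOCKS_PER_FOLIO 16	/* e.g. a 64K folio with 4K blocks */
#define ULONG_BITS (CHAR_BIT * sizeof(unsigned long))
#define BITMAP_WORDS ((SG_COUNT * BLOCKS_PER_FOLIO + ULONG_BITS - 1) / ULONG_BITS)

static unsigned int subpage_bit(enum subpage_group group, unsigned int block)
{
	assert(block < BLOCKS_PER_FOLIO);
	return group * BLOCKS_PER_FOLIO + block;
}

static void set_block_bit(unsigned long *bits, enum subpage_group group,
			  unsigned int block)
{
	const unsigned int nr = subpage_bit(group, block);

	bits[nr / ULONG_BITS] |= 1UL << (nr % ULONG_BITS);
}

int main(void)
{
	unsigned long bits[BITMAP_WORDS] = { 0 };

	/* Mark block 3 dirty: lands at bit 1 * 16 + 3 = 19 in the flat map. */
	set_block_bit(bits, SG_DIRTY, 3);
	assert(bits[0] & (1UL << 19));
	return 0;
}

Packing all groups into one array is also what makes the bitmap-dump helper above cheap: each group is a contiguous bit range that can be printed with a single %*pbl.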
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a262b494a89f5f..d6e496436539d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -133,9 +133,8 @@ enum { Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, + Opt_ref_tracker, #endif Opt_err, }; @@ -257,8 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY + fsparam_flag("ref_tracker", Opt_ref_tracker), fsparam_flag("ref_verify", Opt_ref_verify), #endif {} @@ -276,6 +274,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; + int ret; /* * Provide the same semantics as older kernels that don't use fs @@ -294,21 +293,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -319,10 +327,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { - btrfs_err(NULL, "unrecognized compression value %s", string); - return -EINVAL; + ret = -EINVAL; + goto error; } return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; + } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -632,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } break; -#endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY case Opt_ref_verify: btrfs_set_opt(ctx->mount_opt, REF_VERIFY); break; + case Opt_ref_tracker: + btrfs_set_opt(ctx->mount_opt, REF_TRACKER); + break; #endif default: btrfs_err(NULL, "unrecognized mount option '%s'", param->key); @@ -912,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec { struct btrfs_root *root = 
fs_info->tree_root; struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key location; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; @@ -929,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { - btrfs_free_path(path); return PTR_ERR(di); } if (!di) { @@ -938,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * it's always been there, but don't freak out, just try and * mount the top-level subvolume. */ - btrfs_free_path(path); *objectid = BTRFS_FS_TREE_OBJECTID; return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); *objectid = location.objectid; return 0; } @@ -1079,7 +1089,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) @@ -1142,6 +1152,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); + if (btrfs_test_opt(info, REF_TRACKER)) + seq_puts(seq, ",ref_tracker"); seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); @@ -1268,7 +1280,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); /* - * We need to cleanup all defragable inodes if the autodefragment is + * We need to cleanup all defraggable inodes if the autodefragment is * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && @@ -2260,10 +2272,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - if (IS_ERR(device)) - ret = PTR_ERR(device); - else - ret = 0; + ret = PTR_ERR_OR_ZERO(device); break; } ret = !(device->fs_devices->num_devices == @@ -2316,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev) /* Verify the checksum. 
*/ csum_type = btrfs_super_csum_type(sb); - if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) { btrfs_err(fs_info, "csum type changed, has %u expect %u", csum_type, btrfs_super_csum_type(fs_info->super_copy)); ret = -EUCLEAN; goto out; } - if (btrfs_check_super_csum(fs_info, sb)) { + if (unlikely(btrfs_check_super_csum(fs_info, sb))) { btrfs_err(fs_info, "csum for on-disk super block no longer matches"); ret = -EUCLEAN; goto out; @@ -2335,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev) goto out; last_trans = btrfs_get_last_trans_committed(fs_info); - if (btrfs_super_generation(sb) != last_trans) { + if (unlikely(btrfs_super_generation(sb) != last_trans)) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; @@ -2472,9 +2481,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - ", ref-verify=on" -#endif #ifdef CONFIG_BLK_DEV_ZONED ", zoned=yes" #else diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d398f7a36addb..81f52c1f55ce57 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, char *buf) { ssize_t ret = 0; + bool has_output = false; - if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) - ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); - if (PAGE_SIZE > SZ_4K) - ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); - + for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) { + if (!btrfs_supported_blocksize(cur)) + continue; + if (has_output) + ret += sysfs_emit_at(buf, ret, " "); + ret += sysfs_emit_at(buf, ret, "%u", cur); + has_output = true; + } + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_sectorsizes, diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 265370e79a546d..e2248acb906b74 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) ret = simple_tests(&trans); if (!ret) { - test_msg("running delayed refs merg tests on metadata refs"); + test_msg("running delayed refs merge tests on metadata refs"); ret = merge_tests(&trans, BTRFS_REF_METADATA); } if (!ret) { - test_msg("running delayed refs merg tests on data refs"); + test_msg("running delayed refs merge tests on data refs"); ret = merge_tests(&trans, BTRFS_REF_DATA); } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 3a86534c116f2f..42af6c737c6e6f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void) /* * Test a chunk with 2 data stripes one of which * intersects the physical address of the super block - * is correctly recognised. + * is correctly recognized. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, .physical_start = SZ_64M - SZ_4M, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c5c0d9cf1a8088..89ae0c7a610aa5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | attached to transid N+1. 
| * | | * | To next stage: | - * | Until all tree blocks are super blocks are | + * | Until all tree blocks and super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V @@ -404,7 +404,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int force) + bool force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; @@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * qgroup counters could end up wrong. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } @@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct btrfs_inode *parent_inode = pending->dir; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto clear_skip_qgroup; } - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; @@ -1714,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1735,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1748,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans); - if (ret) { /* Transaction aborted */ + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1789,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); - if (ret) { + if (unlikely(ret)) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); @@ -1800,21 +1796,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_wmb(); + smp_mb__after_atomic(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } 
@@ -1826,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_root_id(parent_root), btrfs_ino(parent_inode), index, &fname.disk_name); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1841,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1864,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, &fname.disk_name, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1874,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode_fallback(trans, parent_inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1889,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); - if (ret && ret != -EEXIST) { + if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -1907,7 +1905,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, free_pending: kfree(new_root_item); pending->root_item = NULL; - btrfs_free_path(path); pending->path = NULL; return ret; @@ -2423,7 +2420,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * them. * * We needn't worry that this operation will corrupt the snapshots, - * because all the tree which are snapshoted will be forced to COW + * because all the tree which are snapshotted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); @@ -2657,9 +2654,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, 0, 0); + ret = btrfs_drop_snapshot(root, false, false); else - ret = btrfs_drop_snapshot(root, 1, 0); + ret = btrfs_drop_snapshot(root, true, false); btrfs_put_root(root); return (ret < 0) ? 0 : 1; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0f556f4de3f924..ca30b15ea45234 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, /* Only these key->types needs to be checked */ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_EXTENT_DATA_KEY); @@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, /* * For legacy root item, the members starting at generation_v2 will be * all filled with 0. - * And since we allow geneartion_v2 as 0, it will still pass the check. + * And since we allow generation_v2 as 0, it will still pass the check. 
*/ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), btrfs_item_size(leaf, slot)); @@ -1756,10 +1757,10 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (unlikely(ptr + sizeof(iref) > end)) { + if (unlikely(ptr + sizeof(*iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", - ptr, end, sizeof(iref)); + ptr, end, sizeof(*iref)); return -EUCLEAN; } @@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_inode_extref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + unsigned long end = ptr + btrfs_item_size(leaf, slot); + + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) + return -EUCLEAN; + + while (ptr < end) { + struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; + u16 namelen; + + if (unlikely(ptr + sizeof(*extref) > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu inode_extref size %zu", + ptr, end, sizeof(*extref)); + return -EUCLEAN; + } + + namelen = btrfs_inode_extref_name_len(leaf, extref); + if (unlikely(ptr + sizeof(*extref) + namelen > end)) { + inode_ref_err(leaf, slot, + "inode extref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + ptr += sizeof(*extref) + namelen; + } + return 0; +} + static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) { @@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_REF_KEY: ret = check_inode_ref(leaf, key, prev_key, slot); break; + case BTRFS_INODE_EXTREF_KEY: + ret = check_inode_extref(leaf, key, prev_key, slot); + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d5d90845ca985..6aad6b65522b21 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "file-item.h" #include "file.h" #include "orphan.h" +#include "print-tree.h" #include "tree-checker.h" #define MAX_CONFLICT_INODES 10 @@ -101,17 +102,134 @@ enum { LOG_WALK_REPLAY_ALL, }; +/* + * The walk control struct is used to pass state down the chain when processing + * the log tree. The stage field tells us which part of the log tree processing + * we are currently doing. + */ +struct walk_control { + /* + * Signal that we are freeing the metadata extents of a log tree. + * This is used at transaction commit time while freeing a log tree. + */ + bool free; + + /* + * Signal that we are pinning the metadata extents of a log tree and the + * data extents its leaves point to (if using mixed block groups). + * This happens in the first stage of log replay to ensure that during + * replay, while we are modifying subvolume trees, we don't overwrite + * the metadata extents of log trees. + */ + bool pin; + + /* What stage of the replay code we're currently in. */ + int stage; + + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY. + */ + bool ignore_cur_inode; + + /* + * The root we are currently replaying to. This is NULL for the replay + * stage LOG_WALK_PIN_ONLY. + */ + struct btrfs_root *root; + + /* The log tree we are currently processing (not NULL for any stage).
*/ + struct btrfs_root *log; + + /* The transaction handle used for replaying all log trees. */ + struct btrfs_trans_handle *trans; + + /* + * The function that gets used to process blocks we find in the tree. + * Note the extent_buffer might not be up to date when it is passed in, + * and it must be checked or read if you need the data inside it. + */ + int (*process_func)(struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level); + + /* + * The following are used only when stage is >= LOG_WALK_REPLAY_INODES + * and by the replay_one_buffer() callback. + */ + + /* The current log leaf being processed. */ + struct extent_buffer *log_leaf; + /* The key being processed of the current log leaf. */ + struct btrfs_key log_key; + /* The slot being processed of the current log leaf. */ + int log_slot; + + /* A path used for searches and modifications to subvolume trees. */ + struct btrfs_path *subvol_path; +}; + +static void do_abort_log_replay(struct walk_control *wc, const char *function, + unsigned int line, int error, const char *fmt, ...) +{ + struct btrfs_fs_info *fs_info = wc->trans->fs_info; + struct va_format vaf; + va_list args; + + /* + * Do nothing if we already aborted, to avoid dumping leaves again which + * can be verbose. Furthermore, only the first call is useful since it + * is where we have a problem. Note that we do not use the flag + * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that + * are outside of tree-log.c that can abort transactions (such as + * btrfs_add_link() for example), so if that happens we still want to + * dump all log replay specific information below. + */ + if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state)) + return; + + btrfs_abort_transaction(wc->trans, error); + + if (wc->subvol_path->nodes[0]) { + btrfs_crit(fs_info, + "subvolume (root %llu) leaf currently being processed:", + btrfs_root_id(wc->root)); + btrfs_print_leaf(wc->subvol_path->nodes[0]); + } + + if (wc->log_leaf) { + btrfs_crit(fs_info, + "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", + btrfs_root_id(wc->root), wc->log_slot, + wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + btrfs_print_leaf(wc->log_leaf); + } + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV", + function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf); + + va_end(args); +} + +/* + * Use this for aborting a transaction during log replay while we are down the + * call chain of replay_one_buffer(), so that we get a lot more useful + * information for debugging issues when compared to a plain call to + * btrfs_abort_transaction(). + */ +#define btrfs_abort_log_replay(wc, error, fmt, args...)
\ + do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args) + static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); -static int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static int link_to_fixup_dir(struct walk_control *wc, u64 objectid); +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); @@ -299,54 +417,14 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -/* - * the walk control struct is used to pass state down the chain when - * processing the log tree. The stage field tells us which part - * of the log tree processing we are currently doing. The others - * are state fields used for that specific part - */ -struct walk_control { - /* should we free the extent on disk when done? This is used - * at transaction commit time while freeing a log tree - */ - int free; - - /* pin only walk, we record which extents on disk belong to the - * log trees - */ - int pin; - - /* what stage of the replay code we're currently in */ - int stage; - - /* - * Ignore any items from the inode currently being processed. Needs - * to be set every time we find a BTRFS_INODE_ITEM_KEY. - */ - bool ignore_cur_inode; - - /* the root we are currently replaying */ - struct btrfs_root *replay_dest; - - /* the trans handle for the current replay */ - struct btrfs_trans_handle *trans; - - /* the function that gets used to process blocks we find in the - * tree. Note the extent_buffer might not be up to date when it is - * passed in, and it must be checked or read if you need the data - * inside it - */ - int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen, int level); -}; - /* * process_func used to pin down extents, write them or wait on them */ -static int process_one_buffer(struct btrfs_root *log, - struct extent_buffer *eb, +static int process_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { + struct btrfs_root *log = wc->log; + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -361,25 +439,36 @@ static int process_one_buffer(struct btrfs_root *log, }; ret = btrfs_read_extent_buffer(eb, &check); - if (ret) + if (unlikely(ret)) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; + } } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); - if (ret) + ASSERT(trans != NULL); + ret = btrfs_pin_extent_for_log_replay(trans, eb); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); return ret; + } - if (btrfs_buffer_uptodate(eb, gen, 0) && - btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { ret = btrfs_exclude_logged_extents(eb); + if (ret) + btrfs_abort_transaction(trans, ret); + } } return ret; } /* - * Item overwrite used by log replay. The given eb, slot and key all refer to - * the source data we are copying out. + * Item overwrite used by log replay. The given log tree leaf, slot and key + * from the walk_control structure all refer to the source data we are copying + * out. 
* * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be @@ -391,12 +480,10 @@ static int process_one_buffer(struct btrfs_root *log, * * If the key isn't in the destination yet, a new item is inserted. */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int overwrite_item(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; int ret; u32 item_size; u64 saved_i_size = 0; @@ -405,7 +492,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, unsigned long dst_ptr; struct extent_buffer *dst_eb; int dst_slot; - bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; + const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY); /* * This is only used during log replay, so the root is always from a @@ -416,16 +503,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, */ ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); - item_size = btrfs_item_size(eb, slot); - src_ptr = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); + src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); /* Look for the key in the destination tree. */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); return ret; + } - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; if (ret == 0) { char *src_copy; @@ -435,16 +527,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans, goto insert; if (item_size == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } src_copy = kmalloc(item_size, GFP_NOFS); if (!src_copy) { - btrfs_release_path(path); + btrfs_abort_log_replay(wc, -ENOMEM, + "failed to allocate memory for log leaf item"); return -ENOMEM; } - read_extent_buffer(eb, src_copy, src_ptr, item_size); + read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size); dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); @@ -456,7 +549,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * sync */ if (ret == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -464,7 +557,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes. 
*/ - if (inode_item) { + if (is_inode_item) { struct btrfs_inode_item *item; u64 nbytes; u32 mode; @@ -472,20 +565,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); nbytes = btrfs_inode_nbytes(dst_eb, item); - item = btrfs_item_ptr(eb, slot, + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, nbytes); + btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes); /* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } - } else if (inode_item) { + } else if (is_inode_item) { struct btrfs_inode_item *item; u32 mode; @@ -493,38 +586,41 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents. */ - item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - btrfs_set_inode_nbytes(eb, item, 0); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(wc->log_leaf, item, 0); /* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log. */ - mode = btrfs_inode_mode(eb, item); + mode = btrfs_inode_mode(wc->log_leaf, item); if (S_ISDIR(mode)) - btrfs_set_inode_size(eb, item, 0); + btrfs_set_inode_size(wc->log_leaf, item, 0); } insert: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - path->skip_release_on_error = 1; - ret = btrfs_insert_empty_item(trans, root, path, - key, item_size); - path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = 1; + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); + wc->subvol_path->skip_release_on_error = 0; - dst_eb = path->nodes[0]; - dst_slot = path->slots[0]; + dst_eb = wc->subvol_path->nodes[0]; + dst_slot = wc->subvol_path->slots[0]; /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot); if (found_size > item_size) - btrfs_truncate_item(trans, path, item_size, 1); + btrfs_truncate_item(trans, wc->subvol_path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, path, item_size - found_size); + btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item for key (%llu %u %llu)", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -538,15 +634,15 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * state of the tree found in the subvolume, and i_size is modified * as it goes */ - if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + if (is_inode_item && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item; src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) { - const u64 ino_size = btrfs_inode_size(eb, src_item); + if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) { 
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item); /* * For regular files an ino_size == 0 is used only when @@ -555,21 +651,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away. */ - if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && ino_size != 0) btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } - if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) && S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); + copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; @@ -579,7 +675,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, } /* make sure the generation is filled in */ - if (key->type == BTRFS_INODE_ITEM_KEY) { + if (is_inode_item) { struct btrfs_inode_item *dst_item; dst_item = (struct btrfs_inode_item *)dst_ptr; @@ -587,7 +683,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -618,292 +714,354 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one. */ -static noinline int replay_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_extent(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; - u64 start = key->offset; + const u64 start = wc->log_key.offset; u64 nbytes = 0; + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + u64 offset; + unsigned long dest_offset; + struct btrfs_key ins; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; - unsigned long size; int ret = 0; - item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); + item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(wc->log_leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - nbytes = btrfs_file_extent_num_bytes(eb, item); - extent_end = start + nbytes; - - /* - * We don't add to the inodes nbytes if we are prealloc or a - * hole. - */ - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) - nbytes = 0; + extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + /* Holes don't take up space. 
*/ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0) + nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_ram_bytes(eb, item); - nbytes = btrfs_file_extent_ram_bytes(eb, item); - extent_end = ALIGN(start + size, - fs_info->sectorsize); + nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item); + extent_end = ALIGN(start + nbytes, fs_info->sectorsize); } else { - btrfs_err(fs_info, - "unexpected extent type=%d root=%llu inode=%llu offset=%llu", - found_type, btrfs_root_id(root), key->objectid, key->offset); + btrfs_abort_log_replay(wc, -EUCLEAN, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), + wc->log_key.objectid, wc->log_key.offset); return -EUCLEAN; } - inode = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + inode = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to get inode %llu for root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } /* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. */ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); + ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path, + btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_file_extent_item existing; unsigned long ptr; - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + read_extent_buffer(leaf, &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item, sizeof(existing)) == 0) { - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); goto out; } } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* drop any overlapping extents */ drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; + drop_args.path = wc->subvol_path; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu range [%llu, %llu) root %llu", + wc->log_key.objectid, start, extent_end, + btrfs_root_id(root)); goto out; + } - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; - unsigned long dest_offset; - struct btrfs_key ins; - - if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) - goto update_inode; - - ret = btrfs_insert_empty_item(trans, root, path, key, - sizeof(*item)); + if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(wc); if (ret) goto out; - dest_offset = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, - (unsigned long)item, sizeof(*item)); + goto update_inode; + } - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - 
ins.type = BTRFS_EXTENT_ITEM_KEY; - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * If not an inline extent, it can only be a regular or prealloc one. + * We have checked that above and returned -EUCLEAN if not. + */ - /* - * Manually record dirty extent, as here we did a shallow - * file extent item copy and skip normal backref update, - * but modifying extent tree all by ourselves. - * So need to manually record dirty extent for qgroup, - * as the owner of the file extent changed from log tree - * (doesn't affect qgroup) to fs/file tree(affects qgroup) - */ - ret = btrfs_qgroup_trace_extent(trans, - btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item)); - if (ret < 0) - goto out; + /* A hole and NO_HOLES feature enabled, nothing else to do. */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + goto update_inode; - if (ins.objectid > 0) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, + &wc->log_key, sizeof(*item)); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to insert item with key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); + goto out; + } + dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], + wc->subvol_path->slots[0]); + copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset, + (unsigned long)item, sizeof(*item)); - /* - * is this extent already allocated in the extent - * allocation tree? If so, just add a reference - */ - ret = btrfs_lookup_data_extent(fs_info, ins.objectid, - ins.offset); - if (ret < 0) { - goto out; - } else if (ret == 0) { - struct btrfs_ref ref = { - .action = BTRFS_ADD_DELAYED_REF, - .bytenr = ins.objectid, - .num_bytes = ins.offset, - .owning_root = btrfs_root_id(root), - .ref_root = btrfs_root_id(root), - }; - btrfs_init_data_ref(&ref, key->objectid, offset, - 0, false); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) - goto out; - } else { - /* - * insert the extent pointer in the extent - * allocation tree - */ - ret = btrfs_alloc_logged_file_extent(trans, - btrfs_root_id(root), - key->objectid, offset, &ins); - if (ret) - goto out; - } - btrfs_release_path(path); + /* + * We have an explicit hole and NO_HOLES is not enabled. We have added + * the hole file extent item to the subvolume tree, so we don't have + * anything else to do other than update the file extent item range and + * update the inode item. + */ + if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) { + btrfs_release_path(wc->subvol_path); + goto update_inode; + } - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + - btrfs_file_extent_offset(eb, item); - csum_end = csum_start + - btrfs_file_extent_num_bytes(eb, item); - } + ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item); + offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item); - ret = btrfs_lookup_csums_list(root->log_root, - csum_start, csum_end - 1, - &ordered_sums, false); - if (ret < 0) - goto out; - ret = 0; - /* - * Now delete all existing cums in the csum root that - * cover our range. 
We do this because we can have an - * extent that is completely referenced by one file - * extent item and partially referenced by another - * file extent item (like after using the clone or - * extent_same ioctls). In this case if we end up doing - * the replay of the one that partially references the - * extent first, and we do not do the csum deletion - * below, we can get 2 csum items in the csum tree that - * overlap each other. For example, imagine our log has - * the two following file extent items: - * - * key (257 EXTENT_DATA 409600) - * extent data disk byte 12845056 nr 102400 - * extent data offset 20480 nr 20480 ram 102400 - * - * key (257 EXTENT_DATA 819200) - * extent data disk byte 12845056 nr 102400 - * extent data offset 0 nr 102400 ram 102400 - * - * Where the second one fully references the 100K extent - * that starts at disk byte 12845056, and the log tree - * has a single csum item that covers the entire range - * of the extent: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * - * After the first file extent item is replayed, the - * csum tree gets the following csum item: - * - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which covers the 20K sub-range starting at offset 20K - * of our extent. Now when we replay the second file - * extent item, if we do not delete existing csum items - * that cover any of its blocks, we end up getting two - * csum items in our csum tree that overlap each other: - * - * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 - * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 - * - * Which is a problem, because after this anyone trying - * to lookup up for the checksum of any block of our - * extent starting at an offset of 40K or higher, will - * end up looking at the second csum item only, which - * does not contain the checksum for any block starting - * at offset 40K or higher of our extent. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - struct btrfs_root *csum_root; - - sums = list_first_entry(&ordered_sums, - struct btrfs_ordered_sum, - list); - csum_root = btrfs_csum_root(fs_info, - sums->logical); - if (!ret) - ret = btrfs_del_csums(trans, csum_root, - sums->logical, - sums->len); - if (!ret) - ret = btrfs_csum_file_blocks(trans, - csum_root, - sums); - list_del(&sums->list); - kfree(sums); - } - if (ret) - goto out; - } else { - btrfs_release_path(path); + /* + * Manually record dirty extent, as here we did a shallow file extent + * item copy and skip normal backref update, but modifying extent tree + * all by ourselves. So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree (doesn't affect + * qgroup) to fs/file tree (affects qgroup). + */ + ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } + + /* + * Is this extent already allocated in the extent tree? + * If so, just add a reference. 
+ */ + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } else if (ret == 0) { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + + btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false); + ret = btrfs_inc_extent_ref(trans, &ref); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu", + ins.objectid, ins.offset, + wc->log_key.objectid, + btrfs_root_id(root)); + goto out; } + } else { + /* Insert the extent pointer in the extent tree. */ + ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root), + wc->log_key.objectid, offset, &ins); + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu", + ins.objectid, ins.offset, offset, + wc->log_key.objectid, btrfs_root_id(root)); + goto out; + } } + btrfs_release_path(wc->subvol_path); + + if (btrfs_file_extent_compression(wc->log_leaf, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item); + csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item); + } + + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, + &ordered_sums, false); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup csums for range [%llu, %llu) inode %llu root %llu", + csum_start, csum_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + ret = 0; + /* + * Now delete all existing csums in the csum root that cover our range. + * We do this because we can have an extent that is completely + * referenced by one file extent item and partially referenced by + * another file extent item (like after using the clone or extent_same + * ioctls). In this case if we end up doing the replay of the one that + * partially references the extent first, and we do not do the csum + * deletion below, we can get 2 csum items in the csum tree that overlap + * each other. For example, imagine our log has the two following file + * extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent that starts at + * disk byte 12845056, and the log tree has a single csum item that + * covers the entire range of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the csum tree gets the + * following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete + * existing csum items that cover any of its blocks, we end up getting + * two csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying to look up + * the checksum of any block of our extent starting at an offset of 40K + * or higher, will end up looking at the second csum item only, which + * does not contain the checksum for any block starting at offset 40K or + * higher of our extent. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + struct btrfs_root *csum_root; + + sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list); + csum_root = btrfs_csum_root(fs_info, sums->logical); + if (!ret) { + ret = btrfs_del_csums(trans, csum_root, sums->logical, + sums->len); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to delete csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + if (!ret) { + ret = btrfs_csum_file_blocks(trans, csum_root, sums); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to add csums for range [%llu, %llu) inode %llu root %llu", + sums->logical, + sums->logical + sums->len, + wc->log_key.objectid, + btrfs_root_id(root)); + } + list_del(&sums->list); + kfree(sums); + } if (ret) goto out; update_inode: + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to set file extent range [%llu, %llu) inode %llu root %llu", + start, extent_end, wc->log_key.objectid, + btrfs_root_id(root)); + goto out; + } + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); out: iput(&inode->vfs_inode); return ret; } -static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, +static int unlink_inode_for_log_replay(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { + struct btrfs_trans_handle *trans = wc->trans; int ret; ret = btrfs_unlink_inode(trans, dir, inode, name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to unlink inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); return ret; + } /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exists anymore.
*/ - return btrfs_run_delayed_items(trans); + ret = btrfs_run_delayed_items(trans); + if (ret) + btrfs_abort_log_replay(wc, ret, +"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), name->len, + name->name, btrfs_root_id(inode->root)); + + return ret; } /* @@ -914,39 +1072,44 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, * This is a helper function to do the unlink of a specific directory * item */ -static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, +static noinline int drop_one_dir_item(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; struct btrfs_inode *inode; struct fscrypt_str name; - struct extent_buffer *leaf; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; struct btrfs_key location; int ret; - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); - if (ret) - return -ENOMEM; + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); + return ret; + } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to open inode %llu parent dir %llu name %.*s root %llu", + location.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); inode = NULL; goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); out: kfree(name.name); if (inode) @@ -1013,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log, u64 ref_objectid, const struct fscrypt_str *name) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; path = btrfs_alloc_path(); @@ -1021,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log, return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret == 1) { - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret == 1) + return 0; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], @@ -1035,20 +1196,15 @@ static noinline int backref_in_log(struct btrfs_root *log, else ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); -out: - btrfs_free_path(path); return ret; } -static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *log_root, +static int unlink_refs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, struct btrfs_inode *dir, - struct btrfs_inode *inode, - u64 parent_objectid) + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; unsigned long ptr; unsigned long ptr_end; @@ -1057,8 +1213,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict. 
*/ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]); while (ptr < ptr_end) { struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; @@ -1068,22 +1224,34 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, ret = read_alloc_one_name(leaf, (victim_ref + 1), btrfs_inode_ref_name_len(leaf, victim_ref), &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); ptr = (unsigned long)(victim_ref + 1) + victim_name.len; continue; } inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1093,64 +1261,64 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_root *log_root, +static int unlink_extrefs_not_in_log(struct walk_control *wc, struct btrfs_key *search_key, - struct btrfs_inode *inode, - u64 inode_objectid, - u64 parent_objectid) + struct btrfs_inode *dir, + struct btrfs_inode *inode) { - struct extent_buffer *leaf = path->nodes[0]; - const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); - const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + struct extent_buffer *leaf = wc->subvol_path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]); u32 cur_offset = 0; while (cur_offset < item_size) { + struct btrfs_root *log_root = wc->log; struct btrfs_inode_extref *extref; - struct btrfs_inode *victim_parent; struct fscrypt_str victim_name; int ret; extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name.len = btrfs_inode_extref_name_len(leaf, extref); - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir)) goto next; ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, &victim_name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for inode %llu parent dir %llu root %llu", + btrfs_ino(inode), btrfs_ino(dir), + btrfs_root_id(inode->root)); return ret; + } - search_key->objectid = inode_objectid; + search_key->objectid = btrfs_ino(inode); search_key->type = BTRFS_INODE_EXTREF_KEY; - search_key->offset = btrfs_extref_hash(parent_objectid, + search_key->offset = btrfs_extref_hash(btrfs_ino(dir), victim_name.name, victim_name.len); - ret 
= backref_in_log(log_root, search_key, parent_objectid, &victim_name); + ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name); if (ret) { - kfree(victim_name.name); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + victim_name.len, victim_name.name, + btrfs_root_id(inode->root)); + kfree(victim_name.name); return ret; + } + kfree(victim_name.name); next: cur_offset += victim_name.len + sizeof(*extref); continue; } - victim_parent = btrfs_iget_logging(parent_objectid, root); - if (IS_ERR(victim_parent)) { - kfree(victim_name.name); - return PTR_ERR(victim_parent); - } - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - ret = unlink_inode_for_log_replay(trans, victim_parent, inode, - &victim_name); - iput(&victim_parent->vfs_inode); + ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name); kfree(victim_name.name); if (ret) return ret; @@ -1160,27 +1328,29 @@ static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, return 0; } -static inline int __add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_root *log_root, +static inline int __add_inode_ref(struct walk_control *wc, struct btrfs_inode *dir, struct btrfs_inode *inode, - u64 inode_objectid, u64 parent_objectid, u64 ref_index, struct fscrypt_str *name) { int ret; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; again: /* Search old style refs */ - search_key.objectid = inode_objectid; + search_key.objectid = btrfs_ino(inode); search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = parent_objectid; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + search_key.offset = btrfs_ino(dir); + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + search_key.objectid, search_key.type, + search_key.offset, btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1190,52 +1360,60 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, if (search_key.objectid == search_key.offset) return 1; - ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, - dir, inode, parent_objectid); + ret = unlink_refs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); + extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name, + btrfs_ino(inode), btrfs_ino(dir)); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - ret = unlink_extrefs_not_in_log(trans, path, root, log_root, - &search_key, inode, - inode_objectid, parent_objectid); + ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode); if (ret == -EAGAIN) goto again; else if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + di = btrfs_lookup_dir_index_item(trans, root, 
wc->subvol_path, btrfs_ino(dir), ref_index, name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, +"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(dir), ref_index, name->len, + name->name, btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); + di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { - return PTR_ERR(di); + ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + btrfs_ino(dir), name->len, name->name, + btrfs_root_id(root)); + return ret; } else if (di) { - ret = drop_one_dir_item(trans, path, dir, di); + ret = drop_one_dir_item(wc, dir, di); if (ret) return ret; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return 0; } @@ -1288,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys). */ -static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_inode *inode, - struct extent_buffer *log_eb, - int log_slot, - struct btrfs_key *key) +static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_root *root = wc->root; int ret; unsigned long ref_ptr; unsigned long ref_end; struct extent_buffer *eb; again: - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + btrfs_release_path(wc->subvol_path); + ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret > 0) { ret = 0; goto out; } - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search subvolume tree for key (%llu %u %llu) root %llu", + wc->log_key.objectid, wc->log_key.type, + wc->log_key.offset, btrfs_root_id(root)); goto out; + } - eb = path->nodes[0]; - ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); + eb = wc->subvol_path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]); + ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]); while (ref_ptr < ref_end) { struct fscrypt_str name; u64 parent_id; - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); + goto out; + } } else { - parent_id = key->offset; + parent_id = wc->log_key.offset; ret = ref_get_fields(eb, ref_ptr, &name, NULL); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_id %llu root %llu", + btrfs_ino(inode), parent_id, + btrfs_root_id(root)); + goto out; + } } - if (ret) - goto out; - if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot, parent_id, 
&name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); + ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot, + &name); if (!ret) { struct btrfs_inode *dir; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); dir = btrfs_iget_logging(parent_id, root); if (IS_ERR(dir)) { ret = PTR_ERR(dir); kfree(name.name); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_id, btrfs_root_id(root)); goto out; } - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); kfree(name.name); iput(&dir->vfs_inode); if (ret) @@ -1354,56 +1548,51 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, kfree(name.name); ref_ptr += name.len; - if (key->type == BTRFS_INODE_EXTREF_KEY) + if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else ref_ptr += sizeof(struct btrfs_inode_ref); } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } /* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). + * Replay one inode back reference item found in the log tree. + * Path is for temporary use by this function (it should be released on return). */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int add_inode_ref(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); + const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; int ref_struct_size; - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size(eb, slot); + ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); + ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot); if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); r = (struct btrfs_inode_extref *)ref_ptr; - parent_objectid = btrfs_inode_extref_parent(eb, r); + parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r); } else { ref_struct_size = sizeof(struct btrfs_inode_ref); - parent_objectid = key->offset; + parent_objectid = wc->log_key.offset; } - inode_objectid = key->objectid; + inode_objectid = wc->log_key.objectid; /* * it is possible that we didn't log all the parent directories @@ -1416,6 +1605,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, btrfs_root_id(root)); dir = NULL; goto out; } @@ -1423,16 +1616,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + 
inode_objectid, btrfs_root_id(root)); inode = NULL; goto out; } while (ref_ptr < ref_end) { if (is_extref_item) { - ret = extref_get_fields(eb, ref_ptr, &name, + ret = extref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index, &parent_objectid); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get extref details for inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } /* * parent object can change from one array * item to another. @@ -1457,19 +1658,35 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ ret = 0; goto next; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + parent_objectid, + btrfs_root_id(root)); } goto out; } } } else { - ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); - if (ret) + ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to get ref details for inode %llu parent_objectid %llu root %llu", + btrfs_ino(inode), + parent_objectid, + btrfs_root_id(root)); goto out; + } } - ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - ref_index, &name); + ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir), + btrfs_ino(inode), ref_index, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), btrfs_ino(dir), + ref_index, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (ret == 0) { /* @@ -1479,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, dir, inode, - inode_objectid, parent_objectid, - ref_index, &name); + ret = __add_inode_ref(wc, dir, inode, ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1490,12 +1705,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, +"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", + btrfs_ino(inode), + btrfs_ino(dir), ref_index, + name.len, name.name, + btrfs_root_id(root)); goto out; + } ret = btrfs_update_inode(trans, inode); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); goto out; + } } /* Else, ret == 1, we already have a perfect match, we're done. */ @@ -1517,14 +1744,14 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); + ret = unlink_old_inode_refs(wc, inode); if (ret) goto out; /* finally write the back reference in the inode */ - ret = overwrite_item(trans, root, path, eb, slot, key); + ret = overwrite_item(wc); out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); kfree(name.name); if (dir) iput(&dir->vfs_inode); @@ -1642,26 +1869,22 @@ static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) * number of back refs found. If it goes down to zero, the iput * will free the inode. 
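* If the resulting link count is zero and the inode is a directory, any dentries still pointing at it are dropped first (replay_dir_deletes() with del_all set), so the directory can actually be released.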
*/ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, +static noinline int fixup_inode_link_count(struct walk_control *wc, struct btrfs_inode *inode) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; int ret; u64 nlink = 0; const u64 ino = btrfs_ino(inode); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = count_inode_refs(inode, path); + ret = count_inode_refs(inode, wc->subvol_path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(inode, path); + ret = count_inode_extrefs(inode, wc->subvol_path); if (ret < 0) goto out; @@ -1680,7 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, ino, true); + ret = replay_dir_deletes(wc, ino, true); if (ret) goto out; } @@ -1690,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } out: - btrfs_free_path(path); + btrfs_release_path(wc->subvol_path); return ret; } -static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline int fixup_inode_link_counts(struct walk_control *wc) { int ret; struct btrfs_key key; @@ -1705,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_inode *inode; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1); if (ret < 0) break; if (ret == 1) { ret = 0; - if (path->slots[0] == 0) + if (wc->subvol_path->slots[0] == 0) break; - path->slots[0]--; + wc->subvol_path->slots[0]--; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]); if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, wc->subvol_path); if (ret) break; - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); break; } - ret = fixup_inode_link_count(trans, inode); + ret = fixup_inode_link_count(wc, inode); iput(&inode->vfs_inode); if (ret) break; /* * fixup on a directory may create new entries, - * make sure we always look for the highset possible + * make sure we always look for the highest possible * offset */ key.offset = (u64)-1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -1756,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, * count when replay is done. 
The link count is incremented here * so the inode won't go away until we check it */ -static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid) +static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct btrfs_key key; int ret = 0; struct btrfs_inode *inode; struct inode *vfs_inode; inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + objectid, btrfs_root_id(root)); + return ret; + } vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (ret == 0) { if (!vfs_inode->i_nlink) set_nlink(vfs_inode, 1); else inc_nlink(vfs_inode); ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + objectid, btrfs_root_id(root)); } else if (ret == -EEXIST) { ret = 0; + } else { + btrfs_abort_log_replay(wc, ret, + "failed to insert fixup item for inode %llu root %llu", + objectid, btrfs_root_id(root)); } iput(vfs_inode); @@ -1826,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return ret; } -static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, +static int delete_conflicting_dir_entry(struct walk_control *wc, struct btrfs_inode *dir, - struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, u8 log_flags, @@ -1836,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, { struct btrfs_key found_key; - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key); /* The existing dentry points to the same inode, don't delete it. */ if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) + btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1851,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (!exists) return 0; - return drop_one_dir_item(trans, path, dir, dst_di); + return drop_one_dir_item(wc, dir, dst_di); } /* @@ -1870,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. 
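* A return of 1 also tells the caller, replay_one_dir_item(), that the target inode may still need a fixup dir entry so its link count is rechecked once replay finishes.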
*/ -static noinline int replay_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, - struct btrfs_dir_item *di, - struct btrfs_key *key) +static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; @@ -1891,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = btrfs_iget_logging(key->objectid, root); - if (IS_ERR(dir)) - return PTR_ERR(dir); + dir = btrfs_iget_logging(wc->log_key.objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + wc->log_key.objectid, btrfs_root_id(root)); + return ret; + } - ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + ret = read_alloc_one_name(wc->log_leaf, di + 1, + btrfs_dir_name_len(wc->log_leaf, di), &name); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } - log_flags = btrfs_dir_flags(eb, di); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); - ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); - btrfs_release_path(path); - if (ret < 0) + log_flags = btrfs_dir_flags(wc->log_leaf, di); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key); + ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0); + btrfs_release_path(wc->subvol_path); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + log_key.objectid, btrfs_root_id(root)); goto out; + } exists = (ret == 0); ret = 0; - dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - &name, 1); + dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, + wc->log_key.objectid, &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } dir_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); - index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, key->offset, - &name, 1); + index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, + wc->log_key.objectid, + wc->log_key.offset, &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu name %.*s root %llu", + wc->log_key.objectid, name.len, name.name, + btrfs_root_id(root)); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + ret = delete_conflicting_dir_entry(wc, dir, index_dst_di, &log_key, log_flags, exists); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to 
delete conflicting entry for dir %llu name %.*s root %llu", + btrfs_ino(dir), name.len, name.name, + btrfs_root_id(root)); goto out; + } index_dst_matches = (ret == 1); } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (dir_dst_matches && index_dst_matches) { ret = 0; @@ -1951,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, */ search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = key->objectid; + search_key.offset = wc->log_key.objectid; ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1964,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); + search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len); + ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name); if (ret < 0) { + btrfs_abort_log_replay(wc, ret, +"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu", + search_key.objectid, btrfs_ino(dir), + name.len, name.name, btrfs_root_id(root)); goto out; } else if (ret) { /* The dentry will be added later. */ @@ -1974,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, update_size = false; goto out; } - btrfs_release_path(path); - ret = insert_one_name(trans, root, key->objectid, key->offset, + ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset, &name, &log_key); - if (ret && ret != -ENOENT && ret != -EEXIST) + if (ret && ret != -ENOENT && ret != -EEXIST) { + btrfs_abort_log_replay(wc, ret, + "failed to insert name %.*s for inode %llu dir %llu root %llu", + name.len, name.name, log_key.objectid, + btrfs_ino(dir), btrfs_root_id(root)); goto out; + } if (!ret) name_added = true; update_size = false; @@ -1988,6 +2263,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!ret && update_size) { btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); ret = btrfs_update_inode(trans, dir); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update dir inode %llu root %llu", + btrfs_ino(dir), btrfs_root_id(root)); } kfree(name.name); iput(&dir->vfs_inode); @@ -1997,20 +2276,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ -static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static noinline int replay_one_dir_item(struct walk_control *wc) { int ret; struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. 
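* (dir index keys use the index number as the key offset, which is unique per entry, so there are no collisions)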
*/ - ASSERT(key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - ret = replay_one_name(trans, root, path, eb, di, key); + di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); + ret = replay_one_name(wc, di); if (ret < 0) return ret; @@ -2040,17 +2315,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory as it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { - struct btrfs_path *fixup_path; + if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) { struct btrfs_key di_key; - fixup_path = btrfs_alloc_path(); - if (!fixup_path) - return -ENOMEM; - - btrfs_dir_item_key_to_cpu(eb, di, &di_key); - ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); - btrfs_free_path(fixup_path); + btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key); + ret = link_to_fixup_dir(wc, di_key.objectid); } return ret; @@ -2143,13 +2412,13 @@ static noinline int find_dir_range(struct btrfs_root *root, * item is not in the log, the item is removed and the inode it points * to is unlinked */ -static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int check_item_in_log(struct walk_control *wc, struct btrfs_path *log_path, struct btrfs_inode *dir, - struct btrfs_key *dir_key) + struct btrfs_key *dir_key, + bool force_remove) { + struct btrfs_trans_handle *trans = wc->trans; struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; @@ -2167,21 +2436,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, */ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); - eb = path->nodes[0]; - slot = path->slots[0]; + eb = wc->subvol_path->nodes[0]; + slot = wc->subvol_path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); - if (ret) + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to allocate name for dir %llu index %llu root %llu", + btrfs_ino(dir), dir_key->offset, + btrfs_root_id(root)); goto out; + } - if (log) { + if (!force_remove) { struct btrfs_dir_item *log_di; - log_di = btrfs_lookup_dir_index_item(trans, log, log_path, + log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path, dir_key->objectid, dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu", + btrfs_ino(dir), dir_key->offset, + name.len, name.name, + btrfs_root_id(root)); goto out; } else if (log_di) { /* The dentry exists in the log, we have nothing to do. 
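* Only entries missing from the log tree are removed from the subvolume tree by this function.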
*/ @@ -2191,28 +2470,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, } btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); inode = btrfs_iget_logging(location.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + location.objectid, btrfs_root_id(root)); goto out; } - ret = link_to_fixup_dir(trans, root, path, location.objectid); + ret = link_to_fixup_dir(wc, location.objectid); if (ret) goto out; inc_nlink(&inode->vfs_inode); - ret = unlink_inode_for_log_replay(trans, dir, inode, &name); + ret = unlink_inode_for_log_replay(wc, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset * (an index number), so we're done. */ out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_release_path(log_path); kfree(name.name); if (inode) @@ -2220,59 +2502,67 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, return ret; } -static int replay_xattr_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - const u64 ino) +static int replay_xattr_deletes(struct walk_control *wc) { + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_root *root = wc->root; + struct btrfs_root *log = wc->log; struct btrfs_key search_key; - struct btrfs_path *log_path; - int i; + BTRFS_PATH_AUTO_FREE(log_path); + const u64 ino = wc->log_key.objectid; int nritems; int ret; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } search_key.objectid = ino; search_key.type = BTRFS_XATTR_ITEM_KEY; search_key.offset = 0; again: - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search xattrs for inode %llu root %llu", + ino, btrfs_root_id(root)); goto out; + } process_leaf: - nritems = btrfs_header_nritems(path->nodes[0]); - for (i = path->slots[0]; i < nritems; i++) { + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + for (int i = wc->subvol_path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di; u32 total_size; u32 cur; - btrfs_item_key_to_cpu(path->nodes[0], &key, i); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { ret = 0; goto out; } - di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); - total_size = btrfs_item_size(path->nodes[0], i); + di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size(wc->subvol_path->nodes[0], i); cur = 0; while (cur < total_size) { - u16 name_len = btrfs_dir_name_len(path->nodes[0], di); - u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di); u32 this_len = sizeof(*di) + name_len + data_len; char *name; name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; + btrfs_abort_log_replay(wc, ret, + "failed to allocate memory for name of length %u", + 
name_len); goto out; } - read_extent_buffer(path->nodes[0], name, + read_extent_buffer(wc->subvol_path->nodes[0], name, (unsigned long)(di + 1), name_len); log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, @@ -2280,40 +2570,59 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, ino, + btrfs_release_path(wc->subvol_path); + di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino, name, name_len, -1); - kfree(name); if (IS_ERR(di)) { ret = PTR_ERR(di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } ASSERT(di); ret = btrfs_delete_one_dir_name(trans, root, - path, di); - if (ret) + wc->subvol_path, di); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to delete xattr with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; - btrfs_release_path(path); + } + btrfs_release_path(wc->subvol_path); + kfree(name); search_key = key; goto again; } - kfree(name); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); + btrfs_abort_log_replay(wc, ret, + "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu", + name_len, name, ino, + btrfs_root_id(root)); + kfree(name); goto out; } + kfree(name); cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } } - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(root, wc->subvol_path); if (ret > 0) ret = 0; else if (ret == 0) goto process_leaf; + else + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); out: - btrfs_free_path(log_path); - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); return ret; } @@ -2328,12 +2637,11 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, * Anything we don't find in the log is unlinked and removed from the * directory. */ -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, +static noinline int replay_dir_deletes(struct walk_control *wc, u64 dirid, bool del_all) { + struct btrfs_root *root = wc->root; + struct btrfs_root *log = (del_all ? 
NULL : wc->log); u64 range_start; u64 range_end; int ret = 0; @@ -2345,8 +2653,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; log_path = btrfs_alloc_path(); - if (!log_path) + if (!log_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } dir = btrfs_iget_logging(dirid, root); /* @@ -2358,6 +2668,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; + else + btrfs_abort_log_replay(wc, ret, + "failed to lookup dir inode %llu root %llu", + dirid, btrfs_root_id(root)); return ret; } @@ -2367,32 +2681,46 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (del_all) range_end = (u64)-1; else { - ret = find_dir_range(log, path, dirid, + ret = find_dir_range(log, wc->subvol_path, dirid, &range_start, &range_end); - if (ret < 0) + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to find range for dir %llu in log tree root %llu", + dirid, btrfs_root_id(root)); goto out; - else if (ret > 0) + } else if (ret > 0) { break; + } } dir_key.offset = range_start; while (1) { int nritems; - ret = btrfs_search_slot(NULL, root, &dir_key, path, - 0, 0); - if (ret < 0) + ret = btrfs_search_slot(NULL, root, &dir_key, + wc->subvol_path, 0, 0); + if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to search root %llu for key (%llu %u %llu)", + btrfs_root_id(root), + dir_key.objectid, dir_key.type, + dir_key.offset); goto out; + } - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret == 1) + nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]); + if (wc->subvol_path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, wc->subvol_path); + if (ret == 1) { break; - else if (ret < 0) + } else if (ret < 0) { + btrfs_abort_log_replay(wc, ret, + "failed to get next leaf in subvolume root %llu", + btrfs_root_id(root)); goto out; + } } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key, + wc->subvol_path->slots[0]); if (found_key.objectid != dirid || found_key.type != dir_key.type) { ret = 0; @@ -2402,23 +2730,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, log, path, - log_path, dir, - &found_key); + ret = check_item_in_log(wc, log_path, dir, &found_key, del_all); if (ret) goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; } - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); if (range_end == (u64)-1) break; range_start = range_end + 1; } ret = 0; out: - btrfs_release_path(path); + btrfs_release_path(wc->subvol_path); btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; @@ -2435,7 +2761,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * only in the log (references come from either directory items or inode * back refs). 
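* The work done per item depends on wc->stage: inode items are replayed during the first stage, dir index keys during the second stage, and the remaining types (refs, xattrs, file extents) during the last stage.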
*/ -static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, +static int replay_one_buffer(struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; @@ -2443,33 +2769,44 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, .transid = gen, .level = level }; - struct btrfs_path *path; - struct btrfs_root *root = wc->replay_dest; - struct btrfs_key key; - int i; + struct btrfs_root *root = wc->root; + struct btrfs_trans_handle *trans = wc->trans; int ret; - ret = btrfs_read_extent_buffer(eb, &check); - if (ret) - return ret; - - level = btrfs_header_level(eb); - if (level != 0) return 0; - path = btrfs_alloc_path(); - if (!path) + /* + * Set to NULL since it was not yet read and in case we abort log replay + * on error, we have no valid log tree leaf to dump. + */ + wc->log_leaf = NULL; + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to read log tree leaf %llu for root %llu", + eb->start, btrfs_root_id(root)); + return ret; + } + + ASSERT(wc->subvol_path == NULL); + wc->subvol_path = btrfs_alloc_path(); + if (!wc->subvol_path) { + btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path"); return -ENOMEM; + } + + wc->log_leaf = eb; nritems = btrfs_header_nritems(eb); - for (i = 0; i < nritems; i++) { + for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) { struct btrfs_inode_item *inode_item; - btrfs_item_key_to_cpu(eb, &key, i); + btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot); - if (key.type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(eb, wc->log_slot, + struct btrfs_inode_item); /* * An inode with no links is either: * @@ -2498,22 +2835,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } /* Inode keys are done during the first stage. 
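* Besides copying the inode item, this replays xattr deletes and, for directories, dir entry deletes.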
*/ - if (key.type == BTRFS_INODE_ITEM_KEY && + if (wc->log_key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { u32 mode; - ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); + ret = replay_xattr_deletes(wc); if (ret) break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, root, log, path, - key.objectid, false); + ret = replay_dir_deletes(wc, wc->log_key.objectid, false); if (ret) break; } - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + ret = overwrite_item(wc); if (ret) break; @@ -2530,9 +2865,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = btrfs_iget_logging(key.objectid, root); + inode = btrfs_iget_logging(wc->log_key.objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); + btrfs_abort_log_replay(wc, ret, + "failed to lookup inode %llu root %llu", + wc->log_key.objectid, + btrfs_root_id(root)); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2540,21 +2879,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, inode, - &drop_args); - if (!ret) { + drop_args.path = wc->subvol_path; + ret = btrfs_drop_extents(trans, root, inode, &drop_args); + if (ret) { + btrfs_abort_log_replay(wc, ret, + "failed to drop extents for inode %llu root %llu offset %llu", + btrfs_ino(inode), + btrfs_root_id(root), + from); + } else { inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, inode); + ret = btrfs_update_inode(trans, inode); + if (ret) + btrfs_abort_log_replay(wc, ret, + "failed to update inode %llu root %llu", + btrfs_ino(inode), + btrfs_root_id(root)); } iput(&inode->vfs_inode); if (ret) break; } - ret = link_to_fixup_dir(wc->trans, root, - path, key.objectid); + ret = link_to_fixup_dir(wc, wc->log_key.objectid); if (ret) break; } @@ -2562,10 +2911,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (wc->ignore_cur_inode) continue; - if (key.type == BTRFS_DIR_INDEX_KEY && + if (wc->log_key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); + ret = replay_one_dir_item(wc); if (ret) break; } @@ -2574,20 +2922,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, continue; /* these keys are simply copied */ - if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); + if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY || - key.type == BTRFS_INODE_EXTREF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_INODE_REF_KEY || + wc->log_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc); if (ret) break; - } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc->trans, root, path, - eb, i, &key); + } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc); if (ret) break; } @@ -2598,55 +2943,55 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * older kernel with such keys, ignore them. 
*/ } - btrfs_free_path(path); + btrfs_free_path(wc->subvol_path); + wc->subvol_path = NULL; return ret; } -/* - * Correctly adjust the reserved bytes occupied by a log tree extent buffer - */ -static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) -{ - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, start); - if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); - return -ENOENT; - } - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->reserved -= fs_info->nodesize; - cache->space_info->bytes_reserved -= fs_info->nodesize; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - return 0; -} - static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_block_group *bg; + btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) - return btrfs_pin_reserved_extent(trans, eb); + if (trans) { + int ret; + + ret = btrfs_pin_reserved_extent(trans, eb); + if (ret) + btrfs_abort_transaction(trans, ret); + return ret; + } + + bg = btrfs_lookup_block_group(fs_info, eb->start); + if (!bg) { + btrfs_err(fs_info, "unable to find block group for %llu", eb->start); + btrfs_handle_fs_error(fs_info, -ENOENT, NULL); + return -ENOENT; + } + + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->reserved -= fs_info->nodesize; + bg->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); - return unaccount_log_buffer(eb->fs_info, eb->start); + btrfs_put_block_group(bg); + + return 0; } -static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_down_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = wc->log->fs_info; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -2674,12 +3019,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), *level - 1); - if (IS_ERR(next)) - return PTR_ERR(next); + if (IS_ERR(next)) { + ret = PTR_ERR(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen, - *level - 1); + ret = wc->process_func(next, wc, ptr_gen, *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2690,6 +3040,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2705,6 +3059,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } @@ -2721,10 +3079,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return 0; } -static 
noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) +static noinline int walk_up_log_tree(struct btrfs_path *path, int *level, + struct walk_control *wc) { int i; int slot; @@ -2738,14 +3094,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level == 0); return 0; } else { - ret = wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); if (ret) return ret; if (wc->free) { - ret = clean_log_buffer(trans, path->nodes[*level]); + ret = clean_log_buffer(wc->trans, path->nodes[*level]); if (ret) return ret; } @@ -2762,13 +3118,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, * the tree freeing any blocks that have a ref count of zero after being * decremented. */ -static int walk_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct walk_control *wc) +static int walk_log_tree(struct walk_control *wc) { + struct btrfs_root *log = wc->log; int ret = 0; int wret; int level; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int orig_level; path = btrfs_alloc_path(); @@ -2782,36 +3138,30 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_log_tree(trans, log, path, &level, wc); + wret = walk_down_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; - wret = walk_up_log_tree(trans, log, path, &level, wc); + wret = walk_up_log_tree(path, &level, wc); if (wret > 0) break; - if (wret < 0) { - ret = wret; - goto out; - } + if (wret < 0) + return wret; } /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - ret = wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level]), orig_level); if (ret) - goto out; + return ret; if (wc->free) - ret = clean_log_buffer(trans, path->nodes[orig_level]); + ret = clean_log_buffer(wc->trans, path->nodes[orig_level]); } -out: - btrfs_free_path(path); return ret; } @@ -3220,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); mutex_unlock(&fs_info->tree_log_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; @@ -3272,12 +3622,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, { int ret; struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer + .free = true, + .process_func = process_one_buffer, + .log = log, + .trans = trans, }; if (log->node) { - ret = walk_log_tree(trans, log, &wc); + ret = walk_log_tree(&wc); if (ret) { /* * We weren't able to traverse the entire log tree, the @@ -3476,7 +3828,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans, /* * The inode was previously logged and then evicted, set logged_trans to - * the current transacion's ID, to avoid future tree searches as long as + * the current transaction's ID, to avoid future tree searches as long as * the inode is not evicted again. 
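* The update below is done while holding the inode's spinlock.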
*/ spin_lock(&inode->lock); @@ -3547,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; ret = inode_logged(trans, dir, NULL); if (ret == 0) return; - else if (ret < 0) { + if (ret < 0) { btrfs_set_log_full_commit(trans); return; } @@ -3567,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = join_running_log_trans(root); ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) - goto out; + return; mutex_lock(&dir->log_mutex); @@ -3577,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); -out: - btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3691,8 +4041,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; - ins_data = kmalloc(count * sizeof(u32) + - count * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4255,7 +4604,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only, + struct inode *inode, bool log_inode_only, u64 logged_isize) { u64 flags; @@ -4351,7 +4700,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, - 0, 0); + false, 0); btrfs_release_path(path); return 0; } @@ -4455,8 +4804,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src = src_path->nodes[0]; - ins_data = kmalloc(nr * sizeof(struct btrfs_key) + - nr * sizeof(u32), GFP_NOFS); + ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS); if (!ins_data) return -ENOMEM; @@ -4857,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct btrfs_key key; const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); - struct btrfs_path *dst_path = NULL; + BTRFS_PATH_AUTO_FREE(dst_path); bool dropped_extents = false; u64 truncate_offset = i_size; struct extent_buffer *leaf; @@ -4975,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, start_slot, ins_nr, 1, 0, ctx); out: btrfs_release_path(path); - btrfs_free_path(dst_path); return ret; } @@ -5348,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u64 *other_ino, u64 *other_parent) { int ret; - struct btrfs_path *search_path; + BTRFS_PATH_AUTO_FREE(search_path); char *name = NULL; u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); @@ -5433,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } ret = 0; out: - btrfs_free_path(search_path); kfree(name); return ret; } @@ -6161,8 +6507,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, if (!first) return 0; - ins_data = kmalloc(max_batch_size * sizeof(u32) + - max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) return -ENOMEM; ins_sizes = (u32 
*)ins_data; @@ -6816,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); @@ -6832,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (true) { struct extent_buffer *leaf = path->nodes[0]; @@ -6844,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -6903,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, * at both parents and the old parent B would still * exist. */ - if (IS_ERR(dir_inode)) { - ret = PTR_ERR(dir_inode); - goto out; - } + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); if (!need_log_inode(trans, dir_inode)) { btrfs_add_delayed_iput(dir_inode); @@ -6919,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = log_new_dir_dentries(trans, dir_inode, ctx); btrfs_add_delayed_iput(dir_inode); if (ret) - goto out; + return ret; path->slots[0]++; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int log_new_ancestors(struct btrfs_trans_handle *trans, @@ -7037,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 ino = btrfs_ino(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret; @@ -7058,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) path->slots[0]++; @@ -7070,8 +7410,8 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; - else if (ret > 0) + return ret; + if (ret > 0) break; continue; } @@ -7088,10 +7428,8 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, * this loop, etc). So just return some error to fall back to * a transaction commit. 
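* (-EMLINK is the arbitrary error chosen below for this purpose)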
*/ - if (found_key.type == BTRFS_INODE_EXTREF_KEY) { - ret = -EMLINK; - goto out; - } + if (found_key.type == BTRFS_INODE_EXTREF_KEY) + return -EMLINK; /* * Logging ancestors needs to do more searches on the fs/subvol @@ -7103,14 +7441,11 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, ret = log_new_ancestors(trans, root, path, ctx); if (ret) - goto out; + return ret; btrfs_release_path(path); goto again; } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -7290,10 +7625,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } wc.trans = trans; - wc.pin = 1; + wc.pin = true; + wc.log = log_root_tree; - ret = walk_log_tree(trans, log_root_tree, &wc); - if (ret) { + ret = walk_log_tree(&wc); + wc.log = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7304,12 +7641,11 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) key.offset = (u64)-1; while (1) { - struct btrfs_root *log; struct btrfs_key found_key; ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7324,20 +7660,19 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_tree_root(log_root_tree, &found_key); - if (IS_ERR(log)) { - ret = PTR_ERR(log); + wc.log = btrfs_read_tree_root(log_root_tree, &found_key); + if (IS_ERR(wc.log)) { + ret = PTR_ERR(wc.log); + wc.log = NULL; btrfs_abort_transaction(trans, ret); goto error; } - wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, - true); - if (IS_ERR(wc.replay_dest)) { - ret = PTR_ERR(wc.replay_dest); - wc.replay_dest = NULL; - if (ret != -ENOENT) { - btrfs_put_root(log); + wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true); + if (IS_ERR(wc.root)) { + ret = PTR_ERR(wc.root); + wc.root = NULL; + if (unlikely(ret != -ENOENT)) { btrfs_abort_transaction(trans, ret); goto error; } @@ -7353,33 +7688,34 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * block from being modified, and we'll just bail for * each subsequent pass. */ - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - if (ret) { - btrfs_put_root(log); + ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error; } goto next; } - wc.replay_dest->log_root = log; - ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) { + wc.root->log_root = wc.log; + ret = btrfs_record_root_in_trans(trans, wc.root); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } - ret = walk_log_tree(trans, log, &wc); - if (ret) { + ret = walk_log_tree(&wc); + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } if (wc.stage == LOG_WALK_REPLAY_ALL) { - struct btrfs_root *root = wc.replay_dest; + struct btrfs_root *root = wc.root; - ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - if (ret) { + wc.subvol_path = path; + ret = fixup_inode_link_counts(&wc); + wc.subvol_path = NULL; + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } @@ -7392,17 +7728,18 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) * could only happen during mount. 
*/ ret = btrfs_init_root_free_objectid(root); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto next; } } next: - if (wc.replay_dest) { - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); + if (wc.root) { + wc.root->log_root = NULL; + btrfs_put_root(wc.root); } - btrfs_put_root(log); + btrfs_put_root(wc.log); + wc.log = NULL; if (ret) goto error; @@ -7414,7 +7751,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) /* step one is to pin it all, step two is to replay just inodes */ if (wc.pin) { - wc.pin = 0; + wc.pin = false; wc.process_func = replay_one_buffer; wc.stage = LOG_WALK_REPLAY_INODES; goto again; @@ -7432,14 +7769,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (ret) return ret; - log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - btrfs_put_root(log_root_tree); return 0; error: if (wc.trans) btrfs_end_transaction(wc.trans); + btrfs_put_root(wc.log); clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487e1..46bd8ca5867085 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode) inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = del_orphan(trans, inode); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) if (ret < 0) return ret; - if (item.reserved[0] != 0 || item.reserved[1] != 0) + if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0)) return -EUCLEAN; true_size = btrfs_stack_verity_descriptor_size(&item); - if (true_size > INT_MAX) + if (unlikely(true_size > INT_MAX)) return -EUCLEAN; if (buf_size == 0) @@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations btrfs_verityops = { + .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) - + (int)offsetof(struct btrfs_inode, vfs_inode), .begin_enable_verity = btrfs_begin_enable_verity, .end_enable_verity = btrfs_end_enable_verity, .get_verity_descriptor = btrfs_get_verity_descriptor, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a046190..2bec544d8ba300 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, } /* - * Make sure the last byte of label is properly NUL termiated. We use - * '%s' to print the label, if not properly NUL termiated we can access + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access * beyond the label. 
*/ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) @@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - if (ret == 0) { + if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); ret = -EUCLEAN; @@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } ret = btrfs_rm_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { /* Any error in dev item removal is critical */ btrfs_crit(fs_info, "failed to remove device item for devid %llu: %d", @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); @@ -2838,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } } ret = btrfs_add_dev_item(trans, device); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } if (seeding_dev) { ret = btrfs_finish_sprout(trans); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } @@ -3044,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - else if (ret > 0) { /* Logic error or corruption */ + else if (unlikely(ret > 0)) { /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); @@ -3053,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_del_item(trans, root, path); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); goto out; @@ -3278,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); - if (ret) { + if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; @@ -3348,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_space_info *space_info; space_info = btrfs_find_space_info(fs_info, sys_flags); - if (!space_info) { + if (unlikely(!space_info)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3362,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = remove_chunk_item(trans, map, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } - } else if (ret) { + } else if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3381,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, 
ret); goto out; } @@ -3397,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) btrfs_trans_release_chunk_metadata(trans); ret = btrfs_remove_block_group(trans, map); - if (ret) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3522,7 +3527,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } - if (ret == 0) { + if (unlikely(ret == 0)) { /* * On the first search we would find chunk tree with * offset -1, which is not possible. On subsequent @@ -4264,7 +4269,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ -static int alloc_profile_is_valid(u64 flags, int extended) +static int alloc_profile_is_valid(u64 flags, bool extended) { u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -4458,7 +4463,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) } /* - * Should be called with balance mutexe held + * Should be called with balance mutex held */ int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, @@ -5036,7 +5041,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); btrfs_trans_release_chunk_metadata(trans); - if (ret < 0) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } else { @@ -5696,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) { + if (unlikely(!chunk)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); goto out; @@ -7481,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) /* * Lockdep complains about possible circular locking dependency between * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores - * used for freeze procection of a fs (struct super_block.s_writers), + * used for freeze protection of a fs (struct super_block.s_writers), * which we take when starting a transaction, and extent buffers of the * chunk tree if we call read_one_dev() while holding a lock on an * extent buffer of the chunk tree. 
Since we are mounting the filesystem @@ -7914,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags) return btrfs_raid_array[index].ncopies; } - - static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) @@ -7929,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, int i; map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); - if (!map) { + if (unlikely(!map)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7938,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } stripe_len = btrfs_calc_stripe_length(map); - if (physical_len != stripe_len) { + if (unlikely(physical_len != stripe_len)) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", physical_offset, devid, map->start, physical_len, @@ -7958,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, devid, physical_offset, physical_len); for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].dev->devid == devid && - map->stripes[i].physical == physical_offset) { + if (unlikely(map->stripes[i].dev->devid == devid && + map->stripes[i].physical == physical_offset)) { found = true; if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, @@ -7972,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, break; } } - if (!found) { + if (unlikely(!found)) { btrfs_err(fs_info, "dev extent physical offset %llu devid %llu has no corresponding chunk", physical_offset, devid); @@ -7981,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!dev) { + if (unlikely(!dev)) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - if (physical_offset + physical_len > dev->disk_total_bytes) { + if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", devid, physical_offset, physical_len, @@ -7999,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size; - if (!IS_ALIGNED(physical_offset, zone_size) || - !IS_ALIGNED(physical_len, zone_size)) { + if (unlikely(!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size))) { btrfs_err(fs_info, "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", devid, physical_offset, physical_len); @@ -8024,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) struct btrfs_chunk_map *map; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - if (map->num_stripes != map->verified_stripes) { + if (unlikely(map->num_stripes != map->verified_stripes)) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", map->start, map->verified_stripes, map->num_stripes); @@ -8084,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (ret < 0) goto out; /* No dev extents at all? 
Not good */ - if (ret > 0) { + if (unlikely(ret > 0)) { ret = -EUCLEAN; goto out; } @@ -8109,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) physical_len = btrfs_dev_extent_length(leaf, dext); /* Check if this dev extent overlaps with the previous one */ - if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a56e873a30295e..2cbf8080eade06 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -34,7 +34,7 @@ struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) /* - * Arbitratry maximum size of one discard request to limit potentially long time + * Arbitrary maximum size of one discard request to limit potentially long time * spent in blkdev_issue_discard(). */ #define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) @@ -495,7 +495,7 @@ struct btrfs_discard_stripe { }; /* - * Context for IO subsmission for device stripe. + * Context for IO submission for device stripe. * * - Track the unfinished mirrors for mirror based profiles * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 5292cd341f70f2..6caba8be7c845c 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -34,11 +34,9 @@ struct workspace { int level; }; -static struct workspace_manager wsm; - -struct list_head *zlib_get_workspace(unsigned int level) +struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { - struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); + struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zlib_alloc_workspace(unsigned int level) +/* + * For s390 hardware acceleration, the buffer size should be at least + * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance. + * + * But if the block size is larger than the page size, we can have folios + * large enough for the s390 hardware to handle directly. + */ +static bool need_special_buffer(struct btrfs_fs_info *fs_info) +{ + if (!zlib_deflate_dfltcc_enabled()) + return false; + if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE) + return false; + return true; +} + +struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; int workspacesize; @@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; - /* - * In case of s390 zlib hardware support, allocate lager workspace - * buffer. If allocator fails, fall back to a single page buffer.
- */ - if (zlib_deflate_dfltcc_enabled()) { + if (need_special_buffer(fs_info)) { workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | GFP_NOIO); workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; } if (!workspace->buf) { - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - workspace->buf_size = PAGE_SIZE; + workspace->buf = kmalloc(blocksize, GFP_KERNEL); + workspace->buf_size = blocksize; } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -133,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, +int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; char *cfolio_out; @@ -146,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, struct folio *out_folio = NULL; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const unsigned long max_out = nr_dest_folios << min_folio_shift; + const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; *out_folios = 0; @@ -155,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflateInit(&workspace->strm, workspace->level); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zlib compression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; @@ -167,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; } @@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; workspace->strm.next_out = cfolio_out; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; while (workspace->strm.total_in < len) { /* * unsigned int bytes_left = len - workspace->strm.total_in; unsigned int copy_length = min(bytes_left, workspace->buf_size); /* - * This can only happen when hardware zlib compression is - * enabled. + * For s390 hardware accelerated zlib, if our folio is smaller + * than copy_length, we need to fill the buffer so that + * we can take full advantage of hardware acceleration.
*/ - if (copy_length > PAGE_SIZE) { + if (need_special_buffer(fs_info)) { ret = copy_data_into_buffer(mapping, workspace, start, copy_length); if (ret < 0) @@ -225,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zlib compression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -237,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, } /* we're making it bigger, give up */ - if (workspace->strm.total_in > 8192 && + if (workspace->strm.total_in > blocksize * 2 && workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; @@ -252,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -260,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } /* we're all done */ @@ -278,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = zlib_deflate(&workspace->strm, Z_FINISH); if (ret == Z_STREAM_END) break; - if (ret != Z_OK && ret != Z_BUF_ERROR) { + if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; @@ -288,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -296,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, cfolio_out = folio_address(out_folio); folios[nr_folios] = out_folio; nr_folios++; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = min_folio_size; workspace->strm.next_out = cfolio_out; } } @@ -322,20 +335,22 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; struct folio **folios_in = cb->compressed_folios; data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; workspace->strm.total_out = 0; @@ -396,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in 
= data_in; tmp = srclen - workspace->strm.total_in; - workspace->strm.avail_in = min(tmp, PAGE_SIZE); + workspace->strm.avail_in = min(tmp, min_folio_size); } } if (unlikely(ret != Z_STREAM_END)) { @@ -484,8 +499,7 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_zlib_compress = { - .workspace_manager = &wsm, +const struct btrfs_compress_levels btrfs_zlib_compress = { .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ea662036f4413f..e00036672f338d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -274,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return ret; } *nr_zones = ret; - if (!ret) + if (unlikely(!ret)) return -EIO; /* Populate cache */ @@ -315,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; /* No dev extents at all? Not good */ - if (ret > 0) + if (unlikely(ret > 0)) return -EUCLEAN; } @@ -503,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; } - if (nreported != zone_info->nr_zones) { + if (unlikely(nreported != zone_info->nr_zones)) { btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", rcu_dereference(device->name), nreported, @@ -513,7 +513,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (max_active_zones) { - if (nactive > max_active_zones) { + if (unlikely(nactive > max_active_zones)) { + if (bdev_max_active_zones(bdev) == 0) { + max_active_zones = 0; + zone_info->max_active_zones = 0; + goto validate; + } btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), @@ -526,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } +validate: /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -544,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (ret) goto out; - if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) { btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); @@ -562,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); - if (ret != -ENOENT && ret) { + if (unlikely(ret != -ENOENT && ret)) { btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); @@ -895,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zones); if (ret < 0) return ret; - if (ret != BTRFS_NR_SB_LOG_ZONES) + if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES)) return -EIO; return sb_log_location(bdev, zones, rw, bytenr_ret); @@ -1247,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, root = btrfs_extent_root(fs_info, key.objectid); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); /* We should not find the exact match */ - if (!ret) + if (unlikely(!ret)) ret = -EUCLEAN; if (ret < 0) return ret; @@ -1268,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, else length = 
fs_info->nodesize; - if (!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + cache->length)) { + if (unlikely(!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; @@ -1351,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) { btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", zone.start << SECTOR_SHIFT, rcu_dereference(device->name), @@ -1393,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, struct zone_info *info, unsigned long *active) { - if (info->alloc_offset == WP_MISSING_DEV) { + if (unlikely(info->alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", info->physical); @@ -1422,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); - if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } - if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); @@ -1441,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, if (zone_info[1].alloc_offset == WP_CONVENTIONAL) zone_info[1].alloc_offset = last_alloc; - if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else if (test_bit(0, active)) { set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); @@ -1483,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (zone_info[i].alloc_offset == WP_CONVENTIONAL) zone_info[i].alloc_offset = last_alloc; - if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && - !btrfs_test_opt(fs_info, DEGRADED)) { + if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED))) { btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in %s profile", btrfs_bg_type_to_raid_name(map->type)); return -EIO; } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_test_opt(fs_info, DEGRADED) && - !btrfs_zone_activate(bg)) { + if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg))) { return -EIO; } } else { @@ -1548,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if (unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1580,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, continue; if (test_bit(0, active) != test_bit(i, active)) { - if (!btrfs_zone_activate(bg)) + if 
(unlikely(!btrfs_zone_activate(bg))) return -EIO; } else { if (test_bit(0, active)) @@ -1637,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return 0; /* Sanity check */ - if (!IS_ALIGNED(length, fs_info->zone_size)) { + if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) { btrfs_err(fs_info, "zoned: block group %llu len %llu unaligned to zone size %llu", logical, length, fs_info->zone_size); @@ -1750,7 +1756,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EINVAL; } - if (cache->alloc_offset > cache->zone_capacity) { + if (unlikely(cache->alloc_offset > cache->zone_capacity)) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", cache->alloc_offset, cache->zone_capacity, @@ -2081,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc, NULL, NULL); - if (ret || !bioc || mapped_length < PAGE_SIZE) { + if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) { ret = -EIO; goto out_put_bioc; } @@ -2139,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, if (physical_pos == wp) return 0; - if (physical_pos > wp) + if (unlikely(physical_pos > wp)) return -EUCLEAN; length = wp - physical_pos; @@ -2458,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) return ret; } -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) - return; + return 0; block_group = btrfs_lookup_block_group(fs_info, logical); - ASSERT(block_group); + if (WARN_ON_ONCE(!block_group)) + return -ENOENT; /* No MIXED_BG on zoned btrfs. */ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) @@ -2484,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len out: btrfs_put_block_group(block_group); + return 0; } static void btrfs_zone_finish_endio_workfn(struct work_struct *work) { + int ret; struct btrfs_block_group *bg = container_of(work, struct btrfs_block_group, zone_finish_work); wait_on_extent_buffer_writeback(bg->last_eb); free_extent_buffer(bg->last_eb); - btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + ret = do_zone_finish(bg, true); + if (ret) + btrfs_handle_fs_error(bg->fs_info, ret, + "Failed to finish block-group's zone"); btrfs_put_block_group(bg); } @@ -2515,7 +2527,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); - queue_work(system_unbound_wq, &bg->zone_finish_work); + queue_work(system_dfl_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2582,9 +2594,9 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); space_info->total_bytes -= bg->length; space_info->disk_total -= bg->length * factor; + space_info->disk_total -= bg->zone_unusable; /* There is no allocation ever happened. */ ASSERT(bg->used == 0); - ASSERT(bg->zone_unusable == 0); /* No super block in a block group on the zoned setup. 
*/ ASSERT(bg->bytes_super == 0); spin_unlock(&space_info->lock); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6e11533b8e14c2..17c5656580dd97 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); -void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, +int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); @@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, return true; } -static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) { } +static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return 0; +} static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb) { } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index ff0292615e1f37..c9cddcfa337b91 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -77,7 +77,6 @@ struct workspace { */ struct zstd_workspace_manager { - const struct btrfs_compress_op *ops; spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; @@ -86,8 +85,6 @@ struct zstd_workspace_manager { struct timer_list timer; }; -static struct zstd_workspace_manager wsm; - static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) @@ -112,19 +109,19 @@ static inline int clip_level(int level) */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { + struct zstd_workspace_manager *zwsm = + container_of(timer, struct zstd_workspace_manager, timer); unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - ASSERT(timer == &wsm.timer); - - spin_lock(&wsm.lock); + spin_lock(&zwsm->lock); - if (list_empty(&wsm.lru_list)) { - spin_unlock(&wsm.lock); + if (list_empty(&zwsm->lru_list)) { + spin_unlock(&zwsm->lock); return; } - list_for_each_prev_safe(pos, next, &wsm.lru_list) { + list_for_each_prev_safe(pos, next, &zwsm->lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); int level; @@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level])) - clear_bit(level, &wsm.active_map); + if (list_empty(&zwsm->idle_ws[level])) + clear_bit(level, &zwsm->active_map); } - if (!list_empty(&wsm.lru_list)) - mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + if (!list_empty(&zwsm->lru_list)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock(&wsm.lock); + spin_unlock(&zwsm->lock); } /* @@ -182,49 +179,56 @@ static void zstd_calc_ws_mem_sizes(void) } } -void zstd_init_workspace_manager(void) +int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm; struct list_head *ws; - int i; + ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL); + zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL); + if (!zwsm) + return -ENOMEM; zstd_calc_ws_mem_sizes(); + spin_lock_init(&zwsm->lock); + 
init_waitqueue_head(&zwsm->wait); + timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0); - wsm.ops = &btrfs_zstd_compress; - spin_lock_init(&wsm.lock); - init_waitqueue_head(&wsm.wait); - timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); - - INIT_LIST_HEAD(&wsm.lru_list); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&wsm.idle_ws[i]); + INIT_LIST_HEAD(&zwsm->lru_list); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&zwsm->idle_ws[i]); + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm; - ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { - set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); - list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map); + list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); } + return 0; } -void zstd_cleanup_workspace_manager(void) +void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace; - int i; - spin_lock_bh(&wsm.lock); - for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&wsm.idle_ws[i])) { - workspace = container_of(wsm.idle_ws[i].next, + if (!zwsm) + return; + fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL; + spin_lock_bh(&zwsm->lock); + for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&zwsm->idle_ws[i])) { + workspace = container_of(zwsm->idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); zstd_free_workspace(&workspace->list); } } - spin_unlock_bh(&wsm.lock); - - timer_delete_sync(&wsm.timer); + spin_unlock_bh(&zwsm->lock); + timer_delete_sync(&zwsm->timer); + kfree(zwsm); } /* @@ -239,29 +243,31 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(int level) +static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; struct workspace *workspace; int i = clip_level(level); - spin_lock_bh(&wsm.lock); - for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { - if (!list_empty(&wsm.idle_ws[i])) { - ws = wsm.idle_ws[i].next; + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); + for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&zwsm->idle_ws[i])) { + ws = zwsm->idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); - if (list_empty(&wsm.idle_ws[i])) - clear_bit(i, &wsm.active_map); - spin_unlock_bh(&wsm.lock); + if (list_empty(&zwsm->idle_ws[i])) + clear_bit(i, &zwsm->active_map); + spin_unlock_bh(&zwsm->lock); return ws; } } - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); return NULL; } @@ -276,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
*/ -struct list_head *zstd_get_workspace(int level) +struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct list_head *ws; unsigned int nofs_flag; + ASSERT(zwsm); + /* level == 0 means we can use any workspace */ if (!level) level = 1; again: - ws = zstd_find_workspace(level); + ws = zstd_find_workspace(fs_info, level); if (ws) return ws; nofs_flag = memalloc_nofs_save(); - ws = zstd_alloc_workspace(level); + ws = zstd_alloc_workspace(fs_info, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { DEFINE_WAIT(wait); - prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); - finish_wait(&wsm.wait, &wait); + finish_wait(&zwsm->wait, &wait); goto again; } @@ -318,34 +327,36 @@ struct list_head *zstd_get_workspace(int level) * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ -void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws) { + struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD]; struct workspace *workspace = list_to_workspace(ws); - spin_lock_bh(&wsm.lock); + ASSERT(zwsm); + spin_lock_bh(&zwsm->lock); /* A node is only taken off the lru if we are the corresponding level */ if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ - if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; - list_add(&workspace->lru_list, &wsm.lru_list); - if (!timer_pending(&wsm.timer)) - mod_timer(&wsm.timer, + list_add(&workspace->lru_list, &zwsm->lru_list); + if (!timer_pending(&zwsm->timer)) + mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } - set_bit(workspace->level, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level]); + set_bit(workspace->level, &zwsm->active_map); + list_add(&workspace->list, &zwsm->idle_ws[workspace->level]); workspace->req_level = 0; - spin_unlock_bh(&wsm.lock); + spin_unlock_bh(&zwsm->lock); if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) - cond_wake_up(&wsm.wait); + cond_wake_up(&zwsm->wait); } void zstd_free_workspace(struct list_head *ws) @@ -357,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(int level) +struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { + const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); @@ -371,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level) workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = kmalloc(blocksize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; @@ -384,11 +396,13 @@ struct list_head *zstd_alloc_workspace(int level) return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, +int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { + 
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; zstd_cstream *stream; int ret = 0; int nr_folios = 0; @@ -399,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; - unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u32 blocksize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + unsigned long max_out = nr_dest_folios * min_folio_size; unsigned int cur_len; workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -411,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); @@ -431,7 +445,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -439,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); while (1) { size_t ret2; @@ -447,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_warn(inode->root->fs_info, + btrfs_warn(fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -459,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, } /* Check to see if we are making it bigger */ - if (tot_in + workspace->in_buf.pos > 8192 && + if (tot_in + workspace->in_buf.pos > blocksize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; @@ -475,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -489,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, - PAGE_SIZE); + 
workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } /* We've reached the end of the input */ @@ -522,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { - struct btrfs_inode *inode = BTRFS_I(mapping->host); - - btrfs_err(inode->root->fs_info, + btrfs_err(fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), @@ -542,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, goto out; } - tot_out += PAGE_SIZE; - max_out -= PAGE_SIZE; + tot_out += min_folio_size; + max_out -= min_folio_size; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(); + out_folio = btrfs_alloc_compr_folio(fs_info); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -556,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); } if (tot_out >= tot_in) { @@ -578,13 +587,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; + const u32 blocksize = fs_info->sectorsize; + const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; unsigned long total_out = 0; @@ -602,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; + workspace->out_buf.size = blocksize; while (1) { size_t ret2; @@ -642,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; - if (folio_in_index >= total_folios_in) { + if (unlikely(folio_in_index >= total_folios_in)) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } - srclen -= PAGE_SIZE; + srclen -= min_folio_size; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } } ret = 0; @@ -718,9 +730,7 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, return ret; } -const struct btrfs_compress_op btrfs_zstd_compress = { - /* ZSTD uses own workspace manager */ - .workspace_manager = NULL, +const struct btrfs_compress_levels btrfs_zstd_compress = { 
.min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 91dfd02318772f..d1edb2ac38376c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -387,10 +387,9 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = dir, .old_dentry = rep, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = cache->graveyard, .new_dentry = grave, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b202d789e9350..322ed268f14aa9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 0, gfp_flags); if (IS_ERR(pages[index])) { - if (PTR_ERR(pages[index]) == -EINVAL) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); } @@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, BUG_ON(ceph_wbc->locked_pages == 0); pages[index] = NULL; - return PTR_ERR(pages[index]); + return err; } } else { pages[index] = &folio->page; @@ -1687,6 +1689,7 @@ static int ceph_writepages_start(struct address_space *mapping, process_folio_batch: rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); if (rc) goto release_folios; @@ -1695,8 +1698,6 @@ static int ceph_writepages_start(struct address_space *mapping, goto release_folios; if (ceph_wbc.processed_in_fbatch) { - ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (folio_batch_count(&ceph_wbc.fbatch) == 0 && ceph_wbc.locked_pages < ceph_wbc.max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab7226192073f..7026e794813ca1 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdd404fc81124d..f3fe786b4143d4 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen = 0; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? 
path : ""); spin_unlock(&req->r_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8478e7e75df66c..32973c62c1a230 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); /* mark error on parent + clear complete */ mapping_set_error(req->r_parent->i_mapping, result); @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_old_inode->i_mapping, result); pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? 
"<>" : path, result); + ceph_mdsc_free_path_info(&path_info); } out: iput(req->r_old_inode); @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) int err = -EROFS; int op; char *path; - int pathlen; - u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c02f100f8552bb..978acd3d4b329c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; int mask = MAY_READ; @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file) if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ @@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_parent->i_mapping, result); if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); pr_warn_client(cl, "async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? 
"<>" : path, result); + ceph_mdsc_free_path_info(&path_info); ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) @@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, int mask; int err; char *path; - int pathlen; - u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; @@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= MAY_WRITE; err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827a9..949f0badc944f5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. 
*/ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + /** * ceph_new_inode - allocate a new inode in advance of an expected create * @dir: parent directory for new inode @@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif @@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; doutc(cl, "%p is_dentry %d is_target %d\n", req, @@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; struct fscrypt_str oname = FSTR_INIT(NULL, 0); - struct ceph_fname fname = { .dir = dir, + struct ceph_fname fname = { .dir = parent_dir, .name = rinfo->dname, .ctext = rinfo->altname, .name_len = rinfo->dname_len, @@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - err = ceph_fname_alloc_buffer(dir, &oname); + err = ceph_fname_alloc_buffer(parent_dir, &oname); if (err < 0) { dput(parent); goto done; @@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); if (err < 0) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); goto done; } dname.name = oname.name; @@ -1595,7 +1650,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) dname.len, dname.name, dn); if (!dn) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } @@ -1610,12 +1665,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) ceph_snap(d_inode(dn)) != tvino.snap)) { doutc(cl, " dn %p points to wrong inode %p\n", dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1794,6 
+1849,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) &dvino, ptvino); } done: + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); doutc(cl, "done err=%d\n", err); return err; } @@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, int truncate_retry = 20; /* The RMW will take around 50ms */ struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; dentry = d_find_alias(inode); if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff8246..73da2648fa0f27 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", @@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built - * @plen: returned length of string - * @pbase: returned base inode number + * @path_info: output path, length, base ino+snap, and freepath ownership flag * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. 
This is mostly called @@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - int *plen, u64 *pbase, int for_wire) + struct ceph_path_info *path_info, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; @@ -2810,16 +2809,28 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - *pbase = base; - *plen = PATH_MAX - 1 - pos; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), - base, *plen, path + pos); + base, PATH_MAX - 1 - pos, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - struct inode *dir, const char **ppath, int *ppathlen, - u64 *pino, bool *pfreepath, bool parent_locked) + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; @@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { - *pino = ceph_ino(dir); + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. + * Override with inode's snap since that's what this function is for. 
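Taken together, the calling convention that replaces the old plen/pbase out-parameters looks roughly like this from a call site (a sketch only; mdsc, dentry and cl stand in for whatever the caller has):

	struct ceph_path_info info = { 0 };
	char *path;

	path = ceph_mdsc_build_path(mdsc, dentry, &info, 0);
	if (!IS_ERR(path))
		doutc(cl, "base %llx snap %llx path %.*s\n", info.vino.ino,
		      info.vino.snap, info.pathlen, info.path);
	/* No-op unless a path was allocated: freepath stays false otherwise. */
	ceph_mdsc_free_path_info(&info);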
+ */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode, */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, - const char *rpath, u64 rino, const char **ppath, - int *pathlen, u64 *ino, bool *freepath, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + r = build_inode_path(rinode, path_info); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, - freepath, parent_locked); - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - doutc(cl, " path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; @@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - bool freepath1 = false, freepath2 = false; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; struct dentry *old_dentry = NULL; int len; u16 releases; @@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, - req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1, - test_bit(CEPH_MDS_R_PARENT_LOCKED, - &req->r_req_flags)); + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. 
+ */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, - req->r_old_dentry_dir, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2, true); + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); - len += pathlen1 + pathlen2; + len += path_info1.pathlen + path_info2.pathlen; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += pathlen1; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += pathlen2; + len += path_info2.pathlen; /* MClientRequest tail */ @@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - ceph_mdsc_free_path((char *)path2, pathlen2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - ceph_mdsc_free_path((char *)path1, pathlen1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; out_err: @@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; - char *path; - int pathlen = 0, err; - u64 pathbase; + struct ceph_path_info path_info = {0}; + int err; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } - } else { - path = NULL; - pathbase = 0; } spin_lock(&ci->i_ceph_lock); @@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, 
void *arg) rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); } if (list_empty(&ci->i_cap_snaps)) { @@ -4706,7 +4744,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -4730,7 +4768,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -4741,17 +4779,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + - pathlen + sizeof(rec.v1)); + path_info.pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); if (!err) recon_state->nr_caps++; return err; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3e2a6fa7c19aab..0428a5eaf28c65 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -static inline void ceph_mdsc_free_path(char *path, int len) +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { - if (!IS_ERR_OR_NULL(path)) - __putname(path - (PATH_MAX - 1 - len)); + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, - struct dentry *dentry, int *plen, u64 *base, + struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c555..db6c2db68f965e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->inode_wq = 
alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; - fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1); if (!fsc->cap_wq) goto fail_inode_wq; @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f8239..25d8bacbcf4408 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -463,6 +463,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d0d..456c4a2efb5329 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc8b..0d9a5d07a75d3e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) /* * Usermode helpers are childen of either - * system_unbound_wq or of kthreadd. So we know that + * system_dfl_wq or of kthreadd. So we know that * we're starting off with a clean file descriptor * table. So we should always be able to use * COREDUMP_PIDFD_NUMBER as our file descriptor value. @@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; + if (write) + return proc_dostring(table, write, buffer, lenp, ppos); + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); error = proc_dostring(table, write, buffer, lenp, ppos); if (error) return error; + if (!check_coredump_socket()) { strscpy(core_pattern, old_core_pattern, retval + 1); return -EINVAL; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f99c..12daa85ed941b7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; - default: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); + break; + default: + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index b5dfb0aa405ab0..464b54610fd346 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -2,10 +2,9 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO - select CRYPTO_HASH - select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 select KEYS help Enable encryption of files and directories. 
This @@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS select CRYPTO_CBC select CRYPTO_CTS select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13eb..5f5599020e94a6 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; @@ -148,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, */ for (i = 0; i < nr_pages; i++) { pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS : - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pages[i]) break; } diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fdef8..07f9cbfe3ea411 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f7a..8e4c213d418bdd 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,7 +11,6 @@ * This has not yet undergone a rigorous security audit. 
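The fscrypt_get_inode_info_raw()/fscrypt_inode_info_addr() helpers used throughout these hunks are not part of this excerpt; judging from the ceph_fscrypt_ops.inode_info_offs arithmetic earlier in the series (two offsetof() expressions), they plausibly resolve the info pointer at a filesystem-chosen signed offset from the VFS inode. Roughly:

	/* Sketch only; the real helper lives in fscrypt.h. */
	static inline struct fscrypt_inode_info **
	sketch_inode_info_addr(const struct inode *inode)
	{
		return (struct fscrypt_inode_info **)
			((char *)inode + inode->i_sb->s_cop->inode_info_offs);
	}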
*/ -#include #include #include #include @@ -94,7 +93,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +137,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +273,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +543,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c50..4e8e82a9ccf9a3 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,10 +11,10 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H +#include #include #include #include -#include #include #define CONST_STRLEN(str) (sizeof(str) - 1) @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { @@ -381,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 *encrypted_len_ret); /* hkdf.c */ -struct fscrypt_hkdf { - struct crypto_shash *hmac_tfm; -}; - -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. 
These values are used as @@ -405,11 +401,9 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \ 8 /* info= */ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT @@ -517,7 +511,7 @@ struct fscrypt_master_key_secret { * ->is_hw_wrapped=false, or by the "software secret" that hardware * derived from this master key if ->is_hw_wrapped=true. */ - struct fscrypt_hkdf hkdf; + struct hmac_sha512_key hkdf; /* * True if this key is a hardware-wrapped key; false if this key is a @@ -696,7 +690,7 @@ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, @@ -732,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb, int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk); +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index b1ef506cd341de..706f56d0076eed 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * * This is used to derive keys from the fscrypt master keys (or from the * "software secrets" which hardware derives from the fscrypt master keys, in * the case that the fscrypt master keys are hardware-wrapped keys). @@ -7,10 +11,6 @@ * Copyright 2019 Google LLC */ -#include -#include -#include - #include "fscrypt_private.h" /* @@ -24,7 +24,6 @@ * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ -#define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* @@ -44,54 +43,24 @@ */ /* - * Compute HKDF-Extract using the given master key as the input keying material, - * and prepare an HMAC transform object keyed by the resulting pseudorandom key. - * - * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many - * times without having to recompute HKDF-Extract each time. + * Compute HKDF-Extract using 'master_key' as the input keying material, and + * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for + * HKDF-Expand many times without having to recompute HKDF-Extract each time. 
*/ -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size) +void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key, + unsigned int master_key_size) { - struct crypto_shash *hmac_tfm; static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; - int err; - - hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK); - if (IS_ERR(hmac_tfm)) { - fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", - PTR_ERR(hmac_tfm)); - return PTR_ERR(hmac_tfm); - } - - if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { - err = -EINVAL; - goto err_free_tfm; - } - - err = hkdf_extract(hmac_tfm, master_key, master_key_size, - default_salt, HKDF_HASHLEN, prk); - if (err) - goto err_free_tfm; - - err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); - if (err) - goto err_free_tfm; - hkdf->hmac_tfm = hmac_tfm; - goto out; - -err_free_tfm: - crypto_free_shash(hmac_tfm); -out: + hmac_sha512_usingrawkey(default_salt, sizeof(default_salt), + master_key, master_key_size, prk); + hmac_sha512_preparekey(hkdf, prk, sizeof(prk)); memzero_explicit(prk, sizeof(prk)); - return err; } /* - * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which - * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' + * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. @@ -100,30 +69,32 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) 
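Written out, the expansion implemented below is the standard RFC 5869 feedback recurrence, with the fscrypt-specific prefix folded into the info string (|| denotes concatenation; the counter is a single byte, so at most 255 blocks of HKDF_HASHLEN = 64 bytes):

	T(0) = <empty>
	T(i) = HMAC-SHA512(PRK, T(i-1) || "fscrypt\0" || context || info || i)
	OKM  = first okmlen bytes of T(1) || T(2) || ...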
*/ -int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen) -{ - SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 *full_info; - int err; - - full_info = kzalloc(infolen + 9, GFP_KERNEL); - if (!full_info) - return -ENOMEM; - desc->tfm = hkdf->hmac_tfm; - - memcpy(full_info, "fscrypt\0", 8); - full_info[8] = context; - memcpy(full_info + 9, info, infolen); - - err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, - okm, okmlen); - kfree_sensitive(full_info); - return err; -} - -void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) +void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) { - crypto_free_shash(hkdf->hmac_tfm); + struct hmac_sha512_ctx ctx; + u8 counter = 1; + u8 tmp[HKDF_HASHLEN]; + + WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN); + + for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) { + hmac_sha512_init(&ctx, hkdf); + if (i != 0) + hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN], + HKDF_HASHLEN); + hmac_sha512_update(&ctx, "fscrypt\0", 8); + hmac_sha512_update(&ctx, &context, 1); + hmac_sha512_update(&ctx, info, infolen); + hmac_sha512_update(&ctx, &counter, 1); + if (okmlen - i < HKDF_HASHLEN) { + hmac_sha512_final(&ctx, tmp); + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + hmac_sha512_final(&ctx, &okm[i]); + } + counter++; + } } diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f765..b97de0d1430fd6 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,13 +199,13 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) - err = fscrypt_derive_dirhash_key(ci, mk); + fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b29..5dee7c498bc8c5 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. 
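As a usage sketch (hypothetical writeback loop, not taken from this series; bdev stands for the target block device): before adding the data unit at next_lblk to an in-flight bio, a filesystem checks mergeability, otherwise it submits and keys a fresh bio:

	if (bio && !fscrypt_mergeable_bio(bio, inode, next_lblk)) {
		submit_bio(bio);
		bio = NULL;
	}
	if (!bio) {
		bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
		fscrypt_set_bio_crypt_ctx(bio, inode, next_lblk, GFP_NOFS);
	}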
*/ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 7557f6a88b8f32..3adbd7167055a9 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -42,7 +42,6 @@ struct fscrypt_keyring { static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { - fscrypt_destroy_hkdf(&secret->hkdf); memzero_explicit(secret, sizeof(*secret)); } @@ -587,21 +586,17 @@ static int add_master_key(struct super_block *sb, keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } - err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); + fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. */ memzero_explicit(kdf_key, kdf_key_size); - if (err) - return err; /* Calculate the key identifier */ - err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, - key_spec->u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - return err; + fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); } return do_add_master_key(sb, secret, key_spec); } @@ -835,24 +830,17 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) memcpy(secret->bytes, test_key, sizeof(test_key)); } -int fscrypt_get_test_dummy_key_identifier( +void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; - int err; fscrypt_get_test_dummy_secret(&secret); - - err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); - if (err) - goto out; - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, - NULL, 0, key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); -out: + fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); + fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, + key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); wipe_master_key_secret(&secret); - return err; } /** diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e66..4bd3918f50e3fa 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -253,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - hkdf_context, hkdf_info, hkdf_infolen, - mode_key, mode->keysize); - if (err) - goto out_unlock; + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, + hkdf_infolen, mode_key, mode->keysize); err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) @@ -278,36 +275,25 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, * as a pair of 64-bit words. 
Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ -static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, - u8 context, const u8 *info, - unsigned int infolen, siphash_key_t *key) +static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) { - int err; - - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, - (u8 *)key, sizeof(*key)); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); - return 0; } -int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, - const struct fscrypt_master_key *mk) +void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, + const struct fscrypt_master_key *mk) { - int err; - - err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - &ci->ci_dirhash_key); - if (err) - return err; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); ci->ci_dirhash_key_initialized = true; - return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, @@ -338,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_derive_siphash_key(mk, - HKDF_CONTEXT_INODE_HASH_KEY, - NULL, 0, &mk->mk_ino_hash_key); - if (err) - goto unlock; + fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); - if (err) - return err; } /* @@ -402,13 +383,10 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_FILE_ENC_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - derived_key, ci->ci_mode->keysize); - if (err) - return err; - + fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_ENC_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } @@ -416,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, return err; /* Derive a secret dirhash key for directories that need it. */ - if (need_dirhash_key) { - err = fscrypt_derive_dirhash_key(ci, mk); - if (err) - return err; - } + if (need_dirhash_key) + fscrypt_derive_dirhash_key(ci, mk); return 0; } @@ -642,15 +617,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. 
*/ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +657,13 @@ fscrypt_setup_encryption_info(struct inode *inode, * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +717,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +728,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +776,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. 
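For context, filesystems invoke this from their inode eviction path; a generic sketch (not taken from this series):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
		fscrypt_put_encryption_info(inode);	/* frees and clears the pointer */
	}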
+ */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c065c..bbb2f5ced98806 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); @@ -826,10 +827,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - err = fscrypt_get_test_dummy_key_identifier( + fscrypt_get_test_dummy_key_identifier( policy->v2.master_key_identifier); - if (err) - goto out; } else { err = -EINVAL; goto out; diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d5148..65cc119396542e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } @@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } +EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c12d649df6a543..661a99a7dfbe26 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -362,7 +362,8 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) } EXPORT_SYMBOL_GPL(debugfs_lookup); -static struct dentry *start_creating(const char *name, struct dentry *parent) +static struct dentry *debugfs_start_creating(const char *name, + struct dentry *parent) { struct dentry *dentry; int error; @@ -428,7 +429,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); + dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) return dentry; @@ -577,7 +578,7 @@ 
EXPORT_SYMBOL_GPL(debugfs_create_file_size); */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -624,7 +625,7 @@ struct dentry *debugfs_create_automount(const char *name, debugfs_automount_t f, void *data) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = debugfs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -687,7 +688,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, if (!link) return ERR_PTR(-ENOMEM); - dentry = start_creating(name, parent); + dentry = debugfs_start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a23fd524a6ee35..a0d75b5c83c632 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -26,6 +26,7 @@ /* * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>) * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>) * /config/dlm/<cluster>/comms/<comm>/local * /config/dlm/<cluster>/comms/<comm>/addr (write only) @@ -267,6 +268,7 @@ enum { enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, + NODE_ATTR_RELEASE_RECOVER, }; struct dlm_clusters { @@ -280,6 +282,8 @@ struct dlm_spaces { struct dlm_space { struct config_group group; struct list_head members; + struct list_head members_gone; + int members_gone_count; struct mutex members_lock; int members_count; struct dlm_nodes *nds; @@ -310,6 +314,14 @@ struct dlm_node { int weight; int new; int comm_seq; /* copy of cm->seq when nd->nodeid is set */ + unsigned int release_recover; +}; + +struct dlm_member_gone { + int nodeid; + unsigned int release_recover; + + struct list_head list; /* space->members_gone */ }; static struct configfs_group_operations clusters_ops = { @@ -480,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); + INIT_LIST_HEAD(&sp->members_gone); mutex_init(&sp->members_lock); sp->members_count = 0; sp->nds = nds; @@ -587,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd = config_item_to_node(i); + struct dlm_member_gone *mb_gone; + + mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL); + if (!mb_gone) + return; mutex_lock(&sp->members_lock); list_del(&nd->list); sp->members_count--; + + mb_gone->nodeid = nd->nodeid; + mb_gone->release_recover = nd->release_recover; + list_add(&mb_gone->list, &sp->members_gone); + sp->members_gone_count++; mutex_unlock(&sp->members_lock); config_item_put(i); @@ -815,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf, return len; } +static ssize_t node_release_recover_show(struct config_item *item, char *buf) +{ + struct dlm_node *n = config_item_to_node(item); + + return sprintf(buf, "%u\n", n->release_recover); +} + +static ssize_t node_release_recover_store(struct config_item *item, + const char *buf, size_t len) +{ + struct dlm_node *n = config_item_to_node(item); + int rc; + + rc = kstrtouint(buf, 0, &n->release_recover); + if (rc) + return rc; + + return len; +} + CONFIGFS_ATTR(node_, nodeid); CONFIGFS_ATTR(node_, weight); +CONFIGFS_ATTR(node_, release_recover); static struct configfs_attribute *node_attrs[] =
{ [NODE_ATTR_NODEID] = &node_attr_nodeid, [NODE_ATTR_WEIGHT] = &node_attr_weight, + [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover, NULL, }; @@ -882,9 +927,10 @@ static void put_comm(struct dlm_comm *cm) int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out) { + struct dlm_member_gone *mb_gone, *mb_safe; + struct dlm_config_node *nodes, *node; struct dlm_space *sp; struct dlm_node *nd; - struct dlm_config_node *nodes, *node; int rv, count; sp = get_space(lsname); @@ -898,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, goto out; } - count = sp->members_count; + count = sp->members_count + sp->members_gone_count; nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); if (!nodes) { @@ -917,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, nd->new = 0; } + /* we delay the removal of nodes until here as configfs does + * not support additional attributes for rmdir(). + */ + list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) { + node->nodeid = mb_gone->nodeid; + node->release_recover = mb_gone->release_recover; + node->gone = true; + node++; + + list_del(&mb_gone->list); + sp->members_gone_count--; + kfree(mb_gone); + } + *count_out = count; *nodes_out = nodes; rv = 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 13a3d0b2619425..4ebd45f752762c 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -17,8 +17,10 @@ struct dlm_config_node { int nodeid; int weight; + bool gone; int new; uint32_t comm_seq; + unsigned int release_recover; }; extern const struct rhashtable_params dlm_rhash_rsb_params; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6dd3a524cd3529..be938fdf17d967 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) { /* We may need to adjust grmode depending on other granted locks. */ - log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x", + log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid); rsb_set_flag(r, RSB_RECOVER_CONVERT); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 1929327ffbe1cf..ddaa765587068f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = { static struct kset *dlm_kset; -static int do_uevent(struct dlm_ls *ls, int in) +static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover) { - if (in) + char message[512] = {}; + char *envp[] = { message, NULL }; + + if (in) { kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); - else - kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); + } else { + snprintf(message, 511, "RELEASE_RECOVER=%u", release_recover); + kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp); + } log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); @@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster, current lockspace members are (via configfs) and then tells the lockspace to start running (via sysfs) in dlm_ls_start().
*/ - error = do_uevent(ls, 1); + error = do_uevent(ls, 1, 0); if (error < 0) goto out_recoverd; @@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster, return 0; out_members: - do_uevent(ls, 0); + do_uevent(ls, 0, 0); dlm_clear_members(ls); kfree(ls->ls_node_array); out_recoverd: @@ -671,19 +676,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int force) +static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option) { struct dlm_lkb *lkb; unsigned long id; int rv = 0; read_lock_bh(&ls->ls_lkbxa_lock); - if (force == 0) { + if (release_option == DLM_RELEASE_NO_LOCKS) { xa_for_each(&ls->ls_lkbxa, id, lkb) { rv = 1; break; } - } else if (force == 1) { + } else if (release_option == DLM_RELEASE_UNUSED) { + /* TODO: handle this UNUSED option as NO_LOCKS in later patch */ xa_for_each(&ls->ls_lkbxa, id, lkb) { if (lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV) { @@ -698,11 +704,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force) return rv; } -static int release_lockspace(struct dlm_ls *ls, int force) +static int release_lockspace(struct dlm_ls *ls, unsigned int release_option) { int busy, rv; - busy = lockspace_busy(ls, force); + busy = lockspace_busy(ls, release_option); spin_lock_bh(&lslist_lock); if (ls->ls_create_count == 1) { @@ -730,8 +736,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force < 3 && dlm_user_daemon_available()) - do_uevent(ls, 0); + if (release_option != DLM_RELEASE_NO_EVENT && + dlm_user_daemon_available()) + do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER)); dlm_recoverd_stop(ls); @@ -782,25 +789,24 @@ static int release_lockspace(struct dlm_ls *ls, int force) * lockspace must continue to function as usual, participating in recoveries, * until this returns. * - * Force has 4 possible values: - * 0 - don't destroy lockspace if it has any LKBs - * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs - * 2 - destroy lockspace regardless of LKBs - * 3 - destroy lockspace as part of a forced shutdown + * See DLM_RELEASE defines for release_option values and their meaning. 
*/ -int dlm_release_lockspace(void *lockspace, int force) +int dlm_release_lockspace(void *lockspace, unsigned int release_option) { struct dlm_ls *ls; int error; + if (release_option > __DLM_RELEASE_MAX) + return -EINVAL; + ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; dlm_put_lockspace(ls); mutex_lock(&ls_lock); - error = release_lockspace(ls, force); + error = release_lockspace(ls, release_option); if (!error) ls_count--; if (!ls_count) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e4373bce1bc239..9a0b6c2b6b01e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1703,7 +1703,7 @@ static int work_start(void) return -ENOMEM; } - process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0); + process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0); if (!process_workqueue) { log_print("can't start dlm_process"); destroy_workqueue(io_workqueue); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 4887c8a05318dd..a44d16da7187c8 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -52,7 +52,7 @@ static int __init init_dlm(void) if (error) goto out_user; - dlm_wq = alloc_workqueue("dlm_wq", 0, 0); + dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0); if (!dlm_wq) { error = -ENOMEM; goto out_plock; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b0864c93230f53..c0f557a80a7542 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls) ls->ls_ops->recover_prep(ls->ls_ops_arg); } -static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb, + unsigned int release_recover) { struct dlm_slot slot; uint32_t seq; @@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) error = dlm_comm_seq(memb->nodeid, &seq, false); - if (!error && seq == memb->comm_seq) + if (!release_recover && !error && seq == memb->comm_seq) return; slot.nodeid = memb->nodeid; @@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; struct dlm_config_node *node; int i, error, neg = 0, low = -1; + unsigned int release_recover; /* previously removed members that we've not finished removing need to * count as a negative change so the "neg" recovery steps will happen @@ -569,11 +571,21 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { node = find_config_node(rv, memb->nodeid); - if (node && !node->new) + if (!node) { + log_error(ls, "remove member %d invalid", + memb->nodeid); + return -EFAULT; + } + + if (!node->new && !node->gone) continue; - if (!node) { - log_rinfo(ls, "remove member %d", memb->nodeid); + release_recover = 0; + + if (node->gone) { + release_recover = node->release_recover; + log_rinfo(ls, "remove member %d%s", memb->nodeid, + release_recover ? 
" (release_recover)" : ""); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -584,13 +596,16 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_move(&memb->list, &ls->ls_nodes_gone); remove_remote_member(memb->nodeid); ls->ls_num_nodes--; - dlm_lsop_recover_slot(ls, memb); + dlm_lsop_recover_slot(ls, memb, release_recover); } /* add new members to ls_nodes */ for (i = 0; i < rv->nodes_count; i++) { node = &rv->nodes[i]; + if (node->gone) + continue; + if (dlm_is_member(ls, node->nodeid)) continue; error = dlm_add_member(ls, node); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index be4240f09abd42..3ac020fb8139ea 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r) */ if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) || ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) { - log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", + log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", __func__, lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid, other_lkid, other_grmode); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5cb3896be8260f..51daf4acbe318b 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) dlm_put_lockspace(ls); if (error) - dlm_release_lockspace(lockspace, 0); + dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; @@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; - int error, force = 0; + int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) - force = 2; + force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab883..abd954c6a14e95 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_lock; } - rd.old_mnt_idmap = &nop_mnt_idmap; + rd.mnt_idmap = &nop_mnt_idmap; rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; - rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 4bb4002e3cdf04..1f4d8ce5666700 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3b1ba571c7286b..8ca29962a3ddef 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -475,6 +475,10 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + 
.compat_ioctl = erofs_compat_ioctl, +#endif .mmap_prepare = erofs_file_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index debf469ad6bd56..32b4f5aa60c986 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -123,4 +123,8 @@ const struct file_operations erofs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = erofs_readdir, + .unlocked_ioctl = erofs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = erofs_compat_ioctl, +#endif }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 377ee12b8b9667..3d5738f80072ee 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,10 +12,12 @@ /* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 -#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 9a2f5972152257..cb780c095d282a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" +#include #include static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -213,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_fop = &generic_ro_fops; - else - inode->i_fop = &erofs_file_fops; + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -341,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } +static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + int ret; + + if (!sbi->volume_name) + ret = clear_user(arg, 1); + else + ret = copy_to_user(arg, sbi->volume_name, + strlen(sbi->volume_name)); + return ret ? 
-EFAULT : 0; +} + +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case FS_IOC_GETFSLABEL: + return erofs_ioctl_get_volume_label(inode, argp); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ccc5f0ee8dfb9..f7f622836198da 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -153,6 +153,7 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; + char *volume_name; u32 feature_compat; u32 feature_incompat; @@ -234,6 +235,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { @@ -535,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif +long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long erofs_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1b529ace4db0ee..f3f8d8c066e4e6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,6 +343,13 @@ static int erofs_read_superblock(struct super_block *sb) sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); + if (dsb->volume_name[0]) { + sbi->volume_name = kstrndup(dsb->volume_name, + sizeof(dsb->volume_name), GFP_KERNEL); + if (!sbi->volume_name) + return -ENOMEM; + } + /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -822,6 +829,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) kfree(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); + kfree(sbi->volume_name); kfree(sbi); } @@ -1018,10 +1026,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index eaa9efd766eea7..396536d9a86216 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; @@ -490,9 +491,15 @@ int 
erofs_xattr_prefixes_init(struct super_block *sb) if (!pfs) return -ENOMEM; - if (sbi->packed_inode) - buf.mapping = sbi->packed_inode->i_mapping; - else + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d73297003d25a..bc80cfe482f73b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -823,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) } rcu_read_unlock(); } - } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; } if (pcl) { @@ -1835,7 +1832,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); - if (err) + if (err || !(map->m_flags & EROFS_MAP_ENCODED)) return; /* expand ra for the trailing edge if readahead */ @@ -1847,7 +1844,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); - if (!map->m_llen) + if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen) return; } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a93efd95c5556d..e5581dbeb4c2bc 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode, .map = map, .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int endoff, afmt; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && @@ -462,8 +462,8 @@ static int z_erofs_map_blocks_fo(struct inode *inode, map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map->m_plen); + erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu", + vi->nid); err = -EFSCORRUPTED; goto unmap_out; } @@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? - Z_EROFS_COMPRESSION_INTERLACED : - Z_EROFS_COMPRESSION_SHIFTED; + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = vi->z_algorithmtype[1]; } else { - afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; - if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", - afmt, vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } + map->m_algorithmformat = vi->z_algorithmtype[0]; } - map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; struct z_erofs_map_header *h; + erofs_off_t pos; + int err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* @@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; - err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; @@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) vi->z_idata_size = le16_to_cpu(h->h_idata_size); - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_unlock; - } - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -726,6 +711,30 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) return err; } +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { @@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, else err = z_erofs_map_blocks_fo(inode, map, flags); } - if (!err && (map->m_flags & EROFS_MAP_ENCODED) && - unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || - map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) - err = -EOPNOTSUPP; + if (!err) + err = z_erofs_map_sanity_check(inode, map); if (err) map->m_llen = 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b22d6f819f782d..ee7c4b683ec3d2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -46,10 +46,10 @@ * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (rwlock) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. 
- * We need a rwlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -195,7 +195,7 @@ struct eventpoll { struct list_head rdllist; /* Lock which protects rdllist and ovflist */ - rwlock_t lock; + spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) * in a lockless way. */ lockdep_assert_irqs_enabled(); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, @@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep, { struct epitem *epi, *nepi; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep, wake_up(&ep->wq); } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) @@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) rb_erase_cached(&epi->rbn, &ep->rbr); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; mutex_init(&ep->mtx); - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1239,100 +1239,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_KCMP */ -/* - * Adds a new entry to the tail of the list in a lockless way, i.e. - * multiple CPUs are allowed to call this function concurrently. - * - * Beware: it is necessary to prevent any other modifications of the - * existing list until all changes are completed, in other words - * concurrent list_add_tail_lockless() calls should be protected - * with a read lock, where write lock acts as a barrier which - * makes sure all list_add_tail_lockless() calls are fully - * completed. - * - * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail or to the head, otherwise - * concurrent access will corrupt the list. - * - * Return: %false if element has been already added to the list, %true - * otherwise. - */ -static inline bool list_add_tail_lockless(struct list_head *new, - struct list_head *head) -{ - struct list_head *prev; - - /* - * This is simple 'new->next = head' operation, but cmpxchg() - * is used in order to detect that same element has been just - * added to the list from another CPU: the winner observes - * new->next == new. - */ - if (!try_cmpxchg(&new->next, &new, head)) - return false; - - /* - * Initially ->next of a new element must be updated with the head - * (we are inserting to the tail) and only then pointers are atomically - * exchanged. 
XCHG guarantees memory ordering, thus ->next should be - * updated before pointers are actually swapped and pointers are - * swapped before prev->next is updated. - */ - - prev = xchg(&head->prev, new); - - /* - * It is safe to modify prev->next and new->prev, because a new element - * is added only to the tail and new->next is updated before XCHG. - */ - - prev->next = new; - new->prev = prev; - - return true; -} - -/* - * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, - * i.e. multiple CPUs are allowed to call this function concurrently. - * - * Return: %false if epi element has been already chained, %true otherwise. - */ -static inline bool chain_epi_lockless(struct epitem *epi) -{ - struct eventpoll *ep = epi->ep; - - /* Fast preliminary check */ - if (epi->next != EP_UNACTIVE_PTR) - return false; - - /* Check that the same epi has not been just chained from another CPU */ - if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) - return false; - - /* Atomically exchange tail */ - epi->next = xchg(&ep->ovflist, epi); - - return true; -} - /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. - * - * This callback takes a read lock in order not to contend with concurrent - * events from another file descriptor, thus all modifications to ->rdllist - * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_start/done_scan(), which stops all list modifications and guarantees - * that lists state is seen correctly. - * - * Another thing worth to mention is that ep_poll_callback() can be called - * concurrently for the same @epi from different CPUs if poll table was inited - * with several wait queues entries. Plural wakeup from different CPUs of a - * single wait queue is serialized by wq.lock, but the case when multiple wait - * queues are used should be detected accordingly. This is detected using - * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; int ewake = 0; - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (chain_epi_lockless(epi)) + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); + } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. 
*/ - if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) - ep_pm_stay_awake_rcu(epi); + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); } /* @@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, } /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) @@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, init_wait(&wait); wait.func = ep_autoremove_wake_function; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it @@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || @@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = 1; if (!list_empty_careful(&wait.entry)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its @@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } } } diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a149..4a89918b761f6d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -599,7 +599,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long ret; + int ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) + if (!error && !write) validate_coredump_safety(); return error; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 0a056d97e64024..cf0a0970c09562 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb) } const struct fscrypt_operations 
ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3ef..6cb784a56b3bab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1182,6 +1182,14 @@ struct ext4_inode_info { __u32 i_csum_seed; kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif }; /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b290a..ba4fd9aba1c14d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5898d92ba19f14..8b18802e83ebd2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) - queue_work(system_unbound_wq, &sbi->s_discard_work); + queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 699c15db28a82f..7f2d4014d1287f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -1470,6 +1470,12 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce9796..b0acb0c5031379 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .begin_enable_verity = ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548ce2..6e465bbc85ee5c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -907,6 +907,12 @@ struct f2fs_inode_info { unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git 
a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c298..2619cbbd7d2d33 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -480,6 +480,12 @@ static void init_once(void *foo) struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } #ifdef CONFIG_QUOTA @@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode) if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ @@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - atomic_dec(&inode->i_count); + iput(inode); } trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); @@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, @@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; -#endif +#endif /* CONFIG_FS_ENCRYPTION */ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09eb..f0ab9a3c7a82b3 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .begin_enable_verity = f2fs_begin_enable_verity, .end_enable_verity = f2fs_end_enable_verity, .get_verity_descriptor = f2fs_get_verity_descriptor, diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d5742299..72f8433d910988 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint) } } -static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, return 0; } -static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: - err = fcntl_get_rw_hint(filp, cmd, arg); + err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: - err = fcntl_set_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); break; default: break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58fe..052f9c9368fbb1 
100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } @@ -207,6 +213,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry) if (!ctx->flags) return 1; + /* + * Verify that the decoded dentry itself has a valid id mapping. + * In case the decoded dentry is the mountfd root itself, this + * verifies that the mountfd inode itself has a valid id mapping. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry))) + return 0; + /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able diff --git a/fs/file.c b/fs/file.c index 6d2275c3be9c69..28743b742e3cf6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a07b8cf73ae271..2b35e80037feed 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode, * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. - * The transfer preserves @inode->dirtied_when ordering. If the @inode - * was clean, it means it was on the b_attached list, so move it onto - * the b_attached list of @new_wb. + * If the @inode was clean, it means it was on the b_attached list, so + * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { - struct inode *pos; - - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; + /* + * We need to keep b_dirty list sorted by + * dirtied_time_when. However properly sorting the + * inode in the list gets too expensive when switching + * many inodes. So just attach inode at the end of the + * dirty list and clobber the dirtied_time_when. 
+ */ + inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, - pos->i_io_list.prev); + &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } @@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode, return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) */ down_read(&bdi->wb_switch_rwsem); + inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ +relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); @@ -520,10 +521,17 @@ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } - for (inodep = isw->inodes; *inodep; inodep++) { + while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; + inodep++; + if (*inodep && need_resched()) { + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + cond_resched(); + goto relock; + } } spin_unlock(&new_wb->list_lock); @@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab our reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible.
+ */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. 
*/ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + trace_inode_switch_wbs_queue(wb, new_wb, nr); + wb_queue_isw(new_wb, isw); return restart; } @@ -1123,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, dirty = dirty * 10 / 8; /* issue the writeback work */ - work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + work = kzalloc(sizeof(*work), GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; @@ -1180,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb) static int __init cgroup_writeback_init(void) { - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0); if (!isw_wq) return -ENOMEM; return 0; @@ -1767,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); @@ -2442,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) - mod_delayed_work(system_wq, &dirtytime_work, 0); + mod_delayed_work(system_percpu_wq, &dirtytime_work, 0); return ret; } diff --git a/fs/fsopen.c b/fs/fsopen.c index 1aaf4cb2afb29e..f645c99204eb06 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -18,50 +18,56 @@ #include "internal.h" #include "mount.h" +static inline const char *fetch_message_locked(struct fc_log *log, size_t len, + bool *need_free) +{ + const char *p; + int index; + + if (unlikely(log->head == log->tail)) + return ERR_PTR(-ENODATA); + + index = log->tail & (ARRAY_SIZE(log->buffer) - 1); + p = log->buffer[index]; + if (unlikely(strlen(p) > len)) + return ERR_PTR(-EMSGSIZE); + + log->buffer[index] = NULL; + *need_free = log->need_free & (1 << index); + log->need_free &= ~(1 << index); + log->tail++; + + return p; +} + /* * Allow the user to read back any error, warning or informational messages. + * Only one message is returned for each read(2) call. 
*/ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; - struct fc_log *log = fc->log.log; - unsigned int logsize = ARRAY_SIZE(log->buffer); - ssize_t ret; - char *p; + ssize_t err; + const char *p __free(kfree) = NULL, *message; bool need_free; - int index, n; + int n; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) - return ret; - - if (log->head == log->tail) { - mutex_unlock(&fc->uapi_mutex); - return -ENODATA; - } - - index = log->tail & (logsize - 1); - p = log->buffer[index]; - need_free = log->need_free & (1 << index); - log->buffer[index] = NULL; - log->need_free &= ~(1 << index); - log->tail++; + err = mutex_lock_interruptible(&fc->uapi_mutex); + if (err < 0) + return err; + message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); + if (IS_ERR(message)) + return PTR_ERR(message); - ret = -EMSGSIZE; - n = strlen(p); - if (n > len) - goto err_free; - ret = -EFAULT; - if (copy_to_user(_buf, p, n) != 0) - goto err_free; - ret = n; - -err_free: if (need_free) - kfree(p); - return ret; + p = message; + + n = strlen(message); + if (copy_to_user(_buf, message, n)) + return -EFAULT; + return n; } static int fscontext_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049f9..66a1ba8c56b5e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work) goto abort_conn; out: - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); return; @@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num) { + while (num && ap->num_folios < num_pages) { struct folio *folio; unsigned int folio_offset; unsigned int nr_bytes; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2d817d7cab2649..5c569c3cb53f3d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = inode->i_sb->s_blocksize_bits; + blkbits = fc->blkbits; stat->blksize = 1 << blkbits; } @@ -1377,6 +1377,7 @@ static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + stat->blksize = 1 << fi->cached_i_blkbits; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5525a4520b0f89..4adcf09d4b01a6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = len, + .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), .flags = flags }; struct fuse_write_out outarg; @@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } + if (!err && outarg.size > len) + err = -EIO; + if (err) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ec248d13c8bfd9..cc428d04be3e14 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -210,6 +210,12 @@ struct fuse_inode { /** Reference to backing file in passthrough mode */ struct 
fuse_backing *fb; #endif + + /* + * The underlying inode->i_blkbits value will not be modified, + * so preserve the blocksize specified by the server. + */ + u8 cached_i_blkbits; }; /** FUSE inode state bits */ @@ -969,6 +975,14 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; + + /* + * This is a workaround until fuse uses iomap for reads. + * For fuseblk servers, this represents the blocksize passed in at + * mount time and for regular fuse servers, this is equivalent to + * inode->i_blkbits. + */ + u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67c2318bfc4294..7485a41af892ef 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); + else + fi->cached_i_blkbits = fc->blkbits; + /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the @@ -1204,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1268,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) { fc->timeout.req_timeout = secs_to_jiffies(timeout); INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); } @@ -1805,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; + /* + * This is a workaround until fuse hooks into iomap for reads. + * Use PAGE_SIZE for the blocksize; otherwise, if the writeback cache + * is enabled, buffered writes go through iomap and a read may + * overwrite partially written data if blocksize < PAGE_SIZE. + */ + fc->blkbits = sb->s_blocksize_bits; + if (ctx->blksize != PAGE_SIZE && + !sb_set_blocksize(sb, PAGE_SIZE)) + goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; + fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 607ef735ad4ab3..eb97ac009e75d9 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (!file) goto out; + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f580..76c8fd0bfc75d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = fs->window_phys_addr + offset; + *pfn = PHYS_PFN(fs->window_phys_addr + offset); return nr_pages > max_nr_pages ?
max_nr_pages : nr_pages; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 72d95185a39f61..bc67fa058c8459 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret; if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; @@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) locks_lock_file_wait(file, fl); return -EIO; } - if (cmd == F_CANCELLK) - return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (lock_is_unlock(fl)) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + down_read(&ls->ls_sem); + ret = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + if (cmd == F_CANCELLK) + ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) + ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (lock_is_unlock(fl)) + ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); + } + up_read(&ls->ls_sem); + return ret; } static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b6fd1cb17de7ba..b677c0e6b9ab30 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -481,11 +481,9 @@ int gfs2_instantiate(struct gfs2_holder *gh) /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock - * - * Returns true on success (i.e., progress was made or there are no waiters). */ -static bool do_promote(struct gfs2_glock *gl) +static void do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; @@ -496,13 +494,10 @@ static bool do_promote(struct gfs2_glock *gl) if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this - * holder for some reason. If this holder is at the - * head of the list, it means we have a blocked holder - * at the head, so return false. + * holder for some reason. */ - if (list_is_first(&gh->gh_list, &gl->gl_holders)) - return false; - do_error(gl, 0); + if (current_gh) + do_error(gl, 0); /* Fail queued try locks */ break; } set_bit(HIF_HOLDER, &gh->gh_iflags); @@ -511,7 +506,6 @@ static bool do_promote(struct gfs2_glock *gl) if (!current_gh) current_gh = gh; } - return true; } /** @@ -646,8 +640,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) } /* Fast path - we got what we asked for */ - if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_demote_wake(gl); + } if (gl->gl_state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; @@ -693,54 +689,33 @@ __acquires(&gl->gl_lockref.lock) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; - lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); - if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && - glops->go_inval) { - /* - * If another process is already doing the invalidate, let that - * finish first. The glock state machine will get back to this - * holder again later. - */ - if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, - &gl->gl_flags)) - return; - do_error(gl, 0); /* Fail queued try locks */ - } - gl->gl_req = target; - set_bit(GLF_BLOCKING, &gl->gl_flags); - if ((gl->gl_req == LM_ST_UNLOCKED) || - (gl->gl_state == LM_ST_EXCLUSIVE) || - (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) - clear_bit(GLF_BLOCKING, &gl->gl_flags); - if (!glops->go_inval && !glops->go_sync) + if (!glops->go_inval || !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) { - ret = glops->go_sync(gl); - /* If we had a problem syncing (due to io errors or whatever, - * we should not invalidate the metadata or tell dlm to - * release the glock to other nodes. - */ - if (ret) { - if (cmpxchg(&sdp->sd_log_error, 0, ret)) { - fs_err(sdp, "Error %d syncing glock \n", ret); - gfs2_dump_glock(NULL, gl, true); - } - spin_lock(&gl->gl_lockref.lock); - goto skip_inval; + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to io errors or whatever), + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. + */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock\n", ret); + gfs2_dump_glock(NULL, gl, true); } + spin_lock(&gl->gl_lockref.lock); + goto skip_inval; } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + + if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to @@ -755,12 +730,10 @@ __acquires(&gl->gl_lockref.lock) gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); skip_inval: - gl->gl_lockref.count++; /* * Check for an error encountered since we called go_sync and go_inval. * If so, we can't withdraw from the glock code because the withdraw @@ -803,38 +776,41 @@ __acquires(&gl->gl_lockref.lock) if (!test_bit(GLF_CANCELING, &gl->gl_flags)) clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; - } else { - clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } if (ls->ls_ops->lm_lock) { set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); - ret = ls->ls_ops->lm_lock(gl, target, lck_flags); + ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0); spin_lock(&gl->gl_lockref.lock); - if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && - target == LM_ST_UNLOCKED && - test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + if (!ret) { + /* The operation will be completed asynchronously.
*/ + gl->gl_lockref.count++; + return; + } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); + + if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ - } else if (ret) { - fs_err(sdp, "lm_lock ret %d\n", ret); - target = gl->gl_state | LM_OUT_ERROR; } else { - /* The operation will be completed asynchronously. */ + fs_err(sdp, "lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp)); return; } - clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ finish_xmote(gl, target); + gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } @@ -855,11 +831,20 @@ __acquires(&gl->gl_lockref.lock) return; set_bit(GLF_LOCK, &gl->gl_flags); - /* While a demote is in progress, the GLF_LOCK flag must be set. */ + /* + * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during + * locking operations. We have just started a locking operation by + * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must + * be cleared. + */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); - if (test_bit(GLF_DEMOTE, &gl->gl_flags) && - gl->gl_demote_state != gl->gl_state) { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + if (gl->gl_demote_state == gl->gl_state) { + gfs2_demote_wake(gl); + goto promote; + } + if (find_first_holder(gl)) goto out_unlock; if (nonblock) @@ -869,31 +854,31 @@ __acquires(&gl->gl_lockref.lock) gl->gl_target = gl->gl_demote_state; do_xmote(gl, NULL, gl->gl_target); return; - } else { - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) - gfs2_demote_wake(gl); - if (do_promote(gl)) - goto out_unlock; - gh = find_first_waiter(gl); - if (!gh) - goto out_unlock; - gl->gl_target = gh->gh_state; - if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) - do_error(gl, 0); /* Fail queued try locks */ - do_xmote(gl, gh, gl->gl_target); - return; } +promote: + do_promote(gl); + if (find_first_holder(gl)) + goto out_unlock; + gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; + if (nonblock) + goto out_sched; + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; + out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_atomic(); } /** @@ -1462,6 +1447,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
va_end(args); } +static bool gfs2_should_queue_trylock(struct gfs2_glock *gl, + struct gfs2_holder *gh) +{ + struct gfs2_holder *current_gh, *gh2; + + current_gh = find_first_holder(gl); + if (current_gh && !may_grant(gl, current_gh, gh)) + return false; + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + return false; + } + return true; +} + static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) @@ -1480,27 +1483,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) */ static inline void add_to_queue(struct gfs2_holder *gh) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh2; - int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *current_gh; - - current_gh = find_first_holder(gl); - try_futile = !may_grant(gl, current_gh, gh); - } - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) - goto fail; + if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !gfs2_should_queue_trylock(gl, gh)) { + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { @@ -1512,15 +1508,6 @@ __acquires(&gl->gl_lockref.lock) continue; goto trap_recursive; } - list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (try_futile && - !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { -fail: - gh->gh_error = GLR_TRYFAILED; - gfs2_holder_wake(gh); - return; - } - } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -2321,8 +2308,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) - *p++ = 'i'; if (test_bit(GLF_PENDING_REPLY, gflags)) *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9339a3bff6eeb1..d041b922b45e3b 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,6 +68,10 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * + * In addition, when a lock is already held in EX mode locally, a SHARED or + * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted. + * (The LM_FLAG_ANY flag is only used for SHARED mode requests currently.) + * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node.
In other words, * the glock is held in EX mode according to DLM, but local holders on the diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d4ad82f47eeea4..5a0ea416cfdae9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -319,7 +319,6 @@ enum { GLF_DEMOTE_IN_PROGRESS = 5, GLF_DIRTY = 6, GLF_LFLUSH = 7, - GLF_INVALIDATE_IN_PROGRESS = 8, GLF_HAVE_REPLY = 9, GLF_INITIAL = 10, GLF_HAVE_FROZEN_REPLY = 11, @@ -376,7 +375,6 @@ struct gfs2_glock { enum { GIF_QD_LOCKED = 1, GIF_SW_PAGED = 3, - GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, }; @@ -658,6 +656,8 @@ struct lm_lockstruct { struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ char *ls_lvb_bits; + struct rw_semaphore ls_sem; + spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ uint32_t ls_recover_mount; /* gen in first recover_done cb */ @@ -823,7 +823,6 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ - wait_queue_head_t sd_withdraw_wait; unsigned int sd_log_tail; unsigned int sd_log_flush_tail; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d870..4f00af7dd256b6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, /** * gfs2_update_reply_times - Update locking statistics * @gl: The glock to update + * @blocking: The operation may have been blocking * * This assumes that gl->gl_dstamp has been set earlier. * @@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, * TRY_1CB flags are set are classified as non-blocking. All * other DLM requests are counted as (potentially) blocking. */ -static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +static inline void gfs2_update_reply_times(struct gfs2_glock *gl, + bool blocking) { struct gfs2_pcpu_lkstats *lks; const unsigned gltype = gl->gl_name.ln_type; - unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? - GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT; s64 rtt; preempt_disable(); @@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; + bool blocking; unsigned ret; + blocking = test_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_update_reply_times(gl, blocking); + clear_bit(GLF_BLOCKING, &gl->gl_flags); + /* If the glock is dead, we only react to a dlm_unlock() reply. */ if (__lockref_is_dead(&gl->gl_lockref) && gl->gl_lksb.sb_status != -DLM_EUNLOCK) return; - gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) @@ -157,14 +162,6 @@ static void gdlm_ast(void *arg) } ret = gl->gl_req; - if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { - if (gl->gl_req == LM_ST_SHARED) - ret = LM_ST_DEFERRED; - else if (gl->gl_req == LM_ST_DEFERRED) - ret = LM_ST_SHARED; - else - BUG(); - } /* * The GLF_INITIAL flag is initially set for new glocks. 
Upon the @@ -241,7 +238,7 @@ static bool down_conversion(int cur, int req) } static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, - const int cur, const int req) + const int req, bool blocking) { u32 lkf = 0; @@ -256,15 +253,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, lkf |= DLM_LKF_NOQUEUEBAST; } - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - else - BUG(); - } - if (!test_bit(GLF_INITIAL, &gl->gl_flags)) { lkf |= DLM_LKF_CONVERT; @@ -274,7 +262,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, * "upward" lock conversions or else DLM will reject the * request as invalid. */ - if (!down_conversion(cur, req)) + if (blocking) lkf |= DLM_LKF_QUECVT; } @@ -294,14 +282,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + bool blocking; int cur, req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; int error; + gl->gl_req = req_state; cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state); req = make_mode(gl->gl_name.ln_sbd, req_state); - lkf = make_flags(gl, flags, cur, req); + blocking = !down_conversion(cur, req) && + !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)); + lkf = make_flags(gl, flags, req, blocking); + if (blocking) + set_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (test_bit(GLF_INITIAL, &gl->gl_flags)) { @@ -318,8 +312,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, */ again: - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; @@ -341,17 +340,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } - clear_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); - /* don't want to call dlm if we've unmounted the lock protocol */ - if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { - gfs2_glock_free(gl); - return; - } - /* * When the lockspace is released, all remaining glocks will be * unlocked automatically. 
This is more efficient than unlocking them @@ -369,13 +361,23 @@ static void gdlm_put_lock(struct gfs2_glock *gl) flags |= DLM_LKF_VALBLK; again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, - NULL, gl); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, + NULL, gl); + } + up_read(&ls->ls_sem); if (error == -EBUSY) { msleep(20); goto again; } + if (error == -ENODEV) { + gfs2_glock_free(gl); + return; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, @@ -386,7 +388,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) static void gdlm_cancel(struct gfs2_glock *gl) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + + down_read(&ls->ls_sem); + if (likely(ls->ls_dlm != NULL)) { + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); + } + up_read(&ls->ls_sem); } /* @@ -567,7 +574,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x error %d\n", name, lksb->sb_lkid, error); @@ -594,9 +605,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, memset(strname, 0, GDLM_STRNAME_BYTES); snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); - error = dlm_lock(ls->ls_dlm, mode, lksb, flags, - strname, GDLM_STRNAME_BYTES - 1, - 0, sync_wait_cb, ls, NULL); + down_read(&ls->ls_sem); + error = -ENODEV; + if (likely(ls->ls_dlm != NULL)) { + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + } + up_read(&ls->ls_sem); if (error) { fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", name, lksb->sb_lkid, flags, mode, error); @@ -1323,6 +1339,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) */ INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + ls->ls_dlm = NULL; spin_lock_init(&ls->ls_recover_spin); ls->ls_recover_flags = 0; ls->ls_recover_mount = 0; @@ -1357,6 +1374,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) * create/join lockspace */ + init_rwsem(&ls->ls_sem); error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, &gdlm_lockspace_ops, sdp, &ops_result, &ls->ls_dlm); @@ -1400,7 +1418,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) return 0; fail_release: - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); fail_free: free_recover_size(ls); fail: @@ -1436,10 +1454,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: + down_write(&ls->ls_sem); if (ls->ls_dlm) { - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); ls->ls_dlm = NULL; } + up_write(&ls->ls_sem); free_recover_size(ls); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0727f60ad02883..9d65719353faad 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + 
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, + 0); if (!gfs2_recovery_wq) goto fail_wq1; @@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0); if (!gfs2_freeze_wq) goto fail_wq3; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b73255137..aa15183f9a168e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) error = -ENOMEM; sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0, + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU, + 0, sdp->sd_fsname); if (!sdp->sd_glock_wq) goto fail_iput; sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0, + sdp->sd_fsname); if (!sdp->sd_delete_wq) goto fail_glock_wq; @@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); continue; } - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b6c..644b2d1e72769e 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 26036ffc3f338e..1c2507a273180b 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -52,7 +52,6 @@ {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ - {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 24864a66074b2a..56412f63f3bb9b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -309,7 +309,7 @@ void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...) 
va_end(args); } -int gfs2_withdraw(struct gfs2_sbd *sdp) +void gfs2_withdraw(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; @@ -322,7 +322,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, TASK_UNINTERRUPTIBLE); - return -1; + return; } new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG); } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new))); @@ -350,8 +350,6 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); - - return -1; } /* @@ -473,46 +471,36 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, /* * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, char *file, - unsigned int line) +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, + unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (bad magic number), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? -1 : -2; + gfs2_withdraw(sdp); } /* * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw - * Returns: -1 if this call withdrew the machine, - * -2 if it was already withdrawn */ -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, const char *function, - char *file, unsigned int line) +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) { - int me; - gfs2_lm(sdp, "fatal: invalid metadata block - " "bh = %llu (type: exp=%u, found=%u), " "function = %s, file = %s, line = %u\n", (unsigned long long)bh->b_blocknr, type, t, function, file, line); - me = gfs2_withdraw(sdp); - return (me) ? 
-1 : -2; + gfs2_withdraw(sdp); } /* @@ -521,14 +509,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, * 0 if it was already withdrawn */ -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, - unsigned int line) +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) { gfs2_lm(sdp, "fatal: I/O error - " "function = %s, file = %s, line = %u\n", function, file, line); - return gfs2_withdraw(sdp); + gfs2_withdraw(sdp); } /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 27d03b64102418..da0373b1e82b9e 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -91,9 +91,9 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__) -int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, - char *file, unsigned int line); +void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, + char *file, unsigned int line); static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct buffer_head *bh) @@ -108,10 +108,10 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, return 0; } -int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, - u16 type, u16 t, - const char *function, - char *file, unsigned int line); +void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -122,12 +122,16 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); u16 t = be32_to_cpu(mh->mh_type); - if (unlikely(magic != GFS2_MAGIC)) - return gfs2_meta_check_ii(sdp, bh, function, - file, line); - if (unlikely(t != type)) - return gfs2_metatype_check_ii(sdp, bh, type, t, function, - file, line); + if (unlikely(magic != GFS2_MAGIC)) { + gfs2_meta_check_ii(sdp, bh, function, + file, line); + return -EIO; + } + if (unlikely(t != type)) { + gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return -EIO; + } return 0; } @@ -144,8 +148,8 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, } -int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, - char *file, unsigned int line); +void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); @@ -228,6 +232,6 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) __printf(2, 3) void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...); -int gfs2_withdraw(struct gfs2_sbd *sdp); +void gfs2_withdraw(struct gfs2_sbd *sdp); #endif /* __UTIL_DOT_H__ */ diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 34e9804e0f3601..c2f840c49e60b5 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -21,12 +21,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: @@ -48,7 +48,7 @@ void 
hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -115,6 +115,12 @@ int hfs_brec_find(struct hfs_find_data *fd) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 28307bc9ec1ee1..5e84833a4743ad 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg("start %u, count %u\n", start, count); /* are all of the bits in range? */ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index e8cd1a31f2470c..fcfffe75d84e70 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, { struct page *src_page, *dst_page; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg_cont(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - hfs_dbg_cont(BNODE_MOD, " (%d,%d", - tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg(" (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, 
node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 896396554bcc17..e49a141c87e517 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -94,7 +94,7 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -179,6 +179,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) struct hfs_btree *tree; struct hfs_bnode *node, *parent; int end_off, rec_off, data_off, size; + int src, dst, len; tree = fd->tree; node = fd->bnode; @@ -191,7 +192,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -208,10 +209,14 @@ int hfs_brec_remove(struct hfs_find_data *fd) } hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); - if (rec_off == end_off) - goto skip; size = fd->keylength + fd->entrylength; + if (rec_off == end_off) { + src = fd->keyoffset; + hfs_bnode_clear(node, src, size); + goto skip; + } + do { data_off = hfs_bnode_read_u16(node, rec_off); hfs_bnode_write_u16(node, rec_off + 2, data_off - size); @@ -219,9 +224,23 @@ int hfs_brec_remove(struct hfs_find_data *fd) } while (rec_off >= end_off); /* fill hole */ - hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, - data_off - fd->keyoffset - size); + dst = fd->keyoffset; + src = fd->keyoffset + size; + len = data_off - src; + + hfs_bnode_move(node, dst, src, len); + + src = dst + len; + len = data_off - src; + + hfs_bnode_clear(node, src, len); + skip: + /* + * Remove the obsolete offset to free space. 
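+	 * The removed record's slot in the offsets table at the end of + * the node is stale now, so zero it out below.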
+ */ + hfs_bnode_write_u16(node, end_off, 0); + hfs_bnode_dump(node); if (!fd->record) hfs_brec_update_parent(fd); @@ -242,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d, new %d, next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -378,7 +397,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index e86e1e235658fa..22e62fe7448bf8 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d63880e7d9d672..caebabb6642f16 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; @@ -211,6 +211,124 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, return hfs_brec_find(fd); } +static inline +void hfs_set_next_unused_CNID(struct super_block *sb, + u32 deleted_cnid, u32 found_cnid) +{ + if (found_cnid < HFS_FIRSTUSER_CNID) { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, HFS_FIRSTUSER_CNID); + } else { + atomic64_cmpxchg(&HFS_SB(sb)->next_id, + deleted_cnid + 1, found_cnid + 1); + } +} + +/* + * hfs_correct_next_unused_CNID() + * + * Correct the next unused CNID of Catalog Tree. 
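+ * + * Scan the catalog leaf nodes backwards, starting from leaf_tail, and + * set next_id based on the first directory or file record found.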
+ */ +static +int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) +{ + struct hfs_btree *cat_tree; + struct hfs_bnode *node; + s64 leaf_head; + s64 leaf_tail; + s64 node_id; + + hfs_dbg("cnid %u, next_id %lld\n", + cnid, atomic64_read(&HFS_SB(sb)->next_id)); + + if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { + /* next ID should be unchanged */ + return 0; + } + + cat_tree = HFS_SB(sb)->cat_tree; + leaf_head = cat_tree->leaf_head; + leaf_tail = cat_tree->leaf_tail; + + if (leaf_head > leaf_tail) { + pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n", + leaf_head, leaf_tail); + return -ERANGE; + } + + node = hfs_bnode_find(cat_tree, leaf_tail); + if (IS_ERR(node)) { + pr_err("fail to find leaf node: node ID %lld\n", + leaf_tail); + return -ENOENT; + } + + node_id = leaf_tail; + + do { + int i; + + if (node_id != leaf_tail) { + node = hfs_bnode_find(cat_tree, node_id); + if (IS_ERR(node)) + return -ENOENT; + } + + hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", + node_id, leaf_tail, leaf_head); + + hfs_bnode_dump(node); + + for (i = node->num_recs - 1; i >= 0; i--) { + hfs_cat_rec rec; + u16 off, len, keylen; + int entryoffset; + int entrylength; + u32 found_cnid; + + len = hfs_brec_lenoff(node, i, &off); + keylen = hfs_brec_keylen(node, i); + if (keylen == 0) { + pr_err("fail to get the keylen: " + "node_id %lld, record index %d\n", + node_id, i); + return -EINVAL; + } + + entryoffset = off + keylen; + entrylength = len - keylen; + + if (entrylength > sizeof(rec)) { + pr_err("unexpected record length: " + "entrylength %d\n", + entrylength); + return -EINVAL; + } + + hfs_bnode_read(node, &rec, entryoffset, entrylength); + + if (rec.type == HFS_CDR_DIR) { + found_cnid = be32_to_cpu(rec.dir.DirID); + hfs_dbg("found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } else if (rec.type == HFS_CDR_FIL) { + found_cnid = be32_to_cpu(rec.file.FlNum); + hfs_dbg("found_cnid %u\n", found_cnid); + hfs_set_next_unused_CNID(sb, cnid, found_cnid); + hfs_bnode_put(node); + return 0; + } + } + + hfs_bnode_put(node); + + node_id = node->prev; + } while (node_id >= leaf_head); + + return -ENOENT; +} /* * hfs_cat_delete() @@ -225,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) struct hfs_readdir_data *rd; int res, type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) @@ -271,6 +389,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); + + res = hfs_correct_next_unused_CNID(sb, cnid); + if (res) + goto out; + res = 0; out: hfs_find_exit(&fd); @@ -294,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 580c62981dbd3d..a097908b269d0a 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent: "); for (i = 0; i < 3; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" block %u, count %u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -456,7 +457,7 @@ int hfs_extend_file(struct inode *inode) return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; @@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 7c5a7ecfa2465a..fff149af89da31 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,12 +9,6 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -24,35 +18,10 @@ #include #include +#include #include "hfs.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 - -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) 
\ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - - /* * struct hfs_inode_info * @@ -112,13 +81,13 @@ struct hfs_sb_info { the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ - u32 file_count; /* The number of + atomic64_t file_count; /* The number of regular files in the filesystem */ - u32 folder_count; /* The number of + atomic64_t folder_count; /* The number of directories in the filesystem */ - u32 next_id; /* The next available + atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bf4cb7e78396bd..9cd449913dc82a 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -183,6 +183,10 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); + s64 next_id; + s64 file_count; + s64 folder_count; + if (!inode) return NULL; @@ -190,7 +194,9 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); - inode->i_ino = HFS_SB(sb)->next_id++; + next_id = atomic64_inc_return(&HFS_SB(sb)->next_id); + BUG_ON(next_id > U32_MAX); + inode->i_ino = (u32)next_id; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -202,7 +208,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; - HFS_SB(sb)->folder_count++; + folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count); + BUG_ON(folder_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_dirs++; inode->i_op = &hfs_dir_inode_operations; @@ -211,7 +218,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; } else if (S_ISREG(mode)) { HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; - HFS_SB(sb)->file_count++; + file_count = atomic64_inc_return(&HFS_SB(sb)->file_count); + BUG_ON(file_count > U32_MAX); if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_files++; inode->i_op = &hfs_file_inode_operations; @@ -241,16 +249,19 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { - HFS_SB(sb)->folder_count--; + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return; } - HFS_SB(sb)->file_count--; + + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + atomic64_dec(&HFS_SB(sb)->file_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_files--; if (S_ISREG(inode->i_mode)) { @@ -425,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 
8082eb01127cdf..53f3fae6021797 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -150,11 +150,11 @@ int hfs_mdb_get(struct super_block *sb) /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); - HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID); + atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); - HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt); - HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt); + atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); + atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; @@ -172,7 +172,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("continuing without an alternate MDB\n"); } - HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); + HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; @@ -273,11 +273,17 @@ void hfs_mdb_commit(struct super_block *sb) /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); - mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id); + BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX); + mdb->drNxtCNID = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); - mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count); - mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count); + BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX); + mdb->drFilCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); + BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); + mdb->drDirCnt = + cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 388a318297ece2..47f50fa555a457 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -319,6 +319,10 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; int res; + atomic64_set(&sbi->file_count, 0); + atomic64_set(&sbi->folder_count, 0); + atomic64_set(&sbi->next_id, 0); + /* load_nls_default does not fail */ if (sbi->nls_disk && !sbi->nls_io) sbi->nls_io = load_nls_default(); diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index eeebe80c6be4aa..ba26980cc5035c 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); @@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? 
name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg("cnid %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 901e83d65d2021..afc9c89e8c6aff 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -18,12 +18,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->tree = tree; fd->bnode = NULL; - ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); + ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); @@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -158,6 +158,12 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) __be32 data; int height, res; + fd->record = -1; + fd->keyoffset = -1; + fd->keylength = -1; + fd->entryoffset = -1; + fd->entrylength = -1; + tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index bd8dcea8558800..1b3af8c87cadb5 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg("size %u, offset %u, len %u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -155,7 +155,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg("start %u, max %u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg("offset %u, count %u\n", offset, count); /* are all of the bits in range? 
*/ if ((offset + count) > sbi->total_blocks) return -ENOENT; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 14f4995588ff03..63e652ad1e0def 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -18,47 +18,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" -static inline -bool is_bnode_offset_valid(struct hfs_bnode *node, int off) -{ - bool is_valid = off < node->tree->node_size; - - if (!is_valid) { - pr_err("requested invalid offset: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d\n", - node->this, node->type, node->height, - node->tree->node_size, off); - } - - return is_valid; -} - -static inline -int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) -{ - unsigned int node_size; - - if (!is_bnode_offset_valid(node, off)) - return 0; - - node_size = node->tree->node_size; - - if ((off + len) > node_size) { - int new_len = (int)node_size - off; - - pr_err("requested length has been corrected: " - "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, " - "requested_len %d, corrected_len %d\n", - node->this, node->type, node->height, - node->tree->node_size, off, len, new_len); - - return new_len; - } - - return len; -} /* Copy a specified range of bytes from the raw data of a node */ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) @@ -214,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -272,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) void *src_ptr, *dst_ptr; int l; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -392,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -410,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -456,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? 
*/ if (!node->prev && !node->next) - hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -511,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -551,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -697,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -710,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 1918544a78716e..b4645102feecd4 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -92,7 +92,7 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -193,7 +193,7 @@ int hfs_brec_remove(struct hfs_find_data *fd) mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -383,7 +383,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -395,7 +395,7 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - hfs_dbg(BNODE_MOD, "splitting index node\n"); + hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 9e1732a2b92a8c..7cc5aea145720c 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -393,6 +393,12 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) len = hfs_brec_lenoff(node, 
2, &off16); off = off16; + if (!is_bnode_offset_valid(node, off)) { + hfs_bnode_put(node); + return ERR_PTR(-EIO); + } + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap_local_page(*pagep); @@ -428,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap_local(data); nidx = node->next; if (!nidx) { - hfs_dbg(BNODE_MOD, "create new bmap node\n"); + hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -454,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 1995bafee83901..02c1eee4a4b860 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) int err, off; u16 type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 876bbb80fb4dce..1b3e27a0d5e038 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) fd.entrylength); type = be16_to_cpu(entry.type); len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; - err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); + err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; if (type == HFSPLUS_FOLDER) { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1699b3c246ae4..8e886514d27f1e 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg("ino %lu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent "); for (i = 0; i < 8; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" start_block %u, block_count %u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -359,8 +359,7 @@ static int hfsplus_free_extents(struct super_block *sb, if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - 
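hfs_bmap_alloc() above now validates the bmap record offset and clamps the requested length before touching the node's pages. A condensed sketch of a guarded read built on the two helpers this series moves into hfsplus_fs.h; demo_read() is a hypothetical caller, not code from the patch:

#include "hfsplus_fs.h"

static int demo_read(struct hfs_bnode *node, void *buf, int off, int len)
{
	if (!is_bnode_offset_valid(node, off))
		return -EIO;	/* offset beyond node_size: corruption */

	len = check_and_correct_requested_length(node, off, len);
	if (!len)
		return -EIO;	/* nothing safely readable at this offset */

	hfs_bnode_read(node, buf, off, len);	/* now known to be in range */
	return 0;
}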
pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; @@ -370,8 +369,7 @@ static int hfsplus_free_extents(struct super_block *sb, count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -521,7 +520,7 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 96a5c24813dd6d..89e8b19c127b0a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,47 +11,14 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include #include #include +#include #include "hfsplus_raw.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 -#define DBG_ATTR_MOD 0x00000080 - -#if 0 -#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#endif -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
*/ @@ -521,8 +488,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2); -int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, - char *astr, int *len_p); +int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p); +int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p); int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); @@ -577,6 +548,48 @@ hfsplus_btree_lock_class(struct hfs_btree *tree) return class; } +static inline +bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +{ + bool is_valid = off < node->tree->node_size; + + if (!is_valid) { + pr_err("requested invalid offset: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d\n", + node->this, node->type, node->height, + node->tree->node_size, off); + } + + return is_valid; +} + +static inline +int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +{ + unsigned int node_size; + + if (!is_bnode_offset_valid(node, off)) + return 0; + + node_size = node->tree->node_size; + + if ((off + len) > node_size) { + int new_len = (int)node_size - off; + + pr_err("requested length has been corrected: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, " + "requested_len %d, corrected_len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len, new_len); + + return new_len; + } + + return len; +} + /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 86351bdc898591..16bc4abc67e08f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -68,13 +68,26 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; - INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); - spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); - mutex_init(&HFSPLUS_I(inode)->extents_lock); - HFSPLUS_I(inode)->flags = 0; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->first_blocks = 0; + HFSPLUS_I(inode)->clump_blocks = 0; + HFSPLUS_I(inode)->alloc_blocks = 0; + HFSPLUS_I(inode)->cached_start = U32_MAX; + HFSPLUS_I(inode)->cached_blocks = 0; + memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec)); HFSPLUS_I(inode)->extent_state = 0; + mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->rsrc_inode = NULL; - atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + HFSPLUS_I(inode)->create_date = 0; + HFSPLUS_I(inode)->linkid = 0; + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->fs_blocks = 0; + HFSPLUS_I(inode)->userflags = 0; + HFSPLUS_I(inode)->subfolders = 0; + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); + HFSPLUS_I(inode)->phys_size = 0; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { @@ -150,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); err = 
hfsplus_ext_write_extent(inode); if (err) @@ -165,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -184,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. @@ -215,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); + hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n", + sbi->free_blocks, sbi->next_cnid, + sbi->folder_count, sbi->file_count); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; @@ -240,6 +257,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); + hfs_dbg("finished: err %d\n", error); + return error; } @@ -288,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - hfs_dbg(SUPER, "hfsplus_put_super\n"); + hfs_dbg("starting...\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -310,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb) kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); call_rcu(&sbi->rcu, delayed_free); + + hfs_dbg("finished\n"); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -524,7 +545,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { - err = -EINVAL; + err = -EIO; goto out_put_root; } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 36b6cf2a3abba4..11e08a4a18b295 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -40,6 +40,18 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + while (1) { c1 = c2 = 0; @@ -74,6 +86,18 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1, p1 = s1->unicode; p2 = s2->unicode; + if (len1 > HFSPLUS_MAX_STRLEN) { + len1 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s1->length), len1); + } + + if (len2 > HFSPLUS_MAX_STRLEN) { + len2 = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(s2->length), len2); + } + for (len = min(len1, len2); len > 0; len--) { c1 = be16_to_cpu(*p1); c2 = be16_to_cpu(*p2); @@ -119,9 +143,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -int hfsplus_uni2asc(struct super_block *sb, - const struct hfsplus_unistr *ustr, - char *astr, int *len_p) +static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + int max_len, char *astr, 
int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; @@ -134,8 +157,8 @@ int hfsplus_uni2asc(struct super_block *sb, ip = ustr->unicode; ustrlen = be16_to_cpu(ustr->length); - if (ustrlen > HFSPLUS_MAX_STRLEN) { - ustrlen = HFSPLUS_MAX_STRLEN; + if (ustrlen > max_len) { + ustrlen = max_len; pr_err("invalid length %u has been corrected to %d\n", be16_to_cpu(ustr->length), ustrlen); } @@ -256,6 +279,21 @@ int hfsplus_uni2asc(struct super_block *sb, return res; } +inline int hfsplus_uni2asc_str(struct super_block *sb, + const struct hfsplus_unistr *ustr, char *astr, + int *len_p) +{ + return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p); +} + +inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, + const struct hfsplus_attr_unistr *ustr, + char *astr, int *len_p) +{ + return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, + HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); +} + /* * Convert one or more ASCII characters into a single unicode character. * Returns the number of ASCII characters corresponding to the unicode char. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 18dc3d254d218c..ece4d29c0ab9c2 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file, u32 used_bmp_bytes; u64 tmp; - hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + hfs_dbg("clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ @@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb) struct page *page; int old_state = HFSPLUS_EMPTY_ATTR_TREE; - hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID); check_attr_tree_state_again: switch (atomic_read(&sbi->attr_tree_state)) { @@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) goto end_listxattr; xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; - if (hfsplus_uni2asc(inode->i_sb, - (const struct hfsplus_unistr *)&fd.key->attr.key_name, - strbuf, &xattr_name_len)) { + if (hfsplus_uni2asc_xattr_str(inode->i_sb, + &fd.key->attr.key_name, strbuf, + &xattr_name_len)) { pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd72..1e1acf5775ab5f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db67..34008442ee265f 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf9e..be4be99304bc01 100644 --- 
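With hfsplus_uni2asc() made static and parameterized on max_len, each call site picks the wrapper that matches its key type, so catalog names clamp to HFSPLUS_MAX_STRLEN and xattr keys to HFSPLUS_ATTR_MAX_STRLEN. A rough usage sketch; demo_print_names() is illustrative, and a real key holds either a catalog or an attribute name, not both:

#include <linux/slab.h>
#include "hfsplus_fs.h"

static void demo_print_names(struct super_block *sb, struct hfs_find_data *fd)
{
	int len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
	char *buf = kmalloc(len, GFP_KERNEL);

	if (!buf)
		return;

	/* catalog entry name: clamped to HFSPLUS_MAX_STRLEN */
	if (!hfsplus_uni2asc_str(sb, &fd->key->cat.name, buf, &len))
		pr_info("catalog name: %.*s\n", len, buf);

	len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
	/* xattr key name: clamped to HFSPLUS_ATTR_MAX_STRLEN */
	if (!hfsplus_uni2asc_xattr_str(sb, &fd->key->attr.key_name, buf, &len))
		pr_info("xattr name: %.*s\n", len, buf);

	kfree(buf);
}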
a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, /* * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) while holding - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. + * unmapped in the caller, or hugetlb_vmdelete_list() skipped + * unmapping it because it failed to grab the lock. Unmap (again) + * while holding the fault mutex. The mutex will prevent + * faults until we finish removing the folio. Hold folio + * lock to guarantee no concurrent migration. */ + folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); - folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In diff --git a/fs/init.c b/fs/init.c index eef5124885e372..07f592ccdba868 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); @@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e2d..ec9339024ac36e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING |
I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { - void *bit_address; + void *bit_address; - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); @@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + might_sleep(); + again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); @@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + might_sleep(); + again: inode = find_inode(sb, head, test, data, false); if (inode) { @@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); if (inode) { @@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + + might_sleep(); + again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { @@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); @@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + might_sleep(); + while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); @@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct inode *old; + might_sleep(); + inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); @@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when 
we're dropping the last reference @@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && @@ -1908,20 +1925,45 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (!inode) + might_sleep(); + if (unlikely(!inode)) return; - BUG_ON(inode->i_state & I_CLEAR); + retry: - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { - if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { - atomic_inc(&inode->i_count); - spin_unlock(&inode->i_lock); - trace_writeback_lazytime_iput(inode); - mark_inode_dirty_sync(inode); - goto retry; - } - iput_final(inode); + lockdep_assert_not_held(&inode->i_lock); + VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + /* + * Note this assert is technically racy as if the count is bogusly + * equal to one, then two CPUs racing to further drop it can both + * conclude it's fine. + */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + + spin_lock(&inode->i_lock); + if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + spin_unlock(&inode->i_lock); + goto retry; } + + if (!atomic_dec_and_test(&inode->i_count)) { + spin_unlock(&inode->i_lock); + return; + } + + /* + * iput_final() drops ->i_lock, we can't assert on it as the inode may + * be deallocated by the time the call returns. + */ + iput_final(inode); } EXPORT_SYMBOL(iput); @@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2519,21 +2560,28 @@ void __init inode_init(void) void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; - if (S_ISCHR(mode)) { + switch (inode->i_mode & S_IFMT) { + case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; - } else if (S_ISBLK(mode)) { + break; + case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; - } else if (S_ISFIFO(mode)) + break; + case S_IFIFO: inode->i_fop = &pipefifo_fops; - else if (S_ISSOCK(mode)) - ; /* leave it no_open_fops */ - else + break; + case S_IFSOCK: + /* leave it no_open_fops */ + break; + default: printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); + break; + } } EXPORT_SYMBOL(init_special_inode); @@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid); * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground. + * + * TODO: handle getting to fs type with get_kernel_nofault()? + * See dump_mapping() above. 
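The reworked iput() above replaces atomic_dec_and_lock() with a lockless fast path: atomic_add_unless() drops the reference without touching i_lock whenever it is not the last one, and only the final put takes the lock before iput_final(). A distilled sketch of that put-side pattern on a generic refcounted object; demo_obj and demo_put() are illustrative, and the I_DIRTY_TIME rechecks are omitted:

#include <linux/atomic.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_obj {
	atomic_t count;
	spinlock_t lock;
};

static void demo_put(struct demo_obj *obj)
{
	/* fast path: count was > 1, dropped without taking the lock */
	if (atomic_add_unless(&obj->count, -1, 1))
		return;

	/* slow path: possibly the last reference */
	spin_lock(&obj->lock);
	if (!atomic_dec_and_test(&obj->count)) {
		spin_unlock(&obj->lock);
		return;
	}
	spin_unlock(&obj->lock);
	kfree(obj);	/* last reference: tear the object down */
}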
*/ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + struct super_block *sb = inode->i_sb; + + pr_warn("%s encountered for inode %px\n" + "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", + reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, + inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbda3..a33d18ee5b74d2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/ioctl.c b/fs/ioctl.c index 0248cb8db2d363..1c152c2b1b67de 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { @@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range, info[count]); + size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd827398afd2ff..8b847a1e27f13e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, size_t size = i_size_read(iter->inode) - iomap->offset; size_t offset = offset_in_folio(folio, iomap->offset); + if (WARN_ON_ONCE(!iomap->inline_data)) + return -EIO; + if (folio_test_uptodate(folio)) return 0; @@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return true; } -static void iomap_write_end_inline(const struct iomap_iter *iter, +static bool iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); + if (WARN_ON_ONCE(!iomap->inline_data)) + return false; + flush_dcache_folio(folio); addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); mark_inode_dirty(iter->inode); + return true; } /* @@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; - if (srcmap->type == IOMAP_INLINE) { - iomap_write_end_inline(iter, folio, pos, copied); - return true; - } + if (srcmap->type == IOMAP_INLINE) + return iomap_write_end_inline(iter, folio, pos, copied); if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; @@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); + trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset, + bytes); + 
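Both the buffered and direct-I/O inline paths above now treat a NULL iomap->inline_data as corruption and return -EIO behind WARN_ON_ONCE() instead of dereferencing it. A minimal sketch of the guard; demo_inline_read() is illustrative:

#include <linux/iomap.h>
#include <linux/string.h>

static int demo_inline_read(const struct iomap *iomap, void *dst, size_t len)
{
	/* mapping claims inline data but carries no pointer: corruption */
	if (WARN_ON_ONCE(!iomap->inline_data))
		return -EIO;

	memcpy(dst, iomap->inline_data, len);
	return 0;
}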
folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c88..46aa85af13dc56 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) loff_t pos = iomi->pos; u64 copied; + if (WARN_ON_ONCE(!inline_data)) + return -EIO; + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6ad66e6ba653e8..a61c1dae474270 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); +DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index a6c692cac61659..9adf36e6364b7d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) !list_empty(&of->list)); } +/* Get active reference to kernfs node for an open file */ +static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) +{ + /* Skip if file was already released */ + if (unlikely(of->released)) + return NULL; + + if (!kernfs_get_active(of->kn)) + return NULL; + + return of; +} + +static void kernfs_put_active_of(struct kernfs_open_file *of) +{ + return kernfs_put_active(of->kn); +} + /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * @@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) @@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. 
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) @@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) @@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; @@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) else file_update_time(file); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &kernfs_vm_ops; out_put: - kernfs_put_active(of->kn); + kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); @@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; - if (!kernfs_get_active(kn)) + if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) @@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) else ret = kernfs_generic_poll(of, wait); - kernfs_put_active(kn); + kernfs_put_active_of(of); return ret; } @@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) * the ops aren't called concurrently for the same open file. 
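Every kernfs file operation above is converted to take its active reference through kernfs_get_active_of(), which refuses once of->released is set, closing the window where an op could run against an already released open file. The common shape of the converted paths, sketched below; demo_op() and do_work() are stand-ins, not kernel API:

static ssize_t demo_op(struct kernfs_open_file *of)
{
	ssize_t ret;

	/* fails if the node is deactivated or the file was released */
	if (!kernfs_get_active_of(of))
		return -ENODEV;

	ret = do_work(of);		/* hypothetical op body */

	kernfs_put_active_of(of);
	return ret;
}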
*/ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } @@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) else ret = generic_file_llseek(file, offset, whence); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece7b..76eaf64b9d9e08 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/locks.c b/fs/locks.c index 559f02aa417221..04a3f0e2072461 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2328,8 +2328,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations - * flags need to be set. + * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags + * need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df9d11479caf1e..32db676127a9ed 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &minix_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); + } else { + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + make_bad_inode(inode); + } } /* diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9df..79c85639a7ba0e 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -17,11 +17,7 @@ struct mnt_namespace { }; struct user_namespace *user_ns; struct ucounts *ucounts; - u64 seq; /* Sequence number to prevent loops */ - union { - wait_queue_head_t poll; - struct rcu_head mnt_ns_rcu; - }; + wait_queue_head_t poll; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY @@ -30,8 +26,6 @@ struct mnt_namespace { #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern seqlock_t mount_lock; @@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->seq == 0; + return ns->ns.ns_id == 0; } static inline 
bool anon_ns_root(const struct mount *m) diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa38..507ca0d7878d6a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) @@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) @@ -1827,6 +1835,20 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} + static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { @@ -2744,7 +2766,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2756,18 +2779,42 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } -struct dentry *kern_path_locked_negative(const char *name, struct path *path) +/** + * kern_path_parent: lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a 
normal component, not "." or ".." or "/". + * A lookup is performed and if successful the parent information + * is stored in @path and the dentry is returned. + * + * The dentry may be negative; the parent will be positive. + * + * Returns: dentry or error. + */ +struct dentry *kern_path_parent(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); @@ -2780,35 +2827,35 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); + + d = lookup_noperm_unlocked(&last, parent_path.dentry); + if (IS_ERR(d)) return d; - } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -3010,6 +3057,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, } EXPORT_SYMBOL(lookup_one_unlocked); +/** + * lookup_one_positive_killable - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have ->d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * It should be called without the parent i_rwsem held, and will take + * the i_rwsem itself if necessary. If a fatal signal is pending or + * delivered, it will return %-EINTR if the lock is needed.
+ */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); + /** * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from @@ -4114,7 +4202,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; - int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); @@ -4129,7 +4216,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(path->mnt); + error = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. @@ -4142,25 +4229,24 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (IS_ERR(dentry)) goto unlock; - if (unlikely(err2)) { - error = err2; + if (unlikely(error)) goto fail; - } + return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); - if (!err2) + if (!error) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4168,9 +4254,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4178,10 +4264,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4189,7 +4276,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4297,7 +4384,7 @@ static int do_mknodat(int dfd, struct filename *name, umode_t mode, break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4401,7 +4488,7 @@ int 
do_mkdirat(int dfd, struct filename *name, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4755,7 +4842,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to) if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4828,7 +4915,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) @@ -4924,7 +5011,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { @@ -5024,20 +5111,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -5052,13 +5139,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_idmap, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_idmap, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -5126,7 +5213,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -5269,10 +5356,9 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; - rd.old_mnt_idmap = mnt_idmap(old_path.mnt); + rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; - rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b1849c..dc01b14c58cd6b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" @@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static char * __initdata initramfs_options; 
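/*
 * For context, a sketch of how this early parameter flows (assumption:
 * rootfs is tmpfs-backed, so tmpfs-style options apply). Booting with e.g.
 *
 *	initramfs_options=size=50%,nr_inodes=1m
 *
 * runs the __setup() handler below during early parameter parsing, and
 * init_mount_tree() later hands the saved string to
 * vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options).
 */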
+static int __init initramfs_options_setup(char *str) +{ + initramfs_options = str; + return 1; +} + +__setup("initramfs_options=", initramfs_options_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ -static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif -static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ -static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), @@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { + struct ns_common *ns; + if (!node) return NULL; - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); -} - -static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct mnt_namespace *ns_a = node_to_mnt_ns(a); - struct mnt_namespace *ns_b = node_to_mnt_ns(b); - u64 seq_a = ns_a->seq; - u64 seq_b = ns_b->seq; - - if (seq_a < seq_b) - return -1; - if (seq_a > seq_b) - return 1; - return 0; -} - -static inline void mnt_ns_tree_write_lock(void) -{ - write_seqlock(&mnt_ns_tree_lock); -} - -static inline void mnt_ns_tree_write_unlock(void) -{ - write_sequnlock(&mnt_ns_tree_lock); -} - -static void mnt_ns_tree_add(struct mnt_namespace *ns) -{ - struct rb_node *node, *prev; - - mnt_ns_tree_write_lock(); - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. 
- */ - prev = rb_prev(&ns->mnt_ns_tree_node); - if (!prev) - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); - else - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); - mnt_ns_tree_write_unlock(); - - WARN_ON_ONCE(node); + ns = rb_entry(node, struct ns_common, ns_tree_node); + return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ - if (refcount_dec_and_test(&ns->passive)) { + if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); @@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!is_anon_ns(ns)) { - mnt_ns_tree_write_lock(); - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); - list_bidir_del_rcu(&ns->mnt_ns_list); - mnt_ns_tree_write_unlock(); - } + if (ns_tree_active(ns)) + ns_tree_remove(ns); - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); -} - -static int mnt_ns_find(const void *key, const struct rb_node *node) -{ - const u64 mnt_ns_id = *(u64 *)key; - const struct mnt_namespace *ns = node_to_mnt_ns(node); - - if (mnt_ns_id < ns->seq) - return -1; - if (mnt_ns_id > ns->seq) - return 1; - return 0; + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* @@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node) */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; - struct rb_node *node; - unsigned int seq; + struct mnt_namespace *mnt_ns; + struct ns_common *ns; guard(rcu)(); - do { - seq = read_seqbegin(&mnt_ns_tree_lock); - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); - if (node) - break; - } while (read_seqretry(&mnt_ns_tree_lock, seq)); - - if (!node) + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); + if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. 
*/ - ns = node_to_mnt_ns(node); - refcount_inc(&ns->passive); - return ns; + mnt_ns = container_of(ns, struct mnt_namespace, ns); + refcount_inc(&mnt_ns->passive); + return mnt_ns; } static inline void lock_mount_hash(void) @@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt) return false; seq = mnt->mnt_ns->seq_origin; - return !seq || (seq == current->nsproxy->mnt_ns->seq); + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* @@ -2152,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { + struct ns_common *ns; + guard(rcu)(); for (;;) { - struct list_head *list; - - if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); - else - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); - if (list_is_head(list, &mnt_ns_list)) - return ERR_PTR(-ENOENT); + ns = ns_tree_adjoined_rcu(mntns, previous); + if (IS_ERR(ns)) + return ERR_CAST(ns); - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU @@ -2179,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr * the mount namespace and it might already be on its * deathbed. */ - if (!refcount_inc_not_zero(&mntns->ns.count)) + if (!ns_ref_get(mntns)) continue; return mntns; @@ -2204,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!mnt_ns) return false; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, @@ -2455,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path) return ERR_PTR(-EINVAL); } - if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) @@ -3080,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else - ns->seq_origin = src_mnt_ns->seq; + ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); @@ -3289,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
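 *
 * For example (illustrative mount(2) calls, not from the patch):
 *
 *	mount(NULL, "/m", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
 *
 * changes only per-mount flags via do_reconfigure_mnt(), whereas
 *
 *	mount(NULL, "/m", NULL, MS_REMOUNT|MS_RDONLY, "data");
 *
 * reaches do_remount() below and reconfigures the superblock as well.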
*/ -static int do_remount(struct path *path, int ms_flags, int sb_flags, +static int do_remount(struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3727,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, int error; error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) + if (!error && mount_too_revealing(sb, &mnt_flags)) { + errorfcp(fc, "VFS", "Mount too revealing"); error = -EPERM; + } if (unlikely(error)) { fc_drop_locked(fc); @@ -4112,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path, if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) - return do_remount(path, flags, sb_flags, mnt_flags, data_page); + return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -4151,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. - */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; @@ -4180,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - if (!anon) { - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); - } + + if (anon) + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); } - new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); - refcount_set(&new_ns->ns.count, 1); + ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; - INIT_LIST_HEAD(&new_ns->mnt_ns_list); - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -4203,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -4234,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(new_ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); @@ -4281,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); - mnt_ns_tree_add(new_ns); +
ns_tree_add_raw(new_ns); return new_ns; } @@ -4444,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { - pr_warn("VFS: Mount too revealing\n"); + errorfcp(fc, "VFS", "Mount too revealing"); goto err_unlock; } @@ -5007,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5400,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; - s->sm.mnt_ns_id = ns->seq; + s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) @@ -5711,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { - struct path root __free(path_put) = {}; struct mount *m; int err; @@ -5723,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!s->mnt) return -ENOENT; - err = grab_requested_root(ns, &root); + err = grab_requested_root(ns, &s->root); if (err) return err; @@ -5732,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, * mounts to show users. */ m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -5740,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (err) return err; - s->root = root; - /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the @@ -5910,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -5963,28 +5891,40 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); + path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } -static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, - u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, - bool reverse) +struct klistmount { + u64 last_mnt_id; + u64 mnt_parent_id; + u64 *kmnt_ids; + u32 nr_mnt_ids; + struct mnt_namespace *ns; + struct path root; +}; + +static ssize_t do_listmount(struct klistmount *kls, bool reverse) { - struct path root __free(path_put) = {}; + struct mnt_namespace *ns = kls->ns; + u64 mnt_parent_id = kls->mnt_parent_id; + u64 last_mnt_id = kls->last_mnt_id; + u64 *mnt_ids = kls->kmnt_ids; + size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - ret = grab_requested_root(ns, &root); + ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { - orig = root; + orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) @@ -5996,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, * Don't trigger 
audit denials. We just want to determine what * mounts to show users. */ - if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -6029,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, return ret; } +static void __free_klistmount_free(const struct klistmount *kls) +{ + path_put(&kls->root); + kvfree(kls->kmnt_ids); + mnt_ns_release(kls->ns); +} + +static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, + size_t nr_mnt_ids) +{ + + u64 last_mnt_id = kreq->param; + + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + + kls->last_mnt_id = last_mnt_id; + + kls->nr_mnt_ids = nr_mnt_ids; + kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kls->kmnt_ids) + return -ENOMEM; + + kls->ns = grab_requested_mnt_ns(kreq); + if (!kls->ns) + return -ENOENT; + + kls->mnt_parent_id = kreq->mnt_id; + return 0; +} + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - u64 *kmnt_ids __free(kvfree) = NULL; + struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; - struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) @@ -6057,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (ret) return ret; - last_mnt_id = kreq.param; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; - - kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), - GFP_KERNEL_ACCOUNT); - if (!kmnt_ids) - return -ENOMEM; - - ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); + if (ret) + return ret; - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* @@ -6080,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * listmount() doesn't care about any mount properties. 
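 *
 * Userspace view of the syscall this serves, as a minimal sketch
 * (illustrative, not from the patch):
 *
 *	struct mnt_id_req req = {
 *		.size   = MNT_ID_REQ_SIZE_VER0,
 *		.mnt_id = LSMT_ROOT,
 *		.param  = 0,
 *	};
 *	uint64_t ids[256];
 *	ssize_t n = syscall(__NR_listmount, &req, ids, 256, 0);
 *
 * .param carries the last mount id already seen, so userspace can page
 * through a namespace in chunks by repeating the call.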
*/ scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, - nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; - if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = ns_init_inum(&init_mnt_ns), + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; - mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->seq = atomic64_inc_return(&mnt_ns_seq); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6120,7 +6085,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - mnt_ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) @@ -6160,7 +6125,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!refcount_dec_and_test(&ns->ns.count)) + if (!ns_ref_put(ns)) return; namespace_lock(); emptied_ns = ns; @@ -6409,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 18b3dc74c70e41..37ab6f28b5ad0e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl) return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - return netfs_put_request(rreq, netfs_rreq_trace_put_failed); + return netfs_put_failed_request(rreq); } EXPORT_SYMBOL(netfs_readahead); @@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -699,7 +699,7 @@ int netfs_write_begin(struct netfs_inode *ctx, return 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); error: if (folio) { folio_unlock(folio); @@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, return ret < 0 ? 
ret : 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f27ea5099a6813..09394ac2c180d3 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); if (ret < 0) - goto error_folio_unlock; + goto out; continue; copied: diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a05e13472bafb2..a498ee8d66745f 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); return -EIO; } @@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) - goto out; + goto error_put; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); @@ -238,6 +239,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (ret > 0) orig_count -= ret; return ret; + +error_put: + netfs_put_failed_request(rreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a16660ab7f8385..a9d1c3b2c08426 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; - goto out; + goto error_put; } wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; @@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; + +error_put: + netfs_put_failed_request(wreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d4f16fefd96518..4319611f535449 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); void netfs_clear_subrequests(struct netfs_io_request *rreq); void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_put_failed_request(struct netfs_io_request *rreq); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 20748bcfbf5902..486166460e177d 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq) { if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - queue_work(system_unbound_wq, &rreq->work); + queue_work(system_dfl_wq, &rreq->work); } else { trace_netfs_rreq(rreq, 
netfs_rreq_trace_wake_queue); wake_up(&rreq->waitq); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e8c99738b5bbf2..b8c4918d3dcdab 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) netfs_stat_d(&netfs_n_rh_rreq); } -static void netfs_free_request(struct work_struct *work) +static void netfs_deinit_request(struct netfs_io_request *rreq) { - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; @@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work) if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, cleanup_work); + + netfs_deinit_request(rreq); call_rcu(&rreq->rcu, netfs_free_request_rcu); } @@ -163,10 +169,28 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); if (dead) - WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); + WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work)); } } +/* + * Free a request (synchronously) that was just allocated but has + * failed before it could be submitted. + */ +void netfs_put_failed_request(struct netfs_io_request *rreq) +{ + int r = refcount_read(&rreq->ref); + + /* New requests have two references (see + * netfs_alloc_request()), and this function is only allowed on + * new request objects. + */ + WARN_ON_ONCE(r != 2); + + trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed); + netfs_free_request(&rreq->cleanup_work); +} + /* * Allocate and partially initialise an I/O request structure.
*/ diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 8097bc069c1de6..a1489aa29f782a 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, netfs_rreq_trace_put_return); + netfs_put_failed_request(creq); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fa622a6cd56da3..5c0dc4efc79227 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite return ret; cleanup_free: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 0584cba1a04392..dd8743bc8d7fe3 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: - wreq->error = -ENOMEM; - netfs_put_request(wreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(wreq); return ERR_PTR(-ENOMEM); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8fb4a950dd5581..4e3dcc157a83c8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 86e36c630f09ea..8059ece82468d0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb, dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); fgp |= fgf_set_order(len); start: @@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* 
[DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8dc921d835388e..9edb5f9b0c4e47 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) @@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + // reinitialize the error state in case this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; break; @@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + if (!IS_ERR(ds)) return ds; return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } @@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, best_idx); - if (ds || !pgio->pg_mirror_idx) + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); } @@ -868,7 +871,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae42308..9bdaf7f38bedc9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } @@ -767,8 +768,10
@@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { @@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74d712b584238d..c0a44f389f8f42 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9f5..d275b0a250bf3b 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bd5fca28589989..97abf62f109d2e 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ 
-244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } @@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); + + revert_creds(save_cred); + if (status != -EIOCBQUEUED) { nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); } static int @@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); + + revert_creds(save_cred); + current->flags = old_flags; + if (status != -EIOCBQUEUED) { nfs_local_write_done(iocb, status); nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); - current->flags = old_flags; } static int diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff21d..f9a3a1fbf44ce8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 01c01f45358b7c..6a0b5871ba3b09 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; @@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | NFS_CAP_ZERO_RANGE); @@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) @@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == 0) + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); truncate_pagecache_range(inode, offset, (offset + len) -1); - if (err == -EOPNOTSUPP) + } else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); @@ -354,22 +363,27 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst, /** * nfs42_copy_dest_done - perform inode cache updates after 
clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; @@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1d6b5f4230c9b2..c9a0d1e420c6cb 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d2b67e06cc37f..ce61253efd45b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| 
NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) @@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af079691..18ae614e5a6c39 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 96b1323318c2f5..627115179795fc 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8b7c0473796755..647c53d1418ae6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page + * nfs_page_covers_folio + * @req: struct nfs_page * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group - * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. 
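 * (Worked example, editorial: for a fully-valid 4096-byte folio a
 * covering request has wb_pgbase == 0 and wb_bytes == 4096; for the
 * folio straddling EOF, nfs_folio_length() shrinks to
 * i_size - folio_pos(), so wb_bytes only needs to reach the valid
 * tail for the request to count as covering.)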
+ * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - - nfs_page_group_lock(req); - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -2045,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a569..85ca663c052c1f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -113,7 +113,7 @@ static void nfsd_file_schedule_laundrette(void) { if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) - queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette, + queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index edf050766e5705..aa4a95713a4823 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1951,10 +1951,9 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = fdentry, .old_dentry = odentry, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = tdentry, .new_dentry = ndentry, }; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 14868a3dd592ca..bc52afbfc5c739 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, - struct attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); @@ -1087,7 +1087,7 @@ static const char features_readme_str[] = "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, - struct attribute *attr, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 78a87a016928b7..d370cd5cce3f5d 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups { struct completion sg_segments_kobj_unregister; }; -#define NILFS_COMMON_ATTR_STRUCT(name) \ +#define NILFS_KOBJ_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ struct attribute attr; \ - ssize_t (*show)(struct kobject *, struct attribute *, \ + ssize_t (*show)(struct kobject *, struct kobj_attribute *, \ char *); \ - ssize_t (*store)(struct kobject *, struct attribute *, \ + ssize_t (*store)(struct kobject *, struct kobj_attribute *, \ const char *, size_t); \ } -NILFS_COMMON_ATTR_STRUCT(feature); +NILFS_KOBJ_ATTR_STRUCT(feature); #define NILFS_DEV_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c21d..46bfc543f9467c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void 
fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 798340db69d761..55a03bb05aa118 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); - queue_work(system_unbound_wq, &connector_reaper_work); + queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about @@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, + queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); diff --git a/fs/nsfs.c b/fs/nsfs.c index 59aa801347a7de..e7fd8a790aaa4b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,12 +13,26 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); - kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) @@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + case NS_GET_ID: + return true; } /* Extensible ioctls require some extra handling. 
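 *
 * The userspace struct size is encoded in the ioctl number itself.
 * Assuming extensible_ioctl_valid() matches on type/number and bounds
 * the size (a reading of its use here, not taken from this patch), a
 * caller built against a newer, larger struct mnt_ns_info still passes:
 *
 *	_IOC_NR(cmd) == _IOC_NR(NS_MNT_GET_INFO) &&
 *	_IOC_SIZE(cmd) >= MNT_NS_INFO_SIZE_VER0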
*/ switch (_IOC_NR(cmd)) { case _IOC_NR(NS_MNT_GET_INFO): + return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_NEXT): + return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_PREV): - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); } return false; @@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->seq; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ns_type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ @@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ns_type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < 
NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!__ns_ref_get(ns)) + return NULL; + } + + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. 
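+ * (Editor's illustrative sketch of the intended userspace round trip,
+ * assuming the generic file-handle syscalls; not part of the patch:
+ *
+ *	struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
+ *	fh->handle_bytes = MAX_HANDLE_SZ;
+ *	name_to_handle_at(ns_fd, "", fh, &mnt_id, AT_EMPTY_PATH);
+ *	fd = open_by_handle_at(mnt_fd, fh, O_RDONLY);
+ *
+ * re-opening a namespace file from its {ns_id, ns_type, inum} handle.)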
*/ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; @@ -438,4 +607,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2018501b224937..2347a50f079b7b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e158..cccaa1d6fbbac1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { @@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 930150ed5db15f..ef147e8b327126 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -706,6 +706,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. 
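+ * (Editor's illustrative caller pattern, not part of the patch:
+ *
+ *	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ *	ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+ *	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ *
+ * The lockdep_assert_held_read() added below enforces this contract.)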
*/ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e36..267b50e8e42e5c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c3e..765105f1ff8a2c 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e742..b46100a4f52935 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index bb0d7ded8e763a..4f84abaa0d6805 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, { int err; struct renamedata rd = { - .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), + .mnt_idmap = ovl_upper_mnt_idmap(ofs), 
.old_parent = olddir, .old_dentry = olddentry, - .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_parent = newdir, .new_dentry = newdentry, .flags = flags, diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b65cdfce31ce27..15cb06fa0c9a10 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err; + int err = 0; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); - err = down_write_killable(&dir->d_inode->i_rwsem); - if (!err) { - while (rdd->first_maybe_whiteout) { - struct ovl_cache_entry *p = - rdd->first_maybe_whiteout; - rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), - &QSTR_LEN(p->name, p->len), dir); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } + while (rdd->first_maybe_whiteout) { + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), + dir); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + } else if (PTR_ERR(dentry) == -EINTR) { + err = -EINTR; + break; } - inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e910..bd3d7ba8fb95b5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837fd..c40c29c702e56d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) * erroneously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases.
*/ - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); } return false; @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d41..42fead1efe5204 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa931e..69269745d73b8f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, unsigned int max_fds = 0; rcu_read_lock(); - ppid = pid_alive(p) ? - task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); + ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bd0c099cfdd2a5..1762811122735f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -393,7 +393,8 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; - pde_set_flags(dp); + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be147..d9b7ef1223437d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 4403a2e20c165d..ea2b597fd92cdb 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct 
dentry *res = ERR_PTR(-ENOENT); diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac7103843b..1e24e085c7d5a1 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ns_type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. 
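+ * (Editor's illustrative sketch of that replacement flow, using the
+ * new-mount-API syscalls; not part of the patch:
+ *
+ *	fsfd = fsopen("proc", 0);
+ *	fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, pidns_fd);
+ *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+ *	mfd = fsmount(fsfd, 0, 0);
+ *	move_mount(mfd, "", AT_FDCWD, "/proc", MOVE_MOUNT_F_EMPTY_PATH);)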
+ */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 29cca0e6d0ff5a..b26ae556b4463e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2417,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p, { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + if (!p->vec_buf) + return; + if (cur_buf->start != addr) cur_buf->end = addr; else diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426ca1..b4e55c90f8dc23 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index ceb5639a062960..eb61ba5bb96422 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -43,7 +43,7 @@ struct psz_buffer { * * @magic: magic num for kmsg dump header * @time: kmsg dump trigger time - * @compressed: whether conpressed + * @compressed: whether compressed * @counter: kmsg dump counter * @reason: the kmsg dump reason (e.g. oops, panic, etc) * @data: pointer to log data @@ -214,7 +214,7 @@ static int psz_zone_write(struct pstore_zone *zone, atomic_set(&zone->buffer->datalen, wlen + off); } - /* avoid to damage old records */ + /* avoid damaging old records */ if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) goto dirty; @@ -249,7 +249,7 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: - /* no need to mark dirty if going to try next zone */ + /* no need to mark it dirty if going to try next zone */ if (wcnt == -ENOMSG) return -ENOMSG; atomic_set(&zone->dirty, true); @@ -378,7 +378,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) struct timespec64 time = { }; unsigned long i; /* - * Recover may on panic, we can't allocate any memory by kmalloc. + * Recover may happen on panic, we can't allocate any memory by kmalloc. * So, we use local array instead. */ char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; @@ -856,11 +856,11 @@ static int notrace psz_record_write(struct pstore_zone *zone, /** * psz_zone_write will set datalen as start + cnt. - * It work if actual data length lesser than buffer size. - * If data length greater than buffer size, pmsg will rewrite to - * beginning of zone, which make buffer->datalen wrongly. + * It works if actual data length is lesser than buffer size. + * If data length is greater than buffer size, pmsg will rewrite to + * the beginning of the zone, which makes buffer->datalen wrong. 
* So we should reset datalen as buffer size once actual data length - greater than buffer size. + is greater than buffer size. */ if (is_full_data) { atomic_set(&zone->buffer->datalen, zone->buffer_size); @@ -878,8 +878,9 @@ static int notrace psz_pstore_write(struct pstore_record *record) atomic_set(&cxt->on_panic, 1); /* - * if on panic, do not write except panic records - * Fix case that panic_write prints log which wakes up console backend. + * If on panic, do not write anything except panic records. + * Fix the case when panic_write prints log that wakes up + * console backend. */ if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) return -EBUSY; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index df4a9b34876965..afa15a21453822 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -881,7 +881,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_unbound_wq, &quota_release_work, 1); + queue_delayed_work(system_dfl_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e95..41f9995da7cab0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/read_write.c b/fs/read_write.c index c5b6265d984bae..833bae068770a4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; + /* + * Make sure return value doesn't overflow in 32bit compat mode. Also + * limit the size for all cases except when calling ->copy_file_range(). + */ + if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) + len = min_t(size_t, MAX_RW_COUNT, len); + file_start_write(file_out); /* @@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), - REMAP_FILE_CAN_SHORTEN); + file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; @@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * to splicing from input file, while file_start_write() is held on * the output file on a different sb.
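 * (Editor's note, for context: MAX_RW_COUNT is (INT_MAX & PAGE_MASK),
 * so the single clamp added earlier,
 *
 *	len = min_t(size_t, MAX_RW_COUNT, len);
 *
 * already keeps the ssize_t return representable, making the per-call
 * min_t() clamps below redundant; hence their removal.)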
*/ - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT), 0); + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09fd0..0d0ef54fc4de1f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -625,11 +631,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; + rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, &ci->shared_cpu_map, evtid, false); goto checkresult; @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03ad..cf1fd82dc5a99e 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. 
* @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. @@ -98,7 +110,9 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +126,8 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - unsigned int ci_id; + struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; @@ -226,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show); + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3acac..4076336fbba6db 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -346,28 
+346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; +} + +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp. + * + * Return: + * Valid counter ID on success, or -ENOENT on failure. + */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. + * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } } + + return -ENOSPC; } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. + */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; - struct cacheinfo *ci; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -378,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. 
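 * (Editor's cross-reference: mon_event_read() selects the execution CPU
 * via
 *	cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
 * before calling here, so the cpumask_test_cpu() check below acts as a
 * sanity check rather than a fallback path.)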
*/ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -389,8 +462,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci || ci->id != rr->ci_id) + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) return -EINVAL; /* @@ -402,10 +474,14 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci_id != rr->ci_id) + if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -421,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -430,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. 
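 * (Editor's worked example, with illustrative numbers: if consecutive
 * one-second samples read 3000 MB and 3400 MB of accumulated chunks,
 *
 *	cur_bw = (3400 MB - 3000 MB) / 1 s = 400 MBps,
 *
 * i.e. the delta between invocations over the sampling interval.)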
*/ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -461,7 +539,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -472,8 +550,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -604,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -714,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -844,38 +926,819 @@ static void dom_data_exit(struct rdt_resource *r) mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, +/* + * All available events. 
Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. + */ +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, }; -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +void resctrl_enable_mon_event(enum resctrl_event_id eventid) +{ + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } + + mon_event_all[eventid].enabled = true; +} + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; }; -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +/* Decoded values for each type of memory transaction. 
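+ * (Editor's illustrative note: writing "local_reads,remote_reads" to an
+ * event's event_filter file makes resctrl_parse_mem_transactions() OR
+ * together
+ *
+ *	READS_TO_LOCAL_MEM | READS_TO_REMOTE_MEM
+ *
+ * into that event's evt_cfg.)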
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, }; +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +/* + * mbm_cntr_free_all() - Clear all the counter ID configuration details in the + * domain @d. Called when mbm_assign_mode is changed. + */ +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); +} + +/* + * resctrl_reset_rmid_all() - Reset all non-architecture states for all the + * supported RMIDs. + */ +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + enum resctrl_event_id evt; + int idx; + + for_each_mbm_event_id(evt) { + if (!resctrl_is_mon_event_enabled(evt)) + continue; + idx = MBM_STATE_IDX(evt); + memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit); + } +} + +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. 
+ */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + /* - * Initialize the event list for the resource. + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. 
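+ * (Editor's illustration of the resulting interface: a counter that
+ * could not be allocated at mkdir time can be assigned later from
+ * userspace, e.g.
+ *
+ *	echo "mbm_total_bytes:0=e" > mon_groups/g1/mbm_L3_assignments
+ *
+ * using the write format handled by mbm_L3_assignments_write() below.)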
+ */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. 
*/ -static void l3_mon_evt_init(struct rdt_resource *r) +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { - INIT_LIST_HEAD(&r->evt_list); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. + * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event + * for all the groups. + * @mevt MBM Monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments. 
+ */ + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + rdtgroup_update_cntr_event(r, prgrp, mevt->evtid); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + rdtgroup_update_cntr_event(r, crgrp, mevt->evtid); + } +} + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + u32 evt_cfg = 0; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + ret = resctrl_parse_mem_transactions(buf, &evt_cfg); + if (!ret && mevt->evt_cfg != evt_cfg) { + mevt->evt_cfg = evt_cfg; + resctrl_update_cntr_allrdtgrp(mevt); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool enabled; + + mutex_lock(&rdtgroup_mutex); + enabled = resctrl_arch_mbm_cntr_assign_enabled(r); + + if (r->mon.mbm_cntr_assignable) { + if (enabled) + seq_puts(s, "[mbm_event]\n"); + else + seq_puts(s, "[default]\n"); + + if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) { + if (enabled) + seq_puts(s, "default\n"); + else + seq_puts(s, "mbm_event\n"); + } + } else { + seq_puts(s, "[default]\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *d; + int ret = 0; + bool enable; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!strcmp(buf, "default")) { + enable = 0; + } else if (!strcmp(buf, "mbm_event")) { + if (r->mon.mbm_cntr_assignable) { + enable = 1; + } else { + ret = -EINVAL; + rdt_last_cmd_puts("mbm_event mode is not supported\n"); + goto out_unlock; + } + } else { + ret = -EINVAL; + rdt_last_cmd_puts("Unsupported assign mode\n"); + goto out_unlock; + } + + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); + if (ret) + goto out_unlock; + + /* Update the visibility of BMEC related files */ + resctrl_bmec_files_show(r, NULL, !enable); + + /* + * Initialize the default memory transaction values for + * total and local events. + */ + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + /* Enable auto assignment when switching to "mbm_event" mode */ + if (enable) + r->mon.mbm_assign_on_mkdir = true; + /* + * Reset all the non-architectural RMID state and assignable counters.
+ */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + mbm_cntr_free_all(r, d); + resctrl_reset_rmid_all(r, d); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct mon_evt *mevt; + int ret = 0; + bool sep; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out_unlock; + } + + rdt_last_cmd_clear(); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + sep = false; + seq_printf(s, "%s:", mevt->name); + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0) + seq_printf(s, "%d=_", d->hdr.id); + else + seq_printf(s, "%d=e", d->hdr.id); + + sep = true; + } + seq_putc(s, '\n'); + } + +out_unlock: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. 
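Given the seq_printf() calls in mbm_L3_assignments_show() above, reading the file from a monitoring group yields one line per enabled MBM event, with ';'-separated per-domain states after the colon: 'e' where a counter is currently assigned to the group, '_' where none is. On a hypothetical system with two L3 domains the output could look like:

    mbm_total_bytes:0=e;1=e
    mbm_local_bytes:0=e;1=_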
+ */ +static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name) +{ + struct mon_evt *mevt; + + for_each_mon_event(mevt) { + if (mevt->rid == r->rid && mevt->enabled && + resctrl_is_mbm_event(mevt->evtid) && + !strcmp(mevt->name, name)) + return mevt; + } + + return NULL; +} + +static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int ret = 0; + + if (!assign || strlen(assign) != 1) + return -EINVAL; + + switch (*assign) { + case 'e': + ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt); + break; + case '_': + rdtgroup_unassign_cntr_event(d, rdtgrp, mevt); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, + char *event, char *tok) +{ + struct rdt_mon_domain *d; + unsigned long dom_id = 0; + char *dom_str, *id_str; + struct mon_evt *mevt; + int ret; + + mevt = mbm_get_mon_event_by_name(r, event); + if (!mevt) { + rdt_last_cmd_printf("Invalid event %s\n", event); + return -ENOENT; + } + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + + id_str = strsep(&dom_str, "="); + + /* Check for domain id '*' which means all domains */ + if (id_str && *id_str == '*') { + ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt); + if (ret) + rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n", + event, dom_str); + return ret; + } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing domain id\n"); + return -EINVAL; + } + + /* Verify if the dom_id is valid */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt); + if (ret) { + rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n", + event, dom_id, dom_str); + return ret; + } + goto next; + } + } + + rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id); + return -EINVAL; +} + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdtgroup *rdtgrp; + char *token, *event; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event mode is not enabled\n"); + rdtgroup_kn_unlock(of->kn); + return -EINVAL; + } + + while ((token = strsep(&buf, "\n")) != NULL) { + /* + * The write command follows the following format: + * "<event>:<domain_id>=<state>" + * Extract the event name first.
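Each written line follows that "<event>:<domain_id>=<state>" shape, where the domain id may be '*' to address every domain at once and the state is the same 'e'/'_' pair used on read. Illustrative writes (event names as used elsewhere in resctrl):

    mbm_total_bytes:0=e     assign a counter in domain 0
    mbm_local_bytes:*=_     release the group's counters in every domain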
+ */ + event = strsep(&token, ":"); + + ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token); + if (ret) + break; + } + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; } /** @@ -902,24 +1765,43 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + } + return 0; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d85502..0320360cd7a6eb 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } /* @@ -196,7 +190,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 
0; } @@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) + if (mevt->configurable && + !resctrl_arch_mbm_cntr_assign_enabled(r)) seq_printf(seq, "%s_config\n", mevt->name); } @@ -1735,9 +1732,9 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } @@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return ret ?: nbytes; } +/* + * resctrl_bmec_files_show() - Controls the visibility of BMEC-related resctrl + * files. When @show is true, the files are displayed; when false, the files + * are hidden. + * Don't treat kernfs_find_and_get failure as an error, since this function may + * be called regardless of whether BMEC is supported or the event is enabled. + */ +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show) +{ + struct kernfs_node *kn_config, *mon_kn = NULL; + char name[32]; + + if (!l3_mon_kn) { + sprintf(name, "%s_MON", r->name); + mon_kn = kernfs_find_and_get(kn_info, name); + if (!mon_kn) + return; + l3_mon_kn = mon_kn; + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + /* Release the reference only if it was acquired */ + if (mon_kn) + kernfs_put(mon_kn); +} + /* rdtgroup information files for one cache resource.
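resctrl_bmec_files_show() relies on the kernfs rule that every successful kernfs_find_and_get() is paired with a kernfs_put() once the node is no longer needed. A hypothetical helper condensing that lookup/show/release pattern (a sketch, not part of the patch):

    static void kn_set_visible(struct kernfs_node *parent, const char *name,
                               bool show)
    {
            struct kernfs_node *kn = kernfs_find_and_get(parent, name);

            if (!kn)
                    return; /* absence is deliberately not an error */
            kernfs_show(kn, show);
            kernfs_put(kn); /* drop the reference find_and_get took */
    }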
*/ static struct rftype res_common_files[] = { { @@ -1812,6 +1847,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_last_cmd_status_show, .fflags = RFTYPE_TOP_INFO, }, + { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, { .name = "num_closids", .mode = 0444, @@ -1826,6 +1868,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_mon_features_show, .fflags = RFTYPE_MON_INFO, }, + { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, { .name = "num_rmids", .mode = 0444, @@ -1840,6 +1888,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_default_ctrl_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, { .name = "min_cbm_bits", .mode = 0444, @@ -1915,6 +1969,28 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "event_filter", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + .write = event_filter_write, + }, + { + .name = "mbm_L3_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_L3_assignments_show, + .write = mbm_L3_assignments_write, + }, + { + .name = "mbm_assign_mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .write = resctrl_mbm_assign_mode_write, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, { .name = "cpus", .mode = 0644, @@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + /* + * Hide BMEC related files if mbm_event mode + * is enabled. 
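On hardware with assignable counters, the event_configs directory created by resctrl_mkdir_event_configs() above gains one subdirectory per MBM event, each populated with the RFTYPE_ASSIGN_CONFIG files such as event_filter. Assuming the conventional /sys/fs/resctrl mount, the resulting layout should be roughly:

    /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
    /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter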
+ */ + if (resctrl_arch_mbm_cntr_assign_enabled(r)) + resctrl_bmec_files_show(r, kn_subdir, false); + } + } + + kernfs_activate(kn_subdir); return ret; } @@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc) goto out_root; ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } + if (ret) + goto out_schemata_free; ret = closid_init(); if (ret) @@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2675,15 +2806,16 @@ static int rdt_get_tree(struct fs_context *fc) if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: closid_exit(); out_schemata_free: schemata_list_destroy(); -out_ctx: rdt_disable_ctx(); out_root: rdtgroup_destroy_root(); @@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); @@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? 
d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) @@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* @@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); @@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 2337cf795db3f8..35c4d27d2cc0ec 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -24,6 +24,7 @@ #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" +#include "../common/smbdirect/smbdirect_pdu.h" #endif #include "cifs_swn.h" #include "cached_dir.h" @@ -456,57 +457,55 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) sc = &server->smbd_conn->socket; sp = &sc->parameters; - seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " - "transport status: %x", - server->smbd_conn->protocol, - server->smbd_conn->socket.status); - seq_printf(m, "\nConn receive_credit_max: %x " - "send_credit_target: %x max_send_size: %x", + seq_printf(m, "\nSMBDirect protocol version: 0x%x " + "transport status: %s (%u)", + SMBDIRECT_V1, + smbdirect_socket_status_string(sc->status), + sc->status); + seq_printf(m, "\nConn receive_credit_max: %u " + "send_credit_target: %u max_send_size: %u", sp->recv_credit_max, sp->send_credit_target, sp->max_send_size); - seq_printf(m, "\nConn max_fragmented_recv_size: %x " - "max_fragmented_send_size: %x max_receive_size:%x", + 
seq_printf(m, "\nConn max_fragmented_recv_size: %u " + "max_fragmented_send_size: %u max_receive_size:%u", sp->max_fragmented_recv_size, sp->max_fragmented_send_size, sp->max_recv_size); - seq_printf(m, "\nConn keep_alive_interval: %x " - "max_readwrite_size: %x rdma_readwrite_threshold: %x", + seq_printf(m, "\nConn keep_alive_interval: %u " + "max_readwrite_size: %u rdma_readwrite_threshold: %u", sp->keepalive_interval_msec * 1000, sp->max_read_write_size, - server->smbd_conn->rdma_readwrite_threshold); - seq_printf(m, "\nDebug count_get_receive_buffer: %x " - "count_put_receive_buffer: %x count_send_empty: %x", - server->smbd_conn->count_get_receive_buffer, - server->smbd_conn->count_put_receive_buffer, - server->smbd_conn->count_send_empty); - seq_printf(m, "\nRead Queue count_reassembly_queue: %x " - "count_enqueue_reassembly_queue: %x " - "count_dequeue_reassembly_queue: %x " - "reassembly_data_length: %x " - "reassembly_queue_length: %x", - server->smbd_conn->count_reassembly_queue, - server->smbd_conn->count_enqueue_reassembly_queue, - server->smbd_conn->count_dequeue_reassembly_queue, + server->rdma_readwrite_threshold); + seq_printf(m, "\nDebug count_get_receive_buffer: %llu " + "count_put_receive_buffer: %llu count_send_empty: %llu", + sc->statistics.get_receive_buffer, + sc->statistics.put_receive_buffer, + sc->statistics.send_empty); + seq_printf(m, "\nRead Queue " + "count_enqueue_reassembly_queue: %llu " + "count_dequeue_reassembly_queue: %llu " + "reassembly_data_length: %u " + "reassembly_queue_length: %u", + sc->statistics.enqueue_reassembly_queue, + sc->statistics.dequeue_reassembly_queue, sc->recv_io.reassembly.data_length, sc->recv_io.reassembly.queue_length); - seq_printf(m, "\nCurrent Credits send_credits: %x " - "receive_credits: %x receive_credit_target: %x", - atomic_read(&server->smbd_conn->send_credits), - atomic_read(&server->smbd_conn->receive_credits), - server->smbd_conn->receive_credit_target); - seq_printf(m, "\nPending send_pending: %x ", - atomic_read(&server->smbd_conn->send_pending)); - seq_printf(m, "\nReceive buffers count_receive_queue: %x ", - server->smbd_conn->count_receive_queue); - seq_printf(m, "\nMR responder_resources: %x " - "max_frmr_depth: %x mr_type: %x", - server->smbd_conn->responder_resources, - server->smbd_conn->max_frmr_depth, - server->smbd_conn->mr_type); - seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x", - atomic_read(&server->smbd_conn->mr_ready_count), - atomic_read(&server->smbd_conn->mr_used_count)); + seq_printf(m, "\nCurrent Credits send_credits: %u " + "receive_credits: %u receive_credit_target: %u", + atomic_read(&sc->send_io.credits.count), + atomic_read(&sc->recv_io.credits.count), + sc->recv_io.credits.target); + seq_printf(m, "\nPending send_pending: %u ", + atomic_read(&sc->send_io.pending.count)); + seq_printf(m, "\nMR responder_resources: %u " + "max_frmr_depth: %u mr_type: 0x%x", + sp->responder_resources, + sp->max_frmr_depth, + sc->mr_io.type); + seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", + atomic_read(&sc->mr_io.ready.count), + atomic_read(&sc->mr_io.used.count)); skip_rdma: #endif seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e1848276bab413..dcb39d1b595812 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - 
generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { @@ -1895,7 +1895,9 @@ init_cifs(void) cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); } - cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + cifsiod_wq = alloc_workqueue("cifsiod", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsiod_wq) { rc = -ENOMEM; goto out_clean_proc; @@ -1923,28 +1925,32 @@ init_cifs(void) } cifsoplockd_wq = alloc_workqueue("cifsoplockd", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsoplockd_wq) { rc = -ENOMEM; goto out_destroy_fileinfo_put_wq; } deferredclose_wq = alloc_workqueue("deferredclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!deferredclose_wq) { rc = -ENOMEM; goto out_destroy_cifsoplockd_wq; } serverclose_wq = alloc_workqueue("serverclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!serverclose_wq) { rc = -ENOMEM; goto out_destroy_deferredclose_wq; } cfid_put_wq = alloc_workqueue("cfid_put_wq", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cfid_put_wq) { rc = -ENOMEM; goto out_destroy_serverclose_wq; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 1e64a4fb6af037..3ac254e123dcac 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 7 +#define MAX_COMPOUND 10 /* * Default number of credits to keep available for SMB3. @@ -814,6 +814,13 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + /* + * If payload is less than or equal to the threshold, + * use RDMA send/recv to send upper layer I/O. + * If payload is more than the threshold, + * use RDMA read/write through memory registration for I/O. 
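The rdma_readwrite_threshold field moving onto TCP_Server_Info carries the send-path policy the comment above describes: small payloads ride inline on RDMA send/recv, larger ones are offloaded to RDMA read/write through memory registration. A sketch of the resulting predicate, matching the strict less-than test smb3_use_rdma_offload() applies later in this patch:

    /* Sketch: true when the payload should use RDMA read/write offload. */
    static bool use_rdma_offload(unsigned int payload_len,
                                 const struct TCP_Server_Info *server)
    {
            /* small I/O stays on send/recv; offload has its own overhead */
            if (payload_len < server->rdma_readwrite_threshold)
                    return false;
            return true;
    }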
+ */ + unsigned int rdma_readwrite_threshold; unsigned int retrans; struct { bool requested; /* "compress" mount option set*/ @@ -1540,7 +1547,7 @@ struct cifs_io_subrequest { struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT - struct smbd_mr *mr; + struct smbdirect_mr_io *mr; #endif struct cifs_credits credits; }; @@ -1882,9 +1889,12 @@ static inline bool is_replayable_error(int error) /* cifs_get_writable_file() flags */ -#define FIND_WR_ANY 0 -#define FIND_WR_FSUID_ONLY 1 -#define FIND_WR_WITH_DELETE 2 +enum cifs_writable_file_flags { + FIND_WR_ANY = 0U, + FIND_WR_FSUID_ONLY = (1U << 0), + FIND_WR_WITH_DELETE = (1U << 1), + FIND_WR_NO_PENDING_DELETE = (1U << 2), +}; #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 @@ -2343,6 +2353,8 @@ struct smb2_compound_vars { struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; struct smb2_file_rename_info_hdr rename_info; struct smb2_file_link_info_hdr link_info; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c34c533b2efade..e8fba98690ce38 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); -extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, - const char *path); +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + struct dentry *dentry); extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode, const char *path); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 186e061068be3a..a5ed742afa007a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -97,8 +97,12 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) cifs_trace_rw_credits_write_prepare); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + stream->sreq_max_segs = sp->max_frmr_depth; + } #endif } @@ -187,8 +191,12 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) cifs_trace_rw_credits_read_submit); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->smbd_conn) - rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; + if (server->smbd_conn) { + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); + + rreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth; + } #endif return 0; } @@ -998,7 +1006,10 @@ int cifs_open(struct inode *inode, struct file *file) /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { - rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + rc = cifs_get_writable_path(tcon, full_path, + FIND_WR_FSUID_ONLY | + FIND_WR_NO_PENDING_DELETE, + &cfile); } else { rc = cifs_get_readable_path(tcon, full_path, &cfile); } @@ -2530,6 +2541,9 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, continue; if (with_delete && !(open_file->fid.access & DELETE)) continue; + if ((flags & FIND_WR_NO_PENDING_DELETE) && + open_file->status_file_deleted) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* 
found a good writable file */ @@ -2647,6 +2661,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, spin_unlock(&tcon->open_file_lock); free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); + if (*ret_file) { + spin_lock(&cinode->open_file_lock); + if ((*ret_file)->status_file_deleted) { + spin_unlock(&cinode->open_file_lock); + cifsFileInfo_put(*ret_file); + *ret_file = NULL; + } else { + spin_unlock(&cinode->open_file_lock); + } + } return *ret_file ? 0 : -ENOENT; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index fe453a4b3dc831..7e978408050130 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode) * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. */ -int cifs_unlink(struct inode *dir, struct dentry *dentry) +static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename) { int rc = 0; unsigned int xid; @@ -1984,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) } netfs_wait_for_outstanding_io(inode); - cifs_close_deferred_file_under_dentry(tcon, full_path); + cifs_close_deferred_file_under_dentry(tcon, dentry); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -2003,7 +2003,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto psx_del_no_retry; } - rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); + /* For SMB2+, if the file is open, we always perform a silly rename. + * + * We check for d_count() right after calling + * cifs_close_deferred_file_under_dentry() to make sure that the + * dentry's refcount gets dropped in case the file had any deferred + * close. 
+ */ + if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) { + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) + sillyrename = true; + spin_unlock(&dentry->d_lock); + } + + if (sillyrename) + rc = -EBUSY; + else + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); psx_del_no_retry: if (!rc) { @@ -2071,6 +2088,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) return rc; } +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + return __cifs_unlink(dir, dentry, false); +} + static int cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -2358,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); + cifsInode = CIFS_I(d_inode(direntry)); + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); spin_lock(&d_inode(direntry)->i_lock); i_size_write(d_inode(direntry), 0); clear_nlink(d_inode(direntry)); spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -2458,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) + if (rc == 0) { d_move(from_dentry, to_dentry); + /* Force a new lookup */ + d_drop(from_dentry); + } cifs_put_tlink(tlink); return rc; } @@ -2470,6 +2497,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *target_dentry, unsigned int flags) { const char *from_name, *to_name; + struct TCP_Server_Info *server; void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; @@ -2505,6 +2533,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; page1 = alloc_dentry_path(); page2 = alloc_dentry_path(); @@ -2522,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file_under_dentry(tcon, from_name); + cifs_close_deferred_file_under_dentry(tcon, source_dentry); if (d_inode(target_dentry) != NULL) { netfs_wait_for_outstanding_io(d_inode(target_dentry)); - cifs_close_deferred_file_under_dentry(tcon, to_name); + cifs_close_deferred_file_under_dentry(tcon, target_dentry); } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, @@ -2591,19 +2620,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, unlink_target: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - - /* Try unlinking the target dentry if it's not negative */ - if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { - if (d_is_dir(target_dentry)) - tmprc = cifs_rmdir(target_dir, target_dentry); - else - tmprc = cifs_unlink(target_dir, target_dentry); - if (tmprc) - goto cifs_rename_exit; - rc = cifs_do_rename(xid, source_dentry, from_name, - target_dentry, to_name); - if (!rc) - rehash = false; + if (d_really_is_positive(target_dentry)) { + if (!rc) { + struct inode *inode = d_inode(target_dentry); + /* + * Samba and ksmbd servers allow renaming a target + * directory that is open, so make sure to update + * ->i_nlink and then mark it as delete pending. 
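When the unlink is deferred this way, smb2_rename_pending_delete() (added later in this patch) renames the file to a hidden sillyrename target before marking it delete-pending. A condensed sketch of the target-name construction it uses, where an atomic counter keeps concurrent unlinks from colliding:

    /* Sketch of the ".__smbXXXX" name built by smb2_rename_pending_delete(). */
    static atomic_t sillycounter;

    static char *silly_name(const char *parent_path, char dirsep)
    {
            size_t len = strlen(parent_path) + strlen("/.__smb1234") + 1;
            char *name = kmalloc(len, GFP_KERNEL);

            if (!name)
                    return NULL;
            scnprintf(name, len, "%s%c.__smb%04X", parent_path, dirsep,
                      atomic_inc_return(&sillycounter) & 0xffff);
            return name;
    }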
+ */ + if (S_ISDIR(inode->i_mode)) { + drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb); + spin_lock(&inode->i_lock); + i_size_write(inode, 0); + clear_nlink(inode); + spin_unlock(&inode->i_lock); + set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags); + CIFS_I(inode)->time = 0; /* force reval */ + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + } + } else if (rc == -EACCES || rc == -EEXIST) { + /* + * Rename failed, possibly due to a busy target. + * Retry it by unlinking the target first. + */ + if (d_is_dir(target_dentry)) { + tmprc = cifs_rmdir(target_dir, target_dentry); + } else { + tmprc = __cifs_unlink(target_dir, target_dentry, + server->vals->protocol_id > SMB10_PROT_ID); + } + if (tmprc) { + /* + * Some servers will return STATUS_ACCESS_DENIED + * or STATUS_DIRECTORY_NOT_EMPTY when failing to + * rename a non-empty directory. Make sure to + * propagate the appropriate error back to + * userspace. + */ + if (tmprc == -EEXIST || tmprc == -ENOTEMPTY) + rc = tmprc; + goto cifs_rename_exit; + } + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); + if (!rc) + rehash = false; + } } /* force revalidate to go get info when needed */ @@ -2629,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct cached_fid *cfid = NULL; + if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags)) + return false; if (cifs_i->time == 0) return true; @@ -2779,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index da23cc12a52caa..dda6dece802ad2 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) kfree(tmp_list); } } -void -cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) + +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, + struct dentry *dentry) { - struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - void *page; - const char *full_path; + struct cifsFileInfo *cfile; LIST_HEAD(file_head); - page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { - full_path = build_path_from_dentry(cfile->dentry, page); - if (strstr(full_path, path)) { - if (delayed_work_pending(&cfile->deferred)) { - if (cancel_delayed_work(&cfile->deferred)) { - spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - cifs_del_deferred_close(cfile); - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - - tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; - tmp_list->cfile = cfile; - list_add_tail(&tmp_list->list, &file_head); - } - } + if ((cfile->dentry == dentry) && + delayed_work_pending(&cfile->deferred) && + cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); } } spin_unlock(&tcon->open_file_lock); @@ -868,7
+863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) list_del(&tmp_list->list); kfree(tmp_list); } - free_dentry_path(page); } /* diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 224495322a05da..e56e4d402f1382 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -30,10 +30,9 @@ enum smb2_compound_ops { SMB2_OP_QUERY_DIR, SMB2_OP_MKDIR, SMB2_OP_RENAME, - SMB2_OP_DELETE, SMB2_OP_HARDLINK, SMB2_OP_SET_EOF, - SMB2_OP_RMDIR, + SMB2_OP_UNLINK, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 31c13fb5b85b62..0985db9f86e510 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -346,9 +346,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); - break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the @@ -356,23 +353,40 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, */ trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + case SMB2_OP_UNLINK: + rqst[num_rqst].rq_iov = vars->unlink_iov; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); + } + num_rqst++; + trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -442,7 +456,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, full_path); break; case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_iov = vars->rename_iov; rqst[num_rqst].rq_nvec = 2; len = in_iov[i].iov_len; @@ -673,7 +687,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, } for (i = 0; i < num_cmds; i++) { - char *buf = rsp_iov[i + i].iov_base; + char *buf = rsp_iov[i + 1].iov_base; if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) rc = server->ops->map_error(buf, false); @@ -732,19 +746,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_done(xid, tcon->tid, ses->Suid); break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); - else { - /* - * If dentry (hence, inode) is NULL, lease break is going to - * take care of degrading 
leases on handles for deleted files. - */ - if (inode) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, tcon->tid, ses->Suid); - } - break; case SMB2_OP_MKDIR: if (rc) trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); @@ -765,11 +766,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); + case SMB2_OP_UNLINK: + if (!rc) + trace_smb3_unlink_done(xid, tcon->tid, ses->Suid); else - trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); + trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, NULL, - &(int){SMB2_OP_RMDIR}, 1, + &(int){SMB2_OP_UNLINK}, 1, NULL, NULL, NULL, NULL); } @@ -1174,21 +1175,107 @@ int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + __le16 *utf16_path __free(kfree) = NULL; + int retries = 0, cur_sleep = 1; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; + struct smb2_create_req *creq; + struct inode *inode = NULL; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec close_iov; + int resp_buftype[2]; + struct cifs_fid fid; + int flags = 0; + __u8 oplock; + int rc; - oparms = CIFS_OPARMS(cifs_sb, tcon, name, - DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE); - int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, dentry); - if (rc == -EINVAL) { - cifs_dbg(FYI, "invalid lease key, resending request without lease"); - rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL); + utf16_path = cifs_convert_path_to_utf16(name, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; +again: + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); + + memset(rqst, 0, sizeof(rqst)); + memset(resp_buftype, 0, sizeof(resp_buftype)); + memset(rsp_iov, 0, sizeof(rsp_iov)); + + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = ARRAY_SIZE(open_iov); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES, + FILE_OPEN, CREATE_DELETE_ON_CLOSE | + OPEN_REPARSE_POINT, ACL_NO_MODE); + oparms.fid = &fid; + + if (dentry) { + inode = d_inode(dentry); + if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) { + oplock = SMB2_OPLOCK_LEVEL_LEASE; + server->ops->get_lease_key(inode, &fid); + } } + + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto err_free; + smb2_set_next_command(tcon, &rqst[0]); + creq = rqst[0].rq_iov[0].iov_base; + creq->ShareAccess = FILE_SHARE_DELETE_LE; + + rqst[1].rq_iov = &close_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, &rqst[1], + COMPOUND_FID, COMPOUND_FID, false); + smb2_set_related(&rqst[1]); + if (rc) + goto err_free; + + if (retries) { + for (int i = 0; i < ARRAY_SIZE(rqst); i++) + smb2_set_replay(server, &rqst[i]); + } + + rc = compound_send_recv(xid, tcon->ses, server, 
flags, + ARRAY_SIZE(rqst), rqst, + resp_buftype, rsp_iov); + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto again; + + /* Retry compound request without lease */ + if (rc == -EINVAL && dentry) { + dentry = NULL; + retries = 0; + cur_sleep = 1; + goto again; + } + /* + * If dentry (hence, inode) is NULL, lease break is going to + * take care of degrading leases on handles for deleted files. + */ + if (!rc && inode) + cifs_mark_open_handles_for_deleted_file(inode, name); + + return rc; + +err_free: + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -1441,3 +1528,113 @@ int smb2_query_reparse_point(const unsigned int xid, cifs_free_open_info(&data); return rc; } + +static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb, + const char *name, size_t namelen) +{ + int len; + + if (*name == '\\' || + (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/')) + name++; + return cifs_strndup_to_utf16(name, namelen, &len, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + __le16 *utf16_path __free(kfree) = NULL; + __u32 co = file_create_options(dentry); + int cmds[] = { + SMB2_OP_SET_INFO, + SMB2_OP_RENAME, + SMB2_OP_UNLINK, + }; + const int num_cmds = ARRAY_SIZE(cmds); + char *to_name __free(kfree) = NULL; + __u32 attrs = cinode->cifsAttrs; + struct cifs_open_parms oparms; + static atomic_t sillycounter; + struct cifsFileInfo *cfile; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct kvec iov[2]; + const char *ppath; + void *page; + size_t len; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + page = alloc_dentry_path(); + + ppath = build_path_from_dentry(dentry->d_parent, page); + if (IS_ERR(ppath)) { + rc = PTR_ERR(ppath); + goto out; + } + + len = strlen(ppath) + strlen("/.__smb1234") + 1; + to_name = kmalloc(len, GFP_KERNEL); + if (!to_name) { + rc = -ENOMEM; + goto out; + } + + scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb), + atomic_inc_return(&sillycounter) & 0xffff); + + utf16_path = utf16_smb2_path(cifs_sb, to_name, len); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + DELETE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, co, ACL_NO_MODE); + + attrs &= ~ATTR_READONLY; + if (!attrs) + attrs = ATTR_NORMAL; + if (d_inode(dentry)->i_nlink <= 1) + attrs |= ATTR_HIDDEN; + iov[0].iov_base = &(FILE_BASIC_INFO) { + .Attributes = cpu_to_le32(attrs), + }; + iov[0].iov_len = sizeof(FILE_BASIC_INFO); + iov[1].iov_base = utf16_path; + iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path); + + cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, dentry); + if (rc == -EINVAL) { + cifs_dbg(FYI, "invalid lease key, resending request without 
lease\n"); + cifs_get_writable_path(tcon, full_path, + FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, NULL); + } + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags); + } else { + cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n", + __func__, full_path, to_name, rc); + rc = -EIO; + } +out: + cifs_put_tlink(tlink); + free_dentry_path(page); + return rc; +} diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 94b1d7a395d50a..4711a23c5b380d 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -504,8 +504,8 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* @@ -555,8 +555,8 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { - struct smbdirect_socket_parameters *sp = - &server->smbd_conn->socket.parameters; + const struct smbdirect_socket_parameters *sp = + smbd_get_parameters(server->smbd_conn); if (server->sign) /* @@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - if (!IS_ALIGNED(len, 8)) { - num_padding = 8 - (len & 7); + if (IS_ALIGNED(len, 8)) + goto out; + + num_padding = 8 - (len & 7); + if (smb3_encryption_required(tcon)) { + int i; + + /* + * Flatten request into a single buffer with required padding as + * the encryption layer can't handle the padding iovs. 
+ */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + rqst->rq_nvec = 1; + } else { rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len += num_padding; } + len += num_padding; +out: shdr->NextCommand = cpu_to_le32(len); } @@ -5376,6 +5398,7 @@ struct smb_version_operations smb20_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -5481,6 +5504,7 @@ struct smb_version_operations smb21_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb30_operations = { @@ -5597,6 +5621,7 @@ struct smb_version_operations smb30_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb311_operations = { @@ -5713,6 +5738,7 @@ struct smb_version_operations smb311_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c3b9d3f6210ff9..1c63d2c9cc9c82 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4411,7 +4411,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) return false; /* offload also has its overhead, so only do it if desired */ - if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold) + if (io_parms->length < server->rdma_readwrite_threshold) return false; return true; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 6e805ece6a7b19..b3f1398c9f7906 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 02d6db431fd4ec..316f398c70f4b5 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -13,28 +13,35 @@ #include "cifsproto.h" #include "smb2proto.h" +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) +{ + struct smbdirect_socket *sc = &conn->socket; + + return &sc->parameters; +} + static struct smbdirect_recv_io *get_receive_buffer( - struct smbd_connection *info); + struct smbdirect_socket *sc); static void put_receive_buffer( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int 
allocate_receive_buffers(struct smbd_connection *info, int num_buf); -static void destroy_receive_buffers(struct smbd_connection *info); +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf); +static void destroy_receive_buffers(struct smbdirect_socket *sc); static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length); static struct smbdirect_recv_io *_get_first_reassembly( - struct smbd_connection *info); + struct smbdirect_socket *sc); static int smbd_post_recv( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response); -static int smbd_post_send_empty(struct smbd_connection *info); +static int smbd_post_send_empty(struct smbdirect_socket *sc); -static void destroy_mr_list(struct smbd_connection *info); -static int allocate_mr_list(struct smbd_connection *info); +static void destroy_mr_list(struct smbdirect_socket *sc); +static int allocate_mr_list(struct smbdirect_socket *sc); struct smb_extract_to_rdma { struct ib_sge *sge; @@ -57,6 +64,9 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, /* SMBD negotiation timeout in seconds */ #define SMBD_NEGOTIATE_TIMEOUT 120 +/* The timeout to wait for a keepalive message from peer in seconds */ +#define KEEPALIVE_RECV_TIMEOUT 5 + /* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ #define SMBD_MIN_RECEIVE_SIZE 128 #define SMBD_MIN_FRAGMENTED_SIZE 131072 @@ -155,65 +165,277 @@ do { \ #define log_rdma_mr(level, fmt, args...) \ log_rdma(level, LOG_RDMA_MR, fmt, ##args) +static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
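The disconnect paths that follow use the non-blocking disable_work()/disable_delayed_work() primitives deliberately: they prevent a work item from being queued again and cancel a pending one, but do not wait for a running instance, so they are usable where sleeping is not allowed. The _sync() variants would wait, which is why the comments below call out avoiding them. A sketch of the distinction, using this patch's work items:

    /* Non-blocking: no new queueing, cancel if pending, do not wait. */
    disable_work(&sc->idle.immediate_work);
    disable_delayed_work(&sc->idle.timer_work);

    /* The blocking variant also waits for a running instance to finish
     * and may sleep, hence it is avoided in these disconnect paths:
     * disable_work_sync(&sc->idle.immediate_work);
     */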
+ */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.dec_wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->mr_io.ready.wait_queue); + wake_up_all(&sc->mr_io.cleanup.wait_queue); +} + static void smbd_disconnect_rdma_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, disconnect_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + /* + * make sure this and other work is not queued again + * but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->disconnect_work); + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->mr_io.recovery_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: + case SMBDIRECT_SOCKET_ERROR: sc->status = SMBDIRECT_SOCKET_DISCONNECTING; rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_connect() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
+ */ + smbd_disconnect_wake_up_all(sc); } -static void smbd_disconnect_rdma_connection(struct smbd_connection *info) +static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) { - queue_work(info->workqueue, &info->disconnect_work); + /* + * make sure other work (than disconnect_work) is + * not queued again but here we don't block and avoid + * disable[_delayed]_work_sync() + */ + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->mr_io.recovery_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_ERROR: + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + /* + * Keep the current error status + */ + break; + + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + break; + + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + break; + + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + break; + + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_CONNECTED: + sc->status = SMBDIRECT_SOCKET_ERROR; + break; + } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
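+	 *
+	 * The switch above maps each in-flight state to its matching
+	 * failure state, roughly:
+	 *
+	 *	RESOLVE_ADDR_{NEEDED,RUNNING}  -> RESOLVE_ADDR_FAILED
+	 *	RESOLVE_ROUTE_{NEEDED,RUNNING} -> RESOLVE_ROUTE_FAILED
+	 *	RDMA_CONNECT_{NEEDED,RUNNING}  -> RDMA_CONNECT_FAILED
+	 *	NEGOTIATE_{NEEDED,RUNNING}     -> NEGOTIATE_FAILED
+	 *	CREATED, CONNECTED             -> ERROR
+	 *
+	 * sc->status is updated before the wake-ups below, so woken
+	 * waiters always observe the failed state; disconnect_work
+	 * then performs the actual rdma_disconnect() from workqueue
+	 * context.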
+ */ + smbd_disconnect_wake_up_all(sc); + + queue_work(sc->workqueue, &sc->disconnect_work); } /* Upcall from RDMA CM */ static int smbd_conn_upcall( struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct smbd_connection *info = id->context; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = id->context; + struct smbdirect_socket_parameters *sp = &sc->parameters; const char *event_name = rdma_event_msg(event->event); + u8 peer_initiator_depth; + u8 peer_responder_resources; log_rdma_event(INFO, "event=%s status=%d\n", event_name, event->status); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; + wake_up(&sc->status_wait); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: - info->ri_rc = 0; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_ADDR_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - info->ri_rc = -EHOSTUNREACH; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ROUTE_ERROR: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - info->ri_rc = -ENETUNREACH; - complete(&info->ri_done); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_ESTABLISHED: log_rdma_event(INFO, "connected event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up_interruptible(&info->status_wait); + + /* + * Here we work around an inconsistency between + * iWarp and other devices (at least rxe and irdma using RoCEv2) + */ + if (rdma_protocol_iwarp(id->device, id->port_num)) { + /* + * iWarp devices report the peer's values + * with the perspective of the peer here. + * Tested with siw and irdma (in iwarp mode) + * We need to change to our perspective here, + * so we need to switch the values. + */ + peer_initiator_depth = event->param.conn.responder_resources; + peer_responder_resources = event->param.conn.initiator_depth; + } else { + /* + * Non iWarp devices report the peer's values + * already changed to our perspective here. + * Tested with rxe and irdma (in roce mode). + */ + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + } + if (rdma_protocol_iwarp(id->device, id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. + */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * event if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. 
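+	 *
+	 * The 8-byte blob is just two big-endian 32-bit values; the
+	 * connect path in _smbd_get_connection() below builds the
+	 * same blob when talking to an iWarp peer:
+	 *
+	 *	ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+	 *	ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);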
+ */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + sc->rdma.legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * negotiate the value by using the minimum + * between client and server if the client provided + * non 0 values. + */ + if (peer_initiator_depth != 0) + sp->initiator_depth = + min_t(u8, sp->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + sp->responder_resources = + min_t(u8, sp->responder_resources, + peer_responder_resources); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + wake_up(&sc->status_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&info->status_wait); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + smbd_disconnect_rdma_work(&sc->disconnect_work); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -221,15 +443,10 @@ static int smbd_conn_upcall( /* This happens when we fail the negotiation */ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { log_rdma_event(ERR, "event=%s during negotiation\n", event_name); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up(&info->status_wait); - break; } sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - wake_up_interruptible(&info->status_wait); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); - wake_up_interruptible_all(&info->wait_send_queue); + smbd_disconnect_rdma_work(&sc->disconnect_work); break; default: @@ -245,15 +462,15 @@ static int smbd_conn_upcall( static void smbd_qp_async_error_upcall(struct ib_event *event, void *context) { - struct smbd_connection *info = context; + struct smbdirect_socket *sc = context; - log_rdma_event(ERR, "%s on device %s info %p\n", - ib_event_msg(event->event), event->device->name, info); + log_rdma_event(ERR, "%s on device %s socket %p\n", + ib_event_msg(event->event), event->device->name, sc); switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); break; default: @@ -278,11 +495,9 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_send_io *request = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); struct smbdirect_socket *sc = request->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); - log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n", - request, wc->status); + log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", + request, ib_wc_status_msg(wc->status)); for (i = 0; i < request->num_sge; i++) ib_dma_unmap_single(sc->ib.dev, @@ -291,17 +506,18 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) DMA_TO_DEVICE); if (wc->status != IB_WC_SUCCESS 
|| wc->opcode != IB_WC_SEND) { - log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); mempool_free(request, sc->send_io.mem.pool); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return; } - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); - wake_up(&info->wait_post_send); + wake_up(&sc->send_io.pending.dec_wait_queue); mempool_free(request, sc->send_io.mem.pool); } @@ -325,8 +541,6 @@ static bool process_negotiation_response( struct smbdirect_recv_io *response, int packet_length) { struct smbdirect_socket *sc = response->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response); @@ -341,21 +555,19 @@ static bool process_negotiation_response( le16_to_cpu(packet->negotiated_version)); return false; } - info->protocol = le16_to_cpu(packet->negotiated_version); if (packet->credits_requested == 0) { log_rdma_event(ERR, "error: credits_requested==0\n"); return false; } - info->receive_credit_target = le16_to_cpu(packet->credits_requested); + sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); if (packet->credits_granted == 0) { log_rdma_event(ERR, "error: credits_granted==0\n"); return false; } - atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); - - atomic_set(&info->receive_credits, 0); + atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { log_rdma_event(ERR, "error: preferred_send_size=%d\n", @@ -380,16 +592,12 @@ static bool process_negotiation_response( } sp->max_fragmented_send_size = le32_to_cpu(packet->max_fragmented_size); - info->rdma_readwrite_threshold = - rdma_readwrite_threshold > sp->max_fragmented_send_size ? 
- sp->max_fragmented_send_size : - rdma_readwrite_threshold; sp->max_read_write_size = min_t(u32, le32_to_cpu(packet->max_readwrite_size), - info->max_frmr_depth * PAGE_SIZE); - info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; + sp->max_frmr_depth * PAGE_SIZE); + sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; return true; @@ -397,52 +605,40 @@ static bool process_negotiation_response( static void smbd_post_send_credits(struct work_struct *work) { - int ret = 0; int rc; struct smbdirect_recv_io *response; - struct smbd_connection *info = - container_of(work, struct smbd_connection, - post_send_credits_work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - wake_up(&info->wait_receive_queues); return; } - if (info->receive_credit_target > - atomic_read(&info->receive_credits)) { + if (sc->recv_io.credits.target > + atomic_read(&sc->recv_io.credits.count)) { while (true) { - response = get_receive_buffer(info); + response = get_receive_buffer(sc); if (!response) break; response->first_segment = false; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); if (rc) { log_rdma_recv(ERR, "post_recv failed rc=%d\n", rc); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); break; } - ret++; + atomic_inc(&sc->recv_io.posted.count); } } - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += ret; - spin_unlock(&info->lock_new_credits_offered); - /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ - info->send_immediate = true; - if (atomic_read(&info->receive_credits) < - info->receive_credit_target - 1) { - if (info->keep_alive_requested == KEEP_ALIVE_PENDING || - info->send_immediate) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); - } + if (atomic_read(&sc->recv_io.credits.count) < + sc->recv_io.credits.target - 1) { + log_keep_alive(INFO, "schedule send of an empty message\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } } @@ -453,17 +649,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_recv_io *response = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; - struct smbd_connection *info = - container_of(sc, struct smbd_connection, socket); - int data_length = 0; - - log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", - response, sc->recv_io.expected, wc->status, wc->opcode, + struct smbdirect_socket_parameters *sp = &sc->parameters; + u16 old_recv_credit_target; + u32 data_offset = 0; + u32 data_length = 0; + u32 remaining_data_length = 0; + bool negotiate_done = false; + + log_rdma_recv(INFO, + "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n", + response, sc->recv_io.expected, + ib_wc_status_msg(wc->status), wc->opcode, wc->byte_len, wc->pkey_index); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", - wc->status, wc->opcode); + if (wc->status != IB_WC_WR_FLUSH_ERR) + log_rdma_recv(ERR, "wc->status=%s opcode=%d\n", + ib_wc_status_msg(wc->status), wc->opcode); goto error; } @@ -473,21 +675,52 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) response->sge.length, DMA_FROM_DEVICE); + /* + * Reset timer to the 
keepalive interval in + * order to trigger our next keepalive message. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + switch (sc->recv_io.expected) { /* SMBD negotiation response */ case SMBDIRECT_EXPECT_NEGOTIATE_REP: dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response)); sc->recv_io.reassembly.full_packet_received = true; - info->negotiate_done = + negotiate_done = process_negotiation_response(response, wc->byte_len); - put_receive_buffer(info, response); - complete(&info->negotiate_completion); + put_receive_buffer(sc, response); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING); + if (!negotiate_done) { + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + smbd_disconnect_rdma_connection(sc); + } else { + sc->status = SMBDIRECT_SOCKET_CONNECTED; + wake_up(&sc->status_wait); + } + return; /* SMBD data transfer packet */ case SMBDIRECT_EXPECT_DATA_TRANSFER: data_transfer = smbdirect_recv_io_payload(response); + + if (wc->byte_len < + offsetof(struct smbdirect_data_transfer, padding)) + goto error; + + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); data_length = le32_to_cpu(data_transfer->data_length); + if (wc->byte_len < data_offset || + (u64)wc->byte_len < (u64)data_offset + data_length) + goto error; + + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) + goto error; if (data_length) { if (sc->recv_io.reassembly.full_packet_received) @@ -499,17 +732,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) sc->recv_io.reassembly.full_packet_received = true; } - atomic_dec(&info->receive_credits); - info->receive_credit_target = + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); if (le16_to_cpu(data_transfer->credits_granted)) { atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); + &sc->send_io.credits.count); /* * We have new send credits granted from remote peer * If any sender is waiting for credits, unblock it */ - wake_up_interruptible(&info->wait_send_queue); + wake_up(&sc->send_io.credits.wait_queue); } log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", @@ -518,11 +757,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) le32_to_cpu(data_transfer->data_length), le32_to_cpu(data_transfer->remaining_data_length)); - /* Send a KEEP_ALIVE response right away if requested */ - info->keep_alive_requested = KEEP_ALIVE_NONE; + /* Send an immediate response right away if requested */ if (le16_to_cpu(data_transfer->flags) & SMBDIRECT_FLAG_RESPONSE_REQUESTED) { - info->keep_alive_requested = KEEP_ALIVE_PENDING; + log_keep_alive(INFO, "schedule send of immediate response\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -530,10 +769,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) * reassembly queue and wake up the reading thread */ if (data_length) { - enqueue_reassembly(info, 
response, data_length); - wake_up_interruptible(&sc->recv_io.reassembly.wait_queue); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); + + enqueue_reassembly(sc, response, data_length); + wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_receive_buffer(info, response); + put_receive_buffer(sc, response); return; @@ -548,19 +790,20 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected); WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); error: - put_receive_buffer(info, response); - smbd_disconnect_rdma_connection(info); + put_receive_buffer(sc, response); + smbd_disconnect_rdma_connection(sc); } static struct rdma_cm_id *smbd_create_id( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_cm_id *id; int rc; __be16 *sport; - id = rdma_create_id(&init_net, smbd_conn_upcall, info, + id = rdma_create_id(&init_net, smbd_conn_upcall, sc, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -575,43 +818,57 @@ static struct rdma_cm_id *smbd_create_id( *sport = htons(port); - init_completion(&info->ri_done); - info->ri_rc = -ETIMEDOUT; - + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, - RDMA_RESOLVE_TIMEOUT); + sp->resolve_addr_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + msecs_to_jiffies(sp->resolve_addr_timeout_msec)); /* e.g. if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) { + rc = -EHOSTUNREACH; log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); goto out; } - info->ri_rc = -ETIMEDOUT; - rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; + rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec); if (rc) { log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; } - rc = wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + msecs_to_jiffies(sp->resolve_route_timeout_msec)); /* e.g. 
if interrupted returns -ERESTARTSYS */ if (rc < 0) { log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); goto out; } - rc = info->ri_rc; - if (rc) { + if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) { + rc = -ETIMEDOUT; + log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); + goto out; + } + if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) { + rc = -ENETUNREACH; log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); goto out; } @@ -638,13 +895,16 @@ static bool frwr_is_supported(struct ib_device_attr *attrs) } static int smbd_ia_open( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct sockaddr *dstaddr, int port) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; - sc->rdma.cm_id = smbd_create_id(info, dstaddr, port); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; + + sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port); if (IS_ERR(sc->rdma.cm_id)) { rc = PTR_ERR(sc->rdma.cm_id); goto out1; @@ -659,19 +919,12 @@ static int smbd_ia_open( rc = -EPROTONOSUPPORT; goto out2; } - info->max_frmr_depth = min_t(int, - smbd_max_frmr_depth, + sp->max_frmr_depth = min_t(u32, + sp->max_frmr_depth, sc->ib.dev->attrs.max_fast_reg_page_list_len); - info->mr_type = IB_MR_TYPE_MEM_REG; + sc->mr_io.type = IB_MR_TYPE_MEM_REG; if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) - info->mr_type = IB_MR_TYPE_SG_GAPS; - - sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); - if (IS_ERR(sc->ib.pd)) { - rc = PTR_ERR(sc->ib.pd); - log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); - goto out2; - } + sc->mr_io.type = IB_MR_TYPE_SG_GAPS; return 0; @@ -689,9 +942,8 @@ static int smbd_ia_open( * After negotiation, the transport is connected and ready for * carrying upper layer SMB payload */ -static int smbd_post_send_negotiate_req(struct smbd_connection *info) +static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc = -ENOMEM; @@ -743,18 +995,18 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); - atomic_inc(&info->send_pending); + atomic_inc(&sc->send_io.pending.count); rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (!rc) return 0; /* if we reach here, post send failed */ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); dma_mapping_failed: mempool_free(request, sc->send_io.mem.pool); @@ -769,14 +1021,20 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) * buffer as possible, and extend the receive credits to remote peer * return value: the new credtis being granted. 
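 *
 * A sketch of the logic below: nothing is granted while we already
 * sit at the credit target; otherwise (with illustrative names)
 *
 *	new_credits = posted_receives - current_credits;
 *
 * clamped to zero, so credits are only offered for receive buffers
 * that are actually posted but not yet announced to the peer.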
*/ -static int manage_credits_prior_sending(struct smbd_connection *info) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { int new_credits; - spin_lock(&info->lock_new_credits_offered); - new_credits = info->new_credits_offered; - info->new_credits_offered = 0; - spin_unlock(&info->lock_new_credits_offered); + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) + return 0; + + new_credits = atomic_read(&sc->recv_io.posted.count); + if (new_credits == 0) + return 0; + + new_credits -= atomic_read(&sc->recv_io.credits.count); + if (new_credits <= 0) + return 0; return new_credits; } @@ -790,21 +1048,27 @@ static int manage_credits_prior_sending(struct smbd_connection *info) * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set * 0: otherwise */ -static int manage_keep_alive_before_sending(struct smbd_connection *info) +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) { - if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { - info->keep_alive_requested = KEEP_ALIVE_SENT; + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); return 1; } return 0; } /* Post the send request */ -static int smbd_post_send(struct smbd_connection *info, +static int smbd_post_send(struct smbdirect_socket *sc, struct smbdirect_send_io *request) { - struct smbdirect_socket *sc = &info->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc, i; @@ -831,21 +1095,17 @@ static int smbd_post_send(struct smbd_connection *info, rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (rc) { log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); rc = -EAGAIN; - } else - /* Reset timer for idle connection after packet is sent */ - mod_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + } return rc; } -static int smbd_post_send_iter(struct smbd_connection *info, +static int smbd_post_send_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; int i, rc; int header_length; @@ -856,8 +1116,8 @@ static int smbd_post_send_iter(struct smbd_connection *info, wait_credit: /* Wait for send credits. 
A SMBD packet needs one credit */ - rc = wait_event_interruptible(info->wait_send_queue, - atomic_read(&info->send_credits) > 0 || + rc = wait_event_interruptible(sc->send_io.credits.wait_queue, + atomic_read(&sc->send_io.credits.count) > 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) goto err_wait_credit; @@ -867,14 +1127,14 @@ static int smbd_post_send_iter(struct smbd_connection *info, rc = -EAGAIN; goto err_wait_credit; } - if (unlikely(atomic_dec_return(&info->send_credits) < 0)) { - atomic_inc(&info->send_credits); + if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) { + atomic_inc(&sc->send_io.credits.count); goto wait_credit; } wait_send_queue: - wait_event(info->wait_post_send, - atomic_read(&info->send_pending) < sp->send_credit_target || + wait_event(sc->send_io.pending.dec_wait_queue, + atomic_read(&sc->send_io.pending.count) < sp->send_credit_target || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { @@ -883,9 +1143,9 @@ static int smbd_post_send_iter(struct smbd_connection *info, goto err_wait_send_queue; } - if (unlikely(atomic_inc_return(&info->send_pending) > + if (unlikely(atomic_inc_return(&sc->send_io.pending.count) > sp->send_credit_target)) { - atomic_dec(&info->send_pending); + atomic_dec(&sc->send_io.pending.count); goto wait_send_queue; } @@ -898,10 +1158,30 @@ static int smbd_post_send_iter(struct smbd_connection *info, request->socket = sc; memset(request->sge, 0, sizeof(request->sge)); + /* Map the packet to DMA */ + header_length = sizeof(struct smbdirect_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!iter) + header_length = offsetof(struct smbdirect_data_transfer, padding); + + packet = smbdirect_send_io_payload(request); + request->sge[0].addr = ib_dma_map_single(sc->ib.dev, + (void *)packet, + header_length, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { + rc = -EIO; + goto err_dma; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = sc->ib.pd->local_dma_lkey; + request->num_sge = 1; + /* Fill in the data payload to find out how much data we can add */ if (iter) { struct smb_extract_to_rdma extract = { - .nr_sge = 1, + .nr_sge = request->num_sge, .max_sge = SMBDIRECT_SEND_IO_MAX_SGE, .sge = request->sge, .device = sc->ib.dev, @@ -920,21 +1200,17 @@ static int smbd_post_send_iter(struct smbd_connection *info, *_remaining_data_length -= data_length; } else { data_length = 0; - request->num_sge = 1; } /* Fill in the packet header */ - packet = smbdirect_send_io_payload(request); packet->credits_requested = cpu_to_le16(sp->send_credit_target); - new_credits = manage_credits_prior_sending(info); - atomic_add(new_credits, &info->receive_credits); + new_credits = manage_credits_prior_sending(sc); + atomic_add(new_credits, &sc->recv_io.credits.count); packet->credits_granted = cpu_to_le16(new_credits); - info->send_immediate = false; - packet->flags = 0; - if (manage_keep_alive_before_sending(info)) + if (manage_keep_alive_before_sending(sc)) packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); packet->reserved = 0; @@ -953,26 +1229,7 @@ static int smbd_post_send_iter(struct smbd_connection *info, le32_to_cpu(packet->data_length), le32_to_cpu(packet->remaining_data_length)); - /* Map the packet to DMA */ - header_length = sizeof(struct smbdirect_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!data_length) - header_length = offsetof(struct 
smbdirect_data_transfer, padding); - - request->sge[0].addr = ib_dma_map_single(sc->ib.dev, - (void *)packet, - header_length, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { - rc = -EIO; - request->sge[0].addr = 0; - goto err_dma; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - - rc = smbd_post_send(info, request); + rc = smbd_post_send(sc, request); if (!rc) return 0; @@ -985,19 +1242,16 @@ static int smbd_post_send_iter(struct smbd_connection *info, DMA_TO_DEVICE); mempool_free(request, sc->send_io.mem.pool); - /* roll back receive credits and credits to be offered */ - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += new_credits; - spin_unlock(&info->lock_new_credits_offered); - atomic_sub(new_credits, &info->receive_credits); + /* roll back the granted receive credits */ + atomic_sub(new_credits, &sc->recv_io.credits.count); err_alloc: - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); err_wait_send_queue: /* roll back send credits and pending */ - atomic_inc(&info->send_credits); + atomic_inc(&sc->send_io.credits.count); err_wait_credit: return rc; @@ -1008,15 +1262,15 @@ static int smbd_post_send_iter(struct smbd_connection *info, * Empty message is used to extend credits to peer to for keep live * while there is no upper layer payload to send at the time */ -static int smbd_post_send_empty(struct smbd_connection *info) +static int smbd_post_send_empty(struct smbdirect_socket *sc) { int remaining_data_length = 0; - info->count_send_empty++; - return smbd_post_send_iter(info, NULL, &remaining_data_length); + sc->statistics.send_empty++; + return smbd_post_send_iter(sc, NULL, &remaining_data_length); } -static int smbd_post_send_full_iter(struct smbd_connection *info, +static int smbd_post_send_full_iter(struct smbdirect_socket *sc, struct iov_iter *iter, int *_remaining_data_length) { @@ -1029,7 +1283,7 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, */ while (iov_iter_count(iter) > 0) { - rc = smbd_post_send_iter(info, iter, _remaining_data_length); + rc = smbd_post_send_iter(sc, iter, _remaining_data_length); if (rc < 0) break; } @@ -1043,9 +1297,8 @@ static int smbd_post_send_full_iter(struct smbd_connection *info, * The interaction is controlled by send/receive credit system */ static int smbd_post_recv( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr recv_wr; int rc = -EIO; @@ -1071,7 +1324,7 @@ static int smbd_post_recv( ib_dma_unmap_single(sc->ib.dev, response->sge.addr, response->sge.length, DMA_FROM_DEVICE); response->sge.length = 0; - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); } @@ -1079,31 +1332,36 @@ static int smbd_post_recv( } /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ -static int smbd_negotiate(struct smbd_connection *info) +static int smbd_negotiate(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int rc; - struct smbdirect_recv_io *response = get_receive_buffer(info); + struct smbdirect_recv_io 
*response = get_receive_buffer(sc); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; - rc = smbd_post_recv(info, response); + rc = smbd_post_recv(sc, response); log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); - if (rc) + if (rc) { + put_receive_buffer(sc, response); return rc; + } - init_completion(&info->negotiate_completion); - info->negotiate_done = false; - rc = smbd_post_send_negotiate_req(info); + rc = smbd_post_send_negotiate_req(sc); if (rc) return rc; - rc = wait_for_completion_interruptible_timeout( - &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); - log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); + rc = wait_event_interruptible_timeout( + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); - if (info->negotiate_done) + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) return 0; if (rc == 0) @@ -1127,13 +1385,13 @@ static int smbd_negotiate(struct smbd_connection *info) * data_length: the size of payload in this packet */ static void enqueue_reassembly( - struct smbd_connection *info, + struct smbdirect_socket *sc, struct smbdirect_recv_io *response, int data_length) { - struct smbdirect_socket *sc = &info->socket; + unsigned long flags; - spin_lock(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_add_tail(&response->list, &sc->recv_io.reassembly.list); sc->recv_io.reassembly.queue_length++; /* @@ -1144,9 +1402,8 @@ static void enqueue_reassembly( */ virt_wmb(); sc->recv_io.reassembly.data_length += data_length; - spin_unlock(&sc->recv_io.reassembly.lock); - info->count_reassembly_queue++; - info->count_enqueue_reassembly_queue++; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + sc->statistics.enqueue_reassembly_queue++; } /* @@ -1154,9 +1411,8 @@ static void enqueue_reassembly( * Caller is responsible for locking * return value: the first entry if any, NULL if queue is empty */ -static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info) +static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; if (!list_empty(&sc->recv_io.reassembly.list)) { @@ -1173,9 +1429,8 @@ static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *i * pre-allocated in advance. * return value: the receive buffer, NULL if none is available */ -static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info) +static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *ret = NULL; unsigned long flags; @@ -1185,8 +1440,7 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&ret->list); - info->count_receive_queue--; - info->count_get_receive_buffer++; + sc->statistics.get_receive_buffer++; } spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); @@ -1200,9 +1454,8 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info * receive buffer is returned. 
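 *
 * For orientation, the buffer cycle in this file is roughly:
 *
 *	get_receive_buffer() -> smbd_post_recv() -> recv_done()
 *		-> enqueue_reassembly()   (packets carrying data)
 *		-> put_receive_buffer()   (empty packets and errors)
 *
 * and put_receive_buffer() kicks recv_io.posted.refill_work so
 * that buffers are reposted and fresh credits can be granted.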
*/ static void put_receive_buffer( - struct smbd_connection *info, struct smbdirect_recv_io *response) + struct smbdirect_socket *sc, struct smbdirect_recv_io *response) { - struct smbdirect_socket *sc = &info->socket; unsigned long flags; if (likely(response->sge.length != 0)) { @@ -1215,31 +1468,18 @@ static void put_receive_buffer( spin_lock_irqsave(&sc->recv_io.free.lock, flags); list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; - info->count_put_receive_buffer++; + sc->statistics.put_receive_buffer++; spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - queue_work(info->workqueue, &info->post_send_credits_work); + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } /* Preallocate all receive buffer on transport establishment */ -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) +static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; int i; - INIT_LIST_HEAD(&sc->recv_io.reassembly.list); - spin_lock_init(&sc->recv_io.reassembly.lock); - sc->recv_io.reassembly.data_length = 0; - sc->recv_io.reassembly.queue_length = 0; - - INIT_LIST_HEAD(&sc->recv_io.free.list); - spin_lock_init(&sc->recv_io.free.lock); - info->count_receive_queue = 0; - - init_waitqueue_head(&info->wait_receive_queues); - for (i = 0; i < num_buf; i++) { response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL); if (!response) @@ -1248,7 +1488,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) response->socket = sc; response->sge.length = 0; list_add_tail(&response->list, &sc->recv_io.free.list); - info->count_receive_queue++; } return 0; @@ -1259,45 +1498,59 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) &sc->recv_io.free.list, struct smbdirect_recv_io, list); list_del(&response->list); - info->count_receive_queue--; mempool_free(response, sc->recv_io.mem.pool); } return -ENOMEM; } -static void destroy_receive_buffers(struct smbd_connection *info) +static void destroy_receive_buffers(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_recv_io *response; - while ((response = get_receive_buffer(info))) + while ((response = get_receive_buffer(sc))) mempool_free(response, sc->recv_io.mem.pool); } +static void send_immediate_empty_message(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(sc); +} + /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ static void idle_connection_timer(struct work_struct *work) { - struct smbd_connection *info = container_of( - work, struct smbd_connection, - idle_timer_work.work); - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); struct smbdirect_socket_parameters *sp = &sc->parameters; - if (info->keep_alive_requested != KEEP_ALIVE_NONE) { + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { log_keep_alive(ERR, - "error status info->keep_alive_requested=%d\n", - info->keep_alive_requested); - smbd_disconnect_rdma_connection(info); + "error status sc->idle.keepalive=%d\n", + sc->idle.keepalive); + smbd_disconnect_rdma_connection(sc); return; } - log_keep_alive(INFO, "about to send an 
empty idle message\n"); - smbd_post_send_empty(info); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; - /* Setup the next idle timeout work */ - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + log_keep_alive(INFO, "schedule send of empty idle message\n"); + queue_work(sc->workqueue, &sc->idle.immediate_work); } /* @@ -1309,7 +1562,6 @@ void smbd_destroy(struct TCP_Server_Info *server) { struct smbd_connection *info = server->smbd_conn; struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; struct smbdirect_recv_io *response; unsigned long flags; @@ -1318,35 +1570,51 @@ void smbd_destroy(struct TCP_Server_Info *server) return; } sc = &info->socket; - sp = &sc->parameters; + + log_rdma_event(INFO, "cancelling and disable disconnect_work\n"); + disable_work_sync(&sc->disconnect_work); log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) { - rdma_disconnect(sc->rdma.cm_id); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smbd_disconnect_rdma_work(&sc->disconnect_work); log_rdma_event(INFO, "wait for transport being disconnected\n"); wait_event_interruptible( - info->status_wait, + sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smbd_disconnect_rdma_work(), but call it again... + */ + smbd_disconnect_wake_up_all(sc); + + log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n"); + disable_work_sync(&sc->recv_io.posted.refill_work); + log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); rdma_destroy_qp(sc->rdma.cm_id); sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&sc->idle.timer_work); + log_rdma_event(INFO, "cancelling send immediate work\n"); + disable_work_sync(&sc->idle.immediate_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); do { spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); if (response) { list_del(&response->list); spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); - put_receive_buffer(info, response); + put_receive_buffer(sc, response); } else spin_unlock_irqrestore( &sc->recv_io.reassembly.lock, flags); @@ -1354,9 +1622,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->recv_io.reassembly.data_length = 0; log_rdma_event(INFO, "free receive buffers\n"); - wait_event(info->wait_receive_queues, - info->count_receive_queue == sp->recv_credit_max); - destroy_receive_buffers(info); + destroy_receive_buffers(sc); /* * For performance reasons, memory registration and deregistration @@ -1366,13 +1632,12 @@ void smbd_destroy(struct TCP_Server_Info *server) * path when sending data, and then release memory registrations. 
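 *
 * The loop below therefore drops the server lock while sleeping,
 * so that in-flight I/O still holding an MR can make progress and
 * release it, letting mr_io.used.count drain to zero before the
 * MR list is destroyed.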
*/ log_rdma_event(INFO, "freeing mr list\n"); - wake_up_interruptible_all(&info->wait_mr); - while (atomic_read(&info->mr_used_count)) { + while (atomic_read(&sc->mr_io.used.count)) { cifs_server_unlock(server); msleep(1000); cifs_server_lock(server); } - destroy_mr_list(info); + destroy_mr_list(sc); ib_free_cq(sc->ib.send_cq); ib_free_cq(sc->ib.recv_cq); @@ -1388,7 +1653,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status = SMBDIRECT_SOCKET_DESTROYED; - destroy_workqueue(info->workqueue); + destroy_workqueue(sc->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); server->smbd_conn = NULL; @@ -1430,12 +1695,9 @@ int smbd_reconnect(struct TCP_Server_Info *server) return -ENOENT; } -static void destroy_caches_and_workqueue(struct smbd_connection *info) +static void destroy_caches(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - - destroy_receive_buffers(info); - destroy_workqueue(info->workqueue); + destroy_receive_buffers(sc); mempool_destroy(sc->recv_io.mem.pool); kmem_cache_destroy(sc->recv_io.mem.cache); mempool_destroy(sc->send_io.mem.pool); @@ -1443,9 +1705,8 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) } #define MAX_NAME_LEN 80 -static int allocate_caches_and_workqueue(struct smbd_connection *info) +static int allocate_caches(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; char name[MAX_NAME_LEN]; int rc; @@ -1453,7 +1714,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) return -ENOMEM; - scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc); sc->send_io.mem.cache = kmem_cache_create( name, @@ -1469,7 +1730,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!sc->send_io.mem.pool) goto out1; - scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc); struct kmem_cache_args response_args = { .align = __alignof__(struct smbdirect_recv_io), @@ -1490,21 +1751,14 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!sc->recv_io.mem.pool) goto out3; - scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); - info->workqueue = create_workqueue(name); - if (!info->workqueue) - goto out4; - - rc = allocate_receive_buffers(info, sp->recv_credit_max); + rc = allocate_receive_buffers(sc, sp->recv_credit_max); if (rc) { log_rdma_event(ERR, "failed to allocate receive buffers\n"); - goto out5; + goto out4; } return 0; -out5: - destroy_workqueue(info->workqueue); out4: mempool_destroy(sc->recv_io.mem.pool); out3: @@ -1528,46 +1782,63 @@ static struct smbd_connection *_smbd_get_connection( struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; + char wq_name[80]; + struct workqueue_struct *workqueue; info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) return NULL; sc = &info->socket; + scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc); + workqueue = create_workqueue(wq_name); + if (!workqueue) + goto create_wq_failed; + smbdirect_socket_init(sc); + sc->workqueue = workqueue; sp = &sc->parameters; - sc->status = SMBDIRECT_SOCKET_CONNECTING; - rc = smbd_ia_open(info, dstaddr, port); + 
INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); + + sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; + sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = 1; + sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES; + sp->recv_credit_max = smbd_receive_credit_max; + sp->send_credit_target = smbd_send_credit_target; + sp->max_send_size = smbd_max_send_size; + sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + sp->max_recv_size = smbd_max_receive_size; + sp->max_frmr_depth = smbd_max_frmr_depth; + sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; + sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; + + rc = smbd_ia_open(sc, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); goto create_id_failed; } - if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe || - smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) { + if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe || + sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_send_credit_target, + sp->send_credit_target, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe || - smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) { + if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe || + sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - smbd_receive_credit_max, + sp->recv_credit_max, sc->ib.dev->attrs.max_cqe, sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - sp->recv_credit_max = smbd_receive_credit_max; - sp->send_credit_target = smbd_send_credit_target; - sp->max_send_size = smbd_max_send_size; - sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; - sp->max_recv_size = smbd_max_receive_size; - sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; - if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE || sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { log_rdma_event(ERR, @@ -1579,8 +1850,16 @@ static struct smbd_connection *_smbd_get_connection( goto config_failed; } + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { + rc = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; + log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); + goto alloc_pd_failed; + } + sc->ib.send_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->send_credit_target, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.send_cq)) { sc->ib.send_cq = NULL; @@ -1588,7 +1867,7 @@ static struct smbd_connection *_smbd_get_connection( } sc->ib.recv_cq = - ib_alloc_cq_any(sc->ib.dev, info, + ib_alloc_cq_any(sc->ib.dev, sc, sp->recv_credit_max, IB_POLL_SOFTIRQ); if (IS_ERR(sc->ib.recv_cq)) { sc->ib.recv_cq = NULL; @@ -1597,7 +1876,7 @@ static struct smbd_connection *_smbd_get_connection( memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smbd_qp_async_error_upcall; - qp_attr.qp_context = info; + qp_attr.qp_context = sc; qp_attr.cap.max_send_wr = sp->send_credit_target; qp_attr.cap.max_recv_wr = sp->recv_credit_max; qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; @@ -1616,22 +1895,22 @@ static struct smbd_connection 
*_smbd_get_connection( } sc->ib.qp = sc->rdma.cm_id->qp; - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = 0; - - conn_param.responder_resources = - min(sc->ib.dev->attrs.max_qp_rd_atom, - SMBD_CM_RESPONDER_RESOURCES); - info->responder_resources = conn_param.responder_resources; + sp->responder_resources = + min_t(u8, sp->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); log_rdma_mr(INFO, "responder_resources=%d\n", - info->responder_resources); + sp->responder_resources); + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; /* Need to send IRD/ORD in private data for iWARP */ sc->ib.dev->ops.get_port_immutable( sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = info->responder_resources; - ird_ord_hdr[1] = 1; + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -1646,8 +1925,8 @@ static struct smbd_connection *_smbd_get_connection( log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", &addr_in->sin_addr, port); - init_waitqueue_head(&info->status_wait); - init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; rc = rdma_connect(sc->rdma.cm_id, &conn_param); if (rc) { log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); @@ -1655,45 +1934,42 @@ static struct smbd_connection *_smbd_get_connection( } wait_event_interruptible_timeout( - info->status_wait, - sc->status != SMBDIRECT_SOCKET_CONNECTING, - msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, + msecs_to_jiffies(sp->rdma_connect_timeout_msec)); - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); goto rdma_connect_failed; } log_rdma_event(INFO, "rdma_connect connected\n"); - rc = allocate_caches_and_workqueue(info); + rc = allocate_caches(sc); if (rc) { log_rdma_event(ERR, "cache allocation failed\n"); goto allocate_cache_failed; } - init_waitqueue_head(&info->wait_send_queue); - INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); - queue_delayed_work(info->workqueue, &info->idle_timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); - - init_waitqueue_head(&info->wait_send_pending); - atomic_set(&info->send_pending, 0); - - init_waitqueue_head(&info->wait_post_send); + INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message); + INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer); + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. 
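+	 *
+	 * For reference, the keepalive cycle this arms is:
+	 *
+	 *	NONE    --timer fires-->    PENDING (queue immediate_work)
+	 *	PENDING --empty msg sent--> SENT    (re-arm with timeout)
+	 *	SENT    --any receive-->    NONE    (re-arm with interval)
+	 *
+	 * and a timer expiry while the state is not NONE disconnects
+	 * the socket.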
+ */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); - INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); - INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); - info->new_credits_offered = 0; - spin_lock_init(&info->lock_new_credits_offered); + INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); - rc = smbd_negotiate(info); + rc = smbd_negotiate(sc); if (rc) { log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); goto negotiation_failed; } - rc = allocate_mr_list(info); + rc = allocate_mr_list(sc); if (rc) { log_rdma_mr(ERR, "memory registration allocation failed\n"); goto allocate_mr_failed; @@ -1708,11 +1984,11 @@ static struct smbd_connection *_smbd_get_connection( return NULL; negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); - destroy_caches_and_workqueue(info); + disable_delayed_work_sync(&sc->idle.timer_work); + destroy_caches(sc); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); - wait_event(info->status_wait, + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); allocate_cache_failed: @@ -1726,11 +2002,15 @@ static struct smbd_connection *_smbd_get_connection( if (sc->ib.recv_cq) ib_free_cq(sc->ib.recv_cq); -config_failed: ib_dealloc_pd(sc->ib.pd); + +alloc_pd_failed: +config_failed: rdma_destroy_id(sc->rdma.cm_id); create_id_failed: + destroy_workqueue(sc->workqueue); +create_wq_failed: kfree(info); return NULL; } @@ -1739,6 +2019,7 @@ struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr) { struct smbd_connection *ret; + const struct smbdirect_socket_parameters *sp; int port = SMBD_PORT; try_again: @@ -1749,6 +2030,16 @@ struct smbd_connection *smbd_get_connection( port = SMB_PORT; goto try_again; } + if (!ret) + return NULL; + + sp = &ret->socket.parameters; + + server->rdma_readwrite_threshold = + rdma_readwrite_threshold > sp->max_fragmented_send_size ? 
+ sp->max_fragmented_send_size : + rdma_readwrite_threshold; + return ret; } @@ -1790,6 +2081,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -1804,7 +2096,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) to_read = size; offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - response = _get_first_reassembly(info); + response = _get_first_reassembly(sc); data_transfer = smbdirect_recv_io_payload(response); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = @@ -1849,16 +2141,15 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) if (queue_length) list_del(&response->list); else { - spin_lock_irq( - &sc->recv_io.reassembly.lock); + spin_lock_irqsave( + &sc->recv_io.reassembly.lock, flags); list_del(&response->list); - spin_unlock_irq( - &sc->recv_io.reassembly.lock); + spin_unlock_irqrestore( + &sc->recv_io.reassembly.lock, flags); } queue_removed++; - info->count_reassembly_queue--; - info->count_dequeue_reassembly_queue++; - put_receive_buffer(info, response); + sc->statistics.dequeue_reassembly_queue++; + put_receive_buffer(sc, response); offset = 0; log_read(INFO, "put_receive_buffer offset=0\n"); } else @@ -1872,10 +2163,10 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) to_read, data_read, offset); } - spin_lock_irq(&sc->recv_io.reassembly.lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.data_length -= data_read; sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irq(&sc->recv_io.reassembly.lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.first_entry_offset = offset; log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", @@ -1960,13 +2251,13 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + rc = smbd_post_send_full_iter(sc, &rqst->rq_iter, &remaining_data_length); if (rc < 0) break; @@ -1981,8 +2272,8 @@ int smbd_send(struct TCP_Server_Info *server, * that means all the I/Os have been out and we are good to return */ - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0 || + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) @@ -1993,14 +2284,13 @@ int smbd_send(struct TCP_Server_Info *server, static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *mr; - struct ib_cqe *cqe; + struct smbdirect_mr_io *mr = + container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); + struct smbdirect_socket *sc = mr->socket; if (wc->status) { log_rdma_mr(ERR, "status=%d\n", wc->status); - cqe = wc->wr_cqe; - mr = container_of(cqe, struct smbd_mr, cqe); - smbd_disconnect_rdma_connection(mr->conn); + smbd_disconnect_rdma_connection(sc); } } @@ -2015,14 +2305,14 @@ static 
void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) */ static void smbd_mr_recovery_work(struct work_struct *work) { - struct smbd_connection *info = - container_of(work, struct smbd_connection, mr_recovery_work); - struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, mr_io.recovery_work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_mr_io *smbdirect_mr; int rc; - list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_ERROR) { + list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) { + if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2030,25 +2320,25 @@ static void smbd_mr_recovery_work(struct work_struct *work) log_rdma_mr(ERR, "ib_dereg_mr failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); continue; } smbdirect_mr->mr = ib_alloc_mr( - sc->ib.pd, info->mr_type, - info->max_frmr_depth); + sc->ib.pd, sc->mr_io.type, + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); - smbd_disconnect_rdma_connection(info); + sc->mr_io.type, + sp->max_frmr_depth); + smbd_disconnect_rdma_connection(sc); continue; } } else /* This MR is being used, don't recover it */ continue; - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = SMBDIRECT_MR_READY; /* smbdirect_mr->state is updated by this function * and is read and updated by I/O issuing CPUs trying @@ -2057,19 +2347,18 @@ static void smbd_mr_recovery_work(struct work_struct *work) * value is updated before waking up any calls to * get_mr() from the I/O issuing CPUs */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } } -static void destroy_mr_list(struct smbd_connection *info) +static void destroy_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *mr, *tmp; + struct smbdirect_mr_io *mr, *tmp; - cancel_work_sync(&info->mr_recovery_work); - list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { - if (mr->state == MR_INVALIDATED) + disable_work_sync(&sc->mr_io.recovery_work); + list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { + if (mr->state == SMBDIRECT_MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); @@ -2085,32 +2374,32 @@ static void destroy_mr_list(struct smbd_connection *info) * Recovery is done in smbd_mr_recovery_work. 
The content of list entry changes * as MRs are used and recovered for I/O, but the list links will not change */ -static int allocate_mr_list(struct smbd_connection *info) +static int allocate_mr_list(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int i; - struct smbd_mr *smbdirect_mr, *tmp; - - INIT_LIST_HEAD(&info->mr_list); - init_waitqueue_head(&info->wait_mr); - spin_lock_init(&info->mr_list_lock); - atomic_set(&info->mr_ready_count, 0); - atomic_set(&info->mr_used_count, 0); - init_waitqueue_head(&info->wait_for_mr_cleanup); - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + struct smbdirect_mr_io *smbdirect_mr, *tmp; + + INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + + if (sp->responder_resources == 0) { + log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); + return -EINVAL; + } + /* Allocate more MRs (2x) than hardware responder_resources */ - for (i = 0; i < info->responder_resources * 2; i++) { + for (i = 0; i < sp->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type, - info->max_frmr_depth); + smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, + sp->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); + sc->mr_io.type, sp->max_frmr_depth); goto out; } - smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, sizeof(struct scatterlist), GFP_KERNEL); if (!smbdirect_mr->sgt.sgl) { @@ -2118,18 +2407,18 @@ static int allocate_mr_list(struct smbd_connection *info) ib_dereg_mr(smbdirect_mr->mr); goto out; } - smbdirect_mr->state = MR_READY; - smbdirect_mr->conn = info; + smbdirect_mr->state = SMBDIRECT_MR_READY; + smbdirect_mr->socket = sc; - list_add_tail(&smbdirect_mr->list, &info->mr_list); - atomic_inc(&info->mr_ready_count); + list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); + atomic_inc(&sc->mr_io.ready.count); } return 0; out: kfree(smbdirect_mr); cleanup_entries: - list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); kfree(smbdirect_mr->sgt.sgl); @@ -2146,14 +2435,14 @@ static int allocate_mr_list(struct smbd_connection *info) * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. 
*/ -static struct smbd_mr *get_mr(struct smbd_connection *info) +static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) { - struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *ret; + struct smbdirect_mr_io *ret; + unsigned long flags; int rc; again: - rc = wait_event_interruptible(info->wait_mr, - atomic_read(&info->mr_ready_count) || + rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, + atomic_read(&sc->mr_io.ready.count) || sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) { log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); @@ -2165,18 +2454,18 @@ static struct smbd_mr *get_mr(struct smbd_connection *info) return NULL; } - spin_lock(&info->mr_list_lock); - list_for_each_entry(ret, &info->mr_list, list) { - if (ret->state == MR_READY) { - ret->state = MR_REGISTERED; - spin_unlock(&info->mr_list_lock); - atomic_dec(&info->mr_ready_count); - atomic_inc(&info->mr_used_count); + spin_lock_irqsave(&sc->mr_io.all.lock, flags); + list_for_each_entry(ret, &sc->mr_io.all.list, list) { + if (ret->state == SMBDIRECT_MR_READY) { + ret->state = SMBDIRECT_MR_REGISTERED; + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); + atomic_dec(&sc->mr_io.ready.count); + atomic_inc(&sc->mr_io.used.count); return ret; } } - spin_unlock(&info->mr_list_lock); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); /* * It is possible that we could fail to get MR because other processes may * try to acquire a MR at the same time. If this is the case, retry it. @@ -2187,8 +2476,7 @@ static struct smbd_mr *get_mr(struct smbd_connection *info) /* * Transcribe the pages from an iterator into an MR scatterlist. */ -static int smbd_iter_to_mr(struct smbd_connection *info, - struct iov_iter *iter, +static int smbd_iter_to_mr(struct iov_iter *iter, struct sg_table *sgt, unsigned int max_sg) { @@ -2210,25 +2498,26 @@ static int smbd_iter_to_mr(struct smbd_connection *info, * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. 
*/ -struct smbd_mr *smbd_register_mr(struct smbd_connection *info, +struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate) { struct smbdirect_socket *sc = &info->socket; - struct smbd_mr *smbdirect_mr; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_mr_io *smbdirect_mr; int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; - num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); - if (num_pages > info->max_frmr_depth) { + num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); + if (num_pages > sp->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", - num_pages, info->max_frmr_depth); + num_pages, sp->max_frmr_depth); WARN_ON_ONCE(1); return NULL; } - smbdirect_mr = get_mr(info); + smbdirect_mr = get_mr(sc); if (!smbdirect_mr) { log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; @@ -2241,8 +2530,8 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, smbdirect_mr->sgt.orig_nents = 0; log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", - num_pages, iov_iter_count(iter), info->max_frmr_depth); - smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + num_pages, iov_iter_count(iter), sp->max_frmr_depth); + smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, dir); @@ -2287,32 +2576,32 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", rc, reg_wr->key); - /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ + /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/ map_mr_error: ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: - smbdirect_mr->state = MR_ERROR; - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + smbdirect_mr->state = SMBDIRECT_MR_ERROR; + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); return NULL; } static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_mr *smbdirect_mr; + struct smbdirect_mr_io *smbdirect_mr; struct ib_cqe *cqe; cqe = wc->wr_cqe; - smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe); + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; if (wc->status != IB_WC_SUCCESS) { log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); - smbdirect_mr->state = MR_ERROR; + smbdirect_mr->state = SMBDIRECT_MR_ERROR; } complete(&smbdirect_mr->invalidate_done); } @@ -2323,11 +2612,10 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) * and we have to locally invalidate the buffer to prevent data is being * modified by remote peer after upper layer consumes it */ -int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) +int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) { struct ib_send_wr *wr; - struct smbd_connection *info = smbdirect_mr->conn; - struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket *sc = smbdirect_mr->socket; int rc = 0; if (smbdirect_mr->need_invalidate) { @@ -2344,36 +2632,36 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) rc = ib_post_send(sc->ib.qp, wr, NULL); 
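	/*
	 * The local invalidation work request was just posted; on success
	 * the code below waits for invalidate_done, which local_inv_done()
	 * completes from the CQ handler once the invalidation has finished
	 * (or after marking the MR SMBDIRECT_MR_ERROR on failure).
	 */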
if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); + smbd_disconnect_rdma_connection(sc); goto done; } wait_for_completion(&smbdirect_mr->invalidate_done); smbdirect_mr->need_invalidate = false; } else /* - * For remote invalidation, just set it to MR_INVALIDATED + * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED * and defer to mr_recovery_work to recover the MR for next use */ - smbdirect_mr->state = MR_INVALIDATED; + smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; - if (smbdirect_mr->state == MR_INVALIDATED) { + if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { ib_dma_unmap_sg( sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); - smbdirect_mr->state = MR_READY; - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); + smbdirect_mr->state = SMBDIRECT_MR_READY; + if (atomic_inc_return(&sc->mr_io.ready.count) == 1) + wake_up(&sc->mr_io.ready.wait_queue); } else /* * Schedule the work to do MR recovery for future I/Os MR * recovery is slow and don't want it to block current I/O */ - queue_work(info->workqueue, &info->mr_recovery_work); + queue_work(sc->workqueue, &sc->mr_io.recovery_work); done: - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); + if (atomic_dec_and_test(&sc->mr_io.used.count)) + wake_up(&sc->mr_io.cleanup.wait_queue); return rc; } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index e45aa9ddd71da5..d67ac5ddaff4e5 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -27,12 +27,6 @@ extern int smbd_max_send_size; extern int smbd_send_credit_target; extern int smbd_receive_credit_max; -enum keep_alive_status { - KEEP_ALIVE_NONE, - KEEP_ALIVE_PENDING, - KEEP_ALIVE_SENT, -}; - /* * The context for the SMBDirect transport * Everything related to the transport is here. It has several logical parts @@ -44,79 +38,14 @@ enum keep_alive_status { */ struct smbd_connection { struct smbdirect_socket socket; - - int ri_rc; - struct completion ri_done; - wait_queue_head_t status_wait; - - struct completion negotiate_completion; - bool negotiate_done; - - struct work_struct disconnect_work; - struct work_struct post_send_credits_work; - - spinlock_t lock_new_credits_offered; - int new_credits_offered; - - /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ - enum keep_alive_status keep_alive_requested; - int protocol; - atomic_t send_credits; - atomic_t receive_credits; - int receive_credit_target; - - /* Memory registrations */ - /* Maximum number of RDMA read/write outstanding on this connection */ - int responder_resources; - /* Maximum number of pages in a single RDMA write/read on this connection */ - int max_frmr_depth; - /* - * If payload is less than or equal to the threshold, - * use RDMA send/recv to send upper layer I/O. - * If payload is more than the threshold, - * use RDMA read/write through memory registration for I/O. 
- */ - int rdma_readwrite_threshold; - enum ib_mr_type mr_type; - struct list_head mr_list; - spinlock_t mr_list_lock; - /* The number of available MRs ready for memory registration */ - atomic_t mr_ready_count; - atomic_t mr_used_count; - wait_queue_head_t wait_mr; - struct work_struct mr_recovery_work; - /* Used by transport to wait until all MRs are returned */ - wait_queue_head_t wait_for_mr_cleanup; - - /* Activity accounting */ - atomic_t send_pending; - wait_queue_head_t wait_send_pending; - wait_queue_head_t wait_post_send; - - /* Receive queue */ - int count_receive_queue; - wait_queue_head_t wait_receive_queues; - - bool send_immediate; - - wait_queue_head_t wait_send_queue; - - struct workqueue_struct *workqueue; - struct delayed_work idle_timer_work; - - /* for debug purposes */ - unsigned int count_get_receive_buffer; - unsigned int count_put_receive_buffer; - unsigned int count_reassembly_queue; - unsigned int count_enqueue_reassembly_queue; - unsigned int count_dequeue_reassembly_queue; - unsigned int count_send_empty; }; /* Create a SMBDirect session */ struct smbd_connection *smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr); +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn); + /* Reconnect SMBDirect session */ int smbd_reconnect(struct TCP_Server_Info *server); /* Destroy SMBDirect session */ @@ -127,34 +56,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg); int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst); -enum mr_state { - MR_READY, - MR_REGISTERED, - MR_INVALIDATED, - MR_ERROR -}; - -struct smbd_mr { - struct smbd_connection *conn; - struct list_head list; - enum mr_state state; - struct ib_mr *mr; - struct sg_table sgt; - enum dma_data_direction dir; - union { - struct ib_reg_wr wr; - struct ib_send_wr inv_wr; - }; - struct ib_cqe cqe; - bool need_invalidate; - struct completion invalidate_done; -}; - /* Interfaces to register and deregister MR for RDMA read/write */ -struct smbd_mr *smbd_register_mr( +struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbd_mr *mr); +int smbd_deregister_mr(struct smbdirect_mr_io *mr); #else #define cifs_rdma_enabled(server) 0 diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index fe0e075bc63c3c..fd650e2afc7629 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); @@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); 
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); @@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index b9a385344ff31c..05cc6a9d0ccd3e 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -23,6 +23,12 @@ struct smbdirect_buffer_descriptor_v1 { * Some values are important for the upper layer. 
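 * The *_timeout_msec fields below bound the individual connection
 * setup stages (address resolution, route resolution, RDMA connect
 * and SMBDirect negotiation), while initiator_depth and
 * responder_resources carry the negotiated IRD/ORD limits.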
*/ struct smbdirect_socket_parameters { + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u8 initiator_depth; + __u8 responder_resources; __u16 recv_credit_max; __u16 send_credit_target; __u32 max_send_size; @@ -30,6 +36,7 @@ struct smbdirect_socket_parameters { __u32 max_recv_size; __u32 max_fragmented_recv_size; __u32 max_read_write_size; + __u32 max_frmr_depth; __u32 keepalive_interval_msec; __u32 keepalive_timeout_msec; } __packed; diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 3c4a8d627aa3f5..db22a1d0546b4e 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -6,22 +6,102 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ #define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ +#include + enum smbdirect_socket_status { SMBDIRECT_SOCKET_CREATED, - SMBDIRECT_SOCKET_CONNECTING, - SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, + SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED, + SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED, + SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, + SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED, + SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, SMBDIRECT_SOCKET_NEGOTIATE_FAILED, + SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_ERROR, SMBDIRECT_SOCKET_DISCONNECTING, SMBDIRECT_SOCKET_DISCONNECTED, SMBDIRECT_SOCKET_DESTROYED }; +static __always_inline +const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) +{ + switch (status) { + case SMBDIRECT_SOCKET_CREATED: + return "CREATED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + return "RESOLVE_ADDR_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + return "RESOLVE_ADDR_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + return "RESOLVE_ADDR_FAILED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + return "RESOLVE_ROUTE_NEEDED"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + return "RESOLVE_ROUTE_RUNNING"; + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + return "RESOLVE_ROUTE_FAILED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + return "RDMA_CONNECT_NEEDED"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + return "RDMA_CONNECT_RUNNING"; + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + return "RDMA_CONNECT_FAILED"; + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + return "NEGOTIATE_NEEDED"; + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + return "NEGOTIATE_RUNNING"; + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + return "NEGOTIATE_FAILED"; + case SMBDIRECT_SOCKET_CONNECTED: + return "CONNECTED"; + case SMBDIRECT_SOCKET_ERROR: + return "ERROR"; + case SMBDIRECT_SOCKET_DISCONNECTING: + return "DISCONNECTING"; + case SMBDIRECT_SOCKET_DISCONNECTED: + return "DISCONNECTED"; + case SMBDIRECT_SOCKET_DESTROYED: + return "DESTROYED"; + } + + return ""; +} + +enum smbdirect_keepalive_status { + SMBDIRECT_KEEPALIVE_NONE, + SMBDIRECT_KEEPALIVE_PENDING, + SMBDIRECT_KEEPALIVE_SENT +}; + struct smbdirect_socket { enum smbdirect_socket_status status; + wait_queue_head_t status_wait; + int first_error; + + /* + * This points to the workqueue to + * be used for this socket. 
+ * It can be per socket (on the client) + * or point to a global workqueue (on the server) + */ + struct workqueue_struct *workqueue; + + struct work_struct disconnect_work; /* RDMA related */ struct { struct rdma_cm_id *cm_id; + /* + * This is for iWarp MPA v1 + */ + bool legacy_iwarp; } rdma; /* IB verbs related */ @@ -39,6 +119,15 @@ struct smbdirect_socket { struct smbdirect_socket_parameters parameters; + /* + * The state for keepalive and timeout handling + */ + struct { + enum smbdirect_keepalive_status keepalive; + struct work_struct immediate_work; + struct delayed_work timer_work; + } idle; + /* * The state for posted send buffers */ @@ -51,6 +140,29 @@ struct smbdirect_socket { struct kmem_cache *cache; mempool_t *pool; } mem; + + /* + * The credit state for the send side + */ + struct { + atomic_t count; + wait_queue_head_t wait_queue; + } credits; + + /* + * The state about posted/pending sends + */ + struct { + atomic_t count; + /* + * woken when count is decremented + */ + wait_queue_head_t dec_wait_queue; + /* + * woken when count reached zero + */ + wait_queue_head_t zero_wait_queue; + } pending; } send_io; /* @@ -84,6 +196,23 @@ struct smbdirect_socket { spinlock_t lock; } free; + /* + * The state for posted recv_io messages + * and the refill work struct. + */ + struct { + atomic_t count; + struct work_struct refill_work; + } posted; + + /* + * The credit state for the recv side + */ + struct { + u16 target; + atomic_t count; + } credits; + /* * The list of arrived non-empty smbdirect_recv_io * structures @@ -110,8 +239,137 @@ struct smbdirect_socket { bool full_packet_received; } reassembly; } recv_io; + + /* + * The state for Memory registrations on the client + */ + struct { + enum ib_mr_type type; + + /* + * The list of free smbdirect_mr_io + * structures + */ + struct { + struct list_head list; + spinlock_t lock; + } all; + + /* + * The number of available MRs ready for memory registration + */ + struct { + atomic_t count; + wait_queue_head_t wait_queue; + } ready; + + /* + * The number of used MRs + */ + struct { + atomic_t count; + } used; + + struct work_struct recovery_work; + + /* Used by transport to wait until all MRs are returned */ + struct { + wait_queue_head_t wait_queue; + } cleanup; + } mr_io; + + /* + * The state for RDMA read/write requests on the server + */ + struct { + /* + * The credit state for the send side + */ + struct { + /* + * The maximum number of rw credits + */ + size_t max; + /* + * The number of pages per credit + */ + size_t num_pages; + atomic_t count; + wait_queue_head_t wait_queue; + } credits; + } rw_io; + + /* + * For debug purposes + */ + struct { + u64 get_receive_buffer; + u64 put_receive_buffer; + u64 enqueue_reassembly_queue; + u64 dequeue_reassembly_queue; + u64 send_empty; + } statistics; }; +static void __smbdirect_socket_disabled_work(struct work_struct *work) +{ + /* + * Should never be called as disable_[delayed_]work_sync() was used. 
+ */ + WARN_ON_ONCE(1); +} + +static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) +{ + /* + * This also sets status = SMBDIRECT_SOCKET_CREATED + */ + BUILD_BUG_ON(SMBDIRECT_SOCKET_CREATED != 0); + memset(sc, 0, sizeof(*sc)); + + init_waitqueue_head(&sc->status_wait); + + INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->disconnect_work); + + INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->idle.immediate_work); + INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work); + disable_delayed_work_sync(&sc->idle.timer_work); + + atomic_set(&sc->send_io.credits.count, 0); + init_waitqueue_head(&sc->send_io.credits.wait_queue); + + atomic_set(&sc->send_io.pending.count, 0); + init_waitqueue_head(&sc->send_io.pending.dec_wait_queue); + init_waitqueue_head(&sc->send_io.pending.zero_wait_queue); + + INIT_LIST_HEAD(&sc->recv_io.free.list); + spin_lock_init(&sc->recv_io.free.lock); + + atomic_set(&sc->recv_io.posted.count, 0); + INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->recv_io.posted.refill_work); + + atomic_set(&sc->recv_io.credits.count, 0); + + INIT_LIST_HEAD(&sc->recv_io.reassembly.list); + spin_lock_init(&sc->recv_io.reassembly.lock); + init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + + atomic_set(&sc->rw_io.credits.count, 0); + init_waitqueue_head(&sc->rw_io.credits.wait_queue); + + spin_lock_init(&sc->mr_io.all.lock); + INIT_LIST_HEAD(&sc->mr_io.all.list); + atomic_set(&sc->mr_io.ready.count, 0); + init_waitqueue_head(&sc->mr_io.ready.wait_queue); + atomic_set(&sc->mr_io.used.count, 0); + INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work); + disable_work_sync(&sc->mr_io.recovery_work); + init_waitqueue_head(&sc->mr_io.cleanup.wait_queue); +} + struct smbdirect_send_io { struct smbdirect_socket *socket; struct ib_cqe cqe; @@ -136,6 +394,23 @@ struct smbdirect_send_io { u8 packet[]; }; +struct smbdirect_send_batch { + /* + * List of smbdirect_send_io messages + */ + struct list_head msg_list; + /* + * Number of list entries + */ + size_t wr_cnt; + + /* + * Possible remote key invalidation state + */ + bool need_invalidate_rkey; + u32 remote_key; +}; + struct smbdirect_recv_io { struct smbdirect_socket *socket; struct ib_cqe cqe; @@ -158,4 +433,44 @@ struct smbdirect_recv_io { u8 packet[]; }; +enum smbdirect_mr_state { + SMBDIRECT_MR_READY, + SMBDIRECT_MR_REGISTERED, + SMBDIRECT_MR_INVALIDATED, + SMBDIRECT_MR_ERROR +}; + +struct smbdirect_mr_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + enum smbdirect_mr_state state; + struct ib_mr *mr; + struct sg_table sgt; + enum dma_data_direction dir; + union { + struct ib_reg_wr wr; + struct ib_send_wr inv_wr; + }; + + bool need_invalidate; + struct completion invalidate_done; +}; + +struct smbdirect_rw_io { + struct smbdirect_socket *socket; + struct ib_cqe cqe; + + struct list_head list; + + int error; + struct completion *completion; + + struct rdma_rw_ctx rdma_ctx; + struct sg_table sgt; + struct scatterlist sg_list[]; +}; + #endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */ diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 67c4f73398dfee..91a9344111348a 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -243,7 +243,7 @@ int ksmbd_conn_write(struct ksmbd_work *work) int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, 
unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; @@ -257,7 +257,7 @@ int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { int ret = -EINVAL; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 2aa8084bb59302..07b43634262a19 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -19,6 +19,8 @@ #include "smb_common.h" #include "ksmbd_work.h" +struct smbdirect_buffer_descriptor_v1; + #define KSMBD_SOCKET_BACKLOG 16 enum { @@ -133,11 +135,11 @@ struct ksmbd_transport_ops { unsigned int remote_key); int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int (*rdma_write)(struct ksmbd_transport *t, void *buf, unsigned int len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void (*free_transport)(struct ksmbd_transport *kt); }; @@ -163,11 +165,11 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 72b00ca6e45517..4a71f46d7020be 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void) int ksmbd_workqueue_init(void) { - ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0); + ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0); if (!ksmbd_wq) return -ENOMEM; return 0; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 8c9c49c3a0a473..40420544cc25a2 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -365,6 +365,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl) return; } + pr_info("running\n"); WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING); } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index a565fc36cee6df..0c069eff80b771 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -23,6 +23,7 @@ #include "asn1.h" #include "connection.h" #include "transport_ipc.h" +#include "../common/smbdirect/smbdirect.h" #include "transport_rdma.h" #include "vfs.h" #include "vfs_cache.h" @@ -6665,7 +6666,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) } static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, __le32 Channel, __le16 ChannelInfoLength) { @@ -6701,7 +6702,7 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), 
le16_to_cpu(req->ReadChannelInfoLength)); if (err) @@ -6761,7 +6762,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; - max_read_size = get_smbd_max_read_write_size(); + max_read_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_read_size == 0) { + err = -EINVAL; + goto out; + } } if (is_rdma_channel == true) { @@ -6772,7 +6777,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->ReadChannelInfoLength); @@ -6967,7 +6972,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { @@ -7019,7 +7024,11 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { is_rdma_channel = true; - max_write_size = get_smbd_max_read_write_size(); + max_write_size = get_smbd_max_read_write_size(work->conn->transport); + if (max_write_size == 0) { + err = -EINVAL; + goto out; + } length = le32_to_cpu(req->RemainingBytes); } @@ -7032,7 +7041,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } err = smb2_set_remote_key_for_rdma(work, - (struct smb2_buffer_desc_v1 *) + (struct smbdirect_buffer_descriptor_v1 *) ((char *)req + ch_offset), req->Channel, req->WriteChannelInfoLength); diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 16ae8a10490beb..5163d5241b90d5 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -136,12 +136,6 @@ struct create_posix_rsp { u8 SidBuffer[44]; } __packed; -struct smb2_buffer_desc_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 struct smb_sockaddr_in { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5466aa8c39b1cd..9e644a0daf1c4d 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -23,18 +23,24 @@ #include "connection.h" #include "smb_common.h" #include "../common/smb2status.h" +#include "../common/smbdirect/smbdirect.h" +#include "../common/smbdirect/smbdirect_pdu.h" +#include "../common/smbdirect/smbdirect_socket.h" #include "transport_rdma.h" #define SMB_DIRECT_PORT_IWARP 5445 #define SMB_DIRECT_PORT_INFINIBAND 445 -#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) +#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) -/* SMB_DIRECT negotiation timeout in seconds */ -#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 +/* SMB_DIRECT negotiation timeout (for the server) in seconds */ +#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 -#define SMB_DIRECT_MAX_SEND_SGES 6 -#define SMB_DIRECT_MAX_RECV_SGES 1 +/* The interval to send a keepalive message to the peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120 + +/* The timeout to wait for a keepalive message from peer in seconds */ +#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5 /* * Default maximum number of RDMA read/write outstanding on this connection @@ -87,131 +93,38 @@ static struct smb_direct_listener {
SMB_DIRECT_CS_CONNECTED, - SMB_DIRECT_CS_DISCONNECTING, - SMB_DIRECT_CS_DISCONNECTED, -}; - struct smb_direct_transport { struct ksmbd_transport transport; - enum smb_direct_status status; - bool full_packet_received; - wait_queue_head_t wait_status; - - struct rdma_cm_id *cm_id; - struct ib_cq *send_cq; - struct ib_cq *recv_cq; - struct ib_pd *pd; - struct ib_qp *qp; - - int max_send_size; - int max_recv_size; - int max_fragmented_send_size; - int max_fragmented_recv_size; - int max_rdma_rw_size; - - spinlock_t reassembly_queue_lock; - struct list_head reassembly_queue; - int reassembly_data_length; - int reassembly_queue_length; - int first_entry_offset; - wait_queue_head_t wait_reassembly_queue; - - spinlock_t receive_credit_lock; - int recv_credits; - int count_avail_recvmsg; - int recv_credit_max; - int recv_credit_target; - - spinlock_t recvmsg_queue_lock; - struct list_head recvmsg_queue; - - int send_credit_target; - atomic_t send_credits; - spinlock_t lock_new_recv_credits; - int new_recv_credits; - int max_rw_credits; - int pages_per_rw_credit; - atomic_t rw_credits; - - wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_credits; - - mempool_t *sendmsg_mempool; - struct kmem_cache *sendmsg_cache; - mempool_t *recvmsg_mempool; - struct kmem_cache *recvmsg_cache; - - wait_queue_head_t wait_send_pending; - atomic_t send_pending; - - struct delayed_work post_recv_credits_work; - struct work_struct send_immediate_work; - struct work_struct disconnect_work; - - bool negotiation_requested; + struct smbdirect_socket socket; }; -#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) -#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ +#define KSMBD_TRANS(t) (&(t)->transport) +#define SMBD_TRANS(t) (container_of(t, \ struct smb_direct_transport, transport)) -enum { - SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, - SMB_DIRECT_MSG_DATA_TRANSFER -}; static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; -struct smb_direct_send_ctx { - struct list_head msg_list; - int wr_cnt; - bool need_invalidate_rkey; - unsigned int remote_key; -}; - -struct smb_direct_sendmsg { - struct smb_direct_transport *transport; - struct ib_send_wr wr; - struct list_head list; - int num_sge; - struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES]; - struct ib_cqe cqe; - u8 packet[]; -}; - -struct smb_direct_recvmsg { - struct smb_direct_transport *transport; - struct list_head list; - int type; - struct ib_sge sge; - struct ib_cqe cqe; - bool first_segment; - u8 packet[]; -}; - -struct smb_direct_rdma_rw_msg { - struct smb_direct_transport *t; - struct ib_cqe cqe; - int status; - struct completion *completion; - struct list_head list; - struct rdma_rw_ctx rw_ctx; - struct sg_table sgt; - struct scatterlist sg_list[]; -}; - void init_smbd_max_io_size(unsigned int sz) { sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); smb_direct_max_read_write_size = sz; } -unsigned int get_smbd_max_read_write_size(void) +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { - return smb_direct_max_read_write_size; + struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; + + if (kt->ops != &ksmbd_smb_direct_transport_ops) + return 0; + + t = SMBD_TRANS(kt); + sc = &t->socket; + sp = &sc->parameters; + + return sp->max_read_write_size; } static inline int get_buf_page_count(void *buf, int size) @@ -220,71 +133,65 @@ static inline int get_buf_page_count(void *buf, int size) (uintptr_t)buf / PAGE_SIZE; } -static void 
smb_direct_destroy_pools(struct smb_direct_transport *transport); +static void smb_direct_destroy_pools(struct smbdirect_socket *sc); static void smb_direct_post_recv_credits(struct work_struct *work); -static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length); -static inline struct smb_direct_transport * -smb_trans_direct_transfort(struct ksmbd_transport *t) -{ - return container_of(t, struct smb_direct_transport, transport); -} - static inline void -*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg) +*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) { return (void *)recvmsg->packet; } -static inline bool is_receive_credit_post_required(int receive_credits, - int avail_recvmsg_count) -{ - return receive_credits <= (smb_direct_receive_credit_max >> 3) && - avail_recvmsg_count >= (receive_credits >> 2); -} - static struct -smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) +smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) { - struct smb_direct_recvmsg *recvmsg = NULL; + struct smbdirect_recv_io *recvmsg = NULL; + unsigned long flags; - spin_lock(&t->recvmsg_queue_lock); - if (!list_empty(&t->recvmsg_queue)) { - recvmsg = list_first_entry(&t->recvmsg_queue, - struct smb_direct_recvmsg, + spin_lock_irqsave(&sc->recv_io.free.lock, flags); + if (!list_empty(&sc->recv_io.free.list)) { + recvmsg = list_first_entry(&sc->recv_io.free.list, + struct smbdirect_recv_io, list); list_del(&recvmsg->list); } - spin_unlock(&t->recvmsg_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); return recvmsg; } -static void put_recvmsg(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) +static void put_recvmsg(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg) { + unsigned long flags; + if (likely(recvmsg->sge.length != 0)) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; } - spin_lock(&t->recvmsg_queue_lock); - list_add(&recvmsg->list, &t->recvmsg_queue); - spin_unlock(&t->recvmsg_queue_lock); + spin_lock_irqsave(&sc->recv_io.free.lock, flags); + list_add(&recvmsg->list, &sc->recv_io.free.list); + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); + + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); } -static void enqueue_reassembly(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg, +static void enqueue_reassembly(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg, int data_length) { - spin_lock(&t->reassembly_queue_lock); - list_add_tail(&recvmsg->list, &t->reassembly_queue); - t->reassembly_queue_length++; + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); + sc->recv_io.reassembly.queue_length++; /* * Make sure reassembly_data_length is updated after list and * reassembly_queue_length are updated. 
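 * The virt_wmb() below provides that ordering; the dequeue side
 * issues the matching read barrier after sampling
 * reassembly_data_length, so a reader that observes the new length
 * also observes the queued list entry.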
On the dequeue side @@ -292,85 +199,228 @@ static void enqueue_reassembly(struct smb_direct_transport *t, * if reassembly_queue_length and list is up to date */ virt_wmb(); - t->reassembly_data_length += data_length; - spin_unlock(&t->reassembly_queue_lock); + sc->recv_io.reassembly.data_length += data_length; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } -static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t) +static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) { - if (!list_empty(&t->reassembly_queue)) - return list_first_entry(&t->reassembly_queue, - struct smb_direct_recvmsg, list); + if (!list_empty(&sc->recv_io.reassembly.list)) + return list_first_entry(&sc->recv_io.reassembly.list, + struct smbdirect_recv_io, list); else return NULL; } +static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) +{ + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + */ + wake_up_all(&sc->status_wait); + wake_up_all(&sc->send_io.credits.wait_queue); + wake_up_all(&sc->send_io.pending.zero_wait_queue); + wake_up_all(&sc->recv_io.reassembly.wait_queue); + wake_up_all(&sc->rw_io.credits.wait_queue); +} + static void smb_direct_disconnect_rdma_work(struct work_struct *work) { - struct smb_direct_transport *t = - container_of(work, struct smb_direct_transport, - disconnect_work); + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, disconnect_work); - if (t->status == SMB_DIRECT_CS_CONNECTED) { - t->status = SMB_DIRECT_CS_DISCONNECTING; - rdma_disconnect(t->cm_id); + /* + * Make sure this and the other work items are not queued + * again; we must not block here, so we avoid + * disable[_delayed]_work_sync(). + */ + disable_work(&sc->disconnect_work); + disable_work(&sc->recv_io.posted.refill_work); + disable_delayed_work(&sc->idle.timer_work); + disable_work(&sc->idle.immediate_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_CONNECTED: + case SMBDIRECT_SOCKET_ERROR: + sc->status = SMBDIRECT_SOCKET_DISCONNECTING; + rdma_disconnect(sc->rdma.cm_id); + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + /* + * rdma_accept() never reached + * RDMA_CM_EVENT_ESTABLISHED + */ + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + break; } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
+ */ + smb_direct_disconnect_wake_up_all(sc); } static void -smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) +smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) { - if (t->status == SMB_DIRECT_CS_CONNECTED) - queue_work(smb_direct_wq, &t->disconnect_work); + /* + * Make sure the other work items (besides disconnect_work) are + * not queued again; we must not block here, so we avoid + * disable[_delayed]_work_sync(). + */ + disable_work(&sc->recv_io.posted.refill_work); + disable_work(&sc->idle.immediate_work); + disable_delayed_work(&sc->idle.timer_work); + + if (sc->first_error == 0) + sc->first_error = -ECONNABORTED; + + switch (sc->status) { + case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: + case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: + case SMBDIRECT_SOCKET_ERROR: + case SMBDIRECT_SOCKET_DISCONNECTING: + case SMBDIRECT_SOCKET_DISCONNECTED: + case SMBDIRECT_SOCKET_DESTROYED: + /* + * Keep the current error status + */ + break; + + case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; + break; + + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: + case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; + break; + + case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: + case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; + break; + + case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: + case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; + break; + + case SMBDIRECT_SOCKET_CREATED: + case SMBDIRECT_SOCKET_CONNECTED: + sc->status = SMBDIRECT_SOCKET_ERROR; + break; + } + + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. 
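+	 * The disconnect_work queued below then issues the actual
+	 * rdma_disconnect() and moves the socket towards
+	 * SMBDIRECT_SOCKET_DISCONNECTED (see
+	 * smb_direct_disconnect_rdma_work() above).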
+ */ + smb_direct_disconnect_wake_up_all(sc); + + queue_work(sc->workqueue, &sc->disconnect_work); } static void smb_direct_send_immediate_work(struct work_struct *work) { - struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, send_immediate_work); + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + smb_direct_post_send_data(sc, NULL, NULL, 0, 0); +} + +static void smb_direct_idle_connection_timer(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.timer_work.work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { + smb_direct_disconnect_rdma_connection(sc); + return; + } - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return; - smb_direct_post_send_data(t, NULL, NULL, 0, 0); + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct ksmbd_conn *conn; t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); if (!t) return NULL; + sc = &t->socket; + smbdirect_socket_init(sc); + sp = &sc->parameters; - t->cm_id = cm_id; - cm_id->context = t; - - t->status = SMB_DIRECT_CS_NEW; - init_waitqueue_head(&t->wait_status); + sc->workqueue = smb_direct_wq; - spin_lock_init(&t->reassembly_queue_lock); - INIT_LIST_HEAD(&t->reassembly_queue); - t->reassembly_data_length = 0; - t->reassembly_queue_length = 0; - init_waitqueue_head(&t->wait_reassembly_queue); - init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_credits); + INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); - spin_lock_init(&t->receive_credit_lock); - spin_lock_init(&t->recvmsg_queue_lock); - INIT_LIST_HEAD(&t->recvmsg_queue); + sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; + sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; + sp->responder_resources = 1; + sp->recv_credit_max = smb_direct_receive_credit_max; + sp->send_credit_target = smb_direct_send_credit_target; + sp->max_send_size = smb_direct_max_send_size; + sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + sp->max_recv_size = smb_direct_max_receive_size; + sp->max_read_write_size = smb_direct_max_read_write_size; + sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; + sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; - init_waitqueue_head(&t->wait_send_pending); - atomic_set(&t->send_pending, 0); + sc->rdma.cm_id = cm_id; + cm_id->context = sc; - spin_lock_init(&t->lock_new_recv_credits); + sc->ib.dev = sc->rdma.cm_id->device; - INIT_DELAYED_WORK(&t->post_recv_credits_work, - smb_direct_post_recv_credits); - INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); - INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); + INIT_WORK(&sc->recv_io.posted.refill_work, + smb_direct_post_recv_credits); + INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + 
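/*
+	 * idle.immediate_work sends an empty SMBDirect message as a
+	 * keepalive probe, while idle.timer_work escalates from the send
+	 * interval to the response timeout and finally to a disconnect
+	 * (see smb_direct_idle_connection_timer() above).
+	 */
+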
INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); conn = ksmbd_conn_alloc(); if (!conn) @@ -391,89 +441,104 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt) static void free_transport(struct smb_direct_transport *t) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_recv_io *recvmsg; - wake_up_interruptible(&t->wait_send_credits); + disable_work_sync(&sc->disconnect_work); + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + smb_direct_disconnect_rdma_work(&sc->disconnect_work); + wait_event_interruptible(sc->status_wait, + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + } - ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + /* + * Wake up all waiters in all wait queues + * in order to notice the broken connection. + * + * Most likely this was already called via + * smb_direct_disconnect_rdma_work(), but call it again... + */ + smb_direct_disconnect_wake_up_all(sc); - cancel_work_sync(&t->disconnect_work); - cancel_delayed_work_sync(&t->post_recv_credits_work); - cancel_work_sync(&t->send_immediate_work); + disable_work_sync(&sc->recv_io.posted.refill_work); + disable_delayed_work_sync(&sc->idle.timer_work); + disable_work_sync(&sc->idle.immediate_work); - if (t->qp) { - ib_drain_qp(t->qp); - ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + ib_drain_qp(sc->ib.qp); + ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs); + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } ksmbd_debug(RDMA, "drain the reassembly queue\n"); do { - spin_lock(&t->reassembly_queue_lock); - recvmsg = get_first_reassembly(t); + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + recvmsg = get_first_reassembly(sc); if (recvmsg) { list_del(&recvmsg->list); - spin_unlock(&t->reassembly_queue_lock); - put_recvmsg(t, recvmsg); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + put_recvmsg(sc, recvmsg); } else { - spin_unlock(&t->reassembly_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } } while (recvmsg); - t->reassembly_data_length = 0; - - if (t->send_cq) - ib_free_cq(t->send_cq); - if (t->recv_cq) - ib_free_cq(t->recv_cq); - if (t->pd) - ib_dealloc_pd(t->pd); - if (t->cm_id) - rdma_destroy_id(t->cm_id); - - smb_direct_destroy_pools(t); + sc->recv_io.reassembly.data_length = 0; + + if (sc->ib.send_cq) + ib_free_cq(sc->ib.send_cq); + if (sc->ib.recv_cq) + ib_free_cq(sc->ib.recv_cq); + if (sc->ib.pd) + ib_dealloc_pd(sc->ib.pd); + if (sc->rdma.cm_id) + rdma_destroy_id(sc->rdma.cm_id); + + smb_direct_destroy_pools(sc); ksmbd_conn_free(KSMBD_TRANS(t)->conn); } -static struct smb_direct_sendmsg -*smb_direct_alloc_sendmsg(struct smb_direct_transport *t) +static struct smbdirect_send_io +*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) { - struct smb_direct_sendmsg *msg; + struct smbdirect_send_io *msg; - msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP); + msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); if (!msg) return ERR_PTR(-ENOMEM); - msg->transport = t; - INIT_LIST_HEAD(&msg->list); + msg->socket = sc; + INIT_LIST_HEAD(&msg->sibling_list); msg->num_sge = 0; return msg; } -static void smb_direct_free_sendmsg(struct smb_direct_transport *t, - struct smb_direct_sendmsg *msg) +static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, + struct smbdirect_send_io 
*msg) { int i; if (msg->num_sge > 0) { - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, msg->sge[0].addr, msg->sge[0].length, DMA_TO_DEVICE); for (i = 1; i < msg->num_sge; i++) - ib_dma_unmap_page(t->cm_id->device, + ib_dma_unmap_page(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); } - mempool_free(msg, t->sendmsg_mempool); + mempool_free(msg, sc->send_io.mem.pool); } -static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) +static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) { - switch (recvmsg->type) { - case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *req = - (struct smb_direct_data_transfer *)recvmsg->packet; + struct smbdirect_socket *sc = recvmsg->socket; + + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_DATA_TRANSFER: { + struct smbdirect_data_transfer *req = + (struct smbdirect_data_transfer *)recvmsg->packet; struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet + le32_to_cpu(req->data_offset)); ksmbd_debug(RDMA, @@ -482,11 +547,11 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) le16_to_cpu(req->credits_requested), req->data_length, req->remaining_data_length, hdr->ProtocolId, hdr->Command); - break; + return 0; } - case SMB_DIRECT_MSG_NEGOTIATE_REQ: { - struct smb_direct_negotiate_req *req = - (struct smb_direct_negotiate_req *)recvmsg->packet; + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { + struct smbdirect_negotiate_req *req = + (struct smbdirect_negotiate_req *)recvmsg->packet; ksmbd_debug(RDMA, "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", le16_to_cpu(req->min_version), @@ -504,29 +569,34 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) 128 * 1024) return -ECONNABORTED; - break; + return 0; } - default: - return -EINVAL; + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } - return 0; + + /* This is an internal error */ + return -EINVAL; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_transport *t; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; - recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); - t = recvmsg->transport; + recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); + sc = recvmsg->socket; + sp = &sc->parameters; if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); if (wc->status != IB_WC_WR_FLUSH_ERR) { pr_err("Recv error. status='%s (%d)' opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } return; } @@ -538,108 +608,128 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); - switch (recvmsg->type) { - case SMB_DIRECT_MSG_NEGOTIATE_REQ: - if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message. 
+ */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + + switch (sc->recv_io.expected) { + case SMBDIRECT_EXPECT_NEGOTIATE_REQ: + if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); return; } - t->negotiation_requested = true; - t->full_packet_received = true; - t->status = SMB_DIRECT_CS_CONNECTED; - enqueue_reassembly(t, recvmsg, 0); - wake_up_interruptible(&t->wait_status); + sc->recv_io.reassembly.full_packet_received = true; + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + enqueue_reassembly(sc, recvmsg, 0); + wake_up(&sc->status_wait); return; - case SMB_DIRECT_MSG_DATA_TRANSFER: { - struct smb_direct_data_transfer *data_transfer = - (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_length; - int avail_recvmsg_count, receive_credits; + case SMBDIRECT_EXPECT_DATA_TRANSFER: { + struct smbdirect_data_transfer *data_transfer = + (struct smbdirect_data_transfer *)recvmsg->packet; + u32 remaining_data_length, data_offset, data_length; + u16 old_recv_credit_target; if (wc->byte_len < - offsetof(struct smb_direct_data_transfer, padding)) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + offsetof(struct smbdirect_data_transfer, padding)) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); return; } + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); data_length = le32_to_cpu(data_transfer->data_length); - if (data_length) { - if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + - (u64)data_length) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); - return; - } + data_offset = le32_to_cpu(data_transfer->data_offset); + if (wc->byte_len < data_offset || + wc->byte_len < (u64)data_offset + data_length) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); + return; + } + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > + (u64)sp->max_fragmented_recv_size) { + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); + return; + } - if (t->full_packet_received) + if (data_length) { + if (sc->recv_io.reassembly.full_packet_received) recvmsg->first_segment = true; if (le32_to_cpu(data_transfer->remaining_data_length)) - t->full_packet_received = false; + sc->recv_io.reassembly.full_packet_received = false; else - t->full_packet_received = true; - - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = t->count_avail_recvmsg; - spin_unlock(&t->receive_credit_lock); - } else { - spin_lock(&t->receive_credit_lock); - receive_credits = --(t->recv_credits); - avail_recvmsg_count = ++(t->count_avail_recvmsg); - spin_unlock(&t->receive_credit_lock); + sc->recv_io.reassembly.full_packet_received = true; } - t->recv_credit_target = + atomic_dec(&sc->recv_io.posted.count); + atomic_dec(&sc->recv_io.credits.count); + + old_recv_credit_target = sc->recv_io.credits.target; + sc->recv_io.credits.target = le16_to_cpu(data_transfer->credits_requested); + sc->recv_io.credits.target = + min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = + max_t(u16, sc->recv_io.credits.target, 1); 
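+ /* + * The credits granted by the peer become available for our + * future sends; the send credit waiters are woken below once + * the counter has been updated. + */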
atomic_add(le16_to_cpu(data_transfer->credits_granted), - &t->send_credits); + &sc->send_io.credits.count); if (le16_to_cpu(data_transfer->flags) & - SMB_DIRECT_RESPONSE_REQUESTED) - queue_work(smb_direct_wq, &t->send_immediate_work); - - if (atomic_read(&t->send_credits) > 0) - wake_up_interruptible(&t->wait_send_credits); + SMBDIRECT_FLAG_RESPONSE_REQUESTED) + queue_work(sc->workqueue, &sc->idle.immediate_work); - if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) - mod_delayed_work(smb_direct_wq, - &t->post_recv_credits_work, 0); + if (atomic_read(&sc->send_io.credits.count) > 0) + wake_up(&sc->send_io.credits.wait_queue); if (data_length) { - enqueue_reassembly(t, recvmsg, (int)data_length); - wake_up_interruptible(&t->wait_reassembly_queue); + if (sc->recv_io.credits.target > old_recv_credit_target) + queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); + + enqueue_reassembly(sc, recvmsg, (int)data_length); + wake_up(&sc->recv_io.reassembly.wait_queue); } else - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return; } + case SMBDIRECT_EXPECT_NEGOTIATE_REP: + /* client only */ + break; } /* * This is an internal error! */ - WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER); - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); + WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); + put_recvmsg(sc, recvmsg); + smb_direct_disconnect_rdma_connection(sc); } -static int smb_direct_post_recv(struct smb_direct_transport *t, - struct smb_direct_recvmsg *recvmsg) +static int smb_direct_post_recv(struct smbdirect_socket *sc, + struct smbdirect_recv_io *recvmsg) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr wr; int ret; - recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device, - recvmsg->packet, t->max_recv_size, + recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, + recvmsg->packet, + sp->max_recv_size, DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr); + ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); if (ret) return ret; - recvmsg->sge.length = t->max_recv_size; - recvmsg->sge.lkey = t->pd->local_dma_lkey; + recvmsg->sge.length = sp->max_recv_size; + recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; recvmsg->cqe.done = recv_done; wr.wr_cqe = &recvmsg->cqe; @@ -647,14 +737,14 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, wr.sg_list = &recvmsg->sge; wr.num_sge = 1; - ret = ib_post_recv(t->qp, &wr, NULL); + ret = ib_post_recv(sc->ib.qp, &wr, NULL); if (ret) { pr_err("Can't post recv: %d\n", ret); - ib_dma_unmap_single(t->cm_id->device, + ib_dma_unmap_single(sc->ib.dev, recvmsg->sge.addr, recvmsg->sge.length, DMA_FROM_DEVICE); recvmsg->sge.length = 0; - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); return ret; } return ret; @@ -663,15 +753,16 @@ static int smb_direct_post_recv(struct smb_direct_transport *t, static int smb_direct_read(struct ksmbd_transport *t, char *buf, unsigned int size, int unused) { - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_data_transfer *data_transfer; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_data_transfer *data_transfer; int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; again: - if (st->status != 
SMB_DIRECT_CS_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { pr_err("disconnected\n"); return -ENOTCONN; } @@ -681,9 +772,10 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - if (st->reassembly_data_length >= size) { + if (sc->recv_io.reassembly.data_length >= size) { int queue_length; int queue_removed = 0; + unsigned long flags; /* * Need to make sure reassembly_data_length is read before @@ -693,13 +785,13 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, * updated in SOFTIRQ as more data is received */ virt_rmb(); - queue_length = st->reassembly_queue_length; + queue_length = sc->recv_io.reassembly.queue_length; data_read = 0; to_read = size; - offset = st->first_entry_offset; + offset = sc->recv_io.reassembly.first_entry_offset; while (data_read < size) { - recvmsg = get_first_reassembly(st); - data_transfer = smb_direct_recvmsg_payload(recvmsg); + recvmsg = get_first_reassembly(sc); + data_transfer = smbdirect_recv_io_payload(recvmsg); data_length = le32_to_cpu(data_transfer->data_length); remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); @@ -739,12 +831,12 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, if (queue_length) { list_del(&recvmsg->list); } else { - spin_lock_irq(&st->reassembly_queue_lock); + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); } queue_removed++; - put_recvmsg(st, recvmsg); + put_recvmsg(sc, recvmsg); offset = 0; } else { offset += to_copy; @@ -754,34 +846,24 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, data_read += to_copy; } - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_data_length -= data_read; - st->reassembly_queue_length -= queue_removed; - spin_unlock_irq(&st->reassembly_queue_lock); - - spin_lock(&st->receive_credit_lock); - st->count_avail_recvmsg += queue_removed; - if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { - spin_unlock(&st->receive_credit_lock); - mod_delayed_work(smb_direct_wq, - &st->post_recv_credits_work, 0); - } else { - spin_unlock(&st->receive_credit_lock); - } + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.data_length -= data_read; + sc->recv_io.reassembly.queue_length -= queue_removed; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - st->first_entry_offset = offset; + sc->recv_io.reassembly.first_entry_offset = offset; ksmbd_debug(RDMA, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", - data_read, st->reassembly_data_length, - st->first_entry_offset); + data_read, sc->recv_io.reassembly.data_length, + sc->recv_io.reassembly.first_entry_offset); read_rfc1002_done: return data_read; } ksmbd_debug(RDMA, "wait_event on more data\n"); - rc = wait_event_interruptible(st->wait_reassembly_queue, - st->reassembly_data_length >= size || - st->status != SMB_DIRECT_CS_CONNECTED); + rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, + sc->recv_io.reassembly.data_length >= size || + sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) return -EINTR; @@ -790,56 +872,44 @@ static int smb_direct_read(struct ksmbd_transport *t, char *buf, static void smb_direct_post_recv_credits(struct work_struct *work) { - 
struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, post_recv_credits_work.work); - struct smb_direct_recvmsg *recvmsg; - int receive_credits, credits = 0; + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); + struct smbdirect_recv_io *recvmsg; + int credits = 0; int ret; - spin_lock(&t->receive_credit_lock); - receive_credits = t->recv_credits; - spin_unlock(&t->receive_credit_lock); - - if (receive_credits < t->recv_credit_target) { + if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { while (true) { - recvmsg = get_free_recvmsg(t); + recvmsg = get_free_recvmsg(sc); if (!recvmsg) break; - recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER; recvmsg->first_segment = false; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); break; } credits++; + + atomic_inc(&sc->recv_io.posted.count); } } - spin_lock(&t->receive_credit_lock); - t->recv_credits += credits; - t->count_avail_recvmsg -= credits; - spin_unlock(&t->receive_credit_lock); - - spin_lock(&t->lock_new_recv_credits); - t->new_recv_credits += credits; - spin_unlock(&t->lock_new_recv_credits); - if (credits) - queue_work(smb_direct_wq, &t->send_immediate_work); + queue_work(sc->workqueue, &sc->idle.immediate_work); } static void send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smb_direct_sendmsg *sendmsg, *sibling; - struct smb_direct_transport *t; + struct smbdirect_send_io *sendmsg, *sibling; + struct smbdirect_socket *sc; struct list_head *pos, *prev, *end; - sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe); - t = sendmsg->transport; + sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); + sc = sendmsg->socket; ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, @@ -849,55 +919,78 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("Send error. status='%s (%d)', opcode=%d\n", ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); /* iterate and free the list of messages in reverse. the list's head * is invalid. 
*/ - for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next; + for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next; prev != end; pos = prev, prev = prev->prev) { - sibling = container_of(pos, struct smb_direct_sendmsg, list); - smb_direct_free_sendmsg(t, sibling); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); + smb_direct_free_sendmsg(sc, sibling); } - sibling = container_of(pos, struct smb_direct_sendmsg, list); - smb_direct_free_sendmsg(t, sibling); + sibling = container_of(pos, struct smbdirect_send_io, sibling_list); + smb_direct_free_sendmsg(sc, sibling); } -static int manage_credits_prior_sending(struct smb_direct_transport *t) +static int manage_credits_prior_sending(struct smbdirect_socket *sc) { int new_credits; - spin_lock(&t->lock_new_recv_credits); - new_credits = t->new_recv_credits; - t->new_recv_credits = 0; - spin_unlock(&t->lock_new_recv_credits); + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) + return 0; + + new_credits = atomic_read(&sc->recv_io.posted.count); + if (new_credits == 0) + return 0; + + new_credits -= atomic_read(&sc->recv_io.credits.count); + if (new_credits <= 0) + return 0; + atomic_add(new_credits, &sc->recv_io.credits.count); return new_credits; } -static int smb_direct_post_send(struct smb_direct_transport *t, +static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) +{ + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + return 1; + } + return 0; +} + +static int smb_direct_post_send(struct smbdirect_socket *sc, struct ib_send_wr *wr) { int ret; - atomic_inc(&t->send_pending); - ret = ib_post_send(t->qp, wr, NULL); + atomic_inc(&sc->send_io.pending.count); + ret = ib_post_send(sc->ib.qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - smb_direct_disconnect_rdma_connection(t); + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); + smb_direct_disconnect_rdma_connection(sc); } return ret; } -static void smb_direct_send_ctx_init(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, bool need_invalidate_rkey, unsigned int remote_key) { @@ -907,47 +1000,50 @@ static void smb_direct_send_ctx_init(struct smb_direct_transport *t, send_ctx->remote_key = remote_key; } -static int smb_direct_flush_send_list(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_flush_send_list(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, bool is_last) { - struct smb_direct_sendmsg *first, *last; + struct smbdirect_send_io *first, *last; int ret; if (list_empty(&send_ctx->msg_list)) return 0; first = list_first_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); + + if (send_ctx->need_invalidate_rkey) { + first->wr.opcode = IB_WR_SEND_WITH_INV; + 
first->wr.ex.invalidate_rkey = send_ctx->remote_key; + send_ctx->need_invalidate_rkey = false; + send_ctx->remote_key = 0; + } last->wr.send_flags = IB_SEND_SIGNALED; last->wr.wr_cqe = &last->cqe; - if (is_last && send_ctx->need_invalidate_rkey) { - last->wr.opcode = IB_WR_SEND_WITH_INV; - last->wr.ex.invalidate_rkey = send_ctx->remote_key; - } - ret = smb_direct_post_send(t, &first->wr); + ret = smb_direct_post_send(sc, &first->wr); if (!ret) { - smb_direct_send_ctx_init(t, send_ctx, + smb_direct_send_ctx_init(send_ctx, send_ctx->need_invalidate_rkey, send_ctx->remote_key); } else { - atomic_add(send_ctx->wr_cnt, &t->send_credits); - wake_up(&t->wait_send_credits); + atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count); + wake_up(&sc->send_io.credits.wait_queue); list_for_each_entry_safe(first, last, &send_ctx->msg_list, - list) { - smb_direct_free_sendmsg(t, first); + sibling_list) { + smb_direct_free_sendmsg(sc, first); } } return ret; } -static int wait_for_credits(struct smb_direct_transport *t, +static int wait_for_credits(struct smbdirect_socket *sc, wait_queue_head_t *waitq, atomic_t *total_credits, int needed) { @@ -960,61 +1056,68 @@ static int wait_for_credits(struct smb_direct_transport *t, atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, atomic_read(total_credits) >= needed || - t->status != SMB_DIRECT_CS_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; else if (ret < 0) return ret; } while (true); } -static int wait_for_send_credits(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx) +static int wait_for_send_credits(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx) { int ret; if (send_ctx && - (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) { - ret = smb_direct_flush_send_list(t, send_ctx, false); + (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { + ret = smb_direct_flush_send_list(sc, send_ctx, false); if (ret) return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); + return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); } -static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) { - return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); + return wait_for_credits(sc, + &sc->rw_io.credits.wait_queue, + &sc->rw_io.credits.count, + credits); } -static int calc_rw_credits(struct smb_direct_transport *t, +static int calc_rw_credits(struct smbdirect_socket *sc, char *buf, unsigned int len) { return DIV_ROUND_UP(get_buf_page_count(buf, len), - t->pages_per_rw_credit); + sc->rw_io.credits.num_pages); } -static int smb_direct_create_header(struct smb_direct_transport *t, +static int smb_direct_create_header(struct smbdirect_socket *sc, int size, int remaining_data_length, - struct smb_direct_sendmsg **sendmsg_out) + struct smbdirect_send_io **sendmsg_out) { - struct smb_direct_sendmsg *sendmsg; - struct smb_direct_data_transfer *packet; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_send_io *sendmsg; + struct smbdirect_data_transfer *packet; int header_length; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return PTR_ERR(sendmsg); /* Fill in the packet header */ - packet = (struct 
smb_direct_data_transfer *)sendmsg->packet; - packet->credits_requested = cpu_to_le16(t->send_credit_target); - packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + packet = (struct smbdirect_data_transfer *)sendmsg->packet; + packet->credits_requested = cpu_to_le16(sp->send_credit_target); + packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); packet->flags = 0; + if (manage_keep_alive_before_sending(sc)) + packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); + packet->reserved = 0; if (!size) packet->data_offset = 0; @@ -1033,25 +1136,25 @@ static int smb_direct_create_header(struct smb_direct_transport *t, le32_to_cpu(packet->remaining_data_length)); /* Map the packet to DMA */ - header_length = sizeof(struct smb_direct_data_transfer); + header_length = sizeof(struct smbdirect_data_transfer); /* If this is a packet without payload, don't send padding */ if (!size) header_length = - offsetof(struct smb_direct_data_transfer, padding); + offsetof(struct smbdirect_data_transfer, padding); - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)packet, header_length, DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } sendmsg->num_sge = 1; sendmsg->sge[0].length = header_length; - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; *sendmsg_out = sendmsg; return 0; @@ -1101,14 +1204,14 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, return ib_dma_map_sg(device, sg_list, npages, dir); } -static int post_sendmsg(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, - struct smb_direct_sendmsg *msg) +static int post_sendmsg(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, + struct smbdirect_send_io *msg) { int i; for (i = 0; i < msg->num_sge; i++) - ib_dma_sync_single_for_device(t->cm_id->device, + ib_dma_sync_single_for_device(sc->ib.dev, msg->sge[i].addr, msg->sge[i].length, DMA_TO_DEVICE); @@ -1122,34 +1225,34 @@ static int post_sendmsg(struct smb_direct_transport *t, msg->wr.wr_cqe = NULL; msg->wr.send_flags = 0; if (!list_empty(&send_ctx->msg_list)) { - struct smb_direct_sendmsg *last; + struct smbdirect_send_io *last; last = list_last_entry(&send_ctx->msg_list, - struct smb_direct_sendmsg, - list); + struct smbdirect_send_io, + sibling_list); last->wr.next = &msg->wr; } - list_add_tail(&msg->list, &send_ctx->msg_list); + list_add_tail(&msg->sibling_list, &send_ctx->msg_list); send_ctx->wr_cnt++; return 0; } msg->wr.wr_cqe = &msg->cqe; msg->wr.send_flags = IB_SEND_SIGNALED; - return smb_direct_post_send(t, &msg->wr); + return smb_direct_post_send(sc, &msg->wr); } -static int smb_direct_post_send_data(struct smb_direct_transport *t, - struct smb_direct_send_ctx *send_ctx, +static int smb_direct_post_send_data(struct smbdirect_socket *sc, + struct smbdirect_send_batch *send_ctx, struct kvec *iov, int niov, int remaining_data_length) { int i, j, ret; - struct smb_direct_sendmsg *msg; + struct smbdirect_send_io *msg; int data_length; - struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1]; + struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1]; - ret = wait_for_send_credits(t, send_ctx); + ret = wait_for_send_credits(sc, send_ctx); if (ret) return ret; @@ -1157,10 
+1260,10 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, for (i = 0; i < niov; i++) data_length += iov[i].iov_len; - ret = smb_direct_create_header(t, data_length, remaining_data_length, + ret = smb_direct_create_header(sc, data_length, remaining_data_length, &msg); if (ret) { - atomic_inc(&t->send_credits); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1168,19 +1271,19 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, struct ib_sge *sge; int sg_cnt; - sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1); - sg_cnt = get_mapped_sg_list(t->cm_id->device, + sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1); + sg_cnt = get_mapped_sg_list(sc->ib.dev, iov[i].iov_base, iov[i].iov_len, - sg, SMB_DIRECT_MAX_SEND_SGES - 1, + sg, SMBDIRECT_SEND_IO_MAX_SGE - 1, DMA_TO_DEVICE); if (sg_cnt <= 0) { pr_err("failed to map buffer\n"); ret = -ENOMEM; goto err; - } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) { + } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) { pr_err("buffer not fitted into sges\n"); ret = -E2BIG; - ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt, + ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt, DMA_TO_DEVICE); goto err; } @@ -1189,18 +1292,18 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, sge = &msg->sge[msg->num_sge]; sge->addr = sg_dma_address(&sg[j]); sge->length = sg_dma_len(&sg[j]); - sge->lkey = t->pd->local_dma_lkey; + sge->lkey = sc->ib.pd->local_dma_lkey; msg->num_sge++; } } - ret = post_sendmsg(t, send_ctx, msg); + ret = post_sendmsg(sc, send_ctx, msg); if (ret) goto err; return 0; err: - smb_direct_free_sendmsg(t, msg); - atomic_inc(&t->send_credits); + smb_direct_free_sendmsg(sc, msg); + atomic_inc(&sc->send_io.credits.count); return ret; } @@ -1208,79 +1311,133 @@ static int smb_direct_writev(struct ksmbd_transport *t, struct kvec *iov, int niovs, int buflen, bool need_invalidate, unsigned int remote_key) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); - int remaining_data_length; - int start, i, j; - int max_iov_size = st->max_send_size - - sizeof(struct smb_direct_data_transfer); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + size_t remaining_data_length; + size_t iov_idx; + size_t iov_ofs; + size_t max_iov_size = sp->max_send_size - + sizeof(struct smbdirect_data_transfer); int ret; - struct kvec vec; - struct smb_direct_send_ctx send_ctx; + struct smbdirect_send_batch send_ctx; + int error = 0; - if (st->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. 
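+ /* + * iov[0] only carries the 4 byte transport length prefix, + * which is not transmitted over SMB Direct, so the actual + * payload starts at iov[1]. + */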
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) + return -EINVAL; buflen -= 4; + iov_idx = 1; + iov_ofs = 0; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); - smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 1; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen - iov[i].iov_len); - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) + smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key); + while (remaining_data_length) { + struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */ + size_t possible_bytes = max_iov_size; + size_t possible_vecs; + size_t bytes = 0; + size_t nvecs = 0; + + /* + * For the last message remaining_data_length should + * have been 0 already! + */ + if (WARN_ON_ONCE(iov_idx >= niovs)) { + error = -EINVAL; + goto done; + } + + /* + * We have 2 factors which limit the arguments we pass + * to smb_direct_post_send_data(): + * + * 1. The number of supported sges for the send, + * while one is reserved for the smbdirect header. + * And we currently need one SGE per page. + * 2. The number of negotiated payload bytes per send. + */ + possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx); + + while (iov_idx < niovs && possible_vecs && possible_bytes) { + struct kvec *v = &vecs[nvecs]; + int page_count; + + v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs; + v->iov_len = min_t(size_t, + iov[iov_idx].iov_len - iov_ofs, + possible_bytes); + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (page_count > possible_vecs) { + /* + * If the number of pages in the buffer + * is too large (because we currently require + * one SGE per page), we need to limit the + * length. + * + * We know possible_vecs is at least 1, + * so we always keep the first page. + * + * We need to calculate the number of extra + * pages (epages) we can also keep. + * + * We calculate the number of bytes in the + * first page (fplen); this should never be + * larger than v->iov_len because page_count is + * at least 2, but adding a limitation feels + * better. + * + * Then we calculate the number of bytes (elen) + * we can keep for the extra pages. + */ + size_t epages = possible_vecs - 1; + size_t fpofs = offset_in_page(v->iov_base); + size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len); + size_t elen = min_t(size_t, v->iov_len - fplen, epages * PAGE_SIZE); + + v->iov_len = fplen + elen; + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (WARN_ON_ONCE(page_count > possible_vecs)) { + /* + * Something went wrong in the above + * logic...
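+ * By construction fplen + elen should span + * at most 1 + epages == possible_vecs pages, + * so this is not expected to be reachable.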
+ */ + error = -EINVAL; goto done; - } else { - /* iov[start] is too big, break it */ - int nvec = (buflen + max_iov_size - 1) / - max_iov_size; - - for (j = 0; j < nvec; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j * max_iov_size; - vec.iov_len = - min_t(int, max_iov_size, - buflen - max_iov_size * j); - remaining_data_length -= vec.iov_len; - ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, - remaining_data_length); - if (ret) - goto done; } - i++; - if (i == niovs) - break; } - start = i; - buflen = 0; - } else { - i++; - if (i == niovs) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) - goto done; - break; + possible_vecs -= page_count; + nvecs += 1; + possible_bytes -= v->iov_len; + bytes += v->iov_len; + + iov_ofs += v->iov_len; + if (iov_ofs >= iov[iov_idx].iov_len) { + iov_idx += 1; + iov_ofs = 0; } } + + remaining_data_length -= bytes; + + ret = smb_direct_post_send_data(sc, &send_ctx, + vecs, nvecs, + remaining_data_length); + if (unlikely(ret)) { + error = ret; + goto done; + } } done: - ret = smb_direct_flush_send_list(st, &send_ctx, true); + ret = smb_direct_flush_send_list(sc, &send_ctx, true); + if (unlikely(!ret && error)) + ret = error; /* * As an optimization, we don't wait for individual I/O to finish @@ -1289,16 +1446,22 @@ static int smb_direct_writev(struct ksmbd_transport *t, * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_pending, - atomic_read(&st->send_pending) == 0); + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0) + ret = -ENOTCONN; + return ret; } static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, - struct smb_direct_rdma_rw_msg *msg, + struct smbdirect_rw_io *msg, enum dma_data_direction dir) { - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + struct smbdirect_socket *sc = &t->socket; + + rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, msg->sgt.nents, dir); sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); kfree(msg); @@ -1307,16 +1470,16 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { - struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe, - struct smb_direct_rdma_rw_msg, cqe); - struct smb_direct_transport *t = msg->t; + struct smbdirect_rw_io *msg = + container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe); + struct smbdirect_socket *sc = msg->socket; if (wc->status != IB_WC_SUCCESS) { - msg->status = -EIO; + msg->error = -EIO; pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); if (wc->status != IB_WC_WR_FLUSH_ERR) - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); } complete(msg->completion); @@ -1334,11 +1497,13 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, int buf_len, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len, bool is_read) { - struct smb_direct_rdma_rw_msg *msg, *next_msg; + struct smbdirect_socket *sc = &t->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_rw_io *msg, *next_msg; int i, ret; DECLARE_COMPLETION_ONSTACK(completion); struct ib_send_wr *first_wr; @@ -1347,10 +1512,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, int credits_needed; unsigned int desc_buf_len, desc_num = 0; - if (t->status != SMB_DIRECT_CS_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -ENOTCONN; - if (buf_len > t->max_rdma_rw_size) + if (buf_len > sp->max_read_write_size) return -EINVAL; /* calculate needed credits */ @@ -1370,7 +1535,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, buf_len = 0; } - credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len); desc_buf += desc_buf_len; buf_len -= desc_buf_len; desc_num++; @@ -1379,7 +1544,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", str_read_write(is_read), buf_len, credits_needed); - ret = wait_for_rw_credits(t, credits_needed); + ret = wait_for_rw_credits(sc, credits_needed); if (ret < 0) return ret; @@ -1395,7 +1560,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, desc_buf_len = le32_to_cpu(desc[i].length); - msg->t = t; + msg->socket = sc; msg->cqe.done = is_read ? read_done : write_done; msg->completion = &completion; @@ -1417,7 +1582,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, goto out; } - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, get_buf_page_count(desc_buf, desc_buf_len), 0, @@ -1438,96 +1603,94 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* concatenate work requests of rdma_rw_ctxs */ first_wr = NULL; list_for_each_entry_reverse(msg, &msg_list, list) { - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, &msg->cqe, first_wr); } - ret = ib_post_send(t->qp, first_wr, NULL); + ret = ib_post_send(sc->ib.qp, first_wr, NULL); if (ret) { pr_err("failed to post send wr for RDMA R/W: %d\n", ret); goto out; } - msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); + msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list); wait_for_completion(&completion); - ret = msg->status; + ret = msg->error; out: list_for_each_entry_safe(msg, next_msg, &msg_list, list) { list_del(&msg->list); smb_direct_free_rdma_rw_msg(t, msg, is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); } - atomic_add(credits_needed, &t->rw_credits); - wake_up(&t->wait_rw_credits); + atomic_add(credits_needed, &sc->rw_io.credits.count); + wake_up(&sc->rw_io.credits.wait_queue); return ret; } static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, false); } static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, unsigned int buflen, - struct smb2_buffer_desc_v1 *desc, + struct smbdirect_buffer_descriptor_v1 *desc, unsigned int desc_len) { - return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen, desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); - wait_event_interruptible(st->wait_status, - st->status == SMB_DIRECT_CS_DISCONNECTED); free_transport(st); } static void smb_direct_shutdown(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; - ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id); - smb_direct_disconnect_rdma_work(&st->disconnect_work); + smb_direct_disconnect_rdma_work(&sc->disconnect_work); } static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { - struct smb_direct_transport *t = cm_id->context; + struct smbdirect_socket *sc = cm_id->context; ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", cm_id, rdma_event_msg(event->event), event->event); switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: { - t->status = SMB_DIRECT_CS_CONNECTED; - wake_up_interruptible(&t->wait_status); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING); + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + wake_up(&sc->status_wait); break; } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { - ib_drain_qp(t->qp); + ib_drain_qp(sc->ib.qp); - t->status = SMB_DIRECT_CS_DISCONNECTED; - wake_up_interruptible(&t->wait_status); - wake_up_interruptible(&t->wait_reassembly_queue); - wake_up(&t->wait_send_credits); + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smb_direct_disconnect_rdma_work(&sc->disconnect_work); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { - t->status = SMB_DIRECT_CS_DISCONNECTED; - wake_up_interruptible(&t->wait_status); + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + smb_direct_disconnect_rdma_work(&sc->disconnect_work); break; } default: @@ -1541,38 +1704,41 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, static void smb_direct_qpair_handler(struct ib_event *event, void *context) { - struct smb_direct_transport *t = context; + struct smbdirect_socket *sc = context; ksmbd_debug(RDMA, "Received QP event. 
cm_id=%p, event=%s (%d)\n", - t->cm_id, ib_event_msg(event->event), event->event); + sc->rdma.cm_id, ib_event_msg(event->event), event->event); switch (event->event) { case IB_EVENT_CQ_ERR: case IB_EVENT_QP_FATAL: - smb_direct_disconnect_rdma_connection(t); + smb_direct_disconnect_rdma_connection(sc); break; default: break; } } -static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, +static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc, int failed) { - struct smb_direct_sendmsg *sendmsg; - struct smb_direct_negotiate_resp *resp; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_send_io *sendmsg; + struct smbdirect_negotiate_resp *resp; int ret; - sendmsg = smb_direct_alloc_sendmsg(t); + sendmsg = smb_direct_alloc_sendmsg(sc); if (IS_ERR(sendmsg)) return -ENOMEM; - resp = (struct smb_direct_negotiate_resp *)sendmsg->packet; + resp = (struct smbdirect_negotiate_resp *)sendmsg->packet; if (failed) { memset(resp, 0, sizeof(*resp)); - resp->min_version = cpu_to_le16(0x0100); - resp->max_version = cpu_to_le16(0x0100); + resp->min_version = SMB_DIRECT_VERSION_LE; + resp->max_version = SMB_DIRECT_VERSION_LE; resp->status = STATUS_NOT_SUPPORTED; + + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; } else { resp->status = STATUS_SUCCESS; resp->min_version = SMB_DIRECT_VERSION_LE; @@ -1580,57 +1746,65 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, resp->negotiated_version = SMB_DIRECT_VERSION_LE; resp->reserved = 0; resp->credits_requested = - cpu_to_le16(t->send_credit_target); - resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); - resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size); - resp->preferred_send_size = cpu_to_le32(t->max_send_size); - resp->max_receive_size = cpu_to_le32(t->max_recv_size); + cpu_to_le16(sp->send_credit_target); + resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc)); + resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); + resp->preferred_send_size = cpu_to_le32(sp->max_send_size); + resp->max_receive_size = cpu_to_le32(sp->max_recv_size); resp->max_fragmented_size = - cpu_to_le32(t->max_fragmented_recv_size); + cpu_to_le32(sp->max_fragmented_recv_size); + + sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; + sc->status = SMBDIRECT_SOCKET_CONNECTED; } - sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)resp, sizeof(*resp), DMA_TO_DEVICE); - ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } sendmsg->num_sge = 1; sendmsg->sge[0].length = sizeof(*resp); - sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey; - ret = post_sendmsg(t, NULL, sendmsg); + ret = post_sendmsg(sc, NULL, sendmsg); if (ret) { - smb_direct_free_sendmsg(t, sendmsg); + smb_direct_free_sendmsg(sc, sendmsg); return ret; } - wait_event(t->wait_send_pending, - atomic_read(&t->send_pending) == 0); + wait_event(sc->send_io.pending.zero_wait_queue, + atomic_read(&sc->send_io.pending.count) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return -ENOTCONN; + return 0; } -static int smb_direct_accept_client(struct smb_direct_transport *t) +static int smb_direct_accept_client(struct smbdirect_socket 
*sc) { + struct smbdirect_socket_parameters *sp = &sc->parameters; struct rdma_conn_param conn_param; - struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; + __be32 ird_ord_hdr[2]; int ret; + /* + * smb_direct_handle_connect_request() + * already negotiated sp->initiator_depth + * and sp->responder_resources + */ memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom, - SMB_DIRECT_CM_INITIATOR_DEPTH); - conn_param.responder_resources = 0; - - t->cm_id->device->ops.get_port_immutable(t->cm_id->device, - t->cm_id->port_num, - &port_immutable); - if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = conn_param.responder_resources; - ird_ord_hdr[1] = 1; + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; + + if (sc->rdma.legacy_iwarp) { + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); conn_param.private_data = ird_ord_hdr; conn_param.private_data_len = sizeof(ird_ord_hdr); } else { @@ -1641,7 +1815,17 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; conn_param.flow_control = 0; - ret = rdma_accept(t->cm_id, &conn_param); + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueue, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; + ret = rdma_accept(sc->rdma.cm_id, &conn_param); if (ret) { pr_err("error at rdma_accept: %d\n", ret); return ret; @@ -1649,57 +1833,60 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) return 0; } -static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { + struct smbdirect_recv_io *recvmsg; int ret; - struct smb_direct_recvmsg *recvmsg; - recvmsg = get_free_recvmsg(t); + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; + + recvmsg = get_free_recvmsg(sc); if (!recvmsg) return -ENOMEM; - recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ; - ret = smb_direct_post_recv(t, recvmsg); + ret = smb_direct_post_recv(sc, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); goto out_err; } - t->negotiation_requested = false; - ret = smb_direct_accept_client(t); + ret = smb_direct_accept_client(sc); if (ret) { pr_err("Can't accept client\n"); goto out_err; } - smb_direct_post_recv_credits(&t->post_recv_credits_work.work); + smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: - put_recvmsg(t, recvmsg); + put_recvmsg(sc, recvmsg); return ret; } -static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc) { return min_t(unsigned int, - t->cm_id->device->attrs.max_fast_reg_page_list_len, + sc->ib.dev->attrs.max_fast_reg_page_list_len, 256); } -static int smb_direct_init_params(struct smb_direct_transport *t, +static int smb_direct_init_params(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { - struct ib_device *device = 
t->cm_id->device; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct ib_device *device = sc->ib.dev; int max_send_sges, max_rw_wrs, max_send_wrs; unsigned int max_sge_per_wr, wrs_per_credit; /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, * SMB2 response could be mapped. */ - t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; - if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { - pr_err("max_send_size %d is too large\n", t->max_send_size); + max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; + if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { + pr_err("max_send_size %d is too large\n", sp->max_send_size); return -EINVAL; } @@ -1710,10 +1897,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, * are needed for MR registration, RDMA R/W, local & remote * MR invalidation. */ - t->max_rdma_rw_size = smb_direct_max_read_write_size; - t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); - t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, - (t->pages_per_rw_credit - 1) * + sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc); + sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size, + (sc->rw_io.credits.num_pages - 1) * PAGE_SIZE); max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, @@ -1721,233 +1907,244 @@ static int smb_direct_init_params(struct smb_direct_transport *t, max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, max_send_sges); wrs_per_credit = max_t(unsigned int, 4, - DIV_ROUND_UP(t->pages_per_rw_credit, + DIV_ROUND_UP(sc->rw_io.credits.num_pages, max_sge_per_wr) + 1); - max_rw_wrs = t->max_rw_credits * wrs_per_credit; + max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit; - max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; + max_send_wrs = sp->send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { pr_err("consider lowering send_credit_target = %d\n", - smb_direct_send_credit_target); + sp->send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; } - if (smb_direct_receive_credit_max > device->attrs.max_cqe || - smb_direct_receive_credit_max > device->attrs.max_qp_wr) { + if (sp->recv_credit_max > device->attrs.max_cqe || + sp->recv_credit_max > device->attrs.max_qp_wr) { pr_err("consider lowering receive_credit_max = %d\n", - smb_direct_receive_credit_max); + sp->recv_credit_max); pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; } - if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { + if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) { + pr_err("warning: device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } + if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); return -EINVAL; } - t->recv_credits = 0; - t->count_avail_recvmsg = 0; - - t->recv_credit_max = smb_direct_receive_credit_max; - t->recv_credit_target = 10; - t->new_recv_credits = 0; - - t->send_credit_target = smb_direct_send_credit_target; - atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_credits, t->max_rw_credits); + sc->recv_io.credits.target = 1; - t->max_send_size = smb_direct_max_send_size; - t->max_recv_size = smb_direct_max_receive_size; - 
t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); cap->max_send_wr = max_send_wrs; - cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = max_sge_per_wr; - cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; + cap->max_recv_wr = sp->recv_credit_max; + cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; + cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; cap->max_inline_data = 0; - cap->max_rdma_ctxs = t->max_rw_credits; + cap->max_rdma_ctxs = sc->rw_io.credits.max; return 0; } -static void smb_direct_destroy_pools(struct smb_direct_transport *t) +static void smb_direct_destroy_pools(struct smbdirect_socket *sc) { - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; - while ((recvmsg = get_free_recvmsg(t))) - mempool_free(recvmsg, t->recvmsg_mempool); + while ((recvmsg = get_free_recvmsg(sc))) + mempool_free(recvmsg, sc->recv_io.mem.pool); - mempool_destroy(t->recvmsg_mempool); - t->recvmsg_mempool = NULL; + mempool_destroy(sc->recv_io.mem.pool); + sc->recv_io.mem.pool = NULL; - kmem_cache_destroy(t->recvmsg_cache); - t->recvmsg_cache = NULL; + kmem_cache_destroy(sc->recv_io.mem.cache); + sc->recv_io.mem.cache = NULL; - mempool_destroy(t->sendmsg_mempool); - t->sendmsg_mempool = NULL; + mempool_destroy(sc->send_io.mem.pool); + sc->send_io.mem.pool = NULL; - kmem_cache_destroy(t->sendmsg_cache); - t->sendmsg_cache = NULL; + kmem_cache_destroy(sc->send_io.mem.cache); + sc->send_io.mem.cache = NULL; } -static int smb_direct_create_pools(struct smb_direct_transport *t) +static int smb_direct_create_pools(struct smbdirect_socket *sc) { + struct smbdirect_socket_parameters *sp = &sc->parameters; char name[80]; int i; - struct smb_direct_recvmsg *recvmsg; + struct smbdirect_recv_io *recvmsg; - snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); - t->sendmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_sendmsg) + - sizeof(struct smb_direct_negotiate_resp), + snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc); + sc->send_io.mem.cache = kmem_cache_create(name, + sizeof(struct smbdirect_send_io) + + sizeof(struct smbdirect_negotiate_resp), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->sendmsg_cache) + if (!sc->send_io.mem.cache) return -ENOMEM; - t->sendmsg_mempool = mempool_create(t->send_credit_target, + sc->send_io.mem.pool = mempool_create(sp->send_credit_target, mempool_alloc_slab, mempool_free_slab, - t->sendmsg_cache); - if (!t->sendmsg_mempool) + sc->send_io.mem.cache); + if (!sc->send_io.mem.pool) goto err; - snprintf(name, sizeof(name), "smb_direct_resp_%p", t); - t->recvmsg_cache = kmem_cache_create(name, - sizeof(struct smb_direct_recvmsg) + - t->max_recv_size, + snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc); + sc->recv_io.mem.cache = kmem_cache_create(name, + sizeof(struct smbdirect_recv_io) + + sp->max_recv_size, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!t->recvmsg_cache) + if (!sc->recv_io.mem.cache) goto err; - t->recvmsg_mempool = - mempool_create(t->recv_credit_max, mempool_alloc_slab, - mempool_free_slab, t->recvmsg_cache); - if (!t->recvmsg_mempool) + sc->recv_io.mem.pool = + mempool_create(sp->recv_credit_max, mempool_alloc_slab, + mempool_free_slab, sc->recv_io.mem.cache); + if (!sc->recv_io.mem.pool) goto err; - INIT_LIST_HEAD(&t->recvmsg_queue); - - for (i = 0; i < t->recv_credit_max; i++) { - recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); + for (i = 0; i < sp->recv_credit_max; i++) { + recvmsg = 
mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; - recvmsg->transport = t; + recvmsg->socket = sc; recvmsg->sge.length = 0; - list_add(&recvmsg->list, &t->recvmsg_queue); + list_add(&recvmsg->list, &sc->recv_io.free.list); } - t->count_avail_recvmsg = t->recv_credit_max; return 0; err: - smb_direct_destroy_pools(t); + smb_direct_destroy_pools(sc); return -ENOMEM; } -static int smb_direct_create_qpair(struct smb_direct_transport *t, +static int smb_direct_create_qpair(struct smbdirect_socket *sc, struct ib_qp_cap *cap) { + struct smbdirect_socket_parameters *sp = &sc->parameters; int ret; struct ib_qp_init_attr qp_attr; int pages_per_rw; - t->pd = ib_alloc_pd(t->cm_id->device, 0); - if (IS_ERR(t->pd)) { + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { pr_err("Can't create RDMA PD\n"); - ret = PTR_ERR(t->pd); - t->pd = NULL; + ret = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; return ret; } - t->send_cq = ib_alloc_cq(t->cm_id->device, t, - smb_direct_send_credit_target + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->send_cq)) { + sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->send_credit_target + + cap->max_rdma_ctxs, + IB_POLL_WORKQUEUE); + if (IS_ERR(sc->ib.send_cq)) { pr_err("Can't create RDMA send CQ\n"); - ret = PTR_ERR(t->send_cq); - t->send_cq = NULL; + ret = PTR_ERR(sc->ib.send_cq); + sc->ib.send_cq = NULL; goto err; } - t->recv_cq = ib_alloc_cq(t->cm_id->device, t, - t->recv_credit_max, 0, IB_POLL_WORKQUEUE); - if (IS_ERR(t->recv_cq)) { + sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, + sp->recv_credit_max, + IB_POLL_WORKQUEUE); + if (IS_ERR(sc->ib.recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); - ret = PTR_ERR(t->recv_cq); - t->recv_cq = NULL; + ret = PTR_ERR(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; goto err; } memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smb_direct_qpair_handler; - qp_attr.qp_context = t; + qp_attr.qp_context = sc; qp_attr.cap = *cap; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = t->send_cq; - qp_attr.recv_cq = t->recv_cq; + qp_attr.send_cq = sc->ib.send_cq; + qp_attr.recv_cq = sc->ib.recv_cq; qp_attr.port_num = ~0; - ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr); + ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); if (ret) { pr_err("Can't create RDMA QP: %d\n", ret); goto err; } - t->qp = t->cm_id->qp; - t->cm_id->event_handler = smb_direct_cm_handler; + sc->ib.qp = sc->rdma.cm_id->qp; + sc->rdma.cm_id->event_handler = smb_direct_cm_handler; - pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, - t->max_rw_credits, IB_MR_TYPE_MEM_REG, - t->pages_per_rw_credit, 0); + pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1; + if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) { + ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs, + sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG, + sc->rw_io.credits.num_pages, 0); if (ret) { - pr_err("failed to init mr pool count %d pages %d\n", - t->max_rw_credits, t->pages_per_rw_credit); + pr_err("failed to init mr pool count %zu pages %zu\n", + sc->rw_io.credits.max, sc->rw_io.credits.num_pages); goto err; } } return 0; err: - if (t->qp) { - t->qp = NULL; - rdma_destroy_qp(t->cm_id); + if (sc->ib.qp) { + sc->ib.qp = NULL; + rdma_destroy_qp(sc->rdma.cm_id); } - if (t->recv_cq) { - ib_destroy_cq(t->recv_cq); - t->recv_cq = NULL; + if (sc->ib.recv_cq) { + 
ib_destroy_cq(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; } - if (t->send_cq) { - ib_destroy_cq(t->send_cq); - t->send_cq = NULL; + if (sc->ib.send_cq) { + ib_destroy_cq(sc->ib.send_cq); + sc->ib.send_cq = NULL; } - if (t->pd) { - ib_dealloc_pd(t->pd); - t->pd = NULL; + if (sc->ib.pd) { + ib_dealloc_pd(sc->ib.pd); + sc->ib.pd = NULL; } return ret; } static int smb_direct_prepare(struct ksmbd_transport *t) { - struct smb_direct_transport *st = smb_trans_direct_transfort(t); - struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; + struct smb_direct_transport *st = SMBD_TRANS(t); + struct smbdirect_socket *sc = &st->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recvmsg; + struct smbdirect_negotiate_req *req; + unsigned long flags; int ret; + /* + * We are waiting to pass the following states: + * + * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED + * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING + * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED + * + * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING + * in order to continue below. + * + * Everything else is unexpected and an error. + */ ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(st->wait_status, - st->negotiation_requested || - st->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + ret = wait_event_interruptible_timeout(sc->status_wait, + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED && + sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING && + sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) return ret < 0 ? 
ret : -ETIMEDOUT; - recvmsg = get_first_reassembly(st); + recvmsg = get_first_reassembly(sc); if (!recvmsg) return -ECONNABORTED; @@ -1955,51 +2152,54 @@ static int smb_direct_prepare(struct ksmbd_transport *t) if (ret == -ECONNABORTED) goto out; - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - st->max_recv_size = min_t(int, st->max_recv_size, + req = (struct smbdirect_negotiate_req *)recvmsg->packet; + sp->max_recv_size = min_t(int, sp->max_recv_size, le32_to_cpu(req->preferred_send_size)); - st->max_send_size = min_t(int, st->max_send_size, + sp->max_send_size = min_t(int, sp->max_send_size, le32_to_cpu(req->max_receive_size)); - st->max_fragmented_send_size = + sp->max_fragmented_send_size = le32_to_cpu(req->max_fragmented_size); - st->max_fragmented_recv_size = - (st->recv_credit_max * st->max_recv_size) / 2; + sp->max_fragmented_recv_size = + (sp->recv_credit_max * sp->max_recv_size) / 2; + sc->recv_io.credits.target = le16_to_cpu(req->credits_requested); + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); + sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); - ret = smb_direct_send_negotiate_response(st, ret); + ret = smb_direct_send_negotiate_response(sc, ret); out: - spin_lock_irq(&st->reassembly_queue_lock); - st->reassembly_queue_length--; + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); - spin_unlock_irq(&st->reassembly_queue_lock); - put_recvmsg(st, recvmsg); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + put_recvmsg(sc, recvmsg); return ret; } -static int smb_direct_connect(struct smb_direct_transport *st) +static int smb_direct_connect(struct smbdirect_socket *sc) { - int ret; struct ib_qp_cap qp_cap; + int ret; - ret = smb_direct_init_params(st, &qp_cap); + ret = smb_direct_init_params(sc, &qp_cap); if (ret) { pr_err("Can't configure RDMA parameters\n"); return ret; } - ret = smb_direct_create_pools(st); + ret = smb_direct_create_pools(sc); if (ret) { pr_err("Can't init RDMA pool: %d\n", ret); return ret; } - ret = smb_direct_create_qpair(st, &qp_cap); + ret = smb_direct_create_qpair(sc, &qp_cap); if (ret) { pr_err("Can't accept RDMA client: %d\n", ret); return ret; } - ret = smb_direct_prepare_negotiation(st); + ret = smb_direct_prepare_negotiation(sc); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; @@ -2016,10 +2216,15 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) return true; } -static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) +static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id, + struct rdma_cm_event *event) { struct smb_direct_transport *t; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct task_struct *handler; + u8 peer_initiator_depth; + u8 peer_responder_resources; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2032,8 +2237,71 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) t = alloc_transport(new_cm_id); if (!t) return -ENOMEM; + sc = &t->socket; + sp = &sc->parameters; + + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) && + event->param.conn.private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. 
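
Aside: the MPA v1 private-data blob handled below is just two big-endian 32-bit words, IRD then ORD. A self-contained sketch of the parse-and-clamp step, with a hypothetical helper name (the patch itself does this inline):

/* Parse a legacy iWarp MPA v1 IRD/ORD blob (illustrative only). */
static void parse_legacy_ird_ord(const void *private_data,
				 u8 *ird_out, u8 *ord_out)
{
	const __be32 *hdr = private_data;
	u32 ird32 = be32_to_cpu(hdr[0]);
	u32 ord32 = be32_to_cpu(hdr[1]);

	/* struct rdma_conn_param only carries u8, so clamp to U8_MAX. */
	*ird_out = min_t(u32, ird32, U8_MAX);
	*ord_out = min_t(u32, ord32, U8_MAX);
}

The hunk below additionally compares the raw 32-bit values against the MPA v2 fields, so clients whose values match are not marked as legacy.
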
+ */ + const __be32 *ird_ord_hdr = event->param.conn.private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * even if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)event->param.conn.initiator_depth != ird32 || + (u32)event->param.conn.responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. + */ + ird32 = min_t(u32, ird32, U8_MAX); + ord32 = min_t(u32, ord32, U8_MAX); + + sc->rdma.legacy_iwarp = true; + peer_initiator_depth = (u8)ird32; + peer_responder_resources = (u8)ord32; + } + } + + /* + * First set what we as the server are able to support. + */ + sp->initiator_depth = min_t(u8, sp->initiator_depth, + new_cm_id->device->attrs.max_qp_rd_atom); - ret = smb_direct_connect(t); + /* + * Negotiate the values by using the minimum + * between client and server if the client provided + * non-zero values. + */ + if (peer_initiator_depth != 0) + sp->initiator_depth = min_t(u8, sp->initiator_depth, + peer_initiator_depth); + if (peer_responder_resources != 0) + sp->responder_resources = min_t(u8, sp->responder_resources, + peer_responder_resources); + + ret = smb_direct_connect(sc); if (ret) goto out_err; @@ -2057,7 +2325,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, { switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: { - int ret = smb_direct_handle_connect_request(cm_id); + int ret = smb_direct_handle_connect_request(cm_id, event); if (ret) { pr_err("Can't create transport: %d\n", ret); @@ -2177,7 +2445,8 @@ int ksmbd_rdma_init(void) * for lack of credits */ smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!smb_direct_wq) return -ENOMEM; diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index a2291b77488a15..3f93c6a9f7e4ad 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -11,61 +11,20 @@ #define SMBD_MIN_IOSIZE (512 * 1024) #define SMBD_MAX_IOSIZE (16 * 1024 * 1024) -/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ -struct smb_direct_negotiate_req { - __le16 min_version; - __le16 max_version; - __le16 reserved; - __le16 credits_requested; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */ -struct smb_direct_negotiate_resp { - __le16 min_version; - __le16 max_version; - __le16 negotiated_version; - __le16 reserved; - __le16 credits_requested; - __le16 credits_granted; - __le32 status; - __le32 max_readwrite_size; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 - -/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */ -struct smb_direct_data_transfer { - __le16 credits_requested; - __le16 credits_granted; - __le16 flags; - __le16 reserved; - __le32 remaining_data_length; - __le32 data_offset; - __le32 data_length; - __le32 padding; - __u8 buffer[]; -} __packed; - #ifdef
CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); void ksmbd_rdma_stop_listening(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); -unsigned int get_smbd_max_read_write_size(void); +unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline void ksmbd_rdma_stop_listening(void) { } static inline void ksmbd_rdma_destroy(void) { } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } -static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } +static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 04539037108c93..1cfa688904b2c4 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -770,10 +770,9 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto out4; } - rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.mnt_idmap = mnt_idmap(old_path->mnt), rd.old_parent = old_parent, rd.old_dentry = old_child, - rd.new_mnt_idmap = mnt_idmap(new_path.mnt), rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, @@ -1326,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, path, flags); kfree(abs_name); return dent; } diff --git a/fs/super.c b/fs/super.c index 7f876f32343ad4..f4fa0e93c463ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & 
SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; @@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. @@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", - WQ_MEM_RECLAIM, 0, + WQ_MEM_RECLAIM | WQ_PERCPU, + 0, sb->s_id); if (!wq) return -ENOMEM; + + old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) + if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { + /* Someone created workqueue before us? Free ours... */ destroy_workqueue(wq); + } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ca143d2f22ad2..3825e780cc580d 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -97,12 +97,9 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read && !battr->read_new) + if (!battr->read) return -EIO; - if (battr->read_new) - return battr->read_new(of->file, kobj, battr, buf, pos, count); - return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -161,12 +158,9 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write && !battr->write_new) + if (!battr->write) return -EIO; - if (battr->write_new) - return battr->write_new(of->file, kobj, battr, buf, pos, count); - return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -335,19 +329,13 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; - if (battr->read && battr->read_new) - return -EINVAL; - - if (battr->write && battr->write_new) - return -EINVAL; - if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; - else if (battr->read || battr->read_new) + else if (battr->read) ops = &sysfs_bin_kfops_ro; - else if (battr->write || battr->write_new) + else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index fb5ac358077b15..0b14d004a095ba 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { + .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) - + (int)offsetof(struct ubifs_inode, vfs_inode), .legacy_key_prefix = 
"ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b20686085e..46952a33c4e6fa 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5db45c9e26ee0e..49e50431741cd2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb { * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data + * @i_crypt_info: inode's fscrypt information * * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS @@ -416,6 +417,9 @@ struct ubifs_inode { pgoff_t read_in_a_row; int data_len; void *data; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /** diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf429627..95ec42b847972c 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -19,8 +19,7 @@ struct block_buffer { }; /* Hash a block, writing the result to the next level's pending block buffer. */ -static int hash_one_block(struct inode *inode, - const struct merkle_tree_params *params, +static int hash_one_block(const struct merkle_tree_params *params, struct block_buffer *cur) { struct block_buffer *next = cur + 1; @@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - fsverity_hash_block(params, inode, cur->data, - &next->data[next->filled]); + fsverity_hash_block(params, cur->data, &next->data[next->filled]); next->filled += params->digest_size; cur->filled = 0; return 0; @@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, &buffers[-1]); + err = hash_one_block(params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, &buffers[level]); + err = hash_one_block(params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -284,9 +282,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. 
So don't set it until just - * after the filesystem has successfully enabled verity. + * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad3d..dd20b138d452fa 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; @@ -89,7 +90,7 @@ union fsverity_hash_ctx * fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out); + const void *data, u8 *out); void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 9bb3c6344907e9..de53e14c8aa78b 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, /** * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters - * @inode: inode for which the hashing is being done * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * @@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, * in the Merkle tree parameters. */ void fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out) + const void *data, u8 *out) { union fsverity_hash_ctx ctx; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c61..77b1c977af0256 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. 
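
Aside: the acquire/release pairing described in this comment is a generic one-time publication idiom. A minimal sketch outside fs-verity, with hypothetical names (struct obj, free_obj()):

/* Publish a lazily built object exactly once, lock-free. */
static struct obj *slot;

static struct obj *lookup_obj(void)
{
	/* Pairs with the cmpxchg_release() in publish_obj(). */
	return smp_load_acquire(&slot);
}

static struct obj *publish_obj(struct obj *new)
{
	if (cmpxchg_release(&slot, NULL, new) != NULL) {
		/* Lost the race: discard ours, return the winner. */
		free_obj(new);
		return lookup_obj();
	}
	return new;
}
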
*/ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. */ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b276..86067c8b40cf32 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -10,6 +10,31 @@ #include #include +#define FS_VERITY_MAX_PENDING_BLOCKS 2 + +struct fsverity_pending_block { + const void *data; + u64 pos; + u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE]; +}; + +struct fsverity_verification_context { + struct inode *inode; + struct fsverity_info *vi; + unsigned long max_ra_pages; + + /* + * This is the queue of data blocks that are pending verification. When + * the crypto layer supports interleaved hashing, we allow multiple + * blocks to be queued up in order to utilize it. This can improve + * performance significantly vs. sequential hashing of each block. + */ + int num_pending; + int max_pending; + struct fsverity_pending_block + pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS]; +}; + static struct workqueue_struct *fsverity_read_workqueue; /* @@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, } /* - * Verify a single data block against the file's Merkle tree. + * Verify the hash of a single data block against the file's Merkle tree. * * In principle, we need to verify the entire path to the root node. However, * for efficiency the filesystem may cache the hash blocks. Therefore we need @@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, * * Return: %true if the data block is valid, else %false. */ -static bool -verify_data_block(struct inode *inode, struct fsverity_info *vi, - const void *data, u64 data_pos, unsigned long max_ra_pages) +static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, + const struct fsverity_pending_block *dblock, + unsigned long max_ra_pages) { + const u64 data_pos = dblock->pos; const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; int level; @@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, */ u64 hidx = data_pos >> params->log_blocksize; - /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ - BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); + /* + * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may + * be mapped at once. 
+ */ + static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <= + KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - if (memchr_inv(data, 0, params->block_size)) { + if (memchr_inv(dblock->data, 0, params->block_size)) { fsverity_err(inode, "FILE CORRUPTED! Data past EOF is not zeroed"); return false; @@ -202,7 +232,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, unsigned long hblock_idx = hblocks[level - 1].index; unsigned int hoffset = hblocks[level - 1].hoffset; - fsverity_hash_block(params, inode, haddr, real_hash); + fsverity_hash_block(params, haddr, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; /* @@ -220,18 +250,18 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, put_page(hpage); } - /* Finally, verify the data block. */ - fsverity_hash_block(params, inode, data, real_hash); - if (memcmp(want_hash, real_hash, hsize) != 0) + /* Finally, verify the hash of the data block. */ + if (memcmp(want_hash, dblock->real_hash, hsize) != 0) goto corrupted; return true; corrupted: - fsverity_err(inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level - 1, - params->hash_alg->name, hsize, want_hash, - params->hash_alg->name, hsize, real_hash); + fsverity_err( + inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, + level == 0 ? dblock->real_hash : real_hash); error: for (; level > 0; level--) { kunmap_local(hblocks[level - 1].addr); @@ -240,13 +270,73 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, return false; } +static void +fsverity_init_verification_context(struct fsverity_verification_context *ctx, + struct inode *inode, + unsigned long max_ra_pages) +{ + struct fsverity_info *vi = *fsverity_info_addr(inode); + + ctx->inode = inode; + ctx->vi = vi; + ctx->max_ra_pages = max_ra_pages; + ctx->num_pending = 0; + if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 && + sha256_finup_2x_is_optimized()) + ctx->max_pending = 2; + else + ctx->max_pending = 1; +} + +static void +fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx) +{ + int i; + + for (i = ctx->num_pending - 1; i >= 0; i--) { + kunmap_local(ctx->pending_blocks[i].data); + ctx->pending_blocks[i].data = NULL; + } + ctx->num_pending = 0; +} + static bool -verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, - unsigned long max_ra_pages) +fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx) { - struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - const unsigned int block_size = vi->tree_params.block_size; + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + int i; + + if (ctx->num_pending == 2) { + /* num_pending == 2 implies that the algorithm is SHA-256 */ + sha256_finup_2x(params->hashstate ? 
&params->hashstate->sha256 : + NULL, + ctx->pending_blocks[0].data, + ctx->pending_blocks[1].data, params->block_size, + ctx->pending_blocks[0].real_hash, + ctx->pending_blocks[1].real_hash); + } else { + for (i = 0; i < ctx->num_pending; i++) + fsverity_hash_block(params, ctx->pending_blocks[i].data, + ctx->pending_blocks[i].real_hash); + } + + for (i = 0; i < ctx->num_pending; i++) { + if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i], + ctx->max_ra_pages)) + return false; + } + fsverity_clear_pending_blocks(ctx); + return true; +} + +static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx, + struct folio *data_folio, size_t len, + size_t offset) +{ + struct fsverity_info *vi = ctx->vi; + const struct merkle_tree_params *params = &vi->tree_params; + const unsigned int block_size = params->block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size))) @@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, folio_test_uptodate(data_folio))) return false; do { - void *data; - bool valid; - - data = kmap_local_folio(data_folio, offset); - valid = verify_data_block(inode, vi, data, pos + offset, - max_ra_pages); - kunmap_local(data); - if (!valid) + ctx->pending_blocks[ctx->num_pending].data = + kmap_local_folio(data_folio, offset); + ctx->pending_blocks[ctx->num_pending].pos = pos + offset; + if (++ctx->num_pending == ctx->max_pending && + !fsverity_verify_pending_blocks(ctx)) return false; offset += block_size; len -= block_size; @@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - return verify_data_blocks(folio, len, offset, 0); + struct fsverity_verification_context ctx; + + fsverity_init_verification_context(&ctx, folio->mapping->host, 0); + + if (fsverity_add_data_blocks(&ctx, folio, len, offset) && + fsverity_verify_pending_blocks(&ctx)) + return true; + fsverity_clear_pending_blocks(&ctx); + return false; } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); @@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); */ void fsverity_verify_bio(struct bio *bio) { + struct inode *inode = bio_first_folio_all(bio)->mapping->host; + struct fsverity_verification_context ctx; struct folio_iter fi; unsigned long max_ra_pages = 0; @@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio) max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); } + fsverity_init_verification_context(&ctx, inode, max_ra_pages); + bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(fi.folio, fi.length, fi.offset, - max_ra_pages)) { - bio->bi_status = BLK_STS_IOERR; - break; - } + if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length, + fi.offset)) + goto ioerr; } + + if (!fsverity_verify_pending_blocks(&ctx)) + goto ioerr; + return; + +ioerr: + fsverity_clear_pending_blocks(&ctx); + bio->bi_status = BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ @@ -355,7 +460,7 @@ void __init fsverity_init_workqueue(void) * latency on ARM64.
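
Aside: the queue-then-flush batching added above is independent of the Merkle-tree walk. A reduced sketch, assuming a two-way interleaved hash primitive is available; hash_one() and hash_two() are placeholders for fsverity_hash_block() and sha256_finup_2x():

#define MAX_PENDING	2

struct pending_blk {
	const void *data;
	u8 digest[32];
};

struct hash_batch {
	int num_pending;
	int max_pending;	/* 2 if interleaved hashing is optimized, else 1 */
	struct pending_blk slot[MAX_PENDING];
};

static void flush_batch(struct hash_batch *b)
{
	if (b->num_pending == 2)
		hash_two(b->slot[0].data, b->slot[1].data,
			 b->slot[0].digest, b->slot[1].digest);
	else if (b->num_pending == 1)
		hash_one(b->slot[0].data, b->slot[0].digest);
	/* ...consume the digests here, then reset the queue... */
	b->num_pending = 0;
}

static void queue_block(struct hash_batch *b, const void *data)
{
	b->slot[b->num_pending].data = data;
	if (++b->num_pending == b->max_pending)
		flush_batch(b);
}

As in fsverity_verify_bio() above, the caller must flush any remainder after its loop ends.
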
*/ fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", - WQ_HIGHPRI, + WQ_HIGHPRI | WQ_PERCPU, num_online_cpus()); if (!fsverity_read_workqueue) panic("failed to allocate fsverity_read_queue"); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 065953475cf5eb..8930d5254e1da6 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y @@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d21..8ac8230c3d3cc2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. 
*/ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 000cc7f4a3ce50..ad381c73abc4de 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -4019,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0cc6..91c1b30ebaab31 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. + * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. 
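
Aside: the conversion pattern running through all of these XFS hunks. The old three-argument XFS_TEST_ERROR() folded the real predicate into the injection check; the new single-tag form keeps them separate, so the corruption check reads naturally and injection becomes an explicit OR (example taken from the xfs_agf_read_verify() hunk above):

/* Before: the corruption predicate was an argument of the macro. */
if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
	xfs_verifier_error(bp, -EFSCORRUPTED, fa);

/* After: check the predicate first, inject independently. */
if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
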
*/ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } @@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d954f9b8071f4b..53ef4b7e504d62 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } @@ -3662,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. 
*/ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3849,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4200,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4545,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5900,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6065,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a61211d253f1ca..dbe9df8c33004e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 723a0643b8386c..90f7fc219fccc8 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d61..82a338458a5179 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, 
XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084dc..de840abc0bcd44 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. */ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + * an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ +/* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. + */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -71,49 +79,61 @@ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. */ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1.
The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob. + */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c11819..932ee4619e9ea6 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. 
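
Aside: XFS_ERRTAGS is an X-macro table. A sketch of how a consumer can expand it, for example into a table of default random factors (illustrative only; the real consumers live in fs/xfs/xfs_error.c):

/* Define XFS_ERRTAG before re-including the header to get XFS_ERRTAGS. */
#define XFS_ERRTAG(tag, name, dflt)	[XFS_ERRTAG_##tag] = (dflt),
#include "xfs_errortag.h"

static const unsigned int errtag_random_default[] = {
	XFS_ERRTAGS
};
#undef XFS_ERRTAG

Because the guard is "#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG)", the second include is admitted and merely regenerates XFS_ERRTAGS for the new XFS_ERRTAG definition.
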
*/ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 750111634d9f7b..d97295eaebe631 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd707..b1812b2c3ccece 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add5526..1772d82f2d68b6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050f3..309ce6dd555383 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). 
- */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - ip->i_projid = xfs_get_initial_prid(pip); } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 0d637c276db053..6c50cb2ece1972 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -86,43 +86,6 @@ struct xfs_unmount_log_format { uint32_t pad2; /* may as well make it 64 bits */ }; -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_RUI_FORMAT 21 -#define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_CUI_FORMAT 23 -#define XLOG_REG_TYPE_CUD_FORMAT 24 -#define XLOG_REG_TYPE_BUI_FORMAT 25 -#define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_ATTRI_FORMAT 27 -#define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 -#define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_XMI_FORMAT 31 -#define XLOG_REG_TYPE_XMD_FORMAT 32 -#define XLOG_REG_TYPE_ATTR_NEWNAME 33 -#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 -#define XLOG_REG_TYPE_MAX 34 - /* * Flags to log operation header * @@ -141,14 +104,13 @@ struct xfs_unmount_log_format { #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -typedef struct xlog_op_header { +struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; +}; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 @@ -174,12 +136,40 @@ typedef struct xlog_rec_header { __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ + + /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ + + /* fields added for log v2: */ __be32 h_size; /* iclog size : 4 */ + + /* + * When h_size was added for log v2 support, it caused the structure to + * have a different size on i386 vs all other architectures because the + * sum of the sizes of the members is not aligned to that of the largest + * __be64-sized member, and i386 has really odd struct alignment rules. + * + * Due to the way the log headers are laid out on disk, that alone is + * not a problem because the xlog_rec_header always sits alone in a + * BBSIZE-sized area, and the rest of that area is padded with zeroes. + * But xlog_cksum used to calculate the checksum based on the structure + * size, and thus gives different checksums for i386 vs the rest.
+ * We now do two checksum validation passes for both sizes to allow + * moving v5 file systems with unclean logs between i386 and other + * (little-endian) architectures. + */ + __u32 h_pad0; } xlog_rec_header_t; +#ifdef __i386__ +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) +#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#else +#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) +#endif /* __i386__ */ + typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ @@ -195,12 +185,11 @@ typedef union xlog_in_core2 { } xlog_in_core_2_t; /* not an on-disk structure, but needed by log recovery in userspace */ -typedef struct xfs_log_iovec { +struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ -} xfs_log_iovec_t; - +}; /* * Transaction Header definitions. * @@ -213,12 +202,12 @@ typedef struct xfs_log_iovec { * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). */ -typedef struct xfs_trans_header { +struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; +}; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ @@ -542,7 +531,7 @@ struct xfs_log_dinode { #define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) #define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) -typedef struct xfs_buf_log_format { +struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ @@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format { int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; +}; /* * All buffers now need to tell recovery where the magic number * @@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) /* * EFI/EFD log format definitions */ -typedef struct xfs_extent { +struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; -} xfs_extent_t; +}; /* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. + * Since the members of struct xfs_extent add up to 96 bits, it has + * different alignments on i386 vs all other architectures, because i386 + * does not pad structures to their natural alignment. + * + * Provide the different variants for use by a conversion routine. */ -typedef struct xfs_extent_32 { +struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; +} __attribute__((packed)); -typedef struct xfs_extent_64 { +struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; -} xfs_extent_64_t; +}; /* * This is the structure used to lay out an efi log item in the * log. The efi_extents field is a variable size array whose * size is given by efi_nextents.
*/ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; + struct xfs_extent efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format_sizeof( @@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( @@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format64_sizeof( @@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; + struct xfs_extent efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format_sizeof( @@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( @@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format64_sizeof( @@ -957,14 +947,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. 
*/ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. @@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 95de2309503069..9e712e62369c47 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -111,7 +111,7 @@ struct xlog_recover_item { struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c41e..b02e3d6c0868c6 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf749105..7bfa3242e2c536 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 8977840374836e..2484dc9f6d7ecc 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. 
*/ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fecb9..83e0488ff77364 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586ca4..618061d898d4a8 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb83..cdd16dd805d77c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -1524,7 +1522,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1536,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca4d..5fefd132e002ee 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on 
the number of open zones. + */ +#define XFS_DEFAULT_MAX_OPEN_ZONES 128 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, xfs_rgblock_t *write_pointer); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 38a246b8bf11c9..b2a83801412e91 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,7 +300,7 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) @@ -385,7 +385,7 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 14939d7de34966..378ec7c8d38eeb 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -79,7 +79,7 @@ xchk_metapath_cleanup( if (mpath->dp_ilock_flags) xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); - kfree(mpath->path); + kfree_const(mpath->path); } /* Set up a metadir path scan. @path must be dynamically allocated. */ @@ -98,13 +98,13 @@ xchk_setup_metapath_scan( error = xchk_install_live_inode(sc, ip); if (error) { - kfree(path); + kfree_const(path); return error; } mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); if (!mpath) { - kfree(path); + kfree_const(path); return -ENOMEM; } @@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); + kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip); } /* Scan a rtgroup inode under the /rtgroups directory. */ @@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kstrdup("quota", GFP_KERNEL), qi->qi_dirip); + kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode( return -ENOENT; return xchk_setup_metapath_scan(sc, qi->qi_dirip, - kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); + kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip); } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 1588ce971cb8e1..951ae8b71566c2 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -27,6 +27,15 @@ #include "scrub/repair.h" #include "scrub/newbt.h" +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. The newbt code should reserve + * exactly the correct number of blocks to rebuild the btree, so there should + * not be any excess blocks to free when committing a new btree. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + /* * Estimate proper slack values for a btree that's being reloaded. 
* diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 8703897c0a9ccb..07f5bb8a642124 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -36,6 +36,12 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_bmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -91,21 +97,33 @@ struct xreap_state { struct xfs_scrub *sc; - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; + union { + struct { + /* + * For AG blocks, this is the reverse mapping owner and + * metadata reservation type. + */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + }; + struct { + /* For file blocks, this is the inode and fork. */ + struct xfs_inode *ip; + int whichfork; + }; + }; - /* If true, roll the transaction before reaping the next extent. */ - bool force_roll; + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int nr_binval; - /* Number of deferred reaps attached to the current transaction. */ - unsigned int deferred; + /* Maximum number of buffers we can invalidate in a single tx. */ + unsigned int max_binval; - /* Number of invalidated buffers logged to the current transaction. */ - unsigned int invalidated; + /* Number of deferred reaps attached to the current transaction. */ + unsigned int nr_deferred; - /* Number of deferred reaps queued during the whole reap sequence. */ - unsigned long long total_deferred; + /* Maximum number of intents we can reap in a single transaction. */ + unsigned int max_deferred; }; /* Put a block back on the AGFL. */ @@ -148,71 +166,79 @@ xreap_put_freelist( } /* Are there any uncommitted reap operations? */ -static inline bool xreap_dirty(const struct xreap_state *rs) +static inline bool xreap_is_dirty(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred) - return true; - if (rs->invalidated) - return true; - if (rs->total_deferred) - return true; - return false; + return rs->nr_binval > 0 || rs->nr_deferred > 0; } -#define XREAP_MAX_BINVAL (2048) - /* - * Decide if we want to roll the transaction after reaping an extent. We don't - * want to overrun the transaction reservation, so we prohibit more than - * 128 EFIs per transaction. For the same reason, we limit the number - * of buffer invalidations to 2048. + * Decide if we need to roll the transaction to clear out the log + * reservation that we allocated to buffer invalidations. */ -static inline bool xreap_want_roll(const struct xreap_state *rs) +static inline bool xreap_want_binval_roll(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) - return true; - if (rs->invalidated > XREAP_MAX_BINVAL) - return true; - return false; + return rs->nr_binval >= rs->max_binval; } -static inline void xreap_reset(struct xreap_state *rs) +/* Reset the buffer invalidation count after rolling. */ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; } -#define XREAP_MAX_DEFER_CHAIN (2048) +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction.
+ */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; +} /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory. Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. */ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; +} + +/* Maximum number of fsblocks that we might find in a buffer to invalidate. */ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); } /* @@ -224,12 +250,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. */ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -297,14 +319,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -416,21 +437,23 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. 
*/ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + xreap_inc_defer(rs); return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + /* t1: unmap crosslinked metadata blocks */ + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + xreap_inc_defer(rs); + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); @@ -443,12 +466,12 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -463,23 +486,23 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_inc_defer(rs); return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -489,12 +512,194 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = per_intent ? res / variable_overhead : 0; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. 
+ * + * t3: Freeing to AGFL: roll and finish deferred items for every block. + * Limits here do not matter. + * + * t4: Freeing metadata blocks: deferred freeing of the space, which + * also removes the rmap record. + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = rui; + const unsigned int t4 = rui + efi; + const unsigned int per_intent = max(t1, t4); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of EFI or + * RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int step_size = max(f1, f2); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Maximum overhead of invalidating one buffer. */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that btree + * blocks aren't usually contiguous; and that scrub likely pulled all + * the buffers into memory. From these assumptions, set the maximum + * number of deferrals we can queue before flushing the defer chain, + * and the number of invalidations we can queue before rolling to a + * clean transaction (and possibly relogging some of the deferrals) to + * the same quantity. + */ + const unsigned int variable_overhead = per_intent + per_binval; + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, rs->max_deferred); +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_agcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping CoW staging extents: + * + * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks.
+ */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks. From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -531,11 +736,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -562,11 +767,12 @@ xrep_reap_agblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -628,7 +834,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -639,7 +845,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -674,11 +880,15 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -770,7 +980,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. 
*/ if (crosslinked) { @@ -778,14 +988,14 @@ xreap_rgextent_iter( *rglenp); xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); - rs->deferred++; + xreap_inc_defer(rs); return 0; } trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); /* - * The CoW staging extent is not crosslinked. Use deferred work items + * t2: The CoW staging extent is not crosslinked. Use deferred work * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -799,10 +1009,73 @@ xreap_rgextent_iter( if (error) return error; - rs->deferred++; + xreap_inc_defer(rs); return 0; } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_rgcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * The only buffer for the rt device is the rtgroup super, so we don't + * need to save space for buffer invalidations. 
+ */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -855,11 +1128,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -891,12 +1164,14 @@ xrep_reap_rtblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; @@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -975,7 +1249,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1038,21 +1313,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1079,7 +1352,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1120,14 +1393,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. 
*/ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1149,12 +1421,11 @@ xreap_bmapi_binval( */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. */ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. */ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. 
+ */ +STATIC void +xreap_configure_bmapi_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* overhead of invalidating a buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc), + xreap_bmapi_binval_blocksize(sc)); + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int bui = xfs_bui_log_space(1) + + xfs_bud_log_space(); + + /* + * t1: Unmapping crosslinked file data blocks: one bmap deletion, + * possibly an EFI for underfilled bmbt blocks, and an rmap deletion. + * + * t2: Freeing file data blocks: one bmap deletion, possibly an + * EFI for underfilled bmbt blocks, and another EFI for the space + * itself. + */ + const unsigned int t1 = (bui + efi) + rui; + const unsigned int t2 = (bui + efi) + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of BUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * Each call to xreap_ifork_extent starts with a clean transaction and + * operates on a single mapping by creating a chain of log intent items + * for that mapping. We need to leave enough reservation in the + * transaction to log btree buffer and inode updates for each step in + * the chain, and to relog the log intents. + */ + const unsigned int per_extent_res = per_intent + step_size; + + xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval); + + trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, 1); +} + /* * Dispose of as much of this file extent as we can. Upon successful return, * the imap will reflect the mapping that was removed from the fork. */ STATIC int xreap_ifork_extent( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; xfs_agnumber_t agno; bool crosslinked; int error; ASSERT(sc->sa.pag == NULL); - trace_xreap_ifork_extent(sc, ip, whichfork, imap); + trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap); agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); sc->sa.pag = xfs_perag_get(sc->mp, agno); @@ -1248,11 +1617,11 @@ xreap_ifork_extent( * Decide the fate of the blocks at the beginning of the mapping, then * update the mapping to use it with the unmap calls.
*/ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1276,6 +1645,11 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1284,6 +1658,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; @@ -1303,13 +1678,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d00c18954a26b8..efd5a7ccdf624a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9c04295742c85f..2bb125c4f9bf2b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. 
- */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc2f..5902398185a898 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -185,7 +185,7 @@ xrep_symlink_salvage_inline( return 0; nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103fed..987313a52e6401 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a8187281eb96b9..39ea651cbb7510 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5eef3bc30bda1a..c3a593319bee71 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -491,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 
f9ef3b2a332a6f..773d959965dc29 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -387,8 +387,6 @@ xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { - xfs_daddr_t eofs; - /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); @@ -397,11 +395,10 @@ xfs_buf_map_verify( * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes( int xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - int error; + struct xfs_mount *mp = btp->bt_mount; - ASSERT(btp->bt_bdev != NULL); + if (btp->bt_bdev) { + int error; - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); - if (error) { - xfs_warn(btp->bt_mount, - "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } - if (bdev_can_atomic_write(btp->bt_bdev)) - xfs_configure_buftarg_atomic_writes(btp); + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1749,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = (xfs_daddr_t)-1; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; @@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. 
*/ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b269e115d9ace0..8fa7bdf59c9110 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -103,6 +103,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5d58e2ae4972da..e4c8af87363243 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. + */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e1376943a..39830b252ac88d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ 
+ [XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -93,21 +49,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -118,10 +71,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, 
XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); -XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); -XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, @@ -295,7 +165,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -321,35 +190,11 @@ xfs_errortag_test( return false; 
xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } -int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - int xfs_errortag_add( struct xfs_mount *mp, @@ -359,9 +204,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a5981a..fe6a71bbe9cde9 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); #define 
XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ee598a982703..418ddab590e06b 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -202,7 +202,7 @@ xfs_efi_copy_format( sizeof(struct xfs_extent)); return 0; } else if (buf->iov_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -216,7 +216,7 @@ } return 0; } else if (buf->iov_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index c8402040410b54..af1b0331f7afa4 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f96fbf5c54c999..2702fef2c90cd2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,52 +75,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number that holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field.
This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. */ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -158,12 +153,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has an inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking.
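+ *
+ * The inode log item is allocated the first time the inode is logged and
+ * stays attached until the inode is reclaimed, so checking ip->i_itemp
+ * here is a cheap, lock-free filter: inodes that have never been logged
+ * skip the log force entirely.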
*/ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f6f628c01febaf..566fd663c95bba 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4cf7abe5014371..e44040206851fc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -646,8 +646,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +654,7 @@ * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a32a..36b39539e561a3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/rename + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected.
- */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1656,7 +1667,6 @@ xfs_ifree_mark_inode_stale( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1821,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2227,16 +2245,9 @@ xfs_rename( if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { @@ -2377,8 +2388,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. 
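 *
 * For example, the magic number check below is now written as
 *
 *	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
 *	    XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1))
 *
 * so the corruption test itself always runs, and the errortag (which
 * compiles away to false on non-DEBUG builds) can only force the error
 * path on top of it.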
*/ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2394,29 +2405,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2425,8 +2434,8 @@ xfs_iflush( ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); @@ -2502,7 +2511,6 @@ xfs_iflush( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2661,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcdd4..1bd411a1114c7e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. 
If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -204,6 +186,20 @@ xfs_inode_item_precommit( xfs_trans_brelse(tp, bp); } + /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines @@ -214,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -729,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. 
Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. + */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -858,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then check if the changes will impact a datasync (O_DSYNC) journal + * flush. If the changes will require a datasync flush, then we also record + * the sequence in ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode + * being unpinned (i.e. pin count goes to zero), and so they are only set when + * the inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a01111..2ddcca41714f7a 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock.
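+ *
+ * For example, a timestamp-only modification advances ili_commit_seq but
+ * leaves ili_datasync_seq untouched, which is what allows fdatasync()
+ * and O_DSYNC I/O completion to skip the journal flush for such changes.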
+ */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e1051a530a50fe..a6bb7ee7a27ad5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -512,9 +512,6 @@ xfs_fileattr_get( { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | @@ -1209,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. */ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2a74f295734103..d3f6e3e42a1191 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -149,9 +149,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. 
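+ *
+ * (IOMAP_F_DIRTY feeds the generic direct I/O code: when it is set, an
+ * O_DSYNC write cannot complete with just a REQ_FUA bio and must do the
+ * full generic_write_sync() work at I/O completion; see the handling in
+ * fs/iomap/direct-io.c.)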
+ */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; return 0; @@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee12..caff0125faeac0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; @@ -1335,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21ed..4dd747bdbccab2 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3e0..603e85c1ab4cd7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. 
*/ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1489,8 +1489,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1568,13 +1567,13 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ @@ -1818,7 +1817,7 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1827,7 +1826,7 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, @@ -2656,10 +2655,11 @@ xlog_state_get_iclog_space( * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. 
*/ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3153,11 +3153,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xlog_op_header); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3180,12 +3180,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3322,7 +3322,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; + struct xlog_op_header *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; @@ -3400,7 +3400,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index af6daf4f67924b..dcc1f44ed68f90 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -20,6 +20,43 @@ struct xfs_log_vec { int lv_alloc_size; /* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a9a7a271c15bb7..0cfc654d8e872b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -499,8 +499,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, 
struct xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e6ed9e09c02710..549d60959aee5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,20 +2894,34 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the gory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2932,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index dc32c5e34d8176..0953f6ae94abc8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1057,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. - */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 97de44c32272f2..f046d1215b043c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -363,7 +363,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens.
*/ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -386,7 +385,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -396,7 +394,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. */ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 866c71d9fbaed4..73b7e72944e47f 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fbeddcac479208..b1767288994206 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -165,7 +165,7 @@ xfs_dax_translate_range( uint64_t *bblen) { u64 dev_start = btp->bt_dax_part_off; - u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_len = BBTOB(btp->bt_nr_sectors); u64 dev_end = dev_start + dev_len - 1; /* Notify failure on the whole device. 
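 *
 * (bt_nr_sectors is the filesystem's own view of the device size, kept
 * up to date by growfs and log recovery elsewhere in this series, so the
 * translated range stays within the filesystem even when the underlying
 * block device is larger.)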
*/ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a770d..e85a156dc17d16 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -105,8 +105,8 @@ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, - Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, @@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), - fsparam_flag("ikeep", Opt_ikeep), - fsparam_flag("noikeep", Opt_noikeep), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), - fsparam_flag("attr2", Opt_attr2), - fsparam_flag("noattr2", Opt_noattr2), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), @@ -175,13 +171,11 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_FEAT_IKEEP, ",ikeep" }, { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, - { XFS_FEAT_ATTR2, ",attr2" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, @@ -541,7 +535,8 @@ xfs_setup_devices( { int error; - error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; @@ -551,7 +546,7 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_configure_buftarg(mp->m_logdev_targp, - log_sector_size); + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } @@ -565,7 +560,7 @@ xfs_setup_devices( mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { error = xfs_configure_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } @@ -578,19 +573,19 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; @@ -602,13 +597,14 @@ xfs_init_mount_workqueues( goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | 
WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", - XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; @@ -778,7 +774,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void @@ -1087,15 +1083,6 @@ xfs_finish_flags( } } - /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - /* * prohibit r/w mounts of read-only filesystems */ @@ -1542,22 +1529,6 @@ xfs_fs_parse_param( return 0; #endif /* Following mount options will be removed in September 2025 */ - case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); - parsing_mp->m_features |= XFS_FEAT_IKEEP; - return 0; - case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); - parsing_mp->m_features &= ~XFS_FEAT_IKEEP; - return 0; - case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); - parsing_mp->m_features |= XFS_FEAT_ATTR2; - return 0; - case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1593,16 +1564,6 @@ xfs_fs_validate_params( return -EINVAL; } - /* - * We have not read the superblock at this point, so only the attr2 - * mount option can set the attr2 feature by this stage. - */ - if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); - return -EINVAL; - } - - if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); @@ -2177,21 +2138,6 @@ xfs_fs_reconfigure( if (error) return error; - /* attr2 -> noattr2 */ - if (xfs_has_noattr2(new_mp)) { - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "attr2 is always enabled for a V5 filesystem - can't be changed."); - return -EINVAL; - } - mp->m_features &= ~XFS_FEAT_ATTR2; - mp->m_features |= XFS_FEAT_NOATTR2; - } else if (xfs_has_attr2(new_mp)) { - /* noattr2 -> attr2 */ - mp->m_features &= ~XFS_FEAT_NOATTR2; - mp->m_features |= XFS_FEAT_ATTR2; - } - /* Validate new max_atomic_write option before making other changes */ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { error = xfs_set_max_atomic_write_opt(mp, @@ -2596,8 +2542,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. 
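 *
 * (WQ_PERCPU is spelled out here and in the other alloc_workqueue()
 * calls above because the workqueue core is moving towards unbound
 * queues by default; the explicit flag documents and preserves the
 * per-CPU execution these workqueues have always assumed.)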
*/ - xfs_alloc_wq = alloc_workqueue("xfsalloc", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 0); if (!xfs_alloc_wq) return -ENOMEM; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 751dc74a30671a..9918f14b4874fd 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC int +static inline int xfs_deprecated_dointvec_minmax( const struct ctl_table *ctl, int write, @@ -67,24 +67,6 @@ xfs_deprecated_dointvec_minmax( } static const struct ctl_table xfs_table[] = { - { - .procname = "irix_sgid_inherit", - .data = &xfs_params.sgid_inherit.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.sgid_inherit.min, - .extra2 = &xfs_params.sgid_inherit.max - }, - { - .procname = "irix_symlink_mode", - .data = &xfs_params.symlink_mode.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.symlink_mode.min, - .extra2 = &xfs_params.symlink_mode.max - }, { .procname = "panic_mask", .data = &xfs_params.panic_mask.val, @@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.blockgc_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.blockgc_timer.min, - .extra2 = &xfs_params.blockgc_timer.max, - }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 51646f066c4f7d..ed9d896079c1a8 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is - * not a member of parent dir GID. */ - xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. 
*/ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ac344e42846c14..79b8641880ab9d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1152,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 575e7028f423a8..474f5a04ec6367 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -452,19 +452,17 @@ xfs_trans_mod_sb( */ STATIC void xfs_trans_apply_sb_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { - struct xfs_dsb *sbp; - struct xfs_buf *bp; - int whole = 0; - - bp = xfs_trans_getsb(tp); - sbp = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp); + struct xfs_dsb *sbp = bp->b_addr; + int whole = 0; /* * Only update the superblock counters if we are logging them */ - if (!xfs_has_lazysbcount((tp->t_mountp))) { + if (!xfs_has_lazysbcount(mp)) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas( * write the correct value ondisk. */ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && - !xfs_has_rtgroups(tp->t_mountp)) { - struct xfs_mount *mp = tp->t_mountp; + !xfs_has_rtgroups(mp)) { int64_t rtxdelta; rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; @@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas( if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + mp->m_ddev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_dblocks_delta); whole = 1; } if (tp->t_agcount_delta) { @@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas( * recompute the ondisk rtgroup block log. The incore values * will be recomputed in xfs_trans_unreserve_and_mod_sb. */ - if (xfs_has_rtgroups(tp->t_mountp)) { + if (xfs_has_rtgroups(mp)) { sbp->sb_rgblklog = xfs_compute_rgblklog( be32_to_cpu(sbp->sb_rgextents), be32_to_cpu(sbp->sb_rextsize)); @@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rblocks_delta) { be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + mp->m_rtdev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_rblocks_delta); whole = 1; } if (tp->t_rextents_delta) { diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 67c328d23e4ae9..38983c6777df31 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -374,7 +374,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f28214c28ab545..1147bacb2da8e6 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -493,64 +493,58 @@ xfs_try_open_zone( return oz; } +enum xfs_zone_alloc_score { + /* Any open zone will do it, we're desperate */ + XFS_ZONE_ALLOC_ANY = 0, + + /* It better fit somehow */ + XFS_ZONE_ALLOC_OK = 1, + + /* Only reuse a zone if it fits really well. 
*/ + XFS_ZONE_ALLOC_GOOD = 2, +}; + /* - * For data with short or medium lifetime, try to colocated it into an - * already open zone with a matching temperature. + * Lifetime hint co-location matrix. Fields not set default to 0 + * aka XFS_ZONE_ALLOC_ANY. */ -static bool -xfs_colocate_eagerly( - enum rw_hint file_hint) -{ - switch (file_hint) { - case WRITE_LIFE_MEDIUM: - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - return true; - default: - return false; - } -} - -static bool -xfs_good_hint_match( - struct xfs_open_zone *oz, - enum rw_hint file_hint) -{ - switch (oz->oz_write_hint) { - case WRITE_LIFE_LONG: - case WRITE_LIFE_EXTREME: - /* colocate long and extreme */ - if (file_hint == WRITE_LIFE_LONG || - file_hint == WRITE_LIFE_EXTREME) - return true; - break; - case WRITE_LIFE_MEDIUM: - /* colocate medium with medium */ - if (file_hint == WRITE_LIFE_MEDIUM) - return true; - break; - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - case WRITE_LIFE_NOT_SET: - /* colocate short and none */ - if (file_hint <= WRITE_LIFE_SHORT) - return true; - break; - } - return false; -} +static const unsigned int +xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { + [WRITE_LIFE_NOT_SET] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_NONE] = { + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_SHORT] = { + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_MEDIUM] = { + [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_LONG] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_EXTREME] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, +}; static bool xfs_try_use_zone( struct xfs_zone_info *zi, enum rw_hint file_hint, struct xfs_open_zone *oz, - bool lowspace) + unsigned int goodness) { if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; - if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + + if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness) return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) return false; @@ -581,14 +575,14 @@ static struct xfs_open_zone * xfs_select_open_zone_lru( struct xfs_zone_info *zi, enum rw_hint file_hint, - bool lowspace) + unsigned int goodness) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + if (xfs_try_use_zone(zi, file_hint, oz, goodness)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -651,9 +645,11 @@ xfs_select_zone_nowait( * data. */ spin_lock(&zi->zi_open_zones_lock); - if (xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - else if (pack_tight) + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD); + if (oz) + goto out_unlock; + + if (pack_tight) oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; @@ -667,16 +663,16 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to colocate cold data with other cold data if we failed to open a - * new zone for it. + * Try to find a zone that is an ok match to colocate data with.
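The matrix above replaces the old open-coded hint matching with a score lookup; xfs_select_zone_nowait() then escalates from XFS_ZONE_ALLOC_GOOD through OK to ANY as each attempt fails. A stand-alone sketch (hypothetical, condensed enum names, not kernel code) of how the lookup gates zone reuse:

#include <stdio.h>

/* Mirrors the write-life hints and the score matrix from the hunk above;
 * the matrix values are copied from the diff, the harness is invented. */
enum hint { NOT_SET, NONE, SHORT, MEDIUM, LONG, EXTREME, HINT_NR };
enum score { ALLOC_ANY = 0, ALLOC_OK = 1, ALLOC_GOOD = 2 };

static const unsigned int hint_score[HINT_NR][HINT_NR] = {
	[NOT_SET][NOT_SET] = ALLOC_OK,
	[NONE][NONE]       = ALLOC_OK,
	[SHORT][SHORT]     = ALLOC_GOOD,
	[MEDIUM][MEDIUM]   = ALLOC_GOOD,
	[LONG][LONG]       = ALLOC_OK,
	[LONG][EXTREME]    = ALLOC_OK,
	[EXTREME][LONG]    = ALLOC_OK,
	[EXTREME][EXTREME] = ALLOC_OK,
};

/* A zone qualifies when its stored hint scores at least @goodness for
 * the incoming file hint, exactly the test in xfs_try_use_zone(). */
static int zone_ok(enum hint zone, enum hint file, unsigned int goodness)
{
	return hint_score[zone][file] >= goodness;
}

int main(void)
{
	printf("%d\n", zone_ok(SHORT, SHORT, ALLOC_GOOD));  /* 1: perfect fit */
	printf("%d\n", zone_ok(SHORT, MEDIUM, ALLOC_OK));   /* 0: only scores ANY */
	printf("%d\n", zone_ok(SHORT, MEDIUM, ALLOC_ANY));  /* 1: desperate */
	return 0;
}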
+ */ + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); + if (oz) + goto out_unlock; + + /* + * Pick the least recently used zone, regardless of hint match */ - if (write_hint != WRITE_LIFE_NOT_SET && - !xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; @@ -1135,7 +1131,7 @@ xfs_calc_open_zones( if (bdev_open_zones) mp->m_max_open_zones = bdev_open_zones; else - mp->m_max_open_zones = xfs_max_open_zones(mp); + mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES; } if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { @@ -1248,7 +1244,7 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks size (%u max open)", + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 53c98f5fe3c361..a2db36d18419a1 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -173,8 +173,10 @@ struct acpi_exception_info { #define AE_AML_TARGET_TYPE EXCEP_AML (0x0023) #define AE_AML_PROTOCOL EXCEP_AML (0x0024) #define AE_AML_BUFFER_LENGTH EXCEP_AML (0x0025) +#define AE_AML_TOO_FEW_ARGUMENTS EXCEP_AML (0x0026) +#define AE_AML_TOO_MANY_ARGUMENTS EXCEP_AML (0x0027) -#define AE_CODE_AML_MAX 0x0025 +#define AE_CODE_AML_MAX 0x0027 /* * Internal exceptions used for control @@ -353,7 +355,11 @@ static const struct acpi_exception_info acpi_gbl_exception_names_aml[] = { "A target operand of an incorrect type was encountered"), EXCEP_TXT("AE_AML_PROTOCOL", "Violation of a fixed ACPI protocol"), EXCEP_TXT("AE_AML_BUFFER_LENGTH", - "The length of the buffer is invalid/incorrect") + "The length of the buffer is invalid/incorrect"), + EXCEP_TXT("AE_AML_TOO_FEW_ARGUMENTS", + "There are fewer than expected method arguments"), + EXCEP_TXT("AE_AML_TOO_MANY_ARGUMENTS", + "There are too many arguments for this method") }; static const struct acpi_exception_info acpi_gbl_exception_names_ctrl[] = { diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index b49396aa405812..e65a2afe92504b 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20250404 +#define ACPI_CA_VERSION 0x20250807 #include #include @@ -213,6 +213,12 @@ ACPI_INIT_GLOBAL(u8, acpi_gbl_osi_data, 0); */ ACPI_INIT_GLOBAL(u8, acpi_gbl_reduced_hardware, FALSE); +/* + * The ACPI Global Lock is mainly used on systems with SMM, so systems + * without SMM (such as loong_arch) may not have or use the Global Lock. + */ +ACPI_INIT_GLOBAL(u8, acpi_gbl_use_global_lock, TRUE); + /* * Maximum timeout for While() loop iterations before forced method abort.
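Since acpi_gbl_use_global_lock defaults to TRUE, an architecture that genuinely lacks SMM would clear it early in boot. A hypothetical sketch (the hook name is invented; only the global itself comes from the header above):

#include <linux/acpi.h>
#include <linux/init.h>

/* Hypothetical: an SMM-less platform opting out of Global Lock use
 * before the ACPICA subsystem is enabled. */
static void __init demo_arch_acpi_setup(void)
{
	acpi_gbl_use_global_lock = FALSE;
}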
* This mechanism is intended to prevent infinite loops during interpreter diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 243097a3da6360..8a67d4ea6e3feb 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -73,7 +73,7 @@ struct acpi_table_header { char oem_id[ACPI_OEM_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM identification */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM table identification */ u32 oem_revision; /* OEM revision number */ - char asl_compiler_id[ACPI_NAMESEG_SIZE]; /* ASCII ASL compiler vendor ID */ + char asl_compiler_id[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; /* ASCII ASL compiler vendor ID */ u32 asl_compiler_revision; /* ASL compiler version */ }; diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 99fd1588ff3822..0b4c332df25c3e 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -565,6 +565,7 @@ struct acpi_cedt_cfmws_target_element { #define ACPI_CEDT_CFMWS_RESTRICT_VOLATILE (1<<2) #define ACPI_CEDT_CFMWS_RESTRICT_PMEM (1<<3) #define ACPI_CEDT_CFMWS_RESTRICT_FIXED (1<<4) +#define ACPI_CEDT_CFMWS_RESTRICT_BI (1<<5) /* 2: CXL XOR Interleave Math Structure */ diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 20f3d62e7a16a3..13fa8150484456 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -160,7 +160,7 @@ extern unsigned int cppc_khz_to_perf(struct cppc_perf_caps *caps, unsigned int f extern bool acpi_cpc_valid(void); extern bool cppc_allow_fast_switch(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); -extern unsigned int cppc_get_transition_latency(int cpu); +extern int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern bool cpc_supported_by_cpu(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); @@ -216,9 +216,9 @@ static inline bool cppc_allow_fast_switch(void) { return false; } -static inline unsigned int cppc_get_transition_latency(int cpu) +static inline int cppc_get_transition_latency(int cpu) { - return CPUFREQ_ETERNAL; + return -ENODATA; } static inline bool cpc_ffh_supported(void) { diff --git a/include/acpi/processor.h b/include/acpi/processor.h index ff864c1cee3a4c..7146a8e9e9c25b 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -417,34 +417,15 @@ static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ /* in processor_idle.c */ -extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE -int acpi_processor_power_init(struct acpi_processor *pr); -int acpi_processor_power_exit(struct acpi_processor *pr); +void acpi_processor_power_init(struct acpi_processor *pr); +void acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); -#else -static inline int acpi_processor_power_init(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_exit(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr) -{ - return -ENODEV; -} - -static inline int acpi_processor_hotplug(struct acpi_processor *pr) -{ - return -ENODEV; -} +int acpi_processor_ffh_lpi_probe(unsigned int cpu); +int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); #endif /* 
CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ @@ -467,11 +448,6 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy) } #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_ACPI_PROCESSOR_IDLE -extern int acpi_processor_ffh_lpi_probe(unsigned int cpu); -extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); -#endif - void acpi_processor_init_invariance_cppc(void); #endif diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 2d08c750c8a730..3a899c626fdc35 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -10,7 +10,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___ffs(unsigned long word) { unsigned int num = 0; diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index e974ec932ec189..35f33780ca6c37 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -10,7 +10,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word) { unsigned int num = BITS_PER_LONG - 1; diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h index cf4b3d33bf961e..d3c3f567045d46 100644 --- a/include/asm-generic/bitops/builtin-__ffs.h +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -8,7 +8,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h index 6d72fc8a525953..7770c4f1bfcd49 100644 --- a/include/asm-generic/bitops/builtin-__fls.h +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -8,7 +8,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int __fls(unsigned long word) { return (sizeof(word) * 8) - 1 - __builtin_clzl(word); } diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index c8455cc28841af..be707da8c7cdd2 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h index 4c43f242daeb17..5ff2b7fbda6d8c 100644 --- a/include/asm-generic/bitops/ffs.h +++ b/include/asm-generic/bitops/ffs.h @@ -10,7 +10,7 @@ * the libc and compiler builtin ffs routines, therefore * differs in spirit from ffz (man ffs). 
*/ -static inline int generic_ffs(int x) +static inline __attribute_const__ int generic_ffs(int x) { int r = 1; diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 26f3ce1dd6e448..8eed3437edb95c 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int generic_fls(unsigned int x) +static __always_inline __attribute_const__ int generic_fls(unsigned int x) { int r = 32; diff --git a/include/asm-generic/bitops/fls64.h b/include/asm-generic/bitops/fls64.h index 866f2b2304ff63..b5f58dd261a370 100644 --- a/include/asm-generic/bitops/fls64.h +++ b/include/asm-generic/bitops/fls64.h @@ -16,7 +16,7 @@ * at position 64. */ #if BITS_PER_LONG == 32 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { __u32 h = x >> 32; if (h) @@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x) return fls(x); } #elif BITS_PER_LONG == 64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { if (x == 0) return 0; diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h new file mode 100644 index 00000000000000..ee3793e9b1a4dc --- /dev/null +++ b/include/asm-generic/thread_info_tif.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_ +#define _ASM_GENERIC_THREAD_INFO_TIF_H_ + +#include + +/* Bits 16-31 are reserved for architecture specific purposes */ + +#define TIF_NOTIFY_RESUME 0 // callback before returning to user +#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) + +#define TIF_SIGPENDING 1 // signal pending +#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) + +#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) + +#define TIF_MEMDIE 3 // is terminating due to OOM killer +#define _TIF_MEMDIE BIT(TIF_MEMDIE) + +#define TIF_NEED_RESCHED 4 // rescheduling necessary +#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) + +#ifdef HAVE_TIF_NEED_RESCHED_LAZY +# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed +# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) +#endif + +#ifdef HAVE_TIF_POLLING_NRFLAG +# define TIF_POLLING_NRFLAG 6 // idle is polling for TIF_NEED_RESCHED +# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG) +#endif + +#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return +#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY) + +#define TIF_UPROBE 8 // breakpointed or singlestepping +#define _TIF_UPROBE BIT(TIF_UPROBE) + +#define TIF_PATCH_PENDING 9 // pending live patching update +#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) + +#ifdef HAVE_TIF_RESTORE_SIGMASK +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() +# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) +#endif + +#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 7fc0b560007dd8..5c6d9799f4e746 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,8 +4,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE - #ifndef __arch_get_vdso_u_time_data static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { @@ -20,8 +18,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo { } #endif
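For reference, the semantics that the bit-ops headers above document (and that the new __attribute_const__ annotations must preserve) can be checked with a stand-alone equivalent built on the same compiler builtins; this harness is illustrative and not part of the tree:

#include <stdio.h>

/* ffs()/fls() are 1-based and map x == 0 to 0; __ffs()/__fls() are
 * 0-based and undefined at 0, hence the explicit guards here. */
static int my_fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static int my_ffs(unsigned int x)
{
	return x ? __builtin_ctz(x) + 1 : 0;
}

int main(void)
{
	/* The documented anchor values: fls(0)=0, fls(1)=1, fls(0x80000000)=32 */
	printf("%d %d %d\n", my_fls(0), my_fls(1), my_fls(0x80000000u));
	printf("%d %d\n", my_ffs(0), my_ffs(8));	/* 0 4 */
	return 0;
}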
-#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - #ifndef __arch_update_vdso_clock static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc) { diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e9e..a65a87366c48bc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -157,7 +157,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif -#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +#ifndef CONFIG_ARCH_SUPPORTS_CFI /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index ce6521ad04d121..2eda895f19f54c 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -9,9 +9,6 @@ #include #include -#define ARCH_TIMER_TYPE_CP15 BIT(0) -#define ARCH_TIMER_TYPE_MEM BIT(1) - #define ARCH_TIMER_CTRL_ENABLE (1 << 0) #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) @@ -51,8 +48,6 @@ enum arch_timer_spi_nr { #define ARCH_TIMER_PHYS_ACCESS 0 #define ARCH_TIMER_VIRT_ACCESS 1 -#define ARCH_TIMER_MEM_PHYS_ACCESS 2 -#define ARCH_TIMER_MEM_VIRT_ACCESS 3 #define ARCH_TIMER_MEM_MAX_FRAMES 8 diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 91f6b4cf561c76..38e26dff27b00a 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -45,19 +45,11 @@ static inline void chacha20_block(struct chacha_state *state, chacha_block_generic(state, out, 20); } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); -static inline void hchacha_block(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - hchacha_block_arch(state, out, nrounds); - else - hchacha_block_generic(state, out, nrounds); -} +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, @@ -93,20 +85,8 @@ static inline void chacha_init(struct chacha_state *state, state->x[15] = get_unaligned_le32(iv + 12); } -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); - -static inline void chacha_crypt(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) - chacha_crypt_arch(state, dst, src, bytes, nrounds); - else - chacha_crypt_generic(state, dst, src, bytes, nrounds); -} +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); static inline void chacha20_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes) @@ -119,13 +99,4 @@ static inline void chacha_zeroize_state(struct chacha_state *state) memzero_explicit(state, sizeof(*state)); } -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA) -bool chacha_is_arch_optimized(void); -#else -static inline bool chacha_is_arch_optimized(void) -{ - return false; -} -#endif - #endif /* _CRYPTO_CHACHA_H */ diff --git 
a/include/crypto/curve25519.h b/include/crypto/curve25519.h index ece6a9b5fafc88..db63a5577c0045 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -6,7 +6,6 @@ #ifndef CURVE25519_H #define CURVE25519_H -#include // For crypto_memneq. #include #include @@ -14,49 +13,16 @@ enum curve25519_lengths { CURVE25519_KEY_SIZE = 32 }; -extern const u8 curve25519_null_point[]; -extern const u8 curve25519_base_point[]; - void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], const u8 scalar[CURVE25519_KEY_SIZE], const u8 point[CURVE25519_KEY_SIZE]); -void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]); - -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]); - -bool curve25519_selftest(void); - -static inline bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_arch(mypublic, secret, basepoint); - else - curve25519_generic(mypublic, secret, basepoint); - return crypto_memneq(mypublic, curve25519_null_point, - CURVE25519_KEY_SIZE); -} - -static inline bool -__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) -{ - if (unlikely(!crypto_memneq(secret, curve25519_null_point, - CURVE25519_KEY_SIZE))) - return false; + const u8 basepoint[CURVE25519_KEY_SIZE]); - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) - curve25519_base_arch(pub, secret); - else - curve25519_generic(pub, secret, curve25519_base_point); - return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); -} +bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]); static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) { diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index f7b3b93f3a49a7..107b797c33ecf7 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -135,6 +135,7 @@ struct af_alg_async_req { * SG? * @enc: Cryptographic operation to be performed when * recvmsg is invoked. + * @write: True if we are in the middle of a write. * @init: True if metadata has been sent. * @len: Length of memory allocated for this data structure. * @inflight: Non-zero when AIO requests are in flight. @@ -151,10 +152,11 @@ struct af_alg_ctx { size_t used; atomic_t rcvused; - bool more; - bool merge; - bool enc; - bool init; + bool more:1, + merge:1, + enc:1, + write:1, + init:1; unsigned int len; diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h deleted file mode 100644 index 506d56530ca93f..00000000000000 --- a/include/crypto/internal/blake2s.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2s implementations. - * Keep this in sync with the corresponding BLAKE2b header. 
- */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2S_H -#define _CRYPTO_INTERNAL_BLAKE2S_H - -#include -#include - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc); - -bool blake2s_selftest(void); - -#endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index c60315f475623f..a72fff409ab852 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -30,12 +30,13 @@ void poly1305_core_blocks(struct poly1305_state *state, void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst); -void poly1305_block_init_arch(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_block_init_generic(struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit); +static inline void +poly1305_block_init_generic(struct poly1305_block_state *desc, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_core_init(&desc->h); + poly1305_core_setkey(&desc->core_r, raw_key); +} static inline void poly1305_blocks_generic(struct poly1305_block_state *state, const u8 *src, unsigned int len, @@ -45,9 +46,6 @@ static inline void poly1305_blocks_generic(struct poly1305_block_state *state, len / POLY1305_BLOCK_SIZE, padbit); } -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]); - static inline void poly1305_emit_generic(const struct poly1305_state *state, u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 28ee533a0507a7..c9aa5c3abc5324 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -7,6 +7,7 @@ #define MD5_DIGEST_SIZE 16 #define MD5_HMAC_BLOCK_SIZE 64 +#define MD5_BLOCK_SIZE 64 #define MD5_BLOCK_WORDS 16 #define MD5_HASH_WORDS 4 #define MD5_STATE_SIZE 24 @@ -27,4 +28,182 @@ struct md5_state { u32 block[MD5_BLOCK_WORDS]; }; -#endif +/* State for the MD5 compression function */ +struct md5_block_state { + u32 h[MD5_HASH_WORDS]; +}; + +/** + * struct md5_ctx - Context for hashing a message with MD5 + * @state: the compression function state + * @bytecount: number of bytes processed so far + * @buf: partial block buffer; bytecount % MD5_BLOCK_SIZE bytes are valid + */ +struct md5_ctx { + struct md5_block_state state; + u64 bytecount; + u8 buf[MD5_BLOCK_SIZE] __aligned(__alignof__(__le64)); +}; + +/** + * md5_init() - Initialize an MD5 context for a new message + * @ctx: the context to initialize + * + * If you don't need incremental computation, consider md5() instead. + * + * Context: Any context. + */ +void md5_init(struct md5_ctx *ctx); + +/** + * md5_update() - Update an MD5 context with message data + * @ctx: the context to update; must have been initialized + * @data: the message data + * @len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len); + +/** + * md5_final() - Finish computing an MD5 message digest + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting MD5 message digest + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. 
+ * + * Context: Any context. + */ +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); + +/** + * md5() - Compute MD5 message digest in one shot + * @data: the message data + * @len: the data length in bytes + * @out: (output) the resulting MD5 message digest + * + * Context: Any context. + */ +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]); + +/** + * struct hmac_md5_key - Prepared key for HMAC-MD5 + * @istate: private + * @ostate: private + */ +struct hmac_md5_key { + struct md5_block_state istate; + struct md5_block_state ostate; +}; + +/** + * struct hmac_md5_ctx - Context for computing HMAC-MD5 of a message + * @hash_ctx: private + * @ostate: private + */ +struct hmac_md5_ctx { + struct md5_ctx hash_ctx; + struct md5_block_state ostate; +}; + +/** + * hmac_md5_preparekey() - Prepare a key for HMAC-MD5 + * @key: (output) the key structure to initialize + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * Note: the caller is responsible for zeroizing both the struct hmac_md5_key + * and the raw key once they are no longer needed. + * + * Context: Any context. + */ +void hmac_md5_preparekey(struct hmac_md5_key *key, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_md5_init() - Initialize an HMAC-MD5 context for a new message + * @ctx: (output) the HMAC context to initialize + * @key: the prepared HMAC key + * + * If you don't need incremental computation, consider hmac_md5() instead. + * + * Context: Any context. + */ +void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key); + +/** + * hmac_md5_init_usingrawkey() - Initialize an HMAC-MD5 context for a new + * message, using a raw key + * @ctx: (output) the HMAC context to initialize + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. All key lengths are supported. + * + * If you don't need incremental computation, consider hmac_md5_usingrawkey() + * instead. + * + * Context: Any context. + */ +void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx, + const u8 *raw_key, size_t raw_key_len); + +/** + * hmac_md5_update() - Update an HMAC-MD5 context with message data + * @ctx: the HMAC context to update; must have been initialized + * @data: the message data + * @data_len: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +static inline void hmac_md5_update(struct hmac_md5_ctx *ctx, + const u8 *data, size_t data_len) +{ + md5_update(&ctx->hash_ctx, data, data_len); +} + +/** + * hmac_md5_final() - Finish computing an HMAC-MD5 value + * @ctx: the HMAC context to finalize; must have been initialized + * @out: (output) the resulting HMAC-MD5 value + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); + +/** + * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key + * @key: the prepared HMAC key + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-MD5 value + * + * If you're using the key only once, consider using hmac_md5_usingrawkey(). + * + * Context: Any context. + */ +void hmac_md5(const struct hmac_md5_key *key, + const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]); + +/** + * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key + * @raw_key: the raw HMAC-MD5 key + * @raw_key_len: the key length in bytes. 
All key lengths are supported. + * @data: the message data + * @data_len: the data length in bytes + * @out: (output) the resulting HMAC-MD5 value + * + * If you're using the key multiple times, prefer to use hmac_md5_preparekey() + * followed by multiple calls to hmac_md5() instead. + * + * Context: Any context. + */ +void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[MD5_DIGEST_SIZE]); + +#endif /* _CRYPTO_MD5_H */ diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index e54abda8cfe95e..d4daeec8da19d8 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -64,13 +64,4 @@ void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest); -#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305) -bool poly1305_is_arch_optimized(void); -#else -static inline bool poly1305_is_arch_optimized(void) -{ - return false; -} -#endif - #endif diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index 15e461e568cca6..e5dafb935cc88b 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -375,6 +375,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); */ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +/** + * sha256_finup_2x() - Compute two SHA-256 digests from a common initial + * context. On some CPUs, this is faster than sequentially + * computing each digest. + * @ctx: an optional initial context, which may have already processed data. If + * NULL, a default initial context is used (equivalent to sha256_init()). + * @data1: data for the first message + * @data2: data for the second message + * @len: the length of each of @data1 and @data2, in bytes + * @out1: (output) the first SHA-256 message digest + * @out2: (output) the second SHA-256 message digest + * + * Context: Any context. + */ +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +/** + * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real + * interleaved implementation, as opposed to a + * sequential fallback + * @return: true if optimized + * + * Context: Any context. 
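Putting the kernel-doc above together, a hypothetical in-kernel caller of the new MD5 helpers and of sha256_finup_2x() might look like this (all data and key variables are placeholders, error handling omitted, and an even @len is assumed for the split):

#include <crypto/md5.h>
#include <crypto/sha2.h>
#include <linux/types.h>

/* Illustrative use of the documented one-shot and incremental interfaces. */
static void demo_hashes(const u8 *msg, size_t len,
			const u8 *raw_key, size_t raw_key_len)
{
	u8 digest[MD5_DIGEST_SIZE];
	u8 out1[SHA256_DIGEST_SIZE], out2[SHA256_DIGEST_SIZE];
	struct md5_ctx mctx;
	struct sha256_ctx sctx;

	md5(msg, len, digest);				/* one-shot */

	md5_init(&mctx);				/* incremental */
	md5_update(&mctx, msg, len / 2);
	md5_update(&mctx, msg + len / 2, len - len / 2);
	md5_final(&mctx, digest);			/* zeroizes mctx */

	hmac_md5_usingrawkey(raw_key, raw_key_len, msg, len, digest);

	/* Two digests sharing one processed prefix: hash the common part
	 * once, then finish both messages in a single interleaved pass. */
	sha256_init(&sctx);
	sha256_update(&sctx, msg, len / 2);
	sha256_finup_2x(&sctx, msg, msg + len / 2, len / 2, out1, out2);
}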
+ */ +bool sha256_finup_2x_is_optimized(void); + /** * struct hmac_sha256_key - Prepared key for HMAC-SHA256 * @key: private diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h index 7ecc4f0b235aac..0c2ce81a874487 100644 --- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G077_CLK_PCLKM 13 #define R9A09G077_CLK_PCLKL 14 #define R9A09G077_SDHI_CLKHS 15 +#define R9A09G077_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */ diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h index 925e57703925dd..70ee883f2386b4 100644 --- a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h +++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h @@ -25,5 +25,6 @@ #define R9A09G087_CLK_PCLKM 13 #define R9A09G087_CLK_PCLKL 14 #define R9A09G087_SDHI_CLKHS 15 +#define R9A09G087_USB_CLK 16 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */ diff --git a/include/dt-bindings/gpio/tegra256-gpio.h b/include/dt-bindings/gpio/tegra256-gpio.h new file mode 100644 index 00000000000000..a0353a302aeb41 --- /dev/null +++ b/include/dt-bindings/gpio/tegra256-gpio.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. */ + +/* + * This header provides constants for the nvidia,tegra256-gpio DT binding. + * + * The first cell in Tegra's GPIO specifier is the GPIO ID. + * The macros below provide names for this. + * + * The second cell contains standard flag values specified in gpio.h. + */ + +#ifndef _DT_BINDINGS_GPIO_TEGRA256_GPIO_H +#define _DT_BINDINGS_GPIO_TEGRA256_GPIO_H + +#include + +/* GPIOs implemented by main GPIO controller */ +#define TEGRA256_MAIN_GPIO_PORT_A 0 +#define TEGRA256_MAIN_GPIO_PORT_B 1 +#define TEGRA256_MAIN_GPIO_PORT_C 2 +#define TEGRA256_MAIN_GPIO_PORT_D 3 + +#define TEGRA256_MAIN_GPIO(port, offset) \ + ((TEGRA256_MAIN_GPIO_PORT_##port * 8) + (offset)) + +#endif + diff --git a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h index f315d5a7f5eef7..7dd04424afcce8 100644 --- a/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h +++ b/include/dt-bindings/interrupt-controller/aspeed-scu-ic.h @@ -20,4 +20,18 @@ #define ASPEED_AST2600_SCU_IC1_LPC_RESET_LO_TO_HI 0 #define ASPEED_AST2600_SCU_IC1_LPC_RESET_HI_TO_LO 1 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC0_PCIE_PERST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC1_PCIE_RCRST_HI_TO_LO 2 + +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_LO_TO_HI 3 +#define ASPEED_AST2700_SCU_IC2_PCIE_PERST_HI_TO_LO 2 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC2_LPC_RESET_HI_TO_LO 0 + +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_LO_TO_HI 1 +#define ASPEED_AST2700_SCU_IC3_LPC_RESET_HI_TO_LO 0 + #endif /* _DT_BINDINGS_INTERRUPT_CONTROLLER_ASPEED_SCU_IC_H_ */ diff --git a/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h new file mode 100644 index 00000000000000..f088793f23eedc --- /dev/null +++ b/include/dt-bindings/pinctrl/renesas,r9a09g077-pinctrl.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for Renesas 
RZ/T2H family pinctrl bindings. + * + * Copyright (C) 2025 Renesas Electronics Corp. + */ + +#ifndef __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ +#define __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ + +#define RZT2H_PINS_PER_PORT 8 + +/* + * Create the pin index from its bank and position numbers and store in + * the upper 16 bits the alternate function identifier + */ +#define RZT2H_PORT_PINMUX(b, p, f) ((b) * RZT2H_PINS_PER_PORT + (p) | ((f) << 16)) + +/* Convert a port and pin label to its global pin index */ +#define RZT2H_GPIO(port, pin) ((port) * RZT2H_PINS_PER_PORT + (pin)) + +#endif /* __DT_BINDINGS_PINCTRL_RENESAS_R9A09G077_PINCTRL_H__ */ diff --git a/include/dt-bindings/power/amlogic,s6-pwrc.h b/include/dt-bindings/power/amlogic,s6-pwrc.h new file mode 100644 index 00000000000000..2c005864ae73fa --- /dev/null +++ b/include/dt-bindings/power/amlogic,s6-pwrc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S6_POWER_H +#define _DT_BINDINGS_AMLOGIC_S6_POWER_H + +#define PWRC_S6_DSPA_ID 0 +#define PWRC_S6_DOS_HEVC_ID 1 +#define PWRC_S6_DOS_VDEC_ID 2 +#define PWRC_S6_VPU_HDMI_ID 3 +#define PWRC_S6_U2DRD_ID 4 +#define PWRC_S6_U3DRD_ID 5 +#define PWRC_S6_SD_EMMC_C_ID 6 +#define PWRC_S6_GE2D_ID 7 +#define PWRC_S6_AMFC_ID 8 +#define PWRC_S6_VC9000E_ID 9 +#define PWRC_S6_DEWARP_ID 10 +#define PWRC_S6_VICP_ID 11 +#define PWRC_S6_SD_EMMC_A_ID 12 +#define PWRC_S6_SD_EMMC_B_ID 13 +#define PWRC_S6_ETH_ID 14 +#define PWRC_S6_PCIE_ID 15 +#define PWRC_S6_NNA_4T_ID 16 +#define PWRC_S6_AUDIO_ID 17 +#define PWRC_S6_AUCPU_ID 18 +#define PWRC_S6_ADAPT_ID 19 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7-pwrc.h b/include/dt-bindings/power/amlogic,s7-pwrc.h new file mode 100644 index 00000000000000..3f21d095f784ef --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7-pwrc.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7_POWER_H + +#define PWRC_S7_DOS_HEVC_ID 0 +#define PWRC_S7_DOS_VDEC_ID 1 +#define PWRC_S7_VPU_HDMI_ID 2 +#define PWRC_S7_USB_COMB_ID 3 +#define PWRC_S7_SD_EMMC_C_ID 4 +#define PWRC_S7_GE2D_ID 5 +#define PWRC_S7_SD_EMMC_A_ID 6 +#define PWRC_S7_SD_EMMC_B_ID 7 +#define PWRC_S7_ETH_ID 8 +#define PWRC_S7_AUCPU_ID 9 +#define PWRC_S7_AUDIO_ID 10 + +#endif diff --git a/include/dt-bindings/power/amlogic,s7d-pwrc.h b/include/dt-bindings/power/amlogic,s7d-pwrc.h new file mode 100644 index 00000000000000..c6998553670aae --- /dev/null +++ b/include/dt-bindings/power/amlogic,s7d-pwrc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (C) 2025 Amlogic, Inc. 
All rights reserved + */ +#ifndef _DT_BINDINGS_AMLOGIC_S7D_POWER_H +#define _DT_BINDINGS_AMLOGIC_S7D_POWER_H + +#define PWRC_S7D_DOS_HCODEC_ID 0 +#define PWRC_S7D_DOS_HEVC_ID 1 +#define PWRC_S7D_DOS_VDEC_ID 2 +#define PWRC_S7D_VPU_HDMI_ID 3 +#define PWRC_S7D_USB_U2DRD_ID 4 +#define PWRC_S7D_USB_U2H_ID 5 +#define PWRC_S7D_SSD_EMMC_C_ID 6 +#define PWRC_S7D_GE2D_ID 7 +#define PWRC_S7D_AMFC_ID 8 +#define PWRC_S7D_EMMC_A_ID 9 +#define PWRC_S7D_EMMC_B_ID 10 +#define PWRC_S7D_ETH_ID 11 +#define PWRC_S7D_AUCPU_ID 12 +#define PWRC_S7D_AUDIO_ID 13 +#define PWRC_S7D_SRAMA_ID 14 +#define PWRC_S7D_DMC0_ID 15 +#define PWRC_S7D_DMC1_ID 16 +#define PWRC_S7D_DDR_ID 17 + +#endif diff --git a/include/dt-bindings/power/marvell,pxa1908-power.h b/include/dt-bindings/power/marvell,pxa1908-power.h new file mode 100644 index 00000000000000..19b088351af138 --- /dev/null +++ b/include/dt-bindings/power/marvell,pxa1908-power.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +/* + * Marvell PXA1908 power domains + * + * Copyright 2025, Duje Mihanović + */ + +#ifndef __DTS_MARVELL_PXA1908_POWER_H +#define __DTS_MARVELL_PXA1908_POWER_H + +#define PXA1908_POWER_DOMAIN_VPU 0 +#define PXA1908_POWER_DOMAIN_GPU 1 +#define PXA1908_POWER_DOMAIN_GPU2D 2 +#define PXA1908_POWER_DOMAIN_DSI 3 +#define PXA1908_POWER_DOMAIN_ISP 4 + +#endif diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h index e54ffa3614515c..73cceb88953f70 100644 --- a/include/dt-bindings/power/qcom,rpmhpd.h +++ b/include/dt-bindings/power/qcom,rpmhpd.h @@ -29,4 +29,237 @@ #define RPMHPD_NSP2 19 #define RPMHPD_GMXC 20 +/* RPMh Power Domain performance levels */ +#define RPMH_REGULATOR_LEVEL_RETENTION 16 +#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 +#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 +#define RPMH_REGULATOR_LEVEL_SVS 128 +#define RPMH_REGULATOR_LEVEL_SVS_L0 144 +#define RPMH_REGULATOR_LEVEL_SVS_L1 192 +#define RPMH_REGULATOR_LEVEL_SVS_L2 224 +#define RPMH_REGULATOR_LEVEL_NOM 256 +#define RPMH_REGULATOR_LEVEL_NOM_L0 288 +#define RPMH_REGULATOR_LEVEL_NOM_L1 320 +#define RPMH_REGULATOR_LEVEL_NOM_L2 336 +#define RPMH_REGULATOR_LEVEL_TURBO 384 +#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 +#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 +#define RPMH_REGULATOR_LEVEL_TURBO_L2 432 +#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 +#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 +#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 +#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMHPD_* above. 
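The per-SoC index lists that follow are frozen ABI: existing devicetrees encode these numbers directly, so aliases such as SA8155P_CX must keep expanding to the same values. A throwaway compile-time check of that invariant (hypothetical, not part of the tree; values taken from this hunk) could read:

#include <dt-bindings/power/qcom,rpmhpd.h>
#include <linux/build_bug.h>

/* Hypothetical guard: legacy aliases must stay numerically stable. */
static_assert(SA8155P_CX == SM8150_CX && SM8150_CX == 7,
	      "qcom,rpmhpd: legacy power domain indexes must not change");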
+ */ + +/* SA8775P Power Domain Indexes */ +#define SA8775P_CX 0 +#define SA8775P_CX_AO 1 +#define SA8775P_DDR 2 +#define SA8775P_EBI 3 +#define SA8775P_GFX 4 +#define SA8775P_LCX 5 +#define SA8775P_LMX 6 +#define SA8775P_MMCX 7 +#define SA8775P_MMCX_AO 8 +#define SA8775P_MSS 9 +#define SA8775P_MX 10 +#define SA8775P_MX_AO 11 +#define SA8775P_MXC 12 +#define SA8775P_MXC_AO 13 +#define SA8775P_NSP0 14 +#define SA8775P_NSP1 15 +#define SA8775P_XO 16 + +/* SDM670 Power Domain Indexes */ +#define SDM670_MX 0 +#define SDM670_MX_AO 1 +#define SDM670_CX 2 +#define SDM670_CX_AO 3 +#define SDM670_LMX 4 +#define SDM670_LCX 5 +#define SDM670_GFX 6 +#define SDM670_MSS 7 + +/* SDM845 Power Domain Indexes */ +#define SDM845_EBI 0 +#define SDM845_MX 1 +#define SDM845_MX_AO 2 +#define SDM845_CX 3 +#define SDM845_CX_AO 4 +#define SDM845_LMX 5 +#define SDM845_LCX 6 +#define SDM845_GFX 7 +#define SDM845_MSS 8 + +/* SDX55 Power Domain Indexes */ +#define SDX55_MSS 0 +#define SDX55_MX 1 +#define SDX55_CX 2 + +/* SDX65 Power Domain Indexes */ +#define SDX65_MSS 0 +#define SDX65_MX 1 +#define SDX65_MX_AO 2 +#define SDX65_CX 3 +#define SDX65_CX_AO 4 +#define SDX65_MXC 5 + +/* SM6350 Power Domain Indexes */ +#define SM6350_CX 0 +#define SM6350_GFX 1 +#define SM6350_LCX 2 +#define SM6350_LMX 3 +#define SM6350_MSS 4 +#define SM6350_MX 5 + +/* SM8150 Power Domain Indexes */ +#define SM8150_MSS 0 +#define SM8150_EBI 1 +#define SM8150_LMX 2 +#define SM8150_LCX 3 +#define SM8150_GFX 4 +#define SM8150_MX 5 +#define SM8150_MX_AO 6 +#define SM8150_CX 7 +#define SM8150_CX_AO 8 +#define SM8150_MMCX 9 +#define SM8150_MMCX_AO 10 + +/* SA8155P is a special case, kept for backwards compatibility */ +#define SA8155P_CX SM8150_CX +#define SA8155P_CX_AO SM8150_CX_AO +#define SA8155P_EBI SM8150_EBI +#define SA8155P_GFX SM8150_GFX +#define SA8155P_MSS SM8150_MSS +#define SA8155P_MX SM8150_MX +#define SA8155P_MX_AO SM8150_MX_AO + +/* SM8250 Power Domain Indexes */ +#define SM8250_CX 0 +#define SM8250_CX_AO 1 +#define SM8250_EBI 2 +#define SM8250_GFX 3 +#define SM8250_LCX 4 +#define SM8250_LMX 5 +#define SM8250_MMCX 6 +#define SM8250_MMCX_AO 7 +#define SM8250_MX 8 +#define SM8250_MX_AO 9 + +/* SM8350 Power Domain Indexes */ +#define SM8350_CX 0 +#define SM8350_CX_AO 1 +#define SM8350_EBI 2 +#define SM8350_GFX 3 +#define SM8350_LCX 4 +#define SM8350_LMX 5 +#define SM8350_MMCX 6 +#define SM8350_MMCX_AO 7 +#define SM8350_MX 8 +#define SM8350_MX_AO 9 +#define SM8350_MXC 10 +#define SM8350_MXC_AO 11 +#define SM8350_MSS 12 + +/* SM8450 Power Domain Indexes */ +#define SM8450_CX 0 +#define SM8450_CX_AO 1 +#define SM8450_EBI 2 +#define SM8450_GFX 3 +#define SM8450_LCX 4 +#define SM8450_LMX 5 +#define SM8450_MMCX 6 +#define SM8450_MMCX_AO 7 +#define SM8450_MX 8 +#define SM8450_MX_AO 9 +#define SM8450_MXC 10 +#define SM8450_MXC_AO 11 +#define SM8450_MSS 12 + +/* SM8550 Power Domain Indexes */ +#define SM8550_CX 0 +#define SM8550_CX_AO 1 +#define SM8550_EBI 2 +#define SM8550_GFX 3 +#define SM8550_LCX 4 +#define SM8550_LMX 5 +#define SM8550_MMCX 6 +#define SM8550_MMCX_AO 7 +#define SM8550_MX 8 +#define SM8550_MX_AO 9 +#define SM8550_MXC 10 +#define SM8550_MXC_AO 11 +#define SM8550_MSS 12 +#define SM8550_NSP 13 + +/* QDU1000/QRU1000 Power Domain Indexes */ +#define QDU1000_EBI 0 +#define QDU1000_MSS 1 +#define QDU1000_CX 2 +#define QDU1000_MX 3 + +/* SC7180 Power Domain Indexes */ +#define SC7180_CX 0 +#define SC7180_CX_AO 1 +#define SC7180_GFX 2 +#define SC7180_MX 3 +#define SC7180_MX_AO 4 +#define SC7180_LMX 5 +#define SC7180_LCX 6 +#define 
SC7180_MSS 7 + +/* SC7280 Power Domain Indexes */ +#define SC7280_CX 0 +#define SC7280_CX_AO 1 +#define SC7280_EBI 2 +#define SC7280_GFX 3 +#define SC7280_MX 4 +#define SC7280_MX_AO 5 +#define SC7280_LMX 6 +#define SC7280_LCX 7 +#define SC7280_MSS 8 + +/* SC8180X Power Domain Indexes */ +#define SC8180X_CX 0 +#define SC8180X_CX_AO 1 +#define SC8180X_EBI 2 +#define SC8180X_GFX 3 +#define SC8180X_LCX 4 +#define SC8180X_LMX 5 +#define SC8180X_MMCX 6 +#define SC8180X_MMCX_AO 7 +#define SC8180X_MSS 8 +#define SC8180X_MX 9 +#define SC8180X_MX_AO 10 + +/* SC8280XP Power Domain Indexes */ +#define SC8280XP_CX 0 +#define SC8280XP_CX_AO 1 +#define SC8280XP_DDR 2 +#define SC8280XP_EBI 3 +#define SC8280XP_GFX 4 +#define SC8280XP_LCX 5 +#define SC8280XP_LMX 6 +#define SC8280XP_MMCX 7 +#define SC8280XP_MMCX_AO 8 +#define SC8280XP_MSS 9 +#define SC8280XP_MX 10 +#define SC8280XP_MXC 12 +#define SC8280XP_MX_AO 11 +#define SC8280XP_NSP 13 +#define SC8280XP_QPHY 14 +#define SC8280XP_XO 15 + #endif diff --git a/include/dt-bindings/power/qcom-rpmpd.h b/include/dt-bindings/power/qcom-rpmpd.h index f15bcee7c9283e..4371ac941f29d9 100644 --- a/include/dt-bindings/power/qcom-rpmpd.h +++ b/include/dt-bindings/power/qcom-rpmpd.h @@ -4,258 +4,39 @@ #ifndef _DT_BINDINGS_POWER_QCOM_RPMPD_H #define _DT_BINDINGS_POWER_QCOM_RPMPD_H -/* SA8775P Power Domain Indexes */ -#define SA8775P_CX 0 -#define SA8775P_CX_AO 1 -#define SA8775P_DDR 2 -#define SA8775P_EBI 3 -#define SA8775P_GFX 4 -#define SA8775P_LCX 5 -#define SA8775P_LMX 6 -#define SA8775P_MMCX 7 -#define SA8775P_MMCX_AO 8 -#define SA8775P_MSS 9 -#define SA8775P_MX 10 -#define SA8775P_MX_AO 11 -#define SA8775P_MXC 12 -#define SA8775P_MXC_AO 13 -#define SA8775P_NSP0 14 -#define SA8775P_NSP1 15 -#define SA8775P_XO 16 - -/* SDM670 Power Domain Indexes */ -#define SDM670_MX 0 -#define SDM670_MX_AO 1 -#define SDM670_CX 2 -#define SDM670_CX_AO 3 -#define SDM670_LMX 4 -#define SDM670_LCX 5 -#define SDM670_GFX 6 -#define SDM670_MSS 7 - -/* SDM845 Power Domain Indexes */ -#define SDM845_EBI 0 -#define SDM845_MX 1 -#define SDM845_MX_AO 2 -#define SDM845_CX 3 -#define SDM845_CX_AO 4 -#define SDM845_LMX 5 -#define SDM845_LCX 6 -#define SDM845_GFX 7 -#define SDM845_MSS 8 - -/* SDX55 Power Domain Indexes */ -#define SDX55_MSS 0 -#define SDX55_MX 1 -#define SDX55_CX 2 - -/* SDX65 Power Domain Indexes */ -#define SDX65_MSS 0 -#define SDX65_MX 1 -#define SDX65_MX_AO 2 -#define SDX65_CX 3 -#define SDX65_CX_AO 4 -#define SDX65_MXC 5 - -/* SM6350 Power Domain Indexes */ -#define SM6350_CX 0 -#define SM6350_GFX 1 -#define SM6350_LCX 2 -#define SM6350_LMX 3 -#define SM6350_MSS 4 -#define SM6350_MX 5 - -/* SM6375 Power Domain Indexes */ -#define SM6375_VDDCX 0 -#define SM6375_VDDCX_AO 1 -#define SM6375_VDDCX_VFL 2 -#define SM6375_VDDMX 3 -#define SM6375_VDDMX_AO 4 -#define SM6375_VDDMX_VFL 5 -#define SM6375_VDDGX 6 -#define SM6375_VDDGX_AO 7 -#define SM6375_VDD_LPI_CX 8 -#define SM6375_VDD_LPI_MX 9 - -/* SM8150 Power Domain Indexes */ -#define SM8150_MSS 0 -#define SM8150_EBI 1 -#define SM8150_LMX 2 -#define SM8150_LCX 3 -#define SM8150_GFX 4 -#define SM8150_MX 5 -#define SM8150_MX_AO 6 -#define SM8150_CX 7 -#define SM8150_CX_AO 8 -#define SM8150_MMCX 9 -#define SM8150_MMCX_AO 10 - -/* SA8155P is a special case, kept for backwards compatibility */ -#define SA8155P_CX SM8150_CX -#define SA8155P_CX_AO SM8150_CX_AO -#define SA8155P_EBI SM8150_EBI -#define SA8155P_GFX SM8150_GFX -#define SA8155P_MSS SM8150_MSS -#define SA8155P_MX SM8150_MX -#define SA8155P_MX_AO SM8150_MX_AO - -/* SM8250 Power 
Domain Indexes */ -#define SM8250_CX 0 -#define SM8250_CX_AO 1 -#define SM8250_EBI 2 -#define SM8250_GFX 3 -#define SM8250_LCX 4 -#define SM8250_LMX 5 -#define SM8250_MMCX 6 -#define SM8250_MMCX_AO 7 -#define SM8250_MX 8 -#define SM8250_MX_AO 9 - -/* SM8350 Power Domain Indexes */ -#define SM8350_CX 0 -#define SM8350_CX_AO 1 -#define SM8350_EBI 2 -#define SM8350_GFX 3 -#define SM8350_LCX 4 -#define SM8350_LMX 5 -#define SM8350_MMCX 6 -#define SM8350_MMCX_AO 7 -#define SM8350_MX 8 -#define SM8350_MX_AO 9 -#define SM8350_MXC 10 -#define SM8350_MXC_AO 11 -#define SM8350_MSS 12 - -/* SM8450 Power Domain Indexes */ -#define SM8450_CX 0 -#define SM8450_CX_AO 1 -#define SM8450_EBI 2 -#define SM8450_GFX 3 -#define SM8450_LCX 4 -#define SM8450_LMX 5 -#define SM8450_MMCX 6 -#define SM8450_MMCX_AO 7 -#define SM8450_MX 8 -#define SM8450_MX_AO 9 -#define SM8450_MXC 10 -#define SM8450_MXC_AO 11 -#define SM8450_MSS 12 - -/* SM8550 Power Domain Indexes */ -#define SM8550_CX 0 -#define SM8550_CX_AO 1 -#define SM8550_EBI 2 -#define SM8550_GFX 3 -#define SM8550_LCX 4 -#define SM8550_LMX 5 -#define SM8550_MMCX 6 -#define SM8550_MMCX_AO 7 -#define SM8550_MX 8 -#define SM8550_MX_AO 9 -#define SM8550_MXC 10 -#define SM8550_MXC_AO 11 -#define SM8550_MSS 12 -#define SM8550_NSP 13 - -/* QDU1000/QRU1000 Power Domain Indexes */ -#define QDU1000_EBI 0 -#define QDU1000_MSS 1 -#define QDU1000_CX 2 -#define QDU1000_MX 3 - -/* SC7180 Power Domain Indexes */ -#define SC7180_CX 0 -#define SC7180_CX_AO 1 -#define SC7180_GFX 2 -#define SC7180_MX 3 -#define SC7180_MX_AO 4 -#define SC7180_LMX 5 -#define SC7180_LCX 6 -#define SC7180_MSS 7 - -/* SC7280 Power Domain Indexes */ -#define SC7280_CX 0 -#define SC7280_CX_AO 1 -#define SC7280_EBI 2 -#define SC7280_GFX 3 -#define SC7280_MX 4 -#define SC7280_MX_AO 5 -#define SC7280_LMX 6 -#define SC7280_LCX 7 -#define SC7280_MSS 8 - -/* SC8180X Power Domain Indexes */ -#define SC8180X_CX 0 -#define SC8180X_CX_AO 1 -#define SC8180X_EBI 2 -#define SC8180X_GFX 3 -#define SC8180X_LCX 4 -#define SC8180X_LMX 5 -#define SC8180X_MMCX 6 -#define SC8180X_MMCX_AO 7 -#define SC8180X_MSS 8 -#define SC8180X_MX 9 -#define SC8180X_MX_AO 10 - -/* SC8280XP Power Domain Indexes */ -#define SC8280XP_CX 0 -#define SC8280XP_CX_AO 1 -#define SC8280XP_DDR 2 -#define SC8280XP_EBI 3 -#define SC8280XP_GFX 4 -#define SC8280XP_LCX 5 -#define SC8280XP_LMX 6 -#define SC8280XP_MMCX 7 -#define SC8280XP_MMCX_AO 8 -#define SC8280XP_MSS 9 -#define SC8280XP_MX 10 -#define SC8280XP_MXC 12 -#define SC8280XP_MX_AO 11 -#define SC8280XP_NSP 13 -#define SC8280XP_QPHY 14 -#define SC8280XP_XO 15 - -/* SDM845 Power Domain performance levels */ -#define RPMH_REGULATOR_LEVEL_RETENTION 16 -#define RPMH_REGULATOR_LEVEL_MIN_SVS 48 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 -#define RPMH_REGULATOR_LEVEL_LOW_SVS 64 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 -#define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 -#define RPMH_REGULATOR_LEVEL_SVS 128 -#define RPMH_REGULATOR_LEVEL_SVS_L0 144 -#define RPMH_REGULATOR_LEVEL_SVS_L1 192 -#define RPMH_REGULATOR_LEVEL_SVS_L2 224 -#define RPMH_REGULATOR_LEVEL_NOM 256 -#define RPMH_REGULATOR_LEVEL_NOM_L0 288 -#define RPMH_REGULATOR_LEVEL_NOM_L1 320 -#define RPMH_REGULATOR_LEVEL_NOM_L2 336 -#define RPMH_REGULATOR_LEVEL_TURBO 384 -#define RPMH_REGULATOR_LEVEL_TURBO_L0 400 -#define RPMH_REGULATOR_LEVEL_TURBO_L1 416 -#define 
RPMH_REGULATOR_LEVEL_TURBO_L2 432 -#define RPMH_REGULATOR_LEVEL_TURBO_L3 448 -#define RPMH_REGULATOR_LEVEL_TURBO_L4 452 -#define RPMH_REGULATOR_LEVEL_TURBO_L5 456 -#define RPMH_REGULATOR_LEVEL_SUPER_TURBO 464 -#define RPMH_REGULATOR_LEVEL_SUPER_TURBO_NO_CPR 480 +#include + +/* Generic RPM Power Domain Indexes */ +#define RPMPD_VDDCX 0 +#define RPMPD_VDDCX_AO 1 +/* VFC and VFL are mutually exclusive and can not be present on the same platform */ +#define RPMPD_VDDCX_VFC 2 +#define RPMPD_VDDCX_VFL 2 +#define RPMPD_VDDMX 3 +#define RPMPD_VDDMX_AO 4 +#define RPMPD_VDDMX_VFL 5 +#define RPMPD_SSCCX 6 +#define RPMPD_SSCCX_VFL 7 +#define RPMPD_SSCMX 8 +#define RPMPD_SSCMX_VFL 9 + +/* + * Platform-specific power domain bindings. Don't add new entries here, use + * RPMPD_* above. + */ /* MDM9607 Power Domains */ -#define MDM9607_VDDCX 0 -#define MDM9607_VDDCX_AO 1 -#define MDM9607_VDDCX_VFL 2 -#define MDM9607_VDDMX 3 -#define MDM9607_VDDMX_AO 4 -#define MDM9607_VDDMX_VFL 5 +#define MDM9607_VDDCX RPMPD_VDDCX +#define MDM9607_VDDCX_AO RPMPD_VDDCX_AO +#define MDM9607_VDDCX_VFL RPMPD_VDDCX_VFL +#define MDM9607_VDDMX RPMPD_VDDMX +#define MDM9607_VDDMX_AO RPMPD_VDDMX_AO +#define MDM9607_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8226 Power Domain Indexes */ -#define MSM8226_VDDCX 0 -#define MSM8226_VDDCX_AO 1 -#define MSM8226_VDDCX_VFC 2 +#define MSM8226_VDDCX RPMPD_VDDCX +#define MSM8226_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8226_VDDCX_VFC RPMPD_VDDCX_VFC /* MSM8939 Power Domains */ #define MSM8939_VDDMDCX 0 @@ -268,11 +49,11 @@ #define MSM8939_VDDMX_AO 7 /* MSM8916 Power Domain Indexes */ -#define MSM8916_VDDCX 0 -#define MSM8916_VDDCX_AO 1 -#define MSM8916_VDDCX_VFC 2 -#define MSM8916_VDDMX 3 -#define MSM8916_VDDMX_AO 4 +#define MSM8916_VDDCX RPMPD_VDDCX +#define MSM8916_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8916_VDDCX_VFC RPMPD_VDDCX_VFC +#define MSM8916_VDDMX RPMPD_VDDMX +#define MSM8916_VDDMX_AO RPMPD_VDDMX_AO /* MSM8909 Power Domain Indexes */ #define MSM8909_VDDCX MSM8916_VDDCX @@ -282,11 +63,11 @@ #define MSM8909_VDDMX_AO MSM8916_VDDMX_AO /* MSM8917 Power Domain Indexes */ -#define MSM8917_VDDCX 0 -#define MSM8917_VDDCX_AO 1 -#define MSM8917_VDDCX_VFL 2 -#define MSM8917_VDDMX 3 -#define MSM8917_VDDMX_AO 4 +#define MSM8917_VDDCX RPMPD_VDDCX +#define MSM8917_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8917_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8917_VDDMX RPMPD_VDDMX +#define MSM8917_VDDMX_AO RPMPD_VDDMX_AO /* MSM8937 Power Domain Indexes */ #define MSM8937_VDDCX MSM8917_VDDCX @@ -319,12 +100,12 @@ #define MSM8974_VDDGFX_VFC 4 /* MSM8976 Power Domain Indexes */ -#define MSM8976_VDDCX 0 -#define MSM8976_VDDCX_AO 1 -#define MSM8976_VDDCX_VFL 2 -#define MSM8976_VDDMX 3 -#define MSM8976_VDDMX_AO 4 -#define MSM8976_VDDMX_VFL 5 +#define MSM8976_VDDCX RPMPD_VDDCX +#define MSM8976_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8976_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8976_VDDMX RPMPD_VDDMX +#define MSM8976_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8976_VDDMX_VFL RPMPD_VDDMX_VFL /* MSM8994 Power Domain Indexes */ #define MSM8994_VDDCX 0 @@ -345,16 +126,26 @@ #define MSM8996_VDDSSCX_VFC 6 /* MSM8998 Power Domain Indexes */ -#define MSM8998_VDDCX 0 -#define MSM8998_VDDCX_AO 1 -#define MSM8998_VDDCX_VFL 2 -#define MSM8998_VDDMX 3 -#define MSM8998_VDDMX_AO 4 -#define MSM8998_VDDMX_VFL 5 -#define MSM8998_SSCCX 6 -#define MSM8998_SSCCX_VFL 7 -#define MSM8998_SSCMX 8 -#define MSM8998_SSCMX_VFL 9 +#define MSM8998_VDDCX RPMPD_VDDCX +#define MSM8998_VDDCX_AO RPMPD_VDDCX_AO +#define MSM8998_VDDCX_VFL RPMPD_VDDCX_VFL +#define MSM8998_VDDMX RPMPD_VDDMX 
+#define MSM8998_VDDMX_AO RPMPD_VDDMX_AO +#define MSM8998_VDDMX_VFL RPMPD_VDDMX_VFL +#define MSM8998_SSCCX RPMPD_SSCCX +#define MSM8998_SSCCX_VFL RPMPD_SSCCX_VFL +#define MSM8998_SSCMX RPMPD_SSCMX +#define MSM8998_SSCMX_VFL RPMPD_SSCMX_VFL + +/* QCM2290 Power Domains */ +#define QCM2290_VDDCX 0 +#define QCM2290_VDDCX_AO 1 +#define QCM2290_VDDCX_VFL 2 +#define QCM2290_VDDMX 3 +#define QCM2290_VDDMX_AO 4 +#define QCM2290_VDDMX_VFL 5 +#define QCM2290_VDD_LPI_CX 6 +#define QCM2290_VDD_LPI_MX 7 /* QCS404 Power Domains */ #define QCS404_VDDMX 0 @@ -366,16 +157,16 @@ #define QCS404_LPIMX_VFL 6 /* SDM660 Power Domains */ -#define SDM660_VDDCX 0 -#define SDM660_VDDCX_AO 1 -#define SDM660_VDDCX_VFL 2 -#define SDM660_VDDMX 3 -#define SDM660_VDDMX_AO 4 -#define SDM660_VDDMX_VFL 5 -#define SDM660_SSCCX 6 -#define SDM660_SSCCX_VFL 7 -#define SDM660_SSCMX 8 -#define SDM660_SSCMX_VFL 9 +#define SDM660_VDDCX RPMPD_VDDCX +#define SDM660_VDDCX_AO RPMPD_VDDCX_AO +#define SDM660_VDDCX_VFL RPMPD_VDDCX_VFL +#define SDM660_VDDMX RPMPD_VDDMX +#define SDM660_VDDMX_AO RPMPD_VDDMX_AO +#define SDM660_VDDMX_VFL RPMPD_VDDMX_VFL +#define SDM660_SSCCX RPMPD_SSCCX +#define SDM660_SSCCX_VFL RPMPD_SSCCX_VFL +#define SDM660_SSCMX RPMPD_SSCMX +#define SDM660_SSCMX_VFL RPMPD_SSCMX_VFL /* SM6115 Power Domains */ #define SM6115_VDDCX 0 @@ -388,22 +179,24 @@ #define SM6115_VDD_LPI_MX 7 /* SM6125 Power Domains */ -#define SM6125_VDDCX 0 -#define SM6125_VDDCX_AO 1 -#define SM6125_VDDCX_VFL 2 -#define SM6125_VDDMX 3 -#define SM6125_VDDMX_AO 4 -#define SM6125_VDDMX_VFL 5 +#define SM6125_VDDCX RPMPD_VDDCX +#define SM6125_VDDCX_AO RPMPD_VDDCX_AO +#define SM6125_VDDCX_VFL RPMPD_VDDCX_VFL +#define SM6125_VDDMX RPMPD_VDDMX +#define SM6125_VDDMX_AO RPMPD_VDDMX_AO +#define SM6125_VDDMX_VFL RPMPD_VDDMX_VFL -/* QCM2290 Power Domains */ -#define QCM2290_VDDCX 0 -#define QCM2290_VDDCX_AO 1 -#define QCM2290_VDDCX_VFL 2 -#define QCM2290_VDDMX 3 -#define QCM2290_VDDMX_AO 4 -#define QCM2290_VDDMX_VFL 5 -#define QCM2290_VDD_LPI_CX 6 -#define QCM2290_VDD_LPI_MX 7 +/* SM6375 Power Domain Indexes */ +#define SM6375_VDDCX 0 +#define SM6375_VDDCX_AO 1 +#define SM6375_VDDCX_VFL 2 +#define SM6375_VDDMX 3 +#define SM6375_VDDMX_AO 4 +#define SM6375_VDDMX_VFL 5 +#define SM6375_VDDGX 6 +#define SM6375_VDDGX_AO 7 +#define SM6375_VDD_LPI_CX 8 +#define SM6375_VDD_LPI_MX 9 /* RPM SMD Power Domain performance levels */ #define RPM_SMD_LEVEL_RETENTION 16 diff --git a/include/dt-bindings/thermal/tegra114-soctherm.h b/include/dt-bindings/thermal/tegra114-soctherm.h new file mode 100644 index 00000000000000..b766a61cd1ce74 --- /dev/null +++ b/include/dt-bindings/thermal/tegra114-soctherm.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for binding nvidia,tegra114-soctherm. 
+ */ + +#ifndef _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H +#define _DT_BINDINGS_THERMAL_TEGRA114_SOCTHERM_H + +#define TEGRA114_SOCTHERM_SENSOR_CPU 0 +#define TEGRA114_SOCTHERM_SENSOR_MEM 1 +#define TEGRA114_SOCTHERM_SENSOR_GPU 2 +#define TEGRA114_SOCTHERM_SENSOR_PLLX 3 + +#define TEGRA114_SOCTHERM_THROT_LEVEL_NONE 0 +#define TEGRA114_SOCTHERM_THROT_LEVEL_LOW 1 +#define TEGRA114_SOCTHERM_THROT_LEVEL_MED 2 +#define TEGRA114_SOCTHERM_THROT_LEVEL_HIGH 3 + +#endif diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h new file mode 100644 index 00000000000000..108e96433ea45b --- /dev/null +++ b/include/kunit/run-in-irq-context.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Helper function for testing code in interrupt contexts + * + * Copyright 2025 Google LLC + */ +#ifndef _KUNIT_RUN_IN_IRQ_CONTEXT_H +#define _KUNIT_RUN_IN_IRQ_CONTEXT_H + +#include +#include +#include +#include + +#define KUNIT_IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) + +struct kunit_irq_test_state { + bool (*func)(void *test_specific_state); + void *test_specific_state; + bool task_func_reported_failure; + bool hardirq_func_reported_failure; + bool softirq_func_reported_failure; + unsigned long hardirq_func_calls; + unsigned long softirq_func_calls; + struct hrtimer timer; + struct work_struct bh_work; +}; + +static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer) +{ + struct kunit_irq_test_state *state = + container_of(timer, typeof(*state), timer); + + WARN_ON_ONCE(!in_hardirq()); + state->hardirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->hardirq_func_reported_failure = true; + + hrtimer_forward_now(&state->timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL); + queue_work(system_bh_wq, &state->bh_work); + return HRTIMER_RESTART; +} + +static void kunit_irq_test_bh_work_func(struct work_struct *work) +{ + struct kunit_irq_test_state *state = + container_of(work, typeof(*state), bh_work); + + WARN_ON_ONCE(!in_serving_softirq()); + state->softirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->softirq_func_reported_failure = true; +} + +/* + * Helper function which repeatedly runs the given @func in task, softirq, and + * hardirq context concurrently, and reports a failure to KUnit if any + * invocation of @func in any context returns false. @func is passed + * @test_specific_state as its argument. At most 3 invocations of @func will + * run concurrently: one in each of task, softirq, and hardirq context. + * + * The main purpose of this interrupt context testing is to validate fallback + * code paths that run in contexts where the normal code path cannot be used, + * typically due to the FPU or vector registers already being in-use in kernel + * mode. These code paths aren't covered when the test code is executed only by + * the KUnit test runner thread in task context. The reason for the concurrency + * is because merely using hardirq context is not sufficient to reach a fallback + * code path on some architectures; the hardirq actually has to occur while the + * FPU or vector unit was already in-use in kernel mode. + * + * Another purpose of this testing is to detect issues with the architecture's + * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, + * especially in softirq context when the softirq may have interrupted a task + * already using kernel-mode FPU or vector (if the arch didn't prevent that). + * Crypto functions are often executed in softirqs, so this is important. 
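A hypothetical KUnit case wiring a check into the helper defined just below; kunit_run_irq_test() and the KUNIT_* assertions come from this header, while the test names and the trivial check body are invented for illustration:

	#include <kunit/run-in-irq-context.h>

	/* Stand-in for real work, e.g. one hash computation plus a compare;
	 * returning false reports a failure for whichever context ran it. */
	static bool demo_check(void *test_specific_state)
	{
		static const u8 expected[4] = { 1, 2, 3, 4 };
		u8 buf[4] = { 1, 2, 3, 4 };

		return !memcmp(buf, expected, sizeof(buf));
	}

	static void demo_irq_context_test(struct kunit *test)
	{
		/* At most 100000 task-context iterations, capped at ~1 second. */
		kunit_run_irq_test(test, demo_check, 100000, NULL);
	}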
+ */ +static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), + int max_iterations, + void *test_specific_state) +{ + struct kunit_irq_test_state state = { + .func = func, + .test_specific_state = test_specific_state, + }; + unsigned long end_jiffies; + + /* + * Set up a hrtimer (the way we access hardirq context) and a work + * struct for the BH workqueue (the way we access softirq context). + */ + hrtimer_setup_on_stack(&state.timer, kunit_irq_test_timer_func, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func); + + /* Run for up to max_iterations or 1 second, whichever comes first. */ + end_jiffies = jiffies + HZ; + hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL, + HRTIMER_MODE_REL_HARD); + for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); + i++) { + if (!func(test_specific_state)) + state.task_func_reported_failure = true; + } + + /* Cancel the timer and work. */ + hrtimer_cancel(&state.timer); + flush_work(&state.bh_work); + + /* Sanity check: the timer and BH functions should have been run. */ + KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + "Timer function was not called"); + KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + "BH work function was not called"); + + /* Check for incorrect hash values reported from any context. */ + KUNIT_EXPECT_FALSE_MSG( + test, state.task_func_reported_failure, + "Incorrect hash values reported from task context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.hardirq_func_reported_failure, + "Incorrect hash values reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.softirq_func_reported_failure, + "Incorrect hash values reported from softirq context"); +} + +#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 404883c7af6e83..4000ff16f2957e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -139,10 +139,13 @@ struct vgic_irq { bool pending_latch; /* The pending latch state used to calculate * the pending state for both level * and edge triggered IRQs. */ - bool active; /* not used for LPIs */ + bool active; + bool pending_release; /* Used for LPIs only, unreferenced IRQ + * pending a release */ + bool enabled; bool hw; /* Tied to HW IRQ */ - struct kref refcount; /* Used for LPIs */ + refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ union { diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h index f64f4ad4bedae3..37962ba530dfc1 100644 --- a/include/linux/adi-axi-common.h +++ b/include/linux/adi-axi-common.h @@ -8,6 +8,8 @@ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap */ +#include + #ifndef ADI_AXI_COMMON_H_ #define ADI_AXI_COMMON_H_ @@ -21,6 +23,25 @@ #define ADI_AXI_PCORE_VER_MINOR(version) (((version) >> 8) & 0xff) #define ADI_AXI_PCORE_VER_PATCH(version) ((version) & 0xff) +/** + * adi_axi_pcore_ver_gteq() - check if a version is satisfied + * @version: the full version read from the hardware + * @major: the major version to compare against + * @minor: the minor version to compare against + * + * ADI AXI IP Cores use semantic versioning, so this can be used to check for + * feature availability. 
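As the kernel-doc here notes, the helper allows gating features on the core's semantic version. A probe-time sketch, assuming a mapped register base and treating enable_feature_x() and priv as hypothetical driver context; ADI_AXI_REG_VERSION is the customary version register offset in this regmap:

	u32 ver = readl(base + ADI_AXI_REG_VERSION);

	/* Suppose feature X appeared in IP core 10.2; skip it on older cores. */
	if (adi_axi_pcore_ver_gteq(ver, 10, 2))
		enable_feature_x(priv);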
+ * + * Return: true if the version is greater than or equal to the specified + * major and minor version, false otherwise. + */ +static inline bool adi_axi_pcore_ver_gteq(u32 version, u32 major, u32 minor) +{ + return ADI_AXI_PCORE_VER_MAJOR(version) > (major) || + (ADI_AXI_PCORE_VER_MAJOR(version) == (major) && + ADI_AXI_PCORE_VER_MINOR(version) >= (minor)); +} + #define ADI_AXI_INFO_FPGA_TECH(info) (((info) >> 24) & 0xff) #define ADI_AXI_INFO_FPGA_FAMILY(info) (((info) >> 16) & 0xff) #define ADI_AXI_INFO_FPGA_SPEED_GRADE(info) (((info) >> 8) & 0xff) diff --git a/include/linux/audit.h b/include/linux/audit.h index a394614ccd0b81..536f8ee8da818c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,10 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +191,8 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +218,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +255,16 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +289,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC @@ -527,7 +550,7 @@ static inline void audit_log_kern_module(const char *name) static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - if (!audit_dummy_context()) + if (audit_enabled) __audit_fanotify(response, friar); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba5f..c5c9d89c73edcc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -152,6 +152,10 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct work_struct switch_work; /* work used to perform inode switching + * to this wb */ + struct llist_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git 
a/include/linux/bitops.h b/include/linux/bitops.h index 9be2d50da09a41..ea7898cc590396 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -267,7 +267,7 @@ static inline int parity8(u8 val) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ -static inline unsigned int __ffs64(u64 word) +static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802fe..a98c8334613474 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -206,6 +208,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -259,6 +262,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -285,9 +289,11 @@ struct bpf_map_owner { bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; + enum bpf_attach_type expected_attach_type; }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY @@ -328,6 +334,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) @@ -358,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -396,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -428,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -459,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -595,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -767,12 +782,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to 
skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1110,7 +1128,7 @@ struct bpf_prog_offload { */ #define MAX_BPF_FUNC_REG_ARGS 5 -/* The argument is a structure. */ +/* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ @@ -1358,6 +1376,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); @@ -1619,6 +1639,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -1628,6 +1649,7 @@ struct bpf_prog_aux { /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; + struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; @@ -1711,7 +1733,10 @@ struct bpf_prog { enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; + union { + u8 digest[SHA256_DIGEST_SIZE]; + u8 tag[BPF_TAG_SIZE]; + }; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, @@ -1985,6 +2010,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -2411,6 +2437,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); @@ -2697,7 +2724,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); @@ -2874,6 +2901,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* 
!CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -3161,6 +3189,11 @@ static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } + +static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, + unsigned long fault_ip) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int @@ -3403,6 +3436,38 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) + +struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); +struct bpf_key *bpf_lookup_system_key(u64 id); +void bpf_key_put(struct bpf_key *bkey); +int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring); + +#else +static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + return NULL; +} + +static inline struct bpf_key *bpf_lookup_system_key(u64 id) +{ + return NULL; +} + +static inline void bpf_key_put(struct bpf_key *bkey) +{ +} + +static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ + return -EOPNOTSUPP; +} +#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 94defa405c85e3..4c497e839526a4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -26,28 +26,6 @@ /* Patch buffer size */ #define INSN_BUF_SIZE 32 -/* Liveness marks, used for registers and spilled-regs (in stack slots). - * Read marks propagate upwards until they find a write mark; they record that - * "one of this state's descendants read this reg" (and therefore the reg is - * relevant for states_equal() checks). - * Write marks collect downwards and do not propagate; they record that "the - * straight-line code that reached this state (from its parent) wrote this reg" - * (and therefore that reads propagated from this state or its descendants - * should not propagate to its parent). - * A state with a write mark can receive read marks; it just won't propagate - * them to its parent, since the write mark is a property, not of the state, - * but of the link between it and its parent. See mark_reg_read() and - * mark_stack_slot_read() in kernel/bpf/verifier.c. - */ -enum bpf_reg_liveness { - REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -212,8 +190,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). 
*/ u32 ref_obj_id; - /* parentage chain for liveness checking */ - struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -226,7 +202,6 @@ struct bpf_reg_state { * patching which only happens after main verification finished. */ s32 subreg_def; - enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -445,6 +420,7 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; + bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -665,6 +641,7 @@ struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ + u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -744,6 +721,8 @@ struct bpf_scc_info { struct bpf_scc_visit visits[]; }; +struct bpf_liveness; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -794,7 +773,10 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; - /* vector of instruction indexes sorted in post-order */ + /* + * vector of instruction indexes sorted in post-order, grouped by subprogram, + * see bpf_subprog_info->postorder_start. + */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ @@ -842,6 +824,7 @@ struct bpf_verifier_env { struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; + struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; @@ -875,13 +858,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ - if (unlikely(__cond)) { \ - BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - } \ + if (unlikely(__cond)) \ + verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) -#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) +#define verifier_bug(env, fmt, args...) 
\ + ({ \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ + }) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { @@ -962,6 +947,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SYSCALL: return false; default: return true; @@ -1062,4 +1048,21 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_jmp_offset(struct bpf_insn *insn); +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); + +int bpf_stack_liveness_init(struct bpf_verifier_env *env); +void bpf_stack_liveness_free(struct bpf_verifier_env *env); +int bpf_update_live_stack(struct bpf_verifier_env *env); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 9eda6b113f9b48..f06976ffb63f94 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -86,7 +86,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used __retain noinline +#define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048cf..7fcec025c5e01d 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. + */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM diff --git a/drivers/cdx/controller/bitfield.h b/include/linux/cdx/bitfield.h similarity index 100% rename from drivers/cdx/controller/bitfield.h rename to include/linux/cdx/bitfield.h diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 00000000000000..749db33bb482d1 --- /dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
+ */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ diff --git a/drivers/cdx/controller/mcdi.h b/include/linux/cdx/mcdi.h similarity index 79% rename from drivers/cdx/controller/mcdi.h rename to include/linux/cdx/mcdi.h index 54a65e9760aeee..74075305cba48c 100644 --- a/drivers/cdx/controller/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -11,16 +11,7 @@ #include #include -#include "bitfield.h" -#include "mc_cdx_pcol.h" - -#ifdef DEBUG -#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) -#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) -#else -#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) -#define CDX_WARN_ON_PARANOID(x) do {} while (0) -#endif +#include "linux/cdx/bitfield.h" /** * enum cdx_mcdi_mode - MCDI transaction mode @@ -36,8 +27,6 @@ enum cdx_mcdi_mode { #define MCDI_RPC_LONG_TIMEOU (60 * HZ) #define MCDI_RPC_POST_RST_TIME (10 * HZ) -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) - /** * enum cdx_mcdi_cmd_state - State for an individual MCDI command * @MCDI_STATE_QUEUED: Command not started and is waiting to run. @@ -180,24 +169,12 @@ struct cdx_mcdi_data { u32 fn_flags; }; -static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) -{ - return cdx->mcdi ? 
&cdx->mcdi->iface : NULL; -} - -int cdx_mcdi_init(struct cdx_mcdi *cdx); void cdx_mcdi_finish(struct cdx_mcdi *cdx); - +int cdx_mcdi_init(struct cdx_mcdi *cdx); void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, const struct cdx_dword *inbuf, size_t inlen, struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); -int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - cdx_mcdi_async_completer *complete, - unsigned long cookie); -int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, - unsigned int timeout_jiffies); /* * We expect that 16- and 32-bit fields in MCDI requests and responses @@ -215,28 +192,8 @@ int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, #define _MCDI_DWORD(_buf, _field) \ ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) -#define MCDI_BYTE(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ - *MCDI_PTR(_buf, _field)) -#define MCDI_WORD(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ - le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) #define MCDI_SET_DWORD(_buf, _field, _value) \ CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) #define MCDI_DWORD(_buf, _field) \ CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) -#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ - MC_CMD_ ## _name1, _value1) -#define MCDI_SET_QWORD(_buf, _field, _value) \ - do { \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ - CDX_DWORD, (u32)(_value)); \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ - CDX_DWORD, (u64)(_value) >> 32); \ - } while (0) -#define MCDI_QWORD(_buf, _field) \ - (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ - (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) - #endif /* CDX_MCDI_H */ diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 52a98886a455d0..1fd22ea6eba4fe 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,7 +11,7 @@ #include #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI extern bool cfi_warn; enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func) extern u32 cfi_bpf_hash; extern u32 cfi_bpf_subprog_hash; -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ static inline int cfi_get_offset(void) { return 0; } static inline u32 cfi_get_func_hash(void *func) { return 0; } @@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifdef CONFIG_ARCH_USES_CFI_TRAPS bool is_cfi_trap(unsigned long addr); diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h index 685f7181780f92..a86af9bc8bdc47 100644 --- a/include/linux/cfi_types.h +++ b/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) 
\ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ @@ -43,7 +43,7 @@ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI #define DEFINE_CFI_TYPE(name, func) \ /* \ * Force a reference to the function so the compiler generates \ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe94..93318fce31f3a8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -140,6 +146,17 @@ enum { __CFTYPE_ADDED = (1 << 18), }; +enum cgroup_attach_lock_mode { + /* Default */ + CGRP_ATTACH_LOCK_GLOBAL, + + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ + CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -433,6 +450,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { @@ -746,7 +780,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); @@ -822,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -833,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 
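The scheme described above shows up in the read-side bracket used by fork, exec and exit paths; a simplified caller sketch built from the two helpers defined next, with the actual work elided:

	cgroup_threadgroup_change_begin(tsk);	/* global percpu rwsem, plus tsk's
						 * per-threadgroup rwsem when
						 * favordynmods is enabled */
	/* ... manipulate tsk's threadgroup / cgroup membership here ... */
	cgroup_threadgroup_change_end(tsk);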
*/ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e23..6ed477338b1660 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include #include +#include struct kernel_clone_args; @@ -354,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { @@ -650,6 +656,7 @@ static inline void cgroup_kthread_ready(void) } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); +struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ @@ -783,52 +790,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ - #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 00000000000000..78a8418558a4e2 --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +#include + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(u64 flags, + struct user_namespace *user_ns, + struct 
cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + ns_ref_inc(ns); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns_ref_put(ns)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc77f..8720a0705900ca 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -18,23 +18,42 @@ #define KASAN_ABI_VERSION 5 /* + * Clang 22 added preprocessor macros to match GCC, in hopes of eventually + * dropping __has_feature support for sanitizers: + * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c + * Create these macros for older versions of clang so that it is easy to clean + * up once the minimum supported version of LLVM for building the kernel always + * creates these macros. + * * Note: Checking __has_feature(*_sanitizer) is only true if the feature is * enabled. Therefore it is not required to additionally check defined(CONFIG_*) * to avoid adding redundant attributes in other configurations. */ +#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__) +#define __SANITIZE_ADDRESS__ +#endif +#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__) +#define __SANITIZE_HWADDRESS__ +#endif +#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__) +#define __SANITIZE_THREAD__ +#endif -#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) -/* Emulate GCC's __SANITIZE_ADDRESS__ flag */ +/* + * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel. + */ +#ifdef __SANITIZE_HWADDRESS__ #define __SANITIZE_ADDRESS__ +#endif + +#ifdef __SANITIZE_ADDRESS__ #define __no_sanitize_address \ __attribute__((no_sanitize("address", "hwaddress"))) #else #define __no_sanitize_address #endif -#if __has_feature(thread_sanitizer) -/* emulate gcc's __SANITIZE_THREAD__ flag */ -#define __SANITIZE_THREAD__ +#ifdef __SANITIZE_THREAD__ #define __no_sanitize_thread \ __attribute__((no_sanitize("thread"))) #else diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 64ff73c533e54e..5b45ea7dff3e50 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. 
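With the sanitizer shim in compiler-clang.h above, version-agnostic code can rely on the GCC-style macro spelling under both compilers; a small sketch of the resulting guard pattern, with the branch bodies left as placeholders:

	/* Defined for GCC, clang >= 22, and older clang via __has_feature(). */
	#ifdef __SANITIZE_THREAD__
		/* instrumented (KCSAN-style) configuration */
	#else
		/* uninstrumented configuration */
	#endif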
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11ee..2f3e80bf9f3574 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -329,6 +329,29 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. + * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b91b993f58ee7f..487b3bf2e1eaf7 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 40966512ea1812..0465d1e6f72ac0 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -26,12 +26,10 @@ *********************************************************************/ /* * Frequency values here are CPU kHz - * - * Maximum transition latency is in nanoseconds - if it's unknown, - * CPUFREQ_ETERNAL shall be used. */ -#define CPUFREQ_ETERNAL (-1) +#define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC + #define CPUFREQ_NAME_LEN 16 /* Print length for names. Extra 1 space for accommodating '\n' in prints */ #define CPUFREQ_NAME_PLEN (CPUFREQ_NAME_LEN + 1) diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833fb8..89ae50ad2acea9 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddda5..9e62b2a85538da 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. 
Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/include/linux/device.h b/include/linux/device.h index 0470d19da7f2ca..b031ff71a5bdfe 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -851,6 +851,9 @@ static inline bool device_pm_not_required(struct device *dev) static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; +#ifdef CONFIG_PM + dev->power.no_callbacks = true; +#endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index ae696d10faff44..8c5f57e0d61349 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -80,6 +80,8 @@ void devm_kfree(struct device *dev, const void *p); void * __realloc_size(3) devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp); static inline void *devm_kmemdup_array(struct device *dev, const void *src, size_t n, size_t size, gfp_t flags) { diff --git a/include/linux/dlm.h b/include/linux/dlm.h index bacda9898f2b6c..7e7b45b0d09709 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -87,13 +87,44 @@ int dlm_new_lockspace(const char *name, const char *cluster, const struct dlm_lockspace_ops *ops, void *ops_arg, int *ops_result, dlm_lockspace_t **lockspace); +/* + * dlm_release_lockspace() release_option values: + * + * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's) + * exist in the local lockspace. + * + * DLM_RELEASE_UNUSED previous value that is no longer used. + * + * DLM_RELEASE_NORMAL releases the lockspace regardless of any + * locks managed in the local lockspace. + * + * DLM_RELEASE_NO_EVENT release the lockspace regardless of any + * locks managed in the local lockspace, and does not submit + * a leave event to the cluster manager, so other nodes will + * not be notified that the node should be removed from the + * list of lockspace members. + * + * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining + * nodes will handle the removal of the node as if the node + * had failed, e.g. the recover_slot() callback would be used. + */ +#define DLM_RELEASE_NO_LOCKS 0 +#define DLM_RELEASE_UNUSED 1 +#define DLM_RELEASE_NORMAL 2 +#define DLM_RELEASE_NO_EVENT 3 +#define DLM_RELEASE_RECOVER 4 +#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER + /* * dlm_release_lockspace * * Stop a lockspace. + * + * release_option: see DLM_RELEASE values above. 
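A short usage sketch built only from the options documented above and the prototype below; the fallback policy shown is the caller's choice, not something the interface prescribes:

	static void demo_release(dlm_lockspace_t *ls)
	{
		int err;

		/* Fail fast if any local locks remain ... */
		err = dlm_release_lockspace(ls, DLM_RELEASE_NO_LOCKS);
		if (err == -EBUSY)
			/* ... then release regardless of them. */
			err = dlm_release_lockspace(ls, DLM_RELEASE_NORMAL);
		if (err)
			pr_warn("dlm: lockspace release failed: %d\n", err);
	}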
*/ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, + unsigned int release_option); /* * dlm_lock diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de5bd76a400ca1..d7d757e72554e1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -856,8 +856,8 @@ struct kernel_ethtool_ts_info { enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; - enum hwtstamp_tx_types tx_types; - enum hwtstamp_rx_filters rx_filters; + u32 tx_types; + u32 rx_filters; }; /** diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c70..3aac58a520c70a 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e07..f5c859b8131a3e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,6 +78,9 @@ struct ctl_table_header; /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 +/* unused opcode to mark special ldsx instruction. Same as BPF_NOSPEC */ +#define BPF_PROBE_MEM32SX 0xc0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 @@ -997,12 +1000,6 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) -{ - return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA1_BLOCK_SIZE); -} - static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), @@ -1296,7 +1293,7 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { - pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, + pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) @@ -1784,6 +1781,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1816,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d38c6e538e5c15..6d208769d45627 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -88,23 +88,30 @@ struct fw_card { int node_id; int generation; - int current_tlabel; - u64 tlabel_mask; - struct list_head transaction_list; u64 reset_jiffies; - u32 split_timeout_hi; - u32 split_timeout_lo; - unsigned int split_timeout_cycles; - unsigned int 
split_timeout_jiffies; + struct { + int current_tlabel; + u64 tlabel_mask; + struct list_head list; + spinlock_t lock; + } transactions; + + struct { + u32 hi; + u32 lo; + unsigned int cycles; + unsigned int jiffies; + spinlock_t lock; + } split_timeout; unsigned long long guid; unsigned max_receive; int link_speed; int config_rom_generation; - spinlock_t lock; /* Take this lock when handling the lists in - * this struct. */ + spinlock_t lock; + struct fw_node *local_node; struct fw_node *root_node; struct fw_node *irm_node; @@ -115,8 +122,6 @@ struct fw_card { int index; struct list_head link; - struct list_head phy_receiver_list; - struct delayed_work br_work; /* bus reset job */ bool br_short; @@ -131,7 +136,11 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; - __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + struct { + __be32 buffer[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + spinlock_t lock; + } topology_map; __be32 maint_utility_register; diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index d4212bc42b2c17..a33b4502735675 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -26,13 +26,43 @@ #define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */ #define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */ +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV) int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, bool resume); +#else +static inline int scmi_imx_cpu_start(u32 cpuid, bool start) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_started(u32 cpuid, bool *started) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, + bool boot, bool resume) +{ + return -EOPNOTSUPP; +} +#endif enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, @@ -44,7 +74,24 @@ enum scmi_imx_lmm_op { #define SCMI_IMX_LMM_OP_FORCEFUL 0 #define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) +#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV) int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); +#else +static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector) +{ + return -EOPNOTSUPP; +} +#endif #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d7051f..9e9d7c757efe49 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,9 +72,7 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct fscrypt_inode_info; struct fscrypt_operations; -struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; @@ -149,7 +147,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t 
offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* FMODE_* bit 13 */ +/* Supports IOCB_HAS_METADATA */ +#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) @@ -356,6 +355,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -666,6 +666,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 +/* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. + * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently referenced on the LRU list.
+ * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details. + * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. + */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -724,7 +842,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -780,14 +898,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_ENCRYPTION - struct fscrypt_inode_info *i_crypt_info; -#endif - -#ifdef CONFIG_FS_VERITY - struct fsverity_info *i_verity_info; -#endif - void *i_private; /* fs or device private pointer */ } __randomize_layout; @@ -2008,20 +2118,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming - * @old_mnt_idmap: idmap of the old mount the inode was found from + * @mnt_idmap: idmap of the mount in which the rename is happening. 
* @old_parent: parent of source * @old_dentry: source - * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct mnt_idmap *old_mnt_idmap; + struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; - struct mnt_idmap *new_mnt_idmap; struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; @@ -2052,8 +2160,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2492,117 +2598,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). - * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. - * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. 
The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. - */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { @@ -2614,6 +2609,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. @@ -2713,12 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode) return inode->i_opflags & IOP_MGTIME; } -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); @@ -3281,7 +3275,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. 
*/ static inline bool name_contains_dotdot(const char *name) @@ -3313,8 +3307,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); -static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } @@ -3393,7 +3387,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); @@ -4023,4 +4016,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +static inline bool extensible_ioctl_valid(unsigned int cmd_a, + unsigned int cmd_b, size_t min_size) +{ + if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) + return false; + if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) + return false; + if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) + return false; + if (_IOC_SIZE(cmd_a) < min_size) + return false; + return true; +} + #endif /* _LINUX_FS_H */ diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039c4..671f031be173b7 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -186,10 +186,12 @@ struct fc_log { extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); -#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \ - l, fmt, ## __VA_ARGS__) -#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \ - l, fmt, ## __VA_ARGS__) +#define __logfc(fc, l, fmt, ...) \ + logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) +#define __plogp(p, prefix, l, fmt, ...) \ + logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) +#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) + /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message @@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message @@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) +#define warnfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message @@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define errorf(fc, fmt, ...) 
__logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) +#define errorfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message @@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) +#define invalfcp(fc, prefix, fmt, ...) \ + (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a28c..516aba5b858b54 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the @@ -195,16 +201,44 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Returns the address of the fscrypt info pointer within the + * filesystem-specific part of the inode. (To save memory on filesystems that + * don't support fscrypt, a field in 'struct inode' itself is no longer used.) + */ +static inline struct fscrypt_inode_info ** +fscrypt_inode_info_addr(const struct inode *inode) +{ + VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); + return (void *)inode + inode->i_sb->s_cop->inode_info_offs; +} + +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. + */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). - * I.e., another task may publish ->i_crypt_info concurrently, executing - * a RELEASE barrier. We need to use smp_load_acquire() here to safely + * I.e., another task may publish the fscrypt info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_crypt_info); + return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 1eb7eae580be70..5bc7280425a719 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -26,8 +26,16 @@ /* Arbitrary limit to bound the kmalloc() size. Can be changed. 
*/ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 +struct fsverity_info; + /* Verity operations for filesystems */ struct fsverity_operations { + /** + * The offset of the pointer to struct fsverity_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. @@ -124,15 +132,37 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +/* + * Returns the address of the verity info pointer within the filesystem-specific + * part of the inode. (To save memory on filesystems that don't support + * fsverity, a field in 'struct inode' itself is no longer used.) + */ +static inline struct fsverity_info ** +fsverity_info_addr(const struct inode *inode) +{ + VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); + return (void *)inode + inode->i_sb->s_vop->inode_info_offs; +} + static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fsverity_set_info(). - * I.e., another task may publish ->i_verity_info concurrently, - * executing a RELEASE barrier. We need to use smp_load_acquire() here - * to safely ACQUIRE the memory the other task published. + * Since this function can be called on inodes belonging to filesystems + * that don't support fsverity at all, and fsverity_info_addr() doesn't + * work on such filesystems, we have to start with an IS_VERITY() check. + * Checking IS_VERITY() here is also useful to minimize the overhead of + * fsverity_active() on non-verity files. + */ + if (!IS_VERITY(inode)) + return NULL; + + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., + * another task may publish the inode's verity info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely + * ACQUIRE the memory the other task published. */ - return smp_load_acquire(&inode->i_verity_info); + return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ @@ -156,12 +186,19 @@ void __fsverity_cleanup_inode(struct inode *inode); * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * - * Filesystems must call this on inode eviction to free ->i_verity_info. + * Filesystems must call this on inode eviction to free the inode's verity info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (inode->i_verity_info) + /* + * Only IS_VERITY() inodes can have verity info, so start by checking + * for IS_VERITY() (which is faster than retrieving the pointer to the + * verity info). This minimizes overhead for non-verity inodes. + */ + if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); + else + VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ @@ -267,12 +304,12 @@ static inline bool fsverity_verify_page(struct page *page) * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * - * This checks whether ->i_verity_info has been set. + * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with - * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.) 
* * Return: true if reads need to go through fs-verity, otherwise false */ @@ -287,7 +324,7 @@ static inline bool fsverity_active(const struct inode *inode) * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. + * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 667f8fd58a793f..fabe2baf7b5090 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -388,28 +388,6 @@ struct gpio_irq_chip { * implies that if the chip supports IRQs, these IRQs need to be threaded * as the chip access may sleep when e.g. reading out the IRQ status * registers. - * @read_reg: reader function for generic GPIO - * @write_reg: writer function for generic GPIO - * @be_bits: if the generic GPIO has big endian bit order (bit 31 is representing - * line 0, bit 30 is line 1 ... bit 0 is line 31) this is set to true by the - * generic GPIO core. It is for internal housekeeping only. - * @reg_dat: data (in) register for generic GPIO - * @reg_set: output set register (out=high) for generic GPIO - * @reg_clr: output clear register (out=low) for generic GPIO - * @reg_dir_out: direction out setting register for generic GPIO - * @reg_dir_in: direction in setting register for generic GPIO - * @bgpio_dir_unreadable: indicates that the direction register(s) cannot - * be read and we need to rely on out internal state tracking. - * @bgpio_pinctrl: the generic GPIO uses a pin control backend. - * @bgpio_bits: number of register bits used for a generic GPIO i.e. - * <register width> * 8 - * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep - * shadowed and real data registers writes together. - * @bgpio_data: shadowed data register for generic GPIO to clear/set bits - * safely. - * @bgpio_dir: shadowed direction register for generic GPIO to clear/set - * direction safely. A "1" in this word means the line is set as - * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so * they can all be accessed through a common programming interface.
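/*
 * Editor's illustrative sketch (not part of the kernel diff): with the
 * bgpio_* fields removed from struct gpio_chip above, a driver that used to
 * reach into gc->bgpio_lock or gc->bgpio_data directly would instead go
 * through struct gpio_generic_chip and the helpers added in
 * <linux/gpio/generic.h> below. The foo_ name and register usage are
 * hypothetical.
 */
#include <linux/bits.h>
#include <linux/gpio/generic.h>

static void foo_toggle_line(struct gpio_chip *gc, unsigned int offset)
{
	struct gpio_generic_chip *gen = to_gpio_generic_chip(gc);
	unsigned long flags;

	/* Was: raw_spin_lock_irqsave(&gc->bgpio_lock, flags); */
	gpio_generic_chip_lock_irqsave(gen, flags);
	gen->sdata ^= BIT(offset);	/* shadow register, was gc->bgpio_data */
	gpio_generic_write_reg(gen, gen->reg_set, gen->sdata);
	gpio_generic_chip_unlock_irqrestore(gen, flags);
}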
@@ -475,23 +453,6 @@ struct gpio_chip { const char *const *names; bool can_sleep; -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - unsigned long (*read_reg)(void __iomem *reg); - void (*write_reg)(void __iomem *reg, unsigned long data); - bool be_bits; - void __iomem *reg_dat; - void __iomem *reg_set; - void __iomem *reg_clr; - void __iomem *reg_dir_out; - void __iomem *reg_dir_in; - bool bgpio_dir_unreadable; - bool bgpio_pinctrl; - int bgpio_bits; - raw_spinlock_t bgpio_lock; - unsigned long bgpio_data; - unsigned long bgpio_dir; -#endif /* CONFIG_GPIO_GENERIC */ - #ifdef CONFIG_GPIOLIB_IRQCHIP /* * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib @@ -723,21 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags); - -#define BGPIOF_BIG_ENDIAN BIT(0) -#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ -#define BGPIOF_NO_SET_ON_INPUT BIT(6) -#define BGPIOF_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define BGPIOF_NO_INPUT BIT(8) /* only output */ - #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); @@ -772,16 +718,50 @@ struct gpio_pin_range { #ifdef CONFIG_PINCTRL -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins); +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins); int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group); void gpiochip_remove_pin_ranges(struct gpio_chip *gc); +static inline int +gpiochip_add_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, + pin_offset, NULL, npins); +} + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, 0, + pins, npins); +} #else /* ! 
CONFIG_PINCTRL */ +static inline int +gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -789,6 +769,17 @@ gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, { return 0; } + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h new file mode 100644 index 00000000000000..ee5d8355f73557 --- /dev/null +++ b/include/linux/gpio/forwarder.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GPIO_FORWARDER_H +#define __LINUX_GPIO_FORWARDER_H + +struct gpio_desc; +struct gpio_chip; +struct gpiochip_fwd; + +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios); +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, + struct gpio_desc *desc, unsigned int offset); +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data); + +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); + +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd); + +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, + unsigned int offset, + int value); +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, + int value); +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config); +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset); + +#endif diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index f3a8db4598bb59..ff566dc9c3cbed 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -9,6 +9,16 @@ struct device; +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ + /** * struct gpio_generic_chip_config - Generic GPIO chip configuration data * @dev: Parent device of the new GPIO chip (compulsory).
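/*
 * Editor's illustrative sketch (not part of the kernel diff): a driver fills
 * struct gpio_generic_chip_config with the renamed GPIO_GENERIC_* flags
 * (formerly BGPIOF_*) and calls gpio_generic_chip_init(), which the hunk
 * below turns into an out-of-line function. The foo_ names and register
 * offsets are hypothetical.
 */
static int foo_gpio_setup(struct gpio_generic_chip *chip, struct device *dev,
			  void __iomem *base)
{
	struct gpio_generic_chip_config cfg = {
		.dev = dev,
		.sz = 4,			/* 32-bit data registers */
		.dat = base + 0x00,
		.set = base + 0x04,
		.clr = base + 0x08,
		.dirout = base + 0x0c,
		/* Was BGPIOF_UNREADABLE_REG_DIR before the rename. */
		.flags = GPIO_GENERIC_UNREADABLE_REG_DIR,
	};

	return gpio_generic_chip_init(chip, &cfg);
}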
@@ -50,26 +60,55 @@ struct gpio_generic_chip_config { * struct gpio_generic_chip - Generic GPIO chip implementation. * @gc: The underlying struct gpio_chip object, implementing low-level GPIO * chip routines. + * @read_reg: reader function for generic GPIO + * @write_reg: writer function for generic GPIO + * @be_bits: if the generic GPIO has big endian bit order (bit 31 is + * representing line 0, bit 30 is line 1 ... bit 0 is line 31) this + * is set to true by the generic GPIO core. It is for internal + * housekeeping only. + * @reg_dat: data (in) register for generic GPIO + * @reg_set: output set register (out=high) for generic GPIO + * @reg_clr: output clear register (out=low) for generic GPIO + * @reg_dir_out: direction out setting register for generic GPIO + * @reg_dir_in: direction in setting register for generic GPIO + * @dir_unreadable: indicates that the direction register(s) cannot be read and + * we need to rely on our internal state tracking. + * @pinctrl: the generic GPIO uses a pin control backend. + * @bits: number of register bits used for a generic GPIO + * i.e. <register width> * 8 + * @lock: used to lock chip->sdata. Also, this is needed to keep + * shadowed and real data registers writes together. + * @sdata: shadowed data register for generic GPIO to clear/set bits safely. + * @sdir: shadowed direction register for generic GPIO to clear/set direction + * safely. A "1" in this word means the line is set as output. */ struct gpio_generic_chip { struct gpio_chip gc; + unsigned long (*read_reg)(void __iomem *reg); + void (*write_reg)(void __iomem *reg, unsigned long data); + bool be_bits; + void __iomem *reg_dat; + void __iomem *reg_set; + void __iomem *reg_clr; + void __iomem *reg_dir_out; + void __iomem *reg_dir_in; + bool dir_unreadable; + bool pinctrl; + int bits; + raw_spinlock_t lock; + unsigned long sdata; + unsigned long sdir; }; -/** - * gpio_generic_chip_init() - Initialize a generic GPIO chip. - * @chip: Generic GPIO chip to set up. - * @cfg: Generic GPIO chip configuration. - * - * Returns 0 on success, negative error number on failure. - */ -static inline int -gpio_generic_chip_init(struct gpio_generic_chip *chip, - const struct gpio_generic_chip_config *cfg) +static inline struct gpio_generic_chip * +to_gpio_generic_chip(struct gpio_chip *gc) { - return bgpio_init(&chip->gc, cfg->dev, cfg->sz, cfg->dat, cfg->set, - cfg->clr, cfg->dirout, cfg->dirin, cfg->flags); + return container_of(gc, struct gpio_generic_chip, gc); } +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg); + /** * gpio_generic_chip_set() - Set the GPIO line value of the generic GPIO chip. * @chip: Generic GPIO chip to use. @@ -94,17 +133,48 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, return chip->gc.set(&chip->gc, offset, value); } +/** + * gpio_generic_read_reg() - Read a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to read. + * + * Returns: value read from register. + */ +static inline unsigned long +gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) +{ + if (WARN_ON(!chip->read_reg)) + return 0; + + return chip->read_reg(reg); +} + +/** + * gpio_generic_write_reg() - Write a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to write to. + * @val: New value to write.
+ */ +static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, + void __iomem *reg, unsigned long val) +{ + if (WARN_ON(!chip->write_reg)) + return; + + chip->write_reg(reg, val); +} + #define gpio_generic_chip_lock(gen_gc) \ - raw_spin_lock(&(gen_gc)->gc.bgpio_lock) + raw_spin_lock(&(gen_gc)->lock) #define gpio_generic_chip_unlock(gen_gc) \ - raw_spin_unlock(&(gen_gc)->gc.bgpio_lock) + raw_spin_unlock(&(gen_gc)->lock) #define gpio_generic_chip_lock_irqsave(gen_gc, flags) \ - raw_spin_lock_irqsave(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_lock_irqsave(&(gen_gc)->lock, flags) #define gpio_generic_chip_unlock_irqrestore(gen_gc, flags) \ - raw_spin_unlock_irqrestore(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_unlock_irqrestore(&(gen_gc)->lock, flags) DEFINE_LOCK_GUARD_1(gpio_generic_lock, struct gpio_generic_chip, diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index b5a84864650d0c..592a774a53cdf5 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -261,16 +261,14 @@ struct platform_device; * true. */ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio); + struct gpio_chip *chip, unsigned int offset); #else static inline void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, - unsigned int offset, - unsigned int gpio) + unsigned int offset) { } diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index c722c67668c6e5..622a2939ebe0fd 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -6,6 +6,7 @@ struct device; struct fwnode_handle; struct gpio_regmap; +struct gpio_chip; struct irq_domain; struct regmap; @@ -40,6 +41,13 @@ struct regmap; * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). + * @init_valid_mask: (Optional) Routine to initialize @valid_mask, to be used + * if not all GPIOs are valid. + * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If + * set, a regmap-irq device will be created and the IRQ + * domain will be set accordingly. + * @regmap_irq_line (Optional) The IRQ the device uses to signal interrupts. + * @regmap_irq_flags (Optional) The IRQF_ flags to use for the interrupt. * * The ->reg_mask_xlate translates a given base address and GPIO offset to * register and mask pair. The base address is one of the given register @@ -78,10 +86,20 @@ struct gpio_regmap_config { int ngpio_per_reg; struct irq_domain *irq_domain; +#ifdef CONFIG_REGMAP_IRQ + struct regmap_irq_chip *regmap_irq_chip; + int regmap_irq_line; + unsigned long regmap_irq_flags; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); + int (*init_valid_mask)(struct gpio_chip *gc, + unsigned long *valid_mask, + unsigned int ngpios); + void *drvdata; }; diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h new file mode 100644 index 00000000000000..8838ca2f3d0893 --- /dev/null +++ b/include/linux/hfs_common.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HFS/HFS+ common definitions, inline functions, + * and shared functionality. + */ + +#ifndef _HFS_COMMON_H_ +#define _HFS_COMMON_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define hfs_dbg(fmt, ...) 
\ + pr_debug("pid %d:%s:%d %s(): " fmt, \ + current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \ + +#endif /* _HFS_COMMON_H_ */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44b0..2cf1bf65b22578 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,11 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -{ - return ktime_sub(timer->node.expires, timer->base->get_time()); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +197,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD @@ -363,7 +359,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f36f..aa49ffa130e57f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 3a63dff62d0361..301a83afbd6636 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -24,6 +24,7 @@ enum hwmon_sensor_types { hwmon_curr, hwmon_power, hwmon_energy, + hwmon_energy64, hwmon_humidity, hwmon_fan, hwmon_pwm, @@ -491,6 +492,9 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, char *hwmon_sanitize_name(const char *name); char *devm_hwmon_sanitize_name(struct device *dev, const char *name); +void hwmon_lock(struct device *dev); +void hwmon_unlock(struct device *dev); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 20fd41b51d5c85..11a19241e360be 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. 
*/ #ifdef CONFIG_I2C_BOARDINFO -int +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 043f5c7ff398ff..c52a82dd79a634 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -558,6 +558,26 @@ struct i3c_master_controller { #define i3c_bus_for_each_i3cdev(bus, dev) \ list_for_each_entry(dev, &(bus)->devs.i3c, common.node) +/** + * struct i3c_dma - DMA transfer and mapping descriptor + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @map_len: length of DMA mapping + * @addr: mapped DMA address for a Host Controller Driver + * @dir: DMA direction + * @bounce_buf: an allocated bounce buffer if transfer needs it or NULL + */ +struct i3c_dma { + struct device *dev; + void *buf; + size_t len; + size_t map_len; + dma_addr_t addr; + enum dma_data_direction dir; + void *bounce_buf; +}; + int i3c_master_do_i2c_xfers(struct i3c_master_controller *master, const struct i2c_msg *xfers, int nxfers); @@ -575,6 +595,12 @@ int i3c_master_get_free_addr(struct i3c_master_controller *master, int i3c_master_add_i3c_dev_locked(struct i3c_master_controller *master, u8 addr); int i3c_master_do_daa(struct i3c_master_controller *master); +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *ptr, + size_t len, bool force_bounce, + enum dma_data_direction dir); +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer); +DEFINE_FREE(i3c_master_dma_unmap_single, void *, + if (_T) i3c_master_dma_unmap_single(_T)) int i3c_master_set_info(struct i3c_master_controller *master, const struct i3c_device_info *info); diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d89688..12f5ee43850ea5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b4437c..079d8773790c07 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611ad6..12faca29bbb9c3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,20 +129,25 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif 
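/*
 * Editor's illustrative sketch (not part of the kernel diff): the
 * DEFINE_FREE() hook added in the <linux/i3c/master.h> hunk above enables
 * scope-based cleanup for DMA mappings, so every early return unmaps
 * automatically. foo_queue_xfer is hypothetical, and a NULL return from
 * i3c_master_dma_map_single() on failure is assumed here.
 */
#include <linux/cleanup.h>
#include <linux/dma-direction.h>

static int foo_queue_xfer(struct device *dev, void *buf, size_t len)
{
	struct i3c_dma *dma __free(i3c_master_dma_unmap_single) =
		i3c_master_dma_map_single(dev, buf, len, false, DMA_TO_DEVICE);

	if (!dma)
		return -ENOMEM;

	/* ... program dma->addr and dma->map_len into the controller ... */
	return 0;
}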
#if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } @@ -151,7 +156,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 5d69820d8b027e..892e2d656e1e72 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -109,8 +109,9 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; - long msgid; - void *user_data; + long msgid; + /* Response to this message, will be NULL if not from a user request. */ + struct ipmi_recv_msg *recv_msg; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; @@ -168,9 +169,11 @@ struct ipmi_smi_handlers { * are held when this is run. Message are delivered one at * a time by the message handler, a new message will not be * delivered until the previous message is returned. + * + * This can return an error if the SMI is not in a state where it + * can send a message. 
*/ - void (*sender)(void *send_info, - struct ipmi_smi_msg *msg); + int (*sender)(void *send_info, struct ipmi_smi_msg *msg); /* * Called by the upper layer to request that we try to get diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81efe5..c67e76fbcc0775 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); @@ -976,10 +978,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273dfa..0d1927da805513 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b14..fe5ce9215821db 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, @@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } @@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start, static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c380..adbe234a6f6c61 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/lsm_hooks.h 
b/include/linux/lsm_hooks.h index 090d1d3e19fed6..79ec5a2bdcca7a 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -116,6 +116,9 @@ struct lsm_blob_sizes { int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; int lbs_bdev; + int lbs_bpf_map; + int lbs_bpf_prog; + int lbs_bpf_token; }; /* diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 2d13bbea4f3acb..f72e6d4b14a784 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,8 +117,10 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; +#endif /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -129,8 +131,10 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for mic detection polarity */ int micd_pol_gpio; +#endif /** Mic detect ramp rate */ unsigned int micd_bias_start_time; @@ -184,8 +188,10 @@ struct arizona_pdata { /** Haptic actuator type */ unsigned int hap_act; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for primary IRQ (used for edge triggered emulation) */ int irq_gpio; +#endif /** General purpose switch control */ unsigned int gpsw; diff --git a/include/linux/mfd/bq257xx.h b/include/linux/mfd/bq257xx.h new file mode 100644 index 00000000000000..1d6ddc7fb09fcb --- /dev/null +++ b/include/linux/mfd/bq257xx.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Register definitions for TI BQ257XX + * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#define BQ25703_CHARGE_OPTION_0 0x00 +#define BQ25703_CHARGE_CURRENT 0x02 +#define BQ25703_MAX_CHARGE_VOLT 0x04 +#define BQ25703_OTG_VOLT 0x06 +#define BQ25703_OTG_CURRENT 0x08 +#define BQ25703_INPUT_VOLTAGE 0x0a +#define BQ25703_MIN_VSYS 0x0c +#define BQ25703_IIN_HOST 0x0e +#define BQ25703_CHARGER_STATUS 0x20 +#define BQ25703_PROCHOT_STATUS 0x22 +#define BQ25703_IIN_DPM 0x24 +#define BQ25703_ADCIBAT_CHG 0x28 +#define BQ25703_ADCIINCMPIN 0x2a +#define BQ25703_ADCVSYSVBAT 0x2c +#define BQ25703_MANUFACT_DEV_ID 0x2e +#define BQ25703_CHARGE_OPTION_1 0x30 +#define BQ25703_CHARGE_OPTION_2 0x32 +#define BQ25703_CHARGE_OPTION_3 0x34 +#define BQ25703_ADC_OPTION 0x3a + +#define BQ25703_EN_LWPWR BIT(15) +#define BQ25703_WDTMR_ADJ_MASK GENMASK(14, 13) +#define BQ25703_WDTMR_DISABLE 0 +#define BQ25703_WDTMR_5_SEC 1 +#define BQ25703_WDTMR_88_SEC 2 +#define BQ25703_WDTMR_175_SEC 3 + +#define BQ25703_ICHG_MASK GENMASK(12, 6) +#define BQ25703_ICHG_STEP_UA 64000 +#define BQ25703_ICHG_MIN_UA 64000 +#define BQ25703_ICHG_MAX_UA 8128000 + +#define BQ25703_MAX_CHARGE_VOLT_MASK GENMASK(15, 4) +#define BQ25703_VBATREG_STEP_UV 16000 +#define BQ25703_VBATREG_MIN_UV 1024000 +#define BQ25703_VBATREG_MAX_UV 19200000 + +#define BQ25703_OTG_VOLT_MASK GENMASK(13, 6) +#define BQ25703_OTG_VOLT_STEP_UV 64000 +#define BQ25703_OTG_VOLT_MIN_UV 4480000 +#define BQ25703_OTG_VOLT_MAX_UV 20800000 +#define BQ25703_OTG_VOLT_NUM_VOLT 256 + +#define BQ25703_OTG_CUR_MASK GENMASK(14, 8) +#define BQ25703_OTG_CUR_STEP_UA 50000 +#define BQ25703_OTG_CUR_MAX_UA 6350000 + +#define BQ25703_MINVSYS_MASK GENMASK(13, 8) +#define BQ25703_MINVSYS_STEP_UV 256000 +#define BQ25703_MINVSYS_MIN_UV 1024000 +#define BQ25703_MINVSYS_MAX_UV 16128000 + +#define BQ25703_STS_AC_STAT BIT(15) +#define 
BQ25703_STS_IN_FCHRG BIT(10) +#define BQ25703_STS_IN_PCHRG BIT(9) +#define BQ25703_STS_FAULT_ACOV BIT(7) +#define BQ25703_STS_FAULT_BATOC BIT(6) +#define BQ25703_STS_FAULT_ACOC BIT(5) + +#define BQ25703_IINDPM_MASK GENMASK(14, 8) +#define BQ25703_IINDPM_STEP_UA 50000 +#define BQ25703_IINDPM_MIN_UA 50000 +#define BQ25703_IINDPM_MAX_UA 6400000 +#define BQ25703_IINDPM_DEFAULT_UA 3300000 +#define BQ25703_IINDPM_OFFSET_UA 50000 + +#define BQ25703_ADCIBAT_DISCHG_MASK GENMASK(6, 0) +#define BQ25703_ADCIBAT_CHG_MASK GENMASK(14, 8) +#define BQ25703_ADCIBAT_CHG_STEP_UA 64000 +#define BQ25703_ADCIBAT_DIS_STEP_UA 256000 + +#define BQ25703_ADCIIN GENMASK(15, 8) +#define BQ25703_ADCIINCMPIN_STEP 50000 + +#define BQ25703_ADCVSYS_MASK GENMASK(15, 8) +#define BQ25703_ADCVBAT_MASK GENMASK(7, 0) +#define BQ25703_ADCVSYSVBAT_OFFSET_UV 2880000 +#define BQ25703_ADCVSYSVBAT_STEP 64000 + +#define BQ25703_ADC_CH_MASK GENMASK(7, 0) +#define BQ25703_ADC_CONV_EN BIT(15) +#define BQ25703_ADC_START BIT(14) +#define BQ25703_ADC_FULL_SCALE BIT(13) +#define BQ25703_ADC_CMPIN_EN BIT(7) +#define BQ25703_ADC_VBUS_EN BIT(6) +#define BQ25703_ADC_PSYS_EN BIT(5) +#define BQ25703_ADC_IIN_EN BIT(4) +#define BQ25703_ADC_IDCHG_EN BIT(3) +#define BQ25703_ADC_ICHG_EN BIT(2) +#define BQ25703_ADC_VSYS_EN BIT(1) +#define BQ25703_ADC_VBAT_EN BIT(0) + +#define BQ25703_EN_OTG_MASK BIT(12) + +struct bq257xx_device { + struct i2c_client *client; + struct regmap *regmap; +}; diff --git a/include/linux/mfd/loongson-se.h b/include/linux/mfd/loongson-se.h new file mode 100644 index 00000000000000..07afa0c2524d5e --- /dev/null +++ b/include/linux/mfd/loongson-se.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (C) 2025 Loongson Technology Corporation Limited */ + +#ifndef __MFD_LOONGSON_SE_H__ +#define __MFD_LOONGSON_SE_H__ + +#define LOONGSON_ENGINE_CMD_TIMEOUT_US 10000 +#define SE_SEND_CMD_REG 0x0 +#define SE_SEND_CMD_REG_LEN 0x8 +/* Controller command ID */ +#define SE_CMD_START 0x0 +#define SE_CMD_SET_DMA 0x3 +#define SE_CMD_SET_ENGINE_CMDBUF 0x4 + +#define SE_S2LINT_STAT 0x88 +#define SE_S2LINT_EN 0x8c +#define SE_S2LINT_CL 0x94 +#define SE_L2SINT_STAT 0x98 +#define SE_L2SINT_SET 0xa0 + +#define SE_INT_ALL 0xffffffff +#define SE_INT_CONTROLLER BIT(0) + +#define SE_ENGINE_MAX 16 +#define SE_ENGINE_RNG 1 +#define SE_CMD_RNG 0x100 + +#define SE_ENGINE_TPM 5 +#define SE_CMD_TPM 0x500 + +#define SE_ENGINE_CMD_SIZE 32 + +struct loongson_se_engine { + struct loongson_se *se; + int id; + + /* Command buffer */ + void *command; + void *command_ret; + + void *data_buffer; + uint buffer_size; + /* Data buffer offset to DMA base */ + uint buffer_off; + + struct completion completion; + +}; + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id); +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine); + +#endif diff --git a/include/linux/mfd/max7360.h b/include/linux/mfd/max7360.h new file mode 100644 index 00000000000000..44cf2bf651a252 --- /dev/null +++ b/include/linux/mfd/max7360.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_MAX7360_H +#define __LINUX_MFD_MAX7360_H + +#include + +#define MAX7360_MAX_KEY_ROWS 8 +#define MAX7360_MAX_KEY_COLS 8 +#define MAX7360_MAX_KEY_NUM (MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS) +#define MAX7360_ROW_SHIFT 3 + +#define MAX7360_MAX_GPIO 8 +#define MAX7360_MAX_GPO 6 +#define MAX7360_PORT_PWM_COUNT 8 +#define MAX7360_PORT_RTR_PIN (MAX7360_PORT_PWM_COUNT - 1) + +/* + * MAX7360 registers + */ +#define 
MAX7360_REG_KEYFIFO 0x00 +#define MAX7360_REG_CONFIG 0x01 +#define MAX7360_REG_DEBOUNCE 0x02 +#define MAX7360_REG_INTERRUPT 0x03 +#define MAX7360_REG_PORTS 0x04 +#define MAX7360_REG_KEYREP 0x05 +#define MAX7360_REG_SLEEP 0x06 + +/* + * MAX7360 GPIO registers + * + * All these registers are reset together when writing bit 3 of + * MAX7360_REG_GPIOCFG. + */ +#define MAX7360_REG_GPIOCFG 0x40 +#define MAX7360_REG_GPIOCTRL 0x41 +#define MAX7360_REG_GPIODEB 0x42 +#define MAX7360_REG_GPIOCURR 0x43 +#define MAX7360_REG_GPIOOUTM 0x44 +#define MAX7360_REG_PWMCOM 0x45 +#define MAX7360_REG_RTRCFG 0x46 +#define MAX7360_REG_I2C_TIMEOUT 0x48 +#define MAX7360_REG_GPIOIN 0x49 +#define MAX7360_REG_RTR_CNT 0x4A +#define MAX7360_REG_PWMBASE 0x50 +#define MAX7360_REG_PWMCFGBASE 0x58 + +#define MAX7360_REG_GPIO_LAST 0x5F + +#define MAX7360_REG_PWM(x) (MAX7360_REG_PWMBASE + (x)) +#define MAX7360_REG_PWMCFG(x) (MAX7360_REG_PWMCFGBASE + (x)) + +/* + * Configuration register bits + */ +#define MAX7360_FIFO_EMPTY 0x3F +#define MAX7360_FIFO_OVERFLOW 0x7F +#define MAX7360_FIFO_RELEASE BIT(6) +#define MAX7360_FIFO_COL GENMASK(5, 3) +#define MAX7360_FIFO_ROW GENMASK(2, 0) + +#define MAX7360_CFG_SLEEP BIT(7) +#define MAX7360_CFG_INTERRUPT BIT(5) +#define MAX7360_CFG_KEY_RELEASE BIT(3) +#define MAX7360_CFG_WAKEUP BIT(1) +#define MAX7360_CFG_TIMEOUT BIT(0) + +#define MAX7360_DEBOUNCE GENMASK(4, 0) +#define MAX7360_DEBOUNCE_MIN 9 +#define MAX7360_DEBOUNCE_MAX 40 +#define MAX7360_PORTS GENMASK(8, 5) + +#define MAX7360_INTERRUPT_TIME_MASK GENMASK(4, 0) +#define MAX7360_INTERRUPT_FIFO_MASK GENMASK(7, 5) + +#define MAX7360_PORT_CFG_INTERRUPT_MASK BIT(7) +#define MAX7360_PORT_CFG_INTERRUPT_EDGES BIT(6) +#define MAX7360_PORT_CFG_COMMON_PWM BIT(5) + +/* + * Autosleep register values + */ +#define MAX7360_AUTOSLEEP_8192MS 0x01 +#define MAX7360_AUTOSLEEP_4096MS 0x02 +#define MAX7360_AUTOSLEEP_2048MS 0x03 +#define MAX7360_AUTOSLEEP_1024MS 0x04 +#define MAX7360_AUTOSLEEP_512MS 0x05 +#define MAX7360_AUTOSLEEP_256MS 0x06 + +#define MAX7360_GPIO_CFG_RTR_EN BIT(7) +#define MAX7360_GPIO_CFG_GPIO_EN BIT(4) +#define MAX7360_GPIO_CFG_GPIO_RST BIT(3) + +#define MAX7360_ROT_DEBOUNCE GENMASK(3, 0) +#define MAX7360_ROT_DEBOUNCE_MIN 0 +#define MAX7360_ROT_DEBOUNCE_MAX 15 +#define MAX7360_ROT_INTCNT GENMASK(6, 4) +#define MAX7360_ROT_INTCNT_DLY BIT(7) + +#define MAX7360_INT_INTI 0 +#define MAX7360_INT_INTK 1 + +#define MAX7360_INT_GPIO 0 +#define MAX7360_INT_KEYPAD 1 +#define MAX7360_INT_ROTARY 2 + +#define MAX7360_NR_INTERNAL_IRQS 3 + +#endif diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index f372926d5894fb..dd46fe424a8091 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -31,12 +31,6 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, unsigned int channel, u8 ato, bool atox, unsigned int *sample); -/* Deprecated calls */ -static inline int mc13xxx_irq_ack(struct mc13xxx *mc13xxx, int irq) -{ - return 0; -} - static inline int mc13xxx_irq_request_nounmask(struct mc13xxx *mc13xxx, int irq, irq_handler_t handler, const char *name, void *dev) diff --git a/include/linux/mfd/nct6694.h b/include/linux/mfd/nct6694.h new file mode 100644 index 00000000000000..6eb9be2cd4a011 --- /dev/null +++ b/include/linux/mfd/nct6694.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 USB transaction and data structure. 
+ */ + +#ifndef __MFD_NCT6694_H +#define __MFD_NCT6694_H + +#define NCT6694_VENDOR_ID 0x0416 +#define NCT6694_PRODUCT_ID 0x200B +#define NCT6694_INT_IN_EP 0x81 +#define NCT6694_BULK_IN_EP 0x02 +#define NCT6694_BULK_OUT_EP 0x03 + +#define NCT6694_HCTRL_SET 0x40 +#define NCT6694_HCTRL_GET 0x80 + +#define NCT6694_URB_TIMEOUT 1000 + +enum nct6694_irq_id { + NCT6694_IRQ_GPIO0 = 0, + NCT6694_IRQ_GPIO1, + NCT6694_IRQ_GPIO2, + NCT6694_IRQ_GPIO3, + NCT6694_IRQ_GPIO4, + NCT6694_IRQ_GPIO5, + NCT6694_IRQ_GPIO6, + NCT6694_IRQ_GPIO7, + NCT6694_IRQ_GPIO8, + NCT6694_IRQ_GPIO9, + NCT6694_IRQ_GPIOA, + NCT6694_IRQ_GPIOB, + NCT6694_IRQ_GPIOC, + NCT6694_IRQ_GPIOD, + NCT6694_IRQ_GPIOE, + NCT6694_IRQ_GPIOF, + NCT6694_IRQ_CAN0, + NCT6694_IRQ_CAN1, + NCT6694_IRQ_RTC, + NCT6694_NR_IRQS, +}; + +enum nct6694_response_err_status { + NCT6694_NO_ERROR = 0, + NCT6694_FORMAT_ERROR, + NCT6694_RESERVED1, + NCT6694_RESERVED2, + NCT6694_NOT_SUPPORT_ERROR, + NCT6694_NO_RESPONSE_ERROR, + NCT6694_TIMEOUT_ERROR, + NCT6694_PENDING, +}; + +struct __packed nct6694_cmd_header { + u8 rsv1; + u8 mod; + union __packed { + __le16 offset; + struct __packed { + u8 cmd; + u8 sel; + }; + }; + u8 hctrl; + u8 rsv2; + __le16 len; +}; + +struct __packed nct6694_response_header { + u8 sequence_id; + u8 sts; + u8 reserved[4]; + __le16 len; +}; + +union __packed nct6694_usb_msg { + struct nct6694_cmd_header cmd_header; + struct nct6694_response_header response_header; +}; + +struct nct6694 { + struct device *dev; + struct ida gpio_ida; + struct ida i2c_ida; + struct ida canfd_ida; + struct ida wdt_ida; + struct irq_domain *domain; + struct mutex access_lock; + spinlock_t irq_lock; + struct urb *int_in_urb; + struct usb_device *udev; + union nct6694_usb_msg *usb_msg; + __le32 *int_buffer; + unsigned int irq_enable; +}; + +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); + +#endif diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h index 8d48c212fd4446..42bf523f9a5b0c 100644 --- a/include/linux/mfd/qnap-mcu.h +++ b/include/linux/mfd/qnap-mcu.h @@ -7,6 +7,8 @@ #ifndef _LINUX_QNAP_MCU_H_ #define _LINUX_QNAP_MCU_H_ +#include + struct qnap_mcu; struct qnap_mcu_variant { diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h index ce786c96404a3d..73a71ef6915254 100644 --- a/include/linux/mfd/rohm-bd71828.h +++ b/include/linux/mfd/rohm-bd71828.h @@ -189,6 +189,69 @@ enum { /* Charger/Battey */ #define BD71828_REG_CHG_STATE 0x65 #define BD71828_REG_CHG_FULL 0xd2 +#define BD71828_REG_CHG_EN 0x6F +#define BD71828_REG_DCIN_STAT 0x68 +#define BD71828_MASK_DCIN_DET 0x01 +#define BD71828_REG_VDCIN_U 0x9c +#define BD71828_MASK_CHG_EN 0x01 +#define BD71828_CHG_MASK_DCIN_U 0x0f +#define BD71828_REG_BAT_STAT 0x67 +#define BD71828_REG_BAT_TEMP 0x6c +#define BD71828_MASK_BAT_TEMP 0x07 +#define BD71828_BAT_TEMP_OPEN 0x07 +#define BD71828_MASK_BAT_DET 0x20 +#define BD71828_MASK_BAT_DET_DONE 0x10 +#define BD71828_REG_CHG_STATE 0x65 +#define BD71828_REG_VBAT_U 0x8c +#define BD71828_MASK_VBAT_U 0x0f +#define BD71828_REG_VBAT_REX_AVG_U 0x92 + +#define BD71828_REG_OCV_PWRON_U 0x8A + +#define BD71828_REG_VBAT_MIN_AVG_U 0x8e +#define BD71828_REG_VBAT_MIN_AVG_L 0x8f + +#define BD71828_REG_CC_CNT3 0xb5 +#define BD71828_REG_CC_CNT2 0xb6 +#define BD71828_REG_CC_CNT1 0xb7 +#define BD71828_REG_CC_CNT0 0xb8 +#define BD71828_REG_CC_CURCD_AVG_U 0xb2 +#define BD71828_MASK_CC_CURCD_AVG_U 0x3f 
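The charger headers in this area (bq257xx.h above, the BD71828 additions continuing below) encode physical quantities as register fields with a fixed step and offset. A minimal decode sketch for the BQ25703 VBAT ADC field, assuming the usual value = offset + raw * step convention; the helper name is hypothetical, not part of the header:

#include <stdint.h>
#include <stdio.h>

#define BQ25703_ADCVBAT_MASK		0xffu	/* GENMASK(7, 0) in the header */
#define BQ25703_ADCVSYSVBAT_OFFSET_UV	2880000u
#define BQ25703_ADCVSYSVBAT_STEP	64000u

/* Hypothetical helper: decode the raw VBAT field of ADCVSYSVBAT to microvolts. */
static unsigned int bq25703_adc_vbat_to_uv(uint16_t reg)
{
	unsigned int raw = reg & BQ25703_ADCVBAT_MASK;	/* VBAT lives in bits 7:0 */

	return BQ25703_ADCVSYSVBAT_OFFSET_UV + raw * BQ25703_ADCVSYSVBAT_STEP;
}

int main(void)
{
	/* raw 0x20 (32) decodes to 2.88 V + 32 * 64 mV = 4.928 V */
	printf("vbat = %u uV\n", bq25703_adc_vbat_to_uv(0x0020));
	return 0;
}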
+#define BD71828_MASK_CC_CUR_DIR 0x80 +#define BD71828_REG_VM_BTMP_U 0xa1 +#define BD71828_REG_VM_BTMP_L 0xa2 +#define BD71828_MASK_VM_BTMP_U 0x0f +#define BD71828_REG_COULOMB_CTRL 0xc4 +#define BD71828_REG_COULOMB_CTRL2 0xd2 +#define BD71828_MASK_REX_CC_CLR 0x01 +#define BD71828_MASK_FULL_CC_CLR 0x10 +#define BD71828_REG_CC_CNT_FULL3 0xbd +#define BD71828_REG_CC_CNT_CHG3 0xc1 + +#define BD71828_REG_VBAT_INITIAL1_U 0x86 +#define BD71828_REG_VBAT_INITIAL1_L 0x87 + +#define BD71828_REG_VBAT_INITIAL2_U 0x88 +#define BD71828_REG_VBAT_INITIAL2_L 0x89 + +#define BD71828_REG_IBAT_U 0xb0 +#define BD71828_REG_IBAT_L 0xb1 + +#define BD71828_REG_IBAT_AVG_U 0xb2 +#define BD71828_REG_IBAT_AVG_L 0xb3 + +#define BD71828_REG_VSYS_AVG_U 0x96 +#define BD71828_REG_VSYS_AVG_L 0x97 +#define BD71828_REG_VSYS_MIN_AVG_U 0x98 +#define BD71828_REG_VSYS_MIN_AVG_L 0x99 +#define BD71828_REG_CHG_SET1 0x75 +#define BD71828_REG_ALM_VBAT_LIMIT_U 0xaa +#define BD71828_REG_BATCAP_MON_LIMIT_U 0xcc +#define BD71828_REG_CONF 0x64 + +#define BD71828_REG_DCIN_CLPS 0x71 + +#define BD71828_REG_MEAS_CLEAR 0xaf /* LEDs */ #define BD71828_REG_LED_CTRL 0x4A diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb8574931..10fe492e1fedcd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 86055d55836d94..6ac76a0c382773 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); void mlx5_fc_local_destroy(struct mlx5_fc *counter); +void mlx5_fc_local_get(struct mlx5_fc *counter); +void mlx5_fc_local_put(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db9348..7f625c35128be9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } + +static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) +{ + return !!ptdesc_pmd_pts_count(ptdesc); +} #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { @@ -1102,6 +1107,11 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + /* the ABI-related flags from the ELF header. 
Used for core dump */ + unsigned long saved_e_flags; +#endif + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 68f09a955a9020..e0e2c265e5d101 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -337,11 +337,15 @@ struct mmc_slot { struct regulator; struct mmc_pwrseq; +struct notifier_block; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ struct regulator *vqmmc; /* Optional Vccq supply */ struct regulator *vqmmc2; /* Optional supply for phy */ + + struct notifier_block vmmc_nb; /* Notifier for vmmc */ + struct work_struct uv_work; /* Undervoltage work */ }; struct mmc_ctx { @@ -494,6 +498,13 @@ struct mmc_host { unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ + /* + * Indicates if an undervoltage event has already been handled. + * This prevents repeated regulator notifiers from triggering + * multiple REGULATOR_EVENT_UNDER_VOLTAGE events. + */ + unsigned int undervoltage:1; /* Undervoltage state */ + int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ @@ -565,6 +576,7 @@ struct mmc_host { int hsq_depth; u32 err_stats[MMC_ERR_MAX]; + u32 max_sd_hs_hz; unsigned long private[] ____cacheline_aligned; }; @@ -743,5 +755,6 @@ int mmc_send_status(struct mmc_card *card, u32 *status); int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks); #endif /* LINUX_MMC_HOST_H */ diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b6481605..0acd1089d149cf 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,9 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace init_mnt_ns; + +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb6f..d415dd15a0a905 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,8 +431,6 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. - * @msi_post_free: Optional function which is invoked after freeing - * all interrupts. * @msi_translate: Optional translate callback to support the odd wire to * MSI bridges, e.g. 
MBIGEN * @@ -473,8 +471,6 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); - void (*msi_post_free)(struct irq_domain *domain, - struct device *dev); int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type); }; @@ -568,6 +564,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + MSI_FLAG_PCI_MSI_STARTUP_PARENT = (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471d9..a7800ef04e7619 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -57,13 +57,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); - -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *kern_path_locked_negative(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +struct dentry *kern_path_parent(const char *name, struct path *parent); + +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); @@ -80,6 +84,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b09841..f5b68b8abb5433 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,14 +3,151 @@ #define _LINUX_NS_COMMON_H #include +#include +#include struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations netns_operations; +extern const struct 
proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? 
&utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) + +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->__ns_ref); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) + #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 00000000000000..e5a5fa83d36bde --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include +#include +#include + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) + +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) + +#endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a2265..bd118a187dec15 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace 
*: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. @@ -103,7 +92,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 00000000000000..8b863669047390 --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include +#include +#include +#include +#include +#include + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning an id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list. 
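The to_ns_common()/to_ns_tree() maps above, like the other ns_common_* macros around them, lean on C11 _Generic selection: the compiler picks one association based on the static type of the argument, giving type-checked dispatch at zero runtime cost, and an unlisted type fails the build. A self-contained sketch of the pattern, with toy types standing in for the namespace structs:

#include <stdio.h>

struct ns_a { int id; };
struct ns_b { long id; };

/* Compile-time dispatch on the pointer type, like to_ns_tree() above. */
#define ns_name(__ns) \
	_Generic((__ns), \
		struct ns_a *: "ns_a", \
		struct ns_b *: "ns_b")

int main(void)
{
	struct ns_a a = { .id = 1 };
	struct ns_b b = { .id = 2 };

	/* Each use resolves at compile time; an unlisted type is a build error. */
	printf("%s %s\n", ns_name(&a), ns_name(&b));
	return 0;
}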
+ */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c1549526..88e18615dd726b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d9602568395..fd1d91017b99b3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b55..25a7257052ff94 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. 
*/ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a58111998d..445517a72ad060 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,10 +54,15 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -78,12 +83,15 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include @@ -97,7 +105,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) @@ -118,6 +126,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 1bcf071b860ebb..d9245ecec71dc6 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -88,9 +88,13 @@ struct pinctrl_map; * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM - * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a - * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/driver-api/pin-control.rst, + * @PIN_CONFIG_LEVEL: setting this will configure the pin as an output and + * drive a value on the line. Use argument 1 to indicate high level, + * argument 0 to indicate low level. Conversely, the value of the line + * can be read using this parameter, if and only if that value can be + * represented as a binary 0 or 1 where 0 indicates a low voltage level + * and 1 indicates a high voltage level. + * (Please see Documentation/driver-api/pin-control.rst, + * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode * without driving a value there. 
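The pgtable.h hunk above replaces empty do {} while (0) stubs with empty static inline functions; the inline form still compiles to nothing but keeps prototype and argument type checking on configurations where the real implementation is absent. A sketch of the difference, using an int parameter purely for illustration:

#include <stdio.h>

/* Empty macro: the argument never reaches the compiler, so type errors hide. */
#define stub_macro(x) do {} while (0)

/* Empty static inline: calls are type-checked, then optimized away. */
static inline void stub_inline(int x) { (void)x; }

int main(void)
{
	stub_macro("not an int");	/* accepted silently */
	stub_inline(42);		/* stub_inline("not an int") would warn */
	printf("both stubs compiled to nothing\n");
	return 0;
}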
For most platforms this reduces to @@ -137,7 +141,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, - PIN_CONFIG_OUTPUT, + PIN_CONFIG_LEVEL, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, PIN_CONFIG_PERSIST_STATE, diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index d138e18156452e..1a8084e2940537 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -11,6 +11,7 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H +#include #include struct device; @@ -206,16 +207,20 @@ extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const char *pin_group, const unsigned int **pins, unsigned int *num_pins); +#define PINFUNCTION_FLAG_GPIO BIT(0) + /** * struct pinfunction - Description about a function * @name: Name of the function * @groups: An array of groups for this function * @ngroups: Number of groups in @groups + * @flags: Additional pin function flags */ struct pinfunction { const char *name; const char * const *groups; size_t ngroups; + unsigned long flags; }; /* Convenience macro to define a single named pinfunction */ @@ -226,6 +231,15 @@ struct pinfunction { .ngroups = (_ngroups), \ } +/* Same as PINCTRL_PINFUNCTION() but for the GPIO category of functions */ +#define PINCTRL_GPIO_PINFUNCTION(_name, _groups, _ngroups) \ +(struct pinfunction) { \ + .name = (_name), \ + .groups = (_groups), \ + .ngroups = (_ngroups), \ + .flags = PINFUNCTION_FLAG_GPIO, \ + } + #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_PINCTRL) extern struct pinctrl_dev *of_pinctrl_get(struct device_node *np); #else diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d6f7b58d6ad0cc..6db6c3e1ccc224 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -66,6 +66,8 @@ struct pinmux_ops { unsigned int selector, const char * const **groups, unsigned int *num_groups); + bool (*function_is_gpio) (struct pinctrl_dev *pctldev, + unsigned int selector); int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector, unsigned int group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index c19b404e3d8d9e..69294f79cc88aa 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1825,6 +1825,16 @@ struct ec_response_pwm_get_duty { uint16_t duty; /* Duty cycle, EC_PWM_MAX_DUTY = 100% */ } __ec_align2; +#define EC_CMD_PWM_GET_FAN_DUTY 0x0027 + +struct ec_params_pwm_get_fan_duty { + uint8_t fan_idx; +} __ec_align1; + +struct ec_response_pwm_get_fan_duty { + uint32_t percent; /* Percentage of duty cycle, ranging from 0 ~ 100 */ +} __ec_align4; + /*****************************************************************************/ /* * Lightbar commands. This looks worse than it is. 
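PIN_CONFIG_LEVEL, like the other pin_config_param values above, travels packed together with its argument in a single unsigned long. A sketch of that packing, mirroring the parameter-in-the-low-byte layout pinconf-generic uses (an assumption about the exact layout; the enum value here is illustrative, not the real position):

#include <stdio.h>

/* Assumed layout: parameter in the low 8 bits, argument in the bits above. */
#define PIN_CONF_PACKED(p, a)	(((unsigned long)(a) << 8) | ((unsigned long)(p) & 0xffUL))
#define PIN_CONF_PARAM(c)	((c) & 0xffUL)
#define PIN_CONF_ARG(c)		((c) >> 8)

enum { PIN_CONFIG_LEVEL = 18 };	/* illustrative value only */

int main(void)
{
	/* "drive the line high": PIN_CONFIG_LEVEL with argument 1 */
	unsigned long config = PIN_CONF_PACKED(PIN_CONFIG_LEVEL, 1);

	printf("param=%lu arg=%lu\n", PIN_CONF_PARAM(config), PIN_CONF_ARG(config));
	return 0;
}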
Since we only use one HOST @@ -3127,14 +3137,31 @@ struct ec_params_thermal_set_threshold_v1 { /****************************************************************************/ -/* Toggle automatic fan control */ +/* Set or get fan control mode */ #define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052 +enum ec_auto_fan_ctrl_cmd { + EC_AUTO_FAN_CONTROL_CMD_SET = 0, + EC_AUTO_FAN_CONTROL_CMD_GET, +}; + /* Version 1 of input params */ struct ec_params_auto_fan_ctrl_v1 { uint8_t fan_idx; } __ec_align1; +/* Version 2 of input params */ +struct ec_params_auto_fan_ctrl_v2 { + uint8_t fan_idx; + uint8_t cmd; /* enum ec_auto_fan_ctrl_cmd */ + uint8_t set_auto; /* only used with EC_AUTO_FAN_CONTROL_CMD_SET - bool + */ +} __ec_align4; + +struct ec_response_auto_fan_control { + uint8_t is_auto; /* bool */ +} __ec_align1; + /* Get/Set TMP006 calibration data */ #define EC_CMD_TMP006_GET_CALIBRATION 0x0053 #define EC_CMD_TMP006_SET_CALIBRATION 0x0054 diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 3ec24f445c29cd..de14923720a53f 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -33,12 +33,18 @@ /* * Max bus-specific overhead incurred by request/responses. - * I2C requires 1 additional byte for requests. - * I2C requires 2 additional bytes for responses. - * SPI requires up to 32 additional bytes for responses. + * + * Request: + * - I2C requires 1 byte (see struct ec_host_request_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_out_msg). + * + * Response: + * - I2C requires 2 bytes (see struct ec_host_response_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_in_msg). + * - SPI requires 32 bytes (see EC_MSG_PREAMBLE_COUNT). */ #define EC_PROTO_VERSION_UNKNOWN 0 -#define EC_MAX_REQUEST_OVERHEAD 1 +#define EC_MAX_REQUEST_OVERHEAD 4 #define EC_MAX_RESPONSE_OVERHEAD 32 /* @@ -122,6 +128,7 @@ struct cros_ec_command { * @dout_size: Size of dout buffer to allocate (zero to use static dout). * @wake_enabled: True if this device can wake the system from sleep. * @suspended: True if this device had been suspended. + * @registered: True if this device had been registered. * @cmd_xfer: Send command to EC and get response. * Returns the number of bytes received if the communication * succeeded, but that doesn't mean the EC was happy with the @@ -180,6 +187,7 @@ struct cros_ec_device { int dout_size; bool wake_enabled; bool suspended; + bool registered; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); int (*pkt_xfer)(struct cros_ec_device *ec, @@ -272,6 +280,8 @@ int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd); +bool cros_ec_device_registered(struct cros_ec_device *ec_dev); + /** * cros_ec_get_time_ns() - Return time in ns. 
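The version-2 EC_CMD_THERMAL_AUTO_FAN_CTRL parameters above turn the command into a set/get pair selected by the cmd byte. A hedged sketch of how a caller might assemble the SET payload; the struct is mirrored locally as packed, ignoring the __ec_align4 padding details:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum ec_auto_fan_ctrl_cmd {
	EC_AUTO_FAN_CONTROL_CMD_SET = 0,
	EC_AUTO_FAN_CONTROL_CMD_GET,
};

/* Local mirror of struct ec_params_auto_fan_ctrl_v2 (packed approximation). */
struct fan_ctrl_v2 {
	uint8_t fan_idx;
	uint8_t cmd;		/* enum ec_auto_fan_ctrl_cmd */
	uint8_t set_auto;	/* only used with ..._CMD_SET */
} __attribute__((packed));

int main(void)
{
	/* request: put fan 0 back under automatic control */
	struct fan_ctrl_v2 p = {
		.fan_idx = 0,
		.cmd = EC_AUTO_FAN_CONTROL_CMD_SET,
		.set_auto = 1,
	};
	uint8_t buf[sizeof(p)];

	memcpy(buf, &p, sizeof(p));	/* payload for EC_CMD_THERMAL_AUTO_FAN_CTRL v2 */
	printf("payload: %02x %02x %02x\n", buf[0], buf[1], buf[2]);
	return 0;
}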
* diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h index 95d852aef130e3..726d8914384293 100644 --- a/include/linux/platform_data/dmtimer-omap.h +++ b/include/linux/platform_data/dmtimer-omap.h @@ -36,9 +36,13 @@ struct omap_dm_timer_ops { int (*set_pwm)(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload); int (*get_pwm_status)(struct omap_dm_timer *timer); + int (*set_cap)(struct omap_dm_timer *timer, + int autoreload, bool config_period); + int (*get_cap_status)(struct omap_dm_timer *timer); int (*set_prescaler)(struct omap_dm_timer *timer, int prescaler); unsigned int (*read_counter)(struct omap_dm_timer *timer); + unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period); int (*write_counter)(struct omap_dm_timer *timer, unsigned int value); unsigned int (*read_status)(struct omap_dm_timer *timer); diff --git a/include/linux/platform_data/tmio.h b/include/linux/platform_data/tmio.h index b060124ba1aef8..426291713b83d5 100644 --- a/include/linux/platform_data/tmio.h +++ b/include/linux/platform_data/tmio.h @@ -47,6 +47,9 @@ /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) +/* Some controllers have a 64-bit wide data port register */ +#define TMIO_MMC_64BIT_DATA_PORT BIT(12) + struct tmio_mmc_data { void *chan_priv_tx; void *chan_priv_rx; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819b4..f67a2cb7d78148 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d88d6b6ccf5b20..edb8aed5ef62eb 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -21,6 +21,7 @@ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +#define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations @@ -350,13 +351,12 @@ static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; } * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. 
* Other values and conditions for the above values are possible as returned by * Runtime PM idle and suspend callbacks. */ @@ -370,14 +370,15 @@ static inline int pm_runtime_idle(struct device *dev) * @dev: Target device. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -396,14 +397,15 @@ static inline int pm_runtime_suspend(struct device *dev) * engaging its "idle check" callback. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -433,13 +435,12 @@ static inline int pm_runtime_resume(struct device *dev) * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_idle(struct device *dev) { @@ -464,15 +465,16 @@ static inline int pm_request_resume(struct device *dev) * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_autosuspend(struct device *dev) { @@ -511,6 +513,19 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +static inline int pm_runtime_get_active(struct device *dev, int rpmflags) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. 
@@ -521,15 +536,7 @@ static inline int pm_runtime_get_sync(struct device *dev) */ static inline int pm_runtime_resume_and_get(struct device *dev) { - int ret; - - ret = __pm_runtime_resume(dev, RPM_GET_PUT); - if (ret < 0) { - pm_runtime_put_noidle(dev); - return ret; - } - - return 0; + return pm_runtime_get_active(dev, 0); } /** @@ -540,23 +547,22 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * equal to 0, queue up a work item for @dev like in pm_request_idle(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } -DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) - /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. @@ -565,15 +571,16 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { @@ -590,15 +597,16 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) * in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { @@ -606,6 +614,26 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_active, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put(_T)); +DEFINE_GUARD(pm_runtime_active_auto, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); +/* + * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). + * + * The difference between the "_try" and "_try_enabled" variants is that the + * former do not produce an error when runtime PM is disabled for the given + * device. 
+ */ +DEFINE_GUARD_COND(pm_runtime_active, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, + pm_runtime_resume_and_get(_T)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, + pm_runtime_resume_and_get(_T)) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. @@ -619,14 +647,15 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -646,15 +675,15 @@ static inline int pm_runtime_put_sync(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. - * * -EAGAIN: usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -677,15 +706,16 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. 
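The DEFINE_GUARD()/DEFINE_GUARD_COND() wrappers above pair pm_runtime_get_sync() with the matching put in scope-based cleanup, so the usage-counter drop happens on every exit path and cannot be forgotten in error handling. A standalone sketch of the underlying mechanism, the compiler cleanup attribute, with toy get/put functions standing in for the runtime-PM calls:

#include <stdio.h>

static int usage_count;

static void dev_get(void)
{
	usage_count++;
	printf("get -> %d\n", usage_count);
}

/* cleanup callbacks receive a pointer to the guarded variable */
static void dev_put(int *unused)
{
	(void)unused;
	usage_count--;
	printf("put -> %d\n", usage_count);
}

/* Runs dev_put() when the guard leaves scope, on every exit path. */
#define GUARD_DEV_ACTIVE() \
	int guard_ __attribute__((cleanup(dev_put))) = (dev_get(), 0)

int main(void)
{
	{
		GUARD_DEV_ACTIVE();
		printf("working with the device held active\n");
	}	/* dev_put() runs here automatically */
	return 0;
}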
*/ diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index fdec9af9c54183..6653abfdf7470d 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -9,35 +9,27 @@ #ifndef __MAX77705_CHARGER_H #define __MAX77705_CHARGER_H __FILE__ +#include + /* MAX77705_CHG_REG_CHG_INT */ -#define MAX77705_BYP_I BIT(0) -#define MAX77705_INP_LIMIT_I BIT(1) -#define MAX77705_BATP_I BIT(2) -#define MAX77705_BAT_I BIT(3) -#define MAX77705_CHG_I BIT(4) -#define MAX77705_WCIN_I BIT(5) -#define MAX77705_CHGIN_I BIT(6) -#define MAX77705_AICL_I BIT(7) - -/* MAX77705_CHG_REG_CHG_INT_MASK */ -#define MAX77705_BYP_IM BIT(0) -#define MAX77705_INP_LIMIT_IM BIT(1) -#define MAX77705_BATP_IM BIT(2) -#define MAX77705_BAT_IM BIT(3) -#define MAX77705_CHG_IM BIT(4) -#define MAX77705_WCIN_IM BIT(5) -#define MAX77705_CHGIN_IM BIT(6) -#define MAX77705_AICL_IM BIT(7) +#define MAX77705_BYP_I (0) +#define MAX77705_INP_LIMIT_I (1) +#define MAX77705_BATP_I (2) +#define MAX77705_BAT_I (3) +#define MAX77705_CHG_I (4) +#define MAX77705_WCIN_I (5) +#define MAX77705_CHGIN_I (6) +#define MAX77705_AICL_I (7) /* MAX77705_CHG_REG_CHG_INT_OK */ -#define MAX77705_BYP_OK BIT(0) -#define MAX77705_DISQBAT_OK BIT(1) -#define MAX77705_BATP_OK BIT(2) -#define MAX77705_BAT_OK BIT(3) -#define MAX77705_CHG_OK BIT(4) -#define MAX77705_WCIN_OK BIT(5) -#define MAX77705_CHGIN_OK BIT(6) -#define MAX77705_AICL_OK BIT(7) +#define MAX77705_BYP_OK BIT(MAX77705_BYP_I) +#define MAX77705_DISQBAT_OK BIT(MAX77705_INP_LIMIT_I) +#define MAX77705_BATP_OK BIT(MAX77705_BATP_I) +#define MAX77705_BAT_OK BIT(MAX77705_BAT_I) +#define MAX77705_CHG_OK BIT(MAX77705_CHG_I) +#define MAX77705_WCIN_OK BIT(MAX77705_WCIN_I) +#define MAX77705_CHGIN_OK BIT(MAX77705_CHGIN_I) +#define MAX77705_AICL_OK BIT(MAX77705_AICL_I) /* MAX77705_CHG_REG_DETAILS_00 */ #define MAX77705_BATP_DTLS BIT(0) @@ -63,7 +55,6 @@ #define MAX77705_BUCK_SHIFT 2 #define MAX77705_BOOST_SHIFT 3 #define MAX77705_WDTEN_SHIFT 4 -#define MAX77705_MODE_MASK GENMASK(3, 0) #define MAX77705_CHG_MASK BIT(MAX77705_CHG_SHIFT) #define MAX77705_UNO_MASK BIT(MAX77705_UNO_SHIFT) #define MAX77705_OTG_MASK BIT(MAX77705_OTG_SHIFT) @@ -74,34 +65,19 @@ #define MAX77705_OTG_CTRL (MAX77705_OTG_MASK | MAX77705_BOOST_MASK) /* MAX77705_CHG_REG_CNFG_01 */ -#define MAX77705_FCHGTIME_SHIFT 0 -#define MAX77705_FCHGTIME_MASK GENMASK(2, 0) -#define MAX77705_CHG_RSTRT_SHIFT 4 -#define MAX77705_CHG_RSTRT_MASK GENMASK(5, 4) #define MAX77705_FCHGTIME_DISABLE 0 #define MAX77705_CHG_RSTRT_DISABLE 0x3 -#define MAX77705_PQEN_SHIFT 7 -#define MAX77705_PQEN_MASK BIT(7) #define MAX77705_CHG_PQEN_DISABLE 0 #define MAX77705_CHG_PQEN_ENABLE 1 /* MAX77705_CHG_REG_CNFG_02 */ -#define MAX77705_OTG_ILIM_SHIFT 6 -#define MAX77705_OTG_ILIM_MASK GENMASK(7, 6) #define MAX77705_OTG_ILIM_500 0 #define MAX77705_OTG_ILIM_900 1 #define MAX77705_OTG_ILIM_1200 2 #define MAX77705_OTG_ILIM_1500 3 -#define MAX77705_CHG_CC GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_03 */ -#define MAX77705_TO_ITH_SHIFT 0 -#define MAX77705_TO_ITH_MASK GENMASK(2, 0) -#define MAX77705_TO_TIME_SHIFT 3 -#define MAX77705_TO_TIME_MASK GENMASK(5, 3) -#define MAX77705_SYS_TRACK_DIS_SHIFT 7 -#define MAX77705_SYS_TRACK_DIS_MASK BIT(7) #define MAX77705_TO_ITH_150MA 0 #define MAX77705_TO_TIME_30M 3 #define MAX77705_SYS_TRACK_ENABLE 0 @@ -110,15 +86,8 @@ /* MAX77705_CHG_REG_CNFG_04 */ #define MAX77705_CHG_MINVSYS_SHIFT 6 #define MAX77705_CHG_MINVSYS_MASK GENMASK(7, 6) -#define MAX77705_CHG_PRM_SHIFT 0 -#define MAX77705_CHG_PRM_MASK 
GENMASK(5, 0) - -#define MAX77705_CHG_CV_PRM_SHIFT 0 -#define MAX77705_CHG_CV_PRM_MASK GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_05 */ -#define MAX77705_REG_B2SOVRC_SHIFT 0 -#define MAX77705_REG_B2SOVRC_MASK GENMASK(3, 0) #define MAX77705_B2SOVRC_DISABLE 0 #define MAX77705_B2SOVRC_4_5A 6 #define MAX77705_B2SOVRC_4_8A 8 @@ -128,9 +97,8 @@ #define MAX77705_WDTCLR_SHIFT 0 #define MAX77705_WDTCLR_MASK GENMASK(1, 0) #define MAX77705_WDTCLR 1 -#define MAX77705_CHGPROT_MASK GENMASK(3, 2) -#define MAX77705_CHGPROT_UNLOCKED GENMASK(3, 2) -#define MAX77705_SLOWEST_LX_SLOPE GENMASK(6, 5) +#define MAX77705_CHGPROT_UNLOCKED 3 +#define MAX77705_SLOWEST_LX_SLOPE 3 /* MAX77705_CHG_REG_CNFG_07 */ #define MAX77705_CHG_FMBST 4 @@ -140,36 +108,14 @@ #define MAX77705_REG_FGSRC_MASK BIT(MAX77705_REG_FGSRC_SHIFT) /* MAX77705_CHG_REG_CNFG_08 */ -#define MAX77705_REG_FSW_SHIFT 0 -#define MAX77705_REG_FSW_MASK GENMASK(1, 0) #define MAX77705_CHG_FSW_3MHz 0 #define MAX77705_CHG_FSW_2MHz 1 #define MAX77705_CHG_FSW_1_5MHz 2 /* MAX77705_CHG_REG_CNFG_09 */ -#define MAX77705_CHG_CHGIN_LIM_MASK GENMASK(6, 0) -#define MAX77705_CHG_EN_MASK BIT(7) #define MAX77705_CHG_DISABLE 0 -#define MAX77705_CHARGER_CHG_CHARGING(_reg) \ - (((_reg) & MAX77705_CHG_EN_MASK) > 1) - - -/* MAX77705_CHG_REG_CNFG_10 */ -#define MAX77705_CHG_WCIN_LIM GENMASK(5, 0) - -/* MAX77705_CHG_REG_CNFG_11 */ -#define MAX77705_VBYPSET_SHIFT 0 -#define MAX77705_VBYPSET_MASK GENMASK(6, 0) /* MAX77705_CHG_REG_CNFG_12 */ -#define MAX77705_CHGINSEL_SHIFT 5 -#define MAX77705_CHGINSEL_MASK BIT(MAX77705_CHGINSEL_SHIFT) -#define MAX77705_WCINSEL_SHIFT 6 -#define MAX77705_WCINSEL_MASK BIT(MAX77705_WCINSEL_SHIFT) -#define MAX77705_VCHGIN_REG_MASK GENMASK(4, 3) -#define MAX77705_WCIN_REG_MASK GENMASK(2, 1) -#define MAX77705_REG_DISKIP_SHIFT 0 -#define MAX77705_REG_DISKIP_MASK BIT(MAX77705_REG_DISKIP_SHIFT) /* REG=4.5V, UVLO=4.7V */ #define MAX77705_VCHGIN_4_5 0 /* REG=4.5V, UVLO=4.7V */ @@ -183,9 +129,59 @@ #define MAX77705_CURRENT_CHGIN_MIN 100000 #define MAX77705_CURRENT_CHGIN_MAX 3200000 +enum max77705_field_idx { + MAX77705_CHGPROT, + MAX77705_CHG_EN, + MAX77705_CHG_CC_LIM, + MAX77705_CHG_CHGIN_LIM, + MAX77705_CHG_CV_PRM, + MAX77705_CHG_PQEN, + MAX77705_CHG_RSTRT, + MAX77705_CHG_WCIN, + MAX77705_FCHGTIME, + MAX77705_LX_SLOPE, + MAX77705_MODE, + MAX77705_OTG_ILIM, + MAX77705_REG_B2SOVRC, + MAX77705_REG_DISKIP, + MAX77705_REG_FSW, + MAX77705_SYS_TRACK, + MAX77705_TO, + MAX77705_TO_TIME, + MAX77705_VBYPSET, + MAX77705_VCHGIN, + MAX77705_WCIN, + MAX77705_N_REGMAP_FIELDS, +}; + +static const struct reg_field max77705_reg_field[MAX77705_N_REGMAP_FIELDS] = { + [MAX77705_MODE] = REG_FIELD(MAX77705_CHG_REG_CNFG_00, 0, 3), + [MAX77705_FCHGTIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 0, 2), + [MAX77705_CHG_RSTRT] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 4, 5), + [MAX77705_CHG_PQEN] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 7, 7), + [MAX77705_CHG_CC_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 0, 5), + [MAX77705_OTG_ILIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 6, 7), + [MAX77705_TO] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 0, 2), + [MAX77705_TO_TIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 3, 5), + [MAX77705_SYS_TRACK] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 7, 7), + [MAX77705_CHG_CV_PRM] = REG_FIELD(MAX77705_CHG_REG_CNFG_04, 0, 5), + [MAX77705_REG_B2SOVRC] = REG_FIELD(MAX77705_CHG_REG_CNFG_05, 0, 3), + [MAX77705_CHGPROT] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 2, 3), + [MAX77705_LX_SLOPE] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 5, 6), + [MAX77705_REG_FSW] = REG_FIELD(MAX77705_CHG_REG_CNFG_08, 0, 1), + 
[MAX77705_CHG_CHGIN_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 0, 6), + [MAX77705_CHG_EN] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 7, 7), + [MAX77705_CHG_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_10, 0, 5), + [MAX77705_VBYPSET] = REG_FIELD(MAX77705_CHG_REG_CNFG_11, 0, 6), + [MAX77705_REG_DISKIP] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 0, 0), + [MAX77705_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 1, 2), + [MAX77705_VCHGIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 3, 4), +}; + struct max77705_charger_data { struct device *dev; struct regmap *regmap; + struct regmap_field *rfield[MAX77705_N_REGMAP_FIELDS]; struct power_supply_battery_info *bat_info; struct workqueue_struct *wqueue; struct work_struct chgin_work; diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f21f806bfb3831..360ffdf272dab8 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -176,6 +176,8 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_YEAR, POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1fad1c8a4c76a6..102202185d7a2c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. 
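The preempt.h commentary above explains what migrate_disable() trades away; as a minimal sketch of the per-CPU pattern it enables (illustrative only, not part of this patch — the per-CPU variable and helper names are invented):

	#include <linux/percpu.h>
	#include <linux/preempt.h>

	struct my_state { unsigned long hits; };
	static DEFINE_PER_CPU(struct my_state, my_state);

	static void touch_local_state(void)
	{
		struct my_state *st;

		migrate_disable();
		st = this_cpu_ptr(&my_state);
		/*
		 * @st keeps pointing at this CPU's instance even if we are
		 * preempted here: only migration is excluded, so exclusion
		 * against other tasks on this CPU still needs a lock.
		 */
		st->hits++;
		migrate_enable();
	}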
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * work-conserving schedulers. * */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783e7..e81b8e596e4f19 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include +#include #include struct pid_namespace; @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); @@ -66,25 +65,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e22..50b26589dd70d1 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -176,6 +176,16 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) +#define fwnode_for_each_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_child_node(fwnode, child)) + +#define fwnode_for_each_available_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_available_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_available_child_node(fwnode, child)) + struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child); @@ -574,8 +584,8 @@ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); -int software_node_register_node_group(const struct software_node **node_group); -void software_node_unregister_node_group(const struct software_node **node_group); +int software_node_register_node_group(const struct software_node * const *node_group); +void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct 
software_node *node); void software_node_unregister(const struct software_node *node); diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05b8..540abf7de04886 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8cafc483db53ad..549ac4aaad59ba 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @gpio: &struct gpio_chip to operate this PWM chip's lines as GPO * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip * @operational: signals if the chip can be used (or is already deregistered) * @nonatomic_lock: mutex for nonatomic chips @@ -340,6 +342,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + struct gpio_chip gpio; bool uses_pwmchip_alloc; bool operational; union { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb1d..f67f96711f0da2 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -962,6 +962,20 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } +static __always_inline void rcu_read_lock_dont_migrate(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_disable(); + rcu_read_lock(); +} + +static inline void rcu_read_unlock_migrate(void) +{ + rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_enable(); +} + /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. 
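The rcu_read_lock_dont_migrate()/rcu_read_unlock_migrate() helpers added to rcupdate.h above bundle the migrate_disable() pairing with the RCU read-side critical section; on !CONFIG_PREEMPT_RCU they compile down to plain rcu_read_lock()/rcu_read_unlock(). A minimal usage sketch (illustrative only — the item list and lookup function are invented):

	#include <linux/rculist.h>

	struct item {
		int key;
		struct list_head node;
	};
	static LIST_HEAD(items);

	static bool lookup_key(int key)
	{
		struct item *it;
		bool found = false;

		rcu_read_lock_dont_migrate();	/* migrate_disable(), then rcu_read_lock() */
		list_for_each_entry_rcu(it, &items, node) {
			if (it->key == key) {
				found = true;
				break;
			}
		}
		rcu_read_unlock_migrate();	/* rcu_read_unlock(), then migrate_enable() */
		return found;
	}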
diff --git a/include/linux/regulator/s2dos05.h b/include/linux/regulator/s2dos05.h new file mode 100644 index 00000000000000..2e89fcbce76958 --- /dev/null +++ b/include/linux/regulator/s2dos05.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +// s2dos05.h +// +// Copyright (c) 2016 Samsung Electronics Co., Ltd +// http://www.samsung.com +// Copyright (C) 2024 Dzmitry Sankouski + +#ifndef __LINUX_S2DOS05_H +#define __LINUX_S2DOS05_H + +// S2DOS05 registers +// Slave Addr : 0xC0 +enum S2DOS05_reg { + S2DOS05_REG_DEV_ID, + S2DOS05_REG_TOPSYS_STAT, + S2DOS05_REG_STAT, + S2DOS05_REG_EN, + S2DOS05_REG_LDO1_CFG, + S2DOS05_REG_LDO2_CFG, + S2DOS05_REG_LDO3_CFG, + S2DOS05_REG_LDO4_CFG, + S2DOS05_REG_BUCK_CFG, + S2DOS05_REG_BUCK_VOUT, + S2DOS05_REG_IRQ_MASK = 0x0D, + S2DOS05_REG_SSD_TSD = 0x0E, + S2DOS05_REG_OCL = 0x10, + S2DOS05_REG_IRQ = 0x11 +}; + +// S2DOS05 regulator ids +enum S2DOS05_regulators { + S2DOS05_LDO1, + S2DOS05_LDO2, + S2DOS05_LDO3, + S2DOS05_LDO4, + S2DOS05_BUCK1, + S2DOS05_REG_MAX, +}; + +#define S2DOS05_IRQ_PWRMT_MASK BIT(5) +#define S2DOS05_IRQ_TSD_MASK BIT(4) +#define S2DOS05_IRQ_SSD_MASK BIT(3) +#define S2DOS05_IRQ_SCP_MASK BIT(2) +#define S2DOS05_IRQ_UVLO_MASK BIT(1) +#define S2DOS05_IRQ_OCD_MASK BIT(0) + +#define S2DOS05_BUCK_MIN1 506250 +#define S2DOS05_LDO_MIN1 1500000 +#define S2DOS05_LDO_MIN2 2700000 +#define S2DOS05_BUCK_STEP1 6250 +#define S2DOS05_LDO_STEP1 25000 +#define S2DOS05_LDO_VSEL_MASK 0x7F +#define S2DOS05_LDO_FD_MASK 0x80 +#define S2DOS05_BUCK_VSEL_MASK 0xFF +#define S2DOS05_BUCK_FD_MASK 0x08 + +#define S2DOS05_ENABLE_MASK_L1 BIT(0) +#define S2DOS05_ENABLE_MASK_L2 BIT(1) +#define S2DOS05_ENABLE_MASK_L3 BIT(2) +#define S2DOS05_ENABLE_MASK_L4 BIT(3) +#define S2DOS05_ENABLE_MASK_B1 BIT(4) + +#define S2DOS05_RAMP_DELAY 12000 + +#define S2DOS05_ENABLE_TIME_LDO 50 +#define S2DOS05_ENABLE_TIME_BUCK 350 + +#define S2DOS05_LDO_N_VOLTAGES (S2DOS05_LDO_VSEL_MASK + 1) +#define S2DOS05_BUCK_N_VOLTAGES (S2DOS05_BUCK_VSEL_MASK + 1) + +#define S2DOS05_REGULATOR_MAX (S2DOS05_REG_MAX) + +#endif // __LINUX_S2DOS05_H diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd1f..a7d92718b653f5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -156,28 +156,43 @@ struct rdt_ctrl_domain { u32 *mbps_val; }; +/** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgrp is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm.
* @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** @@ -255,40 +270,53 @@ enum resctrl_schema_fmt { RESCTRL_SCHEMA_RANGE, }; +/** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. + * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth - * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; - unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,8 +400,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain @@ -416,6 +465,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. @@ -528,6 +597,63 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. 
+ */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migratable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d3c..acfe07860b346c 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,18 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, @@ -47,4 +54,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb559876..69553e7c14c18c 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks.
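RSEQ_EVENT_GUARD defined above names the guard class that scoped_guard() instantiates in the next hunk. Expanded by hand, the guarded statement reduces to roughly the following (a sketch: the irq and preempt guards are the DEFINE_LOCK_GUARD_0() wrappers around local_irq_disable()/local_irq_enable() and preempt_disable()/preempt_enable(), the latter visible in the preempt.h hunk earlier in this patch):

	/* CONFIG_MEMBARRIER=y: scoped_guard(irq) { ... } */
	local_irq_disable();
	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
	local_irq_enable();

	/* CONFIG_MEMBARRIER=n: scoped_guard(preempt) { ... } */
	preempt_disable();
	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
	preempt_enable();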
@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } @@ -65,7 +70,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +112,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index f267a06c6b1e7e..276b509c03e364 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -99,6 +99,17 @@ extern int rtsx_usb_card_exclusive_check(struct rtsx_ucr *ucr, int card); #define CD_MASK (SD_CD | MS_CD | XD_CD) #define SD_WP 0x08 +/* OCPCTL */ +#define MS_OCP_DETECT_EN 0x08 +#define MS_OCP_INT_EN 0x04 +#define MS_OCP_INT_CLR 0x02 +#define MS_OCP_CLEAR 0x01 + +/* OCPSTAT */ +#define MS_OCP_DETECT 0x80 +#define MS_OCP_NOW 0x02 +#define MS_OCP_EVER 0x01 + /* reader command field offset & parameters */ #define READ_REG_CMD 0 #define WRITE_REG_CMD 1 diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faefd0..9520aab34bcbe3 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include -#include - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV +#include #include +#include #include -#include /* * Deterministic automaton per-object variables. diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb63..adcc43042c9021 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b8333503c..cbb7340c5866fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include #include #include +#ifndef COMPILE_OFFSETS +#include +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -706,7 +709,6 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; - unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -733,7 +735,6 @@ struct sched_dl_entity { * runnable task.
*/ struct rq *rq; - dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES @@ -883,6 +884,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif @@ -2312,4 +2318,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. 
+ */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf587a..d82b7a9b0658be 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. + */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79a8..7d6449982822e5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352bca..34d6a0e108c3e6 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8c3..bbcfdf12aa6e57 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,33 +30,24 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -#ifdef CONFIG_NUMA -static 
inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); @@ -172,7 +163,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data { diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b97170c..bd33f194c94a5b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1215,7 +1216,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb3c..275269ce2ec892 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 0832776262ac3b..e6a3476bcef1ae 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -19,6 +19,7 @@ struct dentry; struct fwnode_handle; +struct device_node; struct sdw_bus; struct sdw_slave; @@ -1086,6 +1087,10 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +struct device *of_sdw_find_device_by_node(struct device_node *np); + +int 
sdw_slave_get_current_bank(struct sdw_slave *sdev); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1119,6 +1124,18 @@ static inline int sdw_stream_remove_slave(struct sdw_slave *slave, return -EINVAL; } +static inline struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return NULL; +} + +static inline int sdw_slave_get_current_bank(struct sdw_slave *sdev) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return -EINVAL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e9ea43234d9a87..cb2c2df3108999 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 24 +#define SPI_DEVICE_CS_CNT_MAX 4 struct dma_chan; struct software_node; @@ -170,6 +170,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * two delays will be added up. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. + * @num_chipselect: Number of physical chipselects used. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) @@ -228,7 +229,8 @@ struct spi_device { struct spi_delay cs_hold; struct spi_delay cs_inactive; - u8 chip_select[SPI_CS_CNT_MAX]; + u8 chip_select[SPI_DEVICE_CS_CNT_MAX]; + u8 num_chipselect; /* * Bit mask of the chipselect(s) that the driver need to use from @@ -236,9 +238,9 @@ struct spi_device { * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ - u32 cs_index_mask : SPI_CS_CNT_MAX; + u32 cs_index_mask : SPI_DEVICE_CS_CNT_MAX; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + struct gpio_desc *cs_gpiod[SPI_DEVICE_CS_CNT_MAX]; /* Chip select gpio desc */ /* * Likely need more hooks for more protocol options affecting how @@ -315,7 +317,7 @@ static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } @@ -719,8 +721,8 @@ struct spi_controller { bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; - s8 last_cs[SPI_CS_CNT_MAX]; - u32 last_cs_index_mask : SPI_CS_CNT_MAX; + s8 last_cs[SPI_DEVICE_CS_CNT_MAX]; + u32 last_cs_index_mask : SPI_DEVICE_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; diff --git a/include/linux/stddef.h b/include/linux/stddef.h index dab49e2ec8c0af..80b6bfb944f0d2 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -94,7 +94,8 @@ enum { __DECLARE_FLEX_ARRAY(TYPE, NAME) /** - * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing + * members. * * Creates a union between a flexible-array member (FAM) in a struct and a set * of additional members that would otherwise follow it. @@ -102,15 +103,30 @@ enum { * @TYPE: Flexible structure type name, including "struct" keyword. * @NAME: Name for a variable to define. 
* @FAM: The flexible-array member within @TYPE + * @ATTRS: Any struct attributes (usually empty) * @MEMBERS: Trailing overlapping members. */ -#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ +#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \ union { \ TYPE NAME; \ struct { \ - unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \ + unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ - }; \ + } ATTRS; \ } +/** + * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * + * Creates a union between a flexible-array member (FAM) in a struct and a set + * of additional members that would otherwise follow it. + * + * @TYPE: Flexible structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @FAM: The flexible-array member within @TYPE + * @MEMBERS: Trailing overlapping members. + */ +#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ + __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS) + #endif diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff2601..ee84087d4b26bc 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -17,6 +17,12 @@ #include +static inline const char *str_assert_deassert(bool v) +{ + return v ? "assert" : "deassert"; +} +#define str_deassert_assert(v) str_assert_deassert(!(v)) + static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; @@ -41,6 +47,12 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) +static inline const char *str_input_output(bool v) +{ + return v ? "input" : "output"; +} +#define str_output_input(v) str_input_output(!(v)) + static inline const char *str_on_off(bool v) { return v ? "on" : "off"; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 317ae31e89b374..b02876f1ae38ac 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -418,6 +418,12 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { } #endif /* CONFIG_HIBERNATION */ +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_SUSPEND) +bool pm_hibernation_mode_is_suspend(void); +#else +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } +#endif + int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fdfb..7012a0f758d84a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. 
+ */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d44139d..66c06fcdfe19e2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f418aae4f1134f..9a25a29116528f 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -106,10 +106,7 @@ struct attribute_group { const struct bin_attribute *, int); struct attribute **attrs; - union { - const struct bin_attribute *const *bin_attrs; - const struct bin_attribute *const *bin_attrs_new; - }; + const struct bin_attribute *const *bin_attrs; }; #define SYSFS_PREALLOC 010000 @@ -293,7 +290,7 @@ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ - .bin_attrs_new = _name##_attrs, \ + .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) @@ -308,12 +305,8 @@ struct bin_attribute { struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, - char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*write_new)(struct file *, struct kobject *, - const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc9497..c514d0e5a45cb2 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,17 +33,22 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -52,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); } @@ -108,6 +113,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { @@ -129,7 +138,7 @@ static inline void put_time_ns(struct time_namespace 
*ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 57ed3035cc3093..c52b862dad45be 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -51,9 +51,15 @@ struct tnum tnum_xor(struct tnum a, struct tnum b); /* Multiply two tnums, return @a * @b */ struct tnum tnum_mul(struct tnum a, struct tnum b); +/* Return true if the known bits of both tnums have the same value */ +bool tnum_overlap(struct tnum a, struct tnum b); + /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); +/* Return a tnum representing numbers satisfying either @t1 or @t2 */ +struct tnum tnum_union(struct tnum t1, struct tnum t2); + /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d3902..6575af39fd10f7 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c390946c..ee3d36eda45dd2 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,8 +186,14 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *data); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -205,7 +216,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t,
unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; @@ -281,7 +297,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a097308..57d1ff0060901a 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d01213780..9a9aebbf96b9f2 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,10 +168,15 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -181,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); } diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 00000000000000..60f37fec0f4b1a --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace 
init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + ns_ref_inc(ns); +} + +extern struct uts_namespace *copy_utsname(u64 flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (ns_ref_put(ns)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(u64 flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412bfe..547bd4439706e6 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include enum uts_proc { UTS_PROC_ARCH, @@ -18,57 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline void get_uts_ns(struct uts_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct uts_namespace *ns); - -static inline void put_uts_ns(struct uts_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_uts_ns(ns); -} - -void uts_ns_init(void); -#else -static inline void get_uts_ns(struct uts_namespace *ns) -{ -} - -static inline void put_uts_ns(struct uts_namespace *ns) -{ -} - -static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) -{ - if (flags & CLONE_NEWUTS) - return ERR_PTR(-EINVAL); - - return old_ns; -} - -static inline void uts_ns_init(void) -{ -} -#endif - #ifdef CONFIG_PROC_SYSCTL extern void uts_proc_notify(enum uts_proc proc); #else diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c31b..dec7f2beabfd4b 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8bf156dde554a5..7427b79d6f3d54 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, } static inline void virtio_get_features(struct virtio_device *vdev, - u64 *features) + u64 *features_out) { if (vdev->config->get_extended_features) { - vdev->config->get_extended_features(vdev, features); + vdev->config->get_extended_features(vdev, features_out); return; } - virtio_features_from_u64(features, 
vdev->config->get_features(vdev)); + virtio_features_from_u64(features_out, + vdev->config->get_features(vdev)); } /** @@ -326,11 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) static inline bool virtio_get_shm_region(struct virtio_device *vdev, - struct virtio_shm_region *region, u8 id) + struct virtio_shm_region *region_out, u8 id) { if (!vdev->config->get_shm_region) return false; - return vdev->config->get_shm_region(vdev, region, id); + return vdev->config->get_shm_region(vdev, region_out, id); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff609..dabc351cc127d4 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. 
* * RETURNS: @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,21 +770,21 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a4663..15a4bc4ab81953 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6906af7a8f2412..6560b32f312557 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1245,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h 
= &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e3a..1fc2fb03ce3f9a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275f3..cb664f6e355807 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) @@ -262,10 +262,15 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -276,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. 
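The conversions below swap open-coded refcount_*() calls on &net->ns.count for ns_ref_*() helpers. Going by the __ns_ref field this series introduces elsewhere, their assumed shape is roughly the following; this is a sketch, not the authoritative definitions:

/* sketch: 'ns' is any object embedding struct ns_common as its ->ns member */
#define ns_ref_inc(ns)	refcount_inc(&(ns)->ns.__ns_ref)
#define ns_ref_get(ns)	refcount_inc_not_zero(&(ns)->ns.__ns_ref)
#define ns_ref_put(ns)	refcount_dec_and_test(&(ns)->ns.__ns_ref)
#define ns_ref_read(ns)	refcount_read(&(ns)->ns.__ns_ref)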
*/ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -284,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -296,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc31..3faa80f5d8019d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1912,7 +1912,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828dd..656e784714f3b5 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, const struct nft_set_ext * nft_hash_lookup(const struct net *net, const struct nft_set *set, const u32 *key); +#endif + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key); -#else -static inline const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) -{ - return set->ops->lookup(net, set, key); -} -#endif /* called from nft_pipapo_avx2.c */ const struct nft_set_ext * diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5fb..99dd166c5d07c3 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fcf7..2e14283c5be1ad 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. 
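With the clearing moved from sock_orphan() into the detach path of sk_set_socket(), attach and detach stay symmetric. A sketch of what the WRITE_ONCE()/READ_ONCE() pairing buys a lockless reader (the helper name is hypothetical):

static unsigned long sock_ino_snapshot(const struct sock *sk)
{
	/* reads 0 once sk_set_socket(sk, NULL) has run; sk_uid stays valid */
	return READ_ONCE(sk->sk_ino);
}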
*/ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb1177..f288c348a6c132 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -115,6 +115,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 513c8e9704f657..4f2d3268a6769d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -160,13 +160,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); @@ -389,8 +399,13 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { + return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h index 67031a774e3d3c..5368cf5fd623e7 100644 --- a/include/rv/ltl_monitor.h +++ b/include/rv/ltl_monitor.h @@ -56,7 +56,7 @@ static void ltl_task_init(struct task_struct *task, bool task_creation) ltl_atoms_fetch(task, mon); } -static void handle_task_newtask(void *data, struct task_struct *task, unsigned long flags) +static void handle_task_newtask(void *data, struct task_struct *task, u64 flags) { ltl_task_init(task, true); } diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 5459c221badfed..43a87a39110c0c 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -49,6 +49,7 @@ int cs_amp_write_cal_coeffs(struct cs_dsp *dsp, const struct cirrus_amp_cal_data *data); int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index, struct cirrus_amp_cal_data *out_data); +int cs_amp_get_vendor_spkid(struct device *dev); struct cs_amp_test_hooks { efi_status_t (*get_efi_variable)(efi_char16_t *name, diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 7c8bbe8ad1e2de..ab044ce2aa8b35 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -85,7 +85,9 @@ #define CS35L56_DSP1_XMEM_UNPACKED24_0 0x2800000 #define CS35L56_DSP1_FW_VER 0x2800010 #define CS35L56_DSP1_HALO_STATE 0x28021E0 +#define CS35L56_B2_DSP1_HALO_STATE 0x2803D20 #define CS35L56_DSP1_PM_CUR_STATE 0x2804308 +#define CS35L56_B2_DSP1_PM_CUR_STATE 0x2804678 
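The new CS35L56_B2_* constants mirror existing firmware-state registers at their B2-silicon addresses, so register selection becomes revision-dependent. An illustrative sketch only; the is_b2 predicate is hypothetical and the driver's actual dispatch is not shown in this header:

static unsigned int cs35l56_halo_state_reg(bool is_b2)
{
	/* DSP1_HALO_STATE moved on B2 silicon; pick the address by revision */
	return is_b2 ? CS35L56_B2_DSP1_HALO_STATE : CS35L56_DSP1_HALO_STATE;
}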
#define CS35L56_DSP1_XMEM_UNPACKED24_8191 0x2807FFC #define CS35L56_DSP1_CORE_BASE 0x2B80000 #define CS35L56_DSP1_SCRATCH1 0x2B805C0 @@ -337,9 +339,6 @@ extern const struct regmap_config cs35l56_regmap_sdw; extern const struct regmap_config cs35l63_regmap_i2c; extern const struct regmap_config cs35l63_regmap_sdw; -extern const struct cs35l56_fw_reg cs35l56_fw_reg; -extern const struct cs35l56_fw_reg cs35l63_fw_reg; - extern const struct cirrus_amp_cal_controls cs35l56_calibration_controls; extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC]; diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h index 5c739e49ae2615..d78cda86688884 100644 --- a/include/sound/soc-component.h +++ b/include/sound/soc-component.h @@ -261,31 +261,19 @@ struct snd_soc_component { list_for_each_entry_safe(dai, _dai, &(component)->dai_list, list) /** - * snd_soc_dapm_to_component() - Casts a DAPM context to the component it is - * embedded in - * @dapm: The DAPM context to cast to the component - * - * This function must only be used on DAPM contexts that are known to be part of - * a component (e.g. in a component driver). Otherwise the behavior is - * undefined. - */ -static inline struct snd_soc_component *snd_soc_dapm_to_component( - struct snd_soc_dapm_context *dapm) -{ - return container_of(dapm, struct snd_soc_component, dapm); -} - -/** - * snd_soc_component_get_dapm() - Returns the DAPM context associated with a + * snd_soc_component_to_dapm() - Returns the DAPM context associated with a * component * @component: The component for which to get the DAPM context */ -static inline struct snd_soc_dapm_context *snd_soc_component_get_dapm( +static inline struct snd_soc_dapm_context *snd_soc_component_to_dapm( struct snd_soc_component *component) { return &component->dapm; } +// FIXME +#define snd_soc_component_get_dapm snd_soc_component_to_dapm + /** * snd_soc_component_cache_sync() - Sync the register cache with the hardware * @component: COMPONENT to sync diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index ed39458b94bf9b..75941324886be3 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -583,11 +583,9 @@ struct snd_soc_dapm_update { struct snd_soc_dapm_context { enum snd_soc_bias_level bias_level; - /* bit field */ - unsigned int idle_bias_off:1; /* Use BIAS_OFF instead of STANDBY */ - unsigned int suspend_bias_off:1; /* Use BIAS_OFF in suspend if the DAPM is idle */ + bool idle_bias; /* Use BIAS_OFF instead of STANDBY when false */ - struct device *dev; /* from parent - for debug */ + struct device *dev; /* from parent - for debug */ /* REMOVE ME */ struct snd_soc_component *component; /* parent component */ struct snd_soc_card *card; /* parent card */ @@ -660,6 +658,12 @@ void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card); int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s); +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm); +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm); +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm); + +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm); +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on); /* dapm path setup */ int snd_soc_dapm_new_widgets(struct snd_soc_card *card); diff --git 
a/include/sound/soc.h b/include/sound/soc.h index 1fffef311c413f..ddc508ff7b9be1 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1120,6 +1120,11 @@ static inline int snd_soc_card_is_instantiated(struct snd_soc_card *card) return card && card->instantiated; } +static inline struct snd_soc_dapm_context *snd_soc_card_to_dapm(struct snd_soc_card *card) +{ + return &card->dapm; +} + /* SoC machine DAI configuration, glues a codec and cpu DAI together */ struct snd_soc_pcm_runtime { struct device *dev; diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 12e85df62091ec..76c64c5245d47c 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -250,5 +250,13 @@ int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_so int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +/* TI */ +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback); +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix); #endif diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 7f83d242c8e9f8..1b3c48b5591dfd 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -69,6 +69,9 @@ enum afs_fs_operation { yfs_FS_RemoveACL = 64171, yfs_FS_RemoveFile2 = 64173, yfs_FS_StoreOpaqueACL2 = 64174, + yfs_FS_Rename_Replace = 64176, + yfs_FS_Rename_NoReplace = 64177, + yfs_FS_Rename_Exchange = 64187, yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */ yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */ yfs_FS_StoreData64 = 64538, /* YFS Store file data */ @@ -300,6 +303,9 @@ enum yfs_cm_operation { EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \ EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \ EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \ + EM(yfs_FS_Rename_Replace, "YFS.Rename_Replace") \ + EM(yfs_FS_Rename_NoReplace, "YFS.Rename_NoReplace") \ + EM(yfs_FS_Rename_Exchange, "YFS.Rename_Exchange") \ EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \ EM(yfs_FS_FetchData64, "YFS.FetchData64") \ EM(yfs_FS_StoreData64, "YFS.StoreData64") \ diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982c9..370016c38a5bbc 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -27,7 +27,8 @@ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ - { FL_OFDLCK, "FL_OFDLCK" }) + { FL_OFDLCK, "FL_OFDLCK" }, \ + { FL_RECLAIM, "FL_RECLAIM"}) #define show_fl_type(val) \ __print_symbolic(val, \ @@ -189,7 +190,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = atomic_read(&inode->i_count); + __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h index d1ff560cd9b561..3865098f21f140 100644 --- a/include/trace/events/hwmon.h +++ 
b/include/trace/events/hwmon.h @@ -9,14 +9,14 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val), TP_STRUCT__entry( __field(int, index) __string(attr_name, attr_name) - __field(long, val) + __field(long long, val) ), TP_fast_assign( @@ -25,20 +25,20 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, __entry->val = val; ), - TP_printk("index=%d, attr_name=%s, val=%ld", + TP_printk("index=%d, attr_name=%s, val=%lld", __entry->index, __get_str(attr_name), __entry->val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_show, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_store, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b05303304..4f0759634306c7 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct *task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1e23919c0da981..c08aff044e8073 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history, ) ); +TRACE_EVENT(inode_switch_wbs_queue, + + TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, + unsigned int count), + + TP_ARGS(old_wb, new_wb, count), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) + __field(unsigned int, count) + ), + + TP_fast_assign( + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); + __entry->count = count; + ), + + TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", + __entry->name, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino, + __entry->count + ) +); + TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5b1..14a1c1fe013ace 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,8 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define
AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382ec..ae83d8649ef1cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ @@ -1605,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. + */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6666,6 +6682,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -7418,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94ebcc..3741ea1b73d850 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. 
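Returning to the BPF_PROG_LOAD additions above: signature, signature_size and keyring_id travel in the same anonymous struct as fd_array_cnt. A minimal user-space sketch, assuming a uapi header new enough to carry these fields and a signature blob produced out of band:

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int bpf_prog_load_signed(const struct bpf_insn *insns, __u32 insn_cnt,
				const void *sig, __u32 sig_len, __s32 keyring_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type      = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns          = (__u64)(unsigned long)insns;
	attr.insn_cnt       = insn_cnt;
	attr.license        = (__u64)(unsigned long)"GPL";
	attr.signature      = (__u64)(unsigned long)sig;  /* signature blob */
	attr.signature_size = sig_len;
	attr.keyring_id     = keyring_id; /* keyring holding the verification key */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}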
*/ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10ef8..beb4c2d1e41cb1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/include/uapi/linux/i8k.h b/include/uapi/linux/i8k.h index 268e6268f6c808..a16e4049710fcc 100644 --- a/include/uapi/linux/i8k.h +++ b/include/uapi/linux/i8k.h @@ -36,6 +36,8 @@ #define I8K_FAN_LOW 1 #define I8K_FAN_HIGH 2 #define I8K_FAN_TURBO 3 +/* Many machines treat this mode as some sort of automatic mode */ +#define I8K_FAN_AUTO 3 #define I8K_FAN_MAX I8K_FAN_TURBO #define I8K_VOL_UP 1 diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893cc..5fd5b4cf75ca1e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636ca2..7359d34da446b9 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139fc4..e098759ec917ac 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,6 +40,10 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. 
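A usage sketch for the identifier ioctls defined just below; ns_fd is a namespace file descriptor (for example, an open handle on /proc/self/ns/net), and error handling is elided:

#include <sys/ioctl.h>
#include <linux/nsfs.h>

static __u64 ns_id_of(int ns_fd)
{
	__u64 id = 0;

	/* NS_GET_ID fills in the 64-bit identifier of the namespace */
	ioctl(ns_fd, NS_GET_ID, &id);
	return id;
}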
*/ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + enum init_ns_ino { IPC_NS_INIT_INO = 0xEFFFFFFFU, UTS_NS_INIT_INO = 0xEFFFFFFEU, @@ -51,6 +53,18 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif +}; + +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; }; +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 00000000000000..94e51670383c81 --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Support (SFS) IOC + * + * possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. + * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but alternative firmware loading + * path can be specified using kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using sysfs file: + * /sys/module/firmware_class/parameters/path. 
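A minimal sketch of driving these ioctls from user space; the path of the SFS character device is not spelled out in this header, so fd is assumed to be an already-open handle on it:

#include <sys/ioctl.h>
#include <string.h>
#include <linux/psp-sfs.h>

static int sfs_query_fw_versions(int fd, struct sfs_user_get_fw_versions *vers)
{
	memset(vers, 0, sizeof(*vers));
	if (ioctl(fd, SFSIOCFWVERS, vers) < 0)
		return -1;
	/* on success vers->blob holds the version data; status fields are 0 */
	return vers->sfs_status ? -1 : 0;
}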
+ */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d62f..2c3346e91dbe59 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -37,6 +37,9 @@ /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + * + * Note: PTP_STRICT_FLAGS is always enabled by the kernel for + * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68a627d04afa12..10ad71aa00d67b 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -237,7 +237,7 @@ struct vduse_iova_umem { * struct vduse_iova_info - information of one IOVA region * @start: start of the IOVA region * @last: last of the IOVA region - * @capability: capability of the IOVA regsion + * @capability: capability of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 7aa2eb76620508..6c12db16faa3ad 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -68,6 +68,7 @@ #define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ #define VIRTIO_ID_GPIO 41 /* virtio gpio */ +#define VIRTIO_ID_SPI 45 /* virtio spi */ /* * Virtio Transitional IDs diff --git a/include/uapi/linux/virtio_spi.h b/include/uapi/linux/virtio_spi.h new file mode 100644 index 00000000000000..8ab3c970cdd311 --- /dev/null +++ b/include/uapi/linux/virtio_spi.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + */ +#ifndef _LINUX_VIRTIO_VIRTIO_SPI_H +#define _LINUX_VIRTIO_VIRTIO_SPI_H + +#include +#include +#include +#include + +/* Sample data on trailing clock edge */ +#define VIRTIO_SPI_CPHA _BITUL(0) +/* Clock is high when IDLE */ +#define VIRTIO_SPI_CPOL _BITUL(1) +/* Chip Select is active high */ +#define VIRTIO_SPI_CS_HIGH _BITUL(2) +/* Transmit LSB first */ +#define VIRTIO_SPI_MODE_LSB_FIRST _BITUL(3) +/* Loopback mode */ +#define VIRTIO_SPI_MODE_LOOP _BITUL(4) + +/** + * struct virtio_spi_config - All config fields are read-only for the + * Virtio SPI driver + * @cs_max_number: maximum number of chipselects the host SPI controller + * supports. + * @cs_change_supported: indicates if the host SPI controller supports toggling + * the chipselect after each transfer in one message: + * 0: unsupported, chipselect will be kept in active state throughout the + * message transaction; + * 1: supported. + * Note: Message here contains a sequence of SPI transfers. + * @tx_nbits_supported: indicates the supported number of bits for writing: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported.
+ * @rx_nbits_supported: indicates the supported number of bits for reading: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @bits_per_word_mask: mask indicating which values of bits_per_word are + * supported. If not set, no limitation for bits_per_word. + * @mode_func_supported: indicates the following features are supported or not: + * bit 0-1: CPHA feature + * 0b00: invalid, should support at least one CPHA setting + * 0b01: supports CPHA=0 only + * 0b10: supports CPHA=1 only + * 0b11: supports CPHA=0 and CPHA=1. + * bit 2-3: CPOL feature + * 0b00: invalid, should support at least one CPOL setting + * 0b01: supports CPOL=0 only + * 0b10: supports CPOL=1 only + * 0b11: supports CPOL=0 and CPOL=1. + * bit 4: chipselect active high feature, 0 for unsupported and 1 for + * supported, chipselect active low is supported by default. + * bit 5: LSB first feature, 0 for unsupported and 1 for supported, + * MSB first is supported by default. + * bit 6: loopback mode feature, 0 for unsupported and 1 for supported, + * normal mode is supported by default. + * @max_freq_hz: the maximum clock rate supported in Hz unit, 0 means no + * limitation for transfer speed. + * @max_word_delay_ns: the maximum word delay supported, in nanoseconds. + * A value of 0 indicates that word delay is unsupported. + * Each transfer may consist of a sequence of words. + * @max_cs_setup_ns: the maximum delay supported after chipselect is asserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * asserted. + * @max_cs_hold_ns: the maximum delay supported before chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce before chipselect + * is deasserted. + * @max_cs_inactive_ns: maximum delay supported after chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * deasserted. + */ +struct virtio_spi_config { + __u8 cs_max_number; + __u8 cs_change_supported; +#define VIRTIO_SPI_RX_TX_SUPPORT_DUAL _BITUL(0) +#define VIRTIO_SPI_RX_TX_SUPPORT_QUAD _BITUL(1) +#define VIRTIO_SPI_RX_TX_SUPPORT_OCTAL _BITUL(2) + __u8 tx_nbits_supported; + __u8 rx_nbits_supported; + __le32 bits_per_word_mask; +#define VIRTIO_SPI_MF_SUPPORT_CPHA_0 _BITUL(0) +#define VIRTIO_SPI_MF_SUPPORT_CPHA_1 _BITUL(1) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_0 _BITUL(2) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_1 _BITUL(3) +#define VIRTIO_SPI_MF_SUPPORT_CS_HIGH _BITUL(4) +#define VIRTIO_SPI_MF_SUPPORT_LSB_FIRST _BITUL(5) +#define VIRTIO_SPI_MF_SUPPORT_LOOPBACK _BITUL(6) + __le32 mode_func_supported; + __le32 max_freq_hz; + __le32 max_word_delay_ns; + __le32 max_cs_setup_ns; + __le32 max_cs_hold_ns; + __le32 max_cs_inactive_ns; +}; + +/** + * struct spi_transfer_head - virtio SPI transfer descriptor + * @chip_select_id: chipselect index used by the SPI transfer. + * @bits_per_word: the number of bits in each SPI transfer word. + * @cs_change: whether to deselect the device after finishing this transfer + * before starting the next transfer, 0 means cs keeps asserted and + * 1 means cs deasserted then asserted again. + * @tx_nbits: bus width for write transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid.
+ * @rx_nbits: bus width for read transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @reserved: for future use. + * @mode: SPI transfer mode. + * bit 0: CPHA, determines the timing (i.e. phase) of the data + * bits relative to the clock pulses. For CPHA=0, the + * "out" side changes the data on the trailing edge of the + * preceding clock cycle, while the "in" side captures the data + * on (or shortly after) the leading edge of the clock cycle. + * For CPHA=1, the "out" side changes the data on the leading + * edge of the current clock cycle, while the "in" side + * captures the data on (or shortly after) the trailing edge of + * the clock cycle. + * bit 1: CPOL, determines the polarity of the clock. CPOL=0 is a + * clock which idles at 0, and each cycle consists of a pulse + * of 1. CPOL=1 is a clock which idles at 1, and each cycle + * consists of a pulse of 0. + * bit 2: CS_HIGH, if 1, chip select active high, else active low. + * bit 3: LSB_FIRST, determines per-word bits-on-wire, if 0, MSB + * first, else LSB first. + * bit 4: LOOP, loopback mode. + * @freq: the transfer speed in Hz. + * @word_delay_ns: delay to be inserted between consecutive words of a + * transfer, in ns unit. + * @cs_setup_ns: delay to be introduced after CS is asserted, in ns + * unit. + * @cs_delay_hold_ns: delay to be introduced before CS is deasserted + * for each transfer, in ns unit. + * @cs_change_delay_inactive_ns: delay to be introduced after CS is + * deasserted and before it is next asserted, in ns unit. + */ +struct spi_transfer_head { + __u8 chip_select_id; + __u8 bits_per_word; + __u8 cs_change; + __u8 tx_nbits; + __u8 rx_nbits; + __u8 reserved[3]; + __le32 mode; + __le32 freq; + __le32 word_delay_ns; + __le32 cs_setup_ns; + __le32 cs_delay_hold_ns; + __le32 cs_change_delay_inactive_ns; +}; + +/** + * struct spi_transfer_result - virtio SPI transfer result + * @result: Transfer result code. + * VIRTIO_SPI_TRANS_OK: Transfer successful. + * VIRTIO_SPI_PARAM_ERR: Parameter error. + * VIRTIO_SPI_TRANS_ERR: Transfer error. + */ +struct spi_transfer_result { +#define VIRTIO_SPI_TRANS_OK 0 +#define VIRTIO_SPI_PARAM_ERR 1 +#define VIRTIO_SPI_TRANS_ERR 2 + __u8 result; +}; + +#endif /* #ifndef _LINUX_VIRTIO_VIRTIO_SPI_H */ diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 26f756cc2e6215..b610683fd8dbe4 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -13,7 +13,7 @@ #include #include -#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 0) +#define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 4, 1) /** * struct snd_compressed_buffer - compressed buffer * @fragment_size: size of buffer fragment in bytes diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index bc7648a30746f4..d7db6b4e116663 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -43,7 +43,8 @@ #define SND_AUDIOCODEC_BESPOKE ((__u32) 0x0000000E) #define SND_AUDIOCODEC_ALAC ((__u32) 0x0000000F) #define SND_AUDIOCODEC_APE ((__u32) 0x00000010) -#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_APE +#define SND_AUDIOCODEC_OPUS_RAW ((__u32) 0x00000011) +#define SND_AUDIOCODEC_MAX SND_AUDIOCODEC_OPUS_RAW /* * Profile and modes are listed with bit masks.
This allows for a @@ -324,6 +325,43 @@ struct snd_dec_ape { __u32 seek_table_present; } __attribute__((packed, aligned(4))); +/** + * struct snd_dec_opus - Opus decoder parameters (raw opus packets) + * @version: Usually should be '1' but can be split into major (4 upper bits) + * and minor (4 lower bits) sub-fields. + * @num_channels: Number of output channels. + * @pre_skip: Number of samples to discard at 48 kHz. + * @sample_rate: Sample rate of original input. + * @output_gain: Gain to apply when decoding (in Q7.8 format). + * @mapping_family: Order and meaning of output channels. Only values 0 and 1 + * are expected; values 2..255 are not recommended for playback. + * + * @chan_map: Optional channel mapping table. Describes mapping of opus streams + * to decoded channels. Fields: + * @chan_map.stream_count: Number of streams encoded in each Ogg packet. + * @chan_map.coupled_count: Number of streams whose decoders are used + * for two channels. + * @chan_map.channel_map: Which decoded channel to be used for each one. + * Supports only mapping families 0 and 1, + * max number of channels is 8. + * + * These options were extracted from RFC7845 Section 5. + */ + +struct snd_dec_opus { + __u8 version; + __u8 num_channels; + __u16 pre_skip; + __u32 sample_rate; + __u16 output_gain; + __u8 mapping_family; + struct snd_dec_opus_ch_map { + __u8 stream_count; + __u8 coupled_count; + __u8 channel_map[8]; + } chan_map; +} __attribute__((packed, aligned(4))); + union snd_codec_options { struct snd_enc_wma wma; struct snd_enc_vorbis vorbis; @@ -334,6 +372,7 @@ union snd_codec_options { struct snd_dec_wma wma_d; struct snd_dec_alac alac_d; struct snd_dec_ape ape_d; + struct snd_dec_opus opus_d; struct { __u32 out_sample_rate; } src_d; diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 02533038640e53..23c39b96190fdf 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -31,7 +31,7 @@ struct arch_vdso_time_data {}; #if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) #include -#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +#else struct vdso_arch_data { /* Needed for the generic code, never actually used at runtime */ char __unused; @@ -164,7 +164,6 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); @@ -185,8 +184,6 @@ enum vdso_pages { VDSO_NR_PAGES }; -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ - /* * The generic vDSO implementation requires that gettimeofday.h * provides: @@ -196,11 +193,7 @@ enum vdso_pages { * - clock_gettime_fallback(): fallback for clock_gettime. * - clock_getres_fallback(): fallback for clock_getres. 
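For orientation, such a fallback is just an inlined direct system call, since the vDSO cannot reach libc. Roughly what the x86-64 header supplies (paraphrased here for illustration, not part of this patch):

static __always_inline
long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
{
	long ret;

	/* direct syscall; clobbers follow the x86-64 syscall ABI */
	asm ("syscall" : "=a" (ret), "=m" (*_ts) :
	     "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) :
	     "rcx", "r11");
	return ret;
}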
*/ -#ifdef ENABLE_COMPAT_VDSO -#include -#else #include -#endif /* ENABLE_COMPAT_VDSO */ #else /* !__ASSEMBLY__ */ diff --git a/include/vdso/gettime.h b/include/vdso/gettime.h index c50d152e7b3e06..9ac161866653a0 100644 --- a/include/vdso/gettime.h +++ b/include/vdso/gettime.h @@ -5,6 +5,7 @@ #include struct __kernel_timespec; +struct __kernel_old_timeval; struct timezone; #if !defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) diff --git a/include/vdso/jiffies.h b/include/vdso/jiffies.h index 2f9d596c8b2977..8ca04a141412ec 100644 --- a/include/vdso/jiffies.h +++ b/include/vdso/jiffies.h @@ -5,7 +5,7 @@ #include /* for HZ */ #include -/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +/* TICK_NSEC is the time between ticks in nsec */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) #endif /* __VDSO_JIFFIES_H */ diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index e279be353e3f11..69ac6d80a006b7 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -164,7 +164,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) map->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) map->host_addr = __pa(addr); else map->host_addr = addr; @@ -181,7 +181,7 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, { if (flags & GNTMAP_contains_pte) unmap->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) + else if (!xen_pv_domain()) unmap->host_addr = __pa(addr); else unmap->host_addr = addr; diff --git a/include/xen/mem-reservation.h b/include/xen/mem-reservation.h index a2ab516fcd2caf..3cbe3df0dfd4e9 100644 --- a/include/xen/mem-reservation.h +++ b/include/xen/mem-reservation.h @@ -39,7 +39,7 @@ static inline void xenmem_reservation_va_mapping_update(unsigned long count, xen_pfn_t *frames) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_update(count, pages, frames); #endif } @@ -48,7 +48,7 @@ static inline void xenmem_reservation_va_mapping_reset(unsigned long count, struct page **pages) { #ifdef CONFIG_XEN_HAVE_PVMMU - if (!xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_pv_domain()) __xenmem_reservation_va_mapping_reset(count, pages); #endif } diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 9e2a769b0d961a..496e6013c689f9 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +117,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); @@ -150,7 +151,7 @@ static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma, int nr, int *err_ptr, pgprot_t prot, unsigned int domid) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid, @@ -175,7 +176,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned int domid, struct page **pages) { - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (!xen_pv_domain()) return -EOPNOTSUPP; return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false); diff --git 
a/include/xen/xen.h b/include/xen/xen.h index a1e5b3f18d69f9..61854e3f283776 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -22,8 +22,15 @@ extern bool xen_pvh; #define xen_pvh 0 #endif +#ifdef CONFIG_X86 +#include + +#define xen_pv_domain() (cpu_feature_enabled(X86_FEATURE_XENPV)) +#else +#define xen_pv_domain() 0 +#endif + #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) #define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #define xen_pvh_domain() (xen_pvh) diff --git a/init/Kconfig b/init/Kconfig index e3eb63eadc8757..f3b13463ec2608 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -99,7 +99,10 @@ config GCC_ASM_GOTO_OUTPUT_BROKEN config CC_HAS_ASM_GOTO_OUTPUT def_bool y depends on !GCC_ASM_GOTO_OUTPUT_BROKEN + # Detect basic support depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + # Detect clang (< v17) scoped label issues + depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto(""::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto(""::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null) config CC_HAS_ASM_GOTO_TIED_OUTPUT depends on CC_HAS_ASM_GOTO_OUTPUT @@ -112,6 +115,16 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASSUME + bool + # clang needs to be at least 19.1.0 since the meaning of the assume + # attribute changed: + # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17 + default y if CC_IS_CLANG && CLANG_VERSION >= 190100 + # supported since gcc 13.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654 + default y if CC_IS_GCC && GCC_VERSION >= 130100 + config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) @@ -1350,7 +1363,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" - depends on GENERIC_VDSO_TIME_NS + depends on GENERIC_GETTIMEOFDAY default y help In this namespace boottime and monotonic clocks can be set. @@ -1501,6 +1514,7 @@ config BOOT_CONFIG_EMBED_FILE config INITRAMFS_PRESERVE_MTIME bool "Preserve cpio archive mtimes in initramfs" + depends on BLK_DEV_INITRD default y help Each entry in an initramfs cpio archive carries an mtime value. 
When @@ -2067,8 +2081,8 @@ config RUST depends on !GCC_PLUGIN_RANDSTRUCT depends on !RANDSTRUCT depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO) - depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC - select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG + depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC + select CFI_ICALL_NORMALIZE_INTEGERS if CFI depends on !CALL_PADDING || RUSTC_VERSION >= 108100 depends on !KASAN_SW_TAGS depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300 diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index ac021ae6e6fa78..19d9f33dcacf85 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "do_mounts.h" @@ -186,14 +187,12 @@ static unsigned long nr_blocks(struct file *file) int __init rd_load_image(char *from) { int res = 0; - unsigned long rd_blocks, devblocks; + unsigned long rd_blocks, devblocks, nr_disks; int nblocks, i; char *buf = NULL; unsigned short rotate = 0; decompress_fn decompressor = NULL; -#if !defined(CONFIG_S390) char rotator[4] = { '|' , '/' , '-' , '\\' }; -#endif out_file = filp_open("/dev/ram", O_RDWR, 0); if (IS_ERR(out_file)) @@ -244,8 +243,9 @@ int __init rd_load_image(char *from) goto done; } - printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", - nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + nr_disks = (nblocks - 1) / devblocks + 1; + pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, nr_disks, str_plural(nr_disks)); for (i = 0; i < nblocks; i++) { if (i && (i % devblocks == 0)) { pr_cont("done disk #1.\n"); @@ -255,12 +255,10 @@ int __init rd_load_image(char *from) } kernel_read(in_file, buf, BLOCK_SIZE, &in_pos); kernel_write(out_file, buf, BLOCK_SIZE, &out_pos); -#if !defined(CONFIG_S390) - if (!(i % 16)) { + if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) { pr_cont("%c\b", rotator[rotate & 0x3]); rotate++; } -#endif } pr_cont("done.\n"); diff --git a/init/init_task.c b/init/init_task.c index e557f622bd9061..a55e2189206fa4 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -27,6 +27,9 @@ static struct signal_struct init_signals = { }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, +#ifdef CONFIG_CGROUPS + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), +#endif .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS diff --git a/init/initramfs.c b/init/initramfs.c index 097673b97784db..6ddbfb17fb8f1b 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "do_mounts.h" #include "initramfs_internal.h" @@ -108,7 +109,7 @@ static char __init *find_link(int major, int minor, int ino, q->minor = minor; q->ino = ino; q->mode = mode; - strcpy(q->name, name); + strscpy(q->name, name); q->next = NULL; *p = q; hardlink_seen = true; @@ -152,7 +153,7 @@ static void __init dir_add(const char *name, size_t nlen, time64_t mtime) { struct dir_entry *de; - de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); + de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c0a..fab4f599c035e7 100644 --- a/init/main.c +++ b/init/main.c @@ -103,6 +103,7 @@ #include #include #include +#include 
#include #include @@ -956,6 +957,7 @@ void start_kernel(void) sort_main_extable(); trap_init(); mm_core_init(); + maple_tree_init(); poking_init(); ftrace_init(); @@ -973,7 +975,6 @@ void start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); - maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound @@ -1072,6 +1073,7 @@ void start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); + time_ns_init(); key_init(); security_init(); dbg_late_init(); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 043cbf80a766de..d071835121c2c4 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,8 @@ #include struct uts_namespace init_uts_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.ns_type = ns_common_type(&init_uts_ns), + .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, @@ -18,7 +19,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, - .ns.inum = PROC_UTS_INIT_INO, + .ns.inum = ns_init_inum(&init_uts_ns), #ifdef CONFIG_UTS_NS .ns.ops = &utsns_operations, #endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 17dfaa0395c46b..1d03b2fc4b2594 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -352,16 +352,16 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wq_acct *acct; - bool do_create = false; + bool activated_free_worker, do_create = false; worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; rcu_read_lock(); - do_create = !io_acct_activate_free_worker(acct); + activated_free_worker = io_acct_activate_free_worker(acct); rcu_read_unlock(); - if (!do_create) + if (activated_free_worker) goto no_need_create; raw_spin_lock(&acct->workers_lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a1657c..93665cebe9bdd7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) @@ -1406,8 +1402,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, tw); - if (unlikely(io_should_terminate_tw())) + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(io_should_terminate_tw(ctx))) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74d2..1880902be6fd72 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -476,9 +476,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) * 2) PF_KTHREAD is set, in which case the invoker of the task_work 
is * our fallback task_work. */ -static inline bool io_should_terminate_tw(void) +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) { - return current->flags & (PF_KTHREAD | PF_EXITING); + return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb0e..5e5b94236d7204 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; diff --git a/io_uring/notif.c b/io_uring/notif.c index 9a6f6e92d74242..ea9c0116cec2df 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -85,7 +85,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) return -EEXIST; prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); - prev_notif = cmd_to_io_kiocb(nd); + prev_notif = cmd_to_io_kiocb(prev_nd); /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || diff --git a/io_uring/poll.c b/io_uring/poll.c index c786e587563b05..6090a26975d400 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw())) + if (unlikely(io_should_terminate_tw(req->ctx))) return -ECANCELED; do { diff --git a/io_uring/rw.c b/io_uring/rw.c index 52a5b950b2e5e9..af5a54b5db1233 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -886,6 +886,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; + if (!(file->f_mode & FMODE_HAS_METADATA)) + return -EINVAL; + /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. 
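A minimal sketch (not part of the patch) of how a task_work handler consumes the reworked helper: io_should_terminate_tw() now takes the ring ctx, so pending work is also cancelled once the ring's refs are dying, not only when the submitting task is exiting. example_tw_handler() and its -ECANCELED result are hypothetical; the locking and failure calls mirror io_req_task_submit() above.

static void example_tw_handler(struct io_kiocb *req, io_tw_token_t tw)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_tw_lock(ctx, tw);
	/* Bail out if the task is exiting or the ring is being torn down. */
	if (unlikely(io_should_terminate_tw(ctx))) {
		io_req_defer_failed(req, -ECANCELED);
		return;
	}
	/* ... normal completion path ... */
}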
diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7f13bfa9f2b617..17e3aab0af3676 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw()) { + if (!io_should_terminate_tw(req->ctx)) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0fc..213716e10d704a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw()) + if (io_should_terminate_tw(req->ctx)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ diff --git a/ipc/msgutil.c b/ipc/msgutil.c index c7be0c79264767..7a03f6d03de3ad 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "util.h" @@ -26,12 +27,13 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, - .ns.inum = PROC_IPC_INIT_INO, + .ns.inum = ns_init_inum(&init_ipc_ns), #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe9f..59b12fcb40bdf4 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "util.h" @@ -61,12 +62,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; - ns->ns.ops = &ipcns_operations; - refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -87,6 +86,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, sem_init_ns(ns); shm_init_ns(ns); + ns_tree_add(ns); return ns; @@ -97,7 +97,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -106,7 +106,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } @@ -199,20 +199,16 @@ static void free_ipc(struct work_struct *unused) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); + ns_tree_remove(ns); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = 
NULL; @@ -252,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88fa2..0f06e4bd4673f5 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. */ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/ipc/shm.c b/ipc/shm.c index a9310b6dbbc369..3db36773dd1023 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -148,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns) static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); + ns_tree_add(&init_ipc_ns); return 0; } diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 54ea59ff8fbeb6..da326800c1c9be 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -103,6 +103,19 @@ config PREEMPT_RT Select this if you are building a kernel for systems which require real-time guarantees. +config PREEMPT_RT_NEEDS_BH_LOCK + bool "Enforce softirq synchronisation on PREEMPT_RT" + depends on PREEMPT_RT + help + Enforce synchronisation across the softirq context. On PREEMPT_RT + the softirq is preemptible. This enforces the same per-CPU BKL + semantics that non-PREEMPT_RT builds have. It should not be needed, + because per-CPU locks were added to avoid the per-CPU BKL. + + This switch provides the old behaviour for testing purposes. Select + this if you suspect an error with preemptible softirq and want to + test the old synchronised behaviour. + config PREEMPT_COUNT bool diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235f2..41751834e764f2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 61b5744d0bb684..26a332ffb1b8d9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Counts of the LSMs that provide a security context, and the lists of those LSMs */ +static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +203,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously.
*/ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +288,33 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, if (!ab) return NULL; + skb_queue_head_init(&ab->skb_list); + ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1833,11 +1877,11 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) + struct audit_stamp *stamp) { - if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_real_ts64(t); - *serial = audit_serial(); + if (!ctx || !auditsc_get_stamp(ctx, stamp)) { + ktime_get_coarse_real_ts64(&stamp->ctime); + stamp->serial = audit_serial(); } } @@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec64 t; - unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &t, &serial); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return 
ab; } @@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record is complete, + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record they are logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, an AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions.
+ */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { @@ -2410,6 +2600,28 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else { + audit_log_lost("rate limit exceeded"); + kfree_skb(skb); + } +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; - - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 2a24d01c5fb0e2..0f05933a173be0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -99,6 +99,12 @@ struct audit_proctitle { char *value; /* the 
cmdline field */ }; +/* A timestamp/serial pair to identify an event */ +struct audit_stamp { + struct timespec64 ctime; /* time of syscall entry */ + unsigned int serial; /* serial number for record */ +}; + /* The per-task audit context. */ struct audit_context { int dummy; /* must be the first element */ @@ -108,10 +114,9 @@ struct audit_context { AUDIT_CTX_URING, /* in use by io_uring */ } context; enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ + struct audit_stamp stamp; /* event identifier */ int major; /* syscall number */ int uring_op; /* uring operation */ - struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty); extern unsigned int audit_serial(void); #ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial); + struct audit_stamp *stamp); extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); @@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else /* CONFIG_AUDITSYSCALL */ -#define auditsc_get_stamp(c, t, s) 0 +#define auditsc_get_stamp(c, s) 0 #define audit_put_watch(w) do { } while (0) #define audit_get_watch(w) do { } while (0) #define audit_to_watch(k, p, l, o) (-EINVAL) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac876..b92805b317a2d4 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895d8..1605df0a171ea8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; + size_t sz; - tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); + sz = strlen(s) + 1; + tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; @@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s) INIT_LIST_HEAD(&tree->list); INIT_LIST_HEAD(&tree->same_root); tree->root = NULL; - strcpy(tree->pathname, s); + strscpy(tree->pathname, s, sz); } 
return tree; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f02..a700e3c8925ff7 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7708fe2c45722..c401082d9b2506 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r) } /* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM + * It will traverse the filter lists searching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index eb98cd6fe91fb5..d1966144bdfe70 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx) */ ctx->current_state = ctx->state; - ctx->serial = 0; + ctx->stamp.serial = 0; + ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; ctx->major = 0; ctx->uring_op = 0; - ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? 
~0ULL : 0); @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { @@ -1785,8 +1763,9 @@ static void audit_log_exit(void) audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, - &context->target_ref, context->target_comm)) - call_panic = 1; + &context->target_ref, + context->target_comm)) + call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); @@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op) ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; - ktime_get_coarse_real_ts64(&ctx->ctime); + ktime_get_coarse_real_ts64(&ctx->stamp.ctime); } /** @@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; - ktime_get_coarse_real_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->stamp.ctime); } /** @@ -2508,21 +2487,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec64 to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context + * @stamp: timestamp to record * * Also sets the context as auditable. 
*/ -int auditsc_get_stamp(struct audit_context *ctx, - struct timespec64 *t, unsigned int *serial) +int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; + if (!ctx->stamp.serial) + ctx->stamp.serial = audit_serial(); + *stamp = ctx->stamp; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 17067dcb438610..eb3de35734f092 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -3,7 +3,7 @@ # BPF interpreter that, for example, classic socket filters depend on. config BPF bool - select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 # Used by archs to tell that they support BPF JIT compiler plus which # flavour. Only one of the two can be selected for a specific arch since diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a2466400..7fd0badfacb12f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 5b37753799d201..1074ac4459f2ca 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -633,3 +633,33 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + u64 user_vm_start; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + + /* Use main prog for stream access */ + prog = prog->aux->main_prog_aux->prog; + + user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); + addr += clear_lo32(user_vm_start); + + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? 
"WRITE" : "READ", addr); + bpf_stream_dump_stack(ss); + })); +} diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf976..80b1765a315969 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -431,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -439,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -783,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, @@ -800,6 +814,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 148da8f7ff3685..0687a760974a42 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; @@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static struct bpf_local_storage_data * diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 15a3eb9b02d94c..e54cce2b91754c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -62,8 +62,7 @@ 
void bpf_inode_storage_free(struct inode *inode) if (!bsb) return; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(bsb->storage); if (!local_storage) @@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode) bpf_local_storage_destroy(local_storage); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0cbcae7270790a..6ac35430c57344 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -705,13 +705,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock_trace(); } else { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } /* bpf program can only return 0 or 1: diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 2d6e1c98d8adc3..e7a2fc60523f6c 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -19,14 +19,6 @@ #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -static int get_next_cpu(int cpu) -{ - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_possible_mask); - return cpu; -} - /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { @@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); - steal = get_next_cpu(steal); + steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 687a3e9c76f527..a41e6730edcf33 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +u32 bpf_struct_ops_id(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + return st_map->map.id; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_id); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 1109475953c01f..a1dc1bf0848a52 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) @@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task) bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 64739308902f7a..0de8fc8a0e0b32 100644 --- a/kernel/bpf/btf.c +++ 
b/kernel/bpf/btf.c @@ -3478,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, return BTF_FIELD_FOUND; } -#define field_mask_test_name(field_type, field_type_str) \ - if (field_mask & field_type && !strcmp(name, field_type_str)) { \ - type = field_type; \ - goto end; \ - } - static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, - u32 field_mask, u32 *seen_mask, - int *align, int *sz) -{ - int type = 0; + u32 field_mask, u32 *seen_mask, int *align, int *sz) +{ + const struct { + enum btf_field_type type; + const char *const name; + const bool is_unique; + } field_types[] = { + { BPF_SPIN_LOCK, "bpf_spin_lock", true }, + { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, + { BPF_TIMER, "bpf_timer", true }, + { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, + { BPF_LIST_HEAD, "bpf_list_head", false }, + { BPF_LIST_NODE, "bpf_list_node", false }, + { BPF_RB_ROOT, "bpf_rb_root", false }, + { BPF_RB_NODE, "bpf_rb_node", false }, + { BPF_REFCOUNT, "bpf_refcount", false }, + }; + int type = 0, i; const char *name = __btf_name_by_offset(btf, var_type->name_off); - - if (field_mask & BPF_SPIN_LOCK) { - if (!strcmp(name, "bpf_spin_lock")) { - if (*seen_mask & BPF_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_SPIN_LOCK; - type = BPF_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_RES_SPIN_LOCK) { - if (!strcmp(name, "bpf_res_spin_lock")) { - if (*seen_mask & BPF_RES_SPIN_LOCK) - return -E2BIG; - *seen_mask |= BPF_RES_SPIN_LOCK; - type = BPF_RES_SPIN_LOCK; - goto end; - } - } - if (field_mask & BPF_TIMER) { - if (!strcmp(name, "bpf_timer")) { - if (*seen_mask & BPF_TIMER) - return -E2BIG; - *seen_mask |= BPF_TIMER; - type = BPF_TIMER; - goto end; - } - } - if (field_mask & BPF_WORKQUEUE) { - if (!strcmp(name, "bpf_wq")) { - if (*seen_mask & BPF_WORKQUEUE) + const char *field_type_name; + enum btf_field_type field_type; + bool is_unique; + + for (i = 0; i < ARRAY_SIZE(field_types); ++i) { + field_type = field_types[i].type; + field_type_name = field_types[i].name; + is_unique = field_types[i].is_unique; + if (!(field_mask & field_type) || strcmp(name, field_type_name)) + continue; + if (is_unique) { + if (*seen_mask & field_type) return -E2BIG; - *seen_mask |= BPF_WORKQUEUE; - type = BPF_WORKQUEUE; - goto end; + *seen_mask |= field_type; } + type = field_type; + goto end; } - field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); - field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); - field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); - field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); - field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { @@ -3545,8 +3530,6 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ return type; } -#undef field_mask_test_name - /* Repeat a number of fields for a specified number of times. * * Copy the fields starting from the first field and repeat them for @@ -3693,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? 
&info[0] : &tmp); if (ret < 0) @@ -3985,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4024,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -6762,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -7334,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) return t->size; return -EINVAL; } @@ -7343,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; - if (__btf_type_is_struct(t)) + if (btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; @@ -7384,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0 || __btf_type_is_struct(t)) { + if (ret < 0 || btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 180b630279b9cb..248f517d66d048 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,14 +27,15 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); /* * cgroup bpf destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup bpf - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. 
*/ static struct workqueue_struct *cgroup_bpf_destroy_wq; static int __init cgroup_bpf_wq_init(void) { - cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", + WQ_PERCPU, 1); if (!cgroup_bpf_destroy_wq) panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); return 0; @@ -71,8 +72,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, u32 func_ret; run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); array = rcu_dereference(cgrp->effective[atype]); item = &array->items[0]; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); @@ -88,8 +88,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, item++; } bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); return run_ctx.retval; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d04..d595fe512498cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -119,6 +121,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->main_prog_aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); @@ -293,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA1_DIGEST_WORDS]; - u32 ws[SHA1_WORKSPACE_WORDS]; - u32 i, bsize, psize, blocks; + size_t size = bpf_prog_insn_size(fp); struct bpf_insn *dst; bool was_ld_map; - u8 *raw, *todo; - __be32 *result; - __be64 *bits; + u32 i; - raw = vmalloc(raw_size); - if (!raw) + dst = vmalloc(size); + if (!dst) return -ENOMEM; - sha1_init_raw(digest); - memset(ws, 0, sizeof(ws)); - /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ - dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -334,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - - psize = bpf_prog_insn_size(fp); - memset(&raw[psize], 0, raw_size - psize); - raw[psize++] = 0x80; - - bsize = round_up(psize, SHA1_BLOCK_SIZE); - blocks = bsize / SHA1_BLOCK_SIZE; - todo = raw; - if (bsize - psize >= sizeof(__be64)) { - bits = (__be64 *)(todo + bsize - sizeof(__be64)); - } else { - bits = (__be64 *)(todo + bsize + bits_offset); - blocks++; - } - *bits = cpu_to_be64((psize - 1) << 3); - - while (blocks--) { - sha1_transform(digest, todo, ws); - todo += SHA1_BLOCK_SIZE; - } - - result = (__force __be32 *)digest; - for (i = 0; i < SHA1_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(digest[i]); - memcpy(fp->tag, result, sizeof(fp->tag)); - - vfree(raw); + sha256((u8 *)dst, size, fp->digest); + vfree(dst); return 0; } @@ -2366,8 +2334,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! 
*/ WARN_ON_ONCE(1); return 0; @@ -2394,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = @@ -2405,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + map->owner->expected_attach_type != fp->expected_attach_type) + ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; @@ -2468,8 +2440,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2451,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2479,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2488,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -3024,7 +2999,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. 
*/ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, @@ -3324,9 +3302,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) rcu_read_unlock(); if (!prog) return true; - if (bpf_is_subprog(prog)) - return true; - ctxp->prog = prog; + /* Make sure we return the main prog if we found a subprog */ + ctxp->prog = prog->aux->main_prog_aux->prog; return false; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a1c..703e5df1f4ef9d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); @@ -550,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); - queue_rcu_work(system_wq, &old_rcpu->free_work); + queue_rcu_work(system_percpu_wq, &old_rcpu->free_work); } } diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc32..83c4d9943084b9 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? 
__bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 482d284a155386..2625601de76e95 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_NOWAIT | __GFP_NOWARN, + GFP_NOWAIT, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 71f9931ac64cd4..c2fcd0cd51e51b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,7 +215,20 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) +{ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); +} + +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -227,12 +240,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); + htab_free_internal_structs(htab, elem); cond_resched(); } } @@ -1490,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1502,28 +1510,23 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(l, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(l, htab->map.key_size)); + htab_free_internal_structs(htab, l); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - 
htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68c9..c9fab9a356dfc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,9 @@ #include #include #include +#include +#include +#include #include "../../lib/kstrtox.h" @@ -774,11 +777,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; - preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -791,7 +792,6 @@ void bpf_put_buffers(void) if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) @@ -1084,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) +{ + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + *arr_idx = ((char *)value - array->value) / array->elem_size; + return arr_idx; + } + return (void *)value - round_up(map->key_size, 8); +} + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; @@ -1166,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. 
*/ @@ -1200,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work) if (!callback_fn) return; - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); @@ -1274,8 +1270,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1597,7 +1596,7 @@ void bpf_timer_cancel_and_free(void *val) * timer callback. */ if (this_cpu_read(hrtimer_running)) { - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); return; } @@ -1610,7 +1609,7 @@ void bpf_timer_cancel_and_free(void *val) if (hrtimer_try_to_cancel(&t->timer) >= 0) kfree_rcu(t, cb.rcu); else - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } @@ -1780,6 +1779,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1836,6 +1838,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1882,6 +1889,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2537,7 +2545,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; @@ -2710,6 +2718,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -3341,45 +3351,72 @@ __bpf_kfunc void __bpf_trap(void) * __get_kernel_nofault instead of plain dereference to make them safe. 
*/ -/** - * bpf_strcmp - Compare two strings - * @s1__ign: One string - * @s2__ign: Another string - * - * Return: - * * %0 - Strings are equal - * * %-1 - @s1__ign is smaller - * * %1 - @s2__ign is smaller - * * %-EFAULT - Cannot read one of the strings - * * %-E2BIG - One of strings is too large - * * %-ERANGE - One of strings is outside of kernel address space - */ -__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) { char c1, c2; int i; - if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || - !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - __get_kernel_nofault(&c1, s1__ign, char, err_out); - __get_kernel_nofault(&c2, s2__ign, char, err_out); + __get_kernel_nofault(&c1, s1, char, err_out); + __get_kernel_nofault(&c2, s2, char, err_out); + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; - s1__ign++; - s2__ign++; + s1++; + s2++; } return -E2BIG; err_out: return -EFAULT; } +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of the kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, false); +} + +/** + * bpf_strcasecmp - Compare two strings, ignoring the case of the characters + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of the kernel address space + */ +__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, true); +} + /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched @@ -3664,10 +3701,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; @@ -3702,9 +3746,490 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure.
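+ * + * A minimal usage sketch (informative only; "serial" stands for a key + * serial obtained elsewhere, e.g. from user space, and is not part of + * this API): + * + * struct bpf_key *bkey = bpf_lookup_user_key(serial, KEY_LOOKUP_CREATE); + * + * if (bkey) + * bpf_key_put(bkey);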
+ * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL to look up a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such a way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kernel image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_p* against the supplied *data_p* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error.
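+ * + * Informative sketch of a call site (the two dynptrs and the keyring are + * assumed to be set up by the caller, e.g. the dynptrs via + * bpf_dynptr_from_mem()): + * + * err = bpf_verify_pkcs7_signature(&data, &sig, trusted_keyring);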
+ */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_BPF_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ + +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +enum bpf_task_work_state { + /* bpf_task_work is ready to be used */ + BPF_TW_STANDBY = 0, + /* irq work scheduling in progress */ + BPF_TW_PENDING, + /* task work scheduling in progress */ + BPF_TW_SCHEDULING, + /* task work is scheduled successfully */ + BPF_TW_SCHEDULED, + /* callback is running */ + BPF_TW_RUNNING, + /* associated BPF map value is deleted */ + BPF_TW_FREED, +}; + +struct bpf_task_work_ctx { + enum bpf_task_work_state state; + refcount_t refcnt; + struct callback_head work; + struct irq_work irq_work; + /* bpf_prog that schedules task work */ + struct bpf_prog *prog; + /* task for which callback is scheduled */ + struct task_struct *task; + /* the map and map value associated with this context */ + struct bpf_map *map; + void *map_val; + enum task_work_notify_mode mode; + bpf_task_work_callback_t callback_fn; + struct rcu_head rcu; +} __aligned(8); + +/* Actual type for struct bpf_task_work */ +struct bpf_task_work_kern { + struct bpf_task_work_ctx *ctx; +}; + +static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) +{ + if (ctx->prog) { + bpf_prog_put(ctx->prog); + ctx->prog = NULL; + } + if (ctx->task) { + bpf_task_release(ctx->task); + ctx->task = NULL; + } +} + +static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) +{ + return refcount_inc_not_zero(&ctx->refcnt); +} + +static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) +{ + if (!refcount_dec_and_test(&ctx->refcnt)) + return; + + bpf_task_work_ctx_reset(ctx); + + /* bpf_mem_free expects migration to be disabled */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, ctx); + migrate_enable(); +} + +static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) +{ + /* + * Scheduled task_work callback holds ctx ref, so if we successfully + * cancelled, we put that ref on callback's behalf. If we couldn't + * cancel, callback will inevitably run or has already completed + * running, and it would have taken care of its ctx ref itself. 
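+ * + * (Informative: the ctx lifetime is refcounted; the map value holds one + * ref from allocation and each in-flight scheduling attempt holds one + * more, so exactly one party ends up dropping the callback's ref.)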
+ */ + if (task_work_cancel(ctx->task, &ctx->work)) + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_callback(struct callback_head *cb) +{ + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); + enum bpf_task_work_state state; + u32 idx; + void *key; + + /* Read lock is needed to protect ctx and map key/value access */ + guard(rcu_tasks_trace)(); + /* + * This callback may start running before bpf_task_work_irq() switched to + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); + if (state == BPF_TW_SCHEDULED) + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); + if (state == BPF_TW_FREED) { + bpf_task_work_ctx_put(ctx); + return; + } + + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); + + migrate_disable(); + ctx->callback_fn(ctx->map, key, ctx->map_val); + migrate_enable(); + + bpf_task_work_ctx_reset(ctx); + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); + + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_irq(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + enum bpf_task_work_state state; + int err; + + guard(rcu_tasks_trace)(); + + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { + bpf_task_work_ctx_put(ctx); + return; + } + + err = task_work_add(ctx->task, &ctx->work, ctx->mode); + if (err) { + bpf_task_work_ctx_reset(ctx); + /* + * try to switch back to STANDBY for another task_work reuse, but we might have + * gone to FREED already, which is fine as we already cleaned up after ourselves + */ + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); + bpf_task_work_ctx_put(ctx); + return; + } + + /* + * It's technically possible for the just scheduled task_work callback to + * complete by now, going SCHEDULING -> RUNNING and then + * dropping its ctx refcount. Instead of capturing an extra ref just to + * protect the ctx->state access below, we rely on RCU protection to + * perform the SCHEDULING -> SCHEDULED attempt below. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); + if (state == BPF_TW_FREED) + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ +} + +static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_kern *twk = (void *)tw; + struct bpf_task_work_ctx *ctx, *old_ctx; + + ctx = READ_ONCE(twk->ctx); + if (ctx) + return ctx; + + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + refcount_set(&ctx->refcnt, 1); /* map's own ref */ + ctx->state = BPF_TW_STANDBY; + + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); + if (old_ctx) { + /* + * tw->ctx is set by a concurrent BPF program, release allocated + * memory and try to reuse already set context.
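+ * + * (Informative: the cmpxchg() above both publishes the freshly + * allocated context and detects the race; only the winner observes + * old_ctx == NULL and keeps its allocation, every loser frees its + * copy here.)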
+ */ + bpf_mem_free(&bpf_global_ma, ctx); + return old_ctx; + } + + return ctx; /* Success */ +} + +static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_ctx *ctx; + + ctx = bpf_task_work_fetch_ctx(tw, map); + if (IS_ERR(ctx)) + return ctx; + + /* try to get ref for task_work callback to hold */ + if (!bpf_task_work_ctx_tryget(ctx)) + return ERR_PTR(-EBUSY); + + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ + bpf_task_work_ctx_put(ctx); + return ERR_PTR(-EBUSY); + } + + /* + * If no process or bpffs is holding a reference to the map, no new callbacks should be + * scheduled. This does not address any race or correctness issue, but rather is a policy + * choice: dropping user references should stop everything. + */ + if (!atomic64_read(&map->usercnt)) { + /* drop ref we just got for task_work callback itself */ + bpf_task_work_ctx_put(ctx); + /* transfer map's ref into cancel_and_free() */ + bpf_task_work_cancel_and_free(tw); + return ERR_PTR(-EBUSY); + } + + return ctx; +} + +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, + struct bpf_map *map, bpf_task_work_callback_t callback_fn, + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) +{ + struct bpf_prog *prog; + struct bpf_task_work_ctx *ctx; + int err; + + BTF_TYPE_EMIT(struct bpf_task_work); + + prog = bpf_prog_inc_not_zero(aux->prog); + if (IS_ERR(prog)) + return -EBADF; + task = bpf_task_acquire(task); + if (!task) { + err = -EBADF; + goto release_prog; + } + + ctx = bpf_task_work_acquire_ctx(tw, map); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto release_all; + } + + ctx->task = task; + ctx->callback_fn = callback_fn; + ctx->prog = prog; + ctx->mode = mode; + ctx->map = map; + ctx->map_val = (void *)tw - map->record->task_work_off; + init_task_work(&ctx->work, bpf_task_work_callback); + init_irq_work(&ctx->irq_work, bpf_task_work_irq); + + irq_work_queue(&ctx->irq_work); + return 0; + +release_all: + bpf_task_release(task); +release_prog: + bpf_prog_put(prog); + return err; +} + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct 
task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); +} __bpf_kfunc_end_defs(); +static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ +} + +void bpf_task_work_cancel_and_free(void *val) +{ + struct bpf_task_work_kern *twk = val; + struct bpf_task_work_ctx *ctx; + enum bpf_task_work_state state; + + ctx = xchg(&twk->ctx, NULL); + if (!ctx) + return; + + state = xchg(&ctx->state, BPF_TW_FREED); + if (state == BPF_TW_SCHEDULED) { + /* run in irq_work to avoid locks in NMI */ + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); + irq_work_queue(&ctx->irq_work); + return; + } + + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) @@ -3743,6 +4268,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3824,6 +4357,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); @@ -3838,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392ae..f90bdcc0a0476e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c new file mode 100644 index 00000000000000..3c611aba7f52c5 --- /dev/null +++ b/kernel/bpf/liveness.c @@ -0,0 
+1,733 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/hashtable.h> +#include <linux/jhash.h> + +/* + * This file implements live stack slots analysis. After accumulating + * stack usage data, the analysis answers queries about whether a + * particular stack slot may be read by an instruction or any of its + * successors. This data is consumed by the verifier states caching + * mechanism to decide which stack slots are important when looking for a + * visited state corresponding to the current state. + * + * The analysis is call chain sensitive, meaning that data is collected + * and queried for tuples (call chain, subprogram instruction index). + * Such sensitivity allows identifying if some subprogram call always + * leads to writes in the caller's stack. + * + * The basic idea is as follows: + * - As the verifier accumulates a set of visited states, the analysis instance + * accumulates a conservative estimate of stack slots that can be read + * or must be written for each visited tuple (call chain, instruction index). + * - If several states happen to visit the same instruction with the same + * call chain, stack usage information for the corresponding tuple is joined: + * - "may_read" set represents a union of all possibly read slots + * (any slot in "may_read" set might be read at or after the instruction); + * - "must_write" set represents an intersection of all possibly written slots + * (any slot in "must_write" set is guaranteed to be written by the instruction). + * - The analysis is split into two phases: + * - read and write marks accumulation; + * - read and write marks propagation. + * - The propagation phase is a textbook live variable data flow analysis: + * + * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_before = + * (state[cc, i].live_after / state[cc, i].must_write) U state[cc, i].may_read + * + * Where: + * - `U` stands for set union; + * - `/` stands for set difference; + * - `cc` stands for a call chain; + * - `i` and `s` are instruction indexes; + * + * The above equations are computed for each call chain and instruction + * index until state stops changing (a worked example is given near the + * end of this list). + * - Additionally, in order to transfer "must_write" information from a + * subprogram to call instructions invoking this subprogram, + * the "must_write_acc" set is tracked for each (cc, i) tuple: + * a set of stack slots that are guaranteed to be written by this + * instruction or any of its successors (within the subprogram). + * The equation for "must_write_acc" propagation looks as follows: + * + * state[cc, i].must_write_acc = + * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * U state[cc, i].must_write + * + * (An intersection of all "must_write_acc" for instruction successors + * plus all "must_write" slots for the instruction itself). + * - After the propagation phase completes for a subprogram, information from + * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain: + * - "must_write_acc" set is intersected with the call site's "must_write" set; + * - "may_read" set is added to the call site's "may_read" set. + * - Any live stack queries must be taken after the propagation phase.
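+ * - A small worked example of the above equations (informative): assume + * insn 5 has must_write = {fp-8} and its only successor is insn 6 with + * may_read = {fp-16}. Then live_before[cc, 6] includes fp-16, and + * live_before[cc, 5] = (live_before[cc, 6] / {fp-8}) U {} = {fp-16}: + * fp-16 stays live across insn 5, while fp-8 is killed by it.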
+ * - Accumulation and propagation phases can be entered multiple times, + * at any point in time: + * - "may_read" set only grows; + * - "must_write" set only shrinks; + * - for each visited verifier state with zero branches, all relevant + * read and write marks are already recorded by the analysis instance. + * + * Technically, the analysis is facilitated by the following data structures: + * - Call chain: for given verifier state, the call chain is a tuple of call + * instruction indexes leading to the current subprogram plus the subprogram + * entry point index. + * - Function instance: for a given call chain, for each instruction in + * the current subprogram, a mapping between instruction index and a + * set of "may_read", "must_write" and other marks accumulated for this + * instruction. + * - A hash table mapping call chains to function instances. + */ + +struct callchain { + u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ + /* cached subprog_info[*].start for functions owning the frames: + * - sp_starts[curframe] used to get insn relative index within current function; + * - sp_starts[0..current-1] used for fast callchain_frame_up(). + */ + u32 sp_starts[MAX_CALL_FRAMES]; + u32 curframe; /* depth of callsites and sp_starts arrays */ +}; + +struct per_frame_masks { + u64 may_read; /* stack slots that may be read by this instruction */ + u64 must_write; /* stack slots written by this instruction */ + u64 must_write_acc; /* stack slots written by this instruction and its successors */ + u64 live_before; /* stack slots that may be read by this insn and its successors */ +}; + +/* + * A function instance created for a specific callchain. + * Encapsulates read and write marks for each instruction in the function. + * Marks are tracked for each frame in the callchain. + */ +struct func_instance { + struct hlist_node hl_node; + struct callchain callchain; + u32 insn_cnt; /* cached number of insns in the function */ + bool updated; + bool must_write_dropped; + /* Per frame, per instruction masks, frames allocated lazily. */ + struct per_frame_masks *frames[MAX_CALL_FRAMES]; + /* For each instruction a flag telling if "must_write" had been initialized for it. */ + bool *must_write_set; +}; + +struct live_stack_query { + struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 curframe; + u32 insn_idx; +}; + +struct bpf_liveness { + DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ + /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ + struct func_instance *cur_instance; + /* + * Below fields are used to accumulate stack write marks for instruction at + * @write_insn_idx before submitting the marks to @cur_instance. 
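+ * + * (See bpf_reset_stack_write_marks(), bpf_mark_stack_write() and + * bpf_commit_stack_write_marks() below for this accumulate-then-submit + * protocol.)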
+ */ + u64 write_masks_acc[MAX_CALL_FRAMES]; + u32 write_insn_idx; +}; + +/* Compute callchain corresponding to state @st at depth @frameno */ +static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, + struct callchain *callchain, u32 frameno) +{ + struct bpf_subprog_info *subprog_info = env->subprog_info; + u32 i; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= frameno; i++) { + callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; + if (i < st->curframe) + callchain->callsites[i] = st->frame[i + 1]->callsite; + } + callchain->curframe = frameno; + callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; +} + +static u32 hash_callchain(struct callchain *callchain) +{ + return jhash2(callchain->callsites, callchain->curframe, 0); +} + +static bool same_callsites(struct callchain *a, struct callchain *b) +{ + int i; + + if (a->curframe != b->curframe) + return false; + for (i = a->curframe; i >= 0; i--) + if (a->callsites[i] != b->callsites[i]) + return false; + return true; +} + +/* + * Find existing or allocate new function instance corresponding to @callchain. + * Instances are accumulated in env->liveness->func_instances and persist + * until the end of the verification process. + */ +static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, + struct callchain *callchain) +{ + struct bpf_liveness *liveness = env->liveness; + struct bpf_subprog_info *subprog; + struct func_instance *result; + u32 subprog_sz, size, key; + + key = hash_callchain(callchain); + hash_for_each_possible(liveness->func_instances, result, hl_node, key) + if (same_callsites(&result->callchain, callchain)) + return result; + + subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); + subprog_sz = (subprog + 1)->start - subprog->start; + size = sizeof(struct func_instance); + result = kvzalloc(size, GFP_KERNEL_ACCOUNT); + if (!result) + return ERR_PTR(-ENOMEM); + result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), + GFP_KERNEL_ACCOUNT); + if (!result->must_write_set) + return ERR_PTR(-ENOMEM); + memcpy(&result->callchain, callchain, sizeof(*callchain)); + result->insn_cnt = subprog_sz; + hash_add(liveness->func_instances, &result->hl_node, key); + return result; +} + +static struct func_instance *lookup_instance(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + u32 frameno) +{ + struct callchain callchain; + + compute_callchain(env, st, &callchain, frameno); + return __lookup_instance(env, &callchain); +} + +int bpf_stack_liveness_init(struct bpf_verifier_env *env) +{ + env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT); + if (!env->liveness) + return -ENOMEM; + hash_init(env->liveness->func_instances); + return 0; +} + +void bpf_stack_liveness_free(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + struct hlist_node *tmp; + int bkt, i; + + if (!env->liveness) + return; + hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { + for (i = 0; i <= instance->callchain.curframe; i++) + kvfree(instance->frames[i]); + kvfree(instance->must_write_set); + kvfree(instance); + } + kvfree(env->liveness); +} + +/* + * Convert absolute instruction index @insn_idx to an index relative + * to start of the function corresponding to @instance. 
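+ * + * E.g. (informative) with sp_starts[curframe] == 100, absolute insn index + * 105 maps to relative index 5.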
+ */ +static int relative_idx(struct func_instance *instance, u32 insn_idx) +{ + return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; +} + +static struct per_frame_masks *get_frame_masks(struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + if (!instance->frames[frame]) + return NULL; + + return &instance->frames[frame][relative_idx(instance, insn_idx)]; +} + +static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, + struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + struct per_frame_masks *arr; + + if (!instance->frames[frame]) { + arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT); + instance->frames[frame] = arr; + if (!arr) + return ERR_PTR(-ENOMEM); + } + return get_frame_masks(instance, frame, insn_idx); +} + +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) +{ + env->liveness->cur_instance = NULL; +} + +/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */ +static int ensure_cur_instance(struct bpf_verifier_env *env) +{ + struct bpf_liveness *liveness = env->liveness; + struct func_instance *instance; + + if (liveness->cur_instance) + return 0; + + instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + liveness->cur_instance = instance; + return 0; +} + +/* Accumulate may_read masks for @frame at @insn_idx */ +static int mark_stack_read(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +{ + struct per_frame_masks *masks; + u64 new_may_read; + + masks = alloc_frame_masks(env, instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + new_may_read = masks->may_read | mask; + if (new_may_read != masks->may_read && + ((new_may_read | masks->live_before) != masks->live_before)) + instance->updated = true; + masks->may_read |= mask; + return 0; +} + +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) +{ + int err; + + err = ensure_cur_instance(env); + err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); + return err; +} + +static void reset_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int i; + + liveness->write_insn_idx = insn_idx; + for (i = 0; i <= instance->callchain.curframe; i++) + liveness->write_masks_acc[i] = 0; +} + +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int err; + + err = ensure_cur_instance(env); + if (err) + return err; + + reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + return 0; +} + +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) +{ + env->liveness->write_masks_acc[frame] |= mask; +} + +static int commit_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct bpf_liveness *liveness = env->liveness; + u32 idx, frame, curframe, old_must_write; + struct per_frame_masks *masks; + u64 mask; + + if (!instance) + return 0; + + curframe = instance->callchain.curframe; + idx = relative_idx(instance, liveness->write_insn_idx); + for (frame = 0; frame <= curframe; frame++) { + mask = liveness->write_masks_acc[frame]; + /* avoid allocating frames for zero masks */ + if (mask == 0 && !instance->must_write_set[idx]) + continue; + masks = 
alloc_frame_masks(env, instance, frame, liveness->write_insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + old_must_write = masks->must_write; + /* + * If the instruction at this callchain is seen for the first time, set must_write equal + * to @mask. Otherwise take the intersection with the previous value. + */ + if (instance->must_write_set[idx]) + mask &= old_must_write; + if (old_must_write != mask) { + masks->must_write = mask; + instance->updated = true; + } + if (old_must_write & ~mask) + instance->must_write_dropped = true; + } + instance->must_write_set[idx] = true; + liveness->write_insn_idx = 0; + return 0; +} + +/* + * Merge stack write marks in @env->liveness->write_masks_acc + * with information already in @env->liveness->cur_instance. + */ +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env) +{ + return commit_stack_write_marks(env, env->liveness->cur_instance); +} + +static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain) +{ + char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf); + char *buf = env->tmp_str_buf; + int i; + + buf += snprintf(buf, buf_end - buf, "("); + for (i = 0; i <= callchain->curframe; i++) + buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]); + snprintf(buf, buf_end - buf, ")"); + return env->tmp_str_buf; +} + +static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain, + char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new) +{ + u64 changed_bits = old ^ new; + u64 new_ones = new & changed_bits; + u64 new_zeros = ~new & changed_bits; + + if (!changed_bits) + return; + bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx); + if (new_ones) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones); + bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf); + } + if (new_zeros) { + bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros); + bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf); + } + bpf_log(&env->log, "\n"); +} + +int bpf_jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); + +inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + static const struct opcode_info { + bool can_jump; + bool can_fallthrough; + } opcode_info_tbl[256] = { + [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, + #define _J(code, ...)
\ + [BPF_JMP | code] = __VA_ARGS__, \ + [BPF_JMP32 | code] = __VA_ARGS__ + + _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), + _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), + _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), + #undef _J + }; + struct bpf_insn *insn = &prog->insnsi[idx]; + const struct opcode_info *opcode_info; + int i = 0, insn_sz; + + opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (opcode_info->can_fallthrough) + succ[i++] = idx + insn_sz; + + if (opcode_info->can_jump) + succ[i++] = idx + bpf_jmp_offset(insn) + 1; + + return i; +} + +__diag_pop(); + +static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain callchain = instance->callchain; + + /* Adjust @callchain to represent callchain one frame up */ + callchain.callsites[callchain.curframe] = 0; + callchain.sp_starts[callchain.curframe] = 0; + callchain.curframe--; + callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe]; + return __lookup_instance(env, &callchain); +} + +static u32 callchain_subprog_start(struct callchain *callchain) +{ + return callchain->sp_starts[callchain->curframe]; +} + +/* + * Transfer @may_read and @must_write_acc marks from the first instruction of @instance + * to the call instruction in the function instance calling @instance.
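+ * + * (Informally: slots the callee is guaranteed to write on every path + * become must_write marks of the call instruction itself, and anything + * the callee may read becomes a may_read mark of the call.)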
+ */ +static int propagate_to_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain *callchain = &instance->callchain; + u32 this_subprog_start, callsite, frame; + struct func_instance *outer_instance; + struct per_frame_masks *insn; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + outer_instance = get_outer_instance(env, instance); + callsite = callchain->callsites[callchain->curframe - 1]; + + reset_stack_write_marks(env, outer_instance, callsite); + for (frame = 0; frame < callchain->curframe; frame++) { + insn = get_frame_masks(instance, frame, this_subprog_start); + if (!insn) + continue; + bpf_mark_stack_write(env, frame, insn->must_write_acc); + err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); + if (err) + return err; + } + commit_stack_write_marks(env, outer_instance); + return 0; +} + +static inline bool update_insn(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + u64 new_before, new_after, must_write_acc; + struct per_frame_masks *insn, *succ_insn; + u32 succ_num, s, succ[2]; + bool changed; + + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); + if (unlikely(succ_num == 0)) + return false; + + changed = false; + insn = get_frame_masks(instance, frame, insn_idx); + new_before = 0; + new_after = 0; + /* + * New "must_write_acc" is an intersection of all "must_write_acc" + * of successors plus all "must_write" slots of the instruction itself. + */ + must_write_acc = U64_MAX; + for (s = 0; s < succ_num; ++s) { + succ_insn = get_frame_masks(instance, frame, succ[s]); + new_after |= succ_insn->live_before; + must_write_acc &= succ_insn->must_write_acc; + } + must_write_acc |= insn->must_write; + /* + * New "live_before" is a union of all "live_before" of successors + * minus slots written by the instruction plus slots read by the instruction. + */ + new_before = (new_after & ~insn->must_write) | insn->may_read; + changed |= new_before != insn->live_before; + changed |= must_write_acc != insn->must_write_acc; + if (unlikely(env->log.level & BPF_LOG_LEVEL2) && + (insn->may_read || insn->must_write || + insn_idx == callchain_subprog_start(&instance->callchain) || + aux[insn_idx].prune_point)) { + log_mask_change(env, &instance->callchain, "live", + frame, insn_idx, insn->live_before, new_before); + log_mask_change(env, &instance->callchain, "written", + frame, insn_idx, insn->must_write_acc, must_write_acc); + } + insn->live_before = new_before; + insn->must_write_acc = must_write_acc; + return changed; +} + +/* Fixed-point computation of @live_before and @must_write_acc marks */ +static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + u32 i, frame, po_start, po_end, cnt, this_subprog_start; + struct callchain *callchain = &instance->callchain; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_subprog_info *subprog; + struct per_frame_masks *insn; + bool changed; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + /* + * If must_write marks were updated, must_write_acc needs to be reset + * (to account for the case when new must_write sets became smaller).
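+ * + * (Informative: within one fixed-point run below, must_write_acc only + * grows from its starting value, so once a must_write bit is dropped + * the accumulated sets are cleared and recomputed from scratch.)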
+ */ + if (instance->must_write_dropped) { + for (frame = 0; frame <= callchain->curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = 0; i < instance->insn_cnt; i++) { + insn = get_frame_masks(instance, frame, this_subprog_start + i); + insn->must_write_acc = 0; + } + } + } + + subprog = bpf_find_containing_subprog(env, this_subprog_start); + po_start = subprog->postorder_start; + po_end = (subprog + 1)->postorder_start; + cnt = 0; + /* repeat until fixed point is reached */ + do { + cnt++; + changed = false; + for (frame = 0; frame <= instance->callchain.curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = po_start; i < po_end; i++) + changed |= update_insn(env, instance, frame, insn_postorder[i]); + } + } while (changed); + + if (env->log.level & BPF_LOG_LEVEL2) + bpf_log(&env->log, "%s live stack update done in %d iterations\n", + fmt_callchain(env, callchain), cnt); + + /* transfer marks accumulated for outer frames to outer func instance (caller) */ + if (callchain->curframe > 0) { + err = propagate_to_outer_instance(env, instance); + if (err) + return err; + } + + return 0; +} + +/* + * Prepare all callchains within @env->cur_state for querying. + * This function should be called after each verifier.c:pop_stack() + * and whenever verifier.c:do_check_insn() processes a subprogram exit. + * This guarantees that visited verifier states with zero branches + * have their bpf_mark_stack_{read,write}() effects propagated in + * @env->liveness. + */ +int bpf_update_live_stack(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + int err, frame; + + bpf_reset_live_stack_callchain(env); + for (frame = env->cur_state->curframe; frame >= 0; --frame) { + instance = lookup_instance(env, env->cur_state, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + if (instance->updated) { + err = update_instance(env, instance); + if (err) + return err; + instance->updated = false; + instance->must_write_dropped = false; + } + } + return 0; +} + +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +{ + struct per_frame_masks *masks; + + masks = get_frame_masks(instance, frameno, insn_idx); + return masks && (masks->live_before & BIT(spi)); +} + +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance; + u32 frame; + + memset(q, 0, sizeof(*q)); + for (frame = 0; frame <= st->curframe; frame++) { + instance = lookup_instance(env, st, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + q->instances[frame] = instance; + } + q->curframe = st->curframe; + q->insn_idx = st->insn_idx; + return 0; +} + +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +{ + /* + * Slot is alive if it is read before q->insn_idx in the current func instance, + * or if for some outer func instance: + * - alive before callsite if callsite calls callback, otherwise + * - alive after callsite + */ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance, *curframe_instance; + u32 i, callsite; + bool alive; + + curframe_instance = q->instances[q->curframe]; + if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + return true; + + for (i = frameno; i < q->curframe; i++) { + callsite = curframe_instance->callchain.callsites[i]; + instance = q->instances[i]; + alive = bpf_calls_callback(env,
callsite) + ? is_live_before(instance, callsite, frameno, spi) + : is_live_before(instance, callsite + 1, frameno, spi); + if (alive) + return true; + } + + return false; +} diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 632d51b05fe983..c93a756e035c02 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT, map->numa_node); if (!new) return -ENOMEM; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 38050f4ee40030..f50533169cc34e 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: @@ -540,19 +542,6 @@ static char slot_type_char[] = { [STACK_IRQ_FLAG] = 'f' }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); -} - #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN @@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); - print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } @@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie break; types_buf[j] = '\0'; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: @@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); @@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!reg->ref_obj_id) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); @@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie case STACK_MISC: case STACK_ZERO: default: - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 889374722d0aa9..bd45dda9dc354c 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) /* 
Defer barriers into worker to let the rest of map memory to be freed */ memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); - queue_work(system_unbound_wq, ©->work); + queue_work(system_dfl_wq, ©->work); } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d8295..a00561b1d3e515 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa98..4d53cdd1374cf7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { @@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall */ -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return bpf_stackmap_extract(map, key, value, true); +} + +/* Called from syscall */ +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; @@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); - old_bucket = xchg(&smap->buckets[id], bucket); + if (delete) + old_bucket = bucket; + else + old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; @@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, + .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fbfa8532c392c..a48fa86f82a7fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -318,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if 
(map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); + err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -672,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -725,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -783,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -807,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -860,6 +874,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator. */ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -905,7 +920,7 @@ static void bpf_map_free_in_work(struct bpf_map *map) /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. 
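The BPF_TASK_WORK cases added above (btf_record_free(), btf_record_dup(), bpf_obj_free_task_work() and the bpf_obj_free_fields() switch) exist so a map value can embed a struct bpf_task_work field. A minimal BPF-side sketch of what such a map and callback could look like; the kfunc prototype and argument order here are assumptions inferred from the verifier plumbing later in this patch (callback_fn(map, key, value), task-work field in R2, map in R3), not something these hunks spell out:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
	int counter;
	struct bpf_task_work tw;	/* picked up by btf_parse_fields() as BPF_TASK_WORK */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, int);
	__type(value, struct elem);
} twmap SEC(".maps");

/* assumed prototype; the exact kfunc signature is not shown in these hunks */
extern int bpf_task_work_schedule_resume(struct task_struct *task,
					 struct bpf_task_work *tw,
					 void *map, void *callback) __ksym __weak;

/* matches set_task_work_schedule_callback_state(): callback_fn(map, key, value) */
static int tw_cb(struct bpf_map *map, int *key, struct elem *val)
{
	val->counter++;			/* runs later, in the target task's context */
	return 0;
}

SEC("tp_btf/sched_switch")
int schedule_tw(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	int key = 0;
	struct elem *val = bpf_map_lookup_elem(&twmap, &key);

	if (val)
		bpf_task_work_schedule_resume(task, &val->tw, &twmap, tw_cb);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";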
*/ - queue_work(system_unbound_wq, &map->work); + queue_work(system_dfl_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) @@ -1237,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1269,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -1338,9 +1355,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1534,7 +1551,29 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + return -EINVAL; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1627,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); -int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { return -ENOTSUPP; } @@ -2158,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); @@ -2761,8 +2802,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + 
attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2926,6 +3003,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; @@ -5161,6 +5244,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5185,6 +5271,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; @@ -6008,7 +6113,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index fa353c5d550fc9..f8e70e9c3998d4 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -116,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* Generate partial products by multiplying each bit in the multiplier (tnum a) - * with the multiplicand (tnum b), and add the partial products after - * appropriately bit-shifting them. Instead of directly performing tnum addition - * on the generated partial products, equivalenty, decompose each partial - * product into two tnums, consisting of the value-sum (acc_v) and the - * mask-sum (acc_m) and then perform tnum addition on them. The following paper - * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. +/* Perform long multiplication, iterating through the bits in a using rshift: + * - if LSB(a) is a known 0, keep current accumulator + * - if LSB(a) is a known 1, add b to current accumulator + * - if LSB(a) is unknown, take a union of the above cases. + * + * For example: + * + * acc_0: acc_1: + * + * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 + * x1 01 11 + * ------ ------ ------ + * 11 11 11 + * xx 00 11 + * ------ ------ ------ + * ???? 
0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { - u64 acc_v = a.value * b.value; - struct tnum acc_m = TNUM(0, 0); + struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ - else if (a.mask & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + else if (a.mask & 1) { + /* acc = tnum_union(acc_0, acc_1), where acc_0 and + * acc_1 are partial accumulators for cases + * LSB(a) = certain 0 and LSB(a) = certain 1. + * acc_0 = acc + 0 * b = acc. + * acc_1 = acc + 1 * b = tnum_add(acc, b). + */ + + acc = tnum_union(acc, tnum_add(acc, b)); + } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } - return tnum_add(TNUM(acc_v, 0), acc_m); + return acc; +} + +bool tnum_overlap(struct tnum a, struct tnum b) +{ + u64 mu; + + mu = ~a.mask & ~b.mask; + return (a.value & mu) == (b.value & mu); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has @@ -155,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } +/* Returns a tnum with the uncertainty from both a and b, and in addition, new + * uncertainty at any position that a and b disagree. This represents a + * superset of the union of the concrete sets of both a and b. Despite the + * overapproximation, it is optimal. + */ +struct tnum tnum_union(struct tnum a, struct tnum b) +{ + u64 v = a.value & b.value; + u64 mu = (a.value ^ b.value) | a.mask | b.mask; + + return TNUM(v & ~mu, mu); +} + struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 0e364614c3a291..5949095e51c3d0 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -899,8 +899,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -949,8 +948,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, @@ -960,8 +958,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. 
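The rewritten tnum_mul() above is plain shift-and-add long multiplication, with the new tnum_union() absorbing the uncertainty of an unknown multiplier bit. A self-contained userspace sketch that replays the worked example from the comment; tnum_add() is transcribed from the kernel's existing tnum.c (not shown in this hunk), the rest comes straight from the hunks above:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct tnum { uint64_t value, mask; };
#define TNUM(v, m) ((struct tnum){ .value = (v), .mask = (m) })

static struct tnum tnum_lshift(struct tnum a, uint8_t s) { return TNUM(a.value << s, a.mask << s); }
static struct tnum tnum_rshift(struct tnum a, uint8_t s) { return TNUM(a.value >> s, a.mask >> s); }

/* same addition as the kernel's tnum_add() */
static struct tnum tnum_add(struct tnum a, struct tnum b)
{
	uint64_t sm = a.mask + b.mask;
	uint64_t sv = a.value + b.value;
	uint64_t sigma = sm + sv;
	uint64_t chi = sigma ^ sv;
	uint64_t mu = chi | a.mask | b.mask;

	return TNUM(sv & ~mu, mu);
}

static struct tnum tnum_union(struct tnum a, struct tnum b)
{
	uint64_t v = a.value & b.value;
	uint64_t mu = (a.value ^ b.value) | a.mask | b.mask;

	return TNUM(v & ~mu, mu);
}

static bool tnum_overlap(struct tnum a, struct tnum b)
{
	uint64_t mu = ~a.mask & ~b.mask;

	return (a.value & mu) == (b.value & mu);
}

static struct tnum tnum_mul(struct tnum a, struct tnum b)
{
	struct tnum acc = TNUM(0, 0);

	while (a.value || a.mask) {
		if (a.value & 1)			/* LSB is a known 1 */
			acc = tnum_add(acc, b);
		else if (a.mask & 1)			/* LSB is unknown */
			acc = tnum_union(acc, tnum_add(acc, b));
		a = tnum_rshift(a, 1);
		b = tnum_lshift(b, 1);
	}
	return acc;
}

int main(void)
{
	/* x1 (value 0b01, mask 0b10) * 11 (value 0b11) */
	struct tnum r = tnum_mul(TNUM(0x1, 0x2), TNUM(0x3, 0x0));

	/* prints value=0x1 mask=0xa, i.e. x0x1, as in the comment */
	printf("value=%#llx mask=%#llx\n",
	       (unsigned long long)r.value, (unsigned long long)r.mask);
	/* known bits 0b10 vs 0b01 disagree, so no overlap: prints 0 */
	printf("overlap=%d\n", tnum_overlap(TNUM(2, 0), TNUM(1, 0)));
	return 0;
}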
*/ - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -974,8 +971,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, { bpf_reset_run_ctx(run_ctx->saved_run_ctx); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, @@ -1033,8 +1029,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { - rcu_read_lock(); - migrate_disable(); + rcu_read_lock_dont_migrate(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); @@ -1048,8 +1043,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af690..73bba397672a9f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; default: return 0; } @@ -783,8 +787,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -801,29 +804,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. 
- */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -932,9 +913,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; } @@ -1052,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1062,6 +1040,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_ITER; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1087,12 +1066,10 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; - for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; + bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1182,9 +1159,9 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, slot = &state->stack[spi]; st = &slot->spilled_ptr; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; @@ -1238,8 +1215,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; + bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; @@ -1754,6 +1730,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; + dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; @@ -1946,9 +1923,24 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat return 0; visit = scc_visit_lookup(env, callchain); if (!visit) { - verifier_bug(env, "scc exit: no visit info for call chain %s", - format_callchain(env, callchain)); - return -EFAULT; + /* + * If path traversal stops inside an SCC, corresponding bpf_scc_visit + * must exist for non-speculative paths. For non-speculative paths + * traversal stops when: + * a. Verification error is found, maybe_exit_scc() is not called. + * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member + * of any SCC. + * c. A checkpoint is reached and matched. Checkpoints are created by + * is_state_visited(), which calls maybe_enter_scc(), which allocates + * bpf_scc_visit instances for checkpoints within SCCs. 
+ * (c) is the only case that can reach this point. + */ + if (!st->speculative) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + return 0; } if (visit->entry_state != st) return 0; @@ -2017,7 +2009,7 @@ static void free_backedges(struct bpf_scc_visit *visit) for (backedge = visit->backedges; backedge; backedge = next) { free_verifier_state(&backedge->state, false); next = backedge->next; - kvfree(backedge); + kfree(backedge); } visit->backedges = NULL; } @@ -2232,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -2274,7 +2266,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ @@ -2873,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -2958,7 +2949,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -2983,7 +2974,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -3494,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } -static int jmp_offset(struct bpf_insn *insn) -{ - u8 code = insn->code; - - if (code == (BPF_JMP32 | BPF_JA)) - return insn->imm; - return insn->off; -} - static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3529,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + jmp_offset(&insn[i]) + 1; + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3555,69 +3537,15 @@ static int check_subprogs(struct bpf_verifier_env *env) return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. 
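bpf_find_containing_subprog(), now exported above in place of the static find_containing_subprog(), is a greatest-lower-bound binary search over subprog_info entries sorted by start instruction (the hunk only shows its declaration and the 'int l, r, m;' locals). A hypothetical userspace model of that lookup, with names simplified for illustration:

#include <stdio.h>

struct subprog_info { int start; };

static struct subprog_info *find_containing(struct subprog_info *vals,
					    int cnt, int off)
{
	int l = 0, r = cnt - 1, m;

	if (!cnt || off < vals[0].start)
		return NULL;
	while (l < r) {
		m = l + (r - l + 1) / 2;	/* bias upward so the loop terminates */
		if (vals[m].start <= off)
			l = m;			/* candidate: starts at or before off */
		else
			r = m - 1;
	}
	return &vals[l];
}

int main(void)
{
	struct subprog_info subs[] = { {0}, {10}, {25} };

	/* insn 12 lives in the subprogram starting at insn 10 */
	printf("start=%d\n", find_containing(subs, 3, 12)->start);
	return 0;
}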
- */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) -{ - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; - - while (parent) { - /* if read wasn't screened by an earlier write ... */ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, - "type %s var_off %lld off %d", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off)) - return -EFAULT; - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; -} - static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); int err, i; for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); if (err) return err; - mark_stack_slot_scratched(env, spi - i); } return 0; @@ -3663,7 +3591,7 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. */ -static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, +static bool is_reg64(struct bpf_insn *insn, u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) { u8 code, class, op; @@ -3774,14 +3702,14 @@ static int insn_def_regno(const struct bpf_insn *insn) } /* Return TRUE if INSN has defined any 32-bit value explicitly. */ -static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +static bool insn_has_def32(struct bpf_insn *insn) { int dst_reg = insn_def_regno(insn); if (dst_reg == -1) return false; - return !is_reg64(env, insn, dst_reg, NULL, DST_OP); + return !is_reg64(insn, dst_reg, NULL, DST_OP); } static void mark_insn_zext(struct bpf_verifier_env *env, @@ -3812,7 +3740,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r mark_reg_scratched(env, regno); reg = &regs[regno]; - rw64 = is_reg64(env, insn, regno, reg, t); + rw64 = is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -3826,15 +3754,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? 
REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -4195,7 +4121,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; @@ -4250,8 +4176,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo } } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -4278,7 +4202,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); verbose_insn(env, insn); @@ -4479,7 +4403,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * backtracking, as these registers are set by the function * invoking callback. 
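bpf_fmt_stack_mask(), renamed above from fmt_stack_mask(), follows the convention that bit i of the mask stands for the stack slot at frame offset -(i + 1) * 8, hence the "-8,-24,-40" for the 0x15 mask in its comment. A minimal userspace sketch of just that convention (the kernel version walks a DECLARE_BITMAP() instead):

#include <stdio.h>
#include <stdint.h>

/* bit i of the mask denotes the slot at offset -(i + 1) * 8 */
static void fmt_stack_mask(char *buf, size_t sz, uint64_t mask)
{
	int n = 0, i;

	buf[0] = '\0';
	for (i = 0; i < 64; i++)
		if (mask & (1ULL << i))
			n += snprintf(buf + n, sz - n, "%s%d",
				      n ? "," : "", -(i + 1) * 8);
}

int main(void)
{
	char buf[64];

	fmt_stack_mask(buf, sizeof(buf), 0x15);
	printf("%s\n", buf);	/* -8,-24,-40: bits 0, 2 and 4 */
	return 0;
}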
*/ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { @@ -4918,7 +4842,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); @@ -5041,12 +4965,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -5057,8 +4976,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -5152,6 +5069,18 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; + if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) { + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + bpf_mark_stack_write(env, state->frameno, BIT(spi)); + } + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { @@ -5195,17 +5124,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. 
- * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { @@ -5398,7 +5316,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5421,12 +5338,16 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int err; stype = reg_state->stack[spi].slot_type; reg = &reg_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); + err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; if (is_spilled_reg(&reg_state->stack[spi])) { u8 spill_size = 1; @@ -5441,7 +5362,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5495,7 +5415,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); @@ -5503,7 +5422,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5515,7 +5433,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5529,7 +5446,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ @@ -8157,10 +8073,10 @@ static int check_stack_range_initialized( /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); + err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); + if (err) + return err; + /* We do not call bpf_mark_stack_write(), as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. 
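The write-marking heuristic that the two hunks above relocate into check_stack_write_fixed_off() is easiest to see on a hand-written instruction sequence (illustrative only, not taken from the patch; spi here follows the usual slot numbering where offset -8 is spi 0 and offset -16 is spi 1):

/*
 *   *(u32 *)(r10 - 8) = r1    ; partial 4-byte store: slot -8 becomes
 *                             ; STACK_MISC but is NOT marked written, so
 *                             ; read marks for it keep propagating into
 *                             ; parent states
 *   *(u64 *)(r10 - 16) = r1   ; aligned 8-byte store: bpf_mark_stack_write()
 *                             ; records BIT(1) for slot -16, screening off
 *                             ; older read marks
 *   r2 = *(u64 *)(r10 - 8)    ; conservative read: the whole 8-byte slot is
 *                             ; marked read even though only 4 bytes were
 *                             ; ever written
 */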
@@ -8515,38 +8431,70 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) +/* Check if @regno is a pointer to a specific field in a map value */ +static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, + enum btf_field_type field_type) { struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + const char *struct_name = btf_field_type_name(field_type); + int field_off = -1; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s has to be at the constant offset\n", + regno, struct_name); return -EINVAL; } if (!map->btf) { - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", - map->name); + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, + struct_name); + return -EINVAL; + } + if (!btf_record_has_field(map->record, field_type)) { + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); return -EINVAL; } - if (!btf_record_has_field(map->record, BPF_TIMER)) { - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); + switch (field_type) { + case BPF_TIMER: + field_off = map->record->timer_off; + break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; + default: + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (map->record->timer_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->record->timer_off); + if (field_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", + val + reg->off, struct_name, field_off); return -EINVAL; } + return 0; +} + +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TIMER); + if (err) + return err; + if (meta->map_ptr) { verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; } @@ -8569,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10398,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, 
struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10616,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10717,6 +10688,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; + bpf_reset_live_stack_callchain(env); + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, state, caller->frameno, true); @@ -10842,7 +10815,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = retval_range(0, 1); + callee->callback_ret_range = retval_range(0, 0); return 0; } @@ -10929,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -10992,8 +10995,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; @@ -11003,7 +11005,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { + if (!bpf_calls_callback(env, callee->callsite)) { verifier_bug(env, "in callback at %d, callsite %d !calls_callback", *insn_idx, callee->callsite); return -EFAULT; @@ -11354,7 +11356,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 
0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -11641,7 +11643,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -11896,17 +11899,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } @@ -12059,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12069,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12117,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12204,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12228,6 +12233,8 @@ enum special_kfunc_type { KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, + KF_bpf_xdp_pull_data, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12252,6 +12259,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12277,9 +12286,13 @@ BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) +BTF_ID(func, bpf_xdp_pull_data) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -12318,6 +12331,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12349,6 +12370,11 @@ 
static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; } +static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -12408,6 +12434,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12751,7 +12780,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13132,7 +13162,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13147,6 +13178,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13185,6 +13222,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13253,6 +13291,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; @@ -13476,6 +13516,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13842,6 +13891,16 @@ static int 
check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -13901,6 +13960,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) { + verbose(env, "kernel func %s requires RCU critical section protection\n", func_name); + return -EACCES; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -14014,6 +14078,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Ensures we don't access the memory after a release_reference() */ if (meta.ref_obj_id) regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + + if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; } else { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].btf = desc_btf; @@ -14022,6 +14089,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) regs[BPF_REG_0].type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta)) + regs[BPF_REG_0].type |= MEM_RCU; if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; @@ -14066,6 +14135,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_kfunc_pkt_changing(&meta)) + clear_all_pkt_pointers(env); + nargs = btf_type_vlen(meta.func_proto); args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { @@ -15645,7 +15717,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15664,7 +15735,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!no_sext) dst_reg->id = 0; coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15690,7 +15760,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ if (!is_src_reg_u32) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15701,7 +15770,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -15886,6 +15954,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15910,6 +15980,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return 
t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; @@ -17117,9 +17189,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char } if (frame->in_async_callback_fn) { - /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; - range = retval_range(0, 0); + range = frame->callback_ret_range; goto enforce_retval; } @@ -17258,7 +17329,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } @@ -17266,7 +17337,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } @@ -17280,8 +17351,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); + caller = bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } @@ -17351,7 +17422,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].calls_callback = true; } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } @@ -17783,6 +17854,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ if (ret == 0 && is_kfunc_sleepable(&meta)) mark_subprog_might_sleep(env, t); + if (ret == 0 && is_kfunc_pkt_changing(&meta)) + mark_subprog_changes_pkt_data(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17825,7 +17898,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state, *insn_postorder; + int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -17838,14 +17911,6 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = - kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - if (!insn_postorder) { - kvfree(insn_state); - kvfree(insn_stack); - return -ENOMEM; - } - ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; @@ -17863,7 +17928,6 @@ static int check_cfg(struct bpf_verifier_env *env) case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; - insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17917,6 +17981,56 @@ static int check_cfg(struct bpf_verifier_env *env) return ret; } +/* + * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. 
+ */ +static int compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + int *stack = NULL, *postorder = NULL, *state = NULL; + + postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ_cnt = bpf_insn_successors(env->prog, top, succ); + for (s = 0; s < succ_cnt; ++s) { + if (!state[succ[s]]) { + stack[stack_sz++] = succ[s]; + state[succ[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + static int check_abnormal_return(struct bpf_verifier_env *env) { int i; @@ -18449,16 +18563,15 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) } static void clean_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *st) + struct bpf_func_state *st, + u32 ip) { - enum bpf_reg_liveness live; + u16 live_regs = env->insn_aux_data[ip].live_regs_before; int i, j; for (i = 0; i < BPF_REG_FP; i++) { - live = st->regs[i].live; /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) + if (!(live_regs & BIT(i))) /* since the register is unused, clear its state * to make further comparison simpler */ @@ -18466,10 +18579,7 @@ static void clean_func_state(struct bpf_verifier_env *env, } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { - live = st->stack[i].spilled_ptr.live; - /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; - if (!(live & REG_LIVE_READ)) { + if (!bpf_stack_slot_alive(env, st->frameno, i)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18480,10 +18590,14 @@ static void clean_func_state(struct bpf_verifier_env *env, static void clean_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - int i; + int i, ip; - for (i = 0; i <= st->curframe; i++) - clean_func_state(env, st->frame[i]); + bpf_live_stack_query_init(env, st); + st->cleaned = true; + for (i = 0; i <= st->curframe; i++) { + ip = frame_insn_idx(st, i); + clean_func_state(env, st->frame[i], ip); + } } /* the parentage chains form a tree. @@ -18494,25 +18608,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. * Example: - * 1: r0 = 1 + * 1: *(u64)(r10 - 8) = 1 * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. 
At the insn 4 it will walk the - parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * 3: *(u64)(r10 - 8) = 2 + * 4: r0 = *(u64)(r10 - 8) + * 5: exit + * when the verifier reaches exit insn the stack slot -8 in the state list of + * insn 2 is not yet marked alive. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. After the insn 4 read, liveness + * analysis would propagate read mark for -8 at insn 2. * * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. - * This function also clears the registers and stack for states that !READ - * to simplify state merging. + * we can call this clean_live_states() function to clear the dead registers and stack + * slots to simplify state merging. * * Important note here is that walking the same branch instruction in the callee * doesn't mean that the states are DONE. The verifier has to compare @@ -18532,7 +18644,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + if (sl->state.cleaned) /* all regs in this state in all frames were already marked */ continue; if (incomplete_read_marks(env, &sl->state)) @@ -18564,9 +18676,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) - /* explored state didn't use this */ - return true; if (rold->type == NOT_INIT) { if (exact == NOT_EXACT || rcur->type == NOT_INIT) /* explored state can't have used this */ @@ -18690,7 +18799,6 @@ static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; return 0; } late_initcall(unbound_reg_init); @@ -18743,13 +18851,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) - && exact == NOT_EXACT) { - i += BPF_REG_SIZE - 1; - /* explored state didn't use this */ - continue; - } - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; @@ -18992,91 +19093,6 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; - - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
- */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); - if (err) - return err; - - return flag; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent, - bool *changed) -{ - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; - bool tmp = false; - - changed = changed ?: &tmp; - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - /* Propagate read liveness of registers... */ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - *changed |= err > 0; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. 
*/ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - *changed |= err > 0; - if (err < 0) - return err; - } - } - return 0; -} - /* find precise scalars in the previous equivalent state and * propagate them into the current state */ @@ -19096,8 +19112,7 @@ static int propagate_precision(struct bpf_verifier_env *env, first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19114,8 +19129,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19165,9 +19179,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi changed = false; for (backedge = visit->backedges; backedge; backedge = backedge->next) { st = &backedge->state; - err = propagate_liveness(env, st->equal_state, st, &changed); - if (err) - return err; err = propagate_precision(env, st->equal_state, st, &changed); if (err) return err; @@ -19191,7 +19202,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) + offsetof(struct bpf_reg_state, frameno))) return false; return true; } @@ -19289,7 +19300,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; bool force_new_state, add_new_state, loop; - int i, j, n, err, states_cnt = 0; + int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19404,7 +19415,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto hit; } } - if (calls_callback(env, insn_idx)) { + if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; @@ -19447,25 +19458,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. - * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ + err = 0; if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = push_jmp_history(env, cur, 0, 0); err = err ? 
: propagate_precision(env, &sl->state, cur, NULL); if (err) return err; @@ -19553,7 +19554,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) err = err ?: add_scc_backedge(env, &sl->state, backedge); if (err) { free_verifier_state(&backedge->state, false); - kvfree(backedge); + kfree(backedge); return err; } } @@ -19636,7 +19637,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) err = maybe_enter_scc(env, new); if (err) { free_verifier_state(new, false); - kvfree(new_sl); + kfree(new_sl); return err; } @@ -19645,38 +19646,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); list_add(&new_sl->node, head); - - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; - } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; - } - } return 0; } @@ -19812,6 +19781,9 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { + err = bpf_update_live_stack(env); + if (err) + return err; /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) @@ -19997,7 +19969,7 @@ static int do_check(struct bpf_verifier_env *env) for (;;) { struct bpf_insn *insn; struct bpf_insn_aux_data *insn_aux; - int err; + int err, marks_err; /* reset current history entry on each new instruction */ env->cur_hist_ent = NULL; @@ -20090,7 +20062,15 @@ static int do_check(struct bpf_verifier_env *env) if (state->speculative && insn_aux->nospec) goto process_bpf_exit; + err = bpf_reset_stack_write_marks(env, env->insn_idx); + if (err) + return err; err = do_check_insn(env, &do_print_state); + if (err >= 0 || error_recoverable_with_nospec(err)) { + marks_err = bpf_commit_stack_write_marks(env); + if (marks_err) + return marks_err; + } if (error_recoverable_with_nospec(err) && state->speculative) { /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. 
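The reset/commit pair bracketing do_check_insn() above stages stack write marks per instruction and folds them into the analysis only when the instruction's effects are kept (success, or a speculative error recoverable with a nospec barrier). A standalone sketch of that stage-then-commit shape, with all names hypothetical::

    #include <stdbool.h>
    #include <stdint.h>

    struct write_marks {
    	uint64_t pending;    /* slots written while simulating one insn */
    	uint64_t committed;  /* slots known to be written on this path */
    };

    static void reset_marks(struct write_marks *m)
    {
    	m->pending = 0;
    }

    static void record_write(struct write_marks *m, int slot)
    {
    	m->pending |= 1ULL << slot;
    }

    /* Fold staged marks in only if the simulated insn is kept. */
    static void commit_marks(struct write_marks *m, bool keep)
    {
    	if (keep)
    		m->committed |= m->pending;
    	m->pending = 0;
    }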
@@ -20129,6 +20109,9 @@ static int do_check(struct bpf_verifier_env *env) process_bpf_exit: mark_verifier_state_scratched(env); err = update_branch_counts(env, env->cur_state); + if (err) + return err; + err = bpf_update_live_stack(env); if (err) return err; err = pop_stack(env, &prev_insn_idx, &env->insn_idx, @@ -20193,8 +20176,11 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) if (env->used_btfs[i].btf == btf) return i; - if (env->used_btf_cnt >= MAX_USED_BTFS) + if (env->used_btf_cnt >= MAX_USED_BTFS) { + verbose(env, "The total number of btfs per program has reached the limit of %u\n", + MAX_USED_BTFS); return -E2BIG; + } btf_get(btf); @@ -20360,6 +20346,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { @@ -20699,12 +20691,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * [0, off) and [off, end) to new locations, so the patched range stays zero */ static void adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_insn_aux_data *new_data, struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; - u32 old_seen = old_data[off].seen; + u32 old_seen = data[off].seen; u32 prog_len; int i; @@ -20712,22 +20703,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the * original insn at old prog. */ - old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); if (cnt == 1) return; prog_len = new_prog->len; - memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); - memcpy(new_data + off + cnt - 1, old_data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memmove(data + off + cnt - 1, data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. 
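The reworked adjust_insn_aux_data() in the hunk above opens the gap in place instead of copying into a freshly allocated array, which is safe because bpf_patch_insn_data() (next hunk) now grows the buffer with vrealloc() before patching. A userspace analogue of the open-a-gap idiom (editorial, hypothetical helper)::

    #include <string.h>

    /* Open a zeroed gap of 'gap' slots at index 'off'; 'buf' must
     * already have capacity for 'len + gap' elements.
     */
    static void open_gap(int *buf, size_t len, size_t off, size_t gap)
    {
    	memmove(buf + off + gap, buf + off, (len - off) * sizeof(*buf));
    	memset(buf + off, 0, gap * sizeof(*buf));
    }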
*/ - new_data[i].seen = old_seen; - new_data[i].zext_dst = insn_has_def32(env, insn + i); + data[i].seen = old_seen; + data[i].zext_dst = insn_has_def32(insn + i); } - env->insn_aux_data = new_data; - vfree(old_data); } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -20765,10 +20754,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_insn_aux_data *new_data = NULL; if (len > 1) { - new_data = vzalloc(array_size(env->prog->len + len - 1, - sizeof(struct bpf_insn_aux_data))); + new_data = vrealloc(env->insn_aux_data, + array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data)), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_data) return NULL; + + env->insn_aux_data = new_data; } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); @@ -20777,10 +20770,9 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); - vfree(new_data); return NULL; } - adjust_insn_aux_data(env, new_data, new_prog, off, len); + adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; @@ -21131,7 +21123,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, * BPF_STX + SRC_OP, so it is safe to pass NULL * here. */ - if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) { + if (is_reg64(&insn, load_reg, NULL, DST_OP)) { if (class == BPF_LD && BPF_MODE(code) == BPF_IMM) i++; @@ -21400,10 +21392,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; case PTR_TO_ARENA: if (BPF_MODE(insn->code) == BPF_MEMSX) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; + if (!bpf_jit_supports_insn(insn, true)) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); + } else { + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; default: @@ -21578,6 +21574,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->main_prog_aux = prog->aux; for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; @@ -21608,6 +21605,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; if ((BPF_CLASS(insn->code) == BPF_STX || @@ -23855,6 +23853,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif @@ -24084,67 +24083,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -static bool can_fallthrough(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return 
true; - - if (opcode == BPF_EXIT || opcode == BPF_JA) - return false; - - return true; -} - -static bool can_jump(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return false; - - switch (opcode) { - case BPF_JA: - case BPF_JEQ: - case BPF_JNE: - case BPF_JLT: - case BPF_JLE: - case BPF_JGT: - case BPF_JGE: - case BPF_JSGT: - case BPF_JSGE: - case BPF_JSLT: - case BPF_JSLE: - case BPF_JCOND: - case BPF_JSET: - return true; - } - - return false; -} - -static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) -{ - struct bpf_insn *insn = &prog->insnsi[idx]; - int i = 0, insn_sz; - u32 dst; - - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - if (can_fallthrough(insn) && idx + 1 < prog->len) - succ[i++] = idx + insn_sz; - - if (can_jump(insn)) { - dst = idx + jmp_offset(insn) + 1; - if (i == 0 || succ[0] != dst) - succ[i++] = dst; - } - - return i; -} - /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ @@ -24342,7 +24280,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) u16 new_out = 0; u16 new_in = 0; - succ_num = insn_successors(env->prog, insn_idx, succ); + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); for (int s = 0; s < succ_num; ++s) new_out |= state[succ[s]].in; new_in = (new_out & ~live->def) | live->use; @@ -24379,9 +24317,6 @@ static int compute_live_registers(struct bpf_verifier_env *env) out: kvfree(state); - kvfree(env->cfg.insn_postorder); - env->cfg.insn_postorder = NULL; - env->cfg.cur_postorder = 0; return err; } @@ -24511,7 +24446,7 @@ static int compute_scc(struct bpf_verifier_env *env) stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = insn_successors(env->prog, w, succ); + succ_cnt = bpf_insn_successors(env->prog, w, succ); for (j = 0; j < succ_cnt; ++j) { if (pre[succ[j]]) { low[w] = min(low[w], low[succ[j]]); @@ -24684,6 +24619,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = compute_postorder(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_stack_liveness_init(env); + if (ret) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; @@ -24833,6 +24776,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: + bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); kvfree(env); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a3473..22051b4f1ccbc0 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct 
task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2a4a387f867abc..a9e029b570c8c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } @@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } @@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb73..6ae5f48cf64e34 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. 
+ * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: used for css offline work + * cgroup_release_wq: used for css release work + * cgroup_free_wq: used for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * See the comment above the CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while a task is between + * cgroup_threadgroup_change_begin() and end(), so down_write the global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exclude tasks between + * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning.
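Condensed into a table, the locking regimes implied by this flag and the new enum (an editorial summary of logic added later in this patch)::

    /*
     * write target                  rwsem flag ever set?  mode chosen
     * ----------------------------  --------------------  --------------------------------
     * current, single thread        (irrelevant)          CGRP_ATTACH_LOCK_NONE
     * explicit pid or threadgroup   no                    CGRP_ATTACH_LOCK_GLOBAL
     * explicit pid or threadgroup   yes                   CGRP_ATTACH_LOCK_PER_THREADGROUP
     */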
+ */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether to acquire a rwsem, and which one + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take the per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. See the comment above the + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode.\n"); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether to release a rwsem, and which one + * @tsk: thread group to unlock */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode.\n"); + break; + } + cpus_read_unlock(); } @@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ @@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf),
0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. - */ - lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); - +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } - get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers + * by cgroup_mutex. Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { + *lock_mode = CGRP_ATTACH_LOCK_NONE; + } + + cgroup_attach_lock(*lock_mode, tsk); + + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * A race with de_thread from another thread's exec() + * may strip us of our leadership. If this happens, + * throw this task away and try again. + */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + + return tsk; -out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. 
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5354,6 +5478,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, @@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5830,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, list_del_rcu(&css->sibling); err_free_css: 
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } @@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6312,6 +6443,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); @@ -6343,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6373,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. 
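A caller-side sketch of the resulting split (editorial; the error handling is illustrative only)::

    struct cgroup *cgrp;

    /* trusted in-kernel callers: no cgroup namespace restriction */
    cgrp = __cgroup_get_from_id(id);

    /* namespaced variant layered on top, as defined just below */
    cgrp = cgroup_get_from_id(id);
    if (IS_ERR(cgrp))
    	return PTR_ERR(cgrp);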
+ */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac69c..337608f408ce0a 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090d8..12e76774c75b0e 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675d4..52468d2c178a3e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -131,11 +132,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. 
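The cpuset_full_lock()/cpuset_full_unlock() helpers declared above fold the repeated cpus_read_lock() plus cpuset mutex sequence into one call; the converted cpuset-v1.c write paths now follow this shape (condensed from the hunks above)::

    cpuset_full_lock();
    if (!is_cpuset_online(cs)) {
    	retval = -ENODEV;
    	goto out_unlock;
    }
    /* ... apply the requested change ... */
    out_unlock:
    	cpuset_full_unlock();
    	return retval;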
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -250,6 +251,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper alone is not enough to make modifications. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -260,6 +267,24 @@ void cpuset_lock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed.
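The allocation loop relies on the classic unwind-on-failure idiom. A standalone analogue in plain C (editorial, not kernel code)::

    #include <stdlib.h>

    /* Allocate n zeroed buffers of sz bytes each; on failure free the
     * ones already allocated and report an error.
     */
    static int alloc_all(void **bufs, int n, size_t sz)
    {
    	int i;

    	for (i = 0; i < n; i++) {
    		bufs[i] = calloc(1, sz);
    		if (!bufs[i]) {
    			while (--i >= 0)
    				free(bufs[i]);
    			return -1;
    		}
    	}
    	return 0;
    }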
*/ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; + return 0; +} + +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ?
kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? 
*/ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } @@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. 
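A worked example of the removal (editorial; the CPU numbers are invented)::

    /*
     * parent->effective_xcpus    : 0-7
     * proposed excpus for cs     : 0-4
     * sibling A exclusive_cpus   : 2-3
     * sibling B effective_xcpus  : 4
     *
     * After the sibling walk, excpus is reduced to 0-1 and the return
     * value is 2: each sibling whose exclusive CPUs intersected the
     * proposal bumped the conflict count once.
     */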
*/ -static int compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus, - struct cpuset *real_cs) +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) { struct cgroup_subsys_state *css; - struct cpuset *parent = parent_cs(cs); struct cpuset *sibling; int retval = 0; - if (!xcpus) - xcpus = cs->effective_xcpus; - - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); - - if (!real_cs) { - if (!cpumask_empty(cs->exclusive_cpus)) - return 0; - } else { - cs = real_cs; - } + if (cpumask_empty(excpus)) + return retval; /* * Exclude exclusive CPUs from siblings @@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs, if (sibling == cs) continue; - if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } - if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); + return retval; } +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @excpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set, we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cpuset. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is a member, so cpuset.cpus has no impact on excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); @@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed.
*/ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its @@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; - /* - * A parent can be left with no CPU as long as there is no - * task directly associated with the parent partition. - */ - if (nocpu) + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* @@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = true; subparts_delta++; - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * For invalid partition: * delmask = newmask & parent->effective_xcpus */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) @@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. 
*/ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2177,7 +2257,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; - struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; - bool force = false; - int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; + return 0; +} - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. 
- */ - trialcs->partition_root_state = PRS_MEMBER; - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); - } + if (cs_is_member(trialcs)) + return PERR_NONE; - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } - } + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; - /* - * Check all the descendants in update_cpumasks_hier() if - * effective_xcpus is to be changed. - */ - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); @@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ - invalidate = true; + trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); @@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } + return retval; +} - if (retval < 0) - goto out_free; +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. 
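+ *
+ * As a rough model (hypothetical names, not the kernel API), the flow is
+ * "validate first, then take the disable path or the update path":
+ *
+ *   struct part;
+ *   enum perr { P_NONE, P_INVCPUS, P_HKEEPING, P_NOCPUS };
+ *   enum perr validate(struct part *p, const struct part *trial);
+ *   void disable_partition(struct part *p);
+ *   void update_partition(struct part *p, const struct part *trial);
+ *
+ *   static void cpus_change(struct part *p, const struct part *trial)
+ *   {
+ *           enum perr err = validate(p, trial);   // reason is recorded
+ *
+ *           if (err != P_NONE)
+ *                   disable_partition(p);         // becomes invalid
+ *           else
+ *                   update_partition(p, trial);   // push new masks down
+ *   }
+ *
+ * The remote/local distinction in the real function only changes which
+ * disable/update primitive is invoked.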
+ */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { - struct cpumask *xcpus = trialcs->effective_xcpus; + if (cs_is_member(cs)) + return; - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) - xcpus = trialcs->cpus_allowed; + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; - /* - * Call remote_cpus_update() to handle valid remote partition - */ - if (is_remote_partition(cs)) - remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, - xcpus, &tmp); + trialcs->effective_xcpus, tmp); } +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + if (retval < 0) + goto out_free; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); @@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; - if (!*buf) { - cpumask_clear(trialcs->exclusive_cpus); - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->exclusive_cpus); - if (retval < 0) - return retval; - } + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; - /* - * Reject the change if there is exclusive CPUs conflict with - * the siblings. - */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) - return -EINVAL; - } + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. 
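+ *
+ * Concrete example (CPU numbers invented): siblings already own
+ *
+ *   A.exclusive_cpus = 0-3      B.exclusive_cpus = 4-7
+ *
+ * and this cpuset writes "2-5". The sibling scan strips 2-3 (A) and
+ * 4-5 (B), returning a conflict count of 2, so the write fails with
+ * -EINVAL instead of silently shrinking the requested mask (the
+ * cpus_allowed path above keeps the trimmed mask instead).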
+ */ + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if @@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; - if (old_prs) { - if (cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); - if (is_remote_partition(cs)) { - if (invalidate) - remote_partition_disable(cs, &tmp); - else - remote_cpus_update(cs, trialcs->exclusive_cpus, - trialcs->effective_xcpus, &tmp); - } else if (invalidate) { - update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); - } else { - update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); - } - } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -2749,33 +2839,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, { int retval; - /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. 
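+ *
+ * For reference, a minimal user-space model of the list syntax accepted
+ * here (the kernel uses nodelist_parse(); this sketch handles only plain
+ * "n" and "a-b" items):
+ *
+ *   #include <stdlib.h>
+ *
+ *   static unsigned long parse_list(const char *s)
+ *   {
+ *           unsigned long mask = 0;
+ *
+ *           while (*s) {
+ *                   char *end;
+ *                   long a = strtol(s, &end, 10), b = a;
+ *
+ *                   if (*end == '-')
+ *                           b = strtol(end + 1, &end, 10);
+ *                   for (long i = a; i <= b; i++)
+ *                           mask |= 1UL << i;
+ *                   s = (*end == ',') ? end + 1 : end;
+ *           }
+ *           return mask;   // parse_list("0,2-3") == 0x0d
+ *   }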
*/ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + goto done; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) { + retval = -EINVAL; + goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { @@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -2983,7 +3058,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. 
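 *
 * The schedule_flush_migrate_mm() helper added above defers the workqueue
 * flush until the task is about to return to user space. A minimal
 * user-space model of that task_work pattern (invented names):
 *
 *   #include <stdlib.h>
 *
 *   struct cb { void (*fn)(struct cb *); struct cb *next; };
 *   static struct cb *pending;               // per-task list in the kernel
 *
 *   static void queue_cb(struct cb *c) { c->next = pending; pending = c; }
 *
 *   static void run_before_return(void)      // ~ the TWA_RESUME point
 *   {
 *           for (struct cb *c = pending; c; c = pending) {
 *                   pending = c->next;
 *                   c->fn(c);                // e.g. flush migrate_mm wq
 *                   free(c);                 // mirrors kfree() in the workfn
 *           }
 *   }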
*/ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } /* @@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - 
.post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, @@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); @@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) @@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cpumask_copy(pmask, possible_mask); } - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3b1..81ea38dd6f9d27 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dda0..6c18854bff3485 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. 
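 *
 * The freeze_seq seqcount introduced below lets a reader combine
 * frozen_nsec with an in-progress freeze interval without taking
 * css_set_lock. A C11 model of that retry loop (invented names):
 *
 *   #include <stdatomic.h>
 *   #include <stdint.h>
 *
 *   static _Atomic unsigned int seq;   // even: stable, odd: write pending
 *   static uint64_t frozen_nsec, freeze_start_nsec;
 *   static int frozen;
 *
 *   static uint64_t read_frozen_nsec(uint64_t now)
 *   {
 *           unsigned int s;
 *           uint64_t t;
 *
 *           do {
 *                   while ((s = atomic_load(&seq)) & 1)
 *                           ;                 // writer is mid-update
 *                   t = frozen_nsec;
 *                   if (frozen)
 *                           t += now - freeze_start_nsec;
 *           } while (atomic_load(&seq) != s); // raced: read again
 *           return t;
 *   }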
*/ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c664..fdbe57578e6886 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include #include #include - +#include /* cgroup namespaces */ @@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(new_ns); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + ns_tree_add(new_ns); + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { @@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc089..7c3924614e01de 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981a0..dbf6b687dc5c5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4e7..b82399437db031 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + 
struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae60..48757ca13f3140 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251df6..56de28a3b1799f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 408d28b5179df8..f62e1d1b2063ea 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. 
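+ *
+ * The "#ifndef + static inline" arrangement below lets an architecture
+ * override the default by supplying both a function and a same-named
+ * macro in its entry header. Self-contained illustration with generic
+ * names (not this file's symbols):
+ *
+ *   #include <stdbool.h>
+ *
+ *   // An arch that wants to veto rescheduling would provide:
+ *   //   static inline bool arch_hook(void) { return false; }
+ *   //   #define arch_hook arch_hook
+ *
+ *   #ifndef arch_hook
+ *   static inline bool arch_hook(void) { return true; }  // generic default
+ *   #endif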
+ */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d0104..808c0d7a31faf0 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (crosstask) + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) goto exit_put; + regs = task_pt_regs(current); + } - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } exit_put: diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62b7..7541f6f85fcb03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. 
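 *
 * For reference, a portable model of the refcount_dec_and_mutex_lock()
 * call used here (simplified: the real helper saturates on misuse and
 * handles the final-decrement race differently):
 *
 *   #include <pthread.h>
 *   #include <stdatomic.h>
 *   #include <stdbool.h>
 *
 *   static bool dec_and_mutex_lock(atomic_int *r, pthread_mutex_t *m)
 *   {
 *           if (atomic_fetch_sub(r, 1) != 1)
 *                   return false;          // other references remain
 *           pthread_mutex_lock(m);         // last reference: caller now
 *           return true;                   // owns the lock for teardown
 *   }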
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6933,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } -static int perf_mmap(struct file *file, struct vm_area_struct *vma) +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) { - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; + unsigned long user_locked, user_lock_limit, locked, lock_limit; struct user_struct *user = current_user(); - struct mutex *aux_mutex = NULL; - struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; - int ret, flags = 0; - mapped_f mapped; + + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm); /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } - ret = security_perf_event_read(event); - if (ret) - return ret; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; - vma_size = vma->vm_end - vma->vm_start; - nr_pages = vma_size / PAGE_SIZE; + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} - if (nr_pages > INT_MAX) - return -ENOMEM; +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); - if (vma_size != PAGE_SIZE * nr_pages) - return -EINVAL; + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} - user_extra = nr_pages; +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + nr_pages -= 1; /* - * This relies on __pmu_detach_event() taking mmap_mutex after marking - * the event REVOKED. Either we observe the state, or __pmu_detach_event() - * will detach the rb created here. 
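 *
 * The budget split in perf_mmap_calc_limits() above is plain arithmetic;
 * a user-space restatement (invented names): pages are charged to the
 * per-user mlock budget first and any overflow to the mm's pinned count.
 *
 *   static void split_charge(long pages, long user_locked, long user_limit,
 *                            long *user_extra, long *extra)
 *   {
 *           *user_extra = pages;
 *           *extra = 0;
 *           if (user_locked > user_limit)   // sysctl may have shrunk
 *                   user_locked = user_limit;
 *           user_locked += pages;
 *           if (user_locked > user_limit) {
 *                   *extra = user_locked - user_limit;  // -> pinned_vm
 *                   *user_extra -= *extra;              // -> locked_vm
 *           }
 *   }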
+ * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. */ - if (event->state <= PERF_EVENT_STATE_REVOKED) { - ret = -ENODEV; - goto unlock; - } - - if (vma->vm_pgoff == 0) { - nr_pages -= 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; - - WARN_ON_ONCE(event->ctx->parent_ctx); + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; + WARN_ON_ONCE(event->ctx->parent_ctx); - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. - */ - ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; - } + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb + * Success -- managed to mmap() the same buffer + * multiple times. */ - ring_buffer_attach(event, NULL); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; } - } else { /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. + * Raced against perf_mmap_close()'s + * refcount_dec_and_mutex_lock() remove the + * event and continue as if !event->rb */ - u64 aux_offset, aux_size; + ring_buffer_attach(event, NULL); + } - rb = event->rb; - if (!rb) - goto aux_unlock; + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); + rb = rb_alloc(nr_pages, + event->attr.watermark ? 
event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + if (!rb) + return -ENOMEM; - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + refcount_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + ring_buffer_attach(event, rb); - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + perf_mmap_account(vma, user_extra, extra); + refcount_set(&event->mmap_count, 1); - if (!is_power_of_2(nr_pages)) - goto aux_unlock; + return 0; +} - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - goto unlock; - } - } + rb = event->rb; + if (!rb) + return -EINVAL; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + guard(mutex)(&rb->aux_mutex); /* - * Increase the limit linearly with more CPUs: + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. */ - user_lock_limit *= num_online_cpus(); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); - user_locked = atomic_long_read(&user->locked_vm); + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; - WARN_ON(!rb && event->rb); + if (!is_power_of_2(nr_pages)) + return -EINVAL; - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (!refcount_inc_not_zero(&rb->mmap_count)) + return -EINVAL; - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); + if (rb_has_aux(rb)) { + refcount_inc(&rb->aux_mmap_count); - if (!rb) { - ret = -ENOMEM; - goto unlock; + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + refcount_dec(&rb->mmap_count); + return -EPERM; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; + WARN_ON(!rb && event->rb); - ring_buffer_attach(event, rb); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (!ret) { - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; + event->attr.aux_watermark, rb_flags); + if (ret) { + refcount_dec(&rb->mmap_count); + return ret; } + + refcount_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; } -unlock: - if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); - - atomic_inc(&event->mmap_count); - } else if (rb) { - /* AUX allocation failed */ - atomic_dec(&rb->mmap_count); - } -aux_unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); - mutex_unlock(&event->mmap_mutex); + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + + return 0; +} + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long vma_size, nr_pages; + mapped_f mapped; + int ret; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same rb. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + ret = security_perf_event_read(event); if (ret) return ret; + vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; + } + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. @@ -7174,7 +7186,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * full cleanup in this case and therefore does not invoke * vmops::close(). */ - ret = map_range(rb, vma); + ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); @@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. 
*/ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable(); @@ -8192,7 +8204,8 @@ struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; @@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } @@ -11232,6 +11245,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. */ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); @@ -12217,7 +12234,7 @@ static const struct attribute_group *pmu_dev_groups[] = { }; static int pmu_bus_running; -static struct bus_type pmu_bus = { +static const struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; @@ -13249,7 +13266,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13262,7 +13279,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8dcf..d9cc5708309184 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824fee..20a90502373620 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). 
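 *
 * For reference, refcount_inc_not_zero() is the classic compare-and-swap
 * loop; a C11 model (minus the kernel's saturation semantics):
 *
 *   #include <stdatomic.h>
 *   #include <stdbool.h>
 *
 *   static bool inc_not_zero(atomic_int *r)
 *   {
 *           int old = atomic_load(r);
 *
 *           do {
 *                   if (old == 0)
 *                           return false;   // object already on its way out
 *           } while (!atomic_compare_exchange_weak(r, &old, old + 1));
 *           return true;
 *   }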
*/ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd83..5dcf927310fd53 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -121,7 +121,7 @@ struct xol_area { static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -482,23 +483,32 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. 
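 *
 * uprobe_write_opcode() is now a wrapper around uprobe_write(); the shape
 * of that generalization is a write routine parameterized by a verify
 * callback. Generic sketch, invented names and signatures:
 *
 *   #include <string.h>
 *
 *   typedef int (*verify_t)(void *dst, const void *insn, size_t n);
 *
 *   static int checked_write(void *dst, const void *insn, size_t n,
 *                            verify_t verify)
 *   {
 *           int ret = verify(dst, insn, n);  // 0: nothing to do, <0: error
 *
 *           if (ret <= 0)
 *                   return ret;
 *           memcpy(dst, insn, n);            // >0: perform the update
 *           return 0;
 *   }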
*/ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -521,14 +531,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -560,7 +570,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -580,7 +590,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? 
-1 : 1); /* try collapse pmd for compound page */ @@ -602,7 +612,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ @@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; } @@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -2160,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
*/ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; @@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2741,6 +2765,16 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* + * If user decided to take execution elsewhere, it makes little sense + * to execute the original instruction, so let's skip it. + */ + if (instruction_pointer(regs) != bp_vaddr) + goto out; + + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2752,6 +2786,23 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. 
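The uprobe_write_opcode() -> uprobe_write() split above hands callers control over the bytes written, the verification step and the ref_ctr accounting. A minimal caller sketch against the new interface; verify_any() and write_insn_bytes() are hypothetical names used only to illustrate the contract (a verify callback returning > 0 lets the write proceed, a value <= 0 aborts it and is returned to the caller):

	/* Hypothetical verifier: accept the target bytes unconditionally. */
	static int verify_any(struct page *page, unsigned long vaddr,
			      uprobe_opcode_t *insn, int nbytes, void *data)
	{
		return 1;	/* > 0: go ahead with the write */
	}

	/* Hypothetical caller: write @nbytes at @vaddr without touching ref_ctr. */
	static int write_insn_bytes(struct arch_uprobe *auprobe,
				    struct vm_area_struct *vma, unsigned long vaddr,
				    uprobe_opcode_t *insn, int nbytes)
	{
		return uprobe_write(auprobe, vma, vaddr, insn, nbytes, verify_any,
				    true /* is_register */,
				    false /* do_update_ref_ctr */, NULL);
	}

uprobe_write_opcode() itself is now exactly this pattern, with verify_opcode() as the callback and ref_ctr updates enabled.
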
diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd5e..cffa6157a55ac1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1507,7 +1508,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1546,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1567,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1596,7 +1597,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1645,7 +1646,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; @@ -1688,6 +1689,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -2295,7 +2300,7 @@ __latent_entropy struct task_struct *copy_process( if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) - goto bad_fork_core_free; + goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. 
We assume that another thread will be created diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index c716a66f86929c..d818b4d47f1bad 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -230,8 +230,9 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - q->key = *key; + struct task_struct *task; + q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); @@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, futex_hash_get(hb); q->drop_hb_ref = true; q->lock_ptr = &hb->lock; + task = READ_ONCE(q->task); /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); + wake_up_state(task, TASK_NORMAL); } /** diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 4b6da9116aa6c3..880c9bf2f31504 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -39,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, return 0; } +static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) +{ +#ifdef CONFIG_COMPAT + if (compat) + return p->compat_robust_list; +#endif + return p->robust_list; +} + +static void __user *futex_get_robust_list_common(int pid, bool compat) +{ + struct task_struct *p = current; + void __user *head; + int ret; + + scoped_guard(rcu) { + if (pid) { + p = find_task_by_vpid(pid); + if (!p) + return (void __user *)ERR_PTR(-ESRCH); + } + get_task_struct(p); + } + + /* + * Hold exec_update_lock to serialize with concurrent exec() + * so ptrace_may_access() is checked against stable credentials + */ + ret = down_read_killable(&p->signal->exec_update_lock); + if (ret) + goto err_put; + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) + goto err_unlock; + + head = futex_task_robust_list(p, compat); + + up_read(&p->signal->exec_update_lock); + put_task_struct(p); + + return head; + +err_unlock: + up_read(&p->signal->exec_update_lock); +err_put: + put_task_struct(p); + return (void __user *)ERR_PTR(ret); +} + /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] @@ -49,36 +99,14 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { - struct robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto err_unlock; + struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); - head = p->robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -455,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { - struct compat_robust_list_head __user *head; - unsigned long ret; - struct task_struct *p; - - rcu_read_lock(); - - ret = -ESRCH; - if (!pid) - p = current; - else { - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - } - - ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) - goto 
err_unlock; + struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); - head = p->compat_robust_list; - rcu_read_unlock(); + if (IS_ERR(head)) + return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1da5e9d9da7193..1b4254d19a73ec 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool @@ -147,7 +143,9 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD config IRQ_KUNIT_TEST bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS depends on KUNIT=y + depends on SPARSE_IRQ default KUNIT_ALL_TESTS + select IRQ_DOMAIN imply SMP help This option enables KUnit tests for the IRQ subsystem API. These are diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0d0276378c707c..3ffa0d80ddd19c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1259,6 +1259,43 @@ int irq_chip_get_parent_state(struct irq_data *data, } EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); +/** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). + */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index eb16a58e0322e2..b411886986222d 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data) return this->irq == match->irq && this->dev_id == match->dev_id; } -/** - * devm_request_threaded_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_threaded_irq(). IRQs requested with this function will be - * automatically freed on driver detach. 
- * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. - */ -int devm_request_threaded_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, irq_handler_t thread_fn, - unsigned long irqflags, const char *devname, - void *dev_id) +static int devm_request_result(struct device *dev, int rc, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + const char *devname) +{ + if (rc >= 0) + return rc; + + return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n", + irq, handler, thread_fn, devname ? : ""); +} + +static int __devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_threaded_irq); /** - * devm_request_any_context_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device, dev_name(dev) if NULL - * @dev_id: A cookie passed back to the handler function + * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_any_context_irq(). IRQs requested with this function will be - * automatically freed on driver detach. + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_threaded_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. + * Return: 0 on success or a negative error number. 
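+ *
+ * A minimal caller sketch under these semantics (device and handler
+ * names are hypothetical)::
+ *
+ *	ret = devm_request_threaded_irq(&pdev->dev, irq, foo_hardirq,
+ *					foo_thread_fn, IRQF_ONESHOT,
+ *					NULL, foo);
+ *	if (ret)
+ *		return ret;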
*/ -int devm_request_any_context_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) +{ + int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn, + irqflags, devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, thread_fn, devname); +} +EXPORT_SYMBOL(devm_request_threaded_irq); + +static int __devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) { struct irq_devres *dr; int rc; @@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, return rc; } + +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging + * @dev: Device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the interrupt occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the same + * arguments and performs the same function as request_any_context_irq(). + * Interrupts requested with this function will be automatically freed on + * driver detach. + * + * If an interrupt allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + * + * When the request fails, an error message is printed with contextual + * information (device name, interrupt number, handler functions and + * error code). Don't add extra error messages at the call sites. + * + * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error + * number. 
+ */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags, + devname, dev_id); + + return devm_request_result(dev, rc, irq, handler, NULL, devname); +} EXPORT_SYMBOL(devm_request_any_context_irq); /** diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9489f93b3db344..e103451243a0b6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } +static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); +static u64 irqhandler_duration_threshold_ns __ro_after_init; + +static int __init irqhandler_duration_check_setup(char *arg) +{ + unsigned long val; + int ret; + + ret = kstrtoul(arg, 0, &val); + if (ret) { + pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret); + return 0; + } + + if (!val) { + pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n"); + return 0; + } + + irqhandler_duration_threshold_ns = val * 1000; + static_branch_enable(&irqhandler_duration_check_enabled); + + return 1; +} +__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup); + +static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq, + const struct irqaction *action) +{ + u64 delta_ns = local_clock() - ts_start; + + if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) { + pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n", + smp_processor_id(), irq, action->handler, + div_u64(delta_ns, NSEC_PER_USEC)); + } +} + irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; @@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) lockdep_hardirq_threaded(); trace_irq_handler_entry(irq, action); - res = action->handler(irq, action->dev_id); + + if (static_branch_unlikely(&irqhandler_duration_check_enabled)) { + u64 ts_start = local_clock(); + + res = action->handler(irq, action->dev_id); + irqhandler_duration_check(ts_start, irq, action); + } else { + res = action->handler(irq, action->dev_id); + } + trace_irq_handler_exit(irq, action, res); if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index a75abebed7f248..e2d31914b3c443 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -41,21 +41,37 @@ static struct irq_chip fake_irq_chip = { .flags = IRQCHIP_SKIP_SET_WAKE, }; -static void irq_disable_depth_test(struct kunit *test) +static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd) { struct irq_desc *desc; - int virq, ret; + int virq; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd); KUNIT_ASSERT_GE(test, virq, 0); - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. 
*/ + irq_settings_clr_norequest(desc); + + return virq; +} + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_test_setup_fake_irq(test, NULL); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -73,16 +89,13 @@ static void irq_free_disabled_test(struct kunit *test) struct irq_desc *desc; int virq, ret; - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, NULL); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); @@ -93,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test) KUNIT_EXPECT_GE(test, desc->depth, 1); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_EQ(test, desc->depth, 0); free_irq(virq, NULL); @@ -112,10 +125,7 @@ static void irq_shutdown_depth_test(struct kunit *test) if (!IS_ENABLED(CONFIG_SMP)) kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); @@ -124,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); @@ -169,13 +179,12 @@ static void irq_cpuhotplug_test(struct kunit *test) kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); if (!cpu_is_hotpluggable(1)) kunit_skip(test, "CPU 1 must be hotpluggable"); + if (!cpu_online(1)) + kunit_skip(test, "CPU 1 must be online"); cpumask_copy(&affinity.mask, cpumask_of(1)); - virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); - KUNIT_ASSERT_GE(test, virq, 0); - - irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + virq = irq_test_setup_fake_irq(test, &affinity); desc = irq_to_desc(virq); KUNIT_ASSERT_PTR_NE(test, desc, NULL); @@ -184,7 +193,7 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_ASSERT_PTR_NE(test, data, NULL); ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); - KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_ASSERT_EQ(test, ret, 0); KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); @@ -196,13 +205,9 @@ static void irq_cpuhotplug_test(struct kunit *test) KUNIT_EXPECT_EQ(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_GE(test, desc->depth, 1); KUNIT_EXPECT_EQ(test, add_cpu(1), 0); - KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); - KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); KUNIT_EXPECT_EQ(test, desc->depth, 1); enable_irq(virq); diff --git 
a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b64c57b44c2037..db714d3014b5f7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq) irq_insert_desc(irq, irq_desc + irq); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); -} -#endif - #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914cb..e7ad992548416a 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68fd..2a1beebf1d37a1 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb7c..f8e8c126705cbd 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. 
*/ diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 00000000000000..c1fb2bad6d7295 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + } +} +#endif + +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) +{ + refcount_set(&ns->__ns_ref, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + ns->ns_type = ns_type; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); +} + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38f5..19aa64ab08c830 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 00000000000000..b24a320a11a683 --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + 
VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+	node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+	/*
+	 * If there's no previous entry, simply add it after the
+	 * head; if there is, add it after the previous entry.
+	 */
+	prev = rb_prev(&ns->ns_tree_node);
+	if (!prev)
+		list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+	else
+		list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+	write_sequnlock(&ns_tree->ns_tree_lock);
+
+	VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+	VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+	VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+	VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+	write_seqlock(&ns_tree->ns_tree_lock);
+	rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+	list_bidir_del_rcu(&ns->ns_list_node);
+	RB_CLEAR_NODE(&ns->ns_tree_node);
+	write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+	const u64 ns_id = *(u64 *)key;
+	const struct ns_common *ns = node_to_ns(node);
+
+	if (ns_id < ns->ns_id)
+		return -1;
+	if (ns_id > ns->ns_id)
+		return 1;
+	return 0;
+}
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+	switch (ns_type) {
+	case CLONE_NEWCGROUP:
+		return &cgroup_ns_tree;
+	case CLONE_NEWIPC:
+		return &ipc_ns_tree;
+	case CLONE_NEWNS:
+		return &mnt_ns_tree;
+	case CLONE_NEWNET:
+		return &net_ns_tree;
+	case CLONE_NEWPID:
+		return &pid_ns_tree;
+	case CLONE_NEWUSER:
+		return &user_ns_tree;
+	case CLONE_NEWUTS:
+		return &uts_ns_tree;
+	case CLONE_NEWTIME:
+		return &time_ns_tree;
+	}
+
+	return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+	struct ns_tree *ns_tree;
+	struct rb_node *node;
+	unsigned int seq;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+	ns_tree = ns_tree_from_type(ns_type);
+	if (!ns_tree)
+		return NULL;
+
+	do {
+		seq = read_seqbegin(&ns_tree->ns_tree_lock);
+		node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+		if (node)
+			break;
+	} while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+	if (!node)
+		return NULL;
+
+	VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+
+	return node_to_ns(node);
+}
+
+/**
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same tree
+ * @ns: namespace to start from
+ * @ns_tree: namespace tree to walk
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+					 struct ns_tree *ns_tree, bool previous)
+{
+	struct list_head *list;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+	if (previous)
+		list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+	else
+		list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+	if (list_is_head(list, &ns_tree->ns_list))
+		return ERR_PTR(-ENOENT);
+
+	VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
+
+	return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespace types share the same id space and thus can be compared
+ * directly.
+ * IOW, when two ids of two namespaces are equal, they are identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+	guard(preempt)();
+	ns->ns_id = gen_cookie_next(&namespace_cookie);
+	return ns->ns_id;
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index c45a28c16cd256..4fffec767a63ad 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT;
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.ns.count = REFCOUNT_INIT(2),
+	.ns.__ns_ref = REFCOUNT_INIT(2),
 	.idr = IDR_INIT(init_pid_ns.idr),
 	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
-	.ns.inum = PROC_PID_INIT_INO,
+	.ns.inum = ns_init_inum(&init_pid_ns),
 #ifdef CONFIG_PID_NS
 	.ns.ops = &pidns_operations,
 #endif
@@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = {
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
+	.ns.ns_type = ns_common_type(&init_pid_ns),
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 	struct upid *upid;
 	pid_t nr = 0;

-	if (pid && ns->level <= pid->level) {
+	if (pid && ns && ns->level <= pid->level) {
 		upid = &pid->numbers[ns->level];
 		if (upid->ns == ns)
 			nr = upid->nr;
@@ -514,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	rcu_read_lock();
 	if (!ns)
 		ns = task_active_pid_ns(current);
-	nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+	if (ns)
+		nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
 	rcu_read_unlock();

 	return nr;
@@ -680,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head,
 		container_of(head->set, struct pid_namespace, set);
 	int mode = table->mode;

-	if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+	if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
 	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
 		mode = (mode & S_IRWXU) >> 6;
 	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7098ed44e717d3..650be58d8d1864 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns->pid_cachep == NULL)
 		goto out_free_idr;

-	err = ns_alloc_inum(&ns->ns);
+	err = ns_common_init(ns);
 	if (err)
 		goto out_free_idr;
-	ns->ns.ops = &pidns_operations;

 	ns->pid_max = PID_MAX_LIMIT;
 	err = register_pidns_sysctls(ns);
 	if (err)
 		goto out_free_inum;

-	refcount_set(&ns->ns.count, 1);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
@@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
 #endif

+	ns_tree_add(ns);
 	return ns;

 out_free_inum:
-	ns_free_inum(&ns->ns);
+	ns_common_free(ns);
 out_free_idr:
 	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
@@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p)

 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
+	ns_tree_remove(ns);
 	unregister_pidns_sysctls(ns);

-	ns_free_inum(&ns->ns);
+	ns_common_free(ns);

 	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
@@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work)

 		parent = ns->parent;
destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) @@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. 
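	 *
	 * Concretely, for a namespace chain  init_pid_ns -> A -> B:
	 * pidns_is_ancestor(B, A) and pidns_is_ancestor(B, B) hold,
	 * while pidns_is_ancestor(A, B) fails because A sits above B.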
*/ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); @@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, @@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2f66ab45382319..14e85ff2355123 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -80,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); @@ -695,19 +706,13 @@ static void power_down(void) #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { + pm_restore_gfp_mask(); error = suspend_devices_and_enter(mem_sleep_current); - if (error) { - hibernation_mode = hibernation_ops ? - HIBERNATION_PLATFORM : - HIBERNATION_SHUTDOWN; - } else { - /* Restore swap signature. */ - error = swsusp_unmark(); - if (error) - pr_err("Swap will be unusable! Try swapon -a.\n"); + if (!error) + goto exit; - return; - } + hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; } #endif @@ -718,10 +723,9 @@ static void power_down(void) case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { - swsusp_unmark(); events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); - return; + goto exit; } fallthrough; case HIBERNATION_SHUTDOWN: @@ -740,6 +744,15 @@ static void power_down(void) pr_crit("Power down manually\n"); while (1) cpu_relax(); + +exit: + /* Match the pm_restore_gfp_mask() call in hibernate(). */ + pm_restrict_gfp_mask(); + + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); } static int load_image_and_restore(void) diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e8117..2452b7366b00e9 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) /* * Load and clear event mask atomically with respect to - * scheduler preemption. + * scheduler preemption and membarrier IPIs. 
*/ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } return !!event_mask; } diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index c4a488e67aa7d8..755883faf75186 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,6 +58,7 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include "ext_internal.h" # include "ext.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4cc..198d2dd45f59cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include #include #include #include @@ -917,7 +919,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); + rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. - */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -4472,7 +4437,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. 
*/ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4490,6 +4455,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS @@ -4707,7 +4675,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* @@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, false); - - scx_cgroup_finish_attach(); } static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) @@ -9551,7 +9517,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e4714..615411a0a8813d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. 
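With dl_server_timer() above no longer consulting server_has_tasks(), a deadline server is registered with just a pick callback; the server is now parked via dl_server_stop() once that callback returns no task (see the __pick_task_dl() hunk below). A sketch of the resulting registration call, assuming the fair-server names from kernel/sched/fair.c:

	/* fair server registration after the has_tasks callback removal */
	dl_server_init(&rq->fair_server, rq, fair_server_pick_task);
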
@@ -1579,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1610,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } @@ -2394,10 +2368,7 @@ static struct task_struct *__pick_task_dl(struct rq *rq) if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; @@ -2580,6 +2551,25 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); + + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -2607,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || + /* + * double_lock_balance had to release rq->lock, in the + * meantime, task may no longer be fit to be migrated. + * Check the following to ensure that the task is + * still suitable for migration: + * 1. It is possible the task was scheduled, + * migrate_disabled was set and then got preempted, + * so we must check the task migration disable + * flag. + * 2. The CPU picked is in the task's affinity. + * 3. For throttled task (dl_task_offline_migration), + * check the following: + * - the task is not on the rq anymore (it was + * migrated) + * - the task is not on CPU anymore + * - the task is still a dl task + * - the task is not queued on the rq anymore + * 4. For the non-throttled task (push_dl_task), the + * check to ensure that this task is still at the + * head of the pushable tasks list is enough. 
+ */ + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !dl_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + (task->dl.dl_throttled && + (task_rq(task) != rq || + task_on_cpu(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) || + (!task->dl.dl_throttled && + task != pick_next_pushable_dl_task(rq)))) { + double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2635,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - - WARN_ON_ONCE(rq->cpu != task_cpu(p)); - WARN_ON_ONCE(task_current(rq, p)); - WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - - WARN_ON_ONCE(!task_on_rq_queued(p)); - WARN_ON_ONCE(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd0f..2b0e88206d0768 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,1040 +9,6 @@ #include #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes - * are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. - */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. 
- */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. 
- */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, - - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, - - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; - - /* bandwidth control parameters from cpu.max and cpu.max.burst */ - u64 bw_period_us; - u64 bw_quota_us; - u64 bw_burst_us; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. 
- * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. - */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. 
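The state-notifier overview above (a task becomes ->runnable(), goes through one or more ->running()/->stopping() pairs, and eventually becomes ->quiescent()) is easy to misread; this small self-contained sketch, with hypothetical names and assertions standing in for the real callbacks, encodes the legal ordering:

#include <assert.h>
#include <stdio.h>

enum task_state { TS_QUIESCENT, TS_RUNNABLE, TS_RUNNING };

static enum task_state st = TS_QUIESCENT;

static void on_runnable(void)  { assert(st == TS_QUIESCENT); st = TS_RUNNABLE; }
static void on_running(void)   { assert(st == TS_RUNNABLE);  st = TS_RUNNING; }
static void on_stopping(void)  { assert(st == TS_RUNNING);   st = TS_RUNNABLE; }
static void on_quiescent(void) { assert(st == TS_RUNNABLE);  st = TS_QUIESCENT; }

int main(void)
{
	/* ->runnable(), then one or more ->running()/->stopping() pairs,
	 * then ->quiescent(), as documented above */
	on_runnable();
	on_running(); on_stopping();
	on_running(); on_stopping();
	on_quiescent();
	printf("transition order ok\n");
	return 0;
}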
- * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * Note that this callback may be called from a CPU other than the - * one the task is going to run on. This can happen when a task - * property is changed (i.e., affinity), since scx_next_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to determine the - * target CPU the task is going to use. - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * Note that this callback may be called from a CPU other than the - * one the task was running on. This can happen when a task - * property is changed (i.e., affinity), since dequeue_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU - * the task was running on. - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. - * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. 
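For the ordering callback above, the default behavior (described just below) amounts to comparing when each task became runnable. A userspace sketch of such a comparator, with a hypothetical runnable_at timestamp standing in for the kernel's core_sched_at:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical task record; runnable_at mimics a "became runnable" clock */
struct task { const char *name; unsigned long long runnable_at; };

/* mirrors the default ordering: the task that became runnable
 * earlier should run first */
static bool core_sched_before(const struct task *a, const struct task *b)
{
	return a->runnable_at < b->runnable_at;
}

int main(void)
{
	struct task a = { "a", 100 }, b = { "b", 200 };

	printf("%s runs first\n", core_sched_before(&a, &b) ? a.name : b.name);
	return 0;
}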
- * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_select_cpu_and() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. - */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). 
- */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation may block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). - * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @cgrp's weight to @weight.
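The cgroup_prep_move()/cgroup_move()/cgroup_cancel_move() trio above is a classic two-phase prepare/commit-or-cancel protocol: allocations happen in the blocking prep step, and exactly one of commit or cancel consumes the prepared state. A minimal userspace sketch of that shape, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

/* hypothetical per-move context allocated during the blocking prep phase */
struct move_ctx { int from, to; };

static struct move_ctx *pending;

static int cgroup_prep_move(int from, int to)
{
	pending = malloc(sizeof(*pending));
	if (!pending)
		return -1;	/* an error return aborts the migration */
	pending->from = from;
	pending->to = to;
	return 0;
}

/* exactly one of the following consumes the prepared state */
static void cgroup_move(void)
{
	printf("moved %d -> %d\n", pending->from, pending->to);
	free(pending);
	pending = NULL;
}

static void cgroup_cancel_move(void)
{
	free(pending);
	pending = NULL;
}

int main(void)
{
	if (cgroup_prep_move(1, 2))
		return 1;
	cgroup_move();		/* or cgroup_cancel_move() on failure */
	return 0;
}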
- */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); - - /** - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed - * @cgrp: cgroup whose bandwidth is being updated - * @period_us: bandwidth control period - * @quota_us: bandwidth control quota - * @burst_us: bandwidth control burst - * - * Update @cgrp's bandwidth control parameters. This is from the cpu.max - * cgroup interface. - * - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled - * to. For example, if @period_us is 1_000_000 and @quota_us is - * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be - * interpreted in the same fashion and specifies how much @cgrp can - * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is up to the BPF - * scheduler. - */ - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, - u64 period_us, u64 quota_us, u64 burst_us); - -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; - - /* internal use only, must be NULL */ - void *priv; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -/* - * Collection of event counters.
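Putting the operation table above together, a minimal BPF scheduler needs surprisingly little. The sketch below loosely follows the scx_simple example and assumes the sched_ext BPF tooling headers from tools/sched_ext (BPF_STRUCT_OPS, SCX_DSQ_LOCAL, SCX_SLICE_DFL); treat it as an outline under those assumptions, not a drop-in scheduler:

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	return prev_cpu;	/* trivial policy: stay on the previous CPU */
}

void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* insert into the local DSQ with the default slice */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops minimal_ops = {
	.select_cpu	= (void *)minimal_select_cpu,
	.enqueue	= (void *)minimal_enqueue,
	.name		= "minimal",
};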
Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * Total number of times a task's time slice was refilled with the - * default value (SCX_SLICE_DFL). - */ - s64 SCX_EV_REFILL_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -struct scx_sched { - struct sched_ext_ops ops; - DECLARE_BITMAP(has_op, SCX_OPI_END); - - /* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. - * This is to avoid live-locking in bypass mode where all tasks are - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If - * per-node split isn't sufficient, it can be further split. - */ - struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; - - bool warned_zero_slice; - - atomic_t exit_kind; - struct scx_exit_info *exit_info; - - struct kobject kobj; - - struct kthread_worker *helper; - struct irq_work error_irq_work; - struct kthread_work disable_work; - struct rcu_work rcu_work; -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. - */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback.
- */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_enable_state { - SCX_ENABLING, - SCX_ENABLED, - SCX_DISABLING, - SCX_DISABLED, -}; - -static const char *scx_enable_state_str[] = { - [SCX_ENABLING] = "enabling", - [SCX_ENABLED] = "enabled", - [SCX_DISABLING] = "disabling", - [SCX_DISABLED] = "disabled", -}; - -/* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. - * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches.
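To make the ownership diagram above concrete: the state lives in the low bits of p->scx.ops_state and the QSEQ brand sits above it (the SCX_OPSS_QSEQ_SHIFT definitions follow just below). A userspace sketch of the packing, deliberately ignoring the store_release/load_acquire ordering the kernel needs:

#include <stdio.h>

enum { OPSS_NONE, OPSS_QUEUEING, OPSS_QUEUED, OPSS_DISPATCHING };

#define QSEQ_SHIFT	2UL			/* mirrors SCX_OPSS_QSEQ_SHIFT */
#define STATE_MASK	((1UL << QSEQ_SHIFT) - 1)

static unsigned long opss_pack(unsigned long state, unsigned long qseq)
{
	return state | (qseq << QSEQ_SHIFT);
}

int main(void)
{
	/* each QUEUED instance is branded with a fresh qseq so a racing
	 * dispatch can tell whether its claim on the task is still valid */
	unsigned long v = opss_pack(OPSS_QUEUED, 12345);

	printf("state=%lu qseq=%lu\n", v & STATE_MASK, v >> QSEQ_SHIFT);
	return 0;
}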
- */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) - /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -1170,7 +136,7 @@ static struct kset *scx_kset; #include static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch, va_end(args); } -static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) -{ - struct scx_sched *sch; - va_list args; - - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); - va_end(args); - } - rcu_read_unlock(); -} - #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_kf_error(fmt, args...) 
scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } @@ -1363,11 +311,11 @@ do { \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_kf_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_kf_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_kf_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } @@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; + iter->locked_task = NULL; iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. 
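The iterator's batching deserves a note: it holds scx_tasks_lock across at most SCX_TASK_ITER_BATCH visits, then drops it and reschedules so RCU/CSD work is not stalled, relying on the cursor node to keep its list position across the unlock. A rough userspace analogue using a mutex, with the list revalidation omitted:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define BATCH 32	/* mirrors SCX_TASK_ITER_BATCH */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void visit_all(int nr_items)
{
	int cnt = 0;

	pthread_mutex_lock(&lock);
	for (int i = 0; i < nr_items; i++) {
		/* ... visit item i under the lock ... */
		if (!(++cnt % BATCH)) {
			/* drop the lock periodically so other waiters run;
			 * the kernel iterator's cursor node keeps the list
			 * position valid across this window */
			pthread_mutex_unlock(&lock);
			sched_yield();	/* cond_resched() stand-in */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	visit_all(100);
	puts("done");
	return 0;
}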
*/ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. - */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; + __scx_task_iter_maybe_relock(iter); + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); + __scx_task_iter_maybe_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { @@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } @@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This can be used when preemption is not disabled. */ #define scx_add_event(sch, name, cnt) do { \ - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, (cnt)); \ } while(0) @@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This should be used only when preemption is disabled. */ #define __scx_add_event(sch, name, cnt) do { \ - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1765,23 +719,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) } } -/** - * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args - * @cpu: cpu number which came from a BPF ops - * @where: extra information reported on error - * - * The same as ops_cpu_valid() but @sch is implicit. - */ -static bool kf_cpu_valid(u32 cpu, const char *where) -{ - if (__cpu_valid(cpu)) { - return true; - } else { - scx_kf_error("invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); - return false; - } -} - /** * ops_sanitize_err - Sanitize a -errno value * @sch: scx_sched to error out on error @@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void refill_task_slice_dfl(struct task_struct *p) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, @@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return find_global_dsq(p); + return find_global_dsq(sch, p); } return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_kf_error("%s[%d] already direct-dispatched", + scx_error(sch, "%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -2333,15 +1271,15 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, * higher priority it becomes from scx_prio_less()'s POV. 
*/ touch_core_sched(rq, p); - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); local_norefill: dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(p); - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(scx_root, - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(p), p, + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(p); + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); if (!p) { if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + scx_kick_cpu(rcu_dereference_sched(scx_root), + cpu_of(rq), SCX_KICK_IDLE); return NULL; } if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = rcu_dereference_sched(scx_root); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; } - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); } } @@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } else { cpu = prev_cpu; @@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) @@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg) WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = @@ -4122,7 +3058,6 @@ int 
scx_tg_online(struct task_group *tg) tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } @@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg) WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); - - percpu_up_read(&scx_cgroup_rwsem); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -4192,7 +3120,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); return ops_sanitize_err(sch, "cgroup_prep_move", ret); } @@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p) p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { struct scx_sched *sch = scx_root; @@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && @@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) tg->scx.weight = weight; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) @@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg, { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg, tg->scx.bw_quota_us = quota_us; tg->scx.bw_burst_us = burst_us; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are 
excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. */ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); @@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); } static int scx_cgroup_init(struct scx_sched *sch) @@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch) struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. */ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { @@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { @@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch) return ret; } tg->scx.flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; @@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; kthread_stop(sch->helper->task); - free_percpu(sch->event_stats_cpu); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -4671,9 +3571,22 @@ bool task_should_scx(int policy) bool scx_allow_ttwu_queue(const struct task_struct *p) { - return !scx_enabled() || - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || - p->sched_class != &ext_sched_class; + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; } /** @@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void) * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. 
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); @@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); - if (!sch->event_stats_cpu) + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); if (!sch->helper) - goto err_free_event_stats; + goto err_free_pcpu; sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) err_stop_helper: kthread_stop(sch->helper->task); -err_free_event_stats: - free_percpu(sch->event_stats_cpu); +err_free_pcpu: + free_percpu(sch->pcpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) @@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); @@ -5795,7 +4708,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: mutex_unlock(&scx_enable_mutex); /* @@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_kf_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_kf_error("invalid enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } return true; } -static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_kf_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice else p->scx.slice = p->scx.slice ?: 1; - scx_dsq_insert_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); } /** @@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } __bpf_kfunc_end_defs(); @@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; /* @@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; + + guard(rcu)(); + + sch = 
rcu_dereference(scx_root); + if (unlikely(!sch)) + return; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_kf_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { - struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; flush_dispatch_buf(sch, dspc->rq); @@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) { + struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) return 0; rq = cpu_rq(smp_processor_id()); @@ -6788,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); @@ -6881,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); -/** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags - * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. - */ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; - if (!kf_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return; local_irq_save(irq_flags); @@ -6920,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6944,6 +5886,26 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) local_irq_restore(irq_flags); } +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. 
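+ *
+ * A minimal sketch of waking an idle CPU after inserting a task
+ * (illustrative only; MY_DSQ is a hypothetical scheduler-created DSQ and
+ * the surrounding ops.enqueue() body is assumed):
+ *
+ *	scx_bpf_dsq_insert(p, MY_DSQ, SCX_SLICE_DFL, enq_flags);
+ *	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+ *	if (cpu >= 0)
+ *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);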
+ */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ @@ -7124,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_kf_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_kf_error("format preparation failed (%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -7153,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -7182,11 +6145,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7202,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7225,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, __bpf_kfunc 
void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 				   u32 data__sz)
 {
+	struct scx_sched *sch;
 	struct scx_dump_data *dd = &scx_dump_data;
 	struct scx_bstr_buf *buf = &dd->buf;
 	s32 ret;
 
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return;
+
 	if (raw_smp_processor_id() != dd->cpu) {
-		scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+		scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
 		return;
 	}
 
 	/* append the formatted string to the line buf */
-	ret = __bstr_format(buf->data, buf->line + dd->cursor,
+	ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
 			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
 	if (ret < 0) {
 		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7271,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
  */
 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
 {
-	if (kf_cpu_valid(cpu, NULL))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_cpu_capacity(cpu);
 	else
 		return SCX_CPUPERF_ONE;
@@ -7293,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
  */
 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
 {
-	if (kf_cpu_valid(cpu, NULL))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_freq_capacity(cpu);
 	else
 		return SCX_CPUPERF_ONE;
@@ -7315,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
  */
 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
 {
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return;
+
 	if (unlikely(perf > SCX_CPUPERF_ONE)) {
-		scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
 		return;
 	}
 
-	if (kf_cpu_valid(cpu, NULL)) {
+	if (ops_cpu_valid(sch, cpu, NULL)) {
 		struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
 		struct rq_flags rf;
 
@@ -7329,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
 		 * to the corresponding CPU to prevent ABBA deadlocks.
 		 */
 		if (locked_rq && rq != locked_rq) {
-			scx_kf_error("Invalid target CPU %d", cpu);
+			scx_error(sch, "Invalid target CPU %d", cpu);
 			return;
 		}
 
@@ -7424,12 +6418,75 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
  */
 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
 {
-	if (!kf_cpu_valid(cpu, NULL))
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return NULL;
+
+	if (!ops_cpu_valid(sch, cpu, NULL))
 		return NULL;
 
+	if (!sch->warned_deprecated_rq) {
+		printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+				"use scx_bpf_locked_rq() when holding rq lock "
+				"or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+		sch->warned_deprecated_rq = true;
+	}
+
 	return cpu_rq(cpu);
 }
 
+/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
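+ *
+ * For reading a remote CPU's current task, see scx_bpf_cpu_curr() below;
+ * a sketch of migrating off the deprecated scx_bpf_cpu_rq(cpu)->curr
+ * (illustrative; bpf_rcu_read_lock()/bpf_rcu_read_unlock() are the
+ * standard BPF RCU kfuncs):
+ *
+ *	bpf_rcu_read_lock();
+ *	p = scx_bpf_cpu_curr(cpu);
+ *	if (p)
+ *		... inspect p ...
+ *	bpf_rcu_read_unlock();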
+ */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + + rq = scx_locked_rq(); + if (!rq) { + scx_error(sch, "accessing rq without holding rq lock"); + return NULL; + } + + return rq; +} + +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). + */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + return rcu_dereference(cpu_rq(cpu)->curr); +} + /** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest @@ -7446,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); @@ -7528,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); @@ -7594,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 292bb41a242ec1..43429b33e52c11 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,29 +8,6 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT -static inline bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - -static inline bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - -DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); - -DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. 
- */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(scx_locked_rq_state); -} - void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); void scx_cgroup_move_task(struct task_struct *p); -void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); @@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } static inline void scx_cgroup_move_task(struct task_struct *p) {} -static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392c7..d2434c954848ef 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -819,10 +819,10 @@ void scx_idle_disable(void) * Helpers that can be called from the BPF scheduler. */ -static int validate_node(int node) +static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is disabled"); + scx_error(sch, "per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -832,13 +832,13 @@ static int validate_node(int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_kf_error("invalid node %d", node); + scx_error(sch, "invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_kf_error("unavailable node %d", node); + scx_error(sch, "unavailable node %d", node); return -EINVAL; } @@ -847,26 +847,53 @@ static int validate_node(int node) __bpf_kfunc_start_defs(); -static bool check_builtin_idle_enabled(void) +static bool check_builtin_idle_enabled(struct scx_sched *sch) { if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_kf_error("built-in idle tracking is disabled"); + scx_error(sch, "built-in idle tracking is disabled"); return false; } -static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. 
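+ *
+ * A concrete illustration (assumed, not exhaustive): in ops.tick(), the
+ * current task always shows p->migration_disabled >= 1 because of the BPF
+ * prolog. A count of exactly 1 on @p == current thus means migration was
+ * not disabled before entering the callback, while the same count on any
+ * other task means that task really is migration-disabled.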
+ */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + +static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, + s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { struct rq *rq; struct rq_flags rf; s32 cpu; - if (!kf_cpu_valid(prev_cpu, NULL)) + if (!ops_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; /* @@ -879,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f if (scx_kf_allowed_if_unlocked()) { rq = task_rq_lock(p, &rf); } else { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) return -EPERM; rq = scx_locked_rq(); } @@ -898,7 +925,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; @@ -922,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) - return NUMA_NO_NODE; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -946,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { + struct scx_sched *sch; s32 cpu; - cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } *is_idle = false; - return prev_cpu; } @@ -981,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { - return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, + cpus_allowed, flags); } /** @@ -995,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1011,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if 
(static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; return idle_cpumask(NUMA_NO_NODE)->cpu; @@ -1034,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1054,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; if (sched_smt_active()) @@ -1095,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!check_builtin_idle_enabled()) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return false; - if (!kf_cpu_valid(cpu, NULL)) + if (!check_builtin_idle_enabled(sch)) + return false; + + if (!ops_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); @@ -1126,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1158,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); @@ -1193,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { + struct scx_sched *sch; s32 cpu; - node = validate_node(node); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1233,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; s32 cpu; + 
guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch))
+		return -ENODEV;
+
 	if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
-		scx_kf_error("per-node idle tracking is enabled");
+		scx_error(sch, "per-node idle tracking is enabled");
 		return -EBUSY;
 	}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 00000000000000..b3617abed51081
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo
+ */
+#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+	SCX_DSP_DFL_MAX_BATCH		= 32,
+	SCX_DSP_MAX_LOOPS		= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
+
+	SCX_EXIT_BT_LEN			= 64,
+	SCX_EXIT_MSG_LEN		= 1024,
+	SCX_EXIT_DUMP_DFL_LEN		= 32768,
+
+	SCX_CPUPERF_ONE			= SCHED_CAPACITY_SCALE,
+
+	/*
+	 * Iterating all tasks may take a while. Periodically drop
+	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+	 */
+	SCX_TASK_ITER_BATCH		= 32,
+};
+
+enum scx_exit_kind {
+	SCX_EXIT_NONE,
+	SCX_EXIT_DONE,
+
+	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
+	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
+	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
+	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */
+
+	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
+	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
+	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ *   Bits: [63  ..  48 47   ..  32 31 .. 0]
+ *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
+ *
+ *   SYS ACT: System-defined exit actions
+ *   SYS RSN: System-defined exit reasons
+ *   USR    : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+	/* Reasons */
+	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,
+
+	/* Actions */
+	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
+};
+
+enum scx_exit_flags {
+	/*
+	 * ops.exit() may be called even if the loading failed before ops.init()
+	 * finishes successfully. This is because ops.exit() allows rich exit
+	 * info communication. The following flag indicates whether ops.init()
+	 * finished successfully.
+	 */
+	SCX_EFLAG_INITIALIZED	= 1LLU << 0,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+	/* %SCX_EXIT_* - broad category of the exit reason */
+	enum scx_exit_kind	kind;
+
+	/* exit code if gracefully exiting */
+	s64			exit_code;
+
+	/* %SCX_EFLAG_* */
+	u64			flags;
+
+	/* textual representation of the above */
+	const char		*reason;
+
+	/* backtrace if exiting due to an error */
+	unsigned long		*bt;
+	u32			bt_len;
+
+	/* informational message */
+	char			*msg;
+
+	/* debug dump */
+	char			*dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+	/*
+	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
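+	 * (With this flag, the built-in idle helpers such as
+	 * scx_bpf_select_cpu_dfl() and scx_bpf_pick_idle_cpu() remain usable
+	 * alongside a custom ops.update_idle(); see ops.update_idle() below.)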
+	 */
+	SCX_OPS_KEEP_BUILTIN_IDLE	= 1LLU << 0,
+
+	/*
+	 * By default, if there are no other tasks to run on the CPU, ext core
+	 * keeps running the current task even after its slice expires. If this
+	 * flag is specified, such tasks are passed to ops.enqueue() with
+	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+	 */
+	SCX_OPS_ENQ_LAST		= 1LLU << 1,
+
+	/*
+	 * An exiting task may schedule after PF_EXITING is set. In such cases,
+	 * bpf_task_from_pid() may not be able to find the task and if the BPF
+	 * scheduler depends on pid lookup for dispatching, the task will be
+	 * lost, leading to various issues including RCU grace period stalls.
+	 *
+	 * To mask this problem, by default, unhashed tasks are automatically
+	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+	 * depend on pid lookups and wants to handle these tasks directly, the
+	 * following flag can be used.
+	 */
+	SCX_OPS_ENQ_EXITING		= 1LLU << 2,
+
+	/*
+	 * If set, only tasks with policy set to SCHED_EXT are attached to
+	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+	 */
+	SCX_OPS_SWITCH_PARTIAL		= 1LLU << 3,
+
+	/*
+	 * A migration disabled task can only execute on its current CPU. By
+	 * default, such tasks are automatically put on the CPU's local DSQ with
+	 * the default slice on enqueue. If this ops flag is set, they also go
+	 * through ops.enqueue().
+	 *
+	 * A migration disabled task never invokes ops.select_cpu() as it can
+	 * only select the current CPU. Also, p->cpus_ptr will only contain its
+	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
+	 */
+	SCX_OPS_ENQ_MIGRATION_DISABLED	= 1LLU << 4,
+
+	/*
+	 * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+	 * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+	 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+	 * transfers. When this optimization is enabled, ops.select_cpu() is
+	 * skipped in some cases (when racing against the wakee switching out).
+	 * As the BPF scheduler may depend on ops.select_cpu() being invoked
+	 * during wakeups, queued wakeup is disabled by default.
+	 *
+	 * If this ops flag is set, queued wakeup optimization is enabled and
+	 * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+	 * wakee's CPU without preceding ops.select_cpu() even for tasks which
+	 * may be executed on multiple CPUs.
+	 */
+	SCX_OPS_ALLOW_QUEUED_WAKEUP	= 1LLU << 5,
+
+	/*
+	 * If set, enable per-node idle cpumasks. If clear, use a single global
+	 * flat idle cpumask.
+	 */
+	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
+
+	/*
+	 * CPU cgroup support flags
+	 */
+	SCX_OPS_HAS_CGROUP_WEIGHT	= 1LLU << 16,	/* DEPRECATED, will be removed in 6.18 */
+
+	SCX_OPS_ALL_FLAGS		= SCX_OPS_KEEP_BUILTIN_IDLE |
+					  SCX_OPS_ENQ_LAST |
+					  SCX_OPS_ENQ_EXITING |
+					  SCX_OPS_ENQ_MIGRATION_DISABLED |
+					  SCX_OPS_ALLOW_QUEUED_WAKEUP |
+					  SCX_OPS_SWITCH_PARTIAL |
+					  SCX_OPS_BUILTIN_IDLE_PER_NODE |
+					  SCX_OPS_HAS_CGROUP_WEIGHT,
+
+	/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+	__SCX_OPS_INTERNAL_MASK		= 0xffLLU << 56,
+
+	SCX_OPS_HAS_CPU_PREEMPT		= 1LLU << 56,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+	/*
+	 * Set if ops.init_task() is being invoked on the fork path, as opposed
+	 * to the scheduler transition path.
+ */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. 
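+	 *
+	 * A minimal FIFO sketch (illustrative; BPF_STRUCT_OPS is the helper
+	 * from the userspace scx headers and SHARED_DSQ is a hypothetical
+	 * scheduler-created DSQ):
+	 *
+	 *	void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p,
+	 *			    u64 enq_flags)
+	 *	{
+	 *		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL,
+	 *				   enq_flags);
+	 *	}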
+ */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. 
This can happen when a task
+	 * property is changed (e.g., affinity), since set_next_task_scx(),
+	 * which triggers this callback, may run on a CPU different from
+	 * the task's assigned CPU.
+	 *
+	 * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+	 * target CPU the task is going to use.
+	 *
+	 * See ->runnable() for explanation on the task state notifiers.
+	 */
+	void (*running)(struct task_struct *p);
+
+	/**
+	 * @stopping: A task is stopping execution
+	 * @p: task stopping to run
+	 * @runnable: is task @p still runnable?
+	 *
+	 * Note that this callback may be called from a CPU other than the
+	 * one the task was running on. This can happen when a task
+	 * property is changed (e.g., affinity), since dequeue_task_scx(),
+	 * which triggers this callback, may run on a CPU different from
+	 * the task's assigned CPU.
+	 *
+	 * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+	 * the task was running on.
+	 *
+	 * See ->runnable() for explanation on the task state notifiers. If
+	 * !@runnable, ->quiescent() will be invoked after this operation
+	 * returns.
+	 */
+	void (*stopping)(struct task_struct *p, bool runnable);
+
+	/**
+	 * @quiescent: A task is becoming not runnable on its associated CPU
+	 * @p: task becoming not runnable
+	 * @deq_flags: %SCX_DEQ_*
+	 *
+	 * See ->runnable() for explanation on the task state notifiers.
+	 *
+	 * @p is becoming quiescent on the CPU because it's
+	 *
+	 * - sleeping (%SCX_DEQ_SLEEP)
+	 * - being moved to another CPU
+	 * - being temporarily taken off the queue for an attribute change
+	 *   (%SCX_DEQ_SAVE)
+	 *
+	 * This and ->dequeue() are related but not coupled. This operation
+	 * notifies @p's state transition and may not be preceded by ->dequeue()
+	 * e.g. when @p is being dispatched to a remote CPU.
+	 */
+	void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+	/**
+	 * @yield: Yield CPU
+	 * @from: yielding task
+	 * @to: optional yield target task
+	 *
+	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+	 * The BPF scheduler should ensure that other available tasks are
+	 * dispatched before the yielding task. Return value is ignored in this
+	 * case.
+	 *
+	 * If @to is non-NULL, @from wants to yield the CPU to @to. If the bpf
+	 * scheduler can implement the request, return %true; otherwise, %false.
+	 */
+	bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+	/**
+	 * @core_sched_before: Task ordering for core-sched
+	 * @a: task A
+	 * @b: task B
+	 *
+	 * Used by core-sched to determine the ordering between two tasks. See
+	 * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+	 * core-sched.
+	 *
+	 * Both @a and @b are runnable and may or may not currently be queued on
+	 * the BPF scheduler. Should return %true if @a should run before @b.
+	 * %false if there's no required ordering or @b should run before @a.
+	 *
+	 * If not specified, the default is ordering them according to when they
+	 * became runnable.
+	 */
+	bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+	/**
+	 * @set_weight: Set task weight
+	 * @p: task to set weight for
+	 * @weight: new weight [1..10000]
+	 *
+	 * Update @p's weight to @weight.
+	 */
+	void (*set_weight)(struct task_struct *p, u32 weight);
+
+	/**
+	 * @set_cpumask: Set CPU affinity
+	 * @p: task to set CPU affinity for
+	 * @cpumask: cpumask of cpus that @p can run on
+	 *
+	 * Update @p's CPU affinity to @cpumask.
+	 */
+	void (*set_cpumask)(struct task_struct *p,
+			    const struct cpumask *cpumask);
+
+	/**
+	 * @update_idle: Update the idle state of a CPU
+	 * @cpu: CPU to update the idle state for
+	 * @idle: whether entering or exiting the idle state
+	 *
+	 * This operation is called when @cpu enters or leaves the idle
+	 * state. By default, implementing this operation disables the built-in
+	 * idle CPU tracking and the following helpers become unavailable:
+	 *
+	 * - scx_bpf_select_cpu_dfl()
+	 * - scx_bpf_select_cpu_and()
+	 * - scx_bpf_test_and_clear_cpu_idle()
+	 * - scx_bpf_pick_idle_cpu()
+	 *
+	 * The user also must implement ops.select_cpu() as the default
+	 * implementation relies on scx_bpf_select_cpu_dfl().
+	 *
+	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+	 * tracking.
+	 */
+	void (*update_idle)(s32 cpu, bool idle);
+
+	/**
+	 * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+	 * @cpu: The CPU being acquired by the BPF scheduler.
+	 * @args: Acquire arguments, see the struct definition.
+	 *
+	 * A CPU that was previously released from the BPF scheduler is now once
+	 * again under its control.
+	 */
+	void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+	/**
+	 * @cpu_release: A CPU is taken away from the BPF scheduler
+	 * @cpu: The CPU being released by the BPF scheduler.
+	 * @args: Release arguments, see the struct definition.
+	 *
+	 * The specified CPU is no longer under the control of the BPF
+	 * scheduler. This could be because it was preempted by a higher
+	 * priority sched_class, though there may be other reasons as well. The
+	 * caller should consult @args->reason to determine the cause.
+	 */
+	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+	/**
+	 * @init_task: Initialize a task to run in a BPF scheduler
+	 * @p: task to initialize for BPF scheduling
+	 * @args: init arguments, see the struct definition
+	 *
+	 * Either we're loading a BPF scheduler or a new task is being forked.
+	 * Initialize @p for BPF scheduling. This operation may block and can
+	 * be used for allocations, and is called exactly once for a task.
+	 *
+	 * Return 0 for success, -errno for failure. An error return while
+	 * loading will abort loading of the BPF scheduler. During a fork, it
+	 * will abort that specific fork.
+	 */
+	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+	/**
+	 * @exit_task: Exit a previously-running task from the system
+	 * @p: task to exit
+	 * @args: exit arguments, see the struct definition
+	 *
+	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
+	 * necessary cleanup for @p.
+	 */
+	void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+	/**
+	 * @enable: Enable BPF scheduling for a task
+	 * @p: task to enable BPF scheduling for
+	 *
+	 * Enable @p for BPF scheduling. enable() is called on @p any time it
+	 * enters SCX, and is always paired with a matching disable().
+	 */
+	void (*enable)(struct task_struct *p);
+
+	/**
+	 * @disable: Disable BPF scheduling for a task
+	 * @p: task to disable BPF scheduling for
+	 *
+	 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+	 * Disable BPF scheduling for @p. A disable() call is always matched
+	 * with a prior enable() call.
+	 */
+	void (*disable)(struct task_struct *p);
+
+	/**
+	 * @dump: Dump BPF scheduler state on error
+	 * @ctx: debug dump context
+	 *
+	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
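+	 *
+	 * Sketch (illustrative; scx_bpf_dump() is the wrapper from the
+	 * userspace scx headers, and nr_queued is hypothetical scheduler
+	 * state):
+	 *
+	 *	void BPF_STRUCT_OPS(myops_dump, struct scx_dump_ctx *dctx)
+	 *	{
+	 *		scx_bpf_dump("nr_queued=%llu", nr_queued);
+	 *	}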
+	 */
+	void (*dump)(struct scx_dump_ctx *ctx);
+
+	/**
+	 * @dump_cpu: Dump BPF scheduler state for a CPU on error
+	 * @ctx: debug dump context
+	 * @cpu: CPU to generate debug dump for
+	 * @idle: @cpu is currently idle without any runnable tasks
+	 *
+	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+	 * @cpu. If @idle is %true and this operation doesn't produce any
+	 * output, @cpu is skipped for dump.
+	 */
+	void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+	/**
+	 * @dump_task: Dump BPF scheduler state for a runnable task on error
+	 * @ctx: debug dump context
+	 * @p: runnable task to generate debug dump for
+	 *
+	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+	 * @p.
+	 */
+	void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+	/**
+	 * @cgroup_init: Initialize a cgroup
+	 * @cgrp: cgroup being initialized
+	 * @args: init arguments, see the struct definition
+	 *
+	 * Either the BPF scheduler is being loaded or @cgrp created, initialize
+	 * @cgrp for sched_ext. This operation may block.
+	 *
+	 * Return 0 for success, -errno for failure. An error return while
+	 * loading will abort loading of the BPF scheduler. During cgroup
+	 * creation, it will abort the specific cgroup creation.
+	 */
+	s32 (*cgroup_init)(struct cgroup *cgrp,
+			   struct scx_cgroup_init_args *args);
+
+	/**
+	 * @cgroup_exit: Exit a cgroup
+	 * @cgrp: cgroup being exited
+	 *
+	 * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+	 * @cgrp for sched_ext. This operation may block.
+	 */
+	void (*cgroup_exit)(struct cgroup *cgrp);
+
+	/**
+	 * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+	 * @p: task being moved
+	 * @from: cgroup @p is being moved from
+	 * @to: cgroup @p is being moved to
+	 *
+	 * Prepare @p for move from cgroup @from to @to. This operation may
+	 * block and can be used for allocations.
+	 *
+	 * Return 0 for success, -errno for failure. An error return aborts the
+	 * migration.
+	 */
+	s32 (*cgroup_prep_move)(struct task_struct *p,
+				struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * @cgroup_move: Commit cgroup move
+	 * @p: task being moved
+	 * @from: cgroup @p is being moved from
+	 * @to: cgroup @p is being moved to
+	 *
+	 * Commit the move. @p is dequeued during this operation.
+	 */
+	void (*cgroup_move)(struct task_struct *p,
+			    struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * @cgroup_cancel_move: Cancel cgroup move
+	 * @p: task whose cgroup move is being canceled
+	 * @from: cgroup @p was being moved from
+	 * @to: cgroup @p was being moved to
+	 *
+	 * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+	 * Undo the preparation.
+	 */
+	void (*cgroup_cancel_move)(struct task_struct *p,
+				   struct cgroup *from, struct cgroup *to);
+
+	/**
+	 * @cgroup_set_weight: A cgroup's weight is being changed
+	 * @cgrp: cgroup whose weight is being updated
+	 * @weight: new weight [1..10000]
+	 *
+	 * Update @cgrp's weight to @weight.
+	 */
+	void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+	/**
+	 * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+	 * @cgrp: cgroup whose bandwidth is being updated
+	 * @period_us: bandwidth control period
+	 * @quota_us: bandwidth control quota
+	 * @burst_us: bandwidth control burst
+	 *
+	 * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+	 * cgroup interface.
+	 *
+	 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+	 * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+	 * interpreted in the same fashion and specifies how much @cgrp can
+	 * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+	 * scheduler.
+	 */
+	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+				     u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif	/* CONFIG_EXT_GROUP_SCHED */
+
+	/*
+	 * All online ops must come before ops.cpu_online().
+	 */
+
+	/**
+	 * @cpu_online: A CPU became online
+	 * @cpu: CPU which just came up
+	 *
+	 * @cpu just came online. @cpu will not call ops.enqueue() or
+	 * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+	 */
+	void (*cpu_online)(s32 cpu);
+
+	/**
+	 * @cpu_offline: A CPU is going offline
+	 * @cpu: CPU which is going offline
+	 *
+	 * @cpu is going offline. @cpu will not call ops.enqueue() or
+	 * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+	 */
+	void (*cpu_offline)(s32 cpu);
+
+	/*
+	 * All CPU hotplug ops must come before ops.init().
+	 */
+
+	/**
+	 * @init: Initialize the BPF scheduler
+	 */
+	s32 (*init)(void);
+
+	/**
+	 * @exit: Clean up after the BPF scheduler
+	 * @info: Exit info
+	 *
+	 * ops.exit() is also called on ops.init() failure, which is a bit
+	 * unusual. This is to allow rich reporting through @info on how
+	 * ops.init() failed.
+	 */
+	void (*exit)(struct scx_exit_info *info);
+
+	/**
+	 * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+	 */
+	u32 dispatch_max_batch;
+
+	/**
+	 * @flags: %SCX_OPS_* flags
+	 */
+	u64 flags;
+
+	/**
+	 * @timeout_ms: The maximum amount of time, in milliseconds, that a
+	 * runnable task should be able to wait before being scheduled. The
+	 * maximum timeout may not exceed the default timeout of 30 seconds.
+	 *
+	 * Defaults to the maximum allowed timeout value of 30 seconds.
+	 */
+	u32 timeout_ms;
+
+	/**
+	 * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+	 * value of 32768 is used.
+	 */
+	u32 exit_dump_len;
+
+	/**
+	 * @hotplug_seq: A sequence number that may be set by the scheduler to
+	 * detect when a hotplug event has occurred during the loading process.
+	 * If 0, no detection occurs. Otherwise, the scheduler will fail to
+	 * load if the sequence number does not match @scx_hotplug_seq on the
+	 * enable path.
+	 */
+	u64 hotplug_seq;
+
+	/**
+	 * @name: BPF scheduler's name
+	 *
+	 * Must be a non-zero valid BPF object name including only isalnum(),
+	 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+	 * BPF scheduler is enabled.
+	 */
+	char name[SCX_OPS_NAME_LEN];
+
+	/* internal use only, must be NULL */
+	void *priv;
+};
+
+enum scx_opi {
+	SCX_OPI_BEGIN			= 0,
+	SCX_OPI_NORMAL_BEGIN		= 0,
+	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
+	SCX_OPI_END			= SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+	/*
+	 * If ops.select_cpu() returns a CPU which can't be used by the task,
+	 * the core scheduler code silently picks a fallback CPU.
+	 */
+	s64		SCX_EV_SELECT_CPU_FALLBACK;
+
+	/*
+	 * When dispatching to a local DSQ, the CPU may have gone offline in
+	 * the meantime. In this case, the task is bounced to the global DSQ.
+	 */
+	s64		SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+	/*
+	 * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+	 * continued to run because there were no other tasks on the CPU.
+	 */
+	s64		SCX_EV_DISPATCH_KEEP_LAST;
+
+	/*
+	 * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+	 * is dispatched to a local DSQ when exiting.
+	 */
+	s64		SCX_EV_ENQ_SKIP_EXITING;
+
+	/*
+	 * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+	 * migration disabled task skips ops.enqueue() and is dispatched to its
+	 * local DSQ.
+	 */
+	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+	/*
+	 * Total number of times a task's time slice was refilled with the
+	 * default value (SCX_SLICE_DFL).
+	 */
+	s64		SCX_EV_REFILL_SLICE_DFL;
+
+	/*
+	 * The total duration of bypass modes in nanoseconds.
+	 */
+	s64		SCX_EV_BYPASS_DURATION;
+
+	/*
+	 * The number of tasks dispatched in the bypassing mode.
+	 */
+	s64		SCX_EV_BYPASS_DISPATCH;
+
+	/*
+	 * The number of times the bypassing mode has been activated.
+	 */
+	s64		SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+	/*
+	 * The event counters are in a per-CPU variable to minimize the
+	 * accounting overhead. A system-wide view on the event counter is
+	 * constructed when requested by scx_bpf_events().
+	 */
+	struct scx_event_stats	event_stats;
+};
+
+struct scx_sched {
+	struct sched_ext_ops	ops;
+	DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+	/*
+	 * Dispatch queues.
+	 *
+	 * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+	 * This is to avoid live-locking in bypass mode where all tasks are
+	 * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+	 * per-node split isn't sufficient, it can be further split.
+	 */
+	struct rhashtable	dsq_hash;
+	struct scx_dispatch_q	**global_dsqs;
+	struct scx_sched_pcpu __percpu *pcpu;
+
+	bool			warned_zero_slice:1;
+	bool			warned_deprecated_rq:1;
+
+	atomic_t		exit_kind;
+	struct scx_exit_info	*exit_info;
+
+	struct kobject		kobj;
+
+	struct kthread_worker	*helper;
+	struct irq_work		error_irq_work;
+	struct kthread_work	disable_work;
+	struct rcu_work		rcu_work;
+};
+
+enum scx_wake_flags {
+	/* expose select WF_* flags as enums */
+	SCX_WAKE_FORK		= WF_FORK,
+	SCX_WAKE_TTWU		= WF_TTWU,
+	SCX_WAKE_SYNC		= WF_SYNC,
+};
+
+enum scx_enq_flags {
+	/* expose select ENQUEUE_* flags as enums */
+	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
+	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
+	SCX_ENQ_CPU_SELECTED	= ENQUEUE_RQ_SELECTED,
+
+	/* high 32bits are SCX specific */
+
+	/*
+	 * Set the following to trigger preemption when calling
+	 * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+	 * current task is cleared to zero and the CPU is kicked into the
+	 * scheduling path. Implies %SCX_ENQ_HEAD.
+	 */
+	SCX_ENQ_PREEMPT		= 1LLU << 32,
+
+	/*
+	 * The task being enqueued was previously enqueued on the current CPU's
+	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+	 * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+	 * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+	 * task will not be scheduled on the CPU until at least the next
+	 * invocation of the ->cpu_acquire() callback.
+	 */
+	SCX_ENQ_REENQ		= 1LLU << 40,
+
+	/*
+	 * The task being enqueued is the only task available for the cpu. By
+	 * default, ext core keeps executing such tasks but when
+	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+	 * %SCX_ENQ_LAST flag set.
+	 *
+	 * The BPF scheduler is responsible for triggering a follow-up
+	 * scheduling event. Otherwise, execution may stall.
+	 */
+	SCX_ENQ_LAST		= 1LLU << 41,
+
+	/* high 8 bits are internal */
+	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,
+
+	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
+	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
+};
+
+enum scx_deq_flags {
+	/* expose select DEQUEUE_* flags as enums */
+	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
+
+	/* high 32bits are SCX specific */
+
+	/*
+	 * The generic core-sched layer decided to execute the task even though
+	 * it hasn't been dispatched yet. Dequeue from the BPF side.
+	 */
+	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
+	SCX_PICK_IDLE_IN_NODE	= 1LLU << 1,	/* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+	/*
+	 * Kick the target CPU if idle. Guarantees that the target CPU goes
+	 * through at least one full scheduling cycle before going idle. If the
+	 * target CPU can be determined to be currently not idle and going to go
+	 * through a scheduling cycle before going idle, noop.
+	 */
+	SCX_KICK_IDLE		= 1LLU << 0,
+
+	/*
+	 * Preempt the current task and execute the dispatch path. If the
+	 * current task of the target CPU is an SCX task, its ->scx.slice is
+	 * cleared to zero before the scheduling path is invoked so that the
+	 * task expires and the dispatch path is invoked.
+	 */
+	SCX_KICK_PREEMPT	= 1LLU << 1,
+
+	/*
+	 * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+	 * return after the target CPU finishes picking the next task.
+	 */
+	SCX_KICK_WAIT		= 1LLU << 2,
+};
+
+enum scx_tg_flags {
+	SCX_TG_ONLINE		= 1U << 0,
+	SCX_TG_INITED		= 1U << 1,
+};
+
+enum scx_enable_state {
+	SCX_ENABLING,
+	SCX_ENABLED,
+	SCX_DISABLING,
+	SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+	[SCX_ENABLING]		= "enabling",
+	[SCX_ENABLED]		= "enabled",
+	[SCX_DISABLING]		= "disabling",
+	[SCX_DISABLED]		= "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ *   ^              |                 |
+ *   |              v                 v
+ *   \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+	SCX_OPSS_NONE,		/* owned by the SCX core */
+	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
+	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
+	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */
+
+	/*
+	 * QSEQ brands each QUEUED instance so that, when dispatch races
+	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
+	 * on the task being dispatched.
+	 *
+	 * As some 32bit archs can't do 64bit store_release/load_acquire,
+	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+	 * 32bit machines. 
The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c2e..3a89f949e30770 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ static void task_numa_work(struct callback_head *work) } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; @@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); @@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } +#endif } } @@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. 
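The SCX_OPSS_STATE_MASK/SCX_OPSS_QSEQ_MASK macros above carve one word into a 2-bit ownership state plus a generation counter (QSEQ), so a dispatcher can tell whether its claim on a QUEUED task went stale across a dequeue/requeue race. A minimal standalone sketch of that packing follows; the helper names are invented, and the kernel's acquire/release atomics are elided:

/* Model of packing an ownership state plus a generation count (QSEQ). */
#include <stdio.h>

enum { OPSS_NONE, OPSS_QUEUEING, OPSS_QUEUED, OPSS_DISPATCHING };

#define QSEQ_SHIFT	2UL
#define STATE_MASK	((1UL << QSEQ_SHIFT) - 1)	/* low 2 bits: state */
#define QSEQ_MASK	(~STATE_MASK)			/* the rest: QSEQ */

static unsigned long opss_pack(unsigned long state, unsigned long qseq)
{
	return (qseq << QSEQ_SHIFT) | state;
}

int main(void)
{
	unsigned long qseq = 41;
	unsigned long claimed = opss_pack(OPSS_QUEUED, qseq);
	unsigned long requeued = opss_pack(OPSS_QUEUED, ++qseq); /* requeue bumps QSEQ */

	/* A dispatcher that cached the old word notices its claim is stale. */
	printf("stale claim: %s\n", claimed == requeued ? "no" : "yes");
	printf("state=%lu qseq=%lu\n",
	       requeued & STATE_MASK, (requeued & QSEQ_MASK) >> QSEQ_SHIFT);
	return 0;
}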
*/ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); +static void throttle_cfs_rq_work(struct callback_head *work) +{ + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ + p->throttled = true; + resched_curr(rq); + } +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + /* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. 
 */
-static inline int throttled_lb_pair(struct task_group *tg,
-				    int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+	WARN_ON_ONCE(p->se.on_rq);
+	list_del_init(&p->throttle_node);
+
+	/* task blocked after throttled */
+	if (flags & DEQUEUE_SLEEP) {
+		p->throttled = false;
+		return;
+	}
+
+	/*
+	 * task is migrating off its old cfs_rq, detach
+	 * the task's load from its old cfs_rq.
+	 */
+	if (task_on_rq_migrating(p))
+		detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
 {
-	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
 
-	src_cfs_rq = tg->cfs_rq[src_cpu];
-	dest_cfs_rq = tg->cfs_rq[dest_cpu];
+	/* @p should have gone through dequeue_throttled_task() first */
+	WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+	/*
+	 * If the throttled task @p is enqueued to a throttled cfs_rq,
+	 * take the fast path by directly putting the task on the
+	 * target cfs_rq's limbo list.
+	 *
+	 * Do not do that when @p is current because the following race can
+	 * cause @p's group_node to be incorrectly re-inserted in its rq's
+	 * cfs_tasks list, despite being throttled:
+	 *
+	 * cpuX                      cpuY
+	 * p ret2user
+	 * throttle_cfs_rq_work()    sched_move_task(p)
+	 * LOCK task_rq_lock
+	 * dequeue_task_fair(p)
+	 * UNLOCK task_rq_lock
+	 *                           LOCK task_rq_lock
+	 *                           task_current_donor(p) == true
+	 *                           task_on_rq_queued(p) == true
+	 *                           dequeue_task(p)
+	 *                           put_prev_task(p)
+	 *                           sched_change_group()
+	 *                           enqueue_task(p) -> p's new cfs_rq
+	 *                                              is throttled, go
+	 *                                              fast path and skip
+	 *                                              actual enqueue
+	 *                           set_next_task(p)
+	 *                             list_move(&se->group_node, &rq->cfs_tasks); // bug
+	 * schedule()
+	 *
+	 * In the above race case, @p's current cfs_rq is in the same rq as
+	 * its previous cfs_rq because sched_move_task() only moves a task
+	 * to a different group from the same rq, so we can use its current
+	 * cfs_rq to derive rq and test if the task is current.
+	 */
+	if (throttled_hierarchy(cfs_rq) &&
+	    !task_current_donor(rq_of(cfs_rq), p)) {
+		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+		return true;
+	}
 
-	return throttled_hierarchy(src_cfs_rq) ||
-	       throttled_hierarchy(dest_cfs_rq);
+	/* we can't take the fast path, do an actual enqueue */
+	p->throttled = false;
+	return false;
 }
 
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p, *tmp;
+
+	if (--cfs_rq->throttle_count)
+		return 0;
 
-	cfs_rq->throttle_count--;
-	if (!cfs_rq->throttle_count) {
+	if (cfs_rq->pelt_clock_throttled) {
 		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 					     cfs_rq->throttled_clock_pelt;
+		cfs_rq->pelt_clock_throttled = 0;
+	}
 
-		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq))
-			list_add_leaf_cfs_rq(cfs_rq);
+	if (cfs_rq->throttled_clock_self) {
+		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
 
-		if (cfs_rq->throttled_clock_self) {
-			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+		cfs_rq->throttled_clock_self = 0;
 
-			cfs_rq->throttled_clock_self = 0;
+		if (WARN_ON_ONCE((s64)delta < 0))
+			delta = 0;
 
-			if (WARN_ON_ONCE((s64)delta < 0))
-				delta = 0;
+		cfs_rq->throttled_clock_self_time += delta;
+	}
 
-			cfs_rq->throttled_clock_self_time += delta;
-		}
+	/* Re-enqueue the tasks that have been throttled at this level.
*/ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttle_count++) + return 0; - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. 
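tg_throttle_down() and tg_unthrottle_up() above only do real work on the 0->1 and 1->0 edges of throttle_count, and the final unthrottle drains the limbo list back into the runqueue. A toy standalone model of both pieces; the names and list representation are invented here, while the kernel uses list_head and walks the task-group tree:

/* Toy model: nested throttle counting plus a limbo list of parked tasks. */
#include <stdio.h>

struct task {
	const char *name;
	struct task *next;	/* models p->throttle_node linkage */
};

static int throttle_count;	/* models cfs_rq->throttle_count */
static struct task *limbo;	/* models cfs_rq->throttled_limbo_list */

static void throttle_park(struct task *p)
{
	p->next = limbo;	/* task dequeues itself into limbo */
	limbo = p;
}

static void throttle_down(void)
{
	if (throttle_count++)	/* already throttled at an outer level */
		return;
	printf("0 -> 1: stop the PELT clock, leave the leaf list\n");
}

static void unthrottle_up(void)
{
	if (--throttle_count)	/* still throttled at some outer level */
		return;
	while (limbo) {		/* last level cleared: re-enqueue everyone */
		struct task *p = limbo;

		limbo = p->next;
		printf("re-enqueue %s\n", p->name);
	}
}

int main(void)
{
	struct task a = { "a", NULL };

	throttle_down();	/* outer group runs out of quota */
	throttle_down();	/* nested group runs out of quota too */
	throttle_park(&a);	/* task hits ret2user and parks itself */
	unthrottle_up();	/* inner replenish: nothing happens yet */
	unthrottle_up();	/* outer replenish: a becomes runnable */
	return 0;
}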
-	 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list. rq->lock protects completion.
 	 */
 	cfs_rq->throttled = 1;
 	WARN_ON_ONCE(cfs_rq->throttled_clock);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock = rq_clock(rq);
 	return true;
 }
 
@@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+	/*
+	 * It's possible we are called with !runtime_remaining due to things
+	 * like the user changing the quota setting (see tg_set_cfs_bandwidth())
+	 * or an async unthrottle leaving us positive runtime_remaining that
+	 * other still-running entities consumed before we reached here.
+	 *
+	 * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+	 * because any enqueue in tg_unthrottle_up() will immediately trigger a
+	 * throttle, which is not supposed to happen on the unthrottle path.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+		return;
 
 	se = cfs_rq->tg->se[cpu_of(rq)];
 
@@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
 			break;
 	}
-	goto unthrottle_throttle;
-	}
-
-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
-		/* Handle any unfinished DELAY_DEQUEUE business first.
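The early return added to unthrottle_cfs_rq() above refuses to unthrottle with no runtime left, since any enqueue in tg_unthrottle_up() would immediately re-throttle. The decision in isolation, with made-up numbers; the real code replenishes runtime from cfs_b under cfs_b->lock:

/* Sketch: unthrottle only makes sense with runtime left in this period. */
#include <stdio.h>

struct cfs_rq_model {
	int runtime_enabled;
	long long runtime_remaining;	/* ns */
};

static int can_unthrottle(const struct cfs_rq_model *rq)
{
	/* Mirrors: if (runtime_enabled && runtime_remaining <= 0) return; */
	return !rq->runtime_enabled || rq->runtime_remaining > 0;
}

int main(void)
{
	struct cfs_rq_model drained = { 1, 0 };	      /* others used the refill */
	struct cfs_rq_model refilled = { 1, 500000 }; /* 0.5ms left */

	printf("drained:  %s\n", can_unthrottle(&drained) ? "unthrottle" : "stay throttled");
	printf("refilled: %s\n", can_unthrottle(&refilled) ? "unthrottle" : "stay throttled");
	return 0;
}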
*/ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { return 0; } @@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. 
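The throttle hunks in this area keep accumulating throttled_clock_pelt_time so that PELT sees a clock that simply pauses while a cfs_rq is throttled and resumes on unthrottle (see the pelt_clock_throttled updates in enqueue_entity() and tg_unthrottle_up() earlier). The bookkeeping in isolation, as a standalone sketch with invented numbers:

/* Model of a PELT clock that pauses while a cfs_rq is throttled. */
#include <stdio.h>

static unsigned long long throttled_time;	/* total paused ns */
static unsigned long long throttled_at;		/* rq_clock_pelt at pause */
static int clock_paused;

static unsigned long long cfs_rq_clock(unsigned long long rq_clock_pelt)
{
	if (clock_paused)	/* frozen: report the pause point */
		return throttled_at - throttled_time;
	return rq_clock_pelt - throttled_time;
}

int main(void)
{
	/* Throttle at t=1000: freeze the clock. */
	clock_paused = 1;
	throttled_at = 1000;

	/* Unthrottle at t=1600: those 600ns never happened for PELT. */
	throttled_time += 1600 - throttled_at;
	clock_paused = 0;

	printf("pelt clock at t=2000: %llu\n", cfs_rq_clock(2000)); /* 1400 */
	return 0;
}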
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { @@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); @@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); @@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. 
 */
-	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+	if (task_is_throttled(p))
 		return;
 
 	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
@@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
+	struct task_struct *p;
+	bool throttled;
 
 again:
 	cfs_rq = &rq->cfs;
 	if (!cfs_rq->nr_queued)
 		return NULL;
 
+	throttled = false;
+
 	do {
 		/* Might not have done put_prev_entity() */
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);
 
-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		throttled |= check_cfs_rq_runtime(cfs_rq);
 
 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
@@ -8761,7 +8849,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	if (unlikely(throttled))
+		task_throttle_setup_work(p);
+	return p;
 }
 
 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
 	return pick_next_task_fair(rq, prev, NULL);
 }
 
-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
-	return !!dl_se->rq->cfs.nr_queued;
-}
-
 static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
 {
 	return pick_task_fair(dl_se->rq);
@@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq)
 
 	init_dl_entity(dl_se);
 
-	dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+	dl_server_init(dl_se, rq, fair_server_pick_task);
 }
 
 /*
@@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 
-	/* throttled hierarchies are not runnable */
-	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+	/* !se->on_rq also covers throttled task */
+	if (!se->on_rq)
 		return false;
 
 	/* Tell the scheduler that we'd really like se to run next. */
@@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) delayed dequeued unless we migrate load, or
-	 * 2) throttled_lb_pair, or
+	 * 2) target cfs_rq is in throttled hierarchy, or
	 * 3) cannot be migrated to this CPU due to cpus_ptr, or
 	 * 4) running (obviously), or
 	 * 5) are cache-hot on their current CPU, or
@@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
 		return 0;
 
-	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+	if (lb_throttled_hierarchy(p, env->dst_cpu))
 		return 0;
 
 	/*
@@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-	if (cfs_rq_throttled(cfs_rq))
-		return;
-
-	if (!throttled_hierarchy(cfs_rq))
+	/*
+	 * If a task gets attached to this cfs_rq and, before being queued,
+	 * gets migrated to another CPU due to reasons like an affinity
+	 * change, make sure this cfs_rq stays on the leaf cfs_rq list to
+	 * have that removed load decayed, or it can cause a fairness
+	 * problem.
+	 */
+	if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
 	/* Start to propagate at parent */
@@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
 
-		if (cfs_rq_throttled(cfs_rq))
-			break;
-
-		if (!throttled_hierarchy(cfs_rq))
+		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 }
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f20..f921302dc40fb4 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	u64 throttled;
 
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		throttled = U64_MAX;
 	else
 		throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
-	if (unlikely(cfs_rq->throttle_count))
+	if (unlikely(cfs_rq->pelt_clock_throttled))
 		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
 	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 00000000000000..a23747bbe25b4d
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/sched.h>
+#include "sched.h"
+
+int main(void)
+{
+	DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+	return 0;
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f751..1f5d07067f60a3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
  *
  * dl_se::rq -- runqueue we belong to.
  *
- * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
- *                              server when it runs out of tasks to run.
- *
  * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
  *                         returns NULL.
  *
  * dl_server_update() -- called from update_curr_common(), propagates runtime
  *                       to the server.
  *
- * dl_server_start()
- * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ * dl_server_start() -- start the server when it has tasks; it will stop
+ *                      automatically when there are no more tasks, per
+ *                      dl_se::server_pick() returning NULL.
+ *
+ * dl_server_stop() -- (force) stop the server; use when updating
+ *                     parameters.
 *
 * dl_server_init() -- initializes the server.
+ *
+ * When started the dl_server will (per dl_defer) schedule a timer for its
+ * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a
+ * server will run at the very end of its period.
+ *
+ * This is done such that any runtime from the target class can be accounted
+ * against the server -- through dl_server_update() above -- such that when it
+ * becomes time to run, it might already be out of runtime and get deferred
+ * until the next period. In this case dl_server_timer() will alternate
+ * between defer and replenish but never actually enqueue the server.
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actually starvation in the given period) will the dl_server get
+ * on the runqueue. Once queued it will pick tasks from the target class and
+ * run them until either its runtime is exhausted, at which point it's back
+ * to dl_server_timer, or until there are no more tasks to run, at which
+ * point the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour the start/stop behaviour is
+ * naturally throttled to once per period, avoiding high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
 */
 extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
 extern void dl_server_start(struct sched_dl_entity *dl_se);
 extern void dl_server_stop(struct sched_dl_entity *dl_se);
 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
-		    dl_server_has_tasks_f has_tasks,
 		    dl_server_pick_f pick_task);
 
 extern void sched_init_dl_servers(void);
@@ -735,10 +760,12 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
+	bool			throttled:1;
+	bool			pelt_clock_throttled:1;
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
+	struct list_head	throttled_limbo_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -1935,12 +1962,12 @@ extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *p, struct task_struct *t,
			int cpu, int scpu);
-extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void init_numa_balancing(u64 clone_flags, struct task_struct *p);
 
 #else /* !CONFIG_NUMA_BALANCING: */
 
 static inline void
-init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+init_numa_balancing(u64 clone_flags, struct task_struct *p)
 {
 }
 
@@ -2342,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE	0x800
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2658,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
 extern void init_dl_entity(struct sched_dl_entity *dl_se);
 
+extern void init_cfs_throttle_work(struct task_struct *p);
+
 #define BW_SHIFT		20
 #define BW_UNIT			(1 << BW_SHIFT)
 #define RATIO_SHIFT		8
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6e2f54169e66c0..444bdfdab73180 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 enum numa_topology_type sched_numa_topology_type;
 
 static int			sched_domains_numa_levels;
-static int			sched_domains_curr_level;
 int				sched_max_numa_distance;
 static int			*sched_domains_numa_distance;
@@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
 
-#ifdef CONFIG_NUMA
-	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
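Stepping back to the dl_server comment in sched.h above: the deferred server arms its timer at the zero-laxity point instead of running ASAP. That point is simply the latest start time that still fits the full budget into the period; a standalone sketch with invented numbers (the kernel works in deadline/runtime terms inside dl_server_timer()):

/* Model: a deferred server arms its timer at the zero-laxity point. */
#include <stdio.h>

struct dl_server {
	long long runtime;	/* budget per period, ns */
	long long period;	/* ns */
};

/*
 * Latest moment the server can start and still fit its full budget
 * before the period ends: laxity becomes zero here.
 */
static long long zero_laxity(const struct dl_server *s, long long period_start)
{
	return period_start + s->period - s->runtime;
}

int main(void)
{
	struct dl_server fair_server = { 3000000, 25000000 }; /* 3ms per 25ms */
	long long t0 = 0;

	printf("defer timer to t=%lldns\n", zero_laxity(&fair_server, t0));
	/* If fair tasks already consumed >= 3ms by then, stay deferred. */
	return 0;
}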
- */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2413,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. 
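Passing the topology level itself to the mask callback, tl->mask(tl, cpu), is what lets sd_numa_mask() read tl->numa_level directly instead of the removed sched_domains_curr_level global. A minimal standalone model of that shape, with invented types and data:

/* Model: mask callbacks receive their topology level, so no global state. */
#include <stdio.h>

struct tl;	/* forward declaration for the callback type */
typedef unsigned (*mask_fn)(const struct tl *tl, int cpu);

struct tl {
	const char *name;
	int numa_level;
	mask_fn mask;
};

/* Pretend per-level NUMA masks: masks[level][cpu] */
static const unsigned numa_masks[2][4] = {
	{ 0x1, 0x1, 0x2, 0x2 },	/* level 0: two nodes */
	{ 0x3, 0x3, 0x3, 0x3 },	/* level 1: whole system */
};

static unsigned numa_mask(const struct tl *tl, int cpu)
{
	return numa_masks[tl->numa_level][cpu];	/* no curr_level global */
}

int main(void)
{
	struct tl levels[] = {
		{ "NUMA0", 0, numa_mask },
		{ "NUMA1", 1, numa_mask },
	};

	for (int i = 0; i < 2; i++)
		printf("%s: cpu2 mask = 0x%x\n", levels[i].name,
		       levels[i].mask(&levels[i], 2));
	return 0;
}

Carrying the level through the callback makes the lookup self-contained, so nothing depends on the order in which levels are initialized.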
*/ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2421,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738ce..25f62867a16d93 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ }; @@ -1139,7 +1157,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { - return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; + return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, @@ -1186,13 +1204,11 @@ static int seccomp_do_user_notification(int this_syscall, if (err != 0) { /* - * Check to see if the notifcation got picked up and - * whether we should switch to wait killable. + * Check to see whether we should switch to wait + * killable. Only return the interrupted error if not. */ - if (!wait_killable && should_sleep_killable(match, &n)) - continue; - - goto interrupted; + if (!(!wait_killable && should_sleep_killable(match, &n))) + goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, diff --git a/kernel/smp.c b/kernel/smp.c index 56f83aa58ec82f..02f52291fae425 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -884,16 +884,15 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait - * (atomically) until function has completed on other CPUs. 
If - * %SCF_RUN_LOCAL is set, the function will also be run locally - * if the local CPU is set in the @cpumask. - * - * If @wait is true, then returns once @func has returned. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. + * + * @func is not called on the local CPU even if @mask contains it. Consider + * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) diff --git a/kernel/softirq.c b/kernel/softirq.c index 513b1945987cc6..77198911b8dd4b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { - local_lock(&softirq_ctrl.lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + local_lock(&softirq_ctrl.lock); + else + migrate_disable(); + /* Required to meet the RCU bottomhalf requirements. */ rcu_read_lock(); } else { @@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) * Track the per CPU softirq disabled state. On RT this is per CPU * state to allow preemption of bottom half disabled sections. */ - newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); - /* - * Reflect the result in the task state to prevent recursion on the - * local lock and to make softirq_count() & al work. - */ - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. 
+ */ + current->softirq_disable_cnt = newcnt; - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { - raw_local_irq_save(flags); - lockdep_softirqs_off(ip); - raw_local_irq_restore(flags); + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } + } else { + bool sirq_dis = false; + + if (!current->softirq_disable_cnt) + sirq_dis = true; + + this_cpu_add(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt += cnt; + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt, bool unlock) { unsigned long flags; + bool sirq_en = false; int newcnt; - DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != - this_cpu_read(softirq_ctrl.cnt)); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + if (softirq_count() == cnt) + sirq_en = true; + } else { + if (current->softirq_disable_cnt == cnt) + sirq_en = true; + } - if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) { raw_local_irq_save(flags); lockdep_softirqs_on(_RET_IP_); raw_local_irq_restore(flags); } - newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); - current->softirq_disable_cnt = newcnt; + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) { + newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; - if (!newcnt && unlock) { - rcu_read_unlock(); - local_unlock(&softirq_ctrl.lock); + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } + } else { + current->softirq_disable_cnt -= cnt; + this_cpu_sub(softirq_ctrl.cnt, cnt); + if (unlock && !current->softirq_disable_cnt) { + migrate_enable(); + rcu_read_unlock(); + } else { + WARN_ON_ONCE(current->softirq_disable_cnt < 0); + } } } @@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) lock_map_release(&bh_lock_map); local_irq_save(flags); - curcnt = __this_cpu_read(softirq_ctrl.cnt); + if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) + curcnt = this_cpu_read(softirq_ctrl.cnt); + else + curcnt = current->softirq_disable_cnt; /* * If this is not reenabling soft interrupts, no point in trying to @@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } +#ifdef CONFIG_PREEMPT_RT +struct tasklet_sync_callback { + spinlock_t cb_lock; + atomic_t cb_waiters; +}; + +static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = { + .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock), + .cb_waiters = ATOMIC_INIT(0), +}; + +static void tasklet_lock_callback(void) +{ + spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_unlock_callback(void) +{ + spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock)); +} + +static void tasklet_callback_cancel_wait_running(void) +{ + struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + atomic_inc(&sync_cb->cb_waiters); + spin_lock(&sync_cb->cb_lock); + atomic_dec(&sync_cb->cb_waiters); + spin_unlock(&sync_cb->cb_lock); +} + +static void tasklet_callback_sync_wait_running(void) +{ + struct 
tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback); + + if (atomic_read(&sync_cb->cb_waiters)) { + spin_unlock(&sync_cb->cb_lock); + spin_lock(&sync_cb->cb_lock); + } +} + +#else /* !CONFIG_PREEMPT_RT: */ + +static void tasklet_lock_callback(void) { } +static void tasklet_unlock_callback(void) { } +static void tasklet_callback_sync_wait_running(void) { } + +#ifdef CONFIG_SMP +static void tasklet_callback_cancel_wait_running(void) { } +#endif +#endif /* !CONFIG_PREEMPT_RT */ + static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { @@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, tl_head->tail = &tl_head->head; local_irq_enable(); + tasklet_lock_callback(); while (list) { struct tasklet_struct *t = list; @@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, } } tasklet_unlock(t); + tasklet_callback_sync_wait_running(); continue; } tasklet_unlock(t); @@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head, __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } + tasklet_unlock_callback(); } static __latent_entropy void tasklet_action(void) @@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t) /* * Prevent a live lock when current preempted soft * interrupt processing or prevents ksoftirqd from - * running. If the tasklet runs on a different CPU - * then this has no effect other than doing the BH - * disable/enable dance for nothing. + * running. */ - local_bh_disable(); - local_bh_enable(); + tasklet_callback_cancel_wait_running(); } else { cpu_relax(); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6c..bf5d05c635ffd5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e6e9b85d4db5f8..f7d52d9543cc7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 577f0e6842d4ce..069d93bfb0c75c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906f1..a59bc75ab7c5b4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -633,7 +633,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0aef0e349e49c5..a1890a073196b1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -144,7 +144,7 @@ static u64 
suspend_start; * Default for maximum permissible skew when cs->uncertainty_margin is * not specified, and the lower bound even when cs->uncertainty_margin * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * clocks with unspecified cs->uncertainty_margin, so this macro is used * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -407,9 +407,8 @@ void clocksource_verify_percpu(struct clocksource *cs) if (!cpumask_empty(&cpus_behind)) pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); + pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c0a..88aa062b8a556d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -208,7 +201,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram + * elected new base. An IPI will be issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) @@ -787,10 +780,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. 
*/ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1574,10 +1567,10 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { switch (clock_id) { - case CLOCK_REALTIME: - return HRTIMER_BASE_REALTIME; case CLOCK_MONOTONIC: return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; case CLOCK_BOOTTIME: return HRTIMER_BASE_BOOTTIME; case CLOCK_TAI: @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) @@ -2295,11 +2311,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. 
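With the per-base get_time pointer removed from the hrtimer clock bases, __hrtimer_cb_get_time() above maps the clockid to the right getter with a switch. The same dispatch shape as a standalone model, with stub getters rather than the kernel's ktime_get*() family:

/* Model: dispatch on clock id with a switch instead of a stored callback. */
#include <stdio.h>

enum clock_id { CLK_MONOTONIC, CLK_REALTIME, CLK_BOOTTIME, CLK_TAI };

static long long get_monotonic(void) { return 100; }	/* stub */
static long long get_real(void)      { return 200; }	/* stub */
static long long get_boottime(void)  { return 300; }	/* stub */
static long long get_tai(void)       { return 400; }	/* stub */

static long long cb_get_time(enum clock_id id)
{
	switch (id) {
	case CLK_MONOTONIC: return get_monotonic();
	case CLK_REALTIME:  return get_real();
	case CLK_BOOTTIME:  return get_boottime();
	case CLK_TAI:       return get_tai();
	}
	return get_monotonic();	/* mirrors the WARN-and-fall-back above */
}

int main(void)
{
	printf("REALTIME now: %lld\n", cb_get_time(CLK_REALTIME));
	return 0;
}

One upside of this shape is that the clockid-to-getter mapping lives in text rather than in writable per-CPU data, and the expiry path no longer makes an indirect call.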
- */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e219a..7c6110e964e7ec 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3b5..5b6997f4dc3da5 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -253,16 +252,13 @@ static void timens_set_vvar_page(struct task_struct *task, void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -466,7 +462,6 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_time_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, + .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 8b582174b1f949..aa3120104a5128 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -299,8 +299,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -535,7 +534,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space + * After successful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create @@ -825,7 +824,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc15fe293719f5..cc1afec306b3f0 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 9a3859443c042c..7e33d3f2e889b1 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -411,24 +411,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. 
- * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! - */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9ef..4e4f7bbe2a64bc 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada646950..488e47e96e93f9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3ae52978cae61a..8f23f5273babf9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. 
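For context on the block being removed here: the lookup kfuncs hand out a counted reference, and the BTF flags further down (KF_ACQUIRE/KF_RELEASE) enforce the pairing sketched below (hypothetical caller, kernel-side view):

static int key_user_sketch(s32 serial)
{
	struct bpf_key *bkey = bpf_lookup_user_key(serial, 0);

	if (!bkey)
		return -ENOENT;	/* not found, or unsupported flags */
	/* ... pass bkey to key-specific kfuncs; permission checks happen there ... */
	bpf_key_put(bkey);	/* KF_RELEASE pairs with KF_ACQUIRE */
	return 0;
}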
- */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). - * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). 
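The three kfuncs combine as in this hedged pseudo-flow (dynptr construction elided; all names come from the code being removed):

static int verify_sketch(struct bpf_dynptr *data, struct bpf_dynptr *sig)
{
	struct bpf_key *tk = bpf_lookup_system_key(VERIFY_USE_SECONDARY_KEYRING);
	int err;

	if (!tk)
		return -ENOKEY;
	err = bpf_verify_pkcs7_signature(data, sig, tk);
	bpf_key_put(tk);	/* safe: has_ref is false for system keys */
	return err;
}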
- */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1521,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1532,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -2728,20 +2546,25 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, struct pt_regs *regs; int err; + /* + * The graph tracer framework ensures we won't migrate, so there is no + * need to use migrate_disable() for bpf_prog_run() again. The check + * here is just for __this_cpu_inc_return. + */ + cant_sleep(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } - migrate_disable(); rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); - migrate_enable(); out: __this_cpu_dec(bpf_prog_active); @@ -2913,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* Writing to context is not allowed for kprobes. */
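Both halves of the new restriction sit around this point: the verifier callback above records any context write, and the attach path just below rejects such programs for kprobe-multi. A hedged condensation of the two hunks:

static bool valid_access_sketch(struct bpf_prog *prog, enum bpf_access_type type)
{
	if (type == BPF_WRITE)
		prog->aux->kprobe_write_ctx = true;	/* remembered at verification */
	return true;
}

static int attach_sketch(struct bpf_prog *prog)
{
	if (prog->aux->kprobe_write_ctx)
		return -EINVAL;	/* pt_regs writes not allowed for kprobe-multi */
	return 0;
}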
+ if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea87c..484ad7a18463a9 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nested + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. + */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL trace.retval = ftrace_regs_get_return_value(fregs); #endif @@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of @@ -1397,7 +1409,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index c8034dfc1070d6..5a807d62e76dce 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad { unsigned long *addrs; - if (alist->index >= alist->size) - return -ENOMEM; + /* Previously we failed to expand the list.
*/ + if (alist->index == alist->size) + return -ENOSPC; alist->addrs[alist->index++] = addr; if (alist->index < alist->size) @@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb, for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); - if (alist.index < alist.size && alist.index > 0) + if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b069071e..c1347da69e9dec 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f402..48338520376f90 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; @@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) retval = create_monitor_dir(monitor, parent); if (retval) - return retval; + goto out_unlock; /* keep children close to the parent for easier visualisation */ if (parent) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1eb..b3c94fbaf002ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { @@ -7209,7 +7213,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7310,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5d64a18cacacc6..d06854bd32b357 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -230,6 +230,10 @@ static int 
dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172cc..2ab283fd3032e3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccae62d4fb9177..fa60362a3f31bd 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], return -EINVAL; } buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa6456..dc734867f0fc44 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,12 +2322,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; - buf = kmalloc(count, GFP_KERNEL); + if (count < 1) + return 0; + + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; + buf[count] = '\0'; if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 16b283f9d83141..6ea2f6363b9089 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); + stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); - stats->ac_ppid = pid_alive(tsk) ? 
- task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b20..0163665914c97c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,10 +65,11 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_user_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, + .ns.inum = ns_init_inum(&init_user_ns), #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d44..03cb63883d041a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -124,12 +125,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(ns); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -159,12 +159,13 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
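init_user_ns above gets the same static-initializer treatment as init_time_ns earlier; as a hedged template, with struct foo_namespace and foons_operations as placeholders rather than kernel symbols:

struct foo_namespace init_foo_ns = {
	.ns.ns_type  = ns_common_type(&init_foo_ns),	/* per-type constant */
	.ns.__ns_ref = REFCOUNT_INIT(3),	/* initial refs, as in the hunks above */
	.ns.inum     = ns_init_inum(&init_foo_ns),	/* replaces PROC_*_INIT_INO */
	.ns.ops      = &foons_operations,
};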
*/ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) @@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, @@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f24e..ebbfc578a9d308 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -86,7 +76,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; @@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); -} - -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
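The same teardown ordering now appears in free_time_ns(), free_user_ns() and free_uts_ns(); a hedged distillation over a hypothetical namespace type:

static void free_foo_ns(struct foo_namespace *ns)	/* hypothetical type */
{
	ns_tree_remove(ns);	/* unlink first, so no new walkers find it */
	/* ... drop per-type references (ucounts, user_ns, pages, ...) ... */
	ns_common_free(ns);
	/* Concurrent nstree walkers run under RCU, hence the deferred free. */
	kfree_rcu(ns, ns.ns_rcu);
}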
*/ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *utsns_get(struct task_struct *task) @@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, @@ -174,4 +160,5 @@ void __init uts_ns_init(void) offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); + ns_tree_add(&init_uts_ns); } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index bc738fa90c1d68..27107dcc1cbfeb 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk) * freeing it below. */ wait_for_completion(&vtsk->exited); + put_task_struct(vtsk->task); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); @@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), return ERR_CAST(tsk); } - vtsk->task = tsk; + vtsk->task = get_task_struct(tsk); return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c6b79b3675c314..45320e27a16c4a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -222,7 +222,9 @@ struct worker_pool { struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - +#ifdef CONFIG_PREEMPT_RT + spinlock_t cb_lock; /* BH worker cancel lock */ +#endif /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). @@ -2930,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** @@ -3078,6 +3080,31 @@ __acquires(&pool->lock) goto restart; } +#ifdef CONFIG_PREEMPT_RT +static void worker_lock_callback(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); +} + +static void worker_unlock_callback(struct worker_pool *pool) +{ + spin_unlock(&pool->cb_lock); +} + +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) +{ + spin_lock(&pool->cb_lock); + spin_unlock(&pool->cb_lock); +} + +#else + +static void worker_lock_callback(struct worker_pool *pool) { } +static void worker_unlock_callback(struct worker_pool *pool) { } +static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { } + +#endif + /** * manage_workers - manage worker pool * @worker: self @@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker) int nr_restarts = BH_WORKER_RESTARTS; unsigned long end = jiffies + BH_WORKER_JIFFIES; + worker_lock_callback(pool); raw_spin_lock_irq(&pool->lock); worker_leave_idle(worker); @@ -3585,6 +3613,7 @@ static void bh_worker(struct worker *worker) worker_enter_idle(worker); kick_pool(pool); raw_spin_unlock_irq(&pool->lock); + worker_unlock_callback(pool); } /* @@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) (data & WORK_OFFQ_BH)) { /* * On RT, prevent a live lock when %current preempted - * soft interrupt processing or prevents ksoftirqd from - * running by keeping flipping BH. If the BH work item - * runs on a different CPU then this has no effect other - * than doing the BH disable/enable dance for nothing. - * This is copied from - * kernel/softirq.c::tasklet_unlock_spin_wait(). 
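What makes the cb_lock scheme above work on PREEMPT_RT is that spinlocks there are sleeping locks with priority inheritance; a hedged restatement of the handshake, using the names from these hunks:

static void cancel_wait_sketch(struct worker_pool *pool)
{
	/*
	 * bh_worker() holds cb_lock across the whole callback batch, so
	 * this acquisition sleeps until the callback finishes, and PI can
	 * boost the worker if the canceling task has higher priority.
	 */
	spin_lock(&pool->cb_lock);
	spin_unlock(&pool->cb_lock);	/* waiting was the whole point */
}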
+ * soft interrupt processing by blocking on lock which + * is owned by the thread invoking the callback. */ while (!try_wait_for_completion(&barr.done)) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - local_bh_disable(); - local_bh_enable(); + struct worker_pool *pool; + + guard(rcu)(); + pool = get_work_pool(work); + if (pool) + workqueue_callback_cancel_wait_running(pool); } else { cpu_relax(); } @@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool) ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; +#ifdef CONFIG_PREEMPT_RT + spin_lock_init(&pool->cb_lock); +#endif /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(); @@ -6046,7 +6078,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock(); preempt_disable(); if (cpu == WORK_CPU_UNBOUND) @@ -6056,7 +6087,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) ret = !list_empty(&pwq->inactive_works); preempt_enable(); - rcu_read_unlock(); return ret; } @@ -7546,8 +7576,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (!thresh) return; - rcu_read_lock(); - for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; @@ -7589,8 +7617,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) } - rcu_read_unlock(); - if (lockup_detected) show_all_workqueues(); @@ -7642,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; @@ -7802,22 +7828,22 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_percpu_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); + WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075e9..24939b8553e67f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST depends on KUNIT default KUNIT_ALL_TESTS +config FFS_KUNIT_TEST + tristate "KUnit test ffs-family functions at runtime" if 
!KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds KUnit tests for ffs-family bit manipulation functions + including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64(). + + These tests validate mathematical correctness, edge case handling, + and cross-architecture consistency of bit scanning functions. + + For more information on KUnit and unit tests in general, + please refer to Documentation/dev-tools/kunit/. + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" @@ -2894,7 +2908,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES - depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL + depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index fb8c0c5c2bd277..8778ec44bf630d 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -15,28 +15,28 @@ #include int __weak __ctzsi2(int val); -int __weak __ctzsi2(int val) +int __weak __attribute_const__ __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); int __weak __clzsi2(int val); -int __weak __clzsi2(int val) +int __weak __attribute_const__ __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); int __weak __clzdi2(u64 val); -int __weak __clzdi2(u64 val) +int __weak __attribute_const__ __clzdi2(u64 val) { return 64 - fls64(val); } EXPORT_SYMBOL(__clzdi2); int __weak __ctzdi2(u64 val); -int __weak __ctzdi2(u64 val) +int __weak __attribute_const__ __ctzdi2(u64 val) { return __ffs64(val); } diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h index 2edf7e9681d05a..63441de5e3f161 100644 --- a/lib/crc/arm/crc-t10dif.h +++ b/lib/crc/arm/crc-t10dif.h @@ -5,8 +5,6 @@ * Copyright (C) 2016 Linaro Ltd */ -#include - #include #include @@ -23,7 +21,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { + if (likely(may_use_simd())) { kernel_neon_begin(); crc = crc_t10dif_pmull64(crc, data, length); kernel_neon_end(); @@ -31,7 +29,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && static_branch_likely(&have_neon) && - crypto_simd_usable()) { + likely(may_use_simd())) { u8 buf[16] __aligned(16); kernel_neon_begin(); @@ -45,7 +43,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h index 018007e162a2b6..7b76f52f6907db 100644 --- a/lib/crc/arm/crc32.h +++ b/lib/crc/arm/crc32.h @@ -7,8 +7,6 @@ #include -#include - #include #include #include @@ -34,7 +32,7 @@ static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { + static_branch_likely(&have_pmull) && likely(may_use_simd())) { size_t n = -(uintptr_t)p & 15; /* align p to 16-byte boundary */ @@ -63,7 +61,7 @@ static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) static inline u32 crc32c_arch(u32 crc, const u8 *p, 
size_t len) { if (len >= PMULL_MIN_LEN + 15 && - static_branch_likely(&have_pmull) && crypto_simd_usable()) { + static_branch_likely(&have_pmull) && likely(may_use_simd())) { size_t n = -(uintptr_t)p & 15; /* align p to 16-byte boundary */ @@ -85,7 +83,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (elf_hwcap2 & HWCAP2_CRC32) static_branch_enable(&have_crc32); diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h index c4521a7f1ee9bf..f88db297180516 100644 --- a/lib/crc/arm64/crc-t10dif.h +++ b/lib/crc/arm64/crc-t10dif.h @@ -7,8 +7,6 @@ #include -#include - #include #include @@ -25,7 +23,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) { if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { + if (likely(may_use_simd())) { kernel_neon_begin(); crc = crc_t10dif_pmull_p64(crc, data, length); kernel_neon_end(); @@ -33,7 +31,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && static_branch_likely(&have_asimd) && - crypto_simd_usable()) { + likely(may_use_simd())) { u8 buf[16]; kernel_neon_begin(); @@ -47,7 +45,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) { static_branch_enable(&have_asimd); diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h index 6e5dec45f05d21..31e649cd40a2fd 100644 --- a/lib/crc/arm64/crc32.h +++ b/lib/crc/arm64/crc32.h @@ -5,8 +5,6 @@ #include #include -#include - // The minimum input length to consider the 4-way interleaved code path static const size_t min_len = 1024; @@ -23,7 +21,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32_le_arm64_4way(crc, p, len); kernel_neon_end(); @@ -43,7 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32c_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32c_le_arm64_4way(crc, p, len); kernel_neon_end(); @@ -63,7 +63,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + if (len >= min_len && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { kernel_neon_begin(); crc = crc32_be_arm64_4way(crc, p, len); kernel_neon_end(); diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h index 6de5c96594afc8..d34fa4c6863252 100644 --- a/lib/crc/loongarch/crc32.h +++ b/lib/crc/loongarch/crc32.h @@ -101,7 +101,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, 
size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_has_crc32) static_branch_enable(&have_crc32); diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h index 11cb272c63a691..3100354a049eb5 100644 --- a/lib/crc/mips/crc32.h +++ b/lib/crc/mips/crc32.h @@ -148,7 +148,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_have_feature(cpu_feature(MIPS_CRC32))) static_branch_enable(&have_crc32); diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h index 59e16804a6eae9..8f4592a5323d62 100644 --- a/lib/crc/powerpc/crc-t10dif.h +++ b/lib/crc/powerpc/crc-t10dif.h @@ -6,8 +6,8 @@ * [based on crc32c-vpmsum_glue.c] */ +#include #include -#include #include #include #include @@ -29,7 +29,8 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) u32 crc = crci; if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + !static_branch_likely(&have_vec_crypto) || + unlikely(!may_use_simd())) return crc_t10dif_generic(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { @@ -61,7 +62,7 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_207S) && (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h index 811cc2e6ed24d4..0c852272a3828b 100644 --- a/lib/crc/powerpc/crc32.h +++ b/lib/crc/powerpc/crc32.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include -#include #include #include #include @@ -24,7 +24,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) unsigned int tail; if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || - !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + !static_branch_likely(&have_vec_crypto) || + unlikely(!may_use_simd())) return crc32c_base(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { @@ -54,7 +55,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) } #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_207S) && (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h index 60f2765ac01573..df7c350acd7b5b 100644 --- a/lib/crc/sparc/crc32.h +++ b/lib/crc/sparc/crc32.h @@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len) } #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c index f08d985d8860e8..9a450e25ac8116 100644 --- a/lib/crc/tests/crc_kunit.c +++ b/lib/crc/tests/crc_kunit.c @@ -6,6 +6,7 @@ * * Author: Eric Biggers */ +#include #include #include #include @@ -141,6 +142,54 @@ static size_t generate_random_length(size_t max_length) return len % (max_length + 1); } +#define 
IRQ_TEST_DATA_LEN 512 +#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ + +struct crc_irq_test_state { + const struct crc_variant *v; + u64 initial_crc; + u64 expected_crcs[IRQ_TEST_NUM_BUFFERS]; + atomic_t seqno; +}; + +/* + * Compute the CRC of one of the test messages and verify that it matches the + * expected CRC from @state->expected_crcs. To increase the chance of detecting + * problems, cycle through multiple messages. + */ +static bool crc_irq_test_func(void *state_) +{ + struct crc_irq_test_state *state = state_; + const struct crc_variant *v = state->v; + u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS; + u64 actual_crc = v->func(state->initial_crc, + &test_buffer[i * IRQ_TEST_DATA_LEN], + IRQ_TEST_DATA_LEN); + + return actual_crc == state->expected_crcs[i]; +} + +/* + * Test that if CRCs are computed in task, softirq, and hardirq context + * concurrently, then all results are as expected. + */ +static void crc_interrupt_context_test(struct kunit *test, + const struct crc_variant *v) +{ + struct crc_irq_test_state state = { + .v = v, + .initial_crc = generate_random_initial_crc(v), + }; + + for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) { + state.expected_crcs[i] = crc_ref( + v, state.initial_crc, + &test_buffer[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN); + } + + kunit_run_irq_test(test, crc_irq_test_func, 100000, &state); +} + /* Test that v->func gives the same CRCs as a reference implementation. */ static void crc_test(struct kunit *test, const struct crc_variant *v) { @@ -149,7 +198,6 @@ static void crc_test(struct kunit *test, const struct crc_variant *v) for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) { u64 init_crc, expected_crc, actual_crc; size_t len, offset; - bool nosimd; init_crc = generate_random_initial_crc(v); len = generate_random_length(CRC_KUNIT_MAX_LEN); @@ -168,22 +216,18 @@ static void crc_test(struct kunit *test, const struct crc_variant *v) /* Refresh the data occasionally. */ prandom_bytes_state(&rng, &test_buffer[offset], len); - nosimd = rand32() % 8 == 0; - /* * Compute the CRC, and verify that it equals the CRC computed * by a simple bit-at-a-time reference implementation. 
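kunit_run_irq_test() invokes the callback concurrently from task, softirq and hardirq context; a hedged usage skeleton (assuming the helper declared in <kunit/run-in-irq-context.h>):

static bool check_once(void *state)
{
	/* must be safe in any context; return true on success */
	return true;
}

static void irq_test_example(struct kunit *test, void *state)
{
	/* 100000 total invocations spread across the three contexts */
	kunit_run_irq_test(test, check_once, 100000, state);
}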
*/ expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len); - if (nosimd) - local_irq_disable(); actual_crc = v->func(init_crc, &test_buffer[offset], len); - if (nosimd) - local_irq_enable(); KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc, - "Wrong result with len=%zu offset=%zu nosimd=%d", - len, offset, nosimd); + "Wrong result with len=%zu offset=%zu", + len, offset); } + + crc_interrupt_context_test(test, v); } static __always_inline void diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h index 35c950d7010c28..02744831c6fac0 100644 --- a/lib/crc/x86/crc-pclmul-template.h +++ b/lib/crc/x86/crc-pclmul-template.h @@ -12,7 +12,6 @@ #include #include -#include #include #include "crc-pclmul-consts.h" @@ -57,7 +56,7 @@ static inline bool have_avx512(void) #define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ do { \ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ - crypto_simd_usable()) { \ + likely(irq_fpu_usable())) { \ const void *consts_ptr; \ \ consts_ptr = (consts).fold_across_128_bits_consts; \ diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h index 2a02a3026f3f88..8ee8824da551c5 100644 --- a/lib/crc/x86/crc-t10dif.h +++ b/lib/crc/x86/crc-t10dif.h @@ -19,7 +19,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) } #define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch -static inline void crc_t10dif_mod_init_arch(void) +static void crc_t10dif_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { static_branch_enable(&have_pclmulqdq); diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h index cea2c96d08d09e..19a5e3c6c73bb4 100644 --- a/lib/crc/x86/crc32.h +++ b/lib/crc/x86/crc32.h @@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) return crc32c_base(crc, p, len); if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) { /* * Long length, the vector registers are usable, and the CPU is * 64-bit and supports both CRC32 and PCLMULQDQ instructions. @@ -106,7 +106,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) #define crc32_be_arch crc32_be_base /* not implemented on this arch */ #define crc32_mod_init_arch crc32_mod_init_arch -static inline void crc32_mod_init_arch(void) +static void crc32_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_XMM4_2)) static_branch_enable(&have_crc32); diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h index fde1222c4c584c..7d459931934365 100644 --- a/lib/crc/x86/crc64.h +++ b/lib/crc/x86/crc64.h @@ -27,7 +27,7 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) } #define crc64_mod_init_arch crc64_mod_init_arch -static inline void crc64_mod_init_arch(void) +static void crc64_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { static_branch_enable(&have_pclmulqdq); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 1e6b008f8fca45..eea17e36a22bed 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -28,109 +28,102 @@ config CRYPTO_LIB_ARC4 config CRYPTO_LIB_GF128MUL tristate -config CRYPTO_ARCH_HAVE_LIB_BLAKE2S - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Blake2s library interface, - either builtin or as a module. 
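The x86 hunks above all share one dispatch shape; a hedged composite in which crc32c_x86_pclmul() stands in for the real assembly entry points:

static u32 crc32c_dispatch_sketch(u32 crc, const u8 *p, size_t len)
{
	if (static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) {
		kernel_fpu_begin();
		crc = crc32c_x86_pclmul(crc, p, len);	/* hypothetical helper */
		kernel_fpu_end();
		return crc;
	}
	return crc32c_base(crc, p, len);	/* portable fallback */
}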
- -config CRYPTO_LIB_BLAKE2S_GENERIC - def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - This symbol can be depended upon by arch implementations of the - Blake2s library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_BLAKE2S. +# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. -config CRYPTO_ARCH_HAVE_LIB_CHACHA +config CRYPTO_LIB_BLAKE2S_ARCH bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the ChaCha library interface, - either builtin or as a module. + depends on !UML + default y if ARM + default y if X86_64 -config CRYPTO_LIB_CHACHA_GENERIC +config CRYPTO_LIB_CHACHA tristate - default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA select CRYPTO_LIB_UTILS help - This symbol can be selected by arch implementations of the ChaCha - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_CHACHA. + Enable the ChaCha library interface. Select this if your module uses + chacha_crypt() or hchacha_block(). -config CRYPTO_LIB_CHACHA +config CRYPTO_LIB_CHACHA_ARCH + bool + depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS && CPU_MIPS32_R2 + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX + default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if S390 + default y if X86_64 + +config CRYPTO_LIB_CURVE25519 tristate + select CRYPTO_LIB_UTILS help - Enable the ChaCha library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. + The Curve25519 library functions. Select this if your module uses any + of the functions from <crypto/curve25519.h>. -config CRYPTO_ARCH_HAVE_LIB_CURVE25519 +config CRYPTO_LIB_CURVE25519_ARCH bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Curve25519 library interface, - either builtin or as a module. + depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN + default y if ARM && KERNEL_MODE_NEON + default y if PPC64 && CPU_LITTLE_ENDIAN + default y if X86_64 config CRYPTO_LIB_CURVE25519_GENERIC - tristate - select CRYPTO_LIB_UTILS - help - This symbol can be depended upon by arch implementations of the - Curve25519 library interface that require the generic code as a - fallback, e.g., for SIMD implementations. If no arch specific - implementation is enabled, this implementation serves the users - of CRYPTO_LIB_CURVE25519. + bool + depends on CRYPTO_LIB_CURVE25519 + default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64 -config CRYPTO_LIB_CURVE25519_INTERNAL +config CRYPTO_LIB_DES tristate - select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n -config CRYPTO_LIB_CURVE25519 +config CRYPTO_LIB_MD5 tristate - select CRYPTO - select CRYPTO_LIB_CURVE25519_INTERNAL help - Enable the Curve25519 library interface. This interface may be - fulfilled by either the generic implementation or an arch-specific - one, if one is available and enabled. + The MD5 and HMAC-MD5 library functions. Select this if your module + uses any of the functions from <crypto/md5.h>.
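A consumer-side sketch of the Curve25519 interface named in the help text above (assuming the declarations in <crypto/curve25519.h>):

#include <crypto/curve25519.h>

static bool keypair_sketch(u8 pub[CURVE25519_KEY_SIZE],
			   u8 secret[CURVE25519_KEY_SIZE])
{
	curve25519_generate_secret(secret);
	/* returns false if the derived public key is unusable */
	return curve25519_generate_public(pub, secret);
}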
-config CRYPTO_LIB_DES +config CRYPTO_LIB_MD5_ARCH + bool + depends on CRYPTO_LIB_MD5 && !UML + default y if MIPS && CPU_CAVIUM_OCTEON + default y if PPC + default y if SPARC64 + +config CRYPTO_LIB_POLY1305 tristate + help + The Poly1305 library functions. Select this if your module uses any + of the functions from <crypto/poly1305.h>. + +config CRYPTO_LIB_POLY1305_ARCH + bool + depends on CRYPTO_LIB_POLY1305 && !UML + default y if ARM + default y if ARM64 && KERNEL_MODE_NEON + default y if MIPS + # The PPC64 code needs to be fixed to work in softirq context. + default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN + default y if RISCV + default y if X86_64 + +# This symbol controls the inclusion of the Poly1305 generic code. This differs +# from most of the other algorithms, which handle the generic code +# "automatically" via __maybe_unused. This is needed so that the Adiantum code, +# which calls the poly1305_core_*() functions directly, can enable them. +config CRYPTO_LIB_POLY1305_GENERIC + bool + depends on CRYPTO_LIB_POLY1305 + # Enable if there's no arch impl or the arch impl requires the generic + # impl as a fallback. (Or if selected explicitly.) + default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64 config CRYPTO_LIB_POLY1305_RSIZE int - default 2 if MIPS + default 2 if MIPS || RISCV default 11 if X86_64 default 9 if ARM || ARM64 default 1 -config CRYPTO_ARCH_HAVE_LIB_POLY1305 - bool - help - Declares whether the architecture provides an arch-specific - accelerated implementation of the Poly1305 library interface, - either builtin or as a module. - -config CRYPTO_LIB_POLY1305_GENERIC - tristate - default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305 - help - This symbol can be selected by arch implementations of the Poly1305 - library interface that require the generic code as a fallback, e.g., - for SIMD implementations. If no arch specific implementation is - enabled, this implementation serves the users of CRYPTO_LIB_POLY1305. - -config CRYPTO_LIB_POLY1305 - tristate - help - Enable the Poly1305 library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled.
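And a consumer-side sketch for Poly1305 (assuming the init/update/final API in <crypto/poly1305.h>):

#include <crypto/poly1305.h>

static void mac_sketch(const u8 key[POLY1305_KEY_SIZE],
		       const u8 *msg, unsigned int len,
		       u8 digest[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx desc;

	poly1305_init(&desc, key);
	poly1305_update(&desc, msg, len);
	poly1305_final(&desc, digest);
}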
- config CRYPTO_LIB_CHACHA20POLY1305 tristate select CRYPTO_LIB_CHACHA @@ -196,28 +189,4 @@ config CRYPTO_LIB_SM3 source "lib/crypto/tests/Kconfig" -if !KMSAN # avoid false positives from assembly -if ARM -source "lib/crypto/arm/Kconfig" -endif -if ARM64 -source "lib/crypto/arm64/Kconfig" -endif -if MIPS -source "lib/crypto/mips/Kconfig" -endif -if PPC -source "lib/crypto/powerpc/Kconfig" -endif -if RISCV -source "lib/crypto/riscv/Kconfig" -endif -if S390 -source "lib/crypto/s390/Kconfig" -endif -if X86 -source "lib/crypto/x86/Kconfig" -endif -endif - endmenu diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 539d5d59a50e48..bded351aeacef2 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -15,10 +15,6 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o libcryptoutils-y := memneq.o utils.o -# chacha is used by the /dev/random driver which is always builtin -obj-y += chacha.o -obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o - obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o libaes-y := aes.o @@ -33,39 +29,162 @@ libarc4-y := arc4.o obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o +################################################################################ + # blake2s is used by the /dev/random driver which is always builtin -obj-y += libblake2s.o -libblake2s-y := blake2s.o -libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o -libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o +obj-y += blake2s.o +ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y) +CFLAGS_blake2s.o += -I$(src)/$(SRCARCH) +obj-$(CONFIG_ARM) += arm/blake2s-core.o +obj-$(CONFIG_X86) += x86/blake2s-core.o +endif + +################################################################################ + +# chacha20_block() is used by the /dev/random driver which is always builtin +obj-y += chacha-block-generic.o + +obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o +libchacha-y := chacha.o + +ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y) +CFLAGS_chacha.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libchacha-y += arm/chacha-scalar-core.o +libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o +endif + +libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o + +ifeq ($(CONFIG_MIPS),y) +libchacha-y += mips/chacha-core.o +AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots +endif + +libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o +libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o +libchacha-$(CONFIG_S390) += s390/chacha-s390.o +libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \ + x86/chacha-avx2-x86_64.o \ + x86/chacha-avx512vl-x86_64.o +endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH + +################################################################################ obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o libchacha20poly1305-y += chacha20poly1305.o libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o -obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o -libcurve25519-generic-y := curve25519-fiat32.o -libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o -libcurve25519-generic-y += curve25519-generic.o +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o +libcurve25519-y := curve25519.o + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_curve25519.o := n + +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) 
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o +else +libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o +endif # clang versions prior to 18 may blow out the stack with KASAN ifeq ($(call clang-min-version, 180000),) KASAN_SANITIZE_curve25519-hacl64.o := n endif -obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o -libcurve25519-y += curve25519.o -libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o +ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y) +CFLAGS_curve25519.o += -I$(src)/$(SRCARCH) +libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o +libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o +endif + +################################################################################ obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o libdes-y := des.o -obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o -libpoly1305-y += poly1305.o +################################################################################ -obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o -libpoly1305-generic-y := poly1305-donna32.o -libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o -libpoly1305-generic-y += poly1305-generic.o +obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o +libmd5-y := md5.o +ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y) +CFLAGS_md5.o += -I$(src)/$(SRCARCH) +libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o +libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o +endif # CONFIG_CRYPTO_LIB_MD5_ARCH + +################################################################################ + +obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o +libpoly1305-y := poly1305.o +ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y) +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o +else +libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o +endif + +ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y) +CFLAGS_poly1305.o += -I$(src)/$(SRCARCH) + +ifeq ($(CONFIG_ARM),y) +libpoly1305-y += arm/poly1305-core.o +$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl + $(call cmd,perlasm) +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) +endif + +ifeq ($(CONFIG_ARM64),y) +libpoly1305-y += arm64/poly1305-core.o +$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl + $(call cmd,perlasm_with_args) +endif + +ifeq ($(CONFIG_MIPS),y) +libpoly1305-y += mips/poly1305-core.o +poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32 +poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 +quiet_cmd_perlasm_poly1305 = PERLASM $@ + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ +# Use if_changed instead of cmd, in case the flavour changed. +$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE + $(call if_changed,perlasm_poly1305) +targets += mips/poly1305-core.S +endif + +libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o + +ifeq ($(CONFIG_RISCV),y) +libpoly1305-y += riscv/poly1305-core.o +poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32 +poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64 +quiet_cmd_perlasm_poly1305 = PERLASM $@ + cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@ +# Use if_changed instead of cmd, in case the flavour changed. 
+$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE + $(call if_changed,perlasm_poly1305) +targets += riscv/poly1305-core.S +AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init +endif + +ifeq ($(CONFIG_X86),y) +libpoly1305-y += x86/poly1305-x86_64-cryptogams.o +$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl + $(call cmd,perlasm) +endif + +endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH + +# clean-files must be defined unconditionally +clean-files += arm/poly1305-core.S \ + arm64/poly1305-core.S \ + mips/poly1305-core.S \ + riscv/poly1305-core.S \ + x86/poly1305-x86_64-cryptogams.S ################################################################################ @@ -156,14 +275,6 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o -obj-$(CONFIG_ARM) += arm/ -obj-$(CONFIG_ARM64) += arm64/ -obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_PPC) += powerpc/ -obj-$(CONFIG_RISCV) += riscv/ -obj-$(CONFIG_S390) += s390/ -obj-$(CONFIG_X86) += x86/ - # clean-files must be defined unconditionally clean-files += arm/sha256-core.S arm/sha512-core.S clean-files += arm64/sha256-core.S arm64/sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig deleted file mode 100644 index e8444fd0aae303..00000000000000 --- a/lib/crypto/arm/Kconfig +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_ARM - bool "Hash functions: BLAKE2s" - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: arm - - This is faster than the generic implementations of BLAKE2s and - BLAKE2b, but slower than the NEON implementation of BLAKE2b. - There is no NEON implementation of BLAKE2s, since NEON doesn't - really help with it. 
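
The Kconfig and Makefile hunks above replace the old per-architecture glue modules with a header-override scheme: the generic translation unit is now compiled with -I$(src)/$(SRCARCH), so an architecture directory can provide an <algo>.h that defines the _arch() routines (plus an optional boot-time init hook) directly, and the arch assembly objects are linked into the same library object. A minimal C sketch of that pattern follows — illustrative names only, not the actual kernel sources; USE_ARCH_HEADER stands in for the real CONFIG_CRYPTO_LIB_<ALGO>_ARCH symbol, and the types are assumed to come from <crypto/chacha.h>:

/*
 * Sketch only: how a generic lib/crypto/<algo>.c can pick up an arch
 * override via the include path set up by the new Makefile.
 */
#ifdef USE_ARCH_HEADER
#include "chacha.h"		/* found in lib/crypto/$(SRCARCH)/ first */
#else
static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
			      const u8 *src, unsigned int bytes, int nrounds)
{
	chacha_crypt_generic(state, dst, src, bytes, nrounds);
}
#endif

static int __init chacha_mod_init(void)
{
#ifdef chacha_mod_init_arch		/* defined by the arch header if needed */
	chacha_mod_init_arch();		/* e.g. probe NEON, enable a static key */
#endif
	return 0;
}
subsys_initcall(chacha_mod_init);

This is consistent with the new arm/chacha.h below, which defines its functions static and exposes only the chacha_mod_init_arch hook, in place of the deleted EXPORT_SYMBOL-based glue module.
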
- -config CRYPTO_CHACHA20_NEON - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_ARM - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile deleted file mode 100644 index 4c042a4c77ed6e..00000000000000 --- a/lib/crypto/arm/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o -libblake2s-arm-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-scalar-core.o chacha-glue.o -chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o - -obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o -poly1305-arm-y := poly1305-core.o poly1305-glue.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += poly1305-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -# massage the perlasm code a bit so we only get the NEON routine if we need it -poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 -poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index df40e46601f100..293f44fa8f316f 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * BLAKE2s digest algorithm, ARM scalar implementation + * BLAKE2s digest algorithm, ARM scalar implementation. This is faster + * than the generic implementations of BLAKE2s and BLAKE2b, but slower + * than the NEON implementation of BLAKE2b. There is no NEON + * implementation of BLAKE2s, since NEON doesn't really help with it. 
* * Copyright 2020 Google LLC * diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c deleted file mode 100644 index 0238a70d9581e9..00000000000000 --- a/lib/crypto/arm/blake2s-glue.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include - -/* defined in blake2s-core.S */ -EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h new file mode 100644 index 00000000000000..aa7a97139ea74a --- /dev/null +++ b/lib/crypto/arm/blake2s.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* defined in blake2s-core.S */ +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, u32 inc); diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha.h similarity index 76% rename from lib/crypto/arm/chacha-glue.c rename to lib/crypto/arm/chacha.h index 88ec9641528319..0cae30f8ee5d15 100644 --- a/lib/crypto/arm/chacha-glue.c +++ b/lib/crypto/arm/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * ChaCha and HChaCha functions (ARM optimized) * @@ -6,11 +6,9 @@ * Copyright (C) 2015 Martin Willi */ -#include #include #include #include -#include #include #include @@ -64,8 +62,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { hchacha_block_arm(state, out, nrounds); @@ -75,10 +73,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_neon_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || bytes <= CHACHA_BLOCK_SIZE) { @@ -99,16 +96,9 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. 
*/ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_arm_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { switch (read_cpuid_part()) { @@ -124,15 +114,4 @@ static int __init chacha_arm_mod_init(void) static_branch_enable(&use_neon); } } - return 0; } -subsys_initcall(chacha_arm_mod_init); - -static void __exit chacha_arm_mod_exit(void) -{ -} -module_exit(chacha_arm_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/crypto/curve25519-core.S b/lib/crypto/arm/curve25519-core.S similarity index 100% rename from arch/arm/crypto/curve25519-core.S rename to lib/crypto/arm/curve25519-core.S diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h new file mode 100644 index 00000000000000..f6d66494eb8f88 --- /dev/null +++ b/lib/crypto/arm/curve25519.h @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This + * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been + * manually reworked for use in kernel space. + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + curve25519_neon(out, scalar, point); + kernel_neon_end(); + } else { + curve25519_generic(out, scalar, point); + } +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_arch(pub, secret, curve25519_base_point); +} + +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl index dd7a996361a719..34c11b7b44bd76 100644 --- a/lib/crypto/arm/poly1305-armv4.pl +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -43,9 +43,8 @@ #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_block_init_arch +# define poly1305_init poly1305_block_init # define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arch #endif #if defined(__thumb2__) diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c deleted file mode 100644 index 2d86c78af8837b..00000000000000 --- a/lib/crypto/arm/poly1305-glue.c +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks_arm(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init arm_poly1305_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - (elf_hwcap & HWCAP_NEON)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(arm_poly1305_mod_init); - -static void __exit arm_poly1305_mod_exit(void) -{ -} -module_exit(arm_poly1305_mod_exit); - -MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h new file mode 100644 index 00000000000000..0021cf368307c7 --- /dev/null +++ b/lib/crypto/arm/poly1305.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h index fa1e9241900062..29f8bcad0447c3 100644 --- a/lib/crypto/arm/sha1.h +++ b/lib/crypto/arm/sha1.h @@ -35,7 +35,7 @@ static void sha1_blocks(struct sha1_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h index da75cbdc51d413..7556457b3094b4 100644 --- a/lib/crypto/arm/sha256.h +++ b/lib/crypto/arm/sha256.h @@ -5,7 +5,10 @@ * Copyright 2025 Google LLC */ #include -#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); asmlinkage void sha256_block_data_order(struct sha256_block_state *state, const u8 *data, size_t nblocks); @@ -14,14 +17,11 @@ asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, asmlinkage void sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { kernel_neon_begin(); if (static_branch_likely(&have_ce)) sha256_ce_transform(state, data, nblocks); @@ -35,7 +35,7 @@ static void sha256_blocks(struct sha256_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (elf_hwcap & HWCAP_NEON) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h index f147b6490d6cd4..d1b485dd275db8 100644 --- a/lib/crypto/arm/sha512.h +++ b/lib/crypto/arm/sha512.h @@ -4,9 +4,8 @@ * * Copyright 2025 Google LLC */ - #include -#include +#include static __ro_after_init 
DEFINE_STATIC_KEY_FALSE(have_neon); @@ -19,7 +18,7 @@ static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { kernel_neon_begin(); sha512_block_data_order_neon(state, data, nblocks); kernel_neon_end(); @@ -30,7 +29,7 @@ static void sha512_blocks(struct sha512_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_has_neon()) static_branch_enable(&have_neon); diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig deleted file mode 100644 index 0b903ef524d857..00000000000000 --- a/lib/crypto/arm64/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile deleted file mode 100644 index 6207088397a73e..00000000000000 --- a/lib/crypto/arm64/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch -AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha.h similarity index 75% rename from lib/crypto/arm64/chacha-neon-glue.c rename to lib/crypto/arm64/chacha.h index d0188f974ca5c4..ba6c22d4608633 100644 --- a/lib/crypto/arm64/chacha-neon-glue.c +++ b/lib/crypto/arm64/chacha.h @@ -18,11 +18,9 @@ * (at your option) any later version. 
*/ -#include #include #include #include -#include #include #include @@ -61,8 +59,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { hchacha_block_generic(state, out, nrounds); @@ -72,10 +70,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_neon_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) @@ -93,27 +90,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_neon); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) static_branch_enable(&have_neon); - return 0; } -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl index 22c9069c065054..f1930c6b55cee5 100644 --- a/lib/crypto/arm64/poly1305-armv8.pl +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -50,6 +50,9 @@ #ifndef __KERNEL__ # include "arm_arch.h" .extern OPENSSL_armcap_P +#else +# define poly1305_init poly1305_block_init +# define poly1305_blocks poly1305_blocks_arm64 #endif .text diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c deleted file mode 100644 index 31aea21ce42f79..00000000000000 --- a/lib/crypto/arm64/poly1305-glue.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon) && likely(may_use_simd())) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init neon_poly1305_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(neon_poly1305_mod_init); - -static void __exit neon_poly1305_mod_exit(void) -{ -} -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h new file mode 100644 index 00000000000000..aed5921ccd9a12 --- /dev/null +++ b/lib/crypto/arm64/poly1305.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm64(state, src, len, padbit); +} + +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h index f822563538cc87..aaef4ebfc5e34a 100644 --- a/lib/crypto/arm64/sha1.h +++ b/lib/crypto/arm64/sha1.h @@ -32,7 +32,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (cpu_have_named_feature(SHA1)) static_branch_enable(&have_ce); diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index b99d9589c42175..410174ba52373b 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -70,18 +70,22 @@ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .macro load_round_constants tmp + adr_l \tmp, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [\tmp], #64 + ld1 { v4.4s- v7.4s}, [\tmp], #64 + ld1 { v8.4s-v11.4s}, [\tmp], #64 + ld1 {v12.4s-v15.4s}, [\tmp] + .endm + /* * size_t __sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] + + load_round_constants x8 /* load state */ ld1 {dgav.4s, dgbv.4s}, [x0] @@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x0, x2 ret SYM_FUNC_END(__sha256_ce_transform) + + .unreq dga + .unreq dgav + .unreq dgb + .unreq dgbv + .unreq t0 + .unreq t1 + .unreq dg0q + .unreq dg0v + .unreq dg1q + .unreq dg1v + .unreq dg2q + .unreq dg2v + + // parameters for sha256_ce_finup2x() + ctx .req x0 + data1 .req x1 + data2 .req x2 + len .req w3 + out1 .req x4 + out2 .req x5 + + // other scalar variables + count .req x6 + final_step .req w7 + + // x8-x9 are used as temporaries. + + // v0-v15 are used to cache the SHA-256 round constants. + // v16-v19 are used for the message schedule for the first message. + // v20-v23 are used for the message schedule for the second message. + // v24-v31 are used for the state and temporaries as given below. + // *_a are for the first message and *_b for the second. 
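
The register partitioning described in the comment above is what enables the two-message interleave: each message keeps its own schedule (v16-v19 vs. v20-v23) and state (v24/v25 vs. v26/v27), and the round macros that follow issue the sha256h/sha256su instructions for the two messages alternately, so the serial dependency chain of one hash fills the pipeline bubbles of the other. Conceptually, in plain C — a sketch of the idea only, with a hypothetical round4() helper, not the assembly below:

/*
 * Sketch of the interleaving idea; the real rounds are the
 * do_4rounds_2x/do_16rounds_2x macros that follow.
 */
for (int i = 0; i < 64; i += 4) {
	round4(&state_a, sched_a, i);	/* 4 rounds of message A */
	round4(&state_b, sched_b, i);	/* 4 rounds of message B; independent
					 * of message A, so a superscalar core
					 * can overlap the two chains */
}
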
+ state0_a_q .req q24 + state0_a .req v24 + state1_a_q .req q25 + state1_a .req v25 + state0_b_q .req q26 + state0_b .req v26 + state1_b_q .req q27 + state1_b .req v27 + t0_a .req v28 + t0_b .req v29 + t1_a_q .req q30 + t1_a .req v30 + t1_b_q .req q31 + t1_b .req v31 + +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) +// offsetof(struct __sha256_ctx, state) is assumed to be 0. + + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a + // and m0_b contain the current 4 message schedule words for the first + // and second message respectively. + // + // If not all the message schedule words have been computed yet, then + // this also computes 4 more message schedule words for each message. + // m1_a-m3_a contain the next 3 groups of 4 message schedule words for + // the first message, and likewise m1_b-m3_b for the second. After + // consuming the current value of m0_a, this macro computes the group + // after m3_a and writes it to m0_a, and likewise for *_b. This means + // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a, + // m3_a, m0_a), and likewise for *_b, so the caller must cycle through + // the registers accordingly. + .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \ + m0_b, m1_b, m2_b, m3_b + add t0_a\().4s, \m0_a\().4s, \k\().4s + add t0_b\().4s, \m0_b\().4s, \k\().4s + .if \i < 48 + sha256su0 \m0_a\().4s, \m1_a\().4s + sha256su0 \m0_b\().4s, \m1_b\().4s + sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s + sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s + .endif + mov t1_a.16b, state0_a.16b + mov t1_b.16b, state0_b.16b + sha256h state0_a_q, state1_a_q, t0_a\().4s + sha256h state0_b_q, state1_b_q, t0_b\().4s + sha256h2 state1_a_q, t1_a_q, t0_a\().4s + sha256h2 state1_b_q, t1_b_q, t0_b\().4s + .endm + + .macro do_16rounds_2x i, k0, k1, k2, k3 + do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 + do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 + do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21 + do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 + .endm + +// +// void sha256_ce_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ce_finup2x) + sub sp, sp, #128 + mov final_step, #0 + load_round_constants x8 + + // Load the initial state from ctx->state. + ld1 {state0_a.4s-state1_a.4s}, [ctx] + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // len added to it. + ldr x8, [ctx, #OFFSETOF_BYTECOUNT] + add count, x8, len, sxtw + and x8, x8, #63 + cbz x8, .Lfinup2x_enter_loop // No bytes buffered? + + // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - x8 bytes of data. 
Since len >= 64, we + // just load 64 bytes from each of ctx->buf, data1, and data2 + // unconditionally and rearrange the data as needed. + add x9, ctx, #OFFSETOF_BUF + ld1 {v16.16b-v19.16b}, [x9] + st1 {v16.16b-v19.16b}, [sp] + + ld1 {v16.16b-v19.16b}, [data1], #64 + add x9, sp, x8 + st1 {v16.16b-v19.16b}, [x9] + ld1 {v16.4s-v19.4s}, [sp] + + ld1 {v20.16b-v23.16b}, [data2], #64 + st1 {v20.16b-v23.16b}, [x9] + ld1 {v20.4s-v23.4s}, [sp] + + sub len, len, #64 + sub data1, data1, x8 + sub data2, data2, x8 + add len, len, w8 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b + b .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub len, len, #64 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b +.Lfinup2x_loop: + // Load the next two data blocks. + ld1 {v16.4s-v19.4s}, [data1], #64 + ld1 {v20.4s-v23.4s}, [data2], #64 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) +CPU_LE( rev32 v20.16b, v20.16b ) +CPU_LE( rev32 v21.16b, v21.16b ) +CPU_LE( rev32 v22.16b, v22.16b ) +CPU_LE( rev32 v23.16b, v23.16b ) +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + st1 {state0_a.4s-state1_b.4s}, [sp] + + // Do the SHA-256 rounds on each block. + do_16rounds_2x 0, v0, v1, v2, v3 + do_16rounds_2x 16, v4, v5, v6, v7 + do_16rounds_2x 32, v8, v9, v10, v11 + do_16rounds_2x 48, v12, v13, v14, v15 + + // Add the original state for each block. + ld1 {v16.4s-v19.4s}, [sp] + add state0_a.4s, state0_a.4s, v16.4s + add state1_a.4s, state1_a.4s, v17.4s + add state0_b.4s, state0_b.4s, v18.4s + add state1_b.4s, state1_b.4s, v19.4s + + // Update len and loop back if more blocks remain. + sub len, len, #64 + tbz len, #31, .Lfinup2x_loop // len >= 0? + + // Check if any final blocks need to be handled. + // final_step = 2: all done + // final_step = 1: need to do count-only padding block + // final_step = 0: need to do the block with 0x80 padding byte + tbnz final_step, #1, .Lfinup2x_done + tbnz final_step, #0, .Lfinup2x_finalize_countonly + add len, len, #64 + cbz len, .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - len] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + sub w8, len, #64 // w8 = len - 64 + add data1, data1, w8, sxtw // data1 += len - 64 + add data2, data2, w8, sxtw // data2 += len - 64 +CPU_LE( mov x9, #0x80 ) +CPU_LE( fmov d16, x9 ) +CPU_BE( movi v16.16b, #0 ) +CPU_BE( mov x9, #0x8000000000000000 ) +CPU_BE( mov v16.d[1], x9 ) + movi v17.16b, #0 + stp q16, q17, [sp, #64] + stp q17, q17, [sp, #96] + sub x9, sp, w8, sxtw // x9 = &sp[64 - len] + cmp len, #56 + b.ge 1f // will count spill into its own block? 
+ lsl count, count, #3 +CPU_LE( rev count, count ) + str count, [x9, #56] + mov final_step, #2 // won't need count-only block + b 2f +1: + mov final_step, #1 // will need count-only block +2: + ld1 {v16.16b-v19.16b}, [data1] + st1 {v16.16b-v19.16b}, [sp] + ld1 {v16.4s-v19.4s}, [x9] + ld1 {v20.16b-v23.16b}, [data2] + st1 {v20.16b-v23.16b}, [sp] + ld1 {v20.4s-v23.4s}, [x9] + b .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + movi v16.2d, #0 + b 1f +.Lfinup2x_finalize_blockaligned: + mov x8, #0x80000000 + fmov d16, x8 +1: + movi v17.2d, #0 + movi v18.2d, #0 + ror count, count, #29 // ror(lsl(count, 3), 32) + mov v19.d[0], xzr + mov v19.d[1], count + mov v20.16b, v16.16b + movi v21.2d, #0 + movi v22.2d, #0 + mov v23.16b, v19.16b + mov final_step, #2 + b .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. +CPU_LE( rev32 state0_a.16b, state0_a.16b ) +CPU_LE( rev32 state1_a.16b, state1_a.16b ) +CPU_LE( rev32 state0_b.16b, state0_b.16b ) +CPU_LE( rev32 state1_b.16b, state1_b.16b ) + st1 {state0_a.4s-state1_a.4s}, [out1] + st1 {state0_b.4s-state1_b.4s}, [out2] + add sp, sp, #128 + ret +SYM_FUNC_END(sha256_ce_finup2x) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index a211966c124a96..80d06df27d3a39 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -5,9 +5,12 @@ * Copyright 2025 Google LLC */ #include -#include +#include #include +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + asmlinkage void sha256_block_data_order(struct sha256_block_state *state, const u8 *data, size_t nblocks); asmlinkage void sha256_block_neon(struct sha256_block_state *state, @@ -15,14 +18,11 @@ asmlinkage void sha256_block_neon(struct sha256_block_state *state, asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon) && crypto_simd_usable()) { + static_branch_likely(&have_neon) && likely(may_use_simd())) { if (static_branch_likely(&have_ce)) { do { size_t rem; @@ -44,9 +44,46 @@ static void sha256_blocks(struct sha256_block_state *state, } } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. 
(Of course, in practice len is nearly always 4096 anyway.) + */ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(may_use_simd())) { + kernel_neon_begin(); + sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); + kernel_neon_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_ce); +} + #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (cpu_have_named_feature(ASIMD)) { static_branch_enable(&have_neon); diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h index 6abb40b467f2ec..ddb0d256f73aa3 100644 --- a/lib/crypto/arm64/sha512.h +++ b/lib/crypto/arm64/sha512.h @@ -4,9 +4,8 @@ * * Copyright 2025 Google LLC */ - #include -#include +#include #include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); @@ -21,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_sha512_insns) && - likely(crypto_simd_usable())) { + likely(may_use_simd())) { do { size_t rem; @@ -38,7 +37,7 @@ static void sha512_blocks(struct sha512_block_state *state, #ifdef CONFIG_KERNEL_MODE_NEON #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_have_named_feature(SHA512)) static_branch_enable(&have_sha512_insns); diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c deleted file mode 100644 index 9828176a2efec7..00000000000000 --- a/lib/crypto/blake2s-generic.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * This is an implementation of the BLAKE2s hash and PRF functions. 
- * - * Information: https://blake2.net/ - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static const u8 blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, -}; - -static inline void blake2s_increment_counter(struct blake2s_state *state, - const u32 inc) -{ - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); -} - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) - __weak __alias(blake2s_compress_generic); - -void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - u32 m[16]; - u32 v[16]; - int i; - - WARN_ON(IS_ENABLED(DEBUG) && - (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); - - while (nblocks > 0) { - blake2s_increment_counter(state, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); - le32_to_cpu_array(m, ARRAY_SIZE(m)); - memcpy(v, state->h, 32); - v[ 8] = BLAKE2S_IV0; - v[ 9] = BLAKE2S_IV1; - v[10] = BLAKE2S_IV2; - v[11] = BLAKE2S_IV3; - v[12] = BLAKE2S_IV4 ^ state->t[0]; - v[13] = BLAKE2S_IV5 ^ state->t[1]; - v[14] = BLAKE2S_IV6 ^ state->f[0]; - v[15] = BLAKE2S_IV7 ^ state->f[1]; - -#define G(r, i, a, b, c, d) do { \ - a += b + m[blake2s_sigma[r][2 * i + 0]]; \ - d = ror32(d ^ a, 16); \ - c += d; \ - b = ror32(b ^ c, 12); \ - a += b + m[blake2s_sigma[r][2 * i + 1]]; \ - d = ror32(d ^ a, 8); \ - c += d; \ - b = ror32(b ^ c, 7); \ -} while (0) - -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - -#undef G -#undef ROUND - - for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; - - block += BLAKE2S_BLOCK_SIZE; - --nblocks; - } -} - -EXPORT_SYMBOL(blake2s_compress_generic); diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c deleted file mode 100644 index d0634ed6a937fb..00000000000000 --- a/lib/crypto/blake2s-selftest.c +++ /dev/null @@ -1,651 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
- */ - -#include -#include -#include -#include - -/* - * blake2s_testvecs[] generated with the program below (using libb2-dev and - * libssl-dev [OpenSSL]) - * - * #include - * #include - * #include - * - * #include - * - * #define BLAKE2S_TESTVEC_COUNT 256 - * - * static void print_vec(const uint8_t vec[], int len) - * { - * int i; - * - * printf(" { "); - * for (i = 0; i < len; i++) { - * if (i && (i % 12) == 0) - * printf("\n "); - * printf("0x%02x, ", vec[i]); - * } - * printf("},\n"); - * } - * - * int main(void) - * { - * uint8_t key[BLAKE2S_KEYBYTES]; - * uint8_t buf[BLAKE2S_TESTVEC_COUNT]; - * uint8_t hash[BLAKE2S_OUTBYTES]; - * int i, j; - * - * key[0] = key[1] = 1; - * for (i = 2; i < BLAKE2S_KEYBYTES; ++i) - * key[i] = key[i - 2] + key[i - 1]; - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) - * buf[i] = (uint8_t)i; - * - * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n"); - * - * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) { - * int outlen = 1 + i % BLAKE2S_OUTBYTES; - * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1); - * - * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i, - * keylen); - * print_vec(hash, outlen); - * } - * printf("};\n\n"); - * - * return 0; - *} - */ -static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = { - { 0xa1, }, - { 0x7c, 0x89, }, - { 0x74, 0x0e, 0xd4, }, - { 0x47, 0x0c, 0x21, 0x15, }, - { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, }, - { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, }, - { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, }, - { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, }, - { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, }, - { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, }, - { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, }, - { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, }, - { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda, - 0xb7, }, - { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25, - 0x52, 0x3e, }, - { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc, - 0x57, 0xe2, 0x27, }, - { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6, - 0x47, 0x2a, 0xc7, 0xf5, }, - { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43, - 0xe7, 0x72, 0x08, 0xec, 0xbe, }, - { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96, - 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, }, - { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e, - 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, }, - { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d, - 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, }, - { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86, - 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, }, - { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41, - 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, }, - { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22, - 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, }, - { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00, - 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, }, - { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9, - 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13, - 0xd1, }, - { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05, - 
0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07, - 0x01, 0x3e, }, - { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74, - 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b, - 0xec, 0x13, 0xed, }, - { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7, - 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37, - 0x72, 0x67, 0x09, 0xfc, }, - { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38, - 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c, - 0x95, 0x29, 0x19, 0xf2, 0x4b, }, - { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22, - 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30, - 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, }, - { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e, - 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd, - 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, }, - { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe, - 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61, - 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, }, - { 0x0a, }, - { 0x6e, 0xd4, }, - { 0x64, 0xe9, 0xd1, }, - { 0x30, 0xdd, 0x71, 0xef, }, - { 0x11, 0xb5, 0x0c, 0x87, 0xc9, }, - { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, }, - { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, }, - { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, }, - { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, }, - { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, }, - { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, }, - { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, }, - { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf, - 0xe2, }, - { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64, - 0x7e, 0xb0, }, - { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51, - 0x98, 0x0a, 0x8d, }, - { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04, - 0xf1, 0xc3, 0x94, 0xd8, }, - { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7, - 0x2c, 0xe8, 0x8f, 0xc7, 0x34, }, - { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41, - 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, }, - { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81, - 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, }, - { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4, - 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, }, - { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a, - 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, }, - { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb, - 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, }, - { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9, - 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, }, - { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9, - 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, }, - { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e, - 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42, - 0xe3, }, - { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9, - 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e, - 0xe3, 0x90, }, - { 0x90, 0xdd, 
0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3, - 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10, - 0x58, 0x06, 0x9a, }, - { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5, - 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d, - 0x53, 0x03, 0x1a, 0x39, }, - { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0, - 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5, - 0xd4, 0x36, 0xb1, 0xac, 0x73, }, - { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59, - 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90, - 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, }, - { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28, - 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75, - 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, }, - { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6, - 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77, - 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, }, - { 0x9d, }, - { 0x9d, 0x7d, }, - { 0xfd, 0xc3, 0xda, }, - { 0xe8, 0x82, 0xcd, 0x21, }, - { 0xc3, 0x1d, 0x42, 0x4c, 0x74, }, - { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, }, - { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, }, - { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, }, - { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, }, - { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, }, - { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, }, - { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, }, - { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3, - 0x63, }, - { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f, - 0xe6, 0xa9, }, - { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a, - 0xf6, 0x0e, 0x20, }, - { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39, - 0x18, 0x3e, 0xc7, 0xc3, }, - { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c, - 0x92, 0xfc, 0x7f, 0x34, 0x41, }, - { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb, - 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, }, - { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96, - 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, }, - { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6, - 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, }, - { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba, - 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, }, - { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0, - 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, }, - { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d, - 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, }, - { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59, - 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, }, - { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89, - 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e, - 0xaf, }, - { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc, - 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7, - 0xb1, 0x1d, }, - { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9, - 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 
0x43, 0x02, 0xee, 0x58, 0xaf, - 0xee, 0x6a, 0xbe, }, - { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8, - 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd, - 0x5c, 0xef, 0xa3, 0x25, }, - { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6, - 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8, - 0x4b, 0x0e, 0xe3, 0x94, 0x9f, }, - { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc, - 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf, - 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, }, - { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76, - 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46, - 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, }, - { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02, - 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3, - 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, }, - { 0x02, }, - { 0x52, 0xa8, }, - { 0x38, 0x25, 0x0d, }, - { 0xe3, 0x04, 0xd4, 0x92, }, - { 0x97, 0xdb, 0xf7, 0x81, 0xca, }, - { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, }, - { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, }, - { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, }, - { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, }, - { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, }, - { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, }, - { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, }, - { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa, - 0xd3, }, - { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c, - 0xb7, 0x9a, }, - { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31, - 0x3d, 0x47, 0x3d, }, - { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0, - 0x24, 0xf0, 0x17, 0xc0, }, - { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76, - 0xd2, 0xfd, 0x0c, 0x47, 0x40, }, - { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae, - 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, }, - { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee, - 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, }, - { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d, - 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, }, - { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61, - 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, }, - { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd, - 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, }, - { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5, - 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, }, - { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17, - 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, }, - { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c, - 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c, - 0xae, }, - { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5, - 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d, - 0x59, 0x00, }, - { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e, - 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c, - 0x5a, 0x2c, 0x3b, }, - { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 
0x55, 0x33, 0x56, 0xda, - 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c, - 0x10, 0x07, 0x2a, 0xc4, }, - { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14, - 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf, - 0xee, 0xa1, 0x74, 0xd6, 0x71, }, - { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33, - 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff, - 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, }, - { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c, - 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44, - 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, }, - { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff, - 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23, - 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, }, - { 0xbe, }, - { 0x17, 0x6c, }, - { 0x1a, 0x42, 0x4f, }, - { 0xba, 0xaf, 0xb7, 0x65, }, - { 0xc2, 0x63, 0x43, 0x6a, 0xea, }, - { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, }, - { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, }, - { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, }, - { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, }, - { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, }, - { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, }, - { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, }, - { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92, - 0xe9, }, - { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d, - 0xc4, 0xb3, }, - { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60, - 0xd7, 0xd3, 0x5e, }, - { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16, - 0xaf, 0x39, 0xbd, 0x92, }, - { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10, - 0xd9, 0xa7, 0x66, 0x27, 0xcf, }, - { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02, - 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, }, - { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd, - 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, }, - { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3, - 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, }, - { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f, - 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, }, - { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51, - 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, }, - { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3, - 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, }, - { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda, - 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, }, - { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6, - 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a, - 0x26, }, - { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24, - 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35, - 0xf4, 0x1d, }, - { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9, - 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46, - 0x4e, 0x29, 0x26, }, - { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09, - 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98, - 0x1a, 
0x5b, 0xff, 0xc9, }, - { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2, - 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69, - 0xbf, 0xf6, 0xbe, 0x3c, 0x40, }, - { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31, - 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20, - 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, }, - { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4, - 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30, - 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, }, - { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0, - 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5, - 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, }, - { 0x1f, }, - { 0x82, 0x60, }, - { 0xcc, 0xe3, 0x08, }, - { 0x56, 0x17, 0xe4, 0x59, }, - { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, }, - { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, }, - { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, }, - { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, }, - { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, }, - { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, }, - { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, }, - { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, }, - { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc, - 0x5b, }, - { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0, - 0x90, 0x48, }, - { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60, - 0x04, 0xd9, 0xd5, }, - { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47, - 0xe5, 0x31, 0x06, 0x70, }, - { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c, - 0x70, 0x25, 0xdb, 0x33, 0x27, }, - { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a, - 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, }, - { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3, - 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, }, - { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7, - 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, }, - { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac, - 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, }, - { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98, - 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, }, - { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11, - 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, }, - { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a, - 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, }, - { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41, - 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70, - 0x88, }, - { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4, - 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3, - 0xc6, 0xbb, }, - { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55, - 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5, - 0x55, 0xfd, 0x4f, }, - { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e, - 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2, - 0x42, 0x64, 0x3b, 0x1a, }, - { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95, - 0x0a, 
0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5, - 0x1d, 0x60, 0x8f, 0x8a, 0x1d, }, - { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4, - 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d, - 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, }, - { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b, - 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3, - 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, }, - { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf, - 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f, - 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, }, - { 0x60, }, - { 0x24, 0x26, }, - { 0x47, 0xeb, 0xc9, }, - { 0x4a, 0xd0, 0xbc, 0xf0, }, - { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, }, - { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, }, - { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, }, - { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, }, - { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, }, - { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, }, - { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, }, - { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, }, - { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91, - 0x8d, }, - { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33, - 0xbf, 0xa0, }, - { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c, - 0xd5, 0x4b, 0xed, }, - { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44, - 0xc8, 0x24, 0x0a, 0xb7, }, - { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75, - 0xb2, 0x15, 0xd1, 0xb1, 0x10, }, - { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35, - 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, }, - { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9, - 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, }, - { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47, - 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, }, - { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35, - 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, }, - { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1, - 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, }, - { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21, - 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, }, - { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4, - 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, }, - { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7, - 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7, - 0xd3, }, - { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb, - 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16, - 0xa6, 0xd6, }, - { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80, - 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88, - 0x55, 0xd1, 0xa9, }, - { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54, - 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46, - 0xef, 0xb0, 0x00, 0x8d, }, - { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8, - 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94, - 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, }, - { 
0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4, - 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67, - 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, }, - { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02, - 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04, - 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, }, - { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28, - 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca, - 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, }, - { 0x7e, }, - { 0x1e, 0x21, }, - { 0x26, 0xd3, 0xdd, }, - { 0x2c, 0xd4, 0xb3, 0x3d, }, - { 0x86, 0x7b, 0x76, 0x3c, 0xf0, }, - { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, }, - { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, }, - { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, }, - { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, }, - { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, }, - { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, }, - { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, }, - { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77, - 0x66, }, - { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31, - 0x55, 0x66, }, - { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12, - 0x43, 0xbf, 0xa1, }, - { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e, - 0x95, 0x8a, 0xc5, 0xed, }, - { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef, - 0xf6, 0x2b, 0x3a, 0xba, 0x60, }, - { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2, - 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, }, - { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b, - 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, }, - { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94, - 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, }, - { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b, - 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, }, - { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf, - 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, }, - { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8, - 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, }, - { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea, - 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, }, - { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2, - 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0, - 0xd7, }, - { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b, - 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd, - 0xb6, 0xef, }, - { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6, - 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57, - 0xef, 0xf8, 0x53, }, - { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46, - 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89, - 0x89, 0xfd, 0xf7, 0x99, }, - { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03, - 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a, - 0xa9, 0x8f, 0x2b, 0x59, 0xfb, }, - { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb, - 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 
0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d, - 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, }, - { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0, - 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d, - 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, }, - { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c, - 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92, - 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, }, -}; - -static bool __init noinline_for_stack blake2s_digest_test(void) -{ - u8 key[BLAKE2S_KEY_SIZE]; - u8 buf[ARRAY_SIZE(blake2s_testvecs)]; - u8 hash[BLAKE2S_HASH_SIZE]; - struct blake2s_state state; - bool success = true; - int i, l; - - key[0] = key[1] = 1; - for (i = 2; i < sizeof(key); ++i) - key[i] = key[i - 2] + key[i - 1]; - - for (i = 0; i < sizeof(buf); ++i) - buf[i] = (u8)i; - - for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) { - int outlen = 1 + i % BLAKE2S_HASH_SIZE; - int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1); - - blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i, - keylen); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s self-test %d: FAIL\n", i + 1); - success = false; - } - - if (!keylen) - blake2s_init(&state, outlen); - else - blake2s_init_key(&state, outlen, - key + BLAKE2S_KEY_SIZE - keylen, - keylen); - - blake2s_update(&state, buf, l); - blake2s_update(&state, buf + l, i - l); - blake2s_final(&state, hash); - if (memcmp(hash, blake2s_testvecs[i], outlen)) { - pr_err("blake2s init/update/final self-test %d: FAIL\n", - i + 1); - success = false; - } - } - - return success; -} - -static bool __init noinline_for_stack blake2s_random_test(void) -{ - struct blake2s_state state; - bool success = true; - int i, l; - - for (i = 0; i < 32; ++i) { - enum { TEST_ALIGNMENT = 16 }; - u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1] - __aligned(TEST_ALIGNMENT); - u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE; - struct blake2s_state state1, state2; - - get_random_bytes(blocks, sizeof(blocks)); - get_random_bytes(&state, sizeof(state)); - -#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \ - defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S) - memcpy(&state1, &state, sizeof(state1)); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE); - blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress self-test %d: FAIL\n", - i + 1); - success = false; - } -#endif - - memcpy(&state1, &state, sizeof(state1)); - blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE); - for (l = 1; l < TEST_ALIGNMENT; ++l) { - memcpy(unaligned_block + l, blocks, - BLAKE2S_BLOCK_SIZE); - memcpy(&state2, &state, sizeof(state2)); - blake2s_compress(&state2, unaligned_block + l, 1, - BLAKE2S_BLOCK_SIZE); - if (memcmp(&state1, &state2, sizeof(state1))) { - pr_err("blake2s random compress align %d self-test %d: FAIL\n", - l, i + 1); - success = false; - } - } - } - - return success; -} - -bool __init blake2s_selftest(void) -{ - bool success; - - success = blake2s_digest_test(); - success &= blake2s_random_test(); - - return success; -} diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index f6ec68c3dcdae6..5638ed9d882d8b 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -8,15 +8,108 @@ * */ -#include +#include #include #include -#include #include #include #include #include +static const u8 blake2s_sigma[10][16] = { 
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +static inline void blake2s_increment_counter(struct blake2s_state *state, + const u32 inc) +{ + state->t[0] += inc; + state->t[1] += (state->t[0] < inc); +} + +static void __maybe_unused +blake2s_compress_generic(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + u32 m[16]; + u32 v[16]; + int i; + + WARN_ON(IS_ENABLED(DEBUG) && + (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); + + while (nblocks > 0) { + blake2s_increment_counter(state, inc); + memcpy(m, block, BLAKE2S_BLOCK_SIZE); + le32_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, state->h, 32); + v[ 8] = BLAKE2S_IV0; + v[ 9] = BLAKE2S_IV1; + v[10] = BLAKE2S_IV2; + v[11] = BLAKE2S_IV3; + v[12] = BLAKE2S_IV4 ^ state->t[0]; + v[13] = BLAKE2S_IV5 ^ state->t[1]; + v[14] = BLAKE2S_IV6 ^ state->f[0]; + v[15] = BLAKE2S_IV7 ^ state->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2s_sigma[r][2 * i + 0]]; \ + d = ror32(d ^ a, 16); \ + c += d; \ + b = ror32(b ^ c, 12); \ + a += b + m[blake2s_sigma[r][2 * i + 1]]; \ + d = ror32(d ^ a, 8); \ + c += d; \ + b = ror32(b ^ c, 7); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + state->h[i] ^= v[i] ^ v[i + 8]; + + block += BLAKE2S_BLOCK_SIZE; + --nblocks; + } +} + +#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH +#include "blake2s.h" /* $(SRCARCH)/blake2s.h */ +#else +#define blake2s_compress blake2s_compress_generic +#endif + static inline void blake2s_set_lastblock(struct blake2s_state *state) { state->f[0] = -1; @@ -59,14 +152,14 @@ void blake2s_final(struct blake2s_state *state, u8 *out) } EXPORT_SYMBOL(blake2s_final); +#ifdef blake2s_mod_init_arch static int __init blake2s_mod_init(void) { - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!blake2s_selftest())) - return -ENODEV; + blake2s_mod_init_arch(); return 0; } +subsys_initcall(blake2s_mod_init); +#endif -module_init(blake2s_mod_init); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. 
Donenfeld "); diff --git a/lib/crypto/chacha-block-generic.c b/lib/crypto/chacha-block-generic.c new file mode 100644 index 00000000000000..77f68de71066a9 --- /dev/null +++ b/lib/crypto/chacha-block-generic.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) + * + * Copyright (C) 2015 Martin Willi + */ + +#include +#include +#include +#include +#include +#include +#include + +static void chacha_permute(struct chacha_state *state, int nrounds) +{ + u32 *x = state->x; + int i; + + /* whitelist the allowed round counts */ + WARN_ON_ONCE(nrounds != 20 && nrounds != 12); + + for (i = 0; i < nrounds; i += 2) { + x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); + } +} + +/** + * chacha_block_generic - generate one keystream block and increment block counter + * @state: input state matrix + * @out: output keystream block + * @nrounds: number of rounds (20 or 12; 20 is recommended) + * + * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. + * The caller has already converted the endianness of the input. This function + * also handles incrementing the block counter in the input matrix. + */ +void chacha_block_generic(struct chacha_state *state, + u8 out[CHACHA_BLOCK_SIZE], int nrounds) +{ + struct chacha_state permuted_state = *state; + int i; + + chacha_permute(&permuted_state, nrounds); + + for (i = 0; i < ARRAY_SIZE(state->x); i++) + put_unaligned_le32(permuted_state.x[i] + state->x[i], + &out[i * sizeof(u32)]); + + state->x[12]++; +} +EXPORT_SYMBOL(chacha_block_generic); + +/** + * hchacha_block_generic - abbreviated ChaCha core, for XChaCha + * @state: input state matrix + * @out: the output words + * @nrounds: number of rounds (20 or 12; 20 is recommended) + * + * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step + * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). 
HChaCha + * skips the final addition of the initial state, and outputs only certain words + * of the state. It should not be used for streaming directly. + */ +void hchacha_block_generic(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + struct chacha_state permuted_state = *state; + + chacha_permute(&permuted_state, nrounds); + + memcpy(&out[0], &permuted_state.x[0], 16); + memcpy(&out[4], &permuted_state.x[12], 16); +} +EXPORT_SYMBOL(hchacha_block_generic); diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index 77f68de71066a9..e0c7cb4af31800 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -1,114 +1,70 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) + * The ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ +#include // for crypto_xor_cpy #include -#include -#include #include #include -#include -#include +#include -static void chacha_permute(struct chacha_state *state, int nrounds) +static void __maybe_unused +chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) { - u32 *x = state->x; - int i; - - /* whitelist the allowed round counts */ - WARN_ON_ONCE(nrounds != 20 && nrounds != 12); - - for (i = 0; i < nrounds; i += 2) { - x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); - x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); - x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); - x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); - - x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); - x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); - x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); - x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); - - x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); - x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); - x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); - x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); - - x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); - x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); - x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); - x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); - - x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); - x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); - x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); - x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); - - x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); - x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); - x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); - x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); - - x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); - x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); - x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); - x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); - - x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); - x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); - x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); - x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); + /* aligned to potentially speed up crypto_xor() */ + u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); + + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block_generic(state, stream, nrounds); + crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE); + bytes -= CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + } + if (bytes) { + chacha_block_generic(state, stream, nrounds); + crypto_xor_cpy(dst, src, stream, bytes); } } -/** - * chacha_block_generic - generate one keystream block and increment block counter - * @state: input state matrix - * @out: output keystream block - * @nrounds: number of rounds 
(20 or 12; 20 is recommended) - * - * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. - * The caller has already converted the endianness of the input. This function - * also handles incrementing the block counter in the input matrix. - */ -void chacha_block_generic(struct chacha_state *state, - u8 out[CHACHA_BLOCK_SIZE], int nrounds) -{ - struct chacha_state permuted_state = *state; - int i; - - chacha_permute(&permuted_state, nrounds); - - for (i = 0; i < ARRAY_SIZE(state->x); i++) - put_unaligned_le32(permuted_state.x[i] + state->x[i], - &out[i * sizeof(u32)]); +#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH +#include "chacha.h" /* $(SRCARCH)/chacha.h */ +#else +#define chacha_crypt_arch chacha_crypt_generic +#define hchacha_block_arch hchacha_block_generic +#endif - state->x[12]++; +void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + chacha_crypt_arch(state, dst, src, bytes, nrounds); } -EXPORT_SYMBOL(chacha_block_generic); +EXPORT_SYMBOL_GPL(chacha_crypt); -/** - * hchacha_block_generic - abbreviated ChaCha core, for XChaCha - * @state: input state matrix - * @out: the output words - * @nrounds: number of rounds (20 or 12; 20 is recommended) - * - * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step - * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha - * skips the final addition of the initial state, and outputs only certain words - * of the state. It should not be used for streaming directly. - */ -void hchacha_block_generic(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +void hchacha_block(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { - struct chacha_state permuted_state = *state; + hchacha_block_arch(state, out, nrounds); +} +EXPORT_SYMBOL_GPL(hchacha_block); - chacha_permute(&permuted_state, nrounds); +#ifdef chacha_mod_init_arch +static int __init chacha_mod_init(void) +{ + chacha_mod_init_arch(); + return 0; +} +subsys_initcall(chacha_mod_init); - memcpy(&out[0], &permuted_state.x[0], 16); - memcpy(&out[4], &permuted_state.x[12], 16); +static void __exit chacha_mod_exit(void) +{ } -EXPORT_SYMBOL(hchacha_block_generic); +module_exit(chacha_mod_exit); +#endif + +MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c deleted file mode 100644 index f8aa70c9f55988..00000000000000 --- a/lib/crypto/curve25519-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, - * depending on what is supported by the target compiler. - * - * Information: https://cr.yp.to/ecdh.html - */ - -#include -#include -#include - -const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; -const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; - -EXPORT_SYMBOL(curve25519_null_point); -EXPORT_SYMBOL(curve25519_base_point); -EXPORT_SYMBOL(curve25519_generic); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); -MODULE_AUTHOR("Jason A. 
Donenfeld "); diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c index 6850b76a80c9e3..01e265dfbcd904 100644 --- a/lib/crypto/curve25519.c +++ b/lib/crypto/curve25519.c @@ -2,32 +2,77 @@ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. * - * This is an implementation of the Curve25519 ECDH algorithm, using either - * a 32-bit implementation or a 64-bit implementation with 128-bit integers, + * This is an implementation of the Curve25519 ECDH algorithm, using either an + * architecture-optimized implementation or a generic implementation. The + * generic implementation is either 32-bit, or 64-bit with 128-bit integers, * depending on what is supported by the target compiler. * * Information: https://cr.yp.to/ecdh.html */ #include -#include +#include +#include #include +#include -static int __init curve25519_init(void) +static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; +static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 }; + +#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH +#include "curve25519.h" /* $(SRCARCH)/curve25519.h */ +#else +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) { - if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) && - WARN_ON(!curve25519_selftest())) - return -ENODEV; - return 0; + curve25519_generic(mypublic, secret, basepoint); +} + +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_generic(pub, secret, curve25519_base_point); +} +#endif + +bool __must_check +curve25519(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_arch(mypublic, secret, basepoint); + return crypto_memneq(mypublic, curve25519_null_point, + CURVE25519_KEY_SIZE); +} +EXPORT_SYMBOL(curve25519); + +bool __must_check +curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (unlikely(!crypto_memneq(secret, curve25519_null_point, + CURVE25519_KEY_SIZE))) + return false; + curve25519_base_arch(pub, secret); + return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); } +EXPORT_SYMBOL(curve25519_generate_public); -static void __exit curve25519_exit(void) +#ifdef curve25519_mod_init_arch +static int __init curve25519_mod_init(void) { + curve25519_mod_init_arch(); + return 0; } +subsys_initcall(curve25519_mod_init); -module_init(curve25519_init); -module_exit(curve25519_exit); +static void __exit curve25519_mod_exit(void) +{ +} +module_exit(curve25519_mod_exit); +#endif MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Curve25519 scalar multiplication"); +MODULE_DESCRIPTION("Curve25519 algorithm"); MODULE_AUTHOR("Jason A. 
Donenfeld "); diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c deleted file mode 100644 index 26862ad90a9640..00000000000000 --- a/lib/crypto/libchacha.c +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The ChaCha stream cipher (RFC7539) - * - * Copyright (C) 2015 Martin Willi - */ - -#include // for crypto_xor_cpy -#include -#include -#include -#include - -void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - /* aligned to potentially speed up crypto_xor() */ - u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); - - while (bytes >= CHACHA_BLOCK_SIZE) { - chacha_block_generic(state, stream, nrounds); - crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE); - bytes -= CHACHA_BLOCK_SIZE; - dst += CHACHA_BLOCK_SIZE; - src += CHACHA_BLOCK_SIZE; - } - if (bytes) { - chacha_block_generic(state, stream, nrounds); - crypto_xor_cpy(dst, src, stream, bytes); - } -} -EXPORT_SYMBOL(chacha_crypt_generic); - -MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c new file mode 100644 index 00000000000000..c0610ea1370e62 --- /dev/null +++ b/lib/crypto/md5.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * MD5 and HMAC-MD5 library functions + * + * md5_block_generic() is derived from cryptoapi implementation, originally + * based on the public domain implementation written by Colin Plumb in 1993. + * + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + * Copyright 2025 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct md5_block_state md5_iv = { + .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 }, +}; + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +static void md5_block_generic(struct md5_block_state *state, + const u8 data[MD5_BLOCK_SIZE]) +{ + u32 in[MD5_BLOCK_WORDS]; + u32 a, b, c, d; + + memcpy(in, data, MD5_BLOCK_SIZE); + le32_to_cpu_array(in, ARRAY_SIZE(in)); + + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, 
a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; +} + +static void __maybe_unused md5_blocks_generic(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + md5_block_generic(state, data); + data += MD5_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH +#include "md5.h" /* $(SRCARCH)/md5.h */ +#else +#define md5_blocks md5_blocks_generic +#endif + +void md5_init(struct md5_ctx *ctx) +{ + ctx->state = md5_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(md5_init); + +void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= MD5_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = MD5_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + md5_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / MD5_BLOCK_SIZE; + len %= MD5_BLOCK_SIZE; + + if (nblocks) { + md5_blocks(&ctx->state, data, nblocks); + data += nblocks * MD5_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(md5_update); + +static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % MD5_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > MD5_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 
partial); + md5_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial); + *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount); + md5_blocks(&ctx->state, ctx->buf, 1); + + cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h)); + memcpy(out, ctx->state.h, MD5_DIGEST_SIZE); +} + +void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + __md5_final(ctx, out); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(md5_final); + +void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]) +{ + struct md5_ctx ctx; + + md5_init(&ctx); + md5_update(&ctx, data, len); + md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(md5); + +static void __hmac_md5_preparekey(struct md5_block_state *istate, + struct md5_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + union { + u8 b[MD5_BLOCK_SIZE]; + unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > MD5_BLOCK_SIZE)) + md5(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = md5_iv; + md5_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = md5_iv; + md5_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_md5_preparekey(struct hmac_md5_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_md5_preparekey); + +void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key) +{ + ctx->hash_ctx.state = key->istate; + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_md5_init); + +void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey); + +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */ + __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf); + memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0, + MD5_BLOCK_SIZE - MD5_DIGEST_SIZE); + ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80; + *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] = + cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. 
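+	 * At this point ctx->hash_ctx.buf holds the padded inner digest
+	 * H((K ^ ipad) || m), and ctx->ostate was pre-seeded with the
+	 * K ^ opad block, so a single extra compression yields
+	 * HMAC = H((K ^ opad) || H((K ^ ipad) || m)).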
*/ + md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1); + cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h)); + memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_md5_final); + +void hmac_md5(const struct hmac_md5_key *key, + const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init(&ctx, key); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5); + +void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[MD5_DIGEST_SIZE]) +{ + struct hmac_md5_ctx ctx; + + hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_md5_update(&ctx, data, data_len); + hmac_md5_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey); + +#ifdef md5_mod_init_arch +static int __init md5_mod_init(void) +{ + md5_mod_init_arch(); + return 0; +} +subsys_initcall(md5_mod_init); + +static void __exit md5_mod_exit(void) +{ +} +module_exit(md5_mod_exit); +#endif + +MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig deleted file mode 100644 index 0670a170c1be04..00000000000000 --- a/lib/crypto/mips/Kconfig +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile deleted file mode 100644 index 804488c7adedcc..00000000000000 --- a/lib/crypto/mips/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c deleted file mode 100644 index 88c097594eb0f0..00000000000000 --- a/lib/crypto/mips/chacha-glue.c +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (MIPS optimized) - * - * Copyright (C) 2019 Linaro, Ltd. 
- */ - -#include -#include -#include - -asmlinkage void chacha_crypt_arch(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - -asmlinkage void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h new file mode 100644 index 00000000000000..0c18c0dc2a4066 --- /dev/null +++ b/lib/crypto/mips/chacha.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. + */ + +#include + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h new file mode 100644 index 00000000000000..e08e28aeffa469 --- /dev/null +++ b/lib/crypto/mips/md5.h @@ -0,0 +1,65 @@ +/* + * Cryptographic API. + * + * MD5 Message Digest Algorithm (RFC1321). + * + * Adapted for OCTEON by Aaro Koskinen . + * + * Based on crypto/md5.c, which is: + * + * Derived from cryptoapi implementation, originally based on the + * public domain implementation written by Colin Plumb in 1993. + * + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
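+ * The hash state is converted to little-endian layout around the COP2
+ * operations, and cores without the crypto coprocessor fall back to
+ * md5_blocks_generic().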
+ */ + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + u64 *state64 = (u64 *)state; + unsigned long flags; + + if (!octeon_has_crypto()) + return md5_blocks_generic(state, data, nblocks); + + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_dword(state64[0], 0); + write_octeon_64bit_hash_dword(state64[1], 1); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_md5_start(block[7]); + + data += MD5_BLOCK_SIZE; + } while (--nblocks); + + state64[0] = read_octeon_64bit_hash_dword(0); + state64[1] = read_octeon_64bit_hash_dword(1); + octeon_crypto_disable(&cop2_state, flags); + + le32_to_cpu_array(state->h, ARRAY_SIZE(state->h)); +} diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c deleted file mode 100644 index 764a38a652002a..00000000000000 --- a/lib/crypto/mips/poly1305-glue.c +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl index 399f10c3e3850a..71347f34f4f9fd 100644 --- a/lib/crypto/mips/poly1305-mips.pl +++ b/lib/crypto/mips/poly1305-mips.pl @@ -93,9 +93,7 @@ #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) @@ -565,9 +563,7 @@ #endif #ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch +# define poly1305_init poly1305_block_init #endif #if defined(__MIPSEB__) && !defined(MIPSEB) diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h new file mode 100644 index 00000000000000..85de450f1a93d5 --- /dev/null +++ b/lib/crypto/mips/poly1305.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. 
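+ * These are the Cryptogams assembly entry points; poly1305-mips.pl renames
+ * poly1305_init to poly1305_block_init so that the common poly1305.c can
+ * call all three through its unprefixed names.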
+ */ + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c deleted file mode 100644 index 71a16c5c538b40..00000000000000 --- a/lib/crypto/poly1305-generic.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Poly1305 authenticator algorithm, RFC7539 - * - * Copyright (C) 2015 Martin Willi - * - * Based on public domain code by Andrew Moon and Daniel J. Bernstein. - */ - -#include -#include -#include -#include - -void poly1305_block_init_generic(struct poly1305_block_state *desc, - const u8 raw_key[POLY1305_BLOCK_SIZE]) -{ - poly1305_core_init(&desc->h); - poly1305_core_setkey(&desc->core_r, raw_key); -} -EXPORT_SYMBOL_GPL(poly1305_block_init_generic); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)"); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index a6dc182b6c22d7..f313ccc4b4dd22 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -7,7 +7,6 @@ * Based on public domain code by Andrew Moon and Daniel J. Bernstein. */ -#include #include #include #include @@ -15,6 +14,14 @@ #include #include +#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH +#include "poly1305.h" /* $(SRCARCH)/poly1305.h */ +#else +#define poly1305_block_init poly1305_block_init_generic +#define poly1305_blocks poly1305_blocks_generic +#define poly1305_emit poly1305_emit_generic +#endif + void poly1305_init(struct poly1305_desc_ctx *desc, const u8 key[POLY1305_KEY_SIZE]) { @@ -23,28 +30,40 @@ void poly1305_init(struct poly1305_desc_ctx *desc, desc->s[2] = get_unaligned_le32(key + 24); desc->s[3] = get_unaligned_le32(key + 28); desc->buflen = 0; - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_block_init_arch(&desc->state, key); - else - poly1305_block_init_generic(&desc->state, key); + poly1305_block_init(&desc->state, key); } EXPORT_SYMBOL(poly1305_init); -static inline void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, unsigned int len) -{ - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(state, src, len, 1); - else - poly1305_blocks_generic(state, src, len, 1); -} - void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes) { - desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state, - src, nbytes, POLY1305_BLOCK_SIZE, - desc->buf, desc->buflen); + if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) { + unsigned int bulk_len; + + if (desc->buflen) { + unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen; + + memcpy(&desc->buf[desc->buflen], src, l); + src += l; + nbytes -= l; + + poly1305_blocks(&desc->state, desc->buf, + POLY1305_BLOCK_SIZE, 1); + desc->buflen = 0; + } + + bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE); + nbytes %= POLY1305_BLOCK_SIZE; + + if (bulk_len) { + poly1305_blocks(&desc->state, src, bulk_len, 1); + src += bulk_len; + } + } + if (nbytes) { + memcpy(&desc->buf[desc->buflen], src, nbytes); + desc->buflen += nbytes; + } } EXPORT_SYMBOL(poly1305_update); @@ -54,22 +73,28 @@ void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst) desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, 
POLY1305_BLOCK_SIZE - desc->buflen); - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_blocks_arch(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); - else - poly1305_blocks_generic(&desc->state, desc->buf, - POLY1305_BLOCK_SIZE, 0); + poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE, + 0); } - if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) - poly1305_emit_arch(&desc->state.h, dst, desc->s); - else - poly1305_emit_generic(&desc->state.h, dst, desc->s); + poly1305_emit(&desc->state.h, dst, desc->s); *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final); +#ifdef poly1305_mod_init_arch +static int __init poly1305_mod_init(void) +{ + poly1305_mod_init_arch(); + return 0; +} +subsys_initcall(poly1305_mod_init); + +static void __exit poly1305_mod_exit(void) +{ +} +module_exit(poly1305_mod_exit); +#endif + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539"); diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig deleted file mode 100644 index 2eaeb7665a6a0e..00000000000000 --- a/lib/crypto/powerpc/Kconfig +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_P10 - tristate - depends on PPC64 && CPU_LITTLE_ENDIAN && VSX - depends on BROKEN # Needs to be fixed to work in softirq context - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - select CRYPTO_LIB_POLY1305_GENERIC diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile deleted file mode 100644 index 5709ae14258a06..00000000000000 --- a/lib/crypto/powerpc/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o -chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o - -obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o -poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha.h similarity index 62% rename from lib/crypto/powerpc/chacha-p10-glue.c rename to lib/crypto/powerpc/chacha.h index fcd23c6f1590bb..1df6e1ce31c460 100644 --- a/lib/crypto/powerpc/chacha-p10-glue.c +++ b/lib/crypto/powerpc/chacha.h @@ -1,14 +1,12 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ChaCha stream cipher (P10 accelerated) * * Copyright 2023- IBM Corp. All rights reserved. 
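 * The P10 path is only used for inputs larger than one block when VSX is
 * usable in the current context; everything else falls back to
 * chacha_crypt_generic().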
*/ -#include #include #include -#include #include #include #include @@ -48,15 +46,10 @@ static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, chacha_crypt_generic(state, dst, src, bytes, nrounds); } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) @@ -74,27 +67,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); -static int __init chacha_p10_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) static_branch_enable(&have_p10); - return 0; } -subsys_initcall(chacha_p10_init); - -static void __exit chacha_p10_exit(void) -{ -} -module_exit(chacha_p10_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S similarity index 100% rename from arch/powerpc/crypto/curve25519-ppc64le_asm.S rename to lib/crypto/powerpc/curve25519-ppc64le_asm.S diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/lib/crypto/powerpc/curve25519.h similarity index 56% rename from arch/powerpc/crypto/curve25519-ppc64le-core.c rename to lib/crypto/powerpc/curve25519.h index f7810be0b292b7..dee6234c48e92a 100644 --- a/arch/powerpc/crypto/curve25519-ppc64le-core.c +++ b/lib/crypto/powerpc/curve25519.h @@ -7,14 +7,9 @@ * - Algorithm 1 Scalar multiplication of a variable point */ -#include -#include - #include #include #include -#include -#include #include #include @@ -177,124 +172,15 @@ static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], fe51_tobytes(out, x2); } -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) { curve25519_fe51(mypublic, secret, basepoint); } -EXPORT_SYMBOL(curve25519_arch); -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) { curve25519_fe51(pub, secret, curve25519_base_point); } -EXPORT_SYMBOL(curve25519_base_arch); - -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_generate_public_key(struct kpp_request *req) -{ - struct 
crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (req->src) - return -EINVAL; - - curve25519_base_arch(buf, secret); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static int curve25519_compute_shared_secret(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (!req->src) - return -EINVAL; - - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - - curve25519_arch(buf, secret, public_key); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", - .base.cra_driver_name = "curve25519-ppc64le", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_generate_public_key, - .compute_shared_secret = curve25519_compute_shared_secret, - .max_size = curve25519_max_size, -}; - - -static int __init curve25519_mod_init(void) -{ - return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? - crypto_register_kpp(&curve25519_alg) : 0; -} - -static void __exit curve25519_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_KPP)) - crypto_unregister_kpp(&curve25519_alg); -} - -module_init(curve25519_mod_init); -module_exit(curve25519_mod_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); -MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Danny Tsen "); diff --git a/arch/powerpc/crypto/md5-asm.S b/lib/crypto/powerpc/md5-asm.S similarity index 100% rename from arch/powerpc/crypto/md5-asm.S rename to lib/crypto/powerpc/md5-asm.S diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h new file mode 100644 index 00000000000000..540b08e34d1d58 --- /dev/null +++ b/lib/crypto/powerpc/md5.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * MD5 optimized for PowerPC + */ + +void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks); + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + ppc_md5_transform(state->h, data, nblocks); +} diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305.h similarity index 63% rename from lib/crypto/powerpc/poly1305-p10-glue.c rename to lib/crypto/powerpc/poly1305.h index 3f1664a724b655..b8ed098a0e95fc 100644 --- a/lib/crypto/powerpc/poly1305-p10-glue.c +++ b/lib/crypto/powerpc/poly1305.h @@ -1,15 +1,13 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Poly1305 authenticator algorithm, RFC7539. * * Copyright 2023- IBM Corp. All rights reserved. 
*/ #include -#include #include #include #include -#include #include asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); @@ -30,8 +28,8 @@ static void vsx_end(void) preempt_enable(); } -void poly1305_block_init_arch(struct poly1305_block_state *dctx, - const u8 raw_key[POLY1305_BLOCK_SIZE]) +static void poly1305_block_init(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) { if (!static_key_enabled(&have_p10)) return poly1305_block_init_generic(dctx, raw_key); @@ -40,10 +38,9 @@ void poly1305_block_init_arch(struct poly1305_block_state *dctx, dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); } -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) { if (!static_key_enabled(&have_p10)) return poly1305_blocks_generic(state, src, len, padbit); @@ -60,37 +57,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, } vsx_end(); } -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) +static void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { if (!static_key_enabled(&have_p10)) return poly1305_emit_generic(state, digest, nonce); poly1305_emit_64(state, nonce, digest); } -EXPORT_SYMBOL_GPL(poly1305_emit_arch); -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&have_p10); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init poly1305_p10_init(void) +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) { if (cpu_has_feature(CPU_FTR_ARCH_31)) static_branch_enable(&have_p10); - return 0; } -subsys_initcall(poly1305_p10_init); - -static void __exit poly1305_p10_exit(void) -{ -} -module_exit(poly1305_p10_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Danny Tsen "); -MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig deleted file mode 100644 index bc7a43f33eb3a3..00000000000000 --- a/lib/crypto/riscv/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_RISCV64 - tristate - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile deleted file mode 100644 index e27b78f317fc8e..00000000000000 --- a/lib/crypto/riscv/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o -chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha.h similarity index 57% rename from lib/crypto/riscv/chacha-riscv64-glue.c rename to lib/crypto/riscv/chacha.h index 8c3f11d79be31c..5c000c6aef4be1 100644 --- a/lib/crypto/riscv/chacha-riscv64-glue.c +++ b/lib/crypto/riscv/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +/* SPDX-License-Identifier: GPL-2.0-only */ /* * ChaCha stream cipher (RISC-V optimized) * @@ -8,25 +8,18 @@ #include #include -#include 
#include #include -#include static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, size_t nblocks, int nrounds); -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { u8 block_buffer[CHACHA_BLOCK_SIZE]; unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; @@ -48,28 +41,11 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, } kernel_vector_end(); } -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&use_zvkb); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); -static int __init riscv64_chacha_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (riscv_isa_extension_available(NULL, ZVKB) && riscv_vector_vlen() >= 128) static_branch_enable(&use_zvkb); - return 0; } -subsys_initcall(riscv64_chacha_mod_init); - -static void __exit riscv64_chacha_mod_exit(void) -{ -} -module_exit(riscv64_chacha_mod_exit); - -MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); -MODULE_AUTHOR("Jerry Shih "); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl new file mode 100644 index 00000000000000..e25e6338a9ac10 --- /dev/null +++ b/lib/crypto/riscv/poly1305-riscv.pl @@ -0,0 +1,847 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL. +# ==================================================================== +# +# Poly1305 hash for RISC-V. +# +# February 2019 +# +# In the essence it's pretty straightforward transliteration of MIPS +# module [without big-endian option]. +# +# 1.8 cycles per byte on U74, >100% faster than compiler-generated +# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69% +# improvement. +# +# June 2024. +# +# Add CHERI support. +# +###################################################################### +# +($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4)); +($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27)); +# +###################################################################### + +$flavour = shift || "64"; + +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +$code.=<<___; +#ifdef __KERNEL__ +# ifdef __riscv_zicfilp +# undef __riscv_zicfilp // calls are expected to be direct +# endif +#endif + +#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast) +# define __riscv_misaligned_fast 1 +#endif +___ + +if ($flavour =~ /64/) {{{ +###################################################################### +# 64-bit code path... 
+# +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2); + +$code.=<<___; +#if __riscv_xlen == 64 +# if __SIZEOF_POINTER__ == 16 +# define PUSH csc +# define POP clc +# else +# define PUSH sd +# define POP ld +# endif +#else +# error "unsupported __riscv_xlen" +#endif + +.option pic +.text + +.globl poly1305_init +.type poly1305_init,\@function +poly1305_init: +#ifdef __riscv_zicfilp + lpad 0 +#endif + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#ifndef __riscv_misaligned_fast + andi $tmp0,$inp,7 # $inp % 8 + andi $inp,$inp,-8 # align $inp + slli $tmp0,$tmp0,3 # byte to bit offset +#endif + ld $in0,0($inp) + ld $in1,8($inp) +#ifndef __riscv_misaligned_fast + beqz $tmp0,.Laligned_key + + ld $tmp2,16($inp) + neg $tmp1,$tmp0 # implicit &63 in sll + srl $in0,$in0,$tmp0 + sll $tmp3,$in1,$tmp1 + srl $in1,$in1,$tmp0 + sll $tmp2,$tmp2,$tmp1 + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 + +.Laligned_key: +#endif + li $tmp0,1 + slli $tmp0,$tmp0,32 # 0x0000000100000000 + addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1 + slli $tmp0,$tmp0,28 # 0x0ffffffc10000000 + addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$in0,$tmp0 + addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$in1,$tmp0 + + sd $in0,24($ctx) + srli $tmp0,$in1,2 + sd $in1,32($ctx) + add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $a0,0 # return 0 + ret +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2); +my ($shr,$shl) = ($t5,$t6); # used on R6 + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +poly1305_blocks: +#ifdef __riscv_zicfilp + lpad 0 +#endif + andi $len,$len,-16 # complete blocks only + beqz $len,.Lno_data + + caddi $sp,$sp,-4*__SIZEOF_POINTER__ + PUSH $s0,3*__SIZEOF_POINTER__($sp) + PUSH $s1,2*__SIZEOF_POINTER__($sp) + PUSH $s2,1*__SIZEOF_POINTER__($sp) + PUSH $s3,0*__SIZEOF_POINTER__($sp) + +#ifndef __riscv_misaligned_fast + andi $shr,$inp,7 + andi $inp,$inp,-8 # align $inp + slli $shr,$shr,3 # byte to bit offset + neg $shl,$shr # implicit &63 in sll +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + add $len,$len,$inp # end of buffer + +.Loop: + ld $in0,0($inp) # load input + ld $in1,8($inp) +#ifndef __riscv_misaligned_fast + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) + srl $in0,$in0,$shr + sll $tmp3,$in1,$shl + srl $in1,$in1,$shr + sll $tmp2,$tmp2,$shl + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 + +.Laligned_inp: +#endif + caddi $inp,$inp,16 + + andi $tmp0,$h2,-4 # modulo-scheduled reduction + srli $tmp1,$h2,2 + andi $h2,$h2,3 + + add $d0,$h0,$in0 # accumulate input + add $tmp1,$tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + add $d0,$d0,$tmp1 # ... 
and residue + sltu $tmp1,$d0,$tmp1 + add $d1,$h1,$in1 + add $tmp0,$tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + add $d1,$d1,$tmp0 + + add $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mulhu $h1,$r0,$d0 # h0*r0 + mul $h0,$r0,$d0 + + add $d2,$d2,$tmp1 + add $d2,$d2,$tmp0 + mulhu $tmp1,$rs1,$d1 # h1*5*r1 + mul $tmp0,$rs1,$d1 + + mulhu $h2,$r1,$d0 # h0*r1 + mul $tmp2,$r1,$d0 + add $h0,$h0,$tmp0 + add $h1,$h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + add $h1,$h1,$tmp0 + add $h1,$h1,$tmp2 + mulhu $tmp1,$r0,$d1 # h1*r0 + mul $tmp0,$r0,$d1 + + sltu $tmp2,$h1,$tmp2 + add $h2,$h2,$tmp2 + mul $tmp2,$rs1,$d2 # h2*5*r1 + + add $h1,$h1,$tmp0 + add $h2,$h2,$tmp1 + mul $tmp3,$r0,$d2 # h2*r0 + sltu $tmp0,$h1,$tmp0 + add $h2,$h2,$tmp0 + + add $h1,$h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + add $h2,$h2,$tmp2 + add $h2,$h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue + POP $s1,2*__SIZEOF_POINTER__($sp) + POP $s2,1*__SIZEOF_POINTER__($sp) + POP $s3,0*__SIZEOF_POINTER__($sp) + caddi $sp,$sp,4*__SIZEOF_POINTER__ + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +poly1305_emit: +#ifdef __riscv_zicfilp + lpad 0 +#endif + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + andi $in0,$tmp2,-4 # final reduction + srl $in1,$tmp2,2 + andi $tmp2,$tmp2,3 + add $in0,$in0,$in1 + + add $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + addi $in0,$tmp0,5 # compare to modulus + add $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + add $in1,$tmp1,$tmp3 + add $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + add $tmp2,$tmp2,$tmp3 + + srli $tmp2,$tmp2,2 # see if it carried/borrowed + neg $tmp2,$tmp2 + + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + and $in0,$in0,$tmp2 + and $in1,$in1,$tmp2 + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + slli $tmp1,$tmp1,32 + slli $tmp3,$tmp3,32 + or $tmp0,$tmp0,$tmp1 + or $tmp2,$tmp2,$tmp3 + + add $in0,$in0,$tmp0 # accumulate nonce + add $in1,$in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + add $in1,$in1,$tmp0 + +#ifdef __riscv_misaligned_fast + sd $in0,0($mac) # write mac value + sd $in1,8($mac) +#else + srli $tmp0,$in0,8 # write mac value + srli $tmp1,$in0,16 + srli $tmp2,$in0,24 + sb $in0,0($mac) + srli $tmp3,$in0,32 + sb $tmp0,1($mac) + srli $tmp0,$in0,40 + sb $tmp1,2($mac) + srli $tmp1,$in0,48 + sb $tmp2,3($mac) + srli $tmp2,$in0,56 + sb $tmp3,4($mac) + srli $tmp3,$in1,8 + sb $tmp0,5($mac) + srli $tmp0,$in1,16 + sb $tmp1,6($mac) + srli $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + srli $tmp2,$in1,32 + sb $tmp3,9($mac) + srli $tmp3,$in1,40 + sb $tmp0,10($mac) + srli $tmp0,$in1,48 + sb $tmp1,11($mac) + srli $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) +#endif + + ret +.size poly1305_emit,.-poly1305_emit +.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3); + +$code.=<<___; +#if __riscv_xlen == 32 +# if __SIZEOF_POINTER__ == 8 +# define PUSH csc +# define POP clc +# else +# define PUSH sw +# define POP lw +# endif +# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b +# define srliw srli 
+# define srlw srl +# define sllw sll +# define addw add +# define addiw addi +# define mulw mul +#elif __riscv_xlen == 64 +# if __SIZEOF_POINTER__ == 16 +# define PUSH csc +# define POP clc +# else +# define PUSH sd +# define POP ld +# endif +# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32 +#else +# error "unsupported __riscv_xlen" +#endif + +.option pic +.text + +.globl poly1305_init +.type poly1305_init,\@function +poly1305_init: +#ifdef __riscv_zicfilp + lpad 0 +#endif + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#ifndef __riscv_misaligned_fast + andi $tmp0,$inp,3 # $inp % 4 + sub $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset +#endif + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) +#ifndef __riscv_misaligned_fast + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + sub $tmp1,$zero,$tmp0 + srlw $in0,$in0,$tmp0 + sllw $tmp3,$in1,$tmp1 + srlw $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllw $tmp3,$in2,$tmp1 + srlw $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllw $tmp3,$in3,$tmp1 + srlw $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllw $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +.Laligned_key: +#endif + + lui $tmp0,0x10000 + addi $tmp0,$tmp0,-1 # 0x0fffffff + and $in0,$in0,$tmp0 + addi $tmp0,$tmp0,-3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srlw $tmp1,$in1,2 + srlw $tmp2,$in2,2 + srlw $tmp3,$in3,2 + addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addw $in2,$in2,$tmp2 + addw $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $a0,0 + ret +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $ra; # used on R6 + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +poly1305_blocks: +#ifdef __riscv_zicfilp + lpad 0 +#endif + andi $len,$len,-16 # complete blocks only + beqz $len,.Labort + +#ifdef __riscv_zcmp + cm.push {ra,s0-s8}, -48 +#else + caddi $sp,$sp,-__SIZEOF_POINTER__*12 + PUSH $ra, __SIZEOF_POINTER__*11($sp) + PUSH $s0, __SIZEOF_POINTER__*10($sp) + PUSH $s1, __SIZEOF_POINTER__*9($sp) + PUSH $s2, __SIZEOF_POINTER__*8($sp) + PUSH $s3, __SIZEOF_POINTER__*7($sp) + PUSH $s4, __SIZEOF_POINTER__*6($sp) + PUSH $s5, __SIZEOF_POINTER__*5($sp) + PUSH $s6, __SIZEOF_POINTER__*4($sp) + PUSH $s7, __SIZEOF_POINTER__*3($sp) + PUSH $s8, __SIZEOF_POINTER__*2($sp) +#endif + +#ifndef __riscv_misaligned_fast + andi $shr,$inp,3 + andi $inp,$inp,-4 # align $inp + slli $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + add $len,$len,$inp # end of buffer + +.Loop: + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) +#ifndef __riscv_misaligned_fast + beqz $shr,.Laligned_inp + + lw $t4,16($inp) + sub $t5,$zero,$shr + srlw $d0,$d0,$shr + sllw $t3,$d1,$t5 + srlw $d1,$d1,$shr + or $d0,$d0,$t3 + sllw $t3,$d2,$t5 + srlw $d2,$d2,$shr + or $d1,$d1,$t3 + sllw $t3,$d3,$t5 + srlw $d3,$d3,$shr + or $d2,$d2,$t3 + sllw $t4,$t4,$t5 + or $d3,$d3,$t4 + +.Laligned_inp: +#endif + srliw 
$t3,$h4,2 # modulo-scheduled reduction + andi $t4,$h4,-4 + andi $h4,$h4,3 + + addw $d0,$d0,$h0 # accumulate input + addw $t4,$t4,$t3 + sltu $h0,$d0,$h0 + addw $d0,$d0,$t4 # ... and residue + sltu $t4,$d0,$t4 + + addw $d1,$d1,$h1 + addw $h0,$h0,$t4 # carry + sltu $h1,$d1,$h1 + addw $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addw $d2,$d2,$h2 + addw $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addw $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addw $d3,$d3,$h3 + addw $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addw $d3,$d3,$h2 + + MULX ($h1,$h0,$r0,$d0) # d0*r0 + + sltu $h2,$d3,$h2 + addw $h3,$h3,$h2 # carry + + MULX ($t4,$t3,$rs3,$d1) # d1*s3 + + addw $h4,$h4,$padbit + caddi $inp,$inp,16 + addw $h4,$h4,$h3 + + MULX ($t6,$a3,$rs2,$d2) # d2*s2 + addw $h0,$h0,$t3 + addw $h1,$h1,$t4 + sltu $t3,$h0,$t3 + addw $h1,$h1,$t3 + + MULX ($t4,$t3,$rs1,$d3) # d3*s1 + addw $h0,$h0,$a3 + addw $h1,$h1,$t6 + sltu $a3,$h0,$a3 + addw $h1,$h1,$a3 + + + MULX ($h2,$a3,$r1,$d0) # d0*r1 + addw $h0,$h0,$t3 + addw $h1,$h1,$t4 + sltu $t3,$h0,$t3 + addw $h1,$h1,$t3 + + MULX ($t4,$t3,$r0,$d1) # d1*r0 + addw $h1,$h1,$a3 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + MULX ($t6,$a3,$rs3,$d2) # d2*s3 + addw $h1,$h1,$t3 + addw $h2,$h2,$t4 + sltu $t3,$h1,$t3 + addw $h2,$h2,$t3 + + MULX ($t4,$t3,$rs2,$d3) # d3*s2 + addw $h1,$h1,$a3 + addw $h2,$h2,$t6 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + mulw $a3,$rs1,$h4 # h4*s1 + addw $h1,$h1,$t3 + addw $h2,$h2,$t4 + sltu $t3,$h1,$t3 + addw $h2,$h2,$t3 + + + MULX ($h3,$t3,$r2,$d0) # d0*r2 + addw $h1,$h1,$a3 + sltu $a3,$h1,$a3 + addw $h2,$h2,$a3 + + MULX ($t6,$a3,$r1,$d1) # d1*r1 + addw $h2,$h2,$t3 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + MULX ($t4,$t3,$r0,$d2) # d2*r0 + addw $h2,$h2,$a3 + addw $h3,$h3,$t6 + sltu $a3,$h2,$a3 + addw $h3,$h3,$a3 + + MULX ($t6,$a3,$rs3,$d3) # d3*s3 + addw $h2,$h2,$t3 + addw $h3,$h3,$t4 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + mulw $t3,$rs2,$h4 # h4*s2 + addw $h2,$h2,$a3 + addw $h3,$h3,$t6 + sltu $a3,$h2,$a3 + addw $h3,$h3,$a3 + + + MULX ($t6,$a3,$r3,$d0) # d0*r3 + addw $h2,$h2,$t3 + sltu $t3,$h2,$t3 + addw $h3,$h3,$t3 + + MULX ($t4,$t3,$r2,$d1) # d1*r2 + addw $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addw $t6,$t6,$a3 + + MULX ($a3,$d3,$r0,$d3) # d3*r0 + addw $h3,$h3,$t3 + addw $t6,$t6,$t4 + sltu $t3,$h3,$t3 + addw $t6,$t6,$t3 + + MULX ($t4,$t3,$r1,$d2) # d2*r1 + addw $h3,$h3,$d3 + addw $t6,$t6,$a3 + sltu $d3,$h3,$d3 + addw $t6,$t6,$d3 + + mulw $a3,$rs3,$h4 # h4*s3 + addw $h3,$h3,$t3 + addw $t6,$t6,$t4 + sltu $t3,$h3,$t3 + addw $t6,$t6,$t3 + + + mulw $h4,$r0,$h4 # h4*r0 + addw $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addw $t6,$t6,$a3 + addw $h4,$t6,$h4 + + li $padbit,1 # if we loop, padbit is 1 + + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + +#ifdef __riscv_zcmp + cm.popret {ra,s0-s8}, 48 +#else + POP $ra, __SIZEOF_POINTER__*11($sp) + POP $s0, __SIZEOF_POINTER__*10($sp) + POP $s1, __SIZEOF_POINTER__*9($sp) + POP $s2, __SIZEOF_POINTER__*8($sp) + POP $s3, __SIZEOF_POINTER__*7($sp) + POP $s4, __SIZEOF_POINTER__*6($sp) + POP $s5, __SIZEOF_POINTER__*5($sp) + POP $s6, __SIZEOF_POINTER__*4($sp) + POP $s7, __SIZEOF_POINTER__*3($sp) + POP $s8, __SIZEOF_POINTER__*2($sp) + caddi $sp,$sp,__SIZEOF_POINTER__*12 +#endif +.Labort: + ret +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +poly1305_emit: +#ifdef __riscv_zicfilp + lpad 0 +#endif + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw 
$tmp2,8($ctx) + lw $tmp3,12($ctx) + + srliw $ctx,$tmp4,2 # final reduction + andi $in0,$tmp4,-4 + andi $tmp4,$tmp4,3 + addw $ctx,$ctx,$in0 + + addw $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiw $in0,$tmp0,5 # compare to modulus + addw $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addw $in1,$in1,$tmp1 + addw $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addw $in2,$in2,$tmp2 + addw $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addw $in3,$in3,$tmp3 + addw $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addw $ctx,$ctx,$tmp4 + + srl $ctx,$ctx,2 # see if it carried/borrowed + sub $ctx,$zero,$ctx + + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + xor $in2,$in2,$tmp2 + xor $in3,$in3,$tmp3 + and $in0,$in0,$ctx + and $in1,$in1,$ctx + and $in2,$in2,$ctx + and $in3,$in3,$ctx + xor $in0,$in0,$tmp0 + xor $in1,$in1,$tmp1 + xor $in2,$in2,$tmp2 + xor $in3,$in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addw $in0,$in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addw $in1,$in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addw $in1,$in1,$ctx + sltu $ctx,$in1,$ctx + addw $ctx,$ctx,$tmp1 + + addw $in2,$in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addw $in2,$in2,$ctx + sltu $ctx,$in2,$ctx + addw $ctx,$ctx,$tmp2 + + addw $in3,$in3,$tmp3 + addw $in3,$in3,$ctx + +#ifdef __riscv_misaligned_fast + sw $in0,0($mac) # write mac value + sw $in1,4($mac) + sw $in2,8($mac) + sw $in3,12($mac) +#else + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) +#endif + + ret +.size poly1305_emit,.-poly1305_emit +.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm" +___ +} +}}} + +foreach (split("\n", $code)) { + if ($flavour =~ /^cheri/) { + s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/; + s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or + s/\b(ret|jal)\b/c$1/; + s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or + m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g; + } else { + s/\bcaddi?\b/add/ or + s/\bcmove\b/mv/; + } + print $_, "\n"; +} + +close STDOUT; diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h new file mode 100644 index 00000000000000..88f3df44e355ea --- /dev/null +++ b/lib/crypto/riscv/poly1305.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for riscv + * + * Copyright (C) 2025 Institute of Software, CAS. 
+ */ + +asmlinkage void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h index c0f79c18f11998..1def18b0a4fb55 100644 --- a/lib/crypto/riscv/sha256.h +++ b/lib/crypto/riscv/sha256.h @@ -9,19 +9,19 @@ * Author: Jerry Shih */ +#include #include -#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, const u8 *data, size_t nblocks); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); - static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { + if (static_branch_likely(&have_extensions) && likely(may_use_simd())) { kernel_vector_begin(); sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); kernel_vector_end(); @@ -31,7 +31,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { /* Both zvknha and zvknhb provide the SHA-256 instructions. */ if ((riscv_isa_extension_available(NULL, ZVKNHA) || diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h index 9d0abede322f75..145bdab1214e37 100644 --- a/lib/crypto/riscv/sha512.h +++ b/lib/crypto/riscv/sha512.h @@ -11,7 +11,6 @@ #include #include -#include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); @@ -21,8 +20,7 @@ asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, static void sha512_blocks(struct sha512_block_state *state, const u8 *data, size_t nblocks) { - if (static_branch_likely(&have_extensions) && - likely(crypto_simd_usable())) { + if (static_branch_likely(&have_extensions) && likely(may_use_simd())) { kernel_vector_begin(); sha512_transform_zvknhb_zvkb(state, data, nblocks); kernel_vector_end(); @@ -32,7 +30,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (riscv_isa_extension_available(NULL, ZVKNHB) && riscv_isa_extension_available(NULL, ZVKB) && diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig deleted file mode 100644 index 069b355fe51aa9..00000000000000 --- a/lib/crypto/s390/Kconfig +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_S390 - tristate - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile deleted file mode 100644 index 06c2cf77178ef2..00000000000000 --- a/lib/crypto/s390/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o -chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha.h similarity index 51% rename from lib/crypto/s390/chacha-glue.c rename to lib/crypto/s390/chacha.h index c57dc851214fa3..fd9c4a42236568 100644 --- a/lib/crypto/s390/chacha-glue.c +++ b/lib/crypto/s390/chacha.h 
@@ -1,32 +1,21 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * ChaCha stream cipher (s390 optimized) * * Copyright IBM Corp. 2021 */ -#define KMSG_COMPONENT "chacha_s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include #include #include #include -#include #include #include #include "chacha-s390.h" -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - /* TODO: implement hchacha_block_arch() in assembly */ - hchacha_block_generic(state, out, nrounds); -} -EXPORT_SYMBOL(hchacha_block_arch); +#define hchacha_block_arch hchacha_block_generic /* not implemented yet */ -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { /* s390 chacha20 implementation has 20 rounds hard-coded, * it cannot handle a block of data or less, but otherwise @@ -45,13 +34,3 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, CHACHA_BLOCK_SIZE; } } -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return cpu_has_vx(); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h index 08bd138e881cce..73d94476a157a8 100644 --- a/lib/crypto/s390/sha1.h +++ b/lib/crypto/s390/sha1.h @@ -20,7 +20,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h index 70a81cbc06b2c2..acd48350878975 100644 --- a/lib/crypto/s390/sha256.h +++ b/lib/crypto/s390/sha256.h @@ -20,7 +20,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h index 24744651550cbd..46699d43df7eb6 100644 --- a/lib/crypto/s390/sha512.h +++ b/lib/crypto/s390/sha512.h @@ -20,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_have_feature(S390_CPU_FEATURE_MSA) && cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 8fa15165d23e89..881b935418cead 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = { }, }; -static const struct sha256_block_state sha256_iv = { - .h = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +static const struct sha256_ctx initial_sha256_ctx = { + .ctx = { + .state = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, + }, + .bytecount = 0, }, }; +#define sha256_iv (initial_sha256_ctx.ctx.state) + static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 
0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +/* + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) + * doesn't need either HMAC support or interleaved hashing support + */ #ifndef __DISABLE_EXPORTS + +#ifndef sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + return false; +} +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return false; +} +#endif + +/* Sequential fallback implementation of sha256_finup_2x() */ +static noinline_for_stack void sha256_finup_2x_sequential( + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) +{ + struct __sha256_ctx mut_ctx; + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data1, len); + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data2, len); + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); +} + +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + if (ctx == NULL) + ctx = &initial_sha256_ctx; + + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, + out2))) + return; + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x); + +bool sha256_finup_2x_is_optimized(void) +{ + return sha256_finup_2x_is_optimized_arch(); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); + static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h new file mode 100644 index 00000000000000..3995f3e075eb6b --- /dev/null +++ b/lib/crypto/sparc/md5.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * MD5 accelerated using the sparc64 crypto opcodes + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + * Copyright (c) Cryptoapi developers. 
+ * Copyright (c) 2002 James Morris + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes); + +asmlinkage void md5_sparc64_transform(struct md5_block_state *state, + const u8 *data, size_t nblocks); + +static void md5_blocks(struct md5_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_md5_opcodes)) { + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + md5_sparc64_transform(state, data, nblocks); + le32_to_cpu_array(state->h, ARRAY_SIZE(state->h)); + } else { + md5_blocks_generic(state, data, nblocks); + } +} + +#define md5_mod_init_arch md5_mod_init_arch +static void md5_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_MD5)) + return; + + static_branch_enable(&have_md5_opcodes); + pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n"); +} diff --git a/arch/sparc/crypto/md5_asm.S b/lib/crypto/sparc/md5_asm.S similarity index 100% rename from arch/sparc/crypto/md5_asm.S rename to lib/crypto/sparc/md5_asm.S diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h index 5015f93584b7e3..bdf771fcc1f73a 100644 --- a/lib/crypto/sparc/sha1.h +++ b/lib/crypto/sparc/sha1.h @@ -27,7 +27,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h index 1d10108eb19543..b2f4419ec77810 100644 --- a/lib/crypto/sparc/sha256.h +++ b/lib/crypto/sparc/sha256.h @@ -27,7 +27,7 @@ static void sha256_blocks(struct sha256_block_state *state, } #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h index 55303ab6b15f7d..a8c37a7d4c3937 100644 --- a/lib/crypto/sparc/sha512.h +++ b/lib/crypto/sparc/sha512.h @@ -26,7 +26,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { unsigned long cfr; diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig index de7e8babb6afc5..578af717e13a7c 100644 --- a/lib/crypto/tests/Kconfig +++ b/lib/crypto/tests/Kconfig @@ -1,5 +1,34 @@ # SPDX-License-Identifier: GPL-2.0-or-later +config CRYPTO_LIB_BLAKE2S_KUNIT_TEST + tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + # No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't + # exist; the BLAKE2s code is always built-in for the /dev/random driver. + help + KUnit tests for the BLAKE2s cryptographic hash function. + +config CRYPTO_LIB_CURVE25519_KUNIT_TEST + tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_CURVE25519 + help + KUnit tests for the Curve25519 Diffie-Hellman function. 
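
For reference, a minimal sketch of how a kernel caller drives the Curve25519 library interface that this option selects and that curve25519_kunit.c exercises. The functions are the <crypto/curve25519.h> API as used elsewhere in this diff; curve25519_example() itself is a hypothetical helper, shown only to illustrate the call sequence, with key zeroization elided:

#include <crypto/curve25519.h>

static bool curve25519_example(void)
{
	u8 secret[CURVE25519_KEY_SIZE];
	u8 pub[CURVE25519_KEY_SIZE];
	u8 shared[CURVE25519_KEY_SIZE];
	/* The standard base point, as in the basepoint test case. */
	static const u8 basepoint[CURVE25519_KEY_SIZE] = { 9 };

	/* Generate a random, properly clamped private key. */
	curve25519_generate_secret(secret);

	/* Derive the matching public key. */
	if (!curve25519_generate_public(pub, secret))
		return false;

	/* Compute the shared secret against a peer public key. */
	return curve25519(shared, secret, basepoint);
}

Both curve25519() and curve25519_generate_public() return false when the computation yields a degenerate (all-zero) point, which is the condition the `valid` flag in the test vectors encodes.
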
+ +config CRYPTO_LIB_MD5_KUNIT_TEST + tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_MD5 + help + KUnit tests for the MD5 cryptographic hash function and its + corresponding HMAC. + config CRYPTO_LIB_POLY1305_KUNIT_TEST tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile index 8601dccd6fddab..a71fad19922baa 100644 --- a/lib/crypto/tests/Makefile +++ b/lib/crypto/tests/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-or-later +obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o +obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o +obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h new file mode 100644 index 00000000000000..6f978b79a59b07 --- /dev/null +++ b/lib/crypto/tests/blake2s-testvecs.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */ + +static const struct { + size_t data_len; + u8 digest[BLAKE2S_HASH_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94, + 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c, + 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e, + 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9, + }, + }, + { + .data_len = 1, + .digest = { + 0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64, + 0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9, + 0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9, + 0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58, + }, + }, + { + .data_len = 2, + .digest = { + 0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae, + 0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9, + 0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9, + 0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51, + }, + }, + { + .data_len = 3, + .digest = { + 0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15, + 0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c, + 0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc, + 0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd, + }, + }, + { + .data_len = 16, + .digest = { + 0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32, + 0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15, + 0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a, + 0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5, + }, + }, + { + .data_len = 32, + .digest = { + 0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47, + 0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 0x53, 0x92, + 0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d, + 0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c, + }, + }, + { + .data_len = 48, + .digest = { + 0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8, + 0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56, + 0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea, + 0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05, + }, + }, + { + .data_len = 49, + .digest = { + 0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36, + 0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38, + 0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1, + 0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68, + }, + }, + { + .data_len = 63, + .digest = { + 0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5, + 0xfb, 0x64, 
0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0, + 0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98, + 0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11, + }, + }, + { + .data_len = 64, + .digest = { + 0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96, + 0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad, + 0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24, + 0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5, + }, + }, + { + .data_len = 65, + .digest = { + 0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3, + 0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b, + 0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed, + 0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3, + }, + }, + { + .data_len = 127, + .digest = { + 0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6, + 0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c, + 0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92, + 0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74, + }, + }, + { + .data_len = 128, + .digest = { + 0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67, + 0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1, + 0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75, + 0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad, + }, + }, + { + .data_len = 129, + .digest = { + 0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb, + 0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81, + 0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37, + 0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe, + }, + }, + { + .data_len = 256, + .digest = { + 0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4, + 0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac, + 0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60, + 0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05, + }, + }, + { + .data_len = 511, + .digest = { + 0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64, + 0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04, + 0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab, + 0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a, + }, + }, + { + .data_len = 513, + .digest = { + 0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7, + 0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee, + 0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12, + 0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18, + }, + }, + { + .data_len = 1000, + .digest = { + 0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0, + 0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf, + 0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f, + 0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd, + }, + }, + { + .data_len = 3333, + .digest = { + 0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09, + 0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a, + 0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69, + 0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63, + }, + }, + { + .data_len = 4096, + .digest = { + 0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf, + 0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06, + 0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5, + 0xc2, 0xd5, 0xd0, 0xa6, 0x35, 0x6c, 0xbd, 0x17, + }, + }, + { + .data_len = 4128, + .digest = { + 0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab, + 0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79, + 0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0, + 0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c, + }, + }, + { + .data_len = 4160, + .digest = { + 0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a, + 0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5, + 0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba, + 0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f, + }, + }, + { + .data_len = 4224, + .digest = { + 0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56, + 0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c, + 0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b, + 
0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82, + }, + }, + { + .data_len = 16384, + .digest = { + 0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29, + 0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6, + 0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f, + 0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e, + }, + }, +}; + +static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = { + 0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0, + 0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb, + 0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3, + 0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67, +}; + +static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = { + 0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70, + 0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4, + 0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d, + 0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b, +}; diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c new file mode 100644 index 00000000000000..057c40132246f1 --- /dev/null +++ b/lib/crypto/tests/blake2s_kunit.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include +#include "blake2s-testvecs.h" + +/* + * The following are compatibility functions that present BLAKE2s as an unkeyed + * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that + * hash-test-template.h can be reused to test it. + */ + +static void blake2s_default(const u8 *data, size_t len, + u8 out[BLAKE2S_HASH_SIZE]) +{ + blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0); +} + +static void blake2s_init_default(struct blake2s_state *state) +{ + blake2s_init(state, BLAKE2S_HASH_SIZE); +} + +/* + * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2s + * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE. + */ +#define HASH blake2s_default +#define HASH_CTX blake2s_state +#define HASH_SIZE BLAKE2S_HASH_SIZE +#define HASH_INIT blake2s_init_default +#define HASH_UPDATE blake2s_update +#define HASH_FINAL blake2s_final +#include "hash-test-template.h" + +/* + * BLAKE2s specific test case which tests all possible combinations of key + * length and hash length. + */ +static void test_blake2s_all_key_and_hash_lens(struct kunit *test) +{ + const size_t data_len = 100; + u8 *data = &test_buf[0]; + u8 *key = data + data_len; + u8 *hash = key + BLAKE2S_KEY_SIZE; + struct blake2s_state main_state; + u8 main_hash[BLAKE2S_HASH_SIZE]; + + rand_bytes_seeded_from_len(data, data_len); + blake2s_init(&main_state, BLAKE2S_HASH_SIZE); + for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { + rand_bytes_seeded_from_len(key, key_len); + for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { + blake2s(hash, data, key, out_len, data_len, key_len); + blake2s_update(&main_state, hash, out_len); + } + } + blake2s_final(&main_state, main_hash); + KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated, + BLAKE2S_HASH_SIZE); +} + +/* + * BLAKE2s specific test case which tests using a guarded buffer for all allowed + * key lengths. Also tests both blake2s() and blake2s_init_key(). 
+ */ +static void test_blake2s_with_guarded_key_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { + u8 key[BLAKE2S_KEY_SIZE]; + u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len]; + u8 hash1[BLAKE2S_HASH_SIZE]; + u8 hash2[BLAKE2S_HASH_SIZE]; + struct blake2s_state state; + + rand_bytes(key, key_len); + memcpy(guarded_key, key, key_len); + + blake2s(hash1, test_buf, key, + BLAKE2S_HASH_SIZE, data_len, key_len); + blake2s(hash2, test_buf, guarded_key, + BLAKE2S_HASH_SIZE, data_len, key_len); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); + + blake2s_init_key(&state, BLAKE2S_HASH_SIZE, + guarded_key, key_len); + blake2s_update(&state, test_buf, data_len); + blake2s_final(&state, hash2); + KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); + } +} + +/* + * BLAKE2s specific test case which tests using a guarded output buffer for all + * allowed output lengths. + */ +static void test_blake2s_with_guarded_out_buf(struct kunit *test) +{ + const size_t data_len = 100; + + rand_bytes(test_buf, data_len); + for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { + u8 hash[BLAKE2S_HASH_SIZE]; + u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len]; + + blake2s(hash, test_buf, NULL, out_len, data_len, 0); + blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0); + KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len); + } +} + +static struct kunit_case blake2s_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_blake2s_all_key_and_hash_lens), + KUNIT_CASE(test_blake2s_with_guarded_key_buf), + KUNIT_CASE(test_blake2s_with_guarded_out_buf), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite blake2s_test_suite = { + .name = "blake2s", + .test_cases = blake2s_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(blake2s_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c similarity index 96% rename from lib/crypto/curve25519-selftest.c rename to lib/crypto/tests/curve25519_kunit.c index c85e85381e7884..248d05f66b35c1 100644 --- a/lib/crypto/curve25519-selftest.c +++ b/lib/crypto/tests/curve25519_kunit.c @@ -4,6 +4,8 @@ */ #include +#include +#include struct curve25519_test_vector { u8 private[CURVE25519_KEY_SIZE]; @@ -11,7 +13,7 @@ struct curve25519_test_vector { u8 result[CURVE25519_KEY_SIZE]; bool valid; }; -static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { +static const struct curve25519_test_vector curve25519_test_vectors[] = { { .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, @@ -1280,42 +1282,82 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst } }; -bool __init curve25519_selftest(void) +static void test_curve25519(struct kunit *test) { - bool success = true, ret, ret2; - size_t i = 0, j; - u8 in[CURVE25519_KEY_SIZE]; - u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE], - out3[CURVE25519_KEY_SIZE]; + for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { + const struct curve25519_test_vector *vec = + &curve25519_test_vectors[i]; + u8 out[CURVE25519_KEY_SIZE] = {}; + bool ret; - for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { - memset(out, 0, CURVE25519_KEY_SIZE); - ret = curve25519(out, 
curve25519_test_vectors[i].private, - curve25519_test_vectors[i].public); - if (ret != curve25519_test_vectors[i].valid || - memcmp(out, curve25519_test_vectors[i].result, - CURVE25519_KEY_SIZE)) { - pr_err("curve25519 self-test %zu: FAIL\n", i + 1); - success = false; - } + ret = curve25519(out, vec->private, vec->public); + KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid, + "Wrong return value with test vector %zu", + i); + KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out), + "Wrong output with test vector %zu", i); } +} + +static void test_curve25519_basepoint(struct kunit *test) +{ + for (size_t i = 0; i < 5; ++i) { + u8 in[CURVE25519_KEY_SIZE]; + u8 out[CURVE25519_KEY_SIZE]; + u8 out2[CURVE25519_KEY_SIZE]; + bool ret, ret2; - for (i = 0; i < 5; ++i) { get_random_bytes(in, sizeof(in)); ret = curve25519_generate_public(out, in); ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 }); - if (ret != ret2 || - memcmp(out, out2, CURVE25519_KEY_SIZE) || - memcmp(out, out3, CURVE25519_KEY_SIZE)) { - pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x", - i + 1); - for (j = CURVE25519_KEY_SIZE; j-- > 0;) - printk(KERN_CONT "%02x", in[j]); - printk(KERN_CONT "\n"); - success = false; - } + KUNIT_EXPECT_EQ_MSG(test, ret, ret2, + "in=%*phN", CURVE25519_KEY_SIZE, in); + KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE, + "in=%*phN", CURVE25519_KEY_SIZE, in); } +} + +static void benchmark_curve25519(struct kunit *test) +{ + const u8 *private = curve25519_test_vectors[0].private; + const u8 *public = curve25519_test_vectors[0].public; + const size_t warmup_niter = 5000; + const size_t benchmark_niter = 1024; + u8 out[CURVE25519_KEY_SIZE]; + bool ok = true; + u64 t; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); - return success; + /* Warm-up */ + for (size_t i = 0; i < warmup_niter; i++) + ok &= curve25519(out, private, public); + + /* Benchmark */ + preempt_disable(); + t = ktime_get_ns(); + for (size_t i = 0; i < benchmark_niter; i++) + ok &= curve25519(out, private, public); + t = ktime_get_ns() - t; + preempt_enable(); + KUNIT_EXPECT_TRUE(test, ok); + kunit_info(test, "%llu ops/s", + div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1)); } + +static struct kunit_case curve25519_test_cases[] = { + KUNIT_CASE(test_curve25519), + KUNIT_CASE(test_curve25519_basepoint), + KUNIT_CASE(benchmark_curve25519), + {}, +}; + +static struct kunit_suite curve25519_test_suite = { + .name = "curve25519", + .test_cases = curve25519_test_cases, +}; +kunit_test_suite(curve25519_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h index f437a0a9ac6cd1..61b43e62779fbf 100644 --- a/lib/crypto/tests/hash-test-template.h +++ b/lib/crypto/tests/hash-test-template.h @@ -5,11 +5,9 @@ * * Copyright 2025 Google LLC */ +#include #include -#include -#include #include -#include /* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. 
*/ #define TEST_BUF_LEN 16384 @@ -319,119 +317,6 @@ static void test_hash_ctx_zeroization(struct kunit *test) "Hash context was not zeroized by finalization"); } -#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) - -struct hash_irq_test_state { - bool (*func)(void *test_specific_state); - void *test_specific_state; - bool task_func_reported_failure; - bool hardirq_func_reported_failure; - bool softirq_func_reported_failure; - unsigned long hardirq_func_calls; - unsigned long softirq_func_calls; - struct hrtimer timer; - struct work_struct bh_work; -}; - -static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer) -{ - struct hash_irq_test_state *state = - container_of(timer, typeof(*state), timer); - - WARN_ON_ONCE(!in_hardirq()); - state->hardirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->hardirq_func_reported_failure = true; - - hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL); - queue_work(system_bh_wq, &state->bh_work); - return HRTIMER_RESTART; -} - -static void hash_irq_test_bh_work_func(struct work_struct *work) -{ - struct hash_irq_test_state *state = - container_of(work, typeof(*state), bh_work); - - WARN_ON_ONCE(!in_serving_softirq()); - state->softirq_func_calls++; - - if (!state->func(state->test_specific_state)) - state->softirq_func_reported_failure = true; -} - -/* - * Helper function which repeatedly runs the given @func in task, softirq, and - * hardirq context concurrently, and reports a failure to KUnit if any - * invocation of @func in any context returns false. @func is passed - * @test_specific_state as its argument. At most 3 invocations of @func will - * run concurrently: one in each of task, softirq, and hardirq context. - * - * The main purpose of this interrupt context testing is to validate fallback - * code paths that run in contexts where the normal code path cannot be used, - * typically due to the FPU or vector registers already being in-use in kernel - * mode. These code paths aren't covered when the test code is executed only by - * the KUnit test runner thread in task context. The reason for the concurrency - * is because merely using hardirq context is not sufficient to reach a fallback - * code path on some architectures; the hardirq actually has to occur while the - * FPU or vector unit was already in-use in kernel mode. - * - * Another purpose of this testing is to detect issues with the architecture's - * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, - * especially in softirq context when the softirq may have interrupted a task - * already using kernel-mode FPU or vector (if the arch didn't prevent that). - * Crypto functions are often executed in softirqs, so this is important. - */ -static void run_irq_test(struct kunit *test, bool (*func)(void *), - int max_iterations, void *test_specific_state) -{ - struct hash_irq_test_state state = { - .func = func, - .test_specific_state = test_specific_state, - }; - unsigned long end_jiffies; - - /* - * Set up a hrtimer (the way we access hardirq context) and a work - * struct for the BH workqueue (the way we access softirq context). - */ - hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func, - CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func); - - /* Run for up to max_iterations or 1 second, whichever comes first. 
*/ - end_jiffies = jiffies + HZ; - hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL, - HRTIMER_MODE_REL_HARD); - for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); - i++) { - if (!func(test_specific_state)) - state.task_func_reported_failure = true; - } - - /* Cancel the timer and work. */ - hrtimer_cancel(&state.timer); - flush_work(&state.bh_work); - - /* Sanity check: the timer and BH functions should have been run. */ - KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, - "Timer function was not called"); - KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, - "BH work function was not called"); - - /* Check for incorrect hash values reported from any context. */ - KUNIT_EXPECT_FALSE_MSG( - test, state.task_func_reported_failure, - "Incorrect hash values reported from task context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.hardirq_func_reported_failure, - "Incorrect hash values reported from hardirq context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.softirq_func_reported_failure, - "Incorrect hash values reported from softirq context"); -} - #define IRQ_TEST_DATA_LEN 256 #define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ @@ -469,7 +354,7 @@ static void test_hash_interrupt_context_1(struct kunit *test) HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, state.expected_hashes[i]); - run_irq_test(test, hash_irq_test1_func, 100000, &state); + kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state); } struct hash_irq_test2_hash_ctx { @@ -500,7 +385,7 @@ static bool hash_irq_test2_func(void *state_) if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) { /* * This should never happen, as the number of contexts is equal - * to the maximum concurrency level of run_irq_test(). + * to the maximum concurrency level of kunit_run_irq_test(). 
*/ return false; } @@ -566,7 +451,7 @@ static void test_hash_interrupt_context_2(struct kunit *test) state->update_lens[state->num_steps++] = remaining; state->num_steps += 2; /* for init and final */ - run_irq_test(test, hash_irq_test2_func, 250000, state); + kunit_run_irq_test(test, hash_irq_test2_func, 250000, state); } #define UNKEYED_HASH_KUNIT_CASES \ diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h new file mode 100644 index 00000000000000..be6727feb29660 --- /dev/null +++ b/lib/crypto/tests/md5-testvecs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */ + +static const struct { + size_t data_len; + u8 digest[MD5_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, + 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, + }, + }, + { + .data_len = 1, + .digest = { + 0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9, + 0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b, + }, + }, + { + .data_len = 2, + .digest = { + 0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2, + 0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c, + }, + }, + { + .data_len = 3, + .digest = { + 0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08, + 0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16, + }, + }, + { + .data_len = 16, + .digest = { + 0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99, + 0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e, + }, + }, + { + .data_len = 32, + .digest = { + 0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59, + 0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d, + }, + }, + { + .data_len = 48, + .digest = { + 0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68, + 0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16, + }, + }, + { + .data_len = 49, + .digest = { + 0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e, + 0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88, + }, + }, + { + .data_len = 63, + .digest = { + 0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05, + 0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb, + }, + }, + { + .data_len = 64, + .digest = { + 0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43, + 0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f, + }, + }, + { + .data_len = 65, + .digest = { + 0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81, + 0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0, + }, + }, + { + .data_len = 127, + .digest = { + 0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3, + 0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30, + }, + }, + { + .data_len = 128, + .digest = { + 0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60, + 0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6, + }, + }, + { + .data_len = 129, + .digest = { + 0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b, + 0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a, + }, + }, + { + .data_len = 256, + .digest = { + 0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb, + 0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf, + }, + }, + { + .data_len = 511, + .digest = { + 0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f, + 0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf, + }, + }, + { + .data_len = 513, + .digest = { + 0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6, + 0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd, + }, + }, + { + .data_len = 1000, + .digest = { + 0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc, + 0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e, + }, + }, + { + .data_len = 3333, + .digest = { + 0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2, + 0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 
0x46, + }, + }, + { + .data_len = 4096, + .digest = { + 0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f, + 0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42, + }, + }, + { + .data_len = 4128, + .digest = { + 0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7, + 0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c, + }, + }, + { + .data_len = 4160, + .digest = { + 0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a, + 0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f, + }, + }, + { + .data_len = 4224, + .digest = { + 0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6, + 0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4, + }, + }, + { + .data_len = 16384, + .digest = { + 0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50, + 0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b, + }, + }, +}; + +static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = { + 0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43, + 0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6, +}; + +static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = { + 0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7, + 0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d, +}; diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c new file mode 100644 index 00000000000000..38bd52c25ae3e5 --- /dev/null +++ b/lib/crypto/tests/md5_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include +#include "md5-testvecs.h" + +#define HASH md5 +#define HASH_CTX md5_ctx +#define HASH_SIZE MD5_DIGEST_SIZE +#define HASH_INIT md5_init +#define HASH_UPDATE md5_update +#define HASH_FINAL md5_final +#define HMAC_KEY hmac_md5_key +#define HMAC_CTX hmac_md5_ctx +#define HMAC_PREPAREKEY hmac_md5_preparekey +#define HMAC_INIT hmac_md5_init +#define HMAC_UPDATE hmac_md5_update +#define HMAC_FINAL hmac_md5_final +#define HMAC hmac_md5 +#define HMAC_USINGRAWKEY hmac_md5_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "md5", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index 1cd4caee6010d5..dcedfca06df658 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ b/lib/crypto/tests/sha256_kunit.c @@ -5,6 +5,7 @@ #include #include "sha256-testvecs.h" +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ #define HASH sha256 #define HASH_CTX sha256_ctx #define HASH_SIZE SHA256_DIGEST_SIZE @@ -21,9 +22,192 @@ #define HMAC_USINGRAWKEY hmac_sha256_usingrawkey #include "hash-test-template.h" +static void free_guarded_buf(void *buf) +{ + vfree(buf); +} + +/* + * Allocate a KUnit-managed buffer that has length @len bytes immediately + * followed by an unmapped page, and assert that the allocation succeeds. + */ +static void *alloc_guarded_buf(struct kunit *test, size_t len) +{ + size_t full_len = round_up(len, PAGE_SIZE); + void *buf = vmalloc(full_len); + + KUNIT_ASSERT_NOT_NULL(test, buf); + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, free_guarded_buf, buf)); + return buf + full_len - len; +} + +/* + * Test for sha256_finup_2x(). 
Specifically, choose various data lengths and + * salt lengths, and for each one, verify that sha256_finup_2x() produces the + * same results as sha256_update() and sha256_final(). + * + * Use guarded buffers for all inputs and outputs to reliably detect any + * out-of-bounds reads or writes, even if they occur in assembly code. + */ +static void test_sha256_finup_2x(struct kunit *test) +{ + const size_t max_data_len = 16384; + u8 *data1_buf, *data2_buf, *hash1, *hash2; + u8 expected_hash1[SHA256_DIGEST_SIZE]; + u8 expected_hash2[SHA256_DIGEST_SIZE]; + u8 salt[SHA256_BLOCK_SIZE]; + struct sha256_ctx *ctx; + + data1_buf = alloc_guarded_buf(test, max_data_len); + data2_buf = alloc_guarded_buf(test, max_data_len); + hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + ctx = alloc_guarded_buf(test, sizeof(*ctx)); + + rand_bytes(data1_buf, max_data_len); + rand_bytes(data2_buf, max_data_len); + rand_bytes(salt, sizeof(salt)); + + for (size_t i = 0; i < 500; i++) { + size_t salt_len = rand_length(sizeof(salt)); + size_t data_len = rand_length(max_data_len); + const u8 *data1 = data1_buf + max_data_len - data_len; + const u8 *data2 = data2_buf + max_data_len - data_len; + struct sha256_ctx orig_ctx; + + sha256_init(ctx); + sha256_update(ctx, salt, salt_len); + orig_ctx = *ctx; + + sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, ctx, &orig_ctx, sizeof(*ctx), + "sha256_finup_2x() modified its ctx argument"); + + sha256_update(ctx, data1, data_len); + sha256_final(ctx, expected_hash1); + sha256_update(&orig_ctx, data2, data_len); + sha256_final(&orig_ctx, expected_hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, expected_hash1, SHA256_DIGEST_SIZE, + "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len, + data_len); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash2, expected_hash2, SHA256_DIGEST_SIZE, + "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len, + data_len); + } +} + +/* Test sha256_finup_2x() with ctx == NULL */ +static void test_sha256_finup_2x_defaultctx(struct kunit *test) +{ + const size_t data_len = 128; + struct sha256_ctx ctx; + u8 hash1_a[SHA256_DIGEST_SIZE]; + u8 hash2_a[SHA256_DIGEST_SIZE]; + u8 hash1_b[SHA256_DIGEST_SIZE]; + u8 hash2_b[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, 2 * data_len); + + sha256_init(&ctx); + sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a, + hash2_a); + + sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b, + hash2_b); + + KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE); + KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE); +} + +/* + * Test that sha256_finup_2x() and sha256_update/final() produce consistent + * results with total message lengths that require more than 32 bits. 
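+ * Seeding ctx.ctx.bytecount with a value above 2^32 means the bit count
+ * stored in the padding block (bytecount << 3) no longer fits in 32 bits,
+ * so the full 64-bit length handling is exercised.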
+ */ +static void test_sha256_finup_2x_hugelen(struct kunit *test) +{ + const size_t data_len = 4 * SHA256_BLOCK_SIZE; + struct sha256_ctx ctx = {}; + u8 expected_hash[SHA256_DIGEST_SIZE]; + u8 hash[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, data_len); + for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) { + sha256_init(&ctx); + ctx.ctx.bytecount = 0x123456789abcd00 + align; + + sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash); + + sha256_update(&ctx, test_buf, data_len); + sha256_final(&ctx, expected_hash); + + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, + SHA256_DIGEST_SIZE); + } +} + +/* Benchmark for sha256_finup_2x() */ +static void benchmark_sha256_finup_2x(struct kunit *test) +{ + /* + * Try a few different salt lengths, since sha256_finup_2x() performance + * may vary slightly for the same data_len depending on how many bytes + * were already processed in the initial context. + */ + static const size_t salt_lens_to_test[] = { 0, 32, 64 }; + const size_t data_len = 4096; + const size_t num_iters = 4096; + struct sha256_ctx ctx; + u8 hash1[SHA256_DIGEST_SIZE]; + u8 hash2[SHA256_DIGEST_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + if (!sha256_finup_2x_is_optimized()) + kunit_skip(test, "not relevant"); + + rand_bytes(test_buf, data_len * 2); + + /* Warm-up */ + for (size_t i = 0; i < num_iters; i++) + sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + + for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) { + size_t salt_len = salt_lens_to_test[i]; + u64 t0, t1; + + /* + * Prepare the initial context. The time to process the salt is + * not measured; we're just interested in sha256_finup_2x(). + */ + sha256_init(&ctx); + sha256_update(&ctx, test_buf, salt_len); + + preempt_disable(); + t0 = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + t1 = ktime_get_ns(); + preempt_enable(); + kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s", + data_len, salt_len, + div64_u64((u64)data_len * 2 * num_iters * 1000, + t1 - t0 ?: 1)); + } +} + static struct kunit_case hash_test_cases[] = { HASH_KUNIT_CASES, + KUNIT_CASE(test_sha256_finup_2x), + KUNIT_CASE(test_sha256_finup_2x_defaultctx), + KUNIT_CASE(test_sha256_finup_2x_hugelen), KUNIT_CASE(benchmark_hash), + KUNIT_CASE(benchmark_sha256_finup_2x), {}, }; diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig deleted file mode 100644 index 546fe2afe0b51e..00000000000000 --- a/lib/crypto/x86/Kconfig +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile deleted file mode 100644 index c2ff8c5f1046e2..00000000000000 --- a/lib/crypto/x86/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - 
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index ac1c845445a4d5..ef8e9f427aab34 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -29,19 +29,19 @@ SIGMA: .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160 .align 64 SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 .text SYM_FUNC_START(blake2s_compress_ssse3) @@ -193,9 +193,9 @@ SYM_FUNC_START(blake2s_compress_avx512) leaq SIGMA2(%rip),%rax movb $0xa,%cl .Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 + vpmovzxbd (%rax),%ymm8 + vpmovzxbd 0x8(%rax),%ymm9 + addq $0x10,%rax vpermi2d %ymm7,%ymm6,%ymm8 vpermi2d %ymm7,%ymm6,%ymm9 vmovdqa %ymm8,%ymm6 diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s.h similarity index 83% rename from lib/crypto/x86/blake2s-glue.c rename to lib/crypto/x86/blake2s.h index adc296cd17c933..b6d30d2fa045e3 100644 --- a/lib/crypto/x86/blake2s-glue.c +++ b/lib/crypto/x86/blake2s.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
*/ @@ -7,8 +7,6 @@ #include #include #include -#include -#include #include #include #include @@ -23,8 +21,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) +static void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); @@ -49,9 +47,9 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } -EXPORT_SYMBOL(blake2s_compress); -static int __init blake2s_mod_init(void) +#define blake2s_mod_init_arch blake2s_mod_init_arch +static void blake2s_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); @@ -63,8 +61,4 @@ static int __init blake2s_mod_init(void) cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); - - return 0; } - -subsys_initcall(blake2s_mod_init); diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha.h similarity index 85% rename from lib/crypto/x86/chacha_glue.c rename to lib/crypto/x86/chacha.h index 10b2c945f54127..10cf8f1c569dca 100644 --- a/lib/crypto/x86/chacha_glue.c +++ b/lib/crypto/x86/chacha.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-or-later +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ChaCha and HChaCha functions (x86_64 optimized) * @@ -6,10 +6,8 @@ */ #include -#include #include #include -#include #include asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, @@ -126,8 +124,8 @@ static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, } } -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) +static void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (!static_branch_likely(&chacha_use_simd)) { hchacha_block_generic(state, out, nrounds); @@ -137,10 +135,9 @@ void hchacha_block_arch(const struct chacha_state *state, kernel_fpu_end(); } } -EXPORT_SYMBOL(hchacha_block_arch); -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) +static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int bytes, int nrounds) { if (!static_branch_likely(&chacha_use_simd) || bytes <= CHACHA_BLOCK_SIZE) @@ -158,18 +155,12 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, dst += todo; } while (bytes); } -EXPORT_SYMBOL(chacha_crypt_arch); -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) +#define chacha_mod_init_arch chacha_mod_init_arch +static void chacha_mod_init_arch(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; + return; static_branch_enable(&chacha_use_simd); @@ -182,15 +173,4 @@ static int __init chacha_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ static_branch_enable(&chacha_use_avx512vl); } - return 0; } -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} 
-module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/lib/crypto/x86/curve25519.h similarity index 94% rename from arch/x86/crypto/curve25519-x86_64.c rename to lib/crypto/x86/curve25519.h index d587f05c3c8c36..5c0b8408852dee 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/lib/crypto/x86/curve25519.h @@ -4,15 +4,9 @@ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation */ -#include -#include - -#include #include #include #include -#include -#include #include #include @@ -1592,135 +1586,28 @@ static void curve25519_ever64_base(u8 *out, const u8 *priv) static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); -void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]) +static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64(mypublic, secret, basepoint); else curve25519_generic(mypublic, secret, basepoint); } -EXPORT_SYMBOL(curve25519_arch); -void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]) +static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64_base(pub, secret); else curve25519_generic(pub, secret, curve25519_base_point); } -EXPORT_SYMBOL(curve25519_base_arch); - -static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, - unsigned int len) -{ - u8 *secret = kpp_tfm_ctx(tfm); - - if (!len) - curve25519_generate_secret(secret); - else if (len == CURVE25519_KEY_SIZE && - crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) - memcpy(secret, buf, CURVE25519_KEY_SIZE); - else - return -EINVAL; - return 0; -} - -static int curve25519_generate_public_key(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (req->src) - return -EINVAL; - - curve25519_base_arch(buf, secret); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} - -static int curve25519_compute_shared_secret(struct kpp_request *req) -{ - struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); - const u8 *secret = kpp_tfm_ctx(tfm); - u8 public_key[CURVE25519_KEY_SIZE]; - u8 buf[CURVE25519_KEY_SIZE]; - int copied, nbytes; - - if (!req->src) - return -EINVAL; - - copied = sg_copy_to_buffer(req->src, - sg_nents_for_len(req->src, - CURVE25519_KEY_SIZE), - public_key, CURVE25519_KEY_SIZE); - if (copied != CURVE25519_KEY_SIZE) - return -EINVAL; - - curve25519_arch(buf, secret, public_key); - - /* might want less than we've got */ - nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); - copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, - nbytes), - buf, nbytes); - if (copied != nbytes) - return -EINVAL; - return 0; -} -static unsigned int curve25519_max_size(struct crypto_kpp *tfm) -{ - return CURVE25519_KEY_SIZE; -} - -static struct kpp_alg curve25519_alg = { - .base.cra_name = "curve25519", 
- .base.cra_driver_name = "curve25519-x86", - .base.cra_priority = 200, - .base.cra_module = THIS_MODULE, - .base.cra_ctxsize = CURVE25519_KEY_SIZE, - - .set_secret = curve25519_set_secret, - .generate_public_key = curve25519_generate_public_key, - .compute_shared_secret = curve25519_compute_shared_secret, - .max_size = curve25519_max_size, -}; - - -static int __init curve25519_mod_init(void) +#define curve25519_mod_init_arch curve25519_mod_init_arch +static void curve25519_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) static_branch_enable(&curve25519_use_bmi2_adx); - else - return 0; - return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? - crypto_register_kpp(&curve25519_alg) : 0; -} - -static void __exit curve25519_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && - static_branch_likely(&curve25519_use_bmi2_adx)) - crypto_unregister_kpp(&curve25519_alg); } - -module_init(curve25519_mod_init); -module_exit(curve25519_mod_exit); - -MODULE_ALIAS_CRYPTO("curve25519"); -MODULE_ALIAS_CRYPTO("curve25519-x86"); -MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl index 501827254fed75..409ec6955733ad 100644 --- a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -118,19 +118,6 @@ () } } -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - sub end_function() { my ($name) = @_; if($kernel) { @@ -141,7 +128,7 @@ () } $code.=<<___ if $kernel; -#include +#include ___ if ($avx) { @@ -249,14 +236,14 @@ sub poly1305_iteration { $code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch +.globl poly1305_init_x86_64 +.hidden poly1305_init_x86_64 .globl poly1305_blocks_x86_64 .hidden poly1305_blocks_x86_64 .globl poly1305_emit_x86_64 .hidden poly1305_emit_x86_64 ___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); +&declare_function("poly1305_init_x86_64", 32, 3); $code.=<<___; xor %eax,%eax mov %rax,0($ctx) # initialize hash value @@ -311,7 +298,7 @@ sub poly1305_iteration { .Lno_key: RET ___ -&end_function("poly1305_block_init_arch"); +&end_function("poly1305_init_x86_64"); &declare_function("poly1305_blocks_x86_64", 32, 4); $code.=<<___; @@ -4118,9 +4105,9 @@ sub poly1305_blocks_avxN { .section .pdata .align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_end_poly1305_init_x86_64 + .rva .LSEH_info_poly1305_init_x86_64 .rva .LSEH_begin_poly1305_blocks_x86_64 .rva .LSEH_end_poly1305_blocks_x86_64 @@ -4168,10 +4155,10 @@ sub poly1305_blocks_avxN { $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_block_init_arch: +.LSEH_info_poly1305_init_x86_64: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 .LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305.h similarity index 83% rename from 
lib/crypto/x86/poly1305_glue.c rename to lib/crypto/x86/poly1305.h index 856d48fd422b02..ee92e3740a7876 100644 --- a/lib/crypto/x86/poly1305_glue.c +++ b/lib/crypto/x86/poly1305.h @@ -1,16 +1,13 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. */ #include #include -#include #include #include -#include #include -#include struct poly1305_arch_internal { union { @@ -61,10 +58,8 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, const u8 *inp, const size_t len, const u32 padbit); @@ -88,8 +83,14 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) +static void poly1305_block_init(struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + poly1305_init_x86_64(state, raw_key); +} + +static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) { struct poly1305_arch_internal *ctx = container_of(&state->h.h, struct poly1305_arch_internal, h); @@ -129,25 +130,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, inp += bytes; } while (len); } -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +static void poly1305_emit(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) { if (!static_branch_likely(&poly1305_use_avx)) poly1305_emit_x86_64(ctx, mac, nonce); else poly1305_emit_avx(ctx, mac, nonce); } -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&poly1305_use_avx); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); -static int __init poly1305_simd_mod_init(void) +#define poly1305_mod_init_arch poly1305_mod_init_arch +static void poly1305_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) @@ -161,15 +155,4 @@ static int __init poly1305_simd_mod_init(void) /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); - return 0; } -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. 
Donenfeld "); -MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h index e308379d89bcf5..c48a0131fd12c9 100644 --- a/lib/crypto/x86/sha1.h +++ b/lib/crypto/x86/sha1.h @@ -55,7 +55,7 @@ static void sha1_blocks(struct sha1_block_state *state, } #define sha1_mod_init_arch sha1_mod_init_arch -static inline void sha1_mod_init_arch(void) +static void sha1_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha1_blocks_x86, sha1_blocks_ni); diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 4bd9490ffc6621..de5f707e7ef71b 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform) RET SYM_FUNC_END(sha256_ni_transform) +#undef DIGEST_PTR +#undef DATA_PTR +#undef NUM_BLKS +#undef SHA256CONSTANTS +#undef MSG +#undef STATE0 +#undef STATE1 +#undef MSG0 +#undef MSG1 +#undef MSG2 +#undef MSG3 +#undef TMP +#undef SHUF_MASK +#undef ABEF_SAVE +#undef CDGH_SAVE + +// parameters for sha256_ni_finup2x() +#define CTX %rdi +#define DATA1 %rsi +#define DATA2 %rdx +#define LEN %ecx +#define LEN8 %cl +#define LEN64 %rcx +#define OUT1 %r8 +#define OUT2 %r9 + +// other scalar variables +#define SHA256CONSTANTS %rax +#define COUNT %r10 +#define COUNT32 %r10d +#define FINAL_STEP %r11d + +// rbx is used as a temporary. + +#define MSG %xmm0 // sha256rnds2 implicit operand +#define STATE0_A %xmm1 +#define STATE1_A %xmm2 +#define STATE0_B %xmm3 +#define STATE1_B %xmm4 +#define TMP_A %xmm5 +#define TMP_B %xmm6 +#define MSG0_A %xmm7 +#define MSG1_A %xmm8 +#define MSG2_A %xmm9 +#define MSG3_A %xmm10 +#define MSG0_B %xmm11 +#define MSG1_B %xmm12 +#define MSG2_B %xmm13 +#define MSG3_B %xmm14 +#define SHUF_MASK %xmm15 + +#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state) +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) + +// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b +// contain the current 4 message schedule words for the first and second message +// respectively. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words for each message. m1_a-m3_a contain +// the next 3 groups of 4 message schedule words for the first message, and +// likewise m1_b-m3_b for the second. After consuming the current value of +// m0_a, this macro computes the group after m3_a and writes it to m0_a, and +// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the +// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must +// cycle through the registers accordingly. 
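+//
+// For example, in the first iteration of the main loop below, the i=0 call
+// consumes W[0..3] from m0_a/m0_b and computes W[16..19] into them; the i=4
+// call must then be given (m1, m2, m3, m0) as its (m0, m1, m2, m3), which is
+// exactly the rotation encoded by the .irp block in the loop.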
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b + movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A + movdqa TMP_A, TMP_B + paddd \m0_a, TMP_A + paddd \m0_b, TMP_B +.if \i < 48 + sha256msg1 \m1_a, \m0_a + sha256msg1 \m1_b, \m0_b +.endif + movdqa TMP_A, MSG + sha256rnds2 STATE0_A, STATE1_A + movdqa TMP_B, MSG + sha256rnds2 STATE0_B, STATE1_B + pshufd $0x0E, TMP_A, MSG + sha256rnds2 STATE1_A, STATE0_A + pshufd $0x0E, TMP_B, MSG + sha256rnds2 STATE1_B, STATE0_B +.if \i < 48 + movdqa \m3_a, TMP_A + movdqa \m3_b, TMP_B + palignr $4, \m2_a, TMP_A + palignr $4, \m2_b, TMP_B + paddd TMP_A, \m0_a + paddd TMP_B, \m0_b + sha256msg2 \m3_a, \m0_a + sha256msg2 \m3_b, \m0_b +.endif +.endm + +// +// void sha256_ni_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ni_finup2x) + // Allocate 128 bytes of stack space, 16-byte aligned. + push %rbx + push %rbp + mov %rsp, %rbp + sub $128, %rsp + and $~15, %rsp + + // Load the shuffle mask for swapping the endianness of 32-bit words. + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + + // Set up pointer to the round constants. + lea K256+32*4(%rip), SHA256CONSTANTS + + // Initially we're not processing the final blocks. + xor FINAL_STEP, FINAL_STEP + + // Load the initial state from ctx->state. + movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA + movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE + movdqa STATE0_A, TMP_A + punpcklqdq STATE1_A, STATE0_A // FEBA + punpckhqdq TMP_A, STATE1_A // DCHG + pshufd $0x1B, STATE0_A, STATE0_A // ABEF + pshufd $0xB1, STATE1_A, STATE1_A // CDGH + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // LEN added to it. + mov LEN, LEN + mov OFFSETOF_BYTECOUNT(CTX), %rbx + lea (%rbx, LEN64, 1), COUNT + and $63, %ebx + jz .Lfinup2x_enter_loop // No bytes buffered? + + // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we + // just load 64 bytes from each of ctx->buf, DATA1, and DATA2 + // unconditionally and rearrange the data as needed. 
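+	//
+	// For example, with %ebx = 13 the stores below leave the 13 buffered
+	// bytes at %rsp[0..12] and DATA1's first 64 bytes at %rsp[13..76];
+	// reloading %rsp[0..63] then yields the first complete 64-byte block,
+	// and DATA1/DATA2 are advanced by the 51 data bytes just consumed.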
+ + movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A + movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A + movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A + movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqu MSG0_A, 0*16(%rsp,%rbx) + movdqu MSG1_A, 1*16(%rsp,%rbx) + movdqu MSG2_A, 2*16(%rsp,%rbx) + movdqu MSG3_A, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_A + movdqa 1*16(%rsp), MSG1_A + movdqa 2*16(%rsp), MSG2_A + movdqa 3*16(%rsp), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqu MSG0_B, 0*16(%rsp,%rbx) + movdqu MSG1_B, 1*16(%rsp,%rbx) + movdqu MSG2_B, 2*16(%rsp,%rbx) + movdqu MSG3_B, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_B + movdqa 1*16(%rsp), MSG1_B + movdqa 2*16(%rsp), MSG2_B + movdqa 3*16(%rsp), MSG3_B + + sub $64, %rbx // rbx = buffered - 64 + sub %rbx, DATA1 // DATA1 += 64 - buffered + sub %rbx, DATA2 // DATA2 += 64 - buffered + add %ebx, LEN // LEN += buffered - 64 + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B + jmp .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub $64, LEN + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B +.Lfinup2x_loop: + // Load the next two data blocks. + movdqu 0*16(DATA1), MSG0_A + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA1), MSG1_A + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA1), MSG2_A + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA1), MSG3_A + movdqu 3*16(DATA2), MSG3_B + add $64, DATA1 + add $64, DATA2 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. + pshufb SHUF_MASK, MSG0_A + pshufb SHUF_MASK, MSG0_B + pshufb SHUF_MASK, MSG1_A + pshufb SHUF_MASK, MSG1_B + pshufb SHUF_MASK, MSG2_A + pshufb SHUF_MASK, MSG2_B + pshufb SHUF_MASK, MSG3_A + pshufb SHUF_MASK, MSG3_B +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + movdqa STATE0_A, 0*16(%rsp) + movdqa STATE0_B, 1*16(%rsp) + movdqa STATE1_A, 2*16(%rsp) + movdqa STATE1_B, 3*16(%rsp) + + // Do the SHA-256 rounds on each block. +.irp i, 0, 16, 32, 48 + do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \ + MSG0_B, MSG1_B, MSG2_B, MSG3_B + do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \ + MSG1_B, MSG2_B, MSG3_B, MSG0_B + do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \ + MSG2_B, MSG3_B, MSG0_B, MSG1_B + do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \ + MSG3_B, MSG0_B, MSG1_B, MSG2_B +.endr + + // Add the original state for each block. + paddd 0*16(%rsp), STATE0_A + paddd 1*16(%rsp), STATE0_B + paddd 2*16(%rsp), STATE1_A + paddd 3*16(%rsp), STATE1_B + + // Update LEN and loop back if more blocks remain. + sub $64, LEN + jge .Lfinup2x_loop + + // Check if any final blocks need to be handled. + // FINAL_STEP = 2: all done + // FINAL_STEP = 1: need to do count-only padding block + // FINAL_STEP = 0: need to do the block with 0x80 padding byte + cmp $1, FINAL_STEP + jg .Lfinup2x_done + je .Lfinup2x_finalize_countonly + add $64, LEN + jz .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - LEN] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. 
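+	//
+	// For example, with LEN = 5: %rbx = 59, so the block loaded from
+	// %rsp[59..122] consists of the 5 remaining data bytes, the 0x80
+	// byte, zeroes, and (since LEN < 56) the big-endian 64-bit bit count
+	// in the final 8 bytes of the block.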
+ mov $64, %ebx + sub LEN, %ebx // ebx = 64 - LEN + sub %rbx, DATA1 // DATA1 -= 64 - LEN + sub %rbx, DATA2 // DATA2 -= 64 - LEN + mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary + movd FINAL_STEP, MSG0_A + pxor MSG1_A, MSG1_A + movdqa MSG0_A, 4*16(%rsp) + movdqa MSG1_A, 5*16(%rsp) + movdqa MSG1_A, 6*16(%rsp) + movdqa MSG1_A, 7*16(%rsp) + cmp $56, LEN + jge 1f // will COUNT spill into its own block? + shl $3, COUNT + bswap COUNT + mov COUNT, 56(%rsp,%rbx) + mov $2, FINAL_STEP // won't need count-only block + jmp 2f +1: + mov $1, FINAL_STEP // will need count-only block +2: + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_A + movdqu 1*16(%rsp,%rbx), MSG1_A + movdqu 2*16(%rsp,%rbx), MSG2_A + movdqu 3*16(%rsp,%rbx), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqa MSG0_B, 0*16(%rsp) + movdqa MSG1_B, 1*16(%rsp) + movdqa MSG2_B, 2*16(%rsp) + movdqa MSG3_B, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_B + movdqu 1*16(%rsp,%rbx), MSG1_B + movdqu 2*16(%rsp,%rbx), MSG2_B + movdqu 3*16(%rsp,%rbx), MSG3_B + jmp .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + pxor MSG0_A, MSG0_A + jmp 1f + +.Lfinup2x_finalize_blockaligned: + mov $0x80000000, %ebx + movd %ebx, MSG0_A +1: + pxor MSG1_A, MSG1_A + pxor MSG2_A, MSG2_A + ror $29, COUNT + movq COUNT, MSG3_A + pslldq $8, MSG3_A + movdqa MSG0_A, MSG0_B + pxor MSG1_B, MSG1_B + pxor MSG2_B, MSG2_B + movdqa MSG3_A, MSG3_B + mov $2, FINAL_STEP + jmp .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. 
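+	// This inverts the ABEF/CDGH shuffle applied when the state was
+	// loaded: STATE1_A/STATE1_B end up holding words A..D and
+	// STATE0_A/STATE0_B words E..H, which are then byte-swapped to big
+	// endian and stored as the two 32-byte digests.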
+ movdqa STATE0_A, TMP_A + movdqa STATE0_B, TMP_B + punpcklqdq STATE1_A, STATE0_A // GHEF + punpcklqdq STATE1_B, STATE0_B + punpckhqdq TMP_A, STATE1_A // ABCD + punpckhqdq TMP_B, STATE1_B + pshufd $0xB1, STATE0_A, STATE0_A // HGFE + pshufd $0xB1, STATE0_B, STATE0_B + pshufd $0x1B, STATE1_A, STATE1_A // DCBA + pshufd $0x1B, STATE1_B, STATE1_B + pshufb SHUF_MASK, STATE0_A + pshufb SHUF_MASK, STATE0_B + pshufb SHUF_MASK, STATE1_A + pshufb SHUF_MASK, STATE1_B + movdqu STATE0_A, 1*16(OUT1) + movdqu STATE0_B, 1*16(OUT2) + movdqu STATE1_A, 0*16(OUT1) + movdqu STATE1_B, 0*16(OUT2) + + mov %rbp, %rsp + pop %rbp + pop %rbx + RET +SYM_FUNC_END(sha256_ni_finup2x) + .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h index 669bc06538b67e..38e33b22a09277 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -5,9 +5,10 @@ * Copyright 2025 Google LLC */ #include -#include #include +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); + DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ @@ -16,7 +17,7 @@ DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ - if (likely(crypto_simd_usable())) { \ + if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ @@ -36,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state, static_call(sha256_blocks_x86)(state, data, nblocks); } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) 
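+	 * (As a rough illustration: assuming SHA-NI throughput on the order
+	 * of a few GB/s, two 65536-byte messages keep the preemption-off
+	 * window in the tens of microseconds.)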
+ */ + if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(irq_fpu_usable())) { + kernel_fpu_begin(); + sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); + kernel_fpu_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_sha_ni); +} + #define sha256_mod_init_arch sha256_mod_init_arch -static inline void sha256_mod_init_arch(void) +static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); + static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h index c13503d9d57d92..0213c70cedd01c 100644 --- a/lib/crypto/x86/sha512.h +++ b/lib/crypto/x86/sha512.h @@ -4,9 +4,7 @@ * * Copyright 2025 Google LLC */ - #include -#include #include DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); @@ -17,7 +15,7 @@ DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); static void c_fn(struct sha512_block_state *state, const u8 *data, \ size_t nblocks) \ { \ - if (likely(crypto_simd_usable())) { \ + if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ @@ -37,7 +35,7 @@ static void sha512_blocks(struct sha512_block_state *state, } #define sha512_mod_init_arch sha512_mod_init_arch -static inline void sha512_mod_init_arch(void) +static void sha512_mod_init_arch(void) { if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c index 5d54c4b437df78..5f779719c3d34c 100644 --- a/lib/raid6/recov_rvv.c +++ b/lib/raid6/recov_rvv.c @@ -4,9 +4,7 @@ * Author: Chunyan Zhang */ -#include #include -#include #include static int rvv_has_vector(void) diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c index 7d82efa5b14f9e..89da5fc247aa94 100644 --- a/lib/raid6/rvv.c +++ b/lib/raid6/rvv.c @@ -9,11 +9,8 @@ * Copyright 2002-2004 H. 
Peter Anvin */ -#include #include -#include #include -#include #include "rvv.h" #define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */ @@ -47,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -120,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -221,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -313,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -443,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -569,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -757,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -951,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v 
v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 8f688187fa8727..6a34a7582fdbf5 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -164,7 +164,7 @@ static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt) /* do bulk-testings for objects pop/push */ item->worker(item, 1); - hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle); + hrtimer_forward_now(hrt, item->hrtcycle); return HRTIMER_RESTART; } diff --git a/lib/tests/Makefile b/lib/tests/Makefile index fa6d728a8b5b15..f7460831cfdd4a 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o +obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c new file mode 100644 index 00000000000000..9a329cdc09c247 --- /dev/null +++ b/lib/tests/ffs_kunit.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for ffs()-family functions + */ +#include +#include + +/* + * Test data structures + */ +struct ffs_test_case { + unsigned long input; + int expected_ffs; /* ffs() result (1-based) */ + int expected_fls; /* fls() result (1-based) */ + const char *description; +}; + +struct ffs64_test_case { + u64 input; + int expected_fls64; /* fls64() result (1-based) */ + unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */ + const char *description; +}; + +/* + * Basic edge cases - core functionality validation + */ +static const struct ffs_test_case basic_test_cases[] = { + /* Zero case - special handling */ + {0x00000000, 0, 0, "zero value"}, + + /* Single bit patterns - powers of 2 */ + {0x00000001, 1, 1, "bit 0 set"}, + {0x00000002, 2, 2, "bit 1 set"}, + {0x00000004, 3, 3, "bit 2 set"}, + {0x00000008, 4, 4, "bit 3 set"}, + {0x00000010, 5, 5, "bit 4 set"}, + {0x00000020, 6, 6, "bit 5 set"}, + {0x00000040, 7, 7, "bit 6 set"}, + {0x00000080, 8, 8, "bit 7 set"}, + {0x00000100, 9, 9, "bit 8 set"}, + {0x00008000, 16, 16, "bit 15 set"}, + {0x00010000, 17, 17, "bit 16 set"}, + {0x40000000, 31, 31, "bit 30 set"}, + {0x80000000, 32, 32, "bit 31 set (sign bit)"}, + + /* Maximum values */ + {0xFFFFFFFF, 1, 32, "all bits set"}, + + /* Multiple bit patterns */ + {0x00000003, 1, 2, "bits 0-1 set"}, + {0x00000007, 1, 3, "bits 0-2 set"}, + {0x0000000F, 1, 4, "bits 0-3 set"}, + {0x000000FF, 1, 8, "bits 0-7 set"}, + {0x0000FFFF, 1, 16, "bits 0-15 set"}, + {0x7FFFFFFF, 1, 31, "bits 0-30 set"}, + + /* Sparse patterns */ + {0x00000101, 1, 9, "bits 0,8 set"}, + {0x00001001, 1, 13, "bits 0,12 set"}, + {0x80000001, 1, 32, "bits 0,31 set"}, + {0x40000002, 2, 31, "bits 1,30 set"}, +}; + +/* + * 64-bit test cases + */ +static const struct ffs64_test_case ffs64_test_cases[] = { + /* Zero case */ + {0x0000000000000000ULL, 0, 0, "zero 
value"}, + + /* Single bit patterns */ + {0x0000000000000001ULL, 1, 0, "bit 0 set"}, + {0x0000000000000002ULL, 2, 1, "bit 1 set"}, + {0x0000000000000004ULL, 3, 2, "bit 2 set"}, + {0x0000000000000008ULL, 4, 3, "bit 3 set"}, + {0x0000000000008000ULL, 16, 15, "bit 15 set"}, + {0x0000000000010000ULL, 17, 16, "bit 16 set"}, + {0x0000000080000000ULL, 32, 31, "bit 31 set"}, + {0x0000000100000000ULL, 33, 32, "bit 32 set"}, + {0x0000000200000000ULL, 34, 33, "bit 33 set"}, + {0x4000000000000000ULL, 63, 62, "bit 62 set"}, + {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"}, + + /* Maximum values */ + {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"}, + + /* Cross 32-bit boundary patterns */ + {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"}, + {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"}, + {0x8000000000000001ULL, 64, 0, "bits 0,63 set"}, + {0x4000000000000002ULL, 63, 1, "bits 1,62 set"}, + + /* Mixed patterns */ + {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"}, + {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"}, +}; + +/* + * Helper function to validate ffs results with detailed error messages + */ +static void validate_ffs_result(struct kunit *test, unsigned long input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%08lx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate 64-bit ffs results + */ +static void validate_ffs64_result(struct kunit *test, u64 input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%016llx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate mathematical relationships between functions + */ +static void validate_ffs_relationships(struct kunit *test, unsigned long input) +{ + int ffs_result; + int fls_result; + unsigned int ffs_0based; + unsigned int fls_0based; + + if (input == 0) { + /* Special case: zero input */ + KUNIT_EXPECT_EQ(test, ffs(input), 0); + KUNIT_EXPECT_EQ(test, fls(input), 0); + /* __ffs and __fls are undefined for 0, but often return specific values */ + return; + } + + ffs_result = ffs(input); + fls_result = fls(input); + ffs_0based = __ffs(input); + fls_0based = __fls(input); + + /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1, + "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1", + input, input, ffs_result, ffs_0based); + + /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1, + "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1", + input, input, fls_result, fls_0based); + + /* Range validation */ + KUNIT_EXPECT_GE(test, ffs_result, 1); + KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG); + KUNIT_EXPECT_GE(test, fls_result, 1); + KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG); +} + +/* + * Helper function to validate 64-bit relationships + */ +static void validate_ffs64_relationships(struct kunit *test, u64 input) +{ + int fls64_result; + unsigned int ffs64_0based; + + if (input == 0) { + KUNIT_EXPECT_EQ(test, fls64(input), 0); + return; + } + + fls64_result = fls64(input); + ffs64_0based = __ffs64(input); + + /* Range validation */ + KUNIT_EXPECT_GE(test, fls64_result, 1); + KUNIT_EXPECT_LE(test, fls64_result, 64); + KUNIT_EXPECT_LT(test, ffs64_0based, 64); + + /* + * Relationships with 32-bit 
functions should hold for small values + * on all architectures. + */ + if (input <= 0xFFFFFFFFULL) { + unsigned long input_32 = (unsigned long)input; + KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32), + "fls64(0x%llx) != fls(0x%lx): %d != %d", + input, input_32, fls64(input), fls(input_32)); + + if (input != 0) { + KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32), + "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu", + input, input_32, + (unsigned long)__ffs64(input), + (unsigned long)__ffs(input_32)); + } + } +} + +/* + * Test basic correctness of all ffs-family functions + */ +static void ffs_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + const struct ffs_test_case *tc = &basic_test_cases[i]; + + /* Test ffs() */ + validate_ffs_result(test, tc->input, ffs(tc->input), + tc->expected_ffs, "ffs", tc->description); + + /* Test fls() */ + validate_ffs_result(test, tc->input, fls(tc->input), + tc->expected_fls, "fls", tc->description); + + /* Test __ffs() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */ + unsigned int expected_ffs_0based = tc->expected_ffs - 1; + validate_ffs_result(test, tc->input, __ffs(tc->input), + expected_ffs_0based, "__ffs", tc->description); + } + + /* Test __fls() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */ + unsigned int expected_fls_0based = tc->expected_fls - 1; + validate_ffs_result(test, tc->input, __fls(tc->input), + expected_fls_0based, "__fls", tc->description); + } + } +} + +/* + * Test 64-bit function correctness + */ +static void ffs64_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + const struct ffs64_test_case *tc = &ffs64_test_cases[i]; + + /* Test fls64() */ + validate_ffs64_result(test, tc->input, fls64(tc->input), + tc->expected_fls64, "fls64", tc->description); + + /* Test __ffs64() - skip zero case as it's undefined */ + if (tc->input != 0) { + validate_ffs64_result(test, tc->input, __ffs64(tc->input), + tc->expected_ffs64_0based, "__ffs64", + tc->description); + } + } +} + +/* + * Test mathematical relationships between functions + */ +static void ffs_mathematical_relationships_test(struct kunit *test) +{ + int i; + + /* Test basic cases */ + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + validate_ffs_relationships(test, basic_test_cases[i].input); + } + + /* Test 64-bit cases */ + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + validate_ffs64_relationships(test, ffs64_test_cases[i].input); + } +} + +/* + * Test edge cases and boundary conditions + */ +static void ffs_edge_cases_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 */ + 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL, + 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL, + + /* Powers of 2 minus 1 */ + 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL, + 511UL, 1023UL, 2047UL, 4095UL, 8191UL, + + /* Boundary values */ + 0x7FFFFFFFUL, /* Maximum positive 32-bit */ + 0x80000000UL, /* Minimum negative 32-bit */ + 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffs_relationships(test, test_patterns[i]); + } +} + +/* + * Test 64-bit edge cases + */ +static void ffs64_edge_cases_test(struct kunit *test) +{ + u64 test_patterns_64[] = { + /* 64-bit powers of 2 */ + 0x0000000100000000ULL, /* 
2^32 */ + 0x0000000200000000ULL, /* 2^33 */ + 0x0000000400000000ULL, /* 2^34 */ + 0x0000001000000000ULL, /* 2^36 */ + 0x0000010000000000ULL, /* 2^40 */ + 0x0001000000000000ULL, /* 2^48 */ + 0x0100000000000000ULL, /* 2^56 */ + 0x4000000000000000ULL, /* 2^62 */ + 0x8000000000000000ULL, /* 2^63 */ + + /* Cross-boundary patterns */ + 0x00000000FFFFFFFFULL, /* Lower 32 bits */ + 0xFFFFFFFF00000000ULL, /* Upper 32 bits */ + 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */ + 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) { + validate_ffs64_relationships(test, test_patterns_64[i]); + } +} + +/* + * ffz() test data - Find First Zero bit test cases + */ +struct ffz_test_case { + unsigned long input; + unsigned long expected_ffz; + const char *description; +}; + +static const struct ffz_test_case ffz_test_cases[] = { + /* Zero bits in specific positions */ + {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */ + {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */ + {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */ + {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */ + {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */ + {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */ + {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */ + {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */ + {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */ + {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */ + {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */ + {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */ + {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */ + + /* Multiple zero patterns */ + {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */ + {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */ + {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */ + {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */ + {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */ + + /* All zeros (special case) */ + {0x00000000, 0, "all bits zero"}, + + /* Complex patterns */ + {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */ + {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */ + {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */ + {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */ +}; + +/* + * Test basic correctness of ffz() function + */ +static void ffz_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + const struct ffz_test_case *tc = &ffz_test_cases[i]; + unsigned long result = ffz(tc->input); + + KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz, + "ffz(0x%08lx) [%s]: expected %lu, got %lu", + tc->input, tc->description, tc->expected_ffz, result); + } +} + +/* + * Test mathematical relationships between ffz() and other functions + */ +static void validate_ffz_relationships(struct kunit *test, unsigned long input) +{ + unsigned long ffz_result; + + if (input == 0) { + /* ffz(0) should return 0 (first zero bit is at position 0) */ + KUNIT_EXPECT_EQ(test, ffz(input), 0); + return; + } + + if (input == ~0UL) { + /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */ + ffz_result = ffz(input); + /* Implementation-defined behavior, just ensure it completes */ + return; + } + + ffz_result = ffz(input); + + /* Range validation - result should be within valid bit range */ + KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG); + + /* Verify the bit at ffz_result position is actually zero */ + KUNIT_EXPECT_EQ_MSG(test, (input 
>> ffz_result) & 1, 0, + "ffz(0x%08lx) = %lu, but bit %lu is not zero", + input, ffz_result, ffz_result); + + /* Core relationship: if we set the ffz bit, ffz should find a different bit */ + if (ffz_result < BITS_PER_LONG - 1) { + unsigned long modified = input | (1UL << ffz_result); + if (modified != ~0UL) { /* Skip if all bits would be set */ + unsigned long new_ffz = ffz(modified); + KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result, + "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result", + input, ffz_result); + } + } +} + +static void ffz_mathematical_relationships_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 with one bit clear */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + + /* Multiple patterns */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF, + + /* Complex bit patterns */ + 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333, + 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF, + }; + int i; + + /* Test basic test cases */ + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + validate_ffz_relationships(test, ffz_test_cases[i].input); + } + + /* Test additional patterns */ + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffz_relationships(test, test_patterns[i]); + } +} + +/* + * Test edge cases and boundary conditions for ffz() + */ +static void ffz_edge_cases_test(struct kunit *test) +{ + unsigned long edge_patterns[] = { + /* Boundary values */ + 0x00000000, /* All zeros */ + 0x80000000, /* Only MSB set */ + 0x00000001, /* Only LSB set */ + 0x7FFFFFFF, /* MSB clear */ + 0xFFFFFFFE, /* LSB clear */ + + /* Powers of 2 complement patterns (one zero bit each) */ + ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3), + ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31), + + /* Walking zero patterns */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF, + + /* Multiple zeros */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0xFF000000, 0xF0000000, 0x00000000, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) { + validate_ffz_relationships(test, edge_patterns[i]); + } +} + +/* + * To have useful build error output, split the tests into separate + * functions so it's clear which are missing __attribute_const__. + */ +#define CREATE_WRAPPER(func) \ +static noinline bool build_test_##func(void) \ +{ \ + int init_##func = 32; \ + int result_##func = func(6); \ + \ + /* Does the static initializer vanish after calling func? */ \ + BUILD_BUG_ON(init_##func < 32); \ + \ + /* "Consume" the results so optimizer doesn't drop them. */ \ + barrier_data(&init_##func); \ + barrier_data(&result_##func); \ + \ + return true; \ +} +CREATE_WRAPPER(ffs) +CREATE_WRAPPER(fls) +CREATE_WRAPPER(__ffs) +CREATE_WRAPPER(__fls) +CREATE_WRAPPER(ffz) +#undef CREATE_WRAPPER + +/* + * Make sure that __attribute_const__ has been applied to all the + * functions.
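+ * (__attribute_const__ expands to __attribute__((__const__)): the return + * value depends only on the arguments, so the compiler may fold calls with + * constant arguments at compile time, which is what the CREATE_WRAPPER() + * build checks above rely on.)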
This is a regression test for: + * https://github.com/KSPP/linux/issues/364 + */ +static void ffs_attribute_const_test(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, build_test_ffs()); + KUNIT_EXPECT_TRUE(test, build_test_fls()); + KUNIT_EXPECT_TRUE(test, build_test___ffs()); + KUNIT_EXPECT_TRUE(test, build_test___fls()); + KUNIT_EXPECT_TRUE(test, build_test_ffz()); +} + +/* + * KUnit test case definitions + */ +static struct kunit_case ffs_test_cases[] = { + KUNIT_CASE(ffs_basic_correctness_test), + KUNIT_CASE(ffs64_correctness_test), + KUNIT_CASE(ffs_mathematical_relationships_test), + KUNIT_CASE(ffs_edge_cases_test), + KUNIT_CASE(ffs64_edge_cases_test), + KUNIT_CASE(ffz_basic_correctness_test), + KUNIT_CASE(ffz_mathematical_relationships_test), + KUNIT_CASE(ffz_edge_cases_test), + KUNIT_CASE(ffs_attribute_const_test), + {} +}; + +/* + * KUnit test suite definition + */ +static struct kunit_suite ffs_test_suite = { + .name = "ffs", + .test_cases = ffs_test_cases, +}; + +kunit_test_suites(&ffs_test_suite); + +MODULE_DESCRIPTION("KUnit tests for ffs()-family functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 45df764b49ad62..db87ba34ef1928 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -12,24 +12,6 @@ config GENERIC_GETTIMEOFDAY Each architecture that enables this feature has to provide the fallback implementation. -config GENERIC_VDSO_32 - bool - depends on GENERIC_GETTIMEOFDAY && !64BIT - help - This config option helps to avoid possible performance issues - in 32 bit only architectures. - -config GENERIC_COMPAT_VDSO - bool - help - This config option enables the compat VDSO layer. - -config GENERIC_VDSO_TIME_NS - bool - help - Selected by architectures which support time namespaces in the - VDSO - config GENERIC_VDSO_OVERFLOW_PROTECT bool help @@ -37,14 +19,9 @@ config GENERIC_VDSO_OVERFLOW_PROTECT time getter functions for the price of an extra conditional in the hotpath. -endif - config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). -config GENERIC_VDSO_DATA_STORE - bool - help - Selected by architectures that use the generic vDSO data store. +endif diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index aedd40aaa950c8..405f743253d72b 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index 3693c6caf2c4d4..a565c30c71a04f 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -11,14 +11,14 @@ /* * The vDSO data page. 
*/ -#ifdef CONFIG_HAVE_GENERIC_VDSO +#ifdef CONFIG_GENERIC_GETTIMEOFDAY static union { struct vdso_time_data data; u8 page[PAGE_SIZE]; } vdso_time_data_store __page_aligned_data; struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); -#endif /* CONFIG_HAVE_GENERIC_VDSO */ +#endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #ifdef CONFIG_VDSO_GETRANDOM static union { @@ -46,7 +46,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: - if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO)) + if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); if (timens_page) { diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 02ea19f671647e..95df0153f05ab4 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -108,15 +108,11 @@ bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock return true; } -#ifdef CONFIG_TIME_NS - -#ifdef CONFIG_GENERIC_VDSO_DATA_STORE static __always_inline const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return (void *)vd + PAGE_SIZE; } -#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ static __always_inline bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, @@ -149,20 +145,6 @@ bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock * return true; } -#else -static __always_inline -const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) -{ - return NULL; -} - -static __always_inline -bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, @@ -204,7 +186,6 @@ bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, return true; } -#ifdef CONFIG_TIME_NS static __always_inline bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, clockid_t clk, struct __kernel_timespec *ts) @@ -233,14 +214,6 @@ bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock return true; } -#else -static __always_inline -bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, - clockid_t clk, struct __kernel_timespec *ts) -{ - return false; -} -#endif static __always_inline bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef86..0beaca6bacf778 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = 
blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } diff --git a/mm/damon/core.c b/mm/damon/core.c index 106ee8b0f2d5f1..08065b36397224 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2111,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty("a->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) + quota->charged_from = jiffies; + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { @@ -2475,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b81..b5a5ed16a7a5db 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) goto out; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b459667675..fb7c982a0018c2 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cbaa..7308dee97b2109 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? 
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); @@ -1530,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data) return 0; } -static struct damon_call_control damon_sysfs_repeat_call_control = { - .fn = damon_sysfs_repeat_call_fn, - .repeat = true, -}; - static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; + struct damon_call_control *repeat_call_control; int err; if (damon_sysfs_kdamond_running(kdamond)) @@ -1550,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; + repeat_call_control = kmalloc(sizeof(*repeat_call_control), + GFP_KERNEL); + if (!repeat_call_control) + return -ENOMEM; + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + kfree(repeat_call_control); return PTR_ERR(ctx); + } err = damon_start(&ctx, 1, false); if (err) { + kfree(repeat_call_control); damon_destroy_ctx(ctx); return err; } kdamond->damon_ctx = ctx; - damon_sysfs_repeat_call_control.data = kdamond; - damon_call(ctx, &damon_sysfs_repeat_call_control); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; + repeat_call_control->data = kdamond; + repeat_call_control->repeat = true; + repeat_call_control->dealloc_on_cancel = true; + damon_call(ctx, repeat_call_control); return err; } @@ -1581,12 +1592,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data), struct damon_sysfs_kdamond *kdamond) { struct damon_call_control call_control = {}; + int err; if (!kdamond->damon_ctx) return -EINVAL; call_control.fn = fn; call_control.data = kdamond; - return damon_call(kdamond->damon_ctx, &call_control); + err = damon_call(kdamond->damon_ctx, &call_control); + return err ? err : call_control.return_code; } struct damon_sysfs_schemes_walk_data { diff --git a/mm/gup.c b/mm/gup.c index adffe663594dc6..0bc4d140fc07fb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios( struct pages_or_folios *pofs) { unsigned long collected = 0; - bool drain_allow = true; struct folio *folio; + int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; @@ -2307,9 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drained == 0 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c71866..6cfe0b43ab8f96 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5594,18 +5594,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* - * If the pagetables are shared don't copy or take references. - * - * dst_pte == src_pte is the common case of src/dest sharing. - * However, src could have 'unshared' and dst shares with - * another vma. So page_count of ptep page is checked instead - * to reliably determine whether pte is shared. 
- */ - if (page_count(virt_to_page(dst_pte)) > 1) { +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + /* If the pagetables are shared, there is nothing to do */ + if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -5851,7 +5846,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5939,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the @@ -5951,14 +5947,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * If there we are freeing a surplus, do not set the restore * reservation bit. */ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the @@ -7599,7 +7597,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; - if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e2ceebf737ef7b..11d472a5c4e8d0 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -336,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -354,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * Page table allocations ignore the external gfp mask; enforce it + * by the scope API. + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save();
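+ + /* + * Note: apply_to_page_range() allocates intermediate page tables + * on its own, without a gfp argument; within the scopes entered + * above those allocations implicitly behave as GFP_NOFS/GFP_NOIO, + * so the caller's reclaim constraints still hold. + */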
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -386,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; @@ -415,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); if (ret) return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c3c..b486c1d19b2dd2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, _address))) referenced++; } if (!writable) { diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 1ea711786c522d..8bca7fece47f0e 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, u32 origin, bool checked) { u64 address = (u64)addr; - u32 *shadow_start, *origin_start; + void *shadow_start; + u32 *aligned_shadow, *origin_start; size_t pad = 0; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); @@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, } __memset(shadow_start, b, size); - if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + aligned_shadow = shadow_start; + } else { pad = address % KMSAN_ORIGIN_SIZE; address -= pad; + aligned_shadow = shadow_start - pad; size += pad; } size = ALIGN(size, KMSAN_ORIGIN_SIZE); @@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, * corresponding shadow slot is zero. */ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) { - if (origin || !shadow_start[i]) + if (origin || !aligned_shadow[i]) origin_start[i] = origin; } } diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index c6c5b2bbede0cc..902ec48b1e3e6a 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16) DEFINE_TEST_MEMSETXX(32) DEFINE_TEST_MEMSETXX(64) +/* Test case: ensure that KMSAN does not access shadow memory out of bounds. 
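+ * The buffer comes from vmalloc(), so it is adjacent to unmapped guard + * pages; if the shadow/origin accesses were rounded past the buffer (as in + * the kmsan_internal_set_shadow_origin() issue fixed above), the memset()s + * at either end would touch unmapped metadata and crash.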
*/ +static void test_memset_on_guarded_buffer(struct kunit *test) +{ + void *buf = vmalloc(PAGE_SIZE); + + kunit_info(test, + "memset() on ends of guarded buffer should not crash\n"); + + for (size_t size = 0; size <= 128; size++) { + memset(buf, 0xff, size); + memset(buf + PAGE_SIZE - size, 0xff, size); + } + vfree(buf); +} + static noinline void fibonacci(int *array, int size, int start) { if (start < 2 || (start == size)) @@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), + KUNIT_CASE(test_memset_on_guarded_buffer), KUNIT_CASE(test_long_origin_chain), KUNIT_CASE(test_stackdepot_roundtrip), KUNIT_CASE(test_unpoison_memory), diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf47..df6ee59527ddf5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -956,7 +956,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_ALREADY_POISONED] = "already poisoned page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1349,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(pfn); - - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_ALREADY_POISONED) { + num_poisoned_pages_inc(pfn); + update_per_node_mf_stats(pfn, result); + } pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -2094,12 +2095,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); return res; } else if (res == -EBUSY) { if (!(flags & MF_NO_RETRY)) { @@ -2285,7 +2285,6 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); @@ -2569,10 +2568,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc346..74318c78771567 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. 
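+ * Skip non-hugetlb large folios and only drop the reference we + * hold; hugetlb and order-0 folios still get isolated and + * unmapped below.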
+ */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6db5..bb0776f5ef7cad 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/mremap.c b/mm/mremap.c index e618a706aff5a6..35de0a7b910e08 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? */ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? */ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d28..936689d8bcac5e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions, + * e.g. memory hot-remove, in which case the mmap lock is insufficient + * to prevent the intermediate kernel page tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It must be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that is, there is no concurrent access, for example while + * changing permissions for vmalloc objects.
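+ * Unlike walk_kernel_page_table_range(), no mmap_assert_locked() check is + * performed here; the caller's synchronisation scheme is taken on trust.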
+ */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } diff --git a/mm/percpu.c b/mm/percpu.c index a56f35dcc417e6..81462ce5866e16 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b6e..932727247c64b2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, diff --git a/mm/swap.c b/mm/swap.c index 3632dd061bebb0..b74ebe865dd92a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add && !folio_test_clear_lru(folio)) + continue; + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, - struct folio *folio, move_fn_t move_fn, - bool on_lru, bool disable_irq) + struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; - if (on_lru && !folio_test_clear_lru(folio)) - return; - folio_get(folio); if (disable_irq) @@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, else local_lock(&cpu_fbatches.lock); - if 
(!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || - lru_cache_disabled()) + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) @@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_unlock(&cpu_fbatches.lock); } -#define folio_batch_add_and_move(folio, op, on_lru) \ - __folio_batch_add_and_move( \ - &cpu_fbatches.op, \ - folio, \ - op, \ - on_lru, \ - offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ +#define folio_batch_add_and_move(folio, op) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + offsetof(struct cpu_fbatches, op) >= \ + offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) @@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || - folio_test_unevictable(folio)) + folio_test_unevictable(folio) || !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_move_tail, true); + folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, @@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_active(folio) || folio_test_unevictable(folio)) + if (folio_test_active(folio) || folio_test_unevictable(folio) || + !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_activate, true); + folio_batch_add_and_move(folio, lru_activate); } #else @@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_batch_add_and_move(folio, lru_add, false); + folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); @@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu) void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate_file, true); + folio_batch_add_and_move(folio, lru_deactivate_file); } /* @@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate, true); + folio_batch_add_and_move(folio, lru_deactivate); } /** @@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio) void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; - folio_batch_add_and_move(folio, lru_lazyfree, true); + folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae134..5edd536ba9d2a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. */ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2089,7 +2089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } @@ -4826,7 +4826,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd9251..674999999cd067 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7a217485185741..fe49e8a7969ffc 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3087,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like fuzz + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup.
*/ @@ -4391,6 +4401,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "num %d", ev->num); + hci_dev_lock(hdev); + for (i = 0; i < ev->num; i++) { struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; @@ -4472,6 +4484,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, } queue_work(hdev->workqueue, &hdev->tx_work); + + hci_dev_unlock(hdev); } static void hci_mode_change_evt(struct hci_dev *hdev, void *data, @@ -5634,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like fuzz + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b6f888d8354e3f..7a7d4989085848 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2594,6 +2594,13 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } + + /* If the current advertising instance is set to instance 0x00 + * then we need to re-enable it. + */ + if (!hdev->cur_adv_instance) + err = hci_enable_ext_advertising_sync(hdev, + hdev->cur_adv_instance); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 50634ef5c8b707..225140fcb3d6c8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1323,8 +1323,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_mode *cp; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; cp = cmd->param; @@ -1351,23 +1350,29 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp; + struct mgmt_mode cp; + + mutex_lock(&hdev->mgmt_pending_lock); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); return -ECANCELED; + } - cp = cmd->param; + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); BT_DBG("%s", hdev->name); - return hci_set_powered_sync(hdev, cp->val); + return hci_set_powered_sync(hdev, cp.val); } static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, @@ -1516,8 +1521,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding.
*/ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1539,12 +1543,15 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } static int set_discoverable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_discoverable_sync(hdev); @@ -1691,8 +1698,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1707,7 +1713,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } @@ -1743,6 +1749,9 @@ static int set_connectable_update_settings(struct hci_dev *hdev, static int set_connectable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_connectable_sync(hdev); @@ -1919,14 +1928,17 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) { struct cmd_lookup match = { NULL, hdev }; struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 enable = cp->val; + struct mgmt_mode *cp; + u8 enable; bool changed; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + enable = cp->val; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -1935,8 +1947,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, - cmd_status_rsp, &mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); return; } @@ -1946,7 +1957,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); if (changed) new_settings(hdev, match.sk); @@ -1960,14 +1971,25 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) static int set_ssp_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode cp; bool changed = false; int err; - if (cp->val) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + if (cp.val) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); - err = hci_write_ssp_mode_sync(hdev, cp->val); + err = hci_write_ssp_mode_sync(hdev, cp.val); if (!err && changed) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); @@ -2060,32 +2082,50 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) static void set_le_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct 
cmd_lookup match = { NULL, hdev }; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, - &status); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) return; + + if (status) { + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + goto done; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); + +done: + mgmt_pending_free(cmd); } static int set_le_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; int err; + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + val = !!cp.val; + + mutex_unlock(&hdev->mgmt_pending_lock); + if (!val) { hci_clear_adv_instance_sync(hdev, NULL, 0x00, true); @@ -2127,7 +2167,12 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; u8 status = mgmt_status(err); - struct sock *sk = cmd->sk; + struct sock *sk; + + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) + return; + + sk = cmd->sk; if (status) { mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, @@ -2142,24 +2187,37 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) static int set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh *cp = cmd->param; - size_t len = cmd->param_len; + struct mgmt_cp_set_mesh cp; + size_t len; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + len = cmd->param_len; memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); - if (cp->enable) + if (cp.enable) hci_dev_set_flag(hdev, HCI_MESH); else hci_dev_clear_flag(hdev, HCI_MESH); - hdev->le_scan_interval = __le16_to_cpu(cp->period); - hdev->le_scan_window = __le16_to_cpu(cp->window); + hdev->le_scan_interval = __le16_to_cpu(cp.period); + hdev->le_scan_window = __le16_to_cpu(cp.window); - len -= sizeof(*cp); + len -= sizeof(cp); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) - memcpy(hdev->mesh_ad_types, cp->ad_types, len); + memcpy(hdev->mesh_ad_types, cp.ad_types, len); hci_update_passive_scan_sync(hdev); return 0; @@ -3867,15 +3925,16 @@ static int name_changed_sync(struct hci_dev *hdev, void *data) static void set_name_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name *cp; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3887,16 +3946,27 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_name_sync(struct hci_dev *hdev, 
void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name cp; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev, cp->name); + hci_update_name_sync(hdev, cp.name); hci_update_eir_sync(hdev); } @@ -4048,12 +4118,10 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct sk_buff *skb = cmd->skb; + struct sk_buff *skb; u8 status = mgmt_status(err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) - return; + skb = cmd->skb; if (!status) { if (!skb) @@ -4080,7 +4148,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) if (skb && !IS_ERR(skb)) kfree_skb(skb); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_default_phy_sync(struct hci_dev *hdev, void *data) @@ -4088,7 +4156,9 @@ static int set_default_phy_sync(struct hci_dev *hdev, void *data) struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_phy_configuration *cp = cmd->param; struct hci_cp_le_set_default_phy cp_phy; - u32 selected_phys = __le32_to_cpu(cp->selected_phys); + u32 selected_phys; + + selected_phys = __le32_to_cpu(cp->selected_phys); memset(&cp_phy, 0, sizeof(cp_phy)); @@ -4228,7 +4298,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, len); if (!cmd) err = -ENOMEM; @@ -5189,7 +5259,17 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, { struct mgmt_rp_add_adv_patterns_monitor rp; struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *monitor; + + /* This is likely the result of hdev being closed and mgmt_index_removed + * is attempting to clean up any pending command so + * hci_adv_monitors_clear is about to be called which will take care of + * freeing the adv_monitor instances. 
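+ * Returning before dereferencing cmd->user_data below avoids + * touching an adv_monitor that may already have been freed.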
+ */ + if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd)) + return; + + monitor = cmd->user_data; hci_dev_lock(hdev); @@ -5215,9 +5295,20 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *mon; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + mon = cmd->user_data; + + mutex_unlock(&hdev->mgmt_pending_lock); - return hci_add_adv_monitor(hdev, monitor); + return hci_add_adv_monitor(hdev, mon); } static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, @@ -5484,7 +5575,8 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, status); } -static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err) +static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, + int err) { struct mgmt_rp_read_local_oob_data mgmt_rp; size_t rp_size = sizeof(mgmt_rp); @@ -5504,7 +5596,8 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int e bt_dev_dbg(hdev, "status %d", status); if (status) { - mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + status); goto remove; } @@ -5786,17 +5879,12 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED) - return; - - if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_discovery_set_state(hdev, err ? 
DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5804,6 +5892,9 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) static int start_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_start_discovery_sync(hdev); } @@ -6009,15 +6100,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -6025,6 +6115,9 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) static int stop_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_stop_discovery_sync(hdev); } @@ -6234,14 +6327,18 @@ static void enable_advertising_instance(struct hci_dev *hdev, int err) static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 instance; struct adv_info *adv_instance; u8 status = mgmt_status(err); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) + return; + if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, - cmd_status_rsp, &status); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + mgmt_pending_free(cmd); return; } @@ -6250,8 +6347,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, - &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); @@ -6283,10 +6379,23 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) static int set_adv_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; - if (cp->val == 0x02) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + val = !!cp.val; + + if (cp.val == 0x02) hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); @@ -8039,10 +8148,6 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) - return; - if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -8149,7 +8254,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, kfree_skb(skb); kfree(mgmt_rp); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, @@ -8158,7 +8263,7 @@ static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, struct mgmt_pending_cmd *cmd; int err; - cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, + cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, cp, 
sizeof(*cp)); if (!cmd) return -ENOMEM; diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index a88a07da394734..aa7b5585cb268b 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -320,6 +320,52 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) mgmt_pending_free(cmd); } +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + struct mgmt_pending_cmd *tmp; + + lockdep_assert_held(&hdev->mgmt_pending_lock); + + if (!cmd) + return false; + + list_for_each_entry(tmp, &hdev->mgmt_pending, list) { + if (cmd == tmp) + return true; + } + + return false; +} + +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + mutex_lock(&hdev->mgmt_pending_lock); + listed = __mgmt_pending_listed(hdev, cmd); + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + if (!cmd) + return false; + + mutex_lock(&hdev->mgmt_pending_lock); + + listed = __mgmt_pending_listed(hdev, cmd); + if (listed) + list_del(&cmd->list); + + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 024e51dd693756..bcba8c9d895285 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -65,6 +65,9 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, void *data, u16 len); void mgmt_pending_free(struct mgmt_pending_cmd *cmd); void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c51..dfb03ee0bb62a5 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -524,27 +524,27 @@ __bpf_kfunc int bpf_fentry_test1(int a) } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -int noinline bpf_fentry_test2(int a, u64 b) +noinline int bpf_fentry_test2(int a, u64 b) { return a + b; } -int noinline bpf_fentry_test3(char a, int b, u64 c) +noinline int bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } -int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +noinline int bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } -int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } -int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } @@ -553,13 +553,13 @@ struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; -int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile ("": "+r"(arg)); + asm volatile ("" : "+r"(arg)); return (long)arg; } -int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } @@ 
-569,12 +569,12 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } -int noinline bpf_fentry_test10(const void *a) +noinline int bpf_fentry_test10(const void *a) { return (long)a; } -void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -598,7 +598,7 @@ __bpf_kfunc int bpf_modify_return_test_tp(int nonce) return nonce; } -int noinline bpf_fentry_shadow_test(int a) +noinline int bpf_fentry_shadow_test(int a) { return a + 1; } @@ -665,7 +665,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) + if (user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); @@ -1001,6 +1001,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, kattr->test.cpu || kattr->test.batch_size) return -EINVAL; + if (size < ETH_HLEN) + return -EINVAL; + data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); @@ -1207,9 +1210,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size; + u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; - u32 retval = 0, duration, max_data_sz; - u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; @@ -1246,39 +1249,45 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != size || + if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; - } - max_data_sz = PAGE_SIZE - headroom - tailroom; - if (size > max_data_sz) { - /* disallow live data mode for jumbo frames */ - if (do_live) - goto free_ctx; - size = max_data_sz; + meta_sz = ctx->data; + linear_sz = ctx->data_end; } - data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); + max_linear_sz = PAGE_SIZE - headroom - tailroom; + linear_sz = min_t(u32, linear_sz, max_linear_sz); + + /* disallow live data mode for jumbo frames */ + if (do_live && kattr->test.data_size_in > linear_sz) + goto free_ctx; + + if (kattr->test.data_size_in - meta_sz < ETH_HLEN) + return -EINVAL; + + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; + rxqueue->xdp_rxq.frag_size = PAGE_SIZE; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); - xdp_prepare_buff(&xdp, data, headroom, size, true); + xdp_prepare_buff(&xdp, data, headroom, linear_sz, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; + size = linear_sz; if (unlikely(kattr->test.data_size_in > 
size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f027..c683baa3847f17 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed862..797719cb227ec5 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111b5..81f58924b4acd7 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42b5..3706a872ecafdb 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9d2..88e7160d424896 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } @@ -1300,6 +1303,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() has not been called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check that j1939_jsk_del() has not yet been called on this socket after + * taking the socket lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with the socket lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del().
+ * Make this socket no longer bound by pretending that j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). + * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period. But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). + */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d1b5705dc0c648..9f6d860411cbd1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->v1.auth_retry) { + if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); @@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { - dout("clear_standby %p and ++connect_seq\n", con); + dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; - con->v1.connect_seq++; + if (!ceph_msgr2(from_msgr(con->msgr))) + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b66..8d49b2198d072f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread.
*/ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d810..ad54b12d4b4c80 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788d0..2af0a5f1d7489c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? 
xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,18 +4209,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } @@ -7431,6 +7439,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_sock, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_xdp_sock); + switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); @@ -9284,13 +9294,17 @@ static bool sock_addr_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET; break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) return false; - } + if (size != size_default) + return false; + break; + default: + return false; } return true; @@ -11990,6 +12004,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12007,6 +12031,42 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. 
+ * + * If the passed @skb_ is a clone which shares the data with the original, the + * dynptr will be read-only. This limitation may be lifted in the future. + * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + if (skb_cloned(skb)) + bpf_dynptr_set_rdonly(ptr); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12160,6 +12220,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, return 0; } +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. + * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * the first @len bytes are not all in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc may change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ?
xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12181,8 +12333,13 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) @@ -12202,6 +12359,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12399,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd66654447d..3c2dc4c5e683ea 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) 
queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) @@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e1f..b0e0f22d7b213c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); @@ -397,10 +398,15 @@ static __net_init void preinit_net_sysctl(struct net *net) } /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { + int ret; + + ret = ns_common_init(net); + if (ret) + return ret; + refcount_set(&net->passive, 1); - refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); @@ -420,6 +426,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); + return 0; } /* @@ -432,7 +439,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); + net->net_cookie = ns_tree_gen_id(&net->ns); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -442,6 +449,7 @@ static __net_init int setup_net(struct net *net) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -539,7 +547,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; @@ -559,7 +567,9 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net, user_ns); + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -573,6 +583,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -659,8 +670,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). 
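A number of hunks in this series (net-sysfs.c, inet_timewait_sock.c, tcp_metrics.c, and peernet2id_alloc() just above) replace open-coded refcount_read(&net->ns.count) tests with check_net(). For context, here is a minimal sketch of that helper as it has historically appeared in include/net/net_namespace.h for the CONFIG_NET_NS case; the exact definition is an assumption and may differ once this series relocates the namespace refcount into struct ns_common:

/* Sketch only, not part of this patch: check_net() reports whether the
 * network namespace is still alive, i.e. its refcount has not dropped
 * to zero, without callers knowing where that refcount is stored.
 */
static inline int check_net(const struct net *net)
{
	return refcount_read(&net->ns.count) != 0;
}

Routing every liveness test through the helper is what allows the refcount to move without touching these call sites again.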
@@ -693,6 +706,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); @@ -812,31 +826,12 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - net->ns.inum = PROC_NET_INIT_INO; - if (net != &init_net) { - int ret = ns_alloc_inum(&net->ns); - if (ret) - return ret; - } net_ns_net_debugfs(net); return 0; } -static __net_exit void net_ns_net_exit(struct net *net) -{ - /* - * Initial network namespace doesn't exit so we don't need any - * special checks here. - */ - ns_free_inum(&net->ns); -} - static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { @@ -1282,7 +1277,12 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif - preinit_net(&init_net, &init_user_ns); + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) @@ -1517,11 +1517,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); @@ -1548,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ee0274417948e0..1c0279b9cb9f48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; while (data_len) { - if (nr_frags == MAX_SKB_FRAGS - 1) + if (nr_frags == MAX_SKB_FRAGS) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1b1..264fb82cba196e 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad63..92e6a681c797ec 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if 
(!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp come from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec12b..fbbc3ccf9df64b 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master) ASSERT_RTNL(); - hsr_for_each_port(master->hsr, port) { + hsr_for_each_port_rtnl(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) struct hsr_port *port; mtu_max = ETH_DATA_LEN; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev) struct hsr_priv *hsr; hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, * may become enabled. */ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } @@ -484,7 +487,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +509,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +537,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +583,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: @@ -672,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type
== pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec73..bc94b07101d80e 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce01967..33b0d2460c9bcd 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed05..56a117560c0cbb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -329,13 +329,13 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo) TCPF_NEW_SYN_RECV)) continue; - if (refcount_read(&sock_net(sk)->ns.count)) + if (check_net(sock_net(sk))) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (refcount_read(&sock_net(sk)->ns.count)) { + if (check_net(sock_net(sk))) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637df..2e61ac1371289a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 29118c43ebf5f1..34137768e7f9a2 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2399,6 +2399,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, return -EINVAL; } + if (!list_empty(&old->grp_list) && + rtnl_dereference(new->nh_info)->fdb_nh != + rtnl_dereference(old->nh_info)->fdb_nh) { + NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); + return -EINVAL; + } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc5533..ad76556800f2b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state 
= sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7d3..3338b6cc85c487 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4b3..a268e1595b22aa 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6ad..b67f94c60f9ffc 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->ns.count); + !check_net(tm_net(tm)); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a8ea28442b271..1103b3341a7056 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959aea..ce7d42d3bd007b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fd97b21e9eea..5e497a83e9675b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -371,6 +371,20 @@ static void 
mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95bec9..2abe6f1e994004 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe889a..f31a3a79531a2e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -883,6 +883,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de0965676..c3c73411c40c4b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int 
nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, 
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. 
*/ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11181,7 +11183,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11506,7 +11508,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12143,7 +12145,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba72..58c5b14889c474 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,46 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. 
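+	 *
+	 * This mirrors a seqcount-style retry: the smp_load_acquire()
+	 * in nft_base_seq() pairs with the smp_store_release() that
+	 * nf_tables_commit() issues when bumping the generation counter.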
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d87..8d3f040a904a2c 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9a10251228fd5d..793790d79d1384 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -510,6 +510,23 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. + * Unlike other set types, this uses NFT_GENMASK_ANY instead of + * nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * an inconsistent state: matching old entries get skipped but the + * newly matching entries are unreachable. + * + * NFT_GENMASK_ANY will still find the 'now old' entries, which ensures a + * consistent priv->match view. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. * * Return: nftables API extension pointer or NULL if no match. */ @@ -518,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); return e ?
&e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf7c..c0884fa68c7980 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1152,7 +1152,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; @@ -1248,8 +1247,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, if (last) { const struct nft_set_ext *e = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + if (unlikely(nft_set_elem_expired(e))) goto next_match; ext = e; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e23..b1f04168ec9377 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7fc..0da652844dd66a 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,12 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d3454348..978c129c609501 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b00221780f..bd861191157b54 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? 
ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e977618a..cf2dcec6ce5afc 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc829..dce5a3d8a964f8 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c31780..30275cb5ba3e25 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -187,7 +191,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, 
ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -236,9 +241,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); @@ -283,4 +290,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, * also come out this way if the ticket decryption fails. */ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a56559853f..80164d89e19c03 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. 
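+			  * Fall through to the generic RXGK_INCONSISTENCY code instead.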
*/ default: + *_error_code = RXGK_INCONSISTENCY; break; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 0eb00bbefd1748..77cc1c6dc3e977 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -56,6 +56,7 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, { struct smc_lo_dmb_node *dmb_node, *tmp_node; struct smc_lo_dev *ldev = smcd->priv; + struct folio *folio; int sba_idx, rc; /* check space for new dmb */ @@ -74,13 +75,16 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, dmb_node->sba_idx = sba_idx; dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { + + /* not critical; fail under memory pressure and fall back to TCP */ + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_ZERO, + get_order(dmb_node->len)); + if (!folio) { rc = -ENOMEM; goto err_node; } + dmb_node->cpu_addr = folio_address(folio); dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; refcount_set(&dmb_node->refcnt, 1); @@ -122,7 +126,7 @@ static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, write_unlock_bh(&ldev->dmb_ht_lock); clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); + folio_put(virt_to_folio(dmb_node->cpu_addr)); kfree(dmb_node); if (atomic_dec_and_test(&ldev->dmb_cnt)) diff --git a/net/socket.c b/net/socket.c index 682969deaed35d..bac335ecee4c54 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5f5..9b45fbdc90cabe 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (unlikely(current->flags & PF_EXITING)) - return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775ff8..3aa987e7f0724d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } diff --git a/net/tls/tls.h b/net/tls/tls.h index 4e077068e6d98a..e4c42731ce39ae 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index d71643b494a1ae..98e12f0ff57e51 100644 ---
a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = &skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -520,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e1e..daac9fd4be7eb5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2474,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ffa7..768098dec23100 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893e6..852573423e52d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, false) < 0) goto out; sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1ae..72e34bd2d925c0 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,20 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addr_node { + u64 addr; + struct list_head addr_node; +}; + +struct xsk_addr_head { + u32 num_descs; + struct list_head addrs_list; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + +#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) + void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); + ret = xskq_prod_reserve(pool->cq); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) { + struct xsk_addr_node *pos, *tmp; + u32 descs_processed = 0; unsigned long flags; + u32 idx; spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); + idx = xskq_get_prod(pool->cq); + + xskq_prod_write_addr(pool->cq, idx, + (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); + descs_processed++; + + if (unlikely(XSKCB(skb)->num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + 
xskq_prod_write_addr(pool->cq, idx + descs_processed, + pos->addr); + descs_processed++; + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } + xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); } @@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) spin_unlock_irqrestore(&pool->cq_lock, flags); } +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + XSKCB(skb)->num_descs++; +} + static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; + return XSKCB(skb)->num_descs; } static void xsk_destruct_skb(struct sk_buff *skb) @@ -576,23 +614,33 @@ *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - - skb_shinfo(skb)->destructor_arg = (void *)num; + BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); + INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + XSKCB(skb)->num_descs = 0; + skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addr_node *pos, *tmp; + + if (unlikely(num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; + struct xsk_addr_node *xsk_addr; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; @@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + + xsk_set_destructor_arg(skb, desc->addr); + } else { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node, as the whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } addr = desc->addr; @@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + xsk_set_destructor_arg(skb, desc->addr); } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addr_node *xsk_addr; struct page *page; u8 *vaddr; @@ -710,12 +775,22 @@ goto free_err; } + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) { + __free_page(page); + err = -ENOMEM; + goto free_err; + } + vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE,
&xs->sk.sk_wmem_alloc); + + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } if (first_frag && desc->options & XDP_TX_METADATA) { @@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + xsk_inc_num_desc(skb); return skb; @@ -769,7 +844,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (err == -EOVERFLOW) { /* Drop the packet */ - xsk_set_destructor_arg(xs->skb); + xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { @@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; @@ -1815,8 +1890,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addr_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6d3..f16f390370dc43 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index c7a1f080d2de3a..44b9de6e4e7788 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -438,7 +438,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->props.family) { + switch (x->inner_mode.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78fcbb89cf3256..d213ca3653a8f2 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2583,6 +2583,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, for (h = 0; h < range; h++) { u32 spi = (low == high) ? 
low : get_random_u32_inclusive(low, high); + if (spi == 0) + goto next; newspi = htonl(spi); spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -2598,6 +2600,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); +next: if (signal_pending(current)) { err = -ERESTARTSYS; goto unlock; diff --git a/rust/Makefile b/rust/Makefile index bfa915b0e58854..23c7ae905bd2f0 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -98,6 +98,12 @@ quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $< # and then retouch the generated files. rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \ rustdoc-kernel rustdoc-pin_init + $(Q)grep -Ehro ' #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -84,6 +86,7 @@ /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; +const size_t RUST_CONST_HELPER_ARCH_KMALLOC_MINALIGN = ARCH_KMALLOC_MINALIGN; const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE; const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC; const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 474cc98c48a323..0c57cf9b4004f1 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -31,11 +31,19 @@ #[allow(clippy::undocumented_unsafe_blocks)] #[cfg_attr(CONFIG_RUSTC_HAS_UNNECESSARY_TRANSMUTES, allow(unnecessary_transmutes))] mod bindings_raw { + use pin_init::{MaybeZeroable, Zeroable}; + // Manual definition for blocklisted types. type __kernel_size_t = usize; type __kernel_ssize_t = isize; type __kernel_ptrdiff_t = isize; + // `bindgen` doesn't automatically do this, see + // + // + // SAFETY: `__BindgenBitfieldUnit` is a newtype around `Storage`. + unsafe impl Zeroable for __BindgenBitfieldUnit where Storage: Zeroable {} + // Use glob import here to expose all helpers. // Symbols defined within the module will take precedence to the glob import. pub use super::bindings_helper::*; diff --git a/rust/helpers/atomic.c b/rust/helpers/atomic.c new file mode 100644 index 00000000000000..cf06b7ef9a1c55 --- /dev/null +++ b/rust/helpers/atomic.c @@ -0,0 +1,1040 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Generated by scripts/atomic/gen-rust-atomic-helpers.sh +// DO NOT MODIFY THIS FILE DIRECTLY + +/* + * This file provides helpers for the various atomic functions for Rust. + */ +#ifndef _RUST_ATOMIC_API_H +#define _RUST_ATOMIC_API_H + +#include + +// TODO: Remove this after INLINE_HELPERS support is added. 
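+//
+// The wrappers below exist because the kernel's atomic_*() operations are
+// implemented as static inline functions and macros, which bindgen cannot
+// generate callable Rust bindings for; each operation therefore gets an
+// out-of-line rust_helper_*() equivalent with identical semantics.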
+#ifndef __rust_helper +#define __rust_helper +#endif + +__rust_helper int +rust_helper_atomic_read(const atomic_t *v) +{ + return atomic_read(v); +} + +__rust_helper int +rust_helper_atomic_read_acquire(const atomic_t *v) +{ + return atomic_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic_set(atomic_t *v, int i) +{ + atomic_set(v, i); +} + +__rust_helper void +rust_helper_atomic_set_release(atomic_t *v, int i) +{ + atomic_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic_add(int i, atomic_t *v) +{ + atomic_add(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return(int i, atomic_t *v) +{ + return atomic_add_return(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_acquire(int i, atomic_t *v) +{ + return atomic_add_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_release(int i, atomic_t *v) +{ + return atomic_add_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_add_return_relaxed(int i, atomic_t *v) +{ + return atomic_add_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add(int i, atomic_t *v) +{ + return atomic_fetch_add(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_acquire(int i, atomic_t *v) +{ + return atomic_fetch_add_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_release(int i, atomic_t *v) +{ + return atomic_fetch_add_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_sub(int i, atomic_t *v) +{ + atomic_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return(int i, atomic_t *v) +{ + return atomic_sub_return(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_acquire(int i, atomic_t *v) +{ + return atomic_sub_return_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_release(int i, atomic_t *v) +{ + return atomic_sub_return_release(i, v); +} + +__rust_helper int +rust_helper_atomic_sub_return_relaxed(int i, atomic_t *v) +{ + return atomic_sub_return_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub(int i, atomic_t *v) +{ + return atomic_fetch_sub(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_acquire(int i, atomic_t *v) +{ + return atomic_fetch_sub_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_release(int i, atomic_t *v) +{ + return atomic_fetch_sub_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_sub_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_inc(atomic_t *v) +{ + atomic_inc(v); +} + +__rust_helper int +rust_helper_atomic_inc_return(atomic_t *v) +{ + return atomic_inc_return(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_acquire(atomic_t *v) +{ + return atomic_inc_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_release(atomic_t *v) +{ + return atomic_inc_return_release(v); +} + +__rust_helper int +rust_helper_atomic_inc_return_relaxed(atomic_t *v) +{ + return atomic_inc_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc(atomic_t *v) +{ + return atomic_fetch_inc(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_acquire(atomic_t *v) +{ + return atomic_fetch_inc_acquire(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_release(atomic_t *v) +{ + return atomic_fetch_inc_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_inc_relaxed(atomic_t 
*v) +{ + return atomic_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_dec(atomic_t *v) +{ + atomic_dec(v); +} + +__rust_helper int +rust_helper_atomic_dec_return(atomic_t *v) +{ + return atomic_dec_return(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_acquire(atomic_t *v) +{ + return atomic_dec_return_acquire(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_release(atomic_t *v) +{ + return atomic_dec_return_release(v); +} + +__rust_helper int +rust_helper_atomic_dec_return_relaxed(atomic_t *v) +{ + return atomic_dec_return_relaxed(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec(atomic_t *v) +{ + return atomic_fetch_dec(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_acquire(atomic_t *v) +{ + return atomic_fetch_dec_acquire(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_release(atomic_t *v) +{ + return atomic_fetch_dec_release(v); +} + +__rust_helper int +rust_helper_atomic_fetch_dec_relaxed(atomic_t *v) +{ + return atomic_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic_and(int i, atomic_t *v) +{ + atomic_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and(int i, atomic_t *v) +{ + return atomic_fetch_and(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_acquire(int i, atomic_t *v) +{ + return atomic_fetch_and_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_release(int i, atomic_t *v) +{ + return atomic_fetch_and_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_and_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_andnot(int i, atomic_t *v) +{ + atomic_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot(int i, atomic_t *v) +{ + return atomic_fetch_andnot(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_acquire(int i, atomic_t *v) +{ + return atomic_fetch_andnot_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_release(int i, atomic_t *v) +{ + return atomic_fetch_andnot_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_andnot_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_or(int i, atomic_t *v) +{ + atomic_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or(int i, atomic_t *v) +{ + return atomic_fetch_or(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_acquire(int i, atomic_t *v) +{ + return atomic_fetch_or_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_release(int i, atomic_t *v) +{ + return atomic_fetch_or_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_or_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_or_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic_xor(int i, atomic_t *v) +{ + atomic_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor(int i, atomic_t *v) +{ + return atomic_fetch_xor(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_acquire(int i, atomic_t *v) +{ + return atomic_fetch_xor_acquire(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_release(int i, atomic_t *v) +{ + return atomic_fetch_xor_release(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_xor_relaxed(int i, atomic_t *v) +{ + return atomic_fetch_xor_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_xchg(atomic_t *v, int new) +{ + return atomic_xchg(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_acquire(atomic_t *v, int new) 
+{ + return atomic_xchg_acquire(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_release(atomic_t *v, int new) +{ + return atomic_xchg_release(v, new); +} + +__rust_helper int +rust_helper_atomic_xchg_relaxed(atomic_t *v, int new) +{ + return atomic_xchg_relaxed(v, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_acquire(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_release(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_release(v, old, new); +} + +__rust_helper int +rust_helper_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) +{ + return atomic_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) +{ + return atomic_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic_sub_and_test(int i, atomic_t *v) +{ + return atomic_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic_dec_and_test(atomic_t *v) +{ + return atomic_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_inc_and_test(atomic_t *v) +{ + return atomic_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic_add_negative(int i, atomic_t *v) +{ + return atomic_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_acquire(int i, atomic_t *v) +{ + return atomic_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_release(int i, atomic_t *v) +{ + return atomic_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic_add_negative_relaxed(int i, atomic_t *v) +{ + return atomic_add_negative_relaxed(i, v); +} + +__rust_helper int +rust_helper_atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + return atomic_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic_add_unless(atomic_t *v, int a, int u) +{ + return atomic_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic_inc_not_zero(atomic_t *v) +{ + return atomic_inc_not_zero(v); +} + +__rust_helper bool +rust_helper_atomic_inc_unless_negative(atomic_t *v) +{ + return atomic_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic_dec_unless_positive(atomic_t *v) +{ + return atomic_dec_unless_positive(v); +} + +__rust_helper int +rust_helper_atomic_dec_if_positive(atomic_t *v) +{ + return atomic_dec_if_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_read(const atomic64_t *v) +{ + return atomic64_read(v); +} + +__rust_helper s64 +rust_helper_atomic64_read_acquire(const atomic64_t *v) +{ + return atomic64_read_acquire(v); +} + +__rust_helper void +rust_helper_atomic64_set(atomic64_t *v, s64 i) +{ + atomic64_set(v, i); +} + +__rust_helper void +rust_helper_atomic64_set_release(atomic64_t *v, s64 i) +{ + atomic64_set_release(v, i); +} + +__rust_helper void +rust_helper_atomic64_add(s64 i, atomic64_t *v) +{ + atomic64_add(i, v); +} + +__rust_helper s64 
+rust_helper_atomic64_add_return(s64 i, atomic64_t *v) +{ + return atomic64_add_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_release(s64 i, atomic64_t *v) +{ + return atomic64_add_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_add_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_add_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_sub(s64 i, atomic64_t *v) +{ + atomic64_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return(s64 i, atomic64_t *v) +{ + return atomic64_sub_return(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_acquire(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_release(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_sub_return_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_sub_return_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_sub_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_inc(atomic64_t *v) +{ + atomic64_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return(atomic64_t *v) +{ + return atomic64_inc_return(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_acquire(atomic64_t *v) +{ + return atomic64_inc_return_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_release(atomic64_t *v) +{ + return atomic64_inc_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_inc_return_relaxed(atomic64_t *v) +{ + return atomic64_inc_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc(atomic64_t *v) +{ + return atomic64_fetch_inc(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_acquire(atomic64_t *v) +{ + return atomic64_fetch_inc_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_release(atomic64_t *v) +{ + return atomic64_fetch_inc_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_inc_relaxed(atomic64_t *v) +{ + return atomic64_fetch_inc_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_dec(atomic64_t *v) +{ + atomic64_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return(atomic64_t *v) +{ + return atomic64_dec_return(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_acquire(atomic64_t *v) +{ + return atomic64_dec_return_acquire(v); +} + +__rust_helper s64 
+rust_helper_atomic64_dec_return_release(atomic64_t *v) +{ + return atomic64_dec_return_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_return_relaxed(atomic64_t *v) +{ + return atomic64_dec_return_relaxed(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec(atomic64_t *v) +{ + return atomic64_fetch_dec(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_acquire(atomic64_t *v) +{ + return atomic64_fetch_dec_acquire(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_release(atomic64_t *v) +{ + return atomic64_fetch_dec_release(v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_dec_relaxed(atomic64_t *v) +{ + return atomic64_fetch_dec_relaxed(v); +} + +__rust_helper void +rust_helper_atomic64_and(s64 i, atomic64_t *v) +{ + atomic64_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_and_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_andnot(s64 i, atomic64_t *v) +{ + atomic64_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_andnot_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_or(s64 i, atomic64_t *v) +{ + atomic64_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_or_relaxed(i, v); +} + +__rust_helper void +rust_helper_atomic64_xor(s64 i, atomic64_t *v) +{ + atomic64_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_acquire(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_release(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_release(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_fetch_xor_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_xchg(atomic64_t *v, s64 new) +{ + return atomic64_xchg(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_acquire(atomic64_t *v, s64 new) +{ + return atomic64_xchg_acquire(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_xchg_release(atomic64_t *v, s64 new) +{ + return atomic64_xchg_release(v, new); +} + +__rust_helper s64 
+rust_helper_atomic64_xchg_relaxed(atomic64_t *v, s64 new) +{ + return atomic64_xchg_relaxed(v, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_acquire(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_release(v, old, new); +} + +__rust_helper s64 +rust_helper_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) +{ + return atomic64_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_acquire(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_release(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) +{ + return atomic64_try_cmpxchg_relaxed(v, old, new); +} + +__rust_helper bool +rust_helper_atomic64_sub_and_test(s64 i, atomic64_t *v) +{ + return atomic64_sub_and_test(i, v); +} + +__rust_helper bool +rust_helper_atomic64_dec_and_test(atomic64_t *v) +{ + return atomic64_dec_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_inc_and_test(atomic64_t *v) +{ + return atomic64_inc_and_test(v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative(s64 i, atomic64_t *v) +{ + return atomic64_add_negative(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_acquire(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_release(i, v); +} + +__rust_helper bool +rust_helper_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + return atomic64_add_negative_relaxed(i, v); +} + +__rust_helper s64 +rust_helper_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_fetch_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +{ + return atomic64_add_unless(v, a, u); +} + +__rust_helper bool +rust_helper_atomic64_inc_not_zero(atomic64_t *v) +{ + return atomic64_inc_not_zero(v); +} + +__rust_helper bool +rust_helper_atomic64_inc_unless_negative(atomic64_t *v) +{ + return atomic64_inc_unless_negative(v); +} + +__rust_helper bool +rust_helper_atomic64_dec_unless_positive(atomic64_t *v) +{ + return atomic64_dec_unless_positive(v); +} + +__rust_helper s64 +rust_helper_atomic64_dec_if_positive(atomic64_t *v) +{ + return atomic64_dec_if_positive(v); +} + +#endif /* _RUST_ATOMIC_API_H */ +// 615a0e0c98b5973a47fe4fa65e92935051ca00ed diff --git a/rust/helpers/barrier.c b/rust/helpers/barrier.c new file mode 100644 index 00000000000000..cdf28ce8e51167 --- /dev/null +++ b/rust/helpers/barrier.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_smp_mb(void) +{ + smp_mb(); +} + +void rust_helper_smp_wmb(void) +{ + smp_wmb(); +} + +void rust_helper_smp_rmb(void) +{ + smp_rmb(); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41dd5..730b2e810a8ad6 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ 
-7,7 +7,9 @@ * Sorted alphabetically. */ +#include "atomic.c" #include "auxiliary.c" +#include "barrier.c" #include "blk.c" #include "bug.c" #include "build_assert.c" @@ -22,6 +24,7 @@ #include "dma.c" #include "drm.c" #include "err.c" +#include "irq.c" #include "fs.c" #include "io.c" #include "jump_label.c" @@ -34,6 +37,7 @@ #include "pid_namespace.c" #include "platform.c" #include "poll.c" +#include "processor.c" #include "property.c" #include "rbtree.c" #include "rcu.c" diff --git a/rust/helpers/irq.c b/rust/helpers/irq.c new file mode 100644 index 00000000000000..1faca428e2c047 --- /dev/null +++ b/rust/helpers/irq.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +int rust_helper_request_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev) +{ + return request_irq(irq, handler, flags, name, dev); +} diff --git a/rust/helpers/pci.c b/rust/helpers/pci.c index ef9cb38c81a6a5..fb814572b23631 100644 --- a/rust/helpers/pci.c +++ b/rust/helpers/pci.c @@ -2,6 +2,16 @@ #include +u16 rust_helper_pci_dev_id(struct pci_dev *dev) +{ + return PCI_DEVID(dev->bus->number, dev->devfn); +} + +resource_size_t rust_helper_pci_resource_start(struct pci_dev *pdev, int bar) +{ + return pci_resource_start(pdev, bar); +} + resource_size_t rust_helper_pci_resource_len(struct pci_dev *pdev, int bar) { return pci_resource_len(pdev, bar); @@ -11,3 +21,11 @@ bool rust_helper_dev_is_pci(const struct device *dev) { return dev_is_pci(dev); } + +#ifndef CONFIG_PCI_MSI +int rust_helper_pci_irq_vector(struct pci_dev *pdev, unsigned int nvec) +{ + return pci_irq_vector(pdev, nvec); +} + +#endif diff --git a/rust/helpers/processor.c b/rust/helpers/processor.c new file mode 100644 index 00000000000000..d41355e14d6eb5 --- /dev/null +++ b/rust/helpers/processor.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_cpu_relax(void) +{ + cpu_relax(); +} diff --git a/rust/helpers/refcount.c b/rust/helpers/refcount.c index d6adbd2e45a185..d175898ad7b81e 100644 --- a/rust/helpers/refcount.c +++ b/rust/helpers/refcount.c @@ -7,11 +7,21 @@ refcount_t rust_helper_REFCOUNT_INIT(int n) return (refcount_t)REFCOUNT_INIT(n); } +void rust_helper_refcount_set(refcount_t *r, int n) +{ + refcount_set(r, n); +} + void rust_helper_refcount_inc(refcount_t *r) { refcount_inc(r); } +void rust_helper_refcount_dec(refcount_t *r) +{ + refcount_dec(r); +} + bool rust_helper_refcount_dec_and_test(refcount_t *r) { return refcount_dec_and_test(r); diff --git a/rust/helpers/regulator.c b/rust/helpers/regulator.c index cd8b7ba648ee33..11bc332443bd06 100644 --- a/rust/helpers/regulator.c +++ b/rust/helpers/regulator.c @@ -40,4 +40,14 @@ int rust_helper_regulator_is_enabled(struct regulator *regulator) return regulator_is_enabled(regulator); } +int rust_helper_devm_regulator_get_enable(struct device *dev, const char *id) +{ + return devm_regulator_get_enable(dev, id); +} + +int rust_helper_devm_regulator_get_enable_optional(struct device *dev, const char *id) +{ + return devm_regulator_get_enable_optional(dev, id); +} + #endif diff --git a/rust/kernel/acpi.rs b/rust/kernel/acpi.rs index 7ae317368b0000..37e1161c12985f 100644 --- a/rust/kernel/acpi.rs +++ b/rust/kernel/acpi.rs @@ -37,11 +37,8 @@ impl DeviceId { /// Create a new device id from an ACPI 'id' string. 
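    /// The string, including its NUL terminator, must fit within
    /// `Self::ACPI_ID_LEN` bytes; longer IDs are rejected at compile
    /// time by the `build_assert!` below.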
#[inline(always)] pub const fn new(id: &'static CStr) -> Self { - build_assert!( - id.len_with_nul() <= Self::ACPI_ID_LEN, - "ID exceeds 16 bytes" - ); - let src = id.as_bytes_with_nul(); + let src = id.to_bytes_with_nul(); + build_assert!(src.len() <= Self::ACPI_ID_LEN, "ID exceeds 16 bytes"); // Replace with `bindings::acpi_device_id::default()` once stabilized for `const`. // SAFETY: FFI type is valid to be zero-initialized. let mut acpi: bindings::acpi_device_id = unsafe { core::mem::zeroed() }; diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index a2c49e5494d334..9c154209423cb2 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -2,18 +2,11 @@ //! Implementation of the kernel's memory allocation infrastructure. -#[cfg(not(any(test, testlib)))] pub mod allocator; pub mod kbox; pub mod kvec; pub mod layout; -#[cfg(any(test, testlib))] -pub mod allocator_test; - -#[cfg(any(test, testlib))] -pub use self::allocator_test as allocator; - pub use self::kbox::Box; pub use self::kbox::KBox; pub use self::kbox::KVBox; @@ -137,6 +130,14 @@ pub mod flags { /// - Implementers must ensure that all trait functions abide by the guarantees documented in the /// `# Guarantees` sections. pub unsafe trait Allocator { + /// The minimum alignment satisfied by all allocations from this allocator. + /// + /// # Guarantees + /// + /// Any pointer allocated by this allocator is guaranteed to be aligned to `MIN_ALIGN` even if + /// the requested layout has a smaller alignment. + const MIN_ALIGN: usize; + /// Allocate memory based on `layout` and `flags`. /// /// On success, returns a buffer represented as `NonNull<[u8]>` that satisfies the layout diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs index 2692cf90c9482d..869d9fd6952742 100644 --- a/rust/kernel/alloc/allocator.rs +++ b/rust/kernel/alloc/allocator.rs @@ -17,6 +17,8 @@ use crate::alloc::{AllocError, Allocator}; use crate::bindings; use crate::pr_warn; +const ARCH_KMALLOC_MINALIGN: usize = bindings::ARCH_KMALLOC_MINALIGN; + /// The contiguous kernel allocator. /// /// `Kmalloc` is typically used for physically contiguous allocations up to page size, but also @@ -128,6 +130,8 @@ impl Kmalloc { // - passing a pointer to a valid memory allocation is OK, // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same. unsafe impl Allocator for Kmalloc { + const MIN_ALIGN: usize = ARCH_KMALLOC_MINALIGN; + #[inline] unsafe fn realloc( ptr: Option>, @@ -147,6 +151,8 @@ unsafe impl Allocator for Kmalloc { // - passing a pointer to a valid memory allocation is OK, // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same. unsafe impl Allocator for Vmalloc { + const MIN_ALIGN: usize = kernel::page::PAGE_SIZE; + #[inline] unsafe fn realloc( ptr: Option>, @@ -171,6 +177,8 @@ unsafe impl Allocator for Vmalloc { // - passing a pointer to a valid memory allocation is OK, // - `realloc` satisfies the guarantees, since `ReallocFunc::call` has the same. unsafe impl Allocator for KVmalloc { + const MIN_ALIGN: usize = ARCH_KMALLOC_MINALIGN; + #[inline] unsafe fn realloc( ptr: Option>, diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs deleted file mode 100644 index 90dd987d40e452..00000000000000 --- a/rust/kernel/alloc/allocator_test.rs +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -//! So far the kernel's `Box` and `Vec` types can't be used by userspace test cases, since all users -//! of those types (e.g. 
`CString`) use kernel allocators for instantiation. -//! -//! In order to allow userspace test cases to make use of such types as well, implement the -//! `Cmalloc` allocator within the `allocator_test` module and type alias all kernel allocators to -//! `Cmalloc`. The `Cmalloc` allocator uses libc's `realloc()` function as allocator backend. - -#![allow(missing_docs)] - -use super::{flags::*, AllocError, Allocator, Flags}; -use core::alloc::Layout; -use core::cmp; -use core::ptr; -use core::ptr::NonNull; - -/// The userspace allocator based on libc. -pub struct Cmalloc; - -pub type Kmalloc = Cmalloc; -pub type Vmalloc = Kmalloc; -pub type KVmalloc = Kmalloc; - -impl Cmalloc { - /// Returns a [`Layout`] that makes [`Kmalloc`] fulfill the requested size and alignment of - /// `layout`. - pub fn aligned_layout(layout: Layout) -> Layout { - // Note that `layout.size()` (after padding) is guaranteed to be a multiple of - // `layout.align()` which together with the slab guarantees means that `Kmalloc` will return - // a properly aligned object (see comments in `kmalloc()` for more information). - layout.pad_to_align() - } -} - -extern "C" { - #[link_name = "aligned_alloc"] - fn libc_aligned_alloc(align: usize, size: usize) -> *mut crate::ffi::c_void; - - #[link_name = "free"] - fn libc_free(ptr: *mut crate::ffi::c_void); -} - -// SAFETY: -// - memory remains valid until it is explicitly freed, -// - passing a pointer to a valid memory allocation created by this `Allocator` is always OK, -// - `realloc` provides the guarantees as provided in the `# Guarantees` section. -unsafe impl Allocator for Cmalloc { - unsafe fn realloc( - ptr: Option>, - layout: Layout, - old_layout: Layout, - flags: Flags, - ) -> Result, AllocError> { - let src = match ptr { - Some(src) => { - if old_layout.size() == 0 { - ptr::null_mut() - } else { - src.as_ptr() - } - } - None => ptr::null_mut(), - }; - - if layout.size() == 0 { - // SAFETY: `src` is either NULL or was previously allocated with this `Allocator` - unsafe { libc_free(src.cast()) }; - - return Ok(NonNull::slice_from_raw_parts( - crate::alloc::dangling_from_layout(layout), - 0, - )); - } - - // ISO C (ISO/IEC 9899:2011) defines `aligned_alloc`: - // - // > The value of alignment shall be a valid alignment supported by the implementation - // [...]. - // - // As an example of the "supported by the implementation" requirement, POSIX.1-2001 (IEEE - // 1003.1-2001) defines `posix_memalign`: - // - // > The value of alignment shall be a power of two multiple of sizeof (void *). - // - // and POSIX-based implementations of `aligned_alloc` inherit this requirement. At the time - // of writing, this is known to be the case on macOS (but not in glibc). - // - // Satisfy the stricter requirement to avoid spurious test failures on some platforms. - let min_align = core::mem::size_of::<*const crate::ffi::c_void>(); - let layout = layout.align_to(min_align).map_err(|_| AllocError)?; - let layout = layout.pad_to_align(); - - // SAFETY: Returns either NULL or a pointer to a memory allocation that satisfies or - // exceeds the given size and alignment requirements. - let dst = unsafe { libc_aligned_alloc(layout.align(), layout.size()) }.cast::(); - let dst = NonNull::new(dst).ok_or(AllocError)?; - - if flags.contains(__GFP_ZERO) { - // SAFETY: The preceding calls to `libc_aligned_alloc` and `NonNull::new` - // guarantee that `dst` points to memory of at least `layout.size()` bytes. 
-            unsafe { dst.as_ptr().write_bytes(0, layout.size()) };
-        }
-
-        if !src.is_null() {
-            // SAFETY:
-            // - `src` has previously been allocated with this `Allocator`; `dst` has just been
-            //   newly allocated, hence the memory regions do not overlap.
-            // - both` src` and `dst` are properly aligned and valid for reads and writes
-            unsafe {
-                ptr::copy_nonoverlapping(
-                    src,
-                    dst.as_ptr(),
-                    cmp::min(layout.size(), old_layout.size()),
-                )
-            };
-        }
-
-        // SAFETY: `src` is either NULL or was previously allocated with this `Allocator`
-        unsafe { libc_free(src.cast()) };
-
-        Ok(NonNull::slice_from_raw_parts(dst, layout.size()))
-    }
-}
diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs
index 856d05aa60f134..27c4b5a9b61dc5 100644
--- a/rust/kernel/alloc/kbox.rs
+++ b/rust/kernel/alloc/kbox.rs
@@ -7,7 +7,6 @@ use super::allocator::{KVmalloc, Kmalloc, Vmalloc};
 use super::{AllocError, Allocator, Flags};
 use core::alloc::Layout;
 use core::borrow::{Borrow, BorrowMut};
-use core::fmt;
 use core::marker::PhantomData;
 use core::mem::ManuallyDrop;
 use core::mem::MaybeUninit;
@@ -17,6 +16,7 @@ use core::ptr::NonNull;
 use core::result::Result;
 
 use crate::ffi::c_void;
+use crate::fmt;
 use crate::init::InPlaceInit;
 use crate::types::ForeignOwnable;
 use pin_init::{InPlaceWrite, Init, PinInit, ZeroableOption};
@@ -290,6 +290,83 @@ where
         Ok(Self::new(x, flags)?.into())
     }
 
+    /// Construct a pinned slice of elements `Pin<Box<[T], A>>`.
+    ///
+    /// This is a convenient way to create, e.g., slices of structures containing spinlocks or
+    /// mutexes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::sync::{new_spinlock, SpinLock};
+    ///
+    /// struct Inner {
+    ///     a: u32,
+    ///     b: u32,
+    /// }
+    ///
+    /// #[pin_data]
+    /// struct Example {
+    ///     c: u32,
+    ///     #[pin]
+    ///     d: SpinLock<Inner>,
+    /// }
+    ///
+    /// impl Example {
+    ///     fn new() -> impl PinInit<Self, Error> {
+    ///         try_pin_init!(Self {
+    ///             c: 10,
+    ///             d <- new_spinlock!(Inner { a: 20, b: 30 }),
+    ///         })
+    ///     }
+    /// }
+    ///
+    /// // Allocate a boxed slice of 10 `Example`s.
+    /// let s = KBox::pin_slice(
+    ///     |_i| Example::new(),
+    ///     10,
+    ///     GFP_KERNEL
+    /// )?;
+    ///
+    /// assert_eq!(s[5].c, 10);
+    /// assert_eq!(s[3].d.lock().a, 20);
+    /// # Ok::<(), Error>(())
+    /// ```
+    pub fn pin_slice<Func, Item, E>(
+        mut init: Func,
+        len: usize,
+        flags: Flags,
+    ) -> Result<Pin<Box<[T], A>>, E>
+    where
+        Func: FnMut(usize) -> Item,
+        Item: PinInit<T, E>,
+        E: From<AllocError>,
+    {
+        let mut buffer = super::Vec::<T, A>::with_capacity(len, flags)?;
+        for i in 0..len {
+            let ptr = buffer.spare_capacity_mut().as_mut_ptr().cast();
+            // SAFETY:
+            // - `ptr` is a valid pointer to uninitialized memory.
+            // - `ptr` is not used if an error is returned.
+            // - `ptr` won't be moved until it is dropped, i.e. it is pinned.
+            unsafe { init(i).__pinned_init(ptr)? };
+
+            // SAFETY:
+            // - `i + 1 <= len`, hence we don't exceed the capacity, due to the call to
+            //   `with_capacity()` above.
+            // - The new value at index `buffer.len()` is the only element being added here, and
+            //   it has been initialized above by `init(i).__pinned_init(ptr)`.
+            unsafe { buffer.inc_len(1) };
+        }
+
+        let (ptr, _, _) = buffer.into_raw_parts();
+        let slice = core::ptr::slice_from_raw_parts_mut(ptr, len);
+
+        // SAFETY: `slice` points to an allocation allocated with `A` (`buffer`) and holds a valid
+        // `[T]`.
+        Ok(Pin::from(unsafe { Box::from_raw(slice) }))
+    }
+
     /// Convert a [`Box<T, A>`] to a [`Pin<Box<T, A>>`]. If `T` does not implement
     /// [`Unpin`], then `x` will be pinned in memory and can't be moved.
pub fn into_pin(this: Self) -> Pin { @@ -401,12 +478,17 @@ where } // SAFETY: The pointer returned by `into_foreign` comes from a well aligned -// pointer to `T`. +// pointer to `T` allocated by `A`. unsafe impl ForeignOwnable for Box where A: Allocator, { - const FOREIGN_ALIGN: usize = core::mem::align_of::(); + const FOREIGN_ALIGN: usize = if core::mem::align_of::() < A::MIN_ALIGN { + A::MIN_ALIGN + } else { + core::mem::align_of::() + }; + type Borrowed<'a> = &'a T; type BorrowedMut<'a> = &'a mut T; @@ -435,12 +517,12 @@ where } // SAFETY: The pointer returned by `into_foreign` comes from a well aligned -// pointer to `T`. +// pointer to `T` allocated by `A`. unsafe impl ForeignOwnable for Pin> where A: Allocator, { - const FOREIGN_ALIGN: usize = core::mem::align_of::(); + const FOREIGN_ALIGN: usize = as ForeignOwnable>::FOREIGN_ALIGN; type Borrowed<'a> = Pin<&'a T>; type BorrowedMut<'a> = Pin<&'a mut T>; diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index 3c72e0bdddb871..dfc101e03f358a 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -7,9 +7,9 @@ use super::{ layout::ArrayLayout, AllocError, Allocator, Box, Flags, }; +use crate::fmt; use core::{ borrow::{Borrow, BorrowMut}, - fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, ops::Deref, @@ -175,7 +175,7 @@ where /// Returns the number of elements that can be stored within the vector without allocating /// additional memory. - pub fn capacity(&self) -> usize { + pub const fn capacity(&self) -> usize { if const { Self::is_zst() } { usize::MAX } else { @@ -185,7 +185,7 @@ where /// Returns the number of elements stored within the vector. #[inline] - pub fn len(&self) -> usize { + pub const fn len(&self) -> usize { self.len } @@ -196,7 +196,7 @@ where /// - `additional` must be less than or equal to `self.capacity - self.len`. /// - All elements within the interval [`self.len`,`self.len + additional`) must be initialized. #[inline] - pub unsafe fn inc_len(&mut self, additional: usize) { + pub const unsafe fn inc_len(&mut self, additional: usize) { // Guaranteed by the type invariant to never underflow. debug_assert!(additional <= self.capacity() - self.len()); // INVARIANT: By the safety requirements of this method this represents the exact number of @@ -224,6 +224,16 @@ where } /// Returns a slice of the entire vector. + /// + /// # Examples + /// + /// ``` + /// let mut v = KVec::new(); + /// v.push(1, GFP_KERNEL)?; + /// v.push(2, GFP_KERNEL)?; + /// assert_eq!(v.as_slice(), &[1, 2]); + /// # Ok::<(), Error>(()) + /// ``` #[inline] pub fn as_slice(&self) -> &[T] { self @@ -245,7 +255,7 @@ where /// Returns a raw pointer to the vector's backing buffer, or, if `T` is a ZST, a dangling raw /// pointer. #[inline] - pub fn as_ptr(&self) -> *const T { + pub const fn as_ptr(&self) -> *const T { self.ptr.as_ptr() } @@ -261,7 +271,7 @@ where /// assert!(!v.is_empty()); /// ``` #[inline] - pub fn is_empty(&self) -> bool { + pub const fn is_empty(&self) -> bool { self.len() == 0 } @@ -1294,7 +1304,7 @@ impl<'vec, T> Drop for DrainAll<'vec, T> { } } -#[macros::kunit_tests(rust_kvec_kunit)] +#[macros::kunit_tests(rust_kvec)] mod tests { use super::*; use crate::prelude::*; diff --git a/rust/kernel/alloc/kvec/errors.rs b/rust/kernel/alloc/kvec/errors.rs index 348b8d27e102ca..21a920a4b09bc1 100644 --- a/rust/kernel/alloc/kvec/errors.rs +++ b/rust/kernel/alloc/kvec/errors.rs @@ -2,7 +2,7 @@ //! Errors for the [`Vec`] type. 
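Backing up to the `ForeignOwnable` change above: the alignment advertised for foreign-owned pointers is now the larger of the type's natural alignment and the allocator's `MIN_ALIGN`, since a pointer returned by, e.g., `Kmalloc` is always at least `ARCH_KMALLOC_MINALIGN`-aligned regardless of `T`. A freestanding sketch of that computation; the `8` is an assumed stand-in for `ARCH_KMALLOC_MINALIGN`, which is architecture-dependent:

```
// Sketch of the max(align_of::<T>(), A::MIN_ALIGN) selection used for
// FOREIGN_ALIGN above. MIN_ALIGN = 8 is an assumed example value only.
const MIN_ALIGN: usize = 8;

const fn foreign_align<T>() -> usize {
    if core::mem::align_of::<T>() < MIN_ALIGN {
        MIN_ALIGN
    } else {
        core::mem::align_of::<T>()
    }
}

// Small types are promoted to the allocator minimum; larger natural
// alignments win out unchanged.
const _: () = assert!(foreign_align::<u8>() == MIN_ALIGN);
const _: () = assert!(foreign_align::<u128>() >= core::mem::align_of::<u128>());
```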
-use core::fmt::{self, Debug, Formatter}; +use kernel::fmt::{self, Debug, Formatter}; use kernel::prelude::*; /// Error type for [`Vec::push_within_capacity`]. diff --git a/rust/kernel/alloc/layout.rs b/rust/kernel/alloc/layout.rs index 93ed514f7cc7ed..52cbf61c4539ef 100644 --- a/rust/kernel/alloc/layout.rs +++ b/rust/kernel/alloc/layout.rs @@ -80,7 +80,7 @@ impl ArrayLayout { /// # Safety /// /// `len` must be a value, for which `len * size_of::() <= isize::MAX` is true. - pub unsafe fn new_unchecked(len: usize) -> Self { + pub const unsafe fn new_unchecked(len: usize) -> Self { // INVARIANT: By the safety requirements of this function // `len * size_of::() <= isize::MAX`. Self { diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index 4749fb6bffef34..e11848bbf20616 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -55,7 +55,7 @@ impl Adapter { extern "C" fn probe_callback( adev: *mut bindings::auxiliary_device, id: *const bindings::auxiliary_device_id, - ) -> kernel::ffi::c_int { + ) -> c_int { // SAFETY: The auxiliary bus only ever calls the probe callback with a valid pointer to a // `struct auxiliary_device`. // @@ -105,8 +105,8 @@ pub struct DeviceId(bindings::auxiliary_device_id); impl DeviceId { /// Create a new [`DeviceId`] from name. pub const fn new(modname: &'static CStr, name: &'static CStr) -> Self { - let name = name.as_bytes_with_nul(); - let modname = modname.as_bytes_with_nul(); + let name = name.to_bytes_with_nul(); + let modname = modname.to_bytes_with_nul(); // TODO: Replace with `bindings::auxiliary_device_id::default()` once stabilized for // `const`. @@ -245,7 +245,7 @@ kernel::impl_device_context_deref!(unsafe { Device }); kernel::impl_device_context_into_aref!(Device); // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_ref().as_raw()) }; diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 831445d37181a7..61ea35bba7d501 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -82,7 +82,7 @@ //! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; //! let mut disk = gen_disk::GenDiskBuilder::new() //! .capacity_sectors(4096) -//! .build(format_args!("myblk"), tagset)?; +//! .build(fmt!("myblk"), tagset)?; //! //! # Ok::<(), kernel::error::Error>(()) //! ``` diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index cd54cd64ea8878..be92d0e5f03126 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -3,12 +3,12 @@ //! Generic disk abstraction. //! //! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) -//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h) +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; +use crate::fmt::{self, Write}; use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; use crate::{error, static_lock_class}; -use core::fmt::{self, Write}; /// A builder for [`GenDisk`]. 
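On the `DeviceId::new()` changes above (`acpi.rs` and `auxiliary.rs`): both keep the same const-context idiom, bounds-check the source with `build_assert!`, then copy byte-by-byte in a `while` loop, since iterators are unavailable in `const fn`. A freestanding sketch of the idiom; `copy_id` is a made-up name, and plain `assert!` stands in for the kernel's `build_assert!` so the sketch compiles on its own:

```
// Const-context copy of a NUL-terminated ID into a fixed-size, zero-padded
// buffer, mirroring the DeviceId constructors above (illustrative names).
const fn copy_id<const N: usize>(src: &[u8]) -> [u8; N] {
    assert!(src.len() <= N, "ID exceeds buffer");
    let mut dst = [0u8; N];
    let mut i = 0;
    while i < src.len() {
        dst[i] = src[i];
        i += 1;
    }
    dst
}

const ID: [u8; 16] = copy_id::<16>(b"PNP0C0A\0");
```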
 ///
diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
index c2b98f507bcbd4..c0f95a9419c4e2 100644
--- a/rust/kernel/block/mq/operations.rs
+++ b/rust/kernel/block/mq/operations.rs
@@ -10,9 +10,10 @@ use crate::{
     block::mq::Request,
     error::{from_result, Result},
     prelude::*,
+    sync::Refcount,
     types::ARef,
 };
-use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
+use core::marker::PhantomData;
 
 /// Implement this trait to interface blk-mq as block devices.
 ///
@@ -78,7 +79,7 @@ impl<T: Operations> OperationsVTable<T> {
         let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
 
         // One refcount for the ARef, one for being in flight
-        request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
+        request.wrapper_ref().refcount().set(2);
 
         // SAFETY:
         // - We own a refcount that we took above. We pass that to `ARef`.
@@ -187,7 +188,7 @@
 
             // SAFETY: The refcount field is allocated but not initialized, so
             // it is valid for writes.
-            unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) };
+            unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(Refcount::new(0)) };
 
             Ok(0)
         })
diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
index 7e2159e4f6a6f7..d311e24e2595ae 100644
--- a/rust/kernel/block/mq/raw_writer.rs
+++ b/rust/kernel/block/mq/raw_writer.rs
@@ -1,8 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
-use core::fmt::{self, Write};
-
 use crate::error::Result;
+use crate::fmt::{self, Write};
 use crate::prelude::EINVAL;
 
 /// A mutable reference to a byte buffer where a string can be written into.
diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
index fefd394f064a71..f62a376dc31399 100644
--- a/rust/kernel/block/mq/request.rs
+++ b/rust/kernel/block/mq/request.rs
@@ -8,13 +8,10 @@ use crate::{
     bindings,
     block::mq::Operations,
     error::Result,
+    sync::{atomic::Relaxed, Refcount},
     types::{ARef, AlwaysRefCounted, Opaque},
 };
-use core::{
-    marker::PhantomData,
-    ptr::NonNull,
-    sync::atomic::{AtomicU64, Ordering},
-};
+use core::{marker::PhantomData, ptr::NonNull};
 
 /// A wrapper around a blk-mq [`struct request`]. This represents an IO request.
 ///
@@ -37,6 +34,9 @@ use core::{
 /// We need to track C and D to ensure that it is safe to end the request and hand
 /// back ownership to the block layer.
 ///
+/// Note that the driver can still obtain a new `ARef`, even if there are no `ARef`s in existence,
+/// by using `tag_to_rq`, hence the need to distinguish B and C.
+///
 /// The states are tracked through the private `refcount` field of
 /// `RequestDataWrapper`. This structure lives in the private data area of the C
 /// [`struct request`].
@@ -98,13 +98,16 @@ impl<T: Operations> Request<T> {
     ///
     /// [`struct request`]: srctree/include/linux/blk-mq.h
     fn try_set_end(this: ARef<Self>) -> Result<*mut bindings::request, ARef<Self>> {
-        // We can race with `TagSet::tag_to_rq`
-        if let Err(_old) = this.wrapper_ref().refcount().compare_exchange(
-            2,
-            0,
-            Ordering::Relaxed,
-            Ordering::Relaxed,
-        ) {
+        // To hand ownership back, the current refcount must be exactly 2.
+        // Since we can race with `TagSet::tag_to_rq`, this needs to atomically reduce
+        // the refcount to 0. `Refcount` does not provide a way to do this, so use the
+        // underlying atomics directly.
+        if let Err(_old) = this
+            .wrapper_ref()
+            .refcount()
+            .as_atomic()
+            .cmpxchg(2, 0, Relaxed)
+        {
             return Err(this);
         }
 
@@ -173,13 +176,13 @@ pub(crate) struct RequestDataWrapper {
     /// - 0: The request is owned by C block layer.
/// - 1: The request is owned by Rust abstractions but there are no [`ARef`] references to it. /// - 2+: There are [`ARef`] references to the request. - refcount: AtomicU64, + refcount: Refcount, } impl RequestDataWrapper { /// Return a reference to the refcount of the request that is embedding /// `self`. - pub(crate) fn refcount(&self) -> &AtomicU64 { + pub(crate) fn refcount(&self) -> &Refcount { &self.refcount } @@ -189,7 +192,7 @@ impl RequestDataWrapper { /// # Safety /// /// - `this` must point to a live allocation of at least the size of `Self`. - pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 { + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut Refcount { // SAFETY: Because of the safety requirements of this function, the // field projection is safe. unsafe { &raw mut (*this).refcount } @@ -205,47 +208,13 @@ unsafe impl Send for Request {} // mutate `self` are internally synchronized` unsafe impl Sync for Request {} -/// Store the result of `op(target.load())` in target, returning new value of -/// target. -fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 { - let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x))); - - // SAFETY: Because the operation passed to `fetch_update` above always - // return `Some`, `old` will always be `Ok`. - let old = unsafe { old.unwrap_unchecked() }; - - op(old) -} - -/// Store the result of `op(target.load)` in `target` if `target.load() != -/// pred`, returning [`true`] if the target was updated. -fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool { - target - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| { - if x == pred { - None - } else { - Some(op(x)) - } - }) - .is_ok() -} - // SAFETY: All instances of `Request` are reference counted. This // implementation of `AlwaysRefCounted` ensure that increments to the ref count // keeps the object alive in memory at least until a matching reference count // decrement is executed. unsafe impl AlwaysRefCounted for Request { fn inc_ref(&self) { - let refcount = &self.wrapper_ref().refcount(); - - #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0); - - #[cfg(CONFIG_DEBUG_MISC)] - if !updated { - panic!("Request refcount zero on clone") - } + self.wrapper_ref().refcount().inc(); } unsafe fn dec_ref(obj: core::ptr::NonNull) { @@ -257,10 +226,10 @@ unsafe impl AlwaysRefCounted for Request { let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) }; #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))] - let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1); + let is_zero = refcount.dec_and_test(); #[cfg(CONFIG_DEBUG_MISC)] - if new_refcount == 0 { + if is_zero { panic!("Request reached refcount zero in Rust abstractions"); } } diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs index 2736b798cdc6c4..9fb5ef825e4129 100644 --- a/rust/kernel/configfs.rs +++ b/rust/kernel/configfs.rs @@ -263,7 +263,7 @@ impl Group { try_pin_init!(Self { group <- pin_init::init_zeroed().chain(|v: &mut Opaque| { let place = v.get(); - let name = name.as_bytes_with_nul().as_ptr(); + let name = name.to_bytes_with_nul().as_ptr(); // SAFETY: It is safe to initialize a group once it has been zeroed. 
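Returning to the `Request` refcount protocol above: the raw count encodes both the current owner and the number of outstanding `ARef`s. A tiny model of that encoding, not kernel code, just the mapping the comments describe:

```
// Model of the ownership states tracked by the private refcount above.
#[derive(Debug, PartialEq)]
enum Owner {
    BlockLayer,          // refcount == 0
    DriverNoRefs,        // refcount == 1
    DriverWithRefs(u64), // refcount == n >= 2, i.e. n - 1 `ARef`s outstanding
}

fn decode(refcount: u64) -> Owner {
    match refcount {
        0 => Owner::BlockLayer,
        1 => Owner::DriverNoRefs,
        n => Owner::DriverWithRefs(n - 1),
    }
}

// `try_set_end()` succeeds only on the exact 2 -> 0 transition: precisely one
// `ARef` remains, so ending the request cannot race with another holder.
```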
unsafe { bindings::config_group_init_type_name(place, name.cast(), item_type.as_ptr()) @@ -613,7 +613,7 @@ where pub const fn new(name: &'static CStr) -> Self { Self { attribute: Opaque::new(bindings::configfs_attribute { - ca_name: name.as_char_ptr(), + ca_name: crate::str::as_char_ptr_in_const_context(name), ca_owner: core::ptr::null_mut(), ca_mode: 0o660, show: Some(Self::show), diff --git a/rust/kernel/cpu.rs b/rust/kernel/cpu.rs index 5de730c8d81722..cb6c0338ef5a61 100644 --- a/rust/kernel/cpu.rs +++ b/rust/kernel/cpu.rs @@ -109,6 +109,7 @@ impl CpuId { /// unexpectedly due to preemption or CPU migration. It should only be /// used when the context ensures that the task remains on the same CPU /// or the users could use a stale (yet valid) CPU ID. + #[inline] pub fn current() -> Self { // SAFETY: raw_smp_processor_id() always returns a valid CPU ID. unsafe { Self::from_u32_unchecked(bindings::raw_smp_processor_id()) } diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index eea57ba95f241d..21b5b9b8acc10b 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -27,7 +27,6 @@ use crate::clk::Clk; use core::{ cell::UnsafeCell, marker::PhantomData, - mem::MaybeUninit, ops::{Deref, DerefMut}, pin::Pin, ptr, @@ -39,7 +38,8 @@ use macros::vtable; const CPUFREQ_NAME_LEN: usize = bindings::CPUFREQ_NAME_LEN as usize; /// Default transition latency value in nanoseconds. -pub const ETERNAL_LATENCY_NS: u32 = bindings::CPUFREQ_ETERNAL as u32; +pub const DEFAULT_TRANSITION_LATENCY_NS: u32 = + bindings::CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /// CPU frequency driver flags. pub mod flags { @@ -400,13 +400,13 @@ impl TableBuilder { /// The following example demonstrates how to create a CPU frequency table. /// /// ``` -/// use kernel::cpufreq::{ETERNAL_LATENCY_NS, Policy}; +/// use kernel::cpufreq::{DEFAULT_TRANSITION_LATENCY_NS, Policy}; /// /// fn update_policy(policy: &mut Policy) { /// policy /// .set_dvfs_possible_from_any_cpu(true) /// .set_fast_switch_possible(true) -/// .set_transition_latency_ns(ETERNAL_LATENCY_NS); +/// .set_transition_latency_ns(DEFAULT_TRANSITION_LATENCY_NS); /// /// pr_info!("The policy details are: {:?}\n", (policy.cpu(), policy.cur())); /// } @@ -1013,12 +1013,11 @@ impl Registration { } else { None }, - // SAFETY: All zeros is a valid value for `bindings::cpufreq_driver`. - ..unsafe { MaybeUninit::zeroed().assume_init() } + ..pin_init::zeroed() }; const fn copy_name(name: &'static CStr) -> [c_char; CPUFREQ_NAME_LEN] { - let src = name.as_bytes_with_nul(); + let src = name.to_bytes_with_nul(); let mut dst = [0; CPUFREQ_NAME_LEN]; build_assert!(src.len() <= CPUFREQ_NAME_LEN); diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs index 2599f01e8b285f..4a2229542fb73d 100644 --- a/rust/kernel/cred.rs +++ b/rust/kernel/cred.rs @@ -8,11 +8,7 @@ //! //! Reference: -use crate::{ - bindings, - task::Kuid, - types::{AlwaysRefCounted, Opaque}, -}; +use crate::{bindings, sync::aref::AlwaysRefCounted, task::Kuid, types::Opaque}; /// Wraps the kernel's `struct cred`. /// diff --git a/rust/kernel/debugfs.rs b/rust/kernel/debugfs.rs new file mode 100644 index 00000000000000..381c23b3dd839b --- /dev/null +++ b/rust/kernel/debugfs.rs @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +//! DebugFS Abstraction +//! +//! C header: [`include/linux/debugfs.h`](srctree/include/linux/debugfs.h) + +// When DebugFS is disabled, many parameters are dead. Linting for this isn't helpful. 
+#![cfg_attr(not(CONFIG_DEBUG_FS), allow(unused_variables))] + +use crate::prelude::*; +use crate::str::CStr; +#[cfg(CONFIG_DEBUG_FS)] +use crate::sync::Arc; +use crate::uaccess::UserSliceReader; +use core::fmt; +use core::marker::PhantomData; +use core::marker::PhantomPinned; +#[cfg(CONFIG_DEBUG_FS)] +use core::mem::ManuallyDrop; +use core::ops::Deref; + +mod traits; +pub use traits::{Reader, Writer}; + +mod callback_adapters; +use callback_adapters::{FormatAdapter, NoWriter, WritableAdapter}; +mod file_ops; +use file_ops::{FileOps, ReadFile, ReadWriteFile, WriteFile}; +#[cfg(CONFIG_DEBUG_FS)] +mod entry; +#[cfg(CONFIG_DEBUG_FS)] +use entry::Entry; + +/// Owning handle to a DebugFS directory. +/// +/// The directory in the filesystem represented by [`Dir`] will be removed when handle has been +/// dropped *and* all children have been removed. +// If we have a parent, we hold a reference to it in the `Entry`. This prevents the `dentry` +// we point to from being cleaned up if our parent `Dir`/`Entry` is dropped before us. +// +// The `None` option indicates that the `Arc` could not be allocated, so our children would not be +// able to refer to us. In this case, we need to silently fail. All future child directories/files +// will silently fail as well. +#[derive(Clone)] +pub struct Dir(#[cfg(CONFIG_DEBUG_FS)] Option>>); + +impl Dir { + /// Create a new directory in DebugFS. If `parent` is [`None`], it will be created at the root. + fn create(name: &CStr, parent: Option<&Dir>) -> Self { + #[cfg(CONFIG_DEBUG_FS)] + { + let parent_entry = match parent { + // If the parent couldn't be allocated, just early-return + Some(Dir(None)) => return Self(None), + Some(Dir(Some(entry))) => Some(entry.clone()), + None => None, + }; + Self( + // If Arc creation fails, the `Entry` will be dropped, so the directory will be + // cleaned up. + Arc::new(Entry::dynamic_dir(name, parent_entry), GFP_KERNEL).ok(), + ) + } + #[cfg(not(CONFIG_DEBUG_FS))] + Self() + } + + /// Creates a DebugFS file which will own the data produced by the initializer provided in + /// `data`. + fn create_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + file_ops: &'static FileOps, + ) -> impl PinInit, E> + 'a + where + T: Sync + 'static, + { + let scope = Scope::::new(data, move |data| { + #[cfg(CONFIG_DEBUG_FS)] + if let Some(parent) = &self.0 { + // SAFETY: Because data derives from a scope, and our entry will be dropped before + // the data is dropped, it is guaranteed to outlive the entry we return. + unsafe { Entry::dynamic_file(name, parent.clone(), data, file_ops) } + } else { + Entry::empty() + } + }); + try_pin_init! { + File { + scope <- scope + } ? E + } + } + + /// Create a new directory in DebugFS at the root. + /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// let debugfs = Dir::new(c_str!("parent")); + /// ``` + pub fn new(name: &CStr) -> Self { + Dir::create(name, None) + } + + /// Creates a subdirectory within this directory. + /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// let parent = Dir::new(c_str!("parent")); + /// let child = parent.subdir(c_str!("child")); + /// ``` + pub fn subdir(&self, name: &CStr) -> Self { + Dir::create(name, Some(self)) + } + + /// Creates a read-only file in this directory. + /// + /// The file's contents are produced by invoking [`Writer::write`] on the value initialized by + /// `data`. 
+ /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// # use kernel::prelude::*; + /// # let dir = Dir::new(c_str!("my_debugfs_dir")); + /// let file = KBox::pin_init(dir.read_only_file(c_str!("foo"), 200), GFP_KERNEL)?; + /// // "my_debugfs_dir/foo" now contains the number 200. + /// // The file is removed when `file` is dropped. + /// # Ok::<(), Error>(()) + /// ``` + pub fn read_only_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Writer + Send + Sync + 'static, + { + let file_ops = &>::FILE_OPS; + self.create_file(name, data, file_ops) + } + + /// Creates a read-only file in this directory, with contents from a callback. + /// + /// `f` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + /// + /// # Examples + /// + /// ``` + /// # use core::sync::atomic::{AtomicU32, Ordering}; + /// # use kernel::c_str; + /// # use kernel::debugfs::Dir; + /// # use kernel::prelude::*; + /// # let dir = Dir::new(c_str!("foo")); + /// let file = KBox::pin_init( + /// dir.read_callback_file(c_str!("bar"), + /// AtomicU32::new(3), + /// &|val, f| { + /// let out = val.load(Ordering::Relaxed); + /// writeln!(f, "{out:#010x}") + /// }), + /// GFP_KERNEL)?; + /// // Reading "foo/bar" will show "0x00000003". + /// file.store(10, Ordering::Relaxed); + /// // Reading "foo/bar" will now show "0x0000000a". + /// # Ok::<(), Error>(()) + /// ``` + pub fn read_callback_file<'a, T, E: 'a, F>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _f: &'static F, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + { + let file_ops = >::FILE_OPS.adapt(); + self.create_file(name, data, file_ops) + } + + /// Creates a read-write file in this directory. + /// + /// Reading the file uses the [`Writer`] implementation. + /// Writing to the file uses the [`Reader`] implementation. + pub fn read_write_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Writer + Reader + Send + Sync + 'static, + { + let file_ops = &>::FILE_OPS; + self.create_file(name, data, file_ops) + } + + /// Creates a read-write file in this directory, with logic from callbacks. + /// + /// Reading from the file is handled by `f`. Writing to the file is handled by `w`. + /// + /// `f` and `w` must be function items or non-capturing closures. + /// This is statically asserted and not a safety requirement. + pub fn read_write_callback_file<'a, T, E: 'a, F, W>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _f: &'static F, + _w: &'static W, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let file_ops = + , W> as file_ops::ReadWriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, file_ops) + } + + /// Creates a write-only file in this directory. + /// + /// The file owns its backing data. Writing to the file uses the [`Reader`] + /// implementation. + /// + /// The file is removed when the returned [`File`] is dropped. 
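`write_only_file` (just below) lacks a doc example; a sketch in the same style as the `read_only_file` example above, assuming the `Reader` impl for `AtomicU32` added later in this patch and debugfs mounted at the usual /sys/kernel/debug:

```
# use core::sync::atomic::{AtomicU32, Ordering};
# use kernel::c_str;
# use kernel::debugfs::Dir;
# use kernel::prelude::*;
let dir = Dir::new(c_str!("my_driver"));
let level = KBox::pin_init(
    dir.write_only_file(c_str!("level"), AtomicU32::new(0)),
    GFP_KERNEL,
)?;
// `echo 7 > /sys/kernel/debug/my_driver/level` now stores 7 into the atomic.
assert_eq!(level.load(Ordering::Relaxed), 0);
// The file is removed when `level` is dropped.
# Ok::<(), Error>(())
```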
+ pub fn write_only_file<'a, T, E: 'a>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + ) -> impl PinInit, E> + 'a + where + T: Reader + Send + Sync + 'static, + { + self.create_file(name, data, &T::FILE_OPS) + } + + /// Creates a write-only file in this directory, with write logic from a callback. + /// + /// `w` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + pub fn write_callback_file<'a, T, E: 'a, W>( + &'a self, + name: &'a CStr, + data: impl PinInit + 'a, + _w: &'static W, + ) -> impl PinInit, E> + 'a + where + T: Send + Sync + 'static, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let file_ops = , W> as WriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, file_ops) + } + + // While this function is safe, it is intentionally not public because it's a bit of a + // footgun. + // + // Unless you also extract the `entry` later and schedule it for `Drop` at the appropriate + // time, a `ScopedDir` with a `Dir` parent will never be deleted. + fn scoped_dir<'data>(&self, name: &CStr) -> ScopedDir<'data, 'static> { + #[cfg(CONFIG_DEBUG_FS)] + { + let parent_entry = match &self.0 { + None => return ScopedDir::empty(), + Some(entry) => entry.clone(), + }; + ScopedDir { + entry: ManuallyDrop::new(Entry::dynamic_dir(name, Some(parent_entry))), + _phantom: PhantomData, + } + } + #[cfg(not(CONFIG_DEBUG_FS))] + ScopedDir::empty() + } + + /// Creates a new scope, which is a directory associated with some data `T`. + /// + /// The created directory will be a subdirectory of `self`. The `init` closure is called to + /// populate the directory with files and subdirectories. These files can reference the data + /// stored in the scope. + /// + /// The entire directory tree created within the scope will be removed when the returned + /// `Scope` handle is dropped. + pub fn scope<'a, T: 'a, E: 'a, F>( + &'a self, + data: impl PinInit + 'a, + name: &'a CStr, + init: F, + ) -> impl PinInit, E> + 'a + where + F: for<'data, 'dir> FnOnce(&'data T, &'dir ScopedDir<'data, 'dir>) + 'a, + { + Scope::new(data, |data| { + let scoped = self.scoped_dir(name); + init(data, &scoped); + scoped.into_entry() + }) + } +} + +#[pin_data] +/// Handle to a DebugFS scope, which ensures that attached `data` will outlive the DebugFS entry +/// without moving. +/// +/// This is internally used to back [`File`], and used in the API to represent the attachment +/// of a directory lifetime to a data structure which may be jointly accessed by a number of +/// different files. +/// +/// When dropped, a `Scope` will remove all directories and files in the filesystem backed by the +/// attached data structure prior to releasing the attached data. +pub struct Scope { + // This order is load-bearing for drops - `_entry` must be dropped before `data`. + #[cfg(CONFIG_DEBUG_FS)] + _entry: Entry<'static>, + #[pin] + data: T, + // Even if `T` is `Unpin`, we still can't allow it to be moved. + #[pin] + _pin: PhantomPinned, +} + +#[pin_data] +/// Handle to a DebugFS file, owning its backing data. +/// +/// When dropped, the DebugFS file will be removed and the attached data will be dropped. +pub struct File { + #[pin] + scope: Scope, +} + +#[cfg(not(CONFIG_DEBUG_FS))] +impl<'b, T: 'b> Scope { + fn new(data: impl PinInit + 'b, init: F) -> impl PinInit + 'b + where + F: for<'a> FnOnce(&'a T) + 'b, + { + try_pin_init! { + Self { + data <- data, + _pin: PhantomPinned + } ? 
E + } + .pin_chain(|scope| { + init(&scope.data); + Ok(()) + }) + } +} + +#[cfg(CONFIG_DEBUG_FS)] +impl<'b, T: 'b> Scope { + fn entry_mut(self: Pin<&mut Self>) -> &mut Entry<'static> { + // SAFETY: _entry is not structurally pinned. + unsafe { &mut Pin::into_inner_unchecked(self)._entry } + } + + fn new(data: impl PinInit + 'b, init: F) -> impl PinInit + 'b + where + F: for<'a> FnOnce(&'a T) -> Entry<'static> + 'b, + { + try_pin_init! { + Self { + _entry: Entry::empty(), + data <- data, + _pin: PhantomPinned + } ? E + } + .pin_chain(|scope| { + *scope.entry_mut() = init(&scope.data); + Ok(()) + }) + } +} + +impl<'a, T: 'a> Scope { + /// Creates a new scope, which is a directory at the root of the debugfs filesystem, + /// associated with some data `T`. + /// + /// The `init` closure is called to populate the directory with files and subdirectories. These + /// files can reference the data stored in the scope. + /// + /// The entire directory tree created within the scope will be removed when the returned + /// `Scope` handle is dropped. + pub fn dir( + data: impl PinInit + 'a, + name: &'a CStr, + init: F, + ) -> impl PinInit + 'a + where + F: for<'data, 'dir> FnOnce(&'data T, &'dir ScopedDir<'data, 'dir>) + 'a, + { + Scope::new(data, |data| { + let scoped = ScopedDir::new(name); + init(data, &scoped); + scoped.into_entry() + }) + } +} + +impl Deref for Scope { + type Target = T; + fn deref(&self) -> &T { + &self.data + } +} + +impl Deref for File { + type Target = T; + fn deref(&self) -> &T { + &self.scope + } +} + +/// A handle to a directory which will live at most `'dir`, accessing data that will live for at +/// least `'data`. +/// +/// Dropping a ScopedDir will not delete or clean it up, this is expected to occur through dropping +/// the `Scope` that created it. +pub struct ScopedDir<'data, 'dir> { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop>, + _phantom: PhantomData &'dir ()>, +} + +impl<'data, 'dir> ScopedDir<'data, 'dir> { + /// Creates a subdirectory inside this `ScopedDir`. + /// + /// The returned directory handle cannot outlive this one. + pub fn dir<'dir2>(&'dir2 self, name: &CStr) -> ScopedDir<'data, 'dir2> { + #[cfg(not(CONFIG_DEBUG_FS))] + let _ = name; + ScopedDir { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop::new(Entry::dir(name, Some(&*self.entry))), + _phantom: PhantomData, + } + } + + fn create_file(&self, name: &CStr, data: &'data T, vtable: &'static FileOps) { + #[cfg(CONFIG_DEBUG_FS)] + core::mem::forget(Entry::file(name, &self.entry, data, vtable)); + } + + /// Creates a read-only file in this directory. + /// + /// The file's contents are produced by invoking [`Writer::write`]. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_only_file(&self, name: &CStr, data: &'data T) { + self.create_file(name, data, &T::FILE_OPS) + } + + /// Creates a read-only file in this directory, with contents from a callback. + /// + /// The file contents are generated by calling `f` with `data`. + /// + /// + /// `f` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. 
+ pub fn read_callback_file(&self, name: &CStr, data: &'data T, _f: &'static F) + where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + { + let vtable = as ReadFile<_>>::FILE_OPS.adapt(); + self.create_file(name, data, vtable) + } + + /// Creates a read-write file in this directory. + /// + /// Reading the file uses the [`Writer`] implementation on `data`. Writing to the file uses + /// the [`Reader`] implementation on `data`. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_write_file( + &self, + name: &CStr, + data: &'data T, + ) { + let vtable = &>::FILE_OPS; + self.create_file(name, data, vtable) + } + + /// Creates a read-write file in this directory, with logic from callbacks. + /// + /// Reading from the file is handled by `f`. Writing to the file is handled by `w`. + /// + /// `f` and `w` must be function items or non-capturing closures. + /// This is statically asserted and not a safety requirement. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn read_write_callback_file( + &self, + name: &CStr, + data: &'data T, + _f: &'static F, + _w: &'static W, + ) where + T: Send + Sync + 'static, + F: Fn(&T, &mut fmt::Formatter<'_>) -> fmt::Result + Send + Sync, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let vtable = , W> as ReadWriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, vtable) + } + + /// Creates a write-only file in this directory. + /// + /// Writing to the file uses the [`Reader`] implementation on `data`. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn write_only_file(&self, name: &CStr, data: &'data T) { + let vtable = &>::FILE_OPS; + self.create_file(name, data, vtable) + } + + /// Creates a write-only file in this directory, with write logic from a callback. + /// + /// Writing to the file is handled by `w`. + /// + /// `w` must be a function item or a non-capturing closure. + /// This is statically asserted and not a safety requirement. + /// + /// This function does not produce an owning handle to the file. The created + /// file is removed when the [`Scope`] that this directory belongs + /// to is dropped. + pub fn write_only_callback_file(&self, name: &CStr, data: &'data T, _w: &'static W) + where + T: Send + Sync + 'static, + W: Fn(&T, &mut UserSliceReader) -> Result + Send + Sync, + { + let vtable = &, W> as WriteFile<_>>::FILE_OPS + .adapt() + .adapt(); + self.create_file(name, data, vtable) + } + + fn empty() -> Self { + ScopedDir { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop::new(Entry::empty()), + _phantom: PhantomData, + } + } + #[cfg(CONFIG_DEBUG_FS)] + fn into_entry(self) -> Entry<'dir> { + ManuallyDrop::into_inner(self.entry) + } + #[cfg(not(CONFIG_DEBUG_FS))] + fn into_entry(self) {} +} + +impl<'data> ScopedDir<'data, 'static> { + // This is safe, but intentionally not exported due to footgun status. A ScopedDir with no + // parent will never be released by default, and needs to have its entry extracted and used + // somewhere. 
+ fn new(name: &CStr) -> ScopedDir<'data, 'static> { + ScopedDir { + #[cfg(CONFIG_DEBUG_FS)] + entry: ManuallyDrop::new(Entry::dir(name, None)), + _phantom: PhantomData, + } + } +} diff --git a/rust/kernel/debugfs/callback_adapters.rs b/rust/kernel/debugfs/callback_adapters.rs new file mode 100644 index 00000000000000..6c024230f676d5 --- /dev/null +++ b/rust/kernel/debugfs/callback_adapters.rs @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +//! Adapters which allow the user to supply a write or read implementation as a value rather +//! than a trait implementation. If provided, it will override the trait implementation. + +use super::{Reader, Writer}; +use crate::prelude::*; +use crate::uaccess::UserSliceReader; +use core::fmt; +use core::fmt::Formatter; +use core::marker::PhantomData; +use core::ops::Deref; + +/// # Safety +/// +/// To implement this trait, it must be safe to cast a `&Self` to a `&Inner`. +/// It is intended for use in unstacking adapters out of `FileOps` backings. +pub(crate) unsafe trait Adapter { + type Inner; +} + +/// Adapter to implement `Reader` via a callback with the same representation as `T`. +/// +/// * Layer it on top of `WriterAdapter` if you want to add a custom callback for `write`. +/// * Layer it on top of `NoWriter` to pass through any support present on the underlying type. +/// +/// # Invariants +/// +/// If an instance for `WritableAdapter<_, W>` is constructed, `W` is inhabited. +#[repr(transparent)] +pub(crate) struct WritableAdapter { + inner: D, + _writer: PhantomData, +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for WritableAdapter { + type Inner = D; +} + +impl Writer for WritableAdapter { + fn write(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.write(fmt) + } +} + +impl Reader for WritableAdapter +where + W: Fn(&D::Target, &mut UserSliceReader) -> Result + Send + Sync + 'static, +{ + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { + // SAFETY: WritableAdapter<_, W> can only be constructed if W is inhabited + let w: &W = unsafe { materialize_zst() }; + w(self.inner.deref(), reader) + } +} + +/// Adapter to implement `Writer` via a callback with the same representation as `T`. +/// +/// # Invariants +/// +/// If an instance for `FormatAdapter<_, F>` is constructed, `F` is inhabited. +#[repr(transparent)] +pub(crate) struct FormatAdapter { + inner: D, + _formatter: PhantomData, +} + +impl Deref for FormatAdapter { + type Target = D; + fn deref(&self) -> &D { + &self.inner + } +} + +impl Writer for FormatAdapter +where + F: Fn(&D, &mut Formatter<'_>) -> fmt::Result + 'static, +{ + fn write(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + // SAFETY: FormatAdapter<_, F> can only be constructed if F is inhabited + let f: &F = unsafe { materialize_zst() }; + f(&self.inner, fmt) + } +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for FormatAdapter { + type Inner = D; +} + +#[repr(transparent)] +pub(crate) struct NoWriter { + inner: D, +} + +// SAFETY: Stripping off the adapter only removes constraints +unsafe impl Adapter for NoWriter { + type Inner = D; +} + +impl Deref for NoWriter { + type Target = D; + fn deref(&self) -> &D { + &self.inner + } +} + +/// For types with a unique value, produce a static reference to it. 
+/// +/// # Safety +/// +/// The caller asserts that F is inhabited +unsafe fn materialize_zst() -> &'static F { + const { assert!(core::mem::size_of::() == 0) }; + let zst_dangle: core::ptr::NonNull = core::ptr::NonNull::dangling(); + // SAFETY: While the pointer is dangling, it is a dangling pointer to a ZST, based on the + // assertion above. The type is also inhabited, by the caller's assertion. This means + // we can materialize it. + unsafe { zst_dangle.as_ref() } +} diff --git a/rust/kernel/debugfs/entry.rs b/rust/kernel/debugfs/entry.rs new file mode 100644 index 00000000000000..f99402cd3ba0ca --- /dev/null +++ b/rust/kernel/debugfs/entry.rs @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +use crate::debugfs::file_ops::FileOps; +use crate::ffi::c_void; +use crate::str::CStr; +use crate::sync::Arc; +use core::marker::PhantomData; + +/// Owning handle to a DebugFS entry. +/// +/// # Invariants +/// +/// The wrapped pointer will always be `NULL`, an error, or an owned DebugFS `dentry`. +pub(crate) struct Entry<'a> { + entry: *mut bindings::dentry, + // If we were created with an owning parent, this is the keep-alive + _parent: Option>>, + // If we were created with a non-owning parent, this prevents us from outliving it + _phantom: PhantomData<&'a ()>, +} + +// SAFETY: [`Entry`] is just a `dentry` under the hood, which the API promises can be transferred +// between threads. +unsafe impl Send for Entry<'_> {} + +// SAFETY: All the C functions we call on the `dentry` pointer are threadsafe. +unsafe impl Sync for Entry<'_> {} + +impl Entry<'static> { + pub(crate) fn dynamic_dir(name: &CStr, parent: Option>) -> Self { + let parent_ptr = match &parent { + Some(entry) => entry.as_ptr(), + None => core::ptr::null_mut(), + }; + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. + // * `parent_ptr` is either `NULL` (if `parent` is `None`), or a pointer to a valid + // `dentry` by our invariant. `debugfs_create_dir` handles `NULL` pointers correctly. + let entry = unsafe { bindings::debugfs_create_dir(name.as_char_ptr(), parent_ptr) }; + + Entry { + entry, + _parent: parent, + _phantom: PhantomData, + } + } + + /// # Safety + /// + /// * `data` must outlive the returned `Entry`. + pub(crate) unsafe fn dynamic_file( + name: &CStr, + parent: Arc, + data: &T, + file_ops: &'static FileOps, + ) -> Self { + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. + // * `parent.as_ptr()` is a pointer to a valid `dentry` by invariant. + // * The caller guarantees that `data` will outlive the returned `Entry`. + // * The guarantees on `FileOps` assert the vtable will be compatible with the data we have + // provided. + let entry = unsafe { + bindings::debugfs_create_file_full( + name.as_char_ptr(), + file_ops.mode(), + parent.as_ptr(), + core::ptr::from_ref(data) as *mut c_void, + core::ptr::null(), + &**file_ops, + ) + }; + + Entry { + entry, + _parent: Some(parent), + _phantom: PhantomData, + } + } +} + +impl<'a> Entry<'a> { + pub(crate) fn dir(name: &CStr, parent: Option<&'a Entry<'_>>) -> Self { + let parent_ptr = match &parent { + Some(entry) => entry.as_ptr(), + None => core::ptr::null_mut(), + }; + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. 
+ // * `parent_ptr` is either `NULL` (if `parent` is `None`), or a pointer to a valid + // `dentry` (because `parent` is a valid reference to an `Entry`). The lifetime `'a` + // ensures that the parent outlives this entry. + let entry = unsafe { bindings::debugfs_create_dir(name.as_char_ptr(), parent_ptr) }; + + Entry { + entry, + _parent: None, + _phantom: PhantomData, + } + } + + pub(crate) fn file( + name: &CStr, + parent: &'a Entry<'_>, + data: &'a T, + file_ops: &FileOps, + ) -> Self { + // SAFETY: The invariants of this function's arguments ensure the safety of this call. + // * `name` is a valid C string by the invariants of `&CStr`. + // * `parent.as_ptr()` is a pointer to a valid `dentry` because we have `&'a Entry`. + // * `data` is a valid pointer to `T` for lifetime `'a`. + // * The returned `Entry` has lifetime `'a`, so it cannot outlive `parent` or `data`. + // * The caller guarantees that `vtable` is compatible with `data`. + // * The guarantees on `FileOps` assert the vtable will be compatible with the data we have + // provided. + let entry = unsafe { + bindings::debugfs_create_file_full( + name.as_char_ptr(), + file_ops.mode(), + parent.as_ptr(), + core::ptr::from_ref(data) as *mut c_void, + core::ptr::null(), + &**file_ops, + ) + }; + + Entry { + entry, + _parent: None, + _phantom: PhantomData, + } + } +} + +impl Entry<'_> { + /// Constructs a placeholder DebugFS [`Entry`]. + pub(crate) fn empty() -> Self { + Self { + entry: core::ptr::null_mut(), + _parent: None, + _phantom: PhantomData, + } + } + + /// Returns the pointer representation of the DebugFS directory. + /// + /// # Guarantees + /// + /// Due to the type invariant, the value returned from this function will always be an error + /// code, NULL, or a live DebugFS directory. If it is live, it will remain live at least as + /// long as this entry lives. + pub(crate) fn as_ptr(&self) -> *mut bindings::dentry { + self.entry + } +} + +impl Drop for Entry<'_> { + fn drop(&mut self) { + // SAFETY: `debugfs_remove` can take `NULL`, error values, and legal DebugFS dentries. + // `as_ptr` guarantees that the pointer is of this form. + unsafe { bindings::debugfs_remove(self.as_ptr()) } + } +} diff --git a/rust/kernel/debugfs/file_ops.rs b/rust/kernel/debugfs/file_ops.rs new file mode 100644 index 00000000000000..50fead17b6f31f --- /dev/null +++ b/rust/kernel/debugfs/file_ops.rs @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Google LLC. + +use super::{Reader, Writer}; +use crate::debugfs::callback_adapters::Adapter; +use crate::prelude::*; +use crate::seq_file::SeqFile; +use crate::seq_print; +use crate::uaccess::UserSlice; +use core::fmt::{Display, Formatter, Result}; +use core::marker::PhantomData; + +#[cfg(CONFIG_DEBUG_FS)] +use core::ops::Deref; + +/// # Invariant +/// +/// `FileOps` will always contain an `operations` which is safe to use for a file backed +/// off an inode which has a pointer to a `T` in its private data that is safe to convert +/// into a reference. +pub(super) struct FileOps { + #[cfg(CONFIG_DEBUG_FS)] + operations: bindings::file_operations, + #[cfg(CONFIG_DEBUG_FS)] + mode: u16, + _phantom: PhantomData, +} + +impl FileOps { + /// # Safety + /// + /// The caller asserts that the provided `operations` is safe to use for a file whose + /// inode has a pointer to `T` in its private data that is safe to convert into a reference. 
+    const unsafe fn new(operations: bindings::file_operations, mode: u16) -> Self {
+        Self {
+            #[cfg(CONFIG_DEBUG_FS)]
+            operations,
+            #[cfg(CONFIG_DEBUG_FS)]
+            mode,
+            _phantom: PhantomData,
+        }
+    }
+
+    #[cfg(CONFIG_DEBUG_FS)]
+    pub(crate) const fn mode(&self) -> u16 {
+        self.mode
+    }
+}
+
+impl<T: Adapter> FileOps<T> {
+    pub(super) const fn adapt(&self) -> &FileOps<T::Inner> {
+        // SAFETY: `Adapter` asserts that `T` can be legally cast to `T::Inner`.
+        unsafe { core::mem::transmute(self) }
+    }
+}
+
+#[cfg(CONFIG_DEBUG_FS)]
+impl<T> Deref for FileOps<T> {
+    type Target = bindings::file_operations;
+
+    fn deref(&self) -> &Self::Target {
+        &self.operations
+    }
+}
+
+struct WriterAdapter<T>(T);
+
+impl<'a, T: Writer> Display for WriterAdapter<&'a T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+        self.0.write(f)
+    }
+}
+
+/// Implements `open` for `file_operations` via `single_open` to fill out a `seq_file`.
+///
+/// # Safety
+///
+/// * `inode`'s private pointer must point to a value of type `T` which will outlive the `inode`
+///   and will not have any unique references alias it during the call.
+/// * `file` must point to a live, not-yet-initialized file object.
+unsafe extern "C" fn writer_open<T: Writer>(
+    inode: *mut bindings::inode,
+    file: *mut bindings::file,
+) -> c_int {
+    // SAFETY: The caller ensures that `inode` is a valid pointer.
+    let data = unsafe { (*inode).i_private };
+    // SAFETY:
+    // * `file` is acceptable by caller precondition.
+    // * `writer_act` will be called on a `seq_file` with private data set to the third argument,
+    //   so we meet its safety requirements.
+    // * The `data` pointer passed in the third argument is a valid `T` pointer that outlives
+    //   this call by caller preconditions.
+    unsafe { bindings::single_open(file, Some(writer_act::<T>), data) }
+}
+
+/// Prints private data stashed in a seq_file to that seq file.
+///
+/// # Safety
+///
+/// `seq` must point to a live `seq_file` whose private data is a valid pointer to a `T` which may
+/// not have any unique references alias it during the call.
+unsafe extern "C" fn writer_act<T: Writer>(
+    seq: *mut bindings::seq_file,
+    _: *mut c_void,
+) -> c_int {
+    // SAFETY: By caller precondition, this pointer is a valid pointer to a `T`, and
+    // there are not and will not be any unique references until we are done.
+    let data = unsafe { &*((*seq).private.cast::<T>()) };
+    // SAFETY: By caller precondition, `seq_file` points to a live `seq_file`, so we can lift
+    // it.
+    let seq_file = unsafe { SeqFile::from_raw(seq) };
+    seq_print!(seq_file, "{}", WriterAdapter(data));
+    0
+}
+
+// Work around lack of generic const items.
+pub(crate) trait ReadFile<T> {
+    const FILE_OPS: FileOps<T>;
+}
+
+impl<T: Writer> ReadFile<T> for T {
+    const FILE_OPS: FileOps<T> = {
+        let operations = bindings::file_operations {
+            read: Some(bindings::seq_read),
+            llseek: Some(bindings::seq_lseek),
+            release: Some(bindings::single_release),
+            open: Some(writer_open::<T>),
+            // SAFETY: `file_operations` supports zeroes in all fields.
+            ..unsafe { core::mem::zeroed() }
+        };
+        // SAFETY: `operations` is all stock `seq_file` implementations except for `writer_open`.
+        // `open`'s only requirement beyond what is provided to all open functions is that the
+        // inode's data pointer must point to a `T` that will outlive it, which matches the
+        // `FileOps` requirements.
+        unsafe { FileOps::new(operations, 0o400) }
+    };
+}
+
+fn read<T: Reader>(data: &T, buf: *const c_char, count: usize) -> isize {
+    let mut reader = UserSlice::new(UserPtr::from_ptr(buf as *mut c_void), count).reader();
+
+    if let Err(e) = data.read_from_slice(&mut reader) {
+        return e.to_errno() as isize;
+    }
+
+    count as isize
+}
+
+/// # Safety
+///
+/// `file` must be a valid pointer to a `file` struct.
+/// The `private_data` of the file must contain a valid pointer to a `seq_file` whose
+/// `private` data in turn points to a `T` that implements `Reader`.
+/// `buf` must be a valid user-space buffer.
+pub(crate) unsafe extern "C" fn write<T: Reader>(
+    file: *mut bindings::file,
+    buf: *const c_char,
+    count: usize,
+    _ppos: *mut bindings::loff_t,
+) -> isize {
+    // SAFETY: The file was opened with `single_open`, which sets `private_data` to a `seq_file`.
+    let seq = unsafe { &mut *((*file).private_data.cast::<bindings::seq_file>()) };
+    // SAFETY: By caller precondition, this pointer is live and points to a value of type `T`.
+    let data = unsafe { &*(seq.private as *const T) };
+    read(data, buf, count)
+}
+
+// A trait to get the file operations for a type.
+pub(crate) trait ReadWriteFile<T> {
+    const FILE_OPS: FileOps<T>;
+}
+
+impl<T: Writer + Reader> ReadWriteFile<T> for T {
+    const FILE_OPS: FileOps<T> = {
+        let operations = bindings::file_operations {
+            open: Some(writer_open::<T>),
+            read: Some(bindings::seq_read),
+            write: Some(write::<T>),
+            llseek: Some(bindings::seq_lseek),
+            release: Some(bindings::single_release),
+            // SAFETY: `file_operations` supports zeroes in all fields.
+            ..unsafe { core::mem::zeroed() }
+        };
+        // SAFETY: `operations` is all stock `seq_file` implementations except for `writer_open`
+        // and `write`.
+        // `writer_open`'s only requirement beyond what is provided to all open functions is that
+        // the inode's data pointer must point to a `T` that will outlive it, which matches the
+        // `FileOps` requirements.
+        // `write` only requires that the file's private data pointer points to a `seq_file`
+        // which points to a `T` that will outlive it, which matches what `writer_open`
+        // provides.
+        unsafe { FileOps::new(operations, 0o600) }
+    };
+}
+
+/// # Safety
+///
+/// `inode` must be a valid pointer to an `inode` struct.
+/// `file` must be a valid pointer to a `file` struct.
+unsafe extern "C" fn write_only_open(
+    inode: *mut bindings::inode,
+    file: *mut bindings::file,
+) -> c_int {
+    // SAFETY: The caller ensures that `inode` and `file` are valid pointers.
+    unsafe { (*file).private_data = (*inode).i_private };
+    0
+}
+
+/// # Safety
+///
+/// * `file` must be a valid pointer to a `file` struct.
+/// * The `private_data` of the file must contain a valid pointer to a `T` that implements
+///   `Reader`.
+/// * `buf` must be a valid user-space buffer.
+pub(crate) unsafe extern "C" fn write_only_write<T: Reader>(
+    file: *mut bindings::file,
+    buf: *const c_char,
+    count: usize,
+    _ppos: *mut bindings::loff_t,
+) -> isize {
+    // SAFETY: The caller ensures that `file` is a valid pointer and that `private_data` holds a
+    // valid pointer to `T`.
+    let data = unsafe { &*((*file).private_data as *const T) };
+    read(data, buf, count)
+}
+
+pub(crate) trait WriteFile<T> {
+    const FILE_OPS: FileOps<T>;
+}
+
+impl<T: Reader> WriteFile<T> for T {
+    const FILE_OPS: FileOps<T> = {
+        let operations = bindings::file_operations {
+            open: Some(write_only_open),
+            write: Some(write_only_write::<T>),
+            llseek: Some(bindings::noop_llseek),
+            // SAFETY: `file_operations` supports zeroes in all fields.
+            ..unsafe { core::mem::zeroed() }
+        };
+        // SAFETY:
+        // * `write_only_open` populates the file private data with the inode private data.
+        // * `write_only_write`'s only requirement is that the private data of the file point to
+        //   a `T` and be legal to convert to a shared reference, which `write_only_open`
+        //   satisfies.
+        unsafe { FileOps::new(operations, 0o200) }
+    };
+}
diff --git a/rust/kernel/debugfs/traits.rs b/rust/kernel/debugfs/traits.rs
new file mode 100644
index 00000000000000..ab009eb254b321
--- /dev/null
+++ b/rust/kernel/debugfs/traits.rs
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Google LLC.
+
+//! Traits for rendering or updating values exported to DebugFS.
+
+use crate::prelude::*;
+use crate::sync::Mutex;
+use crate::uaccess::UserSliceReader;
+use core::fmt::{self, Debug, Formatter};
+use core::str::FromStr;
+use core::sync::atomic::{
+    AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, AtomicU32, AtomicU64,
+    AtomicU8, AtomicUsize, Ordering,
+};
+
+/// A trait for types that can be written into a string.
+///
+/// This works very similarly to `Debug`, and is automatically implemented if `Debug` is
+/// implemented for a type. It is also implemented for any writable type inside a `Mutex`.
+///
+/// The derived implementation of `Debug` [may
+/// change](https://doc.rust-lang.org/std/fmt/trait.Debug.html#stability)
+/// between Rust versions, so if stability is key for your use case, please implement `Writer`
+/// explicitly instead.
+pub trait Writer {
+    /// Formats the value using the given formatter.
+    fn write(&self, f: &mut Formatter<'_>) -> fmt::Result;
+}
+
+impl<T: Writer> Writer for Mutex<T> {
+    fn write(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        self.lock().write(f)
+    }
+}
+
+impl<T: Debug> Writer for T {
+    fn write(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        writeln!(f, "{self:?}")
+    }
+}
+
+/// A trait for types that can be updated from a user slice.
+///
+/// This works similarly to `FromStr`, but operates on a `UserSliceReader` rather than a `&str`.
+///
+/// It is automatically implemented for all atomic integers, or any type that implements `FromStr`
+/// wrapped in a `Mutex`.
+pub trait Reader {
+    /// Updates the value from the given user slice.
+    fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result;
+}
+
+impl<T: FromStr> Reader for Mutex<T> {
+    fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result {
+        let mut buf = [0u8; 128];
+        if reader.len() > buf.len() {
+            return Err(EINVAL);
+        }
+        let n = reader.len();
+        reader.read_slice(&mut buf[..n])?;
+
+        let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?;
+        let val = s.trim().parse::<T>().map_err(|_| EINVAL)?;
+        *self.lock() = val;
+        Ok(())
+    }
+}
+
+macro_rules! impl_reader_for_atomic {
+    ($(($atomic_type:ty, $int_type:ty)),*) => {
+        $(
+            impl Reader for $atomic_type {
+                fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result {
+                    let mut buf = [0u8; 21]; // Enough for a 64-bit number.
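+                    // 20 bytes fit `i64::MIN` / `u64::MAX`; the spare byte absorbs a trailing
+                    // newline (e.g. from `echo`).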
+ if reader.len() > buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; + + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?; + let val = s.trim().parse::<$int_type>().map_err(|_| EINVAL)?; + self.store(val, Ordering::Relaxed); + Ok(()) + } + } + )* + }; +} + +impl_reader_for_atomic!( + (AtomicI16, i16), + (AtomicI32, i32), + (AtomicI64, i64), + (AtomicI8, i8), + (AtomicIsize, isize), + (AtomicU16, u16), + (AtomicU32, u32), + (AtomicU64, u64), + (AtomicU8, u8), + (AtomicUsize, usize) +); diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 5902b3714a1662..1321e6f0b53c9a 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -5,10 +5,11 @@ //! C header: [`include/linux/device.h`](srctree/include/linux/device.h) use crate::{ - bindings, - types::{ARef, ForeignOwnable, Opaque}, + bindings, fmt, + sync::aref::ARef, + types::{ForeignOwnable, Opaque}, }; -use core::{fmt, marker::PhantomData, ptr}; +use core::{marker::PhantomData, ptr}; #[cfg(CONFIG_PRINTK)] use crate::c_str; @@ -138,7 +139,9 @@ pub mod property; /// } /// ``` /// -/// An example for a class device implementation is [`drm::Device`]. +/// An example for a class device implementation is +#[cfg_attr(CONFIG_DRM = "y", doc = "[`drm::Device`](kernel::drm::Device).")] +#[cfg_attr(not(CONFIG_DRM = "y"), doc = "`drm::Device`.")] /// /// # Invariants /// @@ -151,7 +154,6 @@ pub mod property; /// dropped from any thread. /// /// [`AlwaysRefCounted`]: kernel::types::AlwaysRefCounted -/// [`drm::Device`]: kernel::drm::Device /// [`impl_device_context_deref`]: kernel::impl_device_context_deref /// [`pci::Device`]: kernel::pci::Device /// [`platform::Device`]: kernel::platform::Device @@ -405,7 +407,7 @@ kernel::impl_device_context_deref!(unsafe { Device }); kernel::impl_device_context_into_aref!(Device); // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_raw()) }; @@ -571,7 +573,7 @@ macro_rules! impl_device_context_deref { #[macro_export] macro_rules! __impl_device_context_into_aref { ($src:ty, $device:tt) => { - impl ::core::convert::From<&$device<$src>> for $crate::types::ARef<$device> { + impl ::core::convert::From<&$device<$src>> for $crate::sync::aref::ARef<$device> { fn from(dev: &$device<$src>) -> Self { (&**dev).into() } @@ -595,7 +597,7 @@ macro_rules! impl_device_context_into_aref { macro_rules! dev_printk { ($method:ident, $dev:expr, $($f:tt)*) => { { - ($dev).$method(::core::format_args!($($f)*)); + ($dev).$method($crate::prelude::fmt!($($f)*)); } } } diff --git a/rust/kernel/device/property.rs b/rust/kernel/device/property.rs index 49ee12a906dbad..3a332a8c53a9eb 100644 --- a/rust/kernel/device/property.rs +++ b/rust/kernel/device/property.rs @@ -11,6 +11,7 @@ use crate::{ alloc::KVec, bindings, error::{to_result, Result}, + fmt, prelude::*, str::{CStr, CString}, types::{ARef, Opaque}, @@ -68,16 +69,16 @@ impl FwNode { unsafe { bindings::is_of_node(self.as_raw()) } } - /// Returns an object that implements [`Display`](core::fmt::Display) for + /// Returns an object that implements [`Display`](fmt::Display) for /// printing the name of a node. /// /// This is an alternative to the default `Display` implementation, which /// prints the full path. 
- pub fn display_name(&self) -> impl core::fmt::Display + '_ { + pub fn display_name(&self) -> impl fmt::Display + '_ { struct FwNodeDisplayName<'a>(&'a FwNode); - impl core::fmt::Display for FwNodeDisplayName<'_> { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + impl fmt::Display for FwNodeDisplayName<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // SAFETY: `self` is valid by its type invariant. let name = unsafe { bindings::fwnode_get_name(self.0.as_raw()) }; if name.is_null() { @@ -87,7 +88,7 @@ impl FwNode { // - `fwnode_get_name` returns null or a valid C string. // - `name` was checked to be non-null. let name = unsafe { CStr::from_char_ptr(name) }; - write!(f, "{name}") + fmt::Display::fmt(name, f) } } @@ -351,8 +352,8 @@ impl FwNodeReferenceArgs { } } -impl core::fmt::Debug for FwNodeReferenceArgs { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Debug for FwNodeReferenceArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self.as_slice()) } } @@ -377,8 +378,8 @@ enum Node<'a> { Owned(ARef), } -impl core::fmt::Display for FwNode { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Display for FwNode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // The logic here is the same as the one in lib/vsprintf.c // (fwnode_full_name_string). @@ -413,9 +414,9 @@ impl core::fmt::Display for FwNode { // SAFETY: `fwnode_get_name_prefix` returns null or a // valid C string. let prefix = unsafe { CStr::from_char_ptr(prefix) }; - write!(f, "{prefix}")?; + fmt::Display::fmt(prefix, f)?; } - write!(f, "{}", fwnode.display_name())?; + fmt::Display::fmt(&fwnode.display_name(), f)?; } Ok(()) diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index d04e3fcebafbb3..1325459622189e 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -13,8 +13,8 @@ use crate::{ ffi::c_void, prelude::*, revocable::{Revocable, RevocableGuard}, - sync::{rcu, Completion}, - types::{ARef, ForeignOwnable, Opaque, ScopeGuard}, + sync::{aref::ARef, rcu, Completion}, + types::{ForeignOwnable, Opaque, ScopeGuard}, }; use pin_init::Wrapper; diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index 2bc8ab51ec280f..68fe6762442438 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -9,8 +9,8 @@ use crate::{ device::{Bound, Core}, error::{to_result, Result}, prelude::*, + sync::aref::ARef, transmute::{AsBytes, FromBytes}, - types::ARef, }; /// Trait to be implemented by DMA capable bus devices. diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs index d29c477e89a87d..0956ba0f64dea3 100644 --- a/rust/kernel/drm/device.rs +++ b/rust/kernel/drm/device.rs @@ -2,7 +2,7 @@ //! DRM device. //! -//! C header: [`include/linux/drm/drm_device.h`](srctree/include/linux/drm/drm_device.h) +//! 
C header: [`include/drm/drm_device.h`](srctree/include/drm/drm_device.h) use crate::{ alloc::allocator::Kmalloc, @@ -82,8 +82,8 @@ impl Device { major: T::INFO.major, minor: T::INFO.minor, patchlevel: T::INFO.patchlevel, - name: T::INFO.name.as_char_ptr().cast_mut(), - desc: T::INFO.desc.as_char_ptr().cast_mut(), + name: crate::str::as_char_ptr_in_const_context(T::INFO.name).cast_mut(), + desc: crate::str::as_char_ptr_in_const_context(T::INFO.desc).cast_mut(), driver_features: drm::driver::FEAT_GEM, ioctls: T::IOCTLS.as_ptr(), diff --git a/rust/kernel/drm/driver.rs b/rust/kernel/drm/driver.rs index fe7e8d06961aa5..d2dad77274c4ca 100644 --- a/rust/kernel/drm/driver.rs +++ b/rust/kernel/drm/driver.rs @@ -2,7 +2,7 @@ //! DRM driver core. //! -//! C header: [`include/linux/drm/drm_drv.h`](srctree/include/linux/drm/drm_drv.h) +//! C header: [`include/drm/drm_drv.h`](srctree/include/drm/drm_drv.h) use crate::{ bindings, device, devres, drm, diff --git a/rust/kernel/drm/file.rs b/rust/kernel/drm/file.rs index e8789c9110d654..8c46f8d519516a 100644 --- a/rust/kernel/drm/file.rs +++ b/rust/kernel/drm/file.rs @@ -2,7 +2,7 @@ //! DRM File objects. //! -//! C header: [`include/linux/drm/drm_file.h`](srctree/include/linux/drm/drm_file.h) +//! C header: [`include/drm/drm_file.h`](srctree/include/drm/drm_file.h) use crate::{bindings, drm, error::Result, prelude::*, types::Opaque}; use core::marker::PhantomData; diff --git a/rust/kernel/drm/gem/mod.rs b/rust/kernel/drm/gem/mod.rs index b71821cfb5eaa0..b9f3248876baa3 100644 --- a/rust/kernel/drm/gem/mod.rs +++ b/rust/kernel/drm/gem/mod.rs @@ -2,7 +2,7 @@ //! DRM GEM API //! -//! C header: [`include/linux/drm/drm_gem.h`](srctree/include/linux/drm/drm_gem.h) +//! C header: [`include/drm/drm_gem.h`](srctree/include/drm/drm_gem.h) use crate::{ alloc::flags::*, diff --git a/rust/kernel/drm/ioctl.rs b/rust/kernel/drm/ioctl.rs index fdec01c371687c..8431cdcd3ae0ef 100644 --- a/rust/kernel/drm/ioctl.rs +++ b/rust/kernel/drm/ioctl.rs @@ -2,7 +2,7 @@ //! DRM IOCTL definitions. //! -//! C header: [`include/linux/drm/drm_ioctl.h`](srctree/include/linux/drm/drm_ioctl.h) +//! C header: [`include/drm/drm_ioctl.h`](srctree/include/drm/drm_ioctl.h) use crate::ioctl; diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index a41de293dcd11b..1c0e0e241daa91 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -2,7 +2,9 @@ //! Kernel errors. //! -//! C header: [`include/uapi/asm-generic/errno-base.h`](srctree/include/uapi/asm-generic/errno-base.h) +//! C header: [`include/uapi/asm-generic/errno-base.h`](srctree/include/uapi/asm-generic/errno-base.h)\ +//! C header: [`include/uapi/asm-generic/errno.h`](srctree/include/uapi/asm-generic/errno.h)\ +//! C header: [`include/linux/errno.h`](srctree/include/linux/errno.h) use crate::{ alloc::{layout::LayoutError, AllocError}, @@ -101,8 +103,23 @@ pub struct Error(NonZeroI32); impl Error { /// Creates an [`Error`] from a kernel error code. /// - /// It is a bug to pass an out-of-range `errno`. `EINVAL` would - /// be returned in such a case. + /// `errno` must be within error code range (i.e. `>= -MAX_ERRNO && < 0`). + /// + /// It is a bug to pass an out-of-range `errno`. [`code::EINVAL`] is returned in such a case. 
+ /// + /// # Examples + /// + /// ``` + /// assert_eq!(Error::from_errno(-1), EPERM); + /// assert_eq!(Error::from_errno(-2), ENOENT); + /// ``` + /// + /// The following calls are considered a bug: + /// + /// ``` + /// assert_eq!(Error::from_errno(0), EINVAL); + /// assert_eq!(Error::from_errno(-1000000), EINVAL); + /// ``` pub fn from_errno(errno: crate::ffi::c_int) -> Error { if let Some(error) = Self::try_from_errno(errno) { error @@ -158,7 +175,7 @@ impl Error { } /// Returns a string representing the error, if one exists. - #[cfg(not(any(test, testlib)))] + #[cfg(not(testlib))] pub fn name(&self) -> Option<&'static CStr> { // SAFETY: Just an FFI call, there are no extra safety requirements. let ptr = unsafe { bindings::errname(-self.0.get()) }; @@ -175,7 +192,7 @@ impl Error { /// When `testlib` is configured, this always returns `None` to avoid the dependency on a /// kernel function so that tests that use this (e.g., by calling [`Result::unwrap`]) can still /// run in userspace. - #[cfg(any(test, testlib))] + #[cfg(testlib)] pub fn name(&self) -> Option<&'static CStr> { None } @@ -375,8 +392,43 @@ impl From for Error { /// [Rust documentation]: https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html pub type Result = core::result::Result; -/// Converts an integer as returned by a C kernel function to an error if it's negative, and -/// `Ok(())` otherwise. +/// Converts an integer as returned by a C kernel function to a [`Result`]. +/// +/// If the integer is negative, an [`Err`] with an [`Error`] as given by [`Error::from_errno`] is +/// returned. This means the integer must be `>= -MAX_ERRNO`. +/// +/// Otherwise, it returns [`Ok`]. +/// +/// It is a bug to pass an out-of-range negative integer. `Err(EINVAL)` is returned in such a case. +/// +/// # Examples +/// +/// This function may be used to easily perform early returns with the [`?`] operator when working +/// with C APIs within Rust abstractions: +/// +/// ``` +/// # use kernel::error::to_result; +/// # mod bindings { +/// # #![expect(clippy::missing_safety_doc)] +/// # use kernel::prelude::*; +/// # pub(super) unsafe fn f1() -> c_int { 0 } +/// # pub(super) unsafe fn f2() -> c_int { EINVAL.to_errno() } +/// # } +/// fn f() -> Result { +/// // SAFETY: ... +/// to_result(unsafe { bindings::f1() })?; +/// +/// // SAFETY: ... +/// to_result(unsafe { bindings::f2() })?; +/// +/// // ... +/// +/// Ok(()) +/// } +/// # assert_eq!(f(), Err(EINVAL)); +/// ``` +/// +/// [`?`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#the-question-mark-operator pub fn to_result(err: crate::ffi::c_int) -> Result { if err < 0 { Err(Error::from_errno(err)) diff --git a/rust/kernel/firmware.rs b/rust/kernel/firmware.rs index 1abab5b2f0522e..94e6bb88b90306 100644 --- a/rust/kernel/firmware.rs +++ b/rust/kernel/firmware.rs @@ -291,7 +291,7 @@ impl ModInfoBuilder { let module_name = this.module_name; if !this.module_name.is_empty() { - this = this.push_internal(module_name.as_bytes_with_nul()); + this = this.push_internal(module_name.to_bytes_with_nul()); if N != 0 { // Re-use the space taken by the NULL terminator and swap it with the '.' separator. 
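
The fs/file.rs hunk below is a direct application of the `to_result` helper documented above:
a negative C return value becomes an early `Err` via `?`, and the success path keeps the
non-negative value. A minimal sketch of the shape (the `raw_fd()` helper is hypothetical and
stands in for an FFI call returning a file descriptor or a negative errno):

    fn reserve_fd() -> Result<u32> {
        let fd: i32 = raw_fd(); // hypothetical binding; returns fd >= 0 or -errno
        to_result(fd)?; // Err(Error::from_errno(fd)) when fd < 0
        Ok(fd as u32) // otherwise keep the value
    }
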
diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs index 35fd5db35c4652..cf06e73a6da041 100644 --- a/rust/kernel/fs/file.rs +++ b/rust/kernel/fs/file.rs @@ -10,8 +10,10 @@ use crate::{ bindings, cred::Credential, - error::{code::*, Error, Result}, - types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, + error::{code::*, to_result, Error, Result}, + fmt, + sync::aref::{ARef, AlwaysRefCounted}, + types::{NotThreadSafe, Opaque}, }; use core::ptr; @@ -398,9 +400,8 @@ impl FileDescriptorReservation { pub fn get_unused_fd_flags(flags: u32) -> Result { // SAFETY: FFI call, there are no safety requirements on `flags`. let fd: i32 = unsafe { bindings::get_unused_fd_flags(flags) }; - if fd < 0 { - return Err(Error::from_errno(fd)); - } + to_result(fd)?; + Ok(Self { fd: fd as u32, _not_send: NotThreadSafe, @@ -460,8 +461,8 @@ impl From for Error { } } -impl core::fmt::Debug for BadFdError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl fmt::Debug for BadFdError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.pad("EBADF") } } diff --git a/rust/kernel/io.rs b/rust/kernel/io.rs index 03b467722b8651..ee182b0b5452df 100644 --- a/rust/kernel/io.rs +++ b/rust/kernel/io.rs @@ -8,6 +8,7 @@ use crate::error::{code::EINVAL, Result}; use crate::{bindings, build_assert, ffi::c_void}; pub mod mem; +pub mod poll; pub mod resource; pub use resource::Resource; diff --git a/rust/kernel/io/poll.rs b/rust/kernel/io/poll.rs new file mode 100644 index 00000000000000..613eb25047efc4 --- /dev/null +++ b/rust/kernel/io/poll.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! IO polling. +//! +//! C header: [`include/linux/iopoll.h`](srctree/include/linux/iopoll.h). + +use crate::{ + error::{code::*, Result}, + processor::cpu_relax, + task::might_sleep, + time::{delay::fsleep, Delta, Instant, Monotonic}, +}; + +/// Polls periodically until a condition is met, an error occurs, +/// or the timeout is reached. +/// +/// The function repeatedly executes the given operation `op` closure and +/// checks its result using the condition closure `cond`. +/// +/// If `cond` returns `true`, the function returns successfully with +/// the result of `op`. Otherwise, it waits for a duration specified +/// by `sleep_delta` before executing `op` again. +/// +/// This process continues until either `op` returns an error, `cond` +/// returns `true`, or the timeout specified by `timeout_delta` is +/// reached. +/// +/// This function can only be used in a nonatomic context. +/// +/// # Errors +/// +/// If `op` returns an error, then that error is returned directly. +/// +/// If the timeout specified by `timeout_delta` is reached, then +/// `Err(ETIMEDOUT)` is returned. +/// +/// # Examples +/// +/// ```no_run +/// use kernel::io::{Io, poll::read_poll_timeout}; +/// use kernel::time::Delta; +/// +/// const HW_READY: u16 = 0x01; +/// +/// fn wait_for_hardware(io: &Io) -> Result<()> { +/// match read_poll_timeout( +/// // The `op` closure reads the value of a specific status register. +/// || io.try_read16(0x1000), +/// // The `cond` closure takes a reference to the value returned by `op` +/// // and checks whether the hardware is ready. +/// |val: &u16| *val == HW_READY, +/// Delta::from_millis(50), +/// Delta::from_secs(3), +/// ) { +/// Ok(_) => { +/// // The hardware is ready. The returned value of the `op` closure +/// // isn't used. 
+///             Ok(())
+///         }
+///         Err(e) => Err(e),
+///     }
+/// }
+/// ```
+#[track_caller]
+pub fn read_poll_timeout<Op, Cond, T>(
+    mut op: Op,
+    mut cond: Cond,
+    sleep_delta: Delta,
+    timeout_delta: Delta,
+) -> Result<T>
+where
+    Op: FnMut() -> Result<T>,
+    Cond: FnMut(&T) -> bool,
+{
+    let start: Instant<Monotonic> = Instant::now();
+
+    // Unlike the C version, we always call `might_sleep()` unconditionally,
+    // as conditional calls are error-prone. We clearly separate
+    // `read_poll_timeout()` and `read_poll_timeout_atomic()` to aid
+    // tools like klint.
+    might_sleep();
+
+    loop {
+        let val = op()?;
+        if cond(&val) {
+            // Unlike the C version, we immediately return.
+            // We know the condition is met so we don't need to check again.
+            return Ok(val);
+        }
+
+        if start.elapsed() > timeout_delta {
+            // Unlike the C version, we immediately return.
+            // We have just called `op()` so we don't need to call it again.
+            return Err(ETIMEDOUT);
+        }
+
+        if !sleep_delta.is_zero() {
+            fsleep(sleep_delta);
+        }
+
+        // `fsleep()` could be a busy-wait loop so we always call `cpu_relax()`.
+        cpu_relax();
+    }
+}
diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs
new file mode 100644
index 00000000000000..20abd405665596
--- /dev/null
+++ b/rust/kernel/irq.rs
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! IRQ abstractions.
+//!
+//! An IRQ is an interrupt request from a device. It is used to get the CPU's
+//! attention so it can service a hardware event in a timely manner.
+//!
+//! The current abstractions handle IRQ requests and handlers, i.e. they allow
+//! drivers to register a handler for a given IRQ line.
+//!
+//! C header: [`include/linux/interrupt.h`](srctree/include/linux/interrupt.h)
+
+/// Flags to be used when registering IRQ handlers.
+mod flags;
+
+/// IRQ allocation and handling.
+mod request;
+
+pub use flags::Flags;
+
+pub use request::{
+    Handler, IrqRequest, IrqReturn, Registration, ThreadedHandler, ThreadedIrqReturn,
+    ThreadedRegistration,
+};
diff --git a/rust/kernel/irq/flags.rs b/rust/kernel/irq/flags.rs
new file mode 100644
index 00000000000000..adfde96ec47cf4
--- /dev/null
+++ b/rust/kernel/irq/flags.rs
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+// SPDX-FileCopyrightText: Copyright 2025 Collabora ltd.
+
+use crate::bindings;
+use crate::prelude::*;
+
+/// Flags to be used when registering IRQ handlers.
+///
+/// Flags can be used to request specific behaviors when registering an IRQ
+/// handler, and can be combined using the `|`, `&`, and `!` operators to
+/// further control the system's behavior.
+///
+/// A common use case is to register a shared interrupt, as sharing the line
+/// between devices is increasingly common in modern systems and is even
+/// required for some buses. This requires setting [`Flags::SHARED`] when
+/// requesting the interrupt. Other use cases include setting the trigger type
+/// through `Flags::TRIGGER_*`, which determines when the interrupt fires, or
+/// controlling whether the interrupt is masked after the handler runs by using
+/// [`Flags::ONESHOT`].
+///
+/// If an invalid combination of flags is provided, the system will refuse to
+/// register the handler, and lower layers will enforce certain flags when
+/// necessary. This means, for example, that all the
+/// [`crate::irq::Registration`] for a shared interrupt have to agree on
+/// [`Flags::SHARED`] and on the same trigger type, if set.
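+///
+/// # Examples
+///
+/// Combining flags for a shared, falling-edge interrupt (a sketch; the resulting value would
+/// then be passed to a registration API such as [`crate::irq::Registration::new`]):
+///
+/// ```ignore
+/// use kernel::irq::Flags;
+///
+/// let flags = Flags::SHARED | Flags::TRIGGER_FALLING;
+/// ```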
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub struct Flags(c_ulong);
+
+impl Flags {
+    /// Use the interrupt line as already configured.
+    pub const TRIGGER_NONE: Flags = Flags::new(bindings::IRQF_TRIGGER_NONE);
+
+    /// The interrupt is triggered when the signal goes from low to high.
+    pub const TRIGGER_RISING: Flags = Flags::new(bindings::IRQF_TRIGGER_RISING);
+
+    /// The interrupt is triggered when the signal goes from high to low.
+    pub const TRIGGER_FALLING: Flags = Flags::new(bindings::IRQF_TRIGGER_FALLING);
+
+    /// The interrupt is triggered while the signal is held high.
+    pub const TRIGGER_HIGH: Flags = Flags::new(bindings::IRQF_TRIGGER_HIGH);
+
+    /// The interrupt is triggered while the signal is held low.
+    pub const TRIGGER_LOW: Flags = Flags::new(bindings::IRQF_TRIGGER_LOW);
+
+    /// Allow sharing the IRQ among several devices.
+    pub const SHARED: Flags = Flags::new(bindings::IRQF_SHARED);
+
+    /// Set by callers when they expect sharing mismatches to occur.
+    pub const PROBE_SHARED: Flags = Flags::new(bindings::IRQF_PROBE_SHARED);
+
+    /// Flag to mark this interrupt as a timer interrupt.
+    pub const TIMER: Flags = Flags::new(bindings::IRQF_TIMER);
+
+    /// Interrupt is per CPU.
+    pub const PERCPU: Flags = Flags::new(bindings::IRQF_PERCPU);
+
+    /// Flag to exclude this interrupt from irq balancing.
+    pub const NOBALANCING: Flags = Flags::new(bindings::IRQF_NOBALANCING);
+
+    /// Interrupt is used for polling (only the interrupt that is registered
+    /// first in a shared interrupt is considered for performance reasons).
+    pub const IRQPOLL: Flags = Flags::new(bindings::IRQF_IRQPOLL);
+
+    /// Interrupt is not re-enabled after the hardirq handler finished. Used by
+    /// threaded interrupts which need to keep the irq line disabled until the
+    /// threaded handler has been run.
+    pub const ONESHOT: Flags = Flags::new(bindings::IRQF_ONESHOT);
+
+    /// Do not disable this IRQ during suspend. Does not guarantee that this
+    /// interrupt will wake the system from a suspended state.
+    pub const NO_SUSPEND: Flags = Flags::new(bindings::IRQF_NO_SUSPEND);
+
+    /// Force enable it on resume even if [`Flags::NO_SUSPEND`] is set.
+    pub const FORCE_RESUME: Flags = Flags::new(bindings::IRQF_FORCE_RESUME);
+
+    /// Interrupt cannot be threaded.
+    pub const NO_THREAD: Flags = Flags::new(bindings::IRQF_NO_THREAD);
+
+    /// Resume IRQ early during syscore instead of at device resume time.
+    pub const EARLY_RESUME: Flags = Flags::new(bindings::IRQF_EARLY_RESUME);
+
+    /// If the IRQ is shared with a [`Flags::NO_SUSPEND`] user, execute this
+    /// interrupt handler after suspending interrupts. For system wakeup devices
+    /// users need to implement wakeup detection in their interrupt handlers.
+    pub const COND_SUSPEND: Flags = Flags::new(bindings::IRQF_COND_SUSPEND);
+
+    /// Don't enable IRQ or NMI automatically when users request it. Users will
+    /// enable it explicitly by `enable_irq` or `enable_nmi` later.
+    pub const NO_AUTOEN: Flags = Flags::new(bindings::IRQF_NO_AUTOEN);
+
+    /// Exclude from runaway detection for IPI and similar handlers; depends on
+    /// `PERCPU`.
+    pub const NO_DEBUG: Flags = Flags::new(bindings::IRQF_NO_DEBUG);
+
+    pub(crate) fn into_inner(self) -> c_ulong {
+        self.0
+    }
+
+    const fn new(value: u32) -> Self {
+        build_assert!(value as u64 <= c_ulong::MAX as u64);
+        Self(value as c_ulong)
+    }
+}
+
+impl core::ops::BitOr for Flags {
+    type Output = Self;
+    fn bitor(self, rhs: Self) -> Self::Output {
+        Self(self.0 | rhs.0)
+    }
+}
+
+impl core::ops::BitAnd for Flags {
+    type Output = Self;
+    fn bitand(self, rhs: Self) -> Self::Output {
+        Self(self.0 & rhs.0)
+    }
+}
+
+impl core::ops::Not for Flags {
+    type Output = Self;
+    fn not(self) -> Self::Output {
+        Self(!self.0)
+    }
+}
diff --git a/rust/kernel/irq/request.rs b/rust/kernel/irq/request.rs
new file mode 100644
index 00000000000000..b150563fdef809
--- /dev/null
+++ b/rust/kernel/irq/request.rs
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+// SPDX-FileCopyrightText: Copyright 2025 Collabora ltd.
+
+//! This module provides types like [`Registration`] and
+//! [`ThreadedRegistration`], which allow users to register handlers for a given
+//! IRQ line.
+
+use core::marker::PhantomPinned;
+
+use crate::alloc::Allocator;
+use crate::device::{Bound, Device};
+use crate::devres::Devres;
+use crate::error::to_result;
+use crate::irq::flags::Flags;
+use crate::prelude::*;
+use crate::str::CStr;
+use crate::sync::Arc;
+
+/// The value that can be returned from a [`Handler`] or a [`ThreadedHandler`].
+#[repr(u32)]
+pub enum IrqReturn {
+    /// The interrupt was not from this device or was not handled.
+    None = bindings::irqreturn_IRQ_NONE,
+
+    /// The interrupt was handled by this device.
+    Handled = bindings::irqreturn_IRQ_HANDLED,
+}
+
+/// Callbacks for an IRQ handler.
+pub trait Handler: Sync {
+    /// The hard IRQ handler.
+    ///
+    /// This is executed in interrupt context, hence all corresponding
+    /// limitations do apply.
+    ///
+    /// All work that does not necessarily need to be executed from
+    /// interrupt context should be deferred to a threaded handler.
+    /// See also [`ThreadedRegistration`].
+    fn handle(&self, device: &Device<Bound>) -> IrqReturn;
+}
+
+impl<T: ?Sized + Handler + Send> Handler for Arc<T> {
+    fn handle(&self, device: &Device<Bound>) -> IrqReturn {
+        T::handle(self, device)
+    }
+}
+
+impl<T: ?Sized + Handler, A: Allocator> Handler for Box<T, A> {
+    fn handle(&self, device: &Device<Bound>) -> IrqReturn {
+        T::handle(self, device)
+    }
+}
+
+/// # Invariants
+///
+/// - `self.irq` is the same as the one passed to `request_{threaded}_irq`.
+/// - `cookie` was passed to `request_{threaded}_irq` as the cookie. It is guaranteed to be unique
+///   by the type system, since each call to `new` will return a different instance of
+///   `Registration`.
+#[pin_data(PinnedDrop)]
+struct RegistrationInner {
+    irq: u32,
+    cookie: *mut c_void,
+}
+
+impl RegistrationInner {
+    fn synchronize(&self) {
+        // SAFETY: Safe as per the invariants of `RegistrationInner`.
+        unsafe { bindings::synchronize_irq(self.irq) };
+    }
+}
+
+#[pinned_drop]
+impl PinnedDrop for RegistrationInner {
+    fn drop(self: Pin<&mut Self>) {
+        // SAFETY:
+        //
+        // Safe as per the invariants of `RegistrationInner` and:
+        //
+        // - The containing struct is `!Unpin` and was initialized using
+        //   pin-init, so it occupied the same memory location for the entirety of
+        //   its lifetime.
+        //
+        // Notice that this will block until all handlers finish executing,
+        // i.e.: at no point will &self be invalid while the handler is running.
+        unsafe { bindings::free_irq(self.irq, self.cookie) };
+    }
+}
+
+// SAFETY: We only use `inner` on drop, which is called at most once with no
+// concurrent access.
+unsafe impl Sync for RegistrationInner {}
+
+// SAFETY: It is safe to send `RegistrationInner` across threads.
+unsafe impl Send for RegistrationInner {}
+
+/// A request for an IRQ line for a given device.
+///
+/// # Invariants
+///
+/// - `irq` is the number of an interrupt source of `dev`.
+/// - `irq` has not been registered yet.
+pub struct IrqRequest<'a> {
+    dev: &'a Device<Bound>,
+    irq: u32,
+}
+
+impl<'a> IrqRequest<'a> {
+    /// Creates a new IRQ request for the given device and IRQ number.
+    ///
+    /// # Safety
+    ///
+    /// - `irq` should be a valid IRQ number for `dev`.
+    pub(crate) unsafe fn new(dev: &'a Device<Bound>, irq: u32) -> Self {
+        // INVARIANT: `irq` is a valid IRQ number for `dev`.
+        IrqRequest { dev, irq }
+    }
+
+    /// Returns the IRQ number of an [`IrqRequest`].
+    pub fn irq(&self) -> u32 {
+        self.irq
+    }
+}
+
+/// A registration of an IRQ handler for a given IRQ line.
+///
+/// # Examples
+///
+/// The following is an example of using `Registration`. It uses a
+/// [`Completion`] to coordinate between the IRQ
+/// handler and process context. [`Completion`] uses interior mutability, so the
+/// handler can signal with [`Completion::complete_all()`] and the process
+/// context can wait with [`Completion::wait_for_completion()`] even though
+/// there is no way to get a mutable reference to any of the fields in
+/// `Data`.
+///
+/// [`Completion`]: kernel::sync::Completion
+/// [`Completion::complete_all()`]: kernel::sync::Completion::complete_all
+/// [`Completion::wait_for_completion()`]: kernel::sync::Completion::wait_for_completion
+///
+/// ```
+/// use kernel::c_str;
+/// use kernel::device::{Bound, Device};
+/// use kernel::irq::{self, Flags, IrqRequest, IrqReturn, Registration};
+/// use kernel::prelude::*;
+/// use kernel::sync::{Arc, Completion};
+///
+/// // Data shared between process and IRQ context.
+/// #[pin_data]
+/// struct Data {
+///     #[pin]
+///     completion: Completion,
+/// }
+///
+/// impl irq::Handler for Data {
+///     // Executed in IRQ context.
+///     fn handle(&self, _dev: &Device<Bound>) -> IrqReturn {
+///         self.completion.complete_all();
+///         IrqReturn::Handled
+///     }
+/// }
+///
+/// // Registers an IRQ handler for the given IrqRequest.
+/// //
+/// // This runs in process context and assumes `request` was previously acquired from a device.
+/// fn register_irq(
+///     handler: impl PinInit<Data, Error>,
+///     request: IrqRequest<'_>,
+/// ) -> Result<Arc<Registration<Data>>> {
+///     let registration = Registration::new(request, Flags::SHARED, c_str!("my_device"), handler);
+///
+///     let registration = Arc::pin_init(registration, GFP_KERNEL)?;
+///
+///     registration.handler().completion.wait_for_completion();
+///
+///     Ok(registration)
+/// }
+/// # Ok::<(), Error>(())
+/// ```
+///
+/// # Invariants
+///
+/// * We own an irq handler whose cookie is a pointer to `Self`.
+#[pin_data]
+pub struct Registration<T: Handler + 'static> {
+    #[pin]
+    inner: Devres<RegistrationInner>,
+
+    #[pin]
+    handler: T,
+
+    /// Pinned because we need address stability so that we can pass a pointer
+    /// to the callback.
+    #[pin]
+    _pin: PhantomPinned,
+}
+
+impl<T: Handler + 'static> Registration<T> {
+    /// Registers the IRQ handler with the system for the given IRQ number.
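+    ///
+    /// The registration is tied to the underlying device via [`Devres`]: the handler is
+    /// unregistered with `free_irq` when this value is dropped or when the device is unbound,
+    /// whichever happens first.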
+    pub fn new<'a>(
+        request: IrqRequest<'a>,
+        flags: Flags,
+        name: &'static CStr,
+        handler: impl PinInit<T, Error> + 'a,
+    ) -> impl PinInit<Self, Error> + 'a {
+        try_pin_init!(&this in Self {
+            handler <- handler,
+            inner <- Devres::new(
+                request.dev,
+                try_pin_init!(RegistrationInner {
+                    // INVARIANT: `this` is a valid pointer to the `Registration` instance.
+                    cookie: this.as_ptr().cast::<c_void>(),
+                    irq: {
+                        // SAFETY:
+                        // - The callbacks are valid for use with request_irq.
+                        // - If this succeeds, the slot is guaranteed to be valid until the
+                        //   destructor of Self runs, which will deregister the callbacks
+                        //   before the memory location becomes invalid.
+                        // - When request_irq is called, everything that handle_irq_callback will
+                        //   touch has already been initialized, so it's safe for the callback to
+                        //   be called immediately.
+                        to_result(unsafe {
+                            bindings::request_irq(
+                                request.irq,
+                                Some(handle_irq_callback::<T>),
+                                flags.into_inner(),
+                                name.as_char_ptr(),
+                                this.as_ptr().cast::<c_void>(),
+                            )
+                        })?;
+                        request.irq
+                    }
+                })
+            ),
+            _pin: PhantomPinned,
+        })
+    }
+
+    /// Returns a reference to the handler that was registered with the system.
+    pub fn handler(&self) -> &T {
+        &self.handler
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    ///
+    /// This will attempt to access the inner [`Devres`] container.
+    pub fn try_synchronize(&self) -> Result {
+        let inner = self.inner.try_access().ok_or(ENODEV)?;
+        inner.synchronize();
+        Ok(())
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    pub fn synchronize(&self, dev: &Device<Bound>) -> Result {
+        let inner = self.inner.access(dev)?;
+        inner.synchronize();
+        Ok(())
+    }
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_irq`.
+unsafe extern "C" fn handle_irq_callback<T: Handler>(_irq: i32, ptr: *mut c_void) -> c_uint {
+    // SAFETY: `ptr` is a pointer to `Registration<T>` set in `Registration::new`.
+    let registration = unsafe { &*(ptr as *const Registration<T>) };
+    // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq
+    // callback is running implies that the device has not yet been unbound.
+    let device = unsafe { registration.inner.device().as_bound() };
+
+    T::handle(&registration.handler, device) as c_uint
+}
+
+/// The value that can be returned from [`ThreadedHandler::handle`].
+#[repr(u32)]
+pub enum ThreadedIrqReturn {
+    /// The interrupt was not from this device or was not handled.
+    None = bindings::irqreturn_IRQ_NONE,
+
+    /// The interrupt was handled by this device.
+    Handled = bindings::irqreturn_IRQ_HANDLED,
+
+    /// The handler wants the handler thread to wake up.
+    WakeThread = bindings::irqreturn_IRQ_WAKE_THREAD,
+}
+
+/// Callbacks for a threaded IRQ handler.
+pub trait ThreadedHandler: Sync {
+    /// The hard IRQ handler.
+    ///
+    /// This is executed in interrupt context, hence all corresponding
+    /// limitations do apply. All work that does not necessarily need to be
+    /// executed from interrupt context should be deferred to the threaded
+    /// handler, i.e. [`ThreadedHandler::handle_threaded`].
+    ///
+    /// The default implementation returns [`ThreadedIrqReturn::WakeThread`].
+    #[expect(unused_variables)]
+    fn handle(&self, device: &Device<Bound>) -> ThreadedIrqReturn {
+        ThreadedIrqReturn::WakeThread
+    }
+
+    /// The threaded IRQ handler.
+    ///
+    /// This is executed in process context. The kernel creates a dedicated
+    /// `kthread` for this purpose.
+    fn handle_threaded(&self, device: &Device<Bound>) -> IrqReturn;
+}
+
+impl<T: ?Sized + ThreadedHandler + Send> ThreadedHandler for Arc<T> {
+    fn handle(&self, device: &Device<Bound>) -> ThreadedIrqReturn {
+        T::handle(self, device)
+    }
+
+    fn handle_threaded(&self, device: &Device<Bound>) -> IrqReturn {
+        T::handle_threaded(self, device)
+    }
+}
+
+impl<T: ?Sized + ThreadedHandler, A: Allocator> ThreadedHandler for Box<T, A> {
+    fn handle(&self, device: &Device<Bound>) -> ThreadedIrqReturn {
+        T::handle(self, device)
+    }
+
+    fn handle_threaded(&self, device: &Device<Bound>) -> IrqReturn {
+        T::handle_threaded(self, device)
+    }
+}
+
+/// A registration of a threaded IRQ handler for a given IRQ line.
+///
+/// Two callbacks are required: one to handle the IRQ, and one to handle any
+/// other work in a separate thread.
+///
+/// The thread handler is only called if the IRQ handler returns
+/// [`ThreadedIrqReturn::WakeThread`].
+///
+/// # Examples
+///
+/// The following is an example of using [`ThreadedRegistration`]. It uses a
+/// [`Mutex`](kernel::sync::Mutex) to provide interior mutability.
+///
+/// ```
+/// use kernel::c_str;
+/// use kernel::device::{Bound, Device};
+/// use kernel::irq::{
+///     self, Flags, IrqRequest, IrqReturn, ThreadedHandler, ThreadedIrqReturn,
+///     ThreadedRegistration,
+/// };
+/// use kernel::prelude::*;
+/// use kernel::sync::{Arc, Mutex};
+///
+/// // Declare a struct that will be passed in when the interrupt fires. The u32
+/// // merely serves as an example of some internal data.
+/// //
+/// // [`irq::ThreadedHandler::handle`] takes `&self`. This example
+/// // illustrates how interior mutability can be used when sharing the data
+/// // between process context and IRQ context.
+/// #[pin_data]
+/// struct Data {
+///     #[pin]
+///     value: Mutex<u32>,
+/// }
+///
+/// impl ThreadedHandler for Data {
+///     // This will run (in a separate kthread) if and only if
+///     // [`ThreadedHandler::handle`] returns [`WakeThread`], which it does by
+///     // default.
+///     fn handle_threaded(&self, _dev: &Device<Bound>) -> IrqReturn {
+///         let mut data = self.value.lock();
+///         *data += 1;
+///         IrqReturn::Handled
+///     }
+/// }
+///
+/// // Registers a threaded IRQ handler for the given [`IrqRequest`].
+/// //
+/// // This is executing in process context and assumes that `request` was
+/// // previously acquired from a device.
+/// fn register_threaded_irq(
+///     handler: impl PinInit<Data, Error>,
+///     request: IrqRequest<'_>,
+/// ) -> Result<Arc<ThreadedRegistration<Data>>> {
+///     let registration =
+///         ThreadedRegistration::new(request, Flags::SHARED, c_str!("my_device"), handler);
+///
+///     let registration = Arc::pin_init(registration, GFP_KERNEL)?;
+///
+///     {
+///         // The data can be accessed from process context too.
+///         let mut data = registration.handler().value.lock();
+///         *data += 1;
+///     }
+///
+///     Ok(registration)
+/// }
+/// # Ok::<(), Error>(())
+/// ```
+///
+/// # Invariants
+///
+/// * We own an irq handler whose cookie is a pointer to `Self`.
+#[pin_data]
+pub struct ThreadedRegistration<T: ThreadedHandler + 'static> {
+    #[pin]
+    inner: Devres<RegistrationInner>,
+
+    #[pin]
+    handler: T,
+
+    /// Pinned because we need address stability so that we can pass a pointer
+    /// to the callback.
+    #[pin]
+    _pin: PhantomPinned,
+}
+
+impl<T: ThreadedHandler + 'static> ThreadedRegistration<T> {
+    /// Registers the IRQ handler with the system for the given IRQ number.
+    pub fn new<'a>(
+        request: IrqRequest<'a>,
+        flags: Flags,
+        name: &'static CStr,
+        handler: impl PinInit<T, Error> + 'a,
+    ) -> impl PinInit<Self, Error> + 'a {
+        try_pin_init!(&this in Self {
+            handler <- handler,
+            inner <- Devres::new(
+                request.dev,
+                try_pin_init!(RegistrationInner {
+                    // INVARIANT: `this` is a valid pointer to the `ThreadedRegistration` instance.
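+                    // (The cookie address is stable: `Self` contains `PhantomPinned` and is
+                    // initialized via pin-init, so it cannot move after initialization.)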
+                    cookie: this.as_ptr().cast::<c_void>(),
+                    irq: {
+                        // SAFETY:
+                        // - The callbacks are valid for use with request_threaded_irq.
+                        // - If this succeeds, the slot is guaranteed to be valid until the
+                        //   destructor of Self runs, which will deregister the callbacks
+                        //   before the memory location becomes invalid.
+                        // - When request_threaded_irq is called, everything that the two callbacks
+                        //   will touch has already been initialized, so it's safe for the
+                        //   callbacks to be called immediately.
+                        to_result(unsafe {
+                            bindings::request_threaded_irq(
+                                request.irq,
+                                Some(handle_threaded_irq_callback::<T>),
+                                Some(thread_fn_callback::<T>),
+                                flags.into_inner(),
+                                name.as_char_ptr(),
+                                this.as_ptr().cast::<c_void>(),
+                            )
+                        })?;
+                        request.irq
+                    }
+                })
+            ),
+            _pin: PhantomPinned,
+        })
+    }
+
+    /// Returns a reference to the handler that was registered with the system.
+    pub fn handler(&self) -> &T {
+        &self.handler
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    ///
+    /// This will attempt to access the inner [`Devres`] container.
+    pub fn try_synchronize(&self) -> Result {
+        let inner = self.inner.try_access().ok_or(ENODEV)?;
+        inner.synchronize();
+        Ok(())
+    }
+
+    /// Wait for pending IRQ handlers on other CPUs.
+    pub fn synchronize(&self, dev: &Device<Bound>) -> Result {
+        let inner = self.inner.access(dev)?;
+        inner.synchronize();
+        Ok(())
+    }
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_threaded_irq`.
+unsafe extern "C" fn handle_threaded_irq_callback<T: ThreadedHandler>(
+    _irq: i32,
+    ptr: *mut c_void,
+) -> c_uint {
+    // SAFETY: `ptr` is a pointer to `ThreadedRegistration<T>` set in `ThreadedRegistration::new`.
+    let registration = unsafe { &*(ptr as *const ThreadedRegistration<T>) };
+    // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq
+    // callback is running implies that the device has not yet been unbound.
+    let device = unsafe { registration.inner.device().as_bound() };
+
+    T::handle(&registration.handler, device) as c_uint
+}
+
+/// # Safety
+///
+/// This function should be only used as the callback in `request_threaded_irq`.
+unsafe extern "C" fn thread_fn_callback<T: ThreadedHandler>(_irq: i32, ptr: *mut c_void) -> c_uint {
+    // SAFETY: `ptr` is a pointer to `ThreadedRegistration<T>` set in `ThreadedRegistration::new`.
+    let registration = unsafe { &*(ptr as *const ThreadedRegistration<T>) };
+    // SAFETY: The irq callback is removed before the device is unbound, so the fact that the irq
+    // callback is running implies that the device has not yet been unbound.
+    let device = unsafe { registration.inner.device().as_bound() };
+
+    T::handle_threaded(&registration.handler, device) as c_uint
+}
diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs
index 41efd87595d6ea..3a43886cc14e45 100644
--- a/rust/kernel/kunit.rs
+++ b/rust/kernel/kunit.rs
@@ -6,8 +6,8 @@
 //!
 //! Reference: <https://docs.kernel.org/dev-tools/kunit/index.html>
 
+use crate::fmt;
 use crate::prelude::*;
-use core::fmt;
 
 #[cfg(CONFIG_PRINTK)]
 use crate::c_str;
@@ -74,14 +74,14 @@ macro_rules! kunit_assert {
             // mistake (it is hidden to prevent that).
             //
             // This mimics KUnit's failed assertion format.
-            $crate::kunit::err(format_args!(
+            $crate::kunit::err($crate::prelude::fmt!(
                 "    # {}: ASSERTION FAILED at {FILE}:{LINE}\n",
                 $name
             ));
-            $crate::kunit::err(format_args!(
+            $crate::kunit::err($crate::prelude::fmt!(
                 "    Expected {CONDITION} to be true, but is false\n"
            ));
-            $crate::kunit::err(format_args!(
+            $crate::kunit::err($crate::prelude::fmt!(
                 "    Failure not reported to KUnit since this is a non-KUnit task\n"
             ));
             break 'out;
@@ -102,12 +102,12 @@ macro_rules! kunit_assert {
             unsafe impl Sync for UnaryAssert {}
 
             static LOCATION: Location = Location($crate::bindings::kunit_loc {
-                file: FILE.as_char_ptr(),
+                file: $crate::str::as_char_ptr_in_const_context(FILE),
                 line: LINE,
             });
             static ASSERTION: UnaryAssert = UnaryAssert($crate::bindings::kunit_unary_assert {
                 assert: $crate::bindings::kunit_assert {},
-                condition: CONDITION.as_char_ptr(),
+                condition: $crate::str::as_char_ptr_in_const_context(CONDITION),
                 expected_true: true,
             });
@@ -202,7 +202,7 @@ pub const fn kunit_case(
 ) -> kernel::bindings::kunit_case {
     kernel::bindings::kunit_case {
         run_case: Some(run_case),
-        name: name.as_char_ptr(),
+        name: kernel::str::as_char_ptr_in_const_context(name),
         attr: kernel::bindings::kunit_attributes {
             speed: kernel::bindings::kunit_speed_KUNIT_SPEED_NORMAL,
         },
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index fef97f2a50984f..09ee3d17ee0a60 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -17,6 +17,7 @@
 // the unstable features in use.
 //
 // Stable since Rust 1.79.0.
+#![feature(generic_nonzero)]
 #![feature(inline_const)]
 //
 // Stable since Rust 1.81.0.
@@ -28,6 +29,7 @@
 // Stable since Rust 1.83.0.
 #![feature(const_maybe_uninit_as_mut_ptr)]
 #![feature(const_mut_refs)]
+#![feature(const_option)]
 #![feature(const_ptr_write)]
 #![feature(const_refs_to_cell)]
 //
@@ -76,6 +78,7 @@ pub mod cpu;
 pub mod cpufreq;
 pub mod cpumask;
 pub mod cred;
+pub mod debugfs;
 pub mod device;
 pub mod device_id;
 pub mod devres;
@@ -92,6 +95,7 @@ pub mod fs;
 pub mod init;
 pub mod io;
 pub mod ioctl;
+pub mod irq;
 pub mod jump_label;
 #[cfg(CONFIG_KUNIT)]
 pub mod kunit;
@@ -110,6 +114,8 @@ pub mod pid_namespace;
 pub mod platform;
 pub mod prelude;
 pub mod print;
+pub mod processor;
+pub mod ptr;
 pub mod rbtree;
 pub mod regulator;
 pub mod revocable;
@@ -206,7 +212,7 @@ impl ThisModule {
     }
 }
 
-#[cfg(not(any(testlib, test)))]
+#[cfg(not(testlib))]
 #[panic_handler]
 fn panic(info: &core::panic::PanicInfo<'_>) -> ! {
     pr_emerg!("{}\n", info);
diff --git a/rust/kernel/list.rs b/rust/kernel/list.rs
index 44e5219cfcbcbb..7355bbac16a7fe 100644
--- a/rust/kernel/list.rs
+++ b/rust/kernel/list.rs
@@ -38,6 +38,8 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField};
 ///
 /// # Examples
 ///
+/// Use [`ListLinks`] as the type of the intrusive field.
+///
 /// ```
 /// use kernel::list::*;
 ///
@@ -140,6 +142,124 @@ pub use self::arc_field::{define_list_arc_field_getter, ListArcField};
///     }
///     # Result::<(), Error>::Ok(())
/// ```
+///
+/// Use [`ListLinksSelfPtr`] as the type of the intrusive field. This allows a list of a trait
+/// object type.
+///
+/// ```
+/// use kernel::list::*;
+///
+/// trait Foo {
+///     fn foo(&self) -> (&'static str, i32);
+/// }
+///
+/// #[pin_data]
+/// struct DTWrap<T: ?Sized> {
+///     #[pin]
+///     links: ListLinksSelfPtr<DTWrap<dyn Foo>>,
+///     value: T,
+/// }
+///
+/// impl<T> DTWrap<T> {
+///     fn new(value: T) -> Result<ListArc<Self>> {
+///         ListArc::pin_init(try_pin_init!(Self {
+///             value,
+///             links <- ListLinksSelfPtr::new(),
+///         }), GFP_KERNEL)
+///     }
+/// }
+///
+/// impl_list_arc_safe! {
+///     impl{T: ?Sized} ListArcSafe<0> for DTWrap<T> { untracked; }
+/// }
+/// impl_list_item! {
+///     impl ListItem<0> for DTWrap<dyn Foo> { using ListLinksSelfPtr { self.links }; }
+/// }
+///
+/// // Create a new empty list.
+/// let mut list = List::<DTWrap<dyn Foo>>::new();
+/// {
+///     assert!(list.is_empty());
+/// }
+///
+/// struct A(i32);
+/// // `A` returns the inner value for `foo`.
+/// impl Foo for A { fn foo(&self) -> (&'static str, i32) { ("a", self.0) } }
+///
+/// struct B;
+/// // `B` always returns 42.
+/// impl Foo for B { fn foo(&self) -> (&'static str, i32) { ("b", 42) } }
+///
+/// // Insert 3 elements using `push_back()`.
+/// list.push_back(DTWrap::new(A(15))?);
+/// list.push_back(DTWrap::new(A(32))?);
+/// list.push_back(DTWrap::new(B)?);
+///
+/// // Iterate over the list to verify the nodes were inserted correctly.
+/// // [A(15), A(32), B]
+/// {
+///     let mut iter = list.iter();
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 15));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 32));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42));
+///     assert!(iter.next().is_none());
+///
+///     // Verify the length of the list.
+///     assert_eq!(list.iter().count(), 3);
+/// }
+///
+/// // Pop the items from the list using `pop_back()` and verify the content.
+/// {
+///     assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("b", 42));
+///     assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 32));
+///     assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 15));
+/// }
+///
+/// // Insert 3 elements using `push_front()`.
+/// list.push_front(DTWrap::new(A(15))?);
+/// list.push_front(DTWrap::new(A(32))?);
+/// list.push_front(DTWrap::new(B)?);
+///
+/// // Iterate over the list to verify the nodes were inserted correctly.
+/// // [B, A(32), A(15)]
+/// {
+///     let mut iter = list.iter();
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 32));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 15));
+///     assert!(iter.next().is_none());
+///
+///     // Verify the length of the list.
+///     assert_eq!(list.iter().count(), 3);
+/// }
+///
+/// // Pop the items from the back of the list using `pop_back()` and verify the content.
+/// {
+///     assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 15));
+///     assert_eq!(list.pop_back().ok_or(EINVAL)?.value.foo(), ("a", 32));
+/// }
+///
+/// // Push `list2` to `list` through `push_all_back()`.
+/// // list: [B]
+/// // list2: [B, A(25)]
+/// {
+///     let mut list2 = List::<DTWrap<dyn Foo>>::new();
+///     list2.push_back(DTWrap::new(B)?);
+///     list2.push_back(DTWrap::new(A(25))?);
+///
+///     list.push_all_back(&mut list2);
+///
+///     // list: [B, B, A(25)]
+///     // list2: []
+///     let mut iter = list.iter();
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("b", 42));
+///     assert_eq!(iter.next().ok_or(EINVAL)?.value.foo(), ("a", 25));
+///     assert!(iter.next().is_none());
+///     assert!(list2.is_empty());
+/// }
+/// # Result::<(), Error>::Ok(())
+/// ```
 pub struct List<T: ?Sized + ListItem<ID>, const ID: u64 = 0> {
     first: *mut ListLinksFields,
     _ty: PhantomData<ListArc<T, ID>>,
diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs
index 6373fe183b2748..d3aa7d25afad81 100644
--- a/rust/kernel/miscdevice.rs
+++ b/rust/kernel/miscdevice.rs
@@ -34,7 +34,7 @@ impl MiscDeviceOptions {
         // SAFETY: All zeros is valid for this C type.
         let mut result: bindings::miscdevice = unsafe { MaybeUninit::zeroed().assume_init() };
         result.minor = bindings::MISC_DYNAMIC_MINOR as ffi::c_int;
-        result.name = self.name.as_char_ptr();
+        result.name = crate::str::as_char_ptr_in_const_context(self.name);
         result.fops = MiscdeviceVTable::<T>::build();
         result
     }
diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs
index 7de5cc7a0eeee0..be1027b7961b70 100644
--- a/rust/kernel/net/phy.rs
+++ b/rust/kernel/net/phy.rs
@@ -497,7 +497,7 @@ unsafe impl Sync for DriverVTable {}
 pub const fn create_phy_driver<T: Driver>() -> DriverVTable {
     // INVARIANT: All the fields of `struct phy_driver` are initialized properly.
     DriverVTable(Opaque::new(bindings::phy_driver {
-        name: T::NAME.as_char_ptr().cast_mut(),
+        name: crate::str::as_char_ptr_in_const_context(T::NAME).cast_mut(),
         flags: T::FLAGS,
         phy_id: T::PHY_DEVICE_ID.id(),
         phy_id_mask: T::PHY_DEVICE_ID.mask_as_int(),
diff --git a/rust/kernel/of.rs b/rust/kernel/of.rs
index b76b35265df2ea..58b20c367f993f 100644
--- a/rust/kernel/of.rs
+++ b/rust/kernel/of.rs
@@ -34,7 +34,7 @@ unsafe impl RawDeviceIdIndex for DeviceId {
 impl DeviceId {
     /// Create a new device id from an OF 'compatible' string.
     pub const fn new(compatible: &'static CStr) -> Self {
-        let src = compatible.as_bytes_with_nul();
+        let src = compatible.to_bytes_with_nul();
         // Replace with `bindings::of_device_id::default()` once stabilized for `const`.
         // SAFETY: FFI type is valid to be zero-initialized.
         let mut of: bindings::of_device_id = unsafe { core::mem::zeroed() };
diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs
index 887ee611b55310..7fcc5f6022c19f 100644
--- a/rust/kernel/pci.rs
+++ b/rust/kernel/pci.rs
@@ -10,10 +10,11 @@ use crate::{
     devres::Devres,
     driver,
     error::{from_result, to_result, Result},
-    io::Io,
-    io::IoRaw,
+    io::{Io, IoRaw},
+    irq::{self, IrqRequest},
     str::CStr,
-    types::{ARef, Opaque},
+    sync::aref::ARef,
+    types::Opaque,
     ThisModule,
 };
 use core::{
@@ -23,6 +24,10 @@ use core::{
 };
 use kernel::prelude::*;
 
+mod id;
+
+pub use self::id::{Class, ClassMask, Vendor};
+
 /// An adapter for the registration of PCI drivers.
 pub struct Adapter<T: Driver>(T);
 
@@ -60,7 +65,7 @@ impl Adapter {
     extern "C" fn probe_callback(
         pdev: *mut bindings::pci_dev,
         id: *const bindings::pci_device_id,
-    ) -> kernel::ffi::c_int {
+    ) -> c_int {
         // SAFETY: The PCI bus only ever calls the probe callback with a valid pointer to a
         // `struct pci_dev`.
         //
@@ -128,10 +133,11 @@ impl DeviceId {
 
     /// Equivalent to C's `PCI_DEVICE` macro.
     ///
-    /// Create a new `pci::DeviceId` from a vendor and device ID number.
-    pub const fn from_id(vendor: u32, device: u32) -> Self {
+    /// Create a new `pci::DeviceId` from a vendor and device ID.
+    #[inline]
+    pub const fn from_id(vendor: Vendor, device: u32) -> Self {
         Self(bindings::pci_device_id {
-            vendor,
+            vendor: vendor.as_raw() as u32,
             device,
             subvendor: DeviceId::PCI_ANY_ID,
             subdevice: DeviceId::PCI_ANY_ID,
@@ -145,6 +151,7 @@ impl DeviceId {
     /// Equivalent to C's `PCI_DEVICE_CLASS` macro.
     ///
     /// Create a new `pci::DeviceId` from a class number and mask.
+    #[inline]
     pub const fn from_class(class: u32, class_mask: u32) -> Self {
         Self(bindings::pci_device_id {
             vendor: DeviceId::PCI_ANY_ID,
@@ -157,6 +164,29 @@ impl DeviceId {
             override_only: 0,
         })
     }
+
+    /// Create a new [`DeviceId`] from a class number, mask, and specific vendor.
+    ///
+    /// This is more targeted than [`DeviceId::from_class`]: in addition to matching by
+    /// [`Vendor`], it also matches the PCI [`Class`] (up to the entire 24 bits, depending on the
+    /// [`ClassMask`]).
+    #[inline]
+    pub const fn from_class_and_vendor(
+        class: Class,
+        class_mask: ClassMask,
+        vendor: Vendor,
+    ) -> Self {
+        Self(bindings::pci_device_id {
+            vendor: vendor.as_raw() as u32,
+            device: DeviceId::PCI_ANY_ID,
+            subvendor: DeviceId::PCI_ANY_ID,
+            subdevice: DeviceId::PCI_ANY_ID,
+            class: class.as_raw(),
+            class_mask: class_mask.as_raw(),
+            driver_data: 0,
+            override_only: 0,
+        })
+    }
 }
 
 // SAFETY: `DeviceId` is a `#[repr(transparent)]` wrapper of `pci_device_id` and does not add
@@ -206,7 +236,7 @@ macro_rules! pci_device_table {
 ///     <MyDriver as pci::Driver>::IdInfo,
 ///     [
 ///         (
-///             pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, bindings::PCI_ANY_ID as u32),
+///             pci::DeviceId::from_id(pci::Vendor::REDHAT, bindings::PCI_ANY_ID as u32),
 ///             (),
 ///         )
 ///     ]
@@ -240,11 +270,11 @@ pub trait Driver: Send {
 
     /// PCI driver probe.
     ///
-    /// Called when a new platform device is added or discovered.
-    /// Implementers should attempt to initialize the device here.
+    /// Called when a new PCI device is added or discovered. Implementers should
+    /// attempt to initialize the device here.
     fn probe(dev: &Device<Core>, id_info: &Self::IdInfo) -> Result<Pin<KBox<Self>>>;
 
-    /// Platform driver unbind.
+    /// PCI driver unbind.
     ///
     /// Called when a [`Device`] is unbound from its bound [`Driver`]. Implementing this callback
     /// is optional.
@@ -347,7 +377,7 @@ impl Bar {
             // `ioptr` is valid by the safety requirements.
             // `num` is valid by the safety requirements.
             unsafe {
-                bindings::pci_iounmap(pdev.as_raw(), ioptr as *mut kernel::ffi::c_void);
+                bindings::pci_iounmap(pdev.as_raw(), ioptr as *mut c_void);
                 bindings::pci_release_region(pdev.as_raw(), num);
             }
         }
@@ -359,6 +389,7 @@ impl Bar {
 }
 
 impl Bar {
+    #[inline]
     fn index_is_valid(index: u32) -> bool {
         // A `struct pci_dev` owns an array of resources with at most `PCI_NUM_RESOURCES` entries.
         index < bindings::PCI_NUM_RESOURCES
@@ -381,24 +412,90 @@ impl Deref for Bar {
 }
 
 impl Device {
+    #[inline]
     fn as_raw(&self) -> *mut bindings::pci_dev {
         self.0.get()
     }
 }
 
 impl Device {
-    /// Returns the PCI vendor ID.
-    pub fn vendor_id(&self) -> u16 {
+    /// Returns the PCI vendor ID as [`Vendor`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use kernel::{device::Core, pci::{self, Vendor}, prelude::*};
+    /// fn log_device_info(pdev: &pci::Device<Core>) -> Result {
+    ///     // Get an instance of `Vendor`.
+    ///     let vendor = pdev.vendor_id();
+    ///     dev_info!(
+    ///         pdev.as_ref(),
+    ///         "Device: Vendor={}, Device=0x{:x}\n",
+    ///         vendor,
+    ///         pdev.device_id()
+    ///     );
+    ///     Ok(())
+    /// }
+    /// ```
+    #[inline]
+    pub fn vendor_id(&self) -> Vendor {
         // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`.
-        unsafe { (*self.as_raw()).vendor }
+        let vendor_id = unsafe { (*self.as_raw()).vendor };
+        Vendor::from_raw(vendor_id)
     }
 
     /// Returns the PCI device ID.
+    #[inline]
     pub fn device_id(&self) -> u16 {
-        // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`.
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
         unsafe { (*self.as_raw()).device }
     }
 
+    /// Returns the PCI revision ID.
+    #[inline]
+    pub fn revision_id(&self) -> u8 {
+        // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a
+        // `struct pci_dev`.
+        unsafe { (*self.as_raw()).revision }
+    }
+
+    /// Returns the PCI bus device/function.
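+    ///
+    /// The encoding matches C's `pci_dev_id()`/`PCI_DEVID()`: the bus number in the high byte
+    /// and the devfn in the low byte.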
+ #[inline] + pub fn dev_id(&self) -> u16 { + // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a + // `struct pci_dev`. + unsafe { bindings::pci_dev_id(self.as_raw()) } + } + + /// Returns the PCI subsystem vendor ID. + #[inline] + pub fn subsystem_vendor_id(&self) -> u16 { + // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a + // `struct pci_dev`. + unsafe { (*self.as_raw()).subsystem_vendor } + } + + /// Returns the PCI subsystem device ID. + #[inline] + pub fn subsystem_device_id(&self) -> u16 { + // SAFETY: By its type invariant `self.as_raw` is always a valid pointer to a + // `struct pci_dev`. + unsafe { (*self.as_raw()).subsystem_device } + } + + /// Returns the start of the given PCI bar resource. + pub fn resource_start(&self, bar: u32) -> Result { + if !Bar::index_is_valid(bar) { + return Err(EINVAL); + } + + // SAFETY: + // - `bar` is a valid bar number, as guaranteed by the above call to `Bar::index_is_valid`, + // - by its type invariant `self.as_raw` is always a valid pointer to a `struct pci_dev`. + Ok(unsafe { bindings::pci_resource_start(self.as_raw(), bar.try_into()?) }) + } + /// Returns the size of the given PCI bar resource. pub fn resource_len(&self, bar: u32) -> Result { if !Bar::index_is_valid(bar) { @@ -410,6 +507,13 @@ impl Device { // - by its type invariant `self.as_raw` is always a valid pointer to a `struct pci_dev`. Ok(unsafe { bindings::pci_resource_len(self.as_raw(), bar.try_into()?) }) } + + /// Returns the PCI class as a `Class` struct. + #[inline] + pub fn pci_class(&self) -> Class { + // SAFETY: `self.as_raw` is a valid pointer to a `struct pci_dev`. + Class::from_raw(unsafe { (*self.as_raw()).class }) + } } impl Device { @@ -431,6 +535,47 @@ impl Device { ) -> impl PinInit, Error> + 'a { self.iomap_region_sized::<0>(bar, name) } + + /// Returns an [`IrqRequest`] for the IRQ vector at the given index, if any. + pub fn irq_vector(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct pci_dev`. + let irq = unsafe { crate::bindings::pci_irq_vector(self.as_raw(), index) }; + if irq < 0 { + return Err(crate::error::Error::from_errno(irq)); + } + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns a [`kernel::irq::Registration`] for the IRQ vector at the given + /// index. + pub fn request_irq<'a, T: crate::irq::Handler + 'static>( + &'a self, + index: u32, + flags: irq::Flags, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.irq_vector(index)?; + + Ok(irq::Registration::::new(request, flags, name, handler)) + } + + /// Returns a [`kernel::irq::ThreadedRegistration`] for the IRQ vector at + /// the given index. + pub fn request_threaded_irq<'a, T: crate::irq::ThreadedHandler + 'static>( + &'a self, + index: u32, + flags: irq::Flags, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.irq_vector(index)?; + + Ok(irq::ThreadedRegistration::::new( + request, flags, name, handler, + )) + } } impl Device { @@ -441,6 +586,7 @@ impl Device { } /// Enable bus-mastering for this device. + #[inline] pub fn set_master(&self) { // SAFETY: `self.as_raw` is guaranteed to be a pointer to a valid `struct pci_dev`. 
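// A minimal sketch exercising the new accessors (assumption: `pdev` is a
// valid `pci::Device` reference, e.g. inside probe):
use kernel::{pci, prelude::*};

fn log_device(pdev: &pci::Device) -> Result {
    let start = pdev.resource_start(0)?;
    let len = pdev.resource_len(0)?;
    dev_info!(
        pdev.as_ref(),
        "rev=0x{:x} devfn=0x{:x} subsys={:04x}:{:04x} BAR0=[0x{:x}, len 0x{:x}]\n",
        pdev.revision_id(),
        pdev.dev_id(),
        pdev.subsystem_vendor_id(),
        pdev.subsystem_device_id(),
        start,
        len,
    );
    Ok(())
}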
unsafe { bindings::pci_set_master(self.as_raw()) }; @@ -455,7 +601,7 @@ kernel::impl_device_context_into_aref!(Device); impl crate::dma::Device for Device {} // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::pci_dev_get(self.as_raw()) }; diff --git a/rust/kernel/pci/id.rs b/rust/kernel/pci/id.rs new file mode 100644 index 00000000000000..7f2a7f57507f24 --- /dev/null +++ b/rust/kernel/pci/id.rs @@ -0,0 +1,578 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! PCI device identifiers and related types. +//! +//! This module contains PCI class codes, Vendor IDs, and supporting types. + +use crate::{bindings, error::code::EINVAL, error::Error, prelude::*}; +use core::fmt; + +/// PCI device class codes. +/// +/// Each entry contains the full 24-bit PCI class code (base class in bits +/// 23-16, subclass in bits 15-8, programming interface in bits 7-0). +/// +/// # Examples +/// +/// ``` +/// # use kernel::{device::Core, pci::{self, Class}, prelude::*}; +/// fn probe_device(pdev: &pci::Device) -> Result { +/// let pci_class = pdev.pci_class(); +/// dev_info!( +/// pdev.as_ref(), +/// "Detected PCI class: {}\n", +/// pci_class +/// ); +/// Ok(()) +/// } +/// ``` +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(transparent)] +pub struct Class(u32); + +/// PCI class mask constants for matching [`Class`] codes. +#[repr(u32)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClassMask { + /// Match the full 24-bit class code. + Full = 0xffffff, + /// Match the upper 16 bits of the class code (base class and subclass only) + ClassSubclass = 0xffff00, +} + +macro_rules! define_all_pci_classes { + ( + $($variant:ident = $binding:expr,)+ + ) => { + impl Class { + $( + #[allow(missing_docs)] + pub const $variant: Self = Self(Self::to_24bit_class($binding)); + )+ + } + + impl fmt::Display for Class { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $( + &Self::$variant => write!(f, stringify!($variant)), + )+ + _ => ::fmt(self, f), + } + } + } + }; +} + +/// Once constructed, a [`Class`] contains a valid PCI class code. +impl Class { + /// Create a [`Class`] from a raw 24-bit class code. + #[inline] + pub(super) fn from_raw(class_code: u32) -> Self { + Self(class_code) + } + + /// Get the raw 24-bit class code value. + #[inline] + pub const fn as_raw(self) -> u32 { + self.0 + } + + // Converts a PCI class constant to 24-bit format. + // + // Many device drivers use only the upper 16 bits (base class and subclass), + // but some use the full 24 bits. In order to support both cases, store the + // class code as a 24-bit value, where 16-bit values are shifted up 8 bits. + const fn to_24bit_class(val: u32) -> u32 { + if val > 0xFFFF { + val + } else { + val << 8 + } + } +} + +impl fmt::Debug for Class { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0x{:06x}", self.0) + } +} + +impl ClassMask { + /// Get the raw mask value. + #[inline] + pub const fn as_raw(self) -> u32 { + self as u32 + } +} + +impl TryFrom for ClassMask { + type Error = Error; + + fn try_from(value: u32) -> Result { + match value { + 0xffffff => Ok(ClassMask::Full), + 0xffff00 => Ok(ClassMask::ClassSubclass), + _ => Err(EINVAL), + } + } +} + +/// PCI vendor IDs. 
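// Standalone illustration of the normalisation rule implemented by the
// private `to_24bit_class()` helper above: 16-bit constants (base class plus
// subclass) are shifted up 8 bits, full 24-bit constants are kept as-is. A
// sketch mirroring the helper, with the checks done at compile time:
const fn to_24bit_class(val: u32) -> u32 {
    if val > 0xFFFF {
        val
    } else {
        val << 8
    }
}

const _: () = {
    // PCI_CLASS_NETWORK_ETHERNET is the 16-bit constant 0x0200.
    assert!(to_24bit_class(0x0200) == 0x020000);
    // PCI_CLASS_STORAGE_EXPRESS already carries a programming interface.
    assert!(to_24bit_class(0x010802) == 0x010802);
};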
+/// +/// Each entry contains the 16-bit PCI vendor ID as assigned by the PCI SIG. +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(transparent)] +pub struct Vendor(u16); + +macro_rules! define_all_pci_vendors { + ( + $($variant:ident = $binding:expr,)+ + ) => { + impl Vendor { + $( + #[allow(missing_docs)] + pub const $variant: Self = Self($binding as u16); + )+ + } + + impl fmt::Display for Vendor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $( + &Self::$variant => write!(f, stringify!($variant)), + )+ + _ => ::fmt(self, f), + } + } + } + }; +} + +/// Once constructed, a `Vendor` contains a valid PCI Vendor ID. +impl Vendor { + /// Create a Vendor from a raw 16-bit vendor ID. + #[inline] + pub(super) fn from_raw(vendor_id: u16) -> Self { + Self(vendor_id) + } + + /// Get the raw 16-bit vendor ID value. + #[inline] + pub const fn as_raw(self) -> u16 { + self.0 + } +} + +impl fmt::Debug for Vendor { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0x{:04x}", self.0) + } +} + +define_all_pci_classes! { + NOT_DEFINED = bindings::PCI_CLASS_NOT_DEFINED, // 0x000000 + NOT_DEFINED_VGA = bindings::PCI_CLASS_NOT_DEFINED_VGA, // 0x000100 + + STORAGE_SCSI = bindings::PCI_CLASS_STORAGE_SCSI, // 0x010000 + STORAGE_IDE = bindings::PCI_CLASS_STORAGE_IDE, // 0x010100 + STORAGE_FLOPPY = bindings::PCI_CLASS_STORAGE_FLOPPY, // 0x010200 + STORAGE_IPI = bindings::PCI_CLASS_STORAGE_IPI, // 0x010300 + STORAGE_RAID = bindings::PCI_CLASS_STORAGE_RAID, // 0x010400 + STORAGE_SATA = bindings::PCI_CLASS_STORAGE_SATA, // 0x010600 + STORAGE_SATA_AHCI = bindings::PCI_CLASS_STORAGE_SATA_AHCI, // 0x010601 + STORAGE_SAS = bindings::PCI_CLASS_STORAGE_SAS, // 0x010700 + STORAGE_EXPRESS = bindings::PCI_CLASS_STORAGE_EXPRESS, // 0x010802 + STORAGE_OTHER = bindings::PCI_CLASS_STORAGE_OTHER, // 0x018000 + + NETWORK_ETHERNET = bindings::PCI_CLASS_NETWORK_ETHERNET, // 0x020000 + NETWORK_TOKEN_RING = bindings::PCI_CLASS_NETWORK_TOKEN_RING, // 0x020100 + NETWORK_FDDI = bindings::PCI_CLASS_NETWORK_FDDI, // 0x020200 + NETWORK_ATM = bindings::PCI_CLASS_NETWORK_ATM, // 0x020300 + NETWORK_OTHER = bindings::PCI_CLASS_NETWORK_OTHER, // 0x028000 + + DISPLAY_VGA = bindings::PCI_CLASS_DISPLAY_VGA, // 0x030000 + DISPLAY_XGA = bindings::PCI_CLASS_DISPLAY_XGA, // 0x030100 + DISPLAY_3D = bindings::PCI_CLASS_DISPLAY_3D, // 0x030200 + DISPLAY_OTHER = bindings::PCI_CLASS_DISPLAY_OTHER, // 0x038000 + + MULTIMEDIA_VIDEO = bindings::PCI_CLASS_MULTIMEDIA_VIDEO, // 0x040000 + MULTIMEDIA_AUDIO = bindings::PCI_CLASS_MULTIMEDIA_AUDIO, // 0x040100 + MULTIMEDIA_PHONE = bindings::PCI_CLASS_MULTIMEDIA_PHONE, // 0x040200 + MULTIMEDIA_HD_AUDIO = bindings::PCI_CLASS_MULTIMEDIA_HD_AUDIO, // 0x040300 + MULTIMEDIA_OTHER = bindings::PCI_CLASS_MULTIMEDIA_OTHER, // 0x048000 + + MEMORY_RAM = bindings::PCI_CLASS_MEMORY_RAM, // 0x050000 + MEMORY_FLASH = bindings::PCI_CLASS_MEMORY_FLASH, // 0x050100 + MEMORY_CXL = bindings::PCI_CLASS_MEMORY_CXL, // 0x050200 + MEMORY_OTHER = bindings::PCI_CLASS_MEMORY_OTHER, // 0x058000 + + BRIDGE_HOST = bindings::PCI_CLASS_BRIDGE_HOST, // 0x060000 + BRIDGE_ISA = bindings::PCI_CLASS_BRIDGE_ISA, // 0x060100 + BRIDGE_EISA = bindings::PCI_CLASS_BRIDGE_EISA, // 0x060200 + BRIDGE_MC = bindings::PCI_CLASS_BRIDGE_MC, // 0x060300 + BRIDGE_PCI_NORMAL = bindings::PCI_CLASS_BRIDGE_PCI_NORMAL, // 0x060400 + BRIDGE_PCI_SUBTRACTIVE = bindings::PCI_CLASS_BRIDGE_PCI_SUBTRACTIVE, // 0x060401 + BRIDGE_PCMCIA = bindings::PCI_CLASS_BRIDGE_PCMCIA, // 0x060500 + BRIDGE_NUBUS = 
bindings::PCI_CLASS_BRIDGE_NUBUS, // 0x060600 + BRIDGE_CARDBUS = bindings::PCI_CLASS_BRIDGE_CARDBUS, // 0x060700 + BRIDGE_RACEWAY = bindings::PCI_CLASS_BRIDGE_RACEWAY, // 0x060800 + BRIDGE_OTHER = bindings::PCI_CLASS_BRIDGE_OTHER, // 0x068000 + + COMMUNICATION_SERIAL = bindings::PCI_CLASS_COMMUNICATION_SERIAL, // 0x070000 + COMMUNICATION_PARALLEL = bindings::PCI_CLASS_COMMUNICATION_PARALLEL, // 0x070100 + COMMUNICATION_MULTISERIAL = bindings::PCI_CLASS_COMMUNICATION_MULTISERIAL, // 0x070200 + COMMUNICATION_MODEM = bindings::PCI_CLASS_COMMUNICATION_MODEM, // 0x070300 + COMMUNICATION_OTHER = bindings::PCI_CLASS_COMMUNICATION_OTHER, // 0x078000 + + SYSTEM_PIC = bindings::PCI_CLASS_SYSTEM_PIC, // 0x080000 + SYSTEM_PIC_IOAPIC = bindings::PCI_CLASS_SYSTEM_PIC_IOAPIC, // 0x080010 + SYSTEM_PIC_IOXAPIC = bindings::PCI_CLASS_SYSTEM_PIC_IOXAPIC, // 0x080020 + SYSTEM_DMA = bindings::PCI_CLASS_SYSTEM_DMA, // 0x080100 + SYSTEM_TIMER = bindings::PCI_CLASS_SYSTEM_TIMER, // 0x080200 + SYSTEM_RTC = bindings::PCI_CLASS_SYSTEM_RTC, // 0x080300 + SYSTEM_PCI_HOTPLUG = bindings::PCI_CLASS_SYSTEM_PCI_HOTPLUG, // 0x080400 + SYSTEM_SDHCI = bindings::PCI_CLASS_SYSTEM_SDHCI, // 0x080500 + SYSTEM_RCEC = bindings::PCI_CLASS_SYSTEM_RCEC, // 0x080700 + SYSTEM_OTHER = bindings::PCI_CLASS_SYSTEM_OTHER, // 0x088000 + + INPUT_KEYBOARD = bindings::PCI_CLASS_INPUT_KEYBOARD, // 0x090000 + INPUT_PEN = bindings::PCI_CLASS_INPUT_PEN, // 0x090100 + INPUT_MOUSE = bindings::PCI_CLASS_INPUT_MOUSE, // 0x090200 + INPUT_SCANNER = bindings::PCI_CLASS_INPUT_SCANNER, // 0x090300 + INPUT_GAMEPORT = bindings::PCI_CLASS_INPUT_GAMEPORT, // 0x090400 + INPUT_OTHER = bindings::PCI_CLASS_INPUT_OTHER, // 0x098000 + + DOCKING_GENERIC = bindings::PCI_CLASS_DOCKING_GENERIC, // 0x0a0000 + DOCKING_OTHER = bindings::PCI_CLASS_DOCKING_OTHER, // 0x0a8000 + + PROCESSOR_386 = bindings::PCI_CLASS_PROCESSOR_386, // 0x0b0000 + PROCESSOR_486 = bindings::PCI_CLASS_PROCESSOR_486, // 0x0b0100 + PROCESSOR_PENTIUM = bindings::PCI_CLASS_PROCESSOR_PENTIUM, // 0x0b0200 + PROCESSOR_ALPHA = bindings::PCI_CLASS_PROCESSOR_ALPHA, // 0x0b1000 + PROCESSOR_POWERPC = bindings::PCI_CLASS_PROCESSOR_POWERPC, // 0x0b2000 + PROCESSOR_MIPS = bindings::PCI_CLASS_PROCESSOR_MIPS, // 0x0b3000 + PROCESSOR_CO = bindings::PCI_CLASS_PROCESSOR_CO, // 0x0b4000 + + SERIAL_FIREWIRE = bindings::PCI_CLASS_SERIAL_FIREWIRE, // 0x0c0000 + SERIAL_FIREWIRE_OHCI = bindings::PCI_CLASS_SERIAL_FIREWIRE_OHCI, // 0x0c0010 + SERIAL_ACCESS = bindings::PCI_CLASS_SERIAL_ACCESS, // 0x0c0100 + SERIAL_SSA = bindings::PCI_CLASS_SERIAL_SSA, // 0x0c0200 + SERIAL_USB_UHCI = bindings::PCI_CLASS_SERIAL_USB_UHCI, // 0x0c0300 + SERIAL_USB_OHCI = bindings::PCI_CLASS_SERIAL_USB_OHCI, // 0x0c0310 + SERIAL_USB_EHCI = bindings::PCI_CLASS_SERIAL_USB_EHCI, // 0x0c0320 + SERIAL_USB_XHCI = bindings::PCI_CLASS_SERIAL_USB_XHCI, // 0x0c0330 + SERIAL_USB_CDNS = bindings::PCI_CLASS_SERIAL_USB_CDNS, // 0x0c0380 + SERIAL_USB_DEVICE = bindings::PCI_CLASS_SERIAL_USB_DEVICE, // 0x0c03fe + SERIAL_FIBER = bindings::PCI_CLASS_SERIAL_FIBER, // 0x0c0400 + SERIAL_SMBUS = bindings::PCI_CLASS_SERIAL_SMBUS, // 0x0c0500 + SERIAL_IPMI_SMIC = bindings::PCI_CLASS_SERIAL_IPMI_SMIC, // 0x0c0700 + SERIAL_IPMI_KCS = bindings::PCI_CLASS_SERIAL_IPMI_KCS, // 0x0c0701 + SERIAL_IPMI_BT = bindings::PCI_CLASS_SERIAL_IPMI_BT, // 0x0c0702 + + WIRELESS_RF_CONTROLLER = bindings::PCI_CLASS_WIRELESS_RF_CONTROLLER, // 0x0d1000 + WIRELESS_WHCI = bindings::PCI_CLASS_WIRELESS_WHCI, // 0x0d1010 + + INTELLIGENT_I2O = bindings::PCI_CLASS_INTELLIGENT_I2O, // 0x0e0000 + + 
SATELLITE_TV = bindings::PCI_CLASS_SATELLITE_TV, // 0x0f0000 + SATELLITE_AUDIO = bindings::PCI_CLASS_SATELLITE_AUDIO, // 0x0f0100 + SATELLITE_VOICE = bindings::PCI_CLASS_SATELLITE_VOICE, // 0x0f0300 + SATELLITE_DATA = bindings::PCI_CLASS_SATELLITE_DATA, // 0x0f0400 + + CRYPT_NETWORK = bindings::PCI_CLASS_CRYPT_NETWORK, // 0x100000 + CRYPT_ENTERTAINMENT = bindings::PCI_CLASS_CRYPT_ENTERTAINMENT, // 0x100100 + CRYPT_OTHER = bindings::PCI_CLASS_CRYPT_OTHER, // 0x108000 + + SP_DPIO = bindings::PCI_CLASS_SP_DPIO, // 0x110000 + SP_OTHER = bindings::PCI_CLASS_SP_OTHER, // 0x118000 + + ACCELERATOR_PROCESSING = bindings::PCI_CLASS_ACCELERATOR_PROCESSING, // 0x120000 + + OTHERS = bindings::PCI_CLASS_OTHERS, // 0xff0000 +} + +define_all_pci_vendors! { + PCI_SIG = bindings::PCI_VENDOR_ID_PCI_SIG, // 0x0001 + LOONGSON = bindings::PCI_VENDOR_ID_LOONGSON, // 0x0014 + SOLIDIGM = bindings::PCI_VENDOR_ID_SOLIDIGM, // 0x025e + TTTECH = bindings::PCI_VENDOR_ID_TTTECH, // 0x0357 + DYNALINK = bindings::PCI_VENDOR_ID_DYNALINK, // 0x0675 + UBIQUITI = bindings::PCI_VENDOR_ID_UBIQUITI, // 0x0777 + BERKOM = bindings::PCI_VENDOR_ID_BERKOM, // 0x0871 + ITTIM = bindings::PCI_VENDOR_ID_ITTIM, // 0x0b48 + COMPAQ = bindings::PCI_VENDOR_ID_COMPAQ, // 0x0e11 + LSI_LOGIC = bindings::PCI_VENDOR_ID_LSI_LOGIC, // 0x1000 + ATI = bindings::PCI_VENDOR_ID_ATI, // 0x1002 + VLSI = bindings::PCI_VENDOR_ID_VLSI, // 0x1004 + ADL = bindings::PCI_VENDOR_ID_ADL, // 0x1005 + NS = bindings::PCI_VENDOR_ID_NS, // 0x100b + TSENG = bindings::PCI_VENDOR_ID_TSENG, // 0x100c + WEITEK = bindings::PCI_VENDOR_ID_WEITEK, // 0x100e + DEC = bindings::PCI_VENDOR_ID_DEC, // 0x1011 + CIRRUS = bindings::PCI_VENDOR_ID_CIRRUS, // 0x1013 + IBM = bindings::PCI_VENDOR_ID_IBM, // 0x1014 + UNISYS = bindings::PCI_VENDOR_ID_UNISYS, // 0x1018 + COMPEX2 = bindings::PCI_VENDOR_ID_COMPEX2, // 0x101a + WD = bindings::PCI_VENDOR_ID_WD, // 0x101c + AMI = bindings::PCI_VENDOR_ID_AMI, // 0x101e + AMD = bindings::PCI_VENDOR_ID_AMD, // 0x1022 + TRIDENT = bindings::PCI_VENDOR_ID_TRIDENT, // 0x1023 + AI = bindings::PCI_VENDOR_ID_AI, // 0x1025 + DELL = bindings::PCI_VENDOR_ID_DELL, // 0x1028 + MATROX = bindings::PCI_VENDOR_ID_MATROX, // 0x102B + MOBILITY_ELECTRONICS = bindings::PCI_VENDOR_ID_MOBILITY_ELECTRONICS, // 0x14f2 + CT = bindings::PCI_VENDOR_ID_CT, // 0x102c + MIRO = bindings::PCI_VENDOR_ID_MIRO, // 0x1031 + NEC = bindings::PCI_VENDOR_ID_NEC, // 0x1033 + FD = bindings::PCI_VENDOR_ID_FD, // 0x1036 + SI = bindings::PCI_VENDOR_ID_SI, // 0x1039 + HP = bindings::PCI_VENDOR_ID_HP, // 0x103c + HP_3PAR = bindings::PCI_VENDOR_ID_HP_3PAR, // 0x1590 + PCTECH = bindings::PCI_VENDOR_ID_PCTECH, // 0x1042 + ASUSTEK = bindings::PCI_VENDOR_ID_ASUSTEK, // 0x1043 + DPT = bindings::PCI_VENDOR_ID_DPT, // 0x1044 + OPTI = bindings::PCI_VENDOR_ID_OPTI, // 0x1045 + ELSA = bindings::PCI_VENDOR_ID_ELSA, // 0x1048 + STMICRO = bindings::PCI_VENDOR_ID_STMICRO, // 0x104A + BUSLOGIC = bindings::PCI_VENDOR_ID_BUSLOGIC, // 0x104B + TI = bindings::PCI_VENDOR_ID_TI, // 0x104c + SONY = bindings::PCI_VENDOR_ID_SONY, // 0x104d + WINBOND2 = bindings::PCI_VENDOR_ID_WINBOND2, // 0x1050 + ANIGMA = bindings::PCI_VENDOR_ID_ANIGMA, // 0x1051 + EFAR = bindings::PCI_VENDOR_ID_EFAR, // 0x1055 + MOTOROLA = bindings::PCI_VENDOR_ID_MOTOROLA, // 0x1057 + PROMISE = bindings::PCI_VENDOR_ID_PROMISE, // 0x105a + FOXCONN = bindings::PCI_VENDOR_ID_FOXCONN, // 0x105b + UMC = bindings::PCI_VENDOR_ID_UMC, // 0x1060 + PICOPOWER = bindings::PCI_VENDOR_ID_PICOPOWER, // 0x1066 + MYLEX = bindings::PCI_VENDOR_ID_MYLEX, // 0x1069 + APPLE = 
bindings::PCI_VENDOR_ID_APPLE, // 0x106b + YAMAHA = bindings::PCI_VENDOR_ID_YAMAHA, // 0x1073 + QLOGIC = bindings::PCI_VENDOR_ID_QLOGIC, // 0x1077 + CYRIX = bindings::PCI_VENDOR_ID_CYRIX, // 0x1078 + CONTAQ = bindings::PCI_VENDOR_ID_CONTAQ, // 0x1080 + OLICOM = bindings::PCI_VENDOR_ID_OLICOM, // 0x108d + SUN = bindings::PCI_VENDOR_ID_SUN, // 0x108e + NI = bindings::PCI_VENDOR_ID_NI, // 0x1093 + CMD = bindings::PCI_VENDOR_ID_CMD, // 0x1095 + BROOKTREE = bindings::PCI_VENDOR_ID_BROOKTREE, // 0x109e + SGI = bindings::PCI_VENDOR_ID_SGI, // 0x10a9 + WINBOND = bindings::PCI_VENDOR_ID_WINBOND, // 0x10ad + PLX = bindings::PCI_VENDOR_ID_PLX, // 0x10b5 + MADGE = bindings::PCI_VENDOR_ID_MADGE, // 0x10b6 + THREECOM = bindings::PCI_VENDOR_ID_3COM, // 0x10b7 + AL = bindings::PCI_VENDOR_ID_AL, // 0x10b9 + NEOMAGIC = bindings::PCI_VENDOR_ID_NEOMAGIC, // 0x10c8 + TCONRAD = bindings::PCI_VENDOR_ID_TCONRAD, // 0x10da + ROHM = bindings::PCI_VENDOR_ID_ROHM, // 0x10db + NVIDIA = bindings::PCI_VENDOR_ID_NVIDIA, // 0x10de + IMS = bindings::PCI_VENDOR_ID_IMS, // 0x10e0 + AMCC = bindings::PCI_VENDOR_ID_AMCC, // 0x10e8 + AMPERE = bindings::PCI_VENDOR_ID_AMPERE, // 0x1def + INTERG = bindings::PCI_VENDOR_ID_INTERG, // 0x10ea + REALTEK = bindings::PCI_VENDOR_ID_REALTEK, // 0x10ec + XILINX = bindings::PCI_VENDOR_ID_XILINX, // 0x10ee + INIT = bindings::PCI_VENDOR_ID_INIT, // 0x1101 + CREATIVE = bindings::PCI_VENDOR_ID_CREATIVE, // 0x1102 + TTI = bindings::PCI_VENDOR_ID_TTI, // 0x1103 + SIGMA = bindings::PCI_VENDOR_ID_SIGMA, // 0x1105 + VIA = bindings::PCI_VENDOR_ID_VIA, // 0x1106 + SIEMENS = bindings::PCI_VENDOR_ID_SIEMENS, // 0x110A + VORTEX = bindings::PCI_VENDOR_ID_VORTEX, // 0x1119 + EF = bindings::PCI_VENDOR_ID_EF, // 0x111a + IDT = bindings::PCI_VENDOR_ID_IDT, // 0x111d + FORE = bindings::PCI_VENDOR_ID_FORE, // 0x1127 + PHILIPS = bindings::PCI_VENDOR_ID_PHILIPS, // 0x1131 + EICON = bindings::PCI_VENDOR_ID_EICON, // 0x1133 + CISCO = bindings::PCI_VENDOR_ID_CISCO, // 0x1137 + ZIATECH = bindings::PCI_VENDOR_ID_ZIATECH, // 0x1138 + SYSKONNECT = bindings::PCI_VENDOR_ID_SYSKONNECT, // 0x1148 + DIGI = bindings::PCI_VENDOR_ID_DIGI, // 0x114f + XIRCOM = bindings::PCI_VENDOR_ID_XIRCOM, // 0x115d + SERVERWORKS = bindings::PCI_VENDOR_ID_SERVERWORKS, // 0x1166 + ALTERA = bindings::PCI_VENDOR_ID_ALTERA, // 0x1172 + SBE = bindings::PCI_VENDOR_ID_SBE, // 0x1176 + TOSHIBA = bindings::PCI_VENDOR_ID_TOSHIBA, // 0x1179 + TOSHIBA_2 = bindings::PCI_VENDOR_ID_TOSHIBA_2, // 0x102f + ATTO = bindings::PCI_VENDOR_ID_ATTO, // 0x117c + RICOH = bindings::PCI_VENDOR_ID_RICOH, // 0x1180 + DLINK = bindings::PCI_VENDOR_ID_DLINK, // 0x1186 + ARTOP = bindings::PCI_VENDOR_ID_ARTOP, // 0x1191 + ZEITNET = bindings::PCI_VENDOR_ID_ZEITNET, // 0x1193 + FUJITSU_ME = bindings::PCI_VENDOR_ID_FUJITSU_ME, // 0x119e + MARVELL = bindings::PCI_VENDOR_ID_MARVELL, // 0x11ab + MARVELL_EXT = bindings::PCI_VENDOR_ID_MARVELL_EXT, // 0x1b4b + V3 = bindings::PCI_VENDOR_ID_V3, // 0x11b0 + ATT = bindings::PCI_VENDOR_ID_ATT, // 0x11c1 + SPECIALIX = bindings::PCI_VENDOR_ID_SPECIALIX, // 0x11cb + ANALOG_DEVICES = bindings::PCI_VENDOR_ID_ANALOG_DEVICES, // 0x11d4 + ZORAN = bindings::PCI_VENDOR_ID_ZORAN, // 0x11de + COMPEX = bindings::PCI_VENDOR_ID_COMPEX, // 0x11f6 + MICROSEMI = bindings::PCI_VENDOR_ID_MICROSEMI, // 0x11f8 + RP = bindings::PCI_VENDOR_ID_RP, // 0x11fe + CYCLADES = bindings::PCI_VENDOR_ID_CYCLADES, // 0x120e + ESSENTIAL = bindings::PCI_VENDOR_ID_ESSENTIAL, // 0x120f + O2 = bindings::PCI_VENDOR_ID_O2, // 0x1217 + THREEDX = bindings::PCI_VENDOR_ID_3DFX, // 0x121a + 
AVM = bindings::PCI_VENDOR_ID_AVM, // 0x1244 + STALLION = bindings::PCI_VENDOR_ID_STALLION, // 0x124d + AT = bindings::PCI_VENDOR_ID_AT, // 0x1259 + ASIX = bindings::PCI_VENDOR_ID_ASIX, // 0x125b + ESS = bindings::PCI_VENDOR_ID_ESS, // 0x125d + SATSAGEM = bindings::PCI_VENDOR_ID_SATSAGEM, // 0x1267 + ENSONIQ = bindings::PCI_VENDOR_ID_ENSONIQ, // 0x1274 + TRANSMETA = bindings::PCI_VENDOR_ID_TRANSMETA, // 0x1279 + ROCKWELL = bindings::PCI_VENDOR_ID_ROCKWELL, // 0x127A + ITE = bindings::PCI_VENDOR_ID_ITE, // 0x1283 + ALTEON = bindings::PCI_VENDOR_ID_ALTEON, // 0x12ae + NVIDIA_SGS = bindings::PCI_VENDOR_ID_NVIDIA_SGS, // 0x12d2 + PERICOM = bindings::PCI_VENDOR_ID_PERICOM, // 0x12D8 + AUREAL = bindings::PCI_VENDOR_ID_AUREAL, // 0x12eb + ELECTRONICDESIGNGMBH = bindings::PCI_VENDOR_ID_ELECTRONICDESIGNGMBH, // 0x12f8 + ESDGMBH = bindings::PCI_VENDOR_ID_ESDGMBH, // 0x12fe + CB = bindings::PCI_VENDOR_ID_CB, // 0x1307 + SIIG = bindings::PCI_VENDOR_ID_SIIG, // 0x131f + RADISYS = bindings::PCI_VENDOR_ID_RADISYS, // 0x1331 + MICRO_MEMORY = bindings::PCI_VENDOR_ID_MICRO_MEMORY, // 0x1332 + DOMEX = bindings::PCI_VENDOR_ID_DOMEX, // 0x134a + INTASHIELD = bindings::PCI_VENDOR_ID_INTASHIELD, // 0x135a + QUATECH = bindings::PCI_VENDOR_ID_QUATECH, // 0x135C + SEALEVEL = bindings::PCI_VENDOR_ID_SEALEVEL, // 0x135e + HYPERCOPE = bindings::PCI_VENDOR_ID_HYPERCOPE, // 0x1365 + DIGIGRAM = bindings::PCI_VENDOR_ID_DIGIGRAM, // 0x1369 + KAWASAKI = bindings::PCI_VENDOR_ID_KAWASAKI, // 0x136b + CNET = bindings::PCI_VENDOR_ID_CNET, // 0x1371 + LMC = bindings::PCI_VENDOR_ID_LMC, // 0x1376 + NETGEAR = bindings::PCI_VENDOR_ID_NETGEAR, // 0x1385 + APPLICOM = bindings::PCI_VENDOR_ID_APPLICOM, // 0x1389 + MOXA = bindings::PCI_VENDOR_ID_MOXA, // 0x1393 + CCD = bindings::PCI_VENDOR_ID_CCD, // 0x1397 + EXAR = bindings::PCI_VENDOR_ID_EXAR, // 0x13a8 + MICROGATE = bindings::PCI_VENDOR_ID_MICROGATE, // 0x13c0 + THREEWARE = bindings::PCI_VENDOR_ID_3WARE, // 0x13C1 + IOMEGA = bindings::PCI_VENDOR_ID_IOMEGA, // 0x13ca + ABOCOM = bindings::PCI_VENDOR_ID_ABOCOM, // 0x13D1 + SUNDANCE = bindings::PCI_VENDOR_ID_SUNDANCE, // 0x13f0 + CMEDIA = bindings::PCI_VENDOR_ID_CMEDIA, // 0x13f6 + ADVANTECH = bindings::PCI_VENDOR_ID_ADVANTECH, // 0x13fe + MEILHAUS = bindings::PCI_VENDOR_ID_MEILHAUS, // 0x1402 + LAVA = bindings::PCI_VENDOR_ID_LAVA, // 0x1407 + TIMEDIA = bindings::PCI_VENDOR_ID_TIMEDIA, // 0x1409 + ICE = bindings::PCI_VENDOR_ID_ICE, // 0x1412 + MICROSOFT = bindings::PCI_VENDOR_ID_MICROSOFT, // 0x1414 + OXSEMI = bindings::PCI_VENDOR_ID_OXSEMI, // 0x1415 + CHELSIO = bindings::PCI_VENDOR_ID_CHELSIO, // 0x1425 + EDIMAX = bindings::PCI_VENDOR_ID_EDIMAX, // 0x1432 + ADLINK = bindings::PCI_VENDOR_ID_ADLINK, // 0x144a + SAMSUNG = bindings::PCI_VENDOR_ID_SAMSUNG, // 0x144d + GIGABYTE = bindings::PCI_VENDOR_ID_GIGABYTE, // 0x1458 + AMBIT = bindings::PCI_VENDOR_ID_AMBIT, // 0x1468 + MYRICOM = bindings::PCI_VENDOR_ID_MYRICOM, // 0x14c1 + MEDIATEK = bindings::PCI_VENDOR_ID_MEDIATEK, // 0x14c3 + TITAN = bindings::PCI_VENDOR_ID_TITAN, // 0x14D2 + PANACOM = bindings::PCI_VENDOR_ID_PANACOM, // 0x14d4 + SIPACKETS = bindings::PCI_VENDOR_ID_SIPACKETS, // 0x14d9 + AFAVLAB = bindings::PCI_VENDOR_ID_AFAVLAB, // 0x14db + AMPLICON = bindings::PCI_VENDOR_ID_AMPLICON, // 0x14dc + BCM_GVC = bindings::PCI_VENDOR_ID_BCM_GVC, // 0x14a4 + BROADCOM = bindings::PCI_VENDOR_ID_BROADCOM, // 0x14e4 + TOPIC = bindings::PCI_VENDOR_ID_TOPIC, // 0x151f + MAINPINE = bindings::PCI_VENDOR_ID_MAINPINE, // 0x1522 + ENE = bindings::PCI_VENDOR_ID_ENE, // 0x1524 + SYBA = 
bindings::PCI_VENDOR_ID_SYBA, // 0x1592 + MORETON = bindings::PCI_VENDOR_ID_MORETON, // 0x15aa + VMWARE = bindings::PCI_VENDOR_ID_VMWARE, // 0x15ad + ZOLTRIX = bindings::PCI_VENDOR_ID_ZOLTRIX, // 0x15b0 + MELLANOX = bindings::PCI_VENDOR_ID_MELLANOX, // 0x15b3 + DFI = bindings::PCI_VENDOR_ID_DFI, // 0x15bd + QUICKNET = bindings::PCI_VENDOR_ID_QUICKNET, // 0x15e2 + ADDIDATA = bindings::PCI_VENDOR_ID_ADDIDATA, // 0x15B8 + PDC = bindings::PCI_VENDOR_ID_PDC, // 0x15e9 + FARSITE = bindings::PCI_VENDOR_ID_FARSITE, // 0x1619 + ARIMA = bindings::PCI_VENDOR_ID_ARIMA, // 0x161f + BROCADE = bindings::PCI_VENDOR_ID_BROCADE, // 0x1657 + SIBYTE = bindings::PCI_VENDOR_ID_SIBYTE, // 0x166d + ATHEROS = bindings::PCI_VENDOR_ID_ATHEROS, // 0x168c + NETCELL = bindings::PCI_VENDOR_ID_NETCELL, // 0x169c + CENATEK = bindings::PCI_VENDOR_ID_CENATEK, // 0x16CA + SYNOPSYS = bindings::PCI_VENDOR_ID_SYNOPSYS, // 0x16c3 + USR = bindings::PCI_VENDOR_ID_USR, // 0x16ec + VITESSE = bindings::PCI_VENDOR_ID_VITESSE, // 0x1725 + LINKSYS = bindings::PCI_VENDOR_ID_LINKSYS, // 0x1737 + ALTIMA = bindings::PCI_VENDOR_ID_ALTIMA, // 0x173b + CAVIUM = bindings::PCI_VENDOR_ID_CAVIUM, // 0x177d + TECHWELL = bindings::PCI_VENDOR_ID_TECHWELL, // 0x1797 + BELKIN = bindings::PCI_VENDOR_ID_BELKIN, // 0x1799 + RDC = bindings::PCI_VENDOR_ID_RDC, // 0x17f3 + GLI = bindings::PCI_VENDOR_ID_GLI, // 0x17a0 + LENOVO = bindings::PCI_VENDOR_ID_LENOVO, // 0x17aa + QCOM = bindings::PCI_VENDOR_ID_QCOM, // 0x17cb + CDNS = bindings::PCI_VENDOR_ID_CDNS, // 0x17cd + ARECA = bindings::PCI_VENDOR_ID_ARECA, // 0x17d3 + S2IO = bindings::PCI_VENDOR_ID_S2IO, // 0x17d5 + SITECOM = bindings::PCI_VENDOR_ID_SITECOM, // 0x182d + TOPSPIN = bindings::PCI_VENDOR_ID_TOPSPIN, // 0x1867 + COMMTECH = bindings::PCI_VENDOR_ID_COMMTECH, // 0x18f7 + SILAN = bindings::PCI_VENDOR_ID_SILAN, // 0x1904 + RENESAS = bindings::PCI_VENDOR_ID_RENESAS, // 0x1912 + SOLARFLARE = bindings::PCI_VENDOR_ID_SOLARFLARE, // 0x1924 + TDI = bindings::PCI_VENDOR_ID_TDI, // 0x192E + NXP = bindings::PCI_VENDOR_ID_NXP, // 0x1957 + PASEMI = bindings::PCI_VENDOR_ID_PASEMI, // 0x1959 + ATTANSIC = bindings::PCI_VENDOR_ID_ATTANSIC, // 0x1969 + JMICRON = bindings::PCI_VENDOR_ID_JMICRON, // 0x197B + KORENIX = bindings::PCI_VENDOR_ID_KORENIX, // 0x1982 + HUAWEI = bindings::PCI_VENDOR_ID_HUAWEI, // 0x19e5 + NETRONOME = bindings::PCI_VENDOR_ID_NETRONOME, // 0x19ee + QMI = bindings::PCI_VENDOR_ID_QMI, // 0x1a32 + AZWAVE = bindings::PCI_VENDOR_ID_AZWAVE, // 0x1a3b + REDHAT_QUMRANET = bindings::PCI_VENDOR_ID_REDHAT_QUMRANET, // 0x1af4 + ASMEDIA = bindings::PCI_VENDOR_ID_ASMEDIA, // 0x1b21 + REDHAT = bindings::PCI_VENDOR_ID_REDHAT, // 0x1b36 + WCHIC = bindings::PCI_VENDOR_ID_WCHIC, // 0x1c00 + SILICOM_DENMARK = bindings::PCI_VENDOR_ID_SILICOM_DENMARK, // 0x1c2c + AMAZON_ANNAPURNA_LABS = bindings::PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, // 0x1c36 + CIRCUITCO = bindings::PCI_VENDOR_ID_CIRCUITCO, // 0x1cc8 + AMAZON = bindings::PCI_VENDOR_ID_AMAZON, // 0x1d0f + ZHAOXIN = bindings::PCI_VENDOR_ID_ZHAOXIN, // 0x1d17 + ROCKCHIP = bindings::PCI_VENDOR_ID_ROCKCHIP, // 0x1d87 + HYGON = bindings::PCI_VENDOR_ID_HYGON, // 0x1d94 + META = bindings::PCI_VENDOR_ID_META, // 0x1d9b + FUNGIBLE = bindings::PCI_VENDOR_ID_FUNGIBLE, // 0x1dad + HXT = bindings::PCI_VENDOR_ID_HXT, // 0x1dbf + TEKRAM = bindings::PCI_VENDOR_ID_TEKRAM, // 0x1de1 + RPI = bindings::PCI_VENDOR_ID_RPI, // 0x1de4 + ALIBABA = bindings::PCI_VENDOR_ID_ALIBABA, // 0x1ded + CXL = bindings::PCI_VENDOR_ID_CXL, // 0x1e98 + TEHUTI = bindings::PCI_VENDOR_ID_TEHUTI, // 0x1fc9 + 
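// A sketch of the `Display`/`Debug` pair generated by
// `define_all_pci_vendors!`: a known vendor prints its constant name, while
// an ID without a named constant falls back to the hex `Debug` form (the
// logging helper itself is hypothetical):
use kernel::{pci::Vendor, prelude::*};

fn log_vendor(vendor: Vendor) {
    // For `Vendor::REDHAT` this prints "vendor: REDHAT (0x1b36)".
    pr_info!("vendor: {} ({:?})\n", vendor, vendor);
}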
SUNIX = bindings::PCI_VENDOR_ID_SUNIX, // 0x1fd4 + HINT = bindings::PCI_VENDOR_ID_HINT, // 0x3388 + THREEDLABS = bindings::PCI_VENDOR_ID_3DLABS, // 0x3d3d + NETXEN = bindings::PCI_VENDOR_ID_NETXEN, // 0x4040 + AKS = bindings::PCI_VENDOR_ID_AKS, // 0x416c + WCHCN = bindings::PCI_VENDOR_ID_WCHCN, // 0x4348 + ACCESSIO = bindings::PCI_VENDOR_ID_ACCESSIO, // 0x494f + S3 = bindings::PCI_VENDOR_ID_S3, // 0x5333 + DUNORD = bindings::PCI_VENDOR_ID_DUNORD, // 0x5544 + DCI = bindings::PCI_VENDOR_ID_DCI, // 0x6666 + GLENFLY = bindings::PCI_VENDOR_ID_GLENFLY, // 0x6766 + INTEL = bindings::PCI_VENDOR_ID_INTEL, // 0x8086 + WANGXUN = bindings::PCI_VENDOR_ID_WANGXUN, // 0x8088 + SCALEMP = bindings::PCI_VENDOR_ID_SCALEMP, // 0x8686 + COMPUTONE = bindings::PCI_VENDOR_ID_COMPUTONE, // 0x8e0e + KTI = bindings::PCI_VENDOR_ID_KTI, // 0x8e2e + ADAPTEC = bindings::PCI_VENDOR_ID_ADAPTEC, // 0x9004 + ADAPTEC2 = bindings::PCI_VENDOR_ID_ADAPTEC2, // 0x9005 + HOLTEK = bindings::PCI_VENDOR_ID_HOLTEK, // 0x9412 + NETMOS = bindings::PCI_VENDOR_ID_NETMOS, // 0x9710 + THREECOM_2 = bindings::PCI_VENDOR_ID_3COM_2, // 0xa727 + SOLIDRUN = bindings::PCI_VENDOR_ID_SOLIDRUN, // 0xd063 + DIGIUM = bindings::PCI_VENDOR_ID_DIGIUM, // 0xd161 + TIGERJET = bindings::PCI_VENDOR_ID_TIGERJET, // 0xe159 + XILINX_RME = bindings::PCI_VENDOR_ID_XILINX_RME, // 0xea60 + XEN = bindings::PCI_VENDOR_ID_XEN, // 0x5853 + OCZ = bindings::PCI_VENDOR_ID_OCZ, // 0x1b85 + NCUBE = bindings::PCI_VENDOR_ID_NCUBE, // 0x10ff +} diff --git a/rust/kernel/pid_namespace.rs b/rust/kernel/pid_namespace.rs index 0e93808e4639b3..979a9718f153d8 100644 --- a/rust/kernel/pid_namespace.rs +++ b/rust/kernel/pid_namespace.rs @@ -7,10 +7,7 @@ //! C header: [`include/linux/pid_namespace.h`](srctree/include/linux/pid_namespace.h) and //! [`include/linux/pid.h`](srctree/include/linux/pid.h) -use crate::{ - bindings, - types::{AlwaysRefCounted, Opaque}, -}; +use crate::{bindings, sync::aref::AlwaysRefCounted, types::Opaque}; use core::ptr; /// Wraps the kernel's `struct pid_namespace`. Thread safe. diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index 8f028c76f9fa61..7205fe3416d36c 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -10,6 +10,7 @@ use crate::{ driver, error::{from_result, to_result, Result}, io::{mem::IoRequest, Resource}, + irq::{self, IrqRequest}, of, prelude::*, types::Opaque, @@ -284,6 +285,181 @@ impl Device { } } +macro_rules! define_irq_accessor_by_index { + ( + $(#[$meta:meta])* $fn_name:ident, + $request_fn:ident, + $reg_type:ident, + $handler_trait:ident + ) => { + $(#[$meta])* + pub fn $fn_name<'a, T: irq::$handler_trait + 'static>( + &'a self, + flags: irq::Flags, + index: u32, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.$request_fn(index)?; + + Ok(irq::$reg_type::::new( + request, + flags, + name, + handler, + )) + } + }; +} + +macro_rules! define_irq_accessor_by_name { + ( + $(#[$meta:meta])* $fn_name:ident, + $request_fn:ident, + $reg_type:ident, + $handler_trait:ident + ) => { + $(#[$meta])* + pub fn $fn_name<'a, T: irq::$handler_trait + 'static>( + &'a self, + flags: irq::Flags, + irq_name: &CStr, + name: &'static CStr, + handler: impl PinInit + 'a, + ) -> Result, Error> + 'a> { + let request = self.$request_fn(irq_name)?; + + Ok(irq::$reg_type::::new( + request, + flags, + name, + handler, + )) + } + }; +} + +impl Device { + /// Returns an [`IrqRequest`] for the IRQ at the given index, if any. 
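// For reference, one `define_irq_accessor_by_index!` invocation below expands
// to approximately the following method (a sketch with the metavariables
// substituted; only the doc text, method name, accessor and
// handler/registration types differ between invocations, and the `Bound`
// device context is an assumption mirroring the PCI counterparts):
impl Device<Bound> {
    pub fn request_irq_by_index<'a, T: irq::Handler + 'static>(
        &'a self,
        flags: irq::Flags,
        index: u32,
        name: &'static CStr,
        handler: impl PinInit<T> + 'a,
    ) -> Result<impl PinInit<irq::Registration<T>, Error> + 'a> {
        let request = self.irq_by_index(index)?;

        Ok(irq::Registration::<T>::new(request, flags, name, handler))
    }
}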
+ pub fn irq_by_index(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq(self.as_raw(), index) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ at the given index, but does not + /// print an error if the IRQ cannot be obtained. + pub fn optional_irq_by_index(&self, index: u32) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq_optional(self.as_raw(), index) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ with the given name, if any. + pub fn irq_by_name(&self, name: &CStr) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { bindings::platform_get_irq_byname(self.as_raw(), name.as_char_ptr()) }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + /// Returns an [`IrqRequest`] for the IRQ with the given name, but does not + /// print an error if the IRQ cannot be obtained. + pub fn optional_irq_by_name(&self, name: &CStr) -> Result> { + // SAFETY: `self.as_raw` returns a valid pointer to a `struct platform_device`. + let irq = unsafe { + bindings::platform_get_irq_byname_optional(self.as_raw(), name.as_char_ptr()) + }; + + if irq < 0 { + return Err(Error::from_errno(irq)); + } + + // SAFETY: `irq` is guaranteed to be a valid IRQ number for `&self`. + Ok(unsafe { IrqRequest::new(self.as_ref(), irq as u32) }) + } + + define_irq_accessor_by_index!( + /// Returns a [`irq::Registration`] for the IRQ at the given index. + request_irq_by_index, + irq_by_index, + Registration, + Handler + ); + define_irq_accessor_by_name!( + /// Returns a [`irq::Registration`] for the IRQ with the given name. + request_irq_by_name, + irq_by_name, + Registration, + Handler + ); + define_irq_accessor_by_index!( + /// Does the same as [`Self::request_irq_by_index`], except that it does + /// not print an error message if the IRQ cannot be obtained. + request_optional_irq_by_index, + optional_irq_by_index, + Registration, + Handler + ); + define_irq_accessor_by_name!( + /// Does the same as [`Self::request_irq_by_name`], except that it does + /// not print an error message if the IRQ cannot be obtained. + request_optional_irq_by_name, + optional_irq_by_name, + Registration, + Handler + ); + + define_irq_accessor_by_index!( + /// Returns a [`irq::ThreadedRegistration`] for the IRQ at the given index. + request_threaded_irq_by_index, + irq_by_index, + ThreadedRegistration, + ThreadedHandler + ); + define_irq_accessor_by_name!( + /// Returns a [`irq::ThreadedRegistration`] for the IRQ with the given name. + request_threaded_irq_by_name, + irq_by_name, + ThreadedRegistration, + ThreadedHandler + ); + define_irq_accessor_by_index!( + /// Does the same as [`Self::request_threaded_irq_by_index`], except + /// that it does not print an error message if the IRQ cannot be + /// obtained. 
+ request_optional_threaded_irq_by_index, + optional_irq_by_index, + ThreadedRegistration, + ThreadedHandler + ); + define_irq_accessor_by_name!( + /// Does the same as [`Self::request_threaded_irq_by_name`], except that + /// it does not print an error message if the IRQ cannot be obtained. + request_optional_threaded_irq_by_name, + optional_irq_by_name, + ThreadedRegistration, + ThreadedHandler + ); +} + // SAFETY: `Device` is a transparent wrapper of a type that doesn't depend on `Device`'s generic // argument. kernel::impl_device_context_deref!(unsafe { Device }); @@ -292,7 +468,7 @@ kernel::impl_device_context_into_aref!(Device); impl crate::dma::Device for Device {} // SAFETY: Instances of `Device` are always reference-counted. -unsafe impl crate::types::AlwaysRefCounted for Device { +unsafe impl crate::sync::aref::AlwaysRefCounted for Device { fn inc_ref(&self) { // SAFETY: The existence of a shared reference guarantees that the refcount is non-zero. unsafe { bindings::get_device(self.as_ref().as_raw()) }; diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs index 25fe97aafd02a1..198d09a31449d8 100644 --- a/rust/kernel/prelude.rs +++ b/rust/kernel/prelude.rs @@ -12,7 +12,10 @@ //! ``` #[doc(no_inline)] -pub use core::pin::Pin; +pub use core::{ + mem::{align_of, align_of_val, size_of, size_of_val}, + pin::Pin, +}; pub use ::ffi::{ c_char, c_int, c_long, c_longlong, c_schar, c_short, c_uchar, c_uint, c_ulong, c_ulonglong, diff --git a/rust/kernel/processor.rs b/rust/kernel/processor.rs new file mode 100644 index 00000000000000..85b49b3614dd2d --- /dev/null +++ b/rust/kernel/processor.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Processor related primitives. +//! +//! C header: [`include/linux/processor.h`](srctree/include/linux/processor.h) + +/// Lower CPU power consumption or yield to a hyperthreaded twin processor. +/// +/// It also happens to serve as a compiler barrier. +#[inline] +pub fn cpu_relax() { + // SAFETY: Always safe to call. + unsafe { bindings::cpu_relax() } +} diff --git a/rust/kernel/ptr.rs b/rust/kernel/ptr.rs new file mode 100644 index 00000000000000..2e5e2a090480aa --- /dev/null +++ b/rust/kernel/ptr.rs @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Types and functions to work with pointers and addresses. + +use core::fmt::Debug; +use core::mem::align_of; +use core::num::NonZero; + +use crate::build_assert; + +/// Type representing an alignment, which is always a power of two. +/// +/// It is used to validate that a given value is a valid alignment, and to perform masking and +/// alignment operations. +/// +/// This is a temporary substitute for the [`Alignment`] nightly type from the standard library, +/// and to be eventually replaced by it. +/// +/// [`Alignment`]: https://github.com/rust-lang/rust/issues/102070 +/// +/// # Invariants +/// +/// An alignment is always a power of two. +#[repr(transparent)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Alignment(NonZero); + +impl Alignment { + /// Validates that `ALIGN` is a power of two at build-time, and returns an [`Alignment`] of the + /// same value. + /// + /// A build error is triggered if `ALIGN` is not a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// let v = Alignment::new::<16>(); + /// assert_eq!(v.as_usize(), 16); + /// ``` + #[inline(always)] + pub const fn new() -> Self { + build_assert!( + ALIGN.is_power_of_two(), + "Provided alignment is not a power of two." 
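// The canonical use of `cpu_relax()` is inside a busy-wait loop, as on the C
// side. A sketch (the polled condition is a hypothetical closure):
use kernel::processor::cpu_relax;

fn spin_until(mut ready: impl FnMut() -> bool) {
    while !ready() {
        // Hint that we are spinning; per the doc comment above, this also
        // serves as a compiler barrier.
        cpu_relax();
    }
}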
+ ); + + // INVARIANT: `align` is a power of two. + // SAFETY: `align` is a power of two, and thus non-zero. + Self(unsafe { NonZero::new_unchecked(ALIGN) }) + } + + /// Validates that `align` is a power of two at runtime, and returns an + /// [`Alignment`] of the same value. + /// + /// Returns [`None`] if `align` is not a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new_checked(16), Some(Alignment::new::<16>())); + /// assert_eq!(Alignment::new_checked(15), None); + /// assert_eq!(Alignment::new_checked(1), Some(Alignment::new::<1>())); + /// assert_eq!(Alignment::new_checked(0), None); + /// ``` + #[inline(always)] + pub const fn new_checked(align: usize) -> Option { + if align.is_power_of_two() { + // INVARIANT: `align` is a power of two. + // SAFETY: `align` is a power of two, and thus non-zero. + Some(Self(unsafe { NonZero::new_unchecked(align) })) + } else { + None + } + } + + /// Returns the alignment of `T`. + /// + /// This is equivalent to [`align_of`], but with the return value provided as an [`Alignment`]. + #[inline(always)] + pub const fn of() -> Self { + #![allow(clippy::incompatible_msrv)] + // This cannot panic since alignments are always powers of two. + // + // We unfortunately cannot use `new` as it would require the `generic_const_exprs` feature. + const { Alignment::new_checked(align_of::()).unwrap() } + } + + /// Returns this alignment as a [`usize`]. + /// + /// It is guaranteed to be a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<16>().as_usize(), 16); + /// ``` + #[inline(always)] + pub const fn as_usize(self) -> usize { + self.as_nonzero().get() + } + + /// Returns this alignment as a [`NonZero`]. + /// + /// It is guaranteed to be a power of two. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<16>().as_nonzero().get(), 16); + /// ``` + #[inline(always)] + pub const fn as_nonzero(self) -> NonZero { + // Allow the compiler to know that the value is indeed a power of two. This can help + // optimize some operations down the line, like e.g. replacing divisions by bit shifts. + if !self.0.is_power_of_two() { + // SAFETY: Per the invariants, `self.0` is always a power of two so this block will + // never be reached. + unsafe { core::hint::unreachable_unchecked() } + } + self.0 + } + + /// Returns the base-2 logarithm of the alignment. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::of::().log2(), 0); + /// assert_eq!(Alignment::new::<16>().log2(), 4); + /// ``` + #[inline(always)] + pub const fn log2(self) -> u32 { + self.0.ilog2() + } + + /// Returns the mask for this alignment. + /// + /// This is equivalent to `!(self.as_usize() - 1)`. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::Alignment; + /// + /// assert_eq!(Alignment::new::<0x10>().mask(), !0xf); + /// ``` + #[inline(always)] + pub const fn mask(self) -> usize { + // No underflow can occur as the alignment is guaranteed to be a power of two, and thus is + // non-zero. + !(self.as_usize() - 1) + } +} + +/// Trait for items that can be aligned against an [`Alignment`]. +pub trait Alignable: Sized { + /// Aligns `self` down to `alignment`. 
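// A short demonstration of the `Alignment` accessors above, complementing the
// doctests (a sketch; the values follow from the documented semantics):
use kernel::ptr::Alignment;

fn alignment_demo() {
    let page = Alignment::new::<0x1000>();
    assert_eq!(page.as_usize(), 0x1000);
    assert_eq!(page.log2(), 12);
    // `mask()` clears the low bits, so AND-ing aligns an address down.
    assert_eq!(0x1234 & page.mask(), 0x1000);
    // `of::<T>()` lifts `align_of::<T>()` into an `Alignment`.
    assert_eq!(Alignment::of::<u64>().as_usize(), core::mem::align_of::<u64>());
}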
+ /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::{Alignable, Alignment}; + /// + /// assert_eq!(0x2f_usize.align_down(Alignment::new::<0x10>()), 0x20); + /// assert_eq!(0x30usize.align_down(Alignment::new::<0x10>()), 0x30); + /// assert_eq!(0xf0u8.align_down(Alignment::new::<0x1000>()), 0x0); + /// ``` + fn align_down(self, alignment: Alignment) -> Self; + + /// Aligns `self` up to `alignment`, returning `None` if aligning would result in an overflow. + /// + /// # Examples + /// + /// ``` + /// use kernel::ptr::{Alignable, Alignment}; + /// + /// assert_eq!(0x4fusize.align_up(Alignment::new::<0x10>()), Some(0x50)); + /// assert_eq!(0x40usize.align_up(Alignment::new::<0x10>()), Some(0x40)); + /// assert_eq!(0x0usize.align_up(Alignment::new::<0x10>()), Some(0x0)); + /// assert_eq!(u8::MAX.align_up(Alignment::new::<0x10>()), None); + /// assert_eq!(0x10u8.align_up(Alignment::new::<0x100>()), None); + /// assert_eq!(0x0u8.align_up(Alignment::new::<0x100>()), Some(0x0)); + /// ``` + fn align_up(self, alignment: Alignment) -> Option; +} + +/// Implement [`Alignable`] for unsigned integer types. +macro_rules! impl_alignable_uint { + ($($t:ty),*) => { + $( + impl Alignable for $t { + #[inline(always)] + fn align_down(self, alignment: Alignment) -> Self { + // The operands of `&` need to be of the same type so convert the alignment to + // `Self`. This means we need to compute the mask ourselves. + ::core::num::NonZero::::try_from(alignment.as_nonzero()) + .map(|align| self & !(align.get() - 1)) + // An alignment larger than `Self` always aligns down to `0`. + .unwrap_or(0) + } + + #[inline(always)] + fn align_up(self, alignment: Alignment) -> Option { + let aligned_down = self.align_down(alignment); + if self == aligned_down { + Some(aligned_down) + } else { + Self::try_from(alignment.as_usize()) + .ok() + .and_then(|align| aligned_down.checked_add(align)) + } + } + } + )* + }; +} + +impl_alignable_uint!(u8, u16, u32, u64, usize); diff --git a/rust/kernel/regulator.rs b/rust/kernel/regulator.rs index 65f3a125348f2d..b55a201e5029fd 100644 --- a/rust/kernel/regulator.rs +++ b/rust/kernel/regulator.rs @@ -18,7 +18,7 @@ use crate::{ bindings, - device::Device, + device::{Bound, Device}, error::{from_err_ptr, to_result, Result}, prelude::*, }; @@ -30,7 +30,6 @@ mod private { impl Sealed for super::Enabled {} impl Sealed for super::Disabled {} - impl Sealed for super::Dynamic {} } /// A trait representing the different states a [`Regulator`] can be in. @@ -50,13 +49,6 @@ pub struct Enabled; /// own an `enable` reference count, but the regulator may still be on. pub struct Disabled; -/// A state that models the C API. The [`Regulator`] can be either enabled or -/// disabled, and the user is in control of the reference count. This is also -/// the default state. -/// -/// Use [`Regulator::is_enabled`] to check the regulator's current state. -pub struct Dynamic; - impl RegulatorState for Enabled { const DISABLE_ON_DROP: bool = true; } @@ -65,14 +57,9 @@ impl RegulatorState for Disabled { const DISABLE_ON_DROP: bool = false; } -impl RegulatorState for Dynamic { - const DISABLE_ON_DROP: bool = false; -} - /// A trait that abstracts the ability to check if a [`Regulator`] is enabled. pub trait IsEnabled: RegulatorState {} impl IsEnabled for Disabled {} -impl IsEnabled for Dynamic {} /// An error that can occur when trying to convert a [`Regulator`] between states. 
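// Worked examples for the unsigned-integer `Alignable` impls above, including
// the cross-width case the macro comments call out (a sketch):
use kernel::ptr::{Alignable, Alignment};

fn alignable_demo() {
    let sixteen = Alignment::new::<0x10>();
    assert_eq!(0x2f_usize.align_down(sixteen), 0x20);
    assert_eq!(0x2f_usize.align_up(sixteen), Some(0x30));

    // An alignment that does not fit in the value type divides no non-zero
    // value: everything aligns down to 0 and only 0 aligns up successfully.
    let big = Alignment::new::<0x100>();
    assert_eq!(0xf0_u8.align_down(big), 0);
    assert_eq!(0xf0_u8.align_up(big), None);
    assert_eq!(0x0_u8.align_up(big), Some(0));
}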
pub struct Error { @@ -82,6 +69,41 @@ pub struct Error { /// The regulator that caused the error, so that the operation may be retried. pub regulator: Regulator, } +/// Obtains and enables a [`devres`]-managed regulator for a device. +/// +/// This calls [`regulator_disable()`] and [`regulator_put()`] automatically on +/// driver detach. +/// +/// This API is identical to `devm_regulator_get_enable()`, and should be +/// preferred over the [`Regulator`] API if the caller only +/// cares about the regulator being enabled. +/// +/// [`devres`]: https://docs.kernel.org/driver-api/driver-model/devres.html +/// [`regulator_disable()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_disable +/// [`regulator_put()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_put +pub fn devm_enable(dev: &Device, name: &CStr) -> Result { + // SAFETY: `dev` is a valid and bound device, while `name` is a valid C + // string. + to_result(unsafe { bindings::devm_regulator_get_enable(dev.as_raw(), name.as_ptr()) }) +} + +/// Same as [`devm_enable`], but calls `devm_regulator_get_enable_optional` +/// instead. +/// +/// This obtains and enables a [`devres`]-managed regulator for a device, but +/// does not print a message nor provides a dummy if the regulator is not found. +/// +/// This calls [`regulator_disable()`] and [`regulator_put()`] automatically on +/// driver detach. +/// +/// [`devres`]: https://docs.kernel.org/driver-api/driver-model/devres.html +/// [`regulator_disable()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_disable +/// [`regulator_put()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_put +pub fn devm_enable_optional(dev: &Device, name: &CStr) -> Result { + // SAFETY: `dev` is a valid and bound device, while `name` is a valid C + // string. + to_result(unsafe { bindings::devm_regulator_get_enable_optional(dev.as_raw(), name.as_ptr()) }) +} /// A `struct regulator` abstraction. /// @@ -159,6 +181,29 @@ pub struct Error { /// } /// ``` /// +/// If a driver only cares about the regulator being on for as long it is bound +/// to a device, then it should use [`devm_enable`] or [`devm_enable_optional`]. +/// This should be the default use-case unless more fine-grained control over +/// the regulator's state is required. +/// +/// [`devm_enable`]: crate::regulator::devm_enable +/// [`devm_optional`]: crate::regulator::devm_enable_optional +/// +/// ``` +/// # use kernel::prelude::*; +/// # use kernel::c_str; +/// # use kernel::device::{Bound, Device}; +/// # use kernel::regulator; +/// fn enable(dev: &Device) -> Result { +/// // Obtain a reference to a (fictitious) regulator and enable it. This +/// // call only returns whether the operation succeeded. +/// regulator::devm_enable(dev, c_str!("vcc"))?; +/// +/// // The regulator will be disabled and put when `dev` is unbound. +/// Ok(()) +/// } +/// ``` +/// /// ## Disabling a regulator /// /// ``` @@ -183,64 +228,13 @@ pub struct Error { /// } /// ``` /// -/// ## Using [`Regulator`] -/// -/// This example mimics the behavior of the C API, where the user is in -/// control of the enabled reference count. This is useful for drivers that -/// might call enable and disable to manage the `enable` reference count at -/// runtime, perhaps as a result of `open()` and `close()` calls or whatever -/// other driver-specific or subsystem-specific hooks. 
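// A sketch of the optional variant in use (assumptions: `dev` is a bound
// device from probe, the supply name is hypothetical, and the C optional
// variant's `-ENODEV` convention applies when the supply is not described):
use kernel::{c_str, device::{Bound, Device}, prelude::*, regulator};

fn enable_optional_aux(dev: &Device<Bound>) -> Result {
    match regulator::devm_enable_optional(dev, c_str!("aux")) {
        Ok(()) => Ok(()),
        // Driver policy: an absent supply is acceptable, real failures are not.
        Err(e) if e.to_errno() == ENODEV.to_errno() => Ok(()),
        Err(e) => Err(e),
    }
}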
-/// -/// ``` -/// # use kernel::prelude::*; -/// # use kernel::c_str; -/// # use kernel::device::Device; -/// # use kernel::regulator::{Regulator, Dynamic}; -/// struct PrivateData { -/// regulator: Regulator, -/// } -/// -/// // A fictictious probe function that obtains a regulator and sets it up. -/// fn probe(dev: &Device) -> Result { -/// // Obtain a reference to a (fictitious) regulator. -/// let mut regulator = Regulator::::get(dev, c_str!("vcc"))?; -/// -/// Ok(PrivateData { regulator }) -/// } -/// -/// // A fictictious function that indicates that the device is going to be used. -/// fn open(dev: &Device, data: &mut PrivateData) -> Result { -/// // Increase the `enabled` reference count. -/// data.regulator.enable()?; -/// -/// Ok(()) -/// } -/// -/// fn close(dev: &Device, data: &mut PrivateData) -> Result { -/// // Decrease the `enabled` reference count. -/// data.regulator.disable()?; -/// -/// Ok(()) -/// } -/// -/// fn remove(dev: &Device, data: PrivateData) -> Result { -/// // `PrivateData` is dropped here, which will drop the -/// // `Regulator` in turn. -/// // -/// // The reference that was obtained by `regulator_get()` will be -/// // released, but it is up to the user to make sure that the number of calls -/// // to `enable()` and `disabled()` are balanced before this point. -/// Ok(()) -/// } -/// ``` -/// /// # Invariants /// /// - `inner` is a non-null wrapper over a pointer to a `struct /// regulator` obtained from [`regulator_get()`]. /// /// [`regulator_get()`]: https://docs.kernel.org/driver-api/regulator.html#c.regulator_get -pub struct Regulator +pub struct Regulator where State: RegulatorState, { @@ -267,11 +261,8 @@ impl Regulator { pub fn get_voltage(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. let voltage = unsafe { bindings::regulator_get_voltage(self.inner.as_ptr()) }; - if voltage < 0 { - Err(kernel::error::Error::from_errno(voltage)) - } else { - Ok(Voltage::from_microvolts(voltage)) - } + + to_result(voltage).map(|()| Voltage::from_microvolts(voltage)) } fn get_internal(dev: &Device, name: &CStr) -> Result> { @@ -289,12 +280,12 @@ impl Regulator { }) } - fn enable_internal(&mut self) -> Result { + fn enable_internal(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. to_result(unsafe { bindings::regulator_enable(self.inner.as_ptr()) }) } - fn disable_internal(&mut self) -> Result { + fn disable_internal(&self) -> Result { // SAFETY: Safe as per the type invariants of `Regulator`. to_result(unsafe { bindings::regulator_disable(self.inner.as_ptr()) }) } @@ -310,7 +301,7 @@ impl Regulator { pub fn try_into_enabled(self) -> Result, Error> { // We will be transferring the ownership of our `regulator_get()` count to // `Regulator`. - let mut regulator = ManuallyDrop::new(self); + let regulator = ManuallyDrop::new(self); regulator .enable_internal() @@ -339,7 +330,7 @@ impl Regulator { pub fn try_into_disabled(self) -> Result, Error> { // We will be transferring the ownership of our `regulator_get()` count // to `Regulator`. - let mut regulator = ManuallyDrop::new(self); + let regulator = ManuallyDrop::new(self); regulator .disable_internal() @@ -354,28 +345,6 @@ impl Regulator { } } -impl Regulator { - /// Obtains a [`Regulator`] instance from the system. The current state of - /// the regulator is unknown and it is up to the user to manage the enabled - /// reference count. 
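// A sketch of the typestate flow that replaces the removed `Dynamic` state:
// the enable count is now encoded in the type, and a failed transition hands
// the regulator back through the `Error` struct (the `error` field and the
// `Disabled` getter are assumptions consistent with the docs above):
use kernel::{c_str, device::Device, prelude::*};
use kernel::regulator::{Disabled, Regulator};

fn pulse_vcc(dev: &Device) -> Result {
    let disabled = Regulator::<Disabled>::get(dev, c_str!("vcc"))?;
    let enabled = match disabled.try_into_enabled() {
        Ok(r) => r,
        // `e.regulator` still owns the `Regulator<Disabled>`, so a caller
        // could retry rather than bail out.
        Err(e) => return Err(e.error),
    };
    // Dropping a `Regulator<Enabled>` disables it and puts the reference.
    drop(enabled);
    Ok(())
}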
- /// - /// This closely mimics the behavior of the C API and can be used to - /// dynamically manage the enabled reference count at runtime. - pub fn get(dev: &Device, name: &CStr) -> Result { - Regulator::get_internal(dev, name) - } - - /// Increases the `enabled` reference count. - pub fn enable(&mut self) -> Result { - self.enable_internal() - } - - /// Decreases the `enabled` reference count. - pub fn disable(&mut self) -> Result { - self.disable_internal() - } -} - impl Regulator { /// Checks if the regulator is enabled. pub fn is_enabled(&self) -> bool { @@ -398,6 +367,14 @@ impl Drop for Regulator { } } +// SAFETY: It is safe to send a `Regulator` across threads. In particular, a +// Regulator can be dropped from any thread. +unsafe impl Send for Regulator {} + +// SAFETY: It is safe to send a &Regulator across threads because the C side +// handles its own locking. +unsafe impl Sync for Regulator {} + /// A voltage. /// /// This type represents a voltage value in microvolts. diff --git a/rust/kernel/seq_file.rs b/rust/kernel/seq_file.rs index 8f199b1a3bb14d..59fbfc2473f81c 100644 --- a/rust/kernel/seq_file.rs +++ b/rust/kernel/seq_file.rs @@ -4,7 +4,7 @@ //! //! C header: [`include/linux/seq_file.h`](srctree/include/linux/seq_file.h) -use crate::{bindings, c_str, types::NotThreadSafe, types::Opaque}; +use crate::{bindings, c_str, fmt, types::NotThreadSafe, types::Opaque}; /// A utility for generating the contents of a seq file. #[repr(transparent)] @@ -31,7 +31,7 @@ impl SeqFile { /// Used by the [`seq_print`] macro. #[inline] - pub fn call_printf(&self, args: core::fmt::Arguments<'_>) { + pub fn call_printf(&self, args: fmt::Arguments<'_>) { // SAFETY: Passing a void pointer to `Arguments` is valid for `%pA`. unsafe { bindings::seq_printf( @@ -47,7 +47,7 @@ impl SeqFile { #[macro_export] macro_rules! seq_print { ($m:expr, $($arg:tt)+) => ( - $m.call_printf(format_args!($($arg)+)) + $m.call_printf($crate::prelude::fmt!($($arg)+)) ); } pub use seq_print; diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 00f9b558a3ade1..cf5b638a097d99 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -11,12 +11,15 @@ use pin_init; mod arc; pub mod aref; +pub mod atomic; +pub mod barrier; pub mod completion; mod condvar; pub mod lock; mod locked_by; pub mod poll; pub mod rcu; +mod refcount; pub use arc::{Arc, ArcBorrow, UniqueArc}; pub use completion::Completion; @@ -25,6 +28,7 @@ pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; pub use lock::spinlock::{new_spinlock, SpinLock, SpinLockGuard}; pub use locked_by::LockedBy; +pub use refcount::Refcount; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. #[repr(transparent)] diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index 63a66761d0c7d7..289f77abf415a2 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -8,7 +8,7 @@ //! threads. //! //! It is different from the standard library's [`Arc`] in a few ways: -//! 1. It is backed by the kernel's `refcount_t` type. +//! 1. It is backed by the kernel's [`Refcount`] type. //! 2. It does not support weak references, which allows it to be half the size. //! 3. It saturates the reference count instead of aborting when it goes over a threshold. //! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. 
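// A sketch of the newly exported `kernel::sync::Refcount` in isolation, using
// only the operations the `Arc` rework below relies on (`new`, `inc`,
// `dec_and_test`). It wraps the kernel's `refcount_t`, so increments saturate
// instead of overflowing:
use kernel::sync::Refcount;

struct Object {
    refcount: Refcount,
}

fn get(obj: &Object) {
    obj.refcount.inc();
}

fn put(obj: &Object) -> bool {
    // True once the last reference is gone and `obj` may be freed.
    obj.refcount.dec_and_test()
}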
@@ -18,16 +18,16 @@ use crate::{ alloc::{AllocError, Flags, KBox}, - bindings, ffi::c_void, + fmt, init::InPlaceInit, + sync::Refcount, try_init, - types::{ForeignOwnable, Opaque}, + types::ForeignOwnable, }; use core::{ alloc::Layout, borrow::{Borrow, BorrowMut}, - fmt, marker::PhantomData, mem::{ManuallyDrop, MaybeUninit}, ops::{Deref, DerefMut}, @@ -145,7 +145,7 @@ pub struct Arc { #[pin_data] #[repr(C)] struct ArcInner { - refcount: Opaque, + refcount: Refcount, data: T, } @@ -157,7 +157,7 @@ impl ArcInner { /// `ptr` must have been returned by a previous call to [`Arc::into_raw`], and the `Arc` must /// not yet have been destroyed. unsafe fn container_of(ptr: *const T) -> NonNull> { - let refcount_layout = Layout::new::(); + let refcount_layout = Layout::new::(); // SAFETY: The caller guarantees that the pointer is valid. let val_layout = Layout::for_value(unsafe { &*ptr }); // SAFETY: We're computing the layout of a real struct that existed when compiling this @@ -229,8 +229,7 @@ impl Arc { pub fn new(contents: T, flags: Flags) -> Result { // INVARIANT: The refcount is initialised to a non-zero value. let value = ArcInner { - // SAFETY: There are no safety requirements for this FFI call. - refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + refcount: Refcount::new(1), data: contents, }; @@ -321,7 +320,7 @@ impl Arc { /// use kernel::sync::{Arc, UniqueArc}; /// /// let arc = Arc::new(42, GFP_KERNEL)?; - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should succeed since refcount of `arc` is 1. /// assert!(unique_arc.is_some()); @@ -337,35 +336,30 @@ impl Arc { /// let arc = Arc::new(42, GFP_KERNEL)?; /// let another = arc.clone(); /// - /// let unique_arc = arc.into_unique_or_drop(); + /// let unique_arc = Arc::into_unique_or_drop(arc); /// /// // The above conversion should fail since refcount of `arc` is >1. /// assert!(unique_arc.is_none()); /// /// # Ok::<(), Error>(()) /// ``` - pub fn into_unique_or_drop(self) -> Option>> { + pub fn into_unique_or_drop(this: Self) -> Option>> { // We will manually manage the refcount in this method, so we disable the destructor. - let me = ManuallyDrop::new(self); + let this = ManuallyDrop::new(this); // SAFETY: We own a refcount, so the pointer is still valid. - let refcount = unsafe { me.ptr.as_ref() }.refcount.get(); + let refcount = unsafe { &this.ptr.as_ref().refcount }; // If the refcount reaches a non-zero value, then we have destroyed this `Arc` and will // return without further touching the `Arc`. If the refcount reaches zero, then there are // no other arcs, and we can create a `UniqueArc`. - // - // SAFETY: We own a refcount, so the pointer is not dangling. - let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; - if is_zero { - // SAFETY: We have exclusive access to the arc, so we can perform unsynchronized - // accesses to the refcount. - unsafe { core::ptr::write(refcount, bindings::REFCOUNT_INIT(1)) }; + if refcount.dec_and_test() { + refcount.set(1); // INVARIANT: We own the only refcount to this arc, so we may create a `UniqueArc`. We // must pin the `UniqueArc` because the values was previously in an `Arc`, and they pin // their values. Some(Pin::from(UniqueArc { - inner: ManuallyDrop::into_inner(me), + inner: ManuallyDrop::into_inner(this), })) } else { None @@ -373,10 +367,10 @@ impl Arc { } } -// SAFETY: The pointer returned by `into_foreign` comes from a well aligned -// pointer to `ArcInner`. 
+// SAFETY: The pointer returned by `into_foreign` was originally allocated as a
+// `KBox<ArcInner<T>>`, so that type is what determines the alignment.
 unsafe impl<T: 'static> ForeignOwnable for Arc<T> {
-    const FOREIGN_ALIGN: usize = core::mem::align_of::<ArcInner<T>>();
+    const FOREIGN_ALIGN: usize = <KBox<ArcInner<T>> as ForeignOwnable>::FOREIGN_ALIGN;
 
     type Borrowed<'a> = ArcBorrow<'a, T>;
     type BorrowedMut<'a> = Self::Borrowed<'a>;
@@ -456,14 +450,10 @@ impl<T: ?Sized> Borrow<T> for Arc<T> {
 impl<T: ?Sized> Clone for Arc<T> {
     fn clone(&self) -> Self {
-        // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is
-        // safe to dereference it.
-        let refcount = unsafe { self.ptr.as_ref() }.refcount.get();
-
-        // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero.
+        // INVARIANT: `Refcount` saturates the refcount, so it cannot overflow to zero.
         // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is
         // safe to increment the refcount.
-        unsafe { bindings::refcount_inc(refcount) };
+        unsafe { self.ptr.as_ref() }.refcount.inc();
 
         // SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`.
         unsafe { Self::from_inner(self.ptr) }
@@ -472,16 +462,10 @@ impl<T: ?Sized> Clone for Arc<T> {
 impl<T: ?Sized> Drop for Arc<T> {
     fn drop(&mut self) {
-        // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot
-        // touch `refcount` after it's decremented to a non-zero value because another thread/CPU
-        // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to
-        // freed/invalid memory as long as it is never dereferenced.
-        let refcount = unsafe { self.ptr.as_ref() }.refcount.get();
-
         // INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and
         // this instance is being dropped, so the broken invariant is not observable.
-        // SAFETY: Also by the type invariant, we are allowed to decrement the refcount.
-        let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) };
+        // SAFETY: By the type invariant, there is necessarily a reference to the object.
+        let is_zero = unsafe { self.ptr.as_ref() }.refcount.dec_and_test();
         if is_zero {
             // The count reached zero, we must free the memory.
             //
@@ -775,8 +759,7 @@ impl<T> UniqueArc<T> {
         // INVARIANT: The refcount is initialised to a non-zero value.
         let inner = KBox::try_init::<AllocError>(
             try_init!(ArcInner {
-                // SAFETY: There are no safety requirements for this FFI call.
-                refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }),
+                refcount: Refcount::new(1),
                 data <- pin_init::uninit::<T, AllocError>(),
             }? AllocError),
             flags,
diff --git a/rust/kernel/sync/aref.rs b/rust/kernel/sync/aref.rs
index dbd77bb68617ca..0d24a0432015d0 100644
--- a/rust/kernel/sync/aref.rs
+++ b/rust/kernel/sync/aref.rs
@@ -1,6 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
 
 //! Internal reference counting support.
+//!
+//! Many C types already have their own reference counting mechanism (e.g. by storing a
+//! `refcount_t`). This module provides support for directly using their internal reference count
+//! from Rust, instead of making users have to use an additional Rust reference count in the form
+//! of [`Arc`].
+//!
+//! The smart pointer [`ARef<T>`] acts similarly to [`Arc<T>`] in that it holds a refcount on the
+//! underlying object, but this refcount is internal to the object. It is essentially a Rust
+//! implementation of the `get_` and `put_` pattern used in C for reference counting.
+//!
+//! To make use of [`ARef<MyType>`], `MyType` needs to implement [`AlwaysRefCounted`]. It is a trait
for accessing the internal reference count of an object of the `MyType` type.
+//!
+//! [`Arc`]: crate::sync::Arc
+//! [`Arc<T>`]: crate::sync::Arc
 
 use core::{marker::PhantomData, mem::ManuallyDrop, ops::Deref, ptr::NonNull};
 
@@ -97,7 +112,7 @@ impl<T: AlwaysRefCounted> ARef<T> {
     ///
     /// ```
     /// use core::ptr::NonNull;
-    /// use kernel::types::{ARef, AlwaysRefCounted};
+    /// use kernel::sync::aref::{ARef, AlwaysRefCounted};
     ///
     /// struct Empty {}
     ///
diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs
new file mode 100644
index 00000000000000..016a6bcaf0807b
--- /dev/null
+++ b/rust/kernel/sync/atomic.rs
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Atomic primitives.
+//!
+//! These primitives have the same semantics as their C counterparts; the precise definitions of
+//! the semantics can be found at [`LKMM`]. Note that the Linux Kernel Memory (Consistency) Model
+//! is the only memory model for Rust code in the kernel, and Rust's own atomics should be avoided.
+//!
+//! # Data races
+//!
+//! [`LKMM`] atomics have different rules regarding data races:
+//!
+//! - A normal write from the C side is treated as an atomic write if
+//!   CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y.
+//! - Mixed-size atomic accesses don't cause data races.
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+
+mod internal;
+pub mod ordering;
+mod predefine;
+
+pub use internal::AtomicImpl;
+pub use ordering::{Acquire, Full, Relaxed, Release};
+
+use crate::build_error;
+use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr};
+use ordering::OrderingType;
+
+/// A memory location which can be safely modified from multiple execution contexts.
+///
+/// This has the same size, alignment and bit validity as the underlying type `T`. It also
+/// disables niche optimization for the same reason as [`UnsafeCell`].
+///
+/// The atomic operations are implemented in a way that is fully compatible with the [Linux Kernel
+/// Memory (Consistency) Model][LKMM], hence they should be modeled as the corresponding
+/// [`LKMM`][LKMM] atomic primitives. With the help of [`Atomic::from_ptr()`] and
+/// [`Atomic::as_ptr()`], this provides a way to interact with [C-side atomic operations]
+/// (including those without the `atomic` prefix, e.g. `READ_ONCE()`, `WRITE_ONCE()`,
+/// `smp_load_acquire()` and `smp_store_release()`).
+///
+/// # Invariants
+///
+/// `self.0` is a valid `T`.
+///
+/// [`UnsafeCell`]: core::cell::UnsafeCell
+/// [LKMM]: srctree/tools/memory-model/
+/// [C-side atomic operations]: srctree/Documentation/atomic_t.txt
+#[repr(transparent)]
+pub struct Atomic<T: AtomicType>(AtomicRepr<T::Repr>);
+
+// SAFETY: `Atomic<T>` is safe to share among execution contexts because all accesses are atomic.
+unsafe impl<T: AtomicType> Sync for Atomic<T> {}
+
+/// Types that support basic atomic operations.
+///
+/// # Round-trip transmutability
+///
+/// `T` is round-trip transmutable to `U` if and only if both of these properties hold:
+///
+/// - Any valid bit pattern for `T` is also a valid bit pattern for `U`.
+/// - Transmuting (e.g. using [`transmute()`]) a value of type `T` to `U` and then to `T` again
+///   yields a value that is in all aspects equivalent to the original value.
+///
+/// # Safety
+///
+/// - [`Self`] must have the same size and alignment as [`Self::Repr`].
+/// - [`Self`] must be [round-trip transmutable] to [`Self::Repr`].
+///
+/// Note that this is more relaxed than requiring the bi-directional transmutability (i.e.
+/// [`transmute()`] is always sound between `U` and `T`) because of the support for atomic
+/// variables over unit-only enums, see [Examples].
+///
+/// # Limitations
+///
+/// Because C primitives are used to implement the atomic operations, and a C function requires a
+/// valid object of a type to operate on (i.e. no `MaybeUninit<_>`), only types with all of their
+/// bits initialized can be passed at the Rust <-> C surface. As a result, types like `(u8, u16)`
+/// (padding bytes are uninitialized) are currently not supported.
+///
+/// # Examples
+///
+/// A unit-only enum that implements [`AtomicType`]:
+///
+/// ```
+/// use kernel::sync::atomic::{AtomicType, Atomic, Relaxed};
+///
+/// #[derive(Clone, Copy, PartialEq, Eq)]
+/// #[repr(i32)]
+/// enum State {
+///     Uninit = 0,
+///     Working = 1,
+///     Done = 2,
+/// }
+///
+/// // SAFETY: `State` and `i32` have the same size and alignment, and `State` is round-trip
+/// // transmutable to `i32`.
+/// unsafe impl AtomicType for State {
+///     type Repr = i32;
+/// }
+///
+/// let s = Atomic::new(State::Uninit);
+///
+/// assert_eq!(State::Uninit, s.load(Relaxed));
+/// ```
+/// [`transmute()`]: core::mem::transmute
+/// [round-trip transmutable]: AtomicType#round-trip-transmutability
+/// [Examples]: AtomicType#examples
+pub unsafe trait AtomicType: Sized + Send + Copy {
+    /// The backing atomic implementation type.
+    type Repr: AtomicImpl;
+}
+
+/// Types that support atomic add operations.
+///
+/// # Safety
+///
+// TODO: Properly define `wrapping_add` in the following comment.
+/// Adding (via `wrapping_add`) any value of type `Self::Repr::Delta` obtained by
+/// [`Self::rhs_into_delta()`] to any value of type `Self::Repr` obtained by transmuting a value
+/// of type `Self` must yield a value with a bit pattern that is also valid for `Self`.
+pub unsafe trait AtomicAdd<Rhs = Self>: AtomicType {
+    /// Converts `Rhs` into the `Delta` type of the atomic implementation.
+    fn rhs_into_delta(rhs: Rhs) -> <Self::Repr as AtomicImpl>::Delta;
+}
+
+#[inline(always)]
+const fn into_repr<T: AtomicType>(v: T) -> T::Repr {
+    // SAFETY: Per the safety requirement of `AtomicType`, `T` is round-trip transmutable to
+    // `T::Repr`, therefore the transmute operation is sound.
+    unsafe { core::mem::transmute_copy(&v) }
+}
+
+/// # Safety
+///
+/// `r` must be a valid bit pattern of `T`.
+#[inline(always)]
+const unsafe fn from_repr<T: AtomicType>(r: T::Repr) -> T {
+    // SAFETY: Per the safety requirement of the function, the transmute operation is sound.
+    unsafe { core::mem::transmute_copy(&r) }
+}
+
+impl<T: AtomicType> Atomic<T> {
+    /// Creates a new atomic `T`.
+    pub const fn new(v: T) -> Self {
+        // INVARIANT: Per the safety requirement of `AtomicType`, `into_repr(v)` is a valid `T`.
+        Self(AtomicRepr::new(into_repr(v)))
+    }
+
+    /// Creates a reference to an atomic `T` from a pointer of `T`.
+    ///
+    /// This is usually used when communicating with the C side or manipulating a C struct; see
+    /// the examples below.
+    ///
+    /// # Safety
+    ///
+    /// - `ptr` is aligned to `align_of::<T>()`.
+    /// - `ptr` is valid for reads and writes for `'a`.
+    /// - For the duration of `'a`, other accesses to `*ptr` must not cause data races (defined
+    ///   by [`LKMM`]) against atomic operations on the returned reference. Note that if all other
+    ///   accesses are atomic, then this safety requirement is trivially fulfilled.
+    ///
+    /// [`LKMM`]: srctree/tools/memory-model
+    ///
+    /// # Examples
+    ///
+    /// Using [`Atomic::from_ptr()`] combined with [`Atomic::load()`] or [`Atomic::store()`] can
+    /// achieve the same functionality as `READ_ONCE()`/`smp_load_acquire()` or
+    /// `WRITE_ONCE()`/`smp_store_release()` on the C side:
+    ///
+    /// ```
+    /// # use kernel::types::Opaque;
+    /// use kernel::sync::atomic::{Atomic, Relaxed, Release};
+    ///
+    /// // Assume there is a C struct `foo`.
+    /// mod cbindings {
+    ///     #[repr(C)]
+    ///     pub(crate) struct foo {
+    ///         pub(crate) a: i32,
+    ///         pub(crate) b: i32
+    ///     }
+    /// }
+    ///
+    /// let tmp = Opaque::new(cbindings::foo { a: 1, b: 2 });
+    ///
+    /// // struct foo *foo_ptr = ..;
+    /// let foo_ptr = tmp.get();
+    ///
+    /// // SAFETY: `foo_ptr` is valid, and `.a` is in bounds.
+    /// let foo_a_ptr = unsafe { &raw mut (*foo_ptr).a };
+    ///
+    /// // a = READ_ONCE(foo_ptr->a);
+    /// //
+    /// // SAFETY: `foo_a_ptr` is valid for reads, and all other accesses to it are atomic, so
+    /// // there is no data race.
+    /// let a = unsafe { Atomic::from_ptr(foo_a_ptr) }.load(Relaxed);
+    /// # assert_eq!(a, 1);
+    ///
+    /// // smp_store_release(&foo_ptr->a, 2);
+    /// //
+    /// // SAFETY: `foo_a_ptr` is valid for writes, and all other accesses to it are atomic, so
+    /// // there is no data race.
+    /// unsafe { Atomic::from_ptr(foo_a_ptr) }.store(2, Release);
+    /// ```
+    pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self
+    where
+        T: Sync,
+    {
+        // CAST: `T` and `Atomic<T>` have the same size, alignment and bit validity.
+        // SAFETY: Per the function safety requirements, `ptr` is a valid pointer and the object
+        // will live long enough. It's safe to return a `&Atomic<T>` because the function safety
+        // requirements guarantee that other accesses won't cause data races.
+        unsafe { &*ptr.cast::<Self>() }
+    }
+
+    /// Returns a pointer to the underlying atomic `T`.
+    ///
+    /// Note that uses of the returned pointer must not cause data races as defined by [`LKMM`].
+    ///
+    /// # Guarantees
+    ///
+    /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]).
+    ///
+    /// [`LKMM`]: srctree/tools/memory-model
+    /// [`align_of::<T>()`]: core::mem::align_of
+    pub const fn as_ptr(&self) -> *mut T {
+        // GUARANTEE: Per the function guarantee of `AtomicRepr::as_ptr()`, `self.0.as_ptr()` must
+        // be a valid and properly aligned pointer for `T::Repr`, and per the safety guarantee of
+        // `AtomicType`, it's a valid and properly aligned pointer of `T`.
+        self.0.as_ptr().cast()
+    }
+
+    /// Returns a mutable reference to the underlying atomic `T`.
+    ///
+    /// This is safe because the mutable reference of the atomic `T` guarantees exclusive access.
+    pub fn get_mut(&mut self) -> &mut T {
+        // CAST: `T` and `T::Repr` have the same size and alignment per the safety requirement of
+        // `AtomicType`, and per the type invariants `self.0` is a valid `T`, therefore the cast
+        // result is a valid pointer of `T`.
+        // SAFETY: The pointer is valid per the CAST comment above, and the mutable reference
+        // guarantees exclusive access.
+        unsafe { &mut *self.0.as_ptr().cast() }
+    }
+}
+
+impl<T: AtomicType> Atomic<T>
+where
+    T::Repr: AtomicBasicOps,
+{
+    /// Loads the value from the atomic `T`.
+ /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// let x = Atomic::new(42i64); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_read", "atomic64_read"))] + #[inline(always)] + pub fn load(&self, _: Ordering) -> T { + let v = { + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_read(&self.0), + OrderingType::Acquire => T::Repr::atomic_read_acquire(&self.0), + _ => build_error!("Wrong ordering"), + } + }; + + // SAFETY: `v` comes from reading `self.0`, which is a valid `T` per the type invariants. + unsafe { from_repr(v) } + } + + /// Stores a value to the atomic `T`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42i32); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.store(43, Relaxed); + /// + /// assert_eq!(43, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_set", "atomic64_set"))] + #[inline(always)] + pub fn store(&self, v: T, _: Ordering) { + let v = into_repr(v); + + // INVARIANT: `v` is a valid `T`, and is stored to `self.0` by `atomic_set*()`. + match Ordering::TYPE { + OrderingType::Relaxed => T::Repr::atomic_set(&self.0, v), + OrderingType::Release => T::Repr::atomic_set_release(&self.0, v), + _ => build_error!("Wrong ordering"), + } + } +} + +impl Atomic +where + T::Repr: AtomicExchangeOps, +{ + /// Atomic exchange. + /// + /// Atomically updates `*self` to `v` and returns the old value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.xchg(52, Acquire)); + /// assert_eq!(52, x.load(Relaxed)); + /// ``` + #[doc(alias("atomic_xchg", "atomic64_xchg", "swap"))] + #[inline(always)] + pub fn xchg(&self, v: T, _: Ordering) -> T { + let v = into_repr(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_xchg*()` because `v` is transmutable to + // `T`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_xchg(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_xchg_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_xchg_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_xchg_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `*self`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } + + /// Atomic compare and exchange. + /// + /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not + /// modified. + /// + /// Compare: The comparison is done via the byte level comparison between `*self` and `old`. + /// + /// Ordering: When succeeds, provides the corresponding ordering as the `Ordering` type + /// parameter indicates, and a failed one doesn't provide any ordering, the load part of a + /// failed cmpxchg is a [`Relaxed`] load. + /// + /// Returns `Ok(value)` if cmpxchg succeeds, and `value` is guaranteed to be equal to `old`, + /// otherwise returns `Err(value)`, and `value` is the current value of `*self`. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// // Checks whether cmpxchg succeeded. + /// let success = x.cmpxchg(52, 64, Relaxed).is_ok(); + /// # assert!(!success); + /// + /// // Checks whether cmpxchg failed. 
+    /// let failure = x.cmpxchg(52, 64, Relaxed).is_err();
+    /// # assert!(failure);
+    ///
+    /// // Uses the old value if the cmpxchg failed, probably to retry it.
+    /// match x.cmpxchg(52, 64, Relaxed) {
+    ///     Ok(_) => { },
+    ///     Err(old) => {
+    ///         // do something with `old`.
+    /// #       assert_eq!(old, 42);
+    ///     }
+    /// }
+    ///
+    /// // Uses the latest value regardless, same as atomic_cmpxchg() in C.
+    /// let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+    /// # assert_eq!(42, latest);
+    /// assert_eq!(64, x.load(Relaxed));
+    /// ```
+    ///
+    /// [`Relaxed`]: ordering::Relaxed
+    #[doc(alias(
+        "atomic_cmpxchg",
+        "atomic64_cmpxchg",
+        "atomic_try_cmpxchg",
+        "atomic64_try_cmpxchg",
+        "compare_exchange"
+    ))]
+    #[inline(always)]
+    pub fn cmpxchg<Ordering: ordering::Ordering>(
+        &self,
+        mut old: T,
+        new: T,
+        o: Ordering,
+    ) -> Result<T, T> {
+        // Note on code generation:
+        //
+        // try_cmpxchg() is used to implement cmpxchg(), and if the helper functions are inlined,
+        // the compiler is able to figure out that the branch is not needed if the users don't
+        // care about whether the operation succeeds or not. One exception is on x86: due to
+        // commit 44fe84459faf ("locking/atomic: Fix atomic_try_cmpxchg() semantics"), the
+        // atomic_try_cmpxchg() on x86 has a branch even if the caller doesn't care about the
+        // success of cmpxchg and only wants to use the old value. For example, for code like:
+        //
+        //     let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+        //
+        // It will still generate code:
+        //
+        //     movl     $0x40, %ecx
+        //     movl     $0x34, %eax
+        //     lock
+        //     cmpxchgl %ecx, 0x4(%rsp)
+        //     jne      1f
+        //     2:
+        //     ...
+        //     1: movl %eax, %ecx
+        //        jmp 2b
+        //
+        // This might be "fixed" by introducing a try_cmpxchg_exclusive() that knows the "*old"
+        // location in the C function is always safe to write.
+        if self.try_cmpxchg(&mut old, new, o) {
+            Ok(old)
+        } else {
+            Err(old)
+        }
+    }
+
+    /// Atomic compare and exchange that returns whether the operation succeeded.
+    ///
+    /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not
+    /// modified and `*old` is updated to the current value of `*self`.
+    ///
+    /// The "compare" and "ordering" parts are the same as for [`Atomic::cmpxchg()`].
+    ///
+    /// Returns `true` if the cmpxchg succeeded, and `false` otherwise.
+    #[inline(always)]
+    fn try_cmpxchg<Ordering: ordering::Ordering>(&self, old: &mut T, new: T, _: Ordering) -> bool {
+        let mut tmp = into_repr(*old);
+        let new = into_repr(new);
+
+        // INVARIANT: `self.0` is a valid `T` after `atomic_try_cmpxchg*()` because `new` is
+        // transmutable to `T`.
+        let ret = {
+            match Ordering::TYPE {
+                OrderingType::Full => T::Repr::atomic_try_cmpxchg(&self.0, &mut tmp, new),
+                OrderingType::Acquire => {
+                    T::Repr::atomic_try_cmpxchg_acquire(&self.0, &mut tmp, new)
+                }
+                OrderingType::Release => {
+                    T::Repr::atomic_try_cmpxchg_release(&self.0, &mut tmp, new)
+                }
+                OrderingType::Relaxed => {
+                    T::Repr::atomic_try_cmpxchg_relaxed(&self.0, &mut tmp, new)
+                }
+            }
+        };
+
+        // SAFETY: `tmp` comes from reading `*self`, which is a valid `T` per type invariants.
+        *old = unsafe { from_repr(tmp) };
+
+        ret
+    }
+}
+
+impl<T: AtomicType> Atomic<T>
+where
+    T::Repr: AtomicArithmeticOps,
+{
+    /// Atomic add.
+    ///
+    /// Atomically updates `*self` to `(*self).wrapping_add(v)`.
+ /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// x.add(12, Relaxed); + /// + /// assert_eq!(54, x.load(Relaxed)); + /// ``` + #[inline(always)] + pub fn add(&self, v: Rhs, _: ordering::Relaxed) + where + T: AtomicAdd, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_add()` due to safety requirement of + // `AtomicAdd`. + T::Repr::atomic_add(&self.0, v); + } + + /// Atomic fetch and add. + /// + /// Atomically updates `*self` to `(*self).wrapping_add(v)`, and returns the value of `*self` + /// before the update. + /// + /// # Examples + /// + /// ``` + /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed}; + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Acquire); x.load(Relaxed) }); + /// + /// let x = Atomic::new(42); + /// + /// assert_eq!(42, x.load(Relaxed)); + /// + /// assert_eq!(54, { x.fetch_add(12, Full); x.load(Relaxed) } ); + /// ``` + #[inline(always)] + pub fn fetch_add(&self, v: Rhs, _: Ordering) -> T + where + T: AtomicAdd, + { + let v = T::rhs_into_delta(v); + + // INVARIANT: `self.0` is a valid `T` after `atomic_fetch_add*()` due to safety requirement + // of `AtomicAdd`. + let ret = { + match Ordering::TYPE { + OrderingType::Full => T::Repr::atomic_fetch_add(&self.0, v), + OrderingType::Acquire => T::Repr::atomic_fetch_add_acquire(&self.0, v), + OrderingType::Release => T::Repr::atomic_fetch_add_release(&self.0, v), + OrderingType::Relaxed => T::Repr::atomic_fetch_add_relaxed(&self.0, v), + } + }; + + // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants. + unsafe { from_repr(ret) } + } +} diff --git a/rust/kernel/sync/atomic/internal.rs b/rust/kernel/sync/atomic/internal.rs new file mode 100644 index 00000000000000..6fdd8e59f45be8 --- /dev/null +++ b/rust/kernel/sync/atomic/internal.rs @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic internal implementations. +//! +//! Provides 1:1 mapping to the C atomic operations. + +use crate::bindings; +use crate::macros::paste; +use core::cell::UnsafeCell; + +mod private { + /// Sealed trait marker to disable customized impls on atomic implementation traits. + pub trait Sealed {} +} + +// `i32` and `i64` are only supported atomic implementations. +impl private::Sealed for i32 {} +impl private::Sealed for i64 {} + +/// A marker trait for types that implement atomic operations with C side primitives. +/// +/// This trait is sealed, and only types that have directly mapping to the C side atomics should +/// impl this: +/// +/// - `i32` maps to `atomic_t`. +/// - `i64` maps to `atomic64_t`. +pub trait AtomicImpl: Sized + Send + Copy + private::Sealed { + /// The type of the delta in arithmetic or logical operations. + /// + /// For example, in `atomic_add(ptr, v)`, it's the type of `v`. Usually it's the same type of + /// [`Self`], but it may be different for the atomic pointer type. + type Delta; +} + +// `atomic_t` implements atomic operations on `i32`. +impl AtomicImpl for i32 { + type Delta = Self; +} + +// `atomic64_t` implements atomic operations on `i64`. +impl AtomicImpl for i64 { + type Delta = Self; +} + +/// Atomic representation. +#[repr(transparent)] +pub struct AtomicRepr(UnsafeCell); + +impl AtomicRepr { + /// Creates a new atomic representation `T`. 
+    pub const fn new(v: T) -> Self {
+        Self(UnsafeCell::new(v))
+    }
+
+    /// Returns a pointer to the underlying `T`.
+    ///
+    /// # Guarantees
+    ///
+    /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]).
+    pub const fn as_ptr(&self) -> *mut T {
+        // GUARANTEE: `self.0` is an `UnsafeCell<T>`, therefore the pointer returned by `.get()`
+        // must be valid and properly aligned.
+        self.0.get()
+    }
+}
+
+// This macro generates the function signature with the given argument list and return type.
+macro_rules! declare_atomic_method {
+    (
+        $(#[doc=$doc:expr])*
+        $func:ident($($arg:ident : $arg_type:ty),*) $(-> $ret:ty)?
+    ) => {
+        paste!(
+            $(#[doc = $doc])*
+            fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)?;
+        );
+    };
+    (
+        $(#[doc=$doc:expr])*
+        $func:ident [$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)?
+    ) => {
+        paste!(
+            declare_atomic_method!(
+                $(#[doc = $doc])*
+                [< $func _ $variant >]($($arg_sig)*) $(-> $ret)?
+            );
+        );
+
+        declare_atomic_method!(
+            $(#[doc = $doc])*
+            $func [$($rest)*]($($arg_sig)*) $(-> $ret)?
+        );
+    };
+    (
+        $(#[doc=$doc:expr])*
+        $func:ident []($($arg_sig:tt)*) $(-> $ret:ty)?
+    ) => {
+        declare_atomic_method!(
+            $(#[doc = $doc])*
+            $func($($arg_sig)*) $(-> $ret)?
+        );
+    }
+}
+
+// This macro generates the function implementation with the given argument list and return type,
+// and it will replace the "call(...)" expression with "$ctype _ $func" to call the real C
+// function.
+macro_rules! impl_atomic_method {
+    (
+        ($ctype:ident) $func:ident($($arg:ident: $arg_type:ty),*) $(-> $ret:ty)? {
+            $unsafe:tt { call($($c_arg:expr),*) }
+        }
+    ) => {
+        paste!(
+            #[inline(always)]
+            fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)? {
+                // TODO: Ideally we want to use the SAFETY comments written at the macro invocation
+                // (e.g. in `declare_and_impl_atomic_methods!()`); however, since SAFETY comments
+                // are just comments, and they are not passed to macros as tokens, we cannot use
+                // them here. One potential improvement is that if we support using attributes as
+                // an alternative for SAFETY comments, then we can use that for macro-generated
+                // code.
+                //
+                // SAFETY: specified on macro invocation.
+                $unsafe { bindings::[< $ctype _ $func >]($($c_arg,)*) }
+            }
+        );
+    };
+    (
+        ($ctype:ident) $func:ident[$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? {
+            $unsafe:tt { call($($arg:tt)*) }
+        }
+    ) => {
+        paste!(
+            impl_atomic_method!(
+                ($ctype) [< $func _ $variant >]($($arg_sig)*) $( -> $ret)? {
+                    $unsafe { call($($arg)*) }
+                }
+            );
+        );
+        impl_atomic_method!(
+            ($ctype) $func [$($rest)*]($($arg_sig)*) $( -> $ret)? {
+                $unsafe { call($($arg)*) }
+            }
+        );
+    };
+    (
+        ($ctype:ident) $func:ident[]($($arg_sig:tt)*) $( -> $ret:ty)? {
+            $unsafe:tt { call($($arg:tt)*) }
+        }
+    ) => {
+        impl_atomic_method!(
+            ($ctype) $func($($arg_sig)*) $(-> $ret)? {
+                $unsafe { call($($arg)*) }
+            }
+        );
+    }
+}
+
+// Declares the $ops trait with methods and implements the trait for `i32` and `i64`.
+macro_rules! declare_and_impl_atomic_methods {
+    ($(#[$attr:meta])* $pub:vis trait $ops:ident {
+        $(
+            $(#[doc=$doc:expr])*
+            fn $func:ident [$($variant:ident),*]($($arg_sig:tt)*) $( -> $ret:ty)? {
+                $unsafe:tt { bindings::#call($($arg:tt)*) }
+            }
+        )*
+    }) => {
+        $(#[$attr])*
+        $pub trait $ops: AtomicImpl {
+            $(
+                declare_atomic_method!(
+                    $(#[doc=$doc])*
+                    $func[$($variant)*]($($arg_sig)*) $(-> $ret)?
+                );
+            )*
+        }
+
+        impl $ops for i32 {
+            $(
+                impl_atomic_method!(
+                    (atomic) $func[$($variant)*]($($arg_sig)*) $(-> $ret)?
{ + $unsafe { call($($arg)*) } + } + ); + )* + } + + impl $ops for i64 { + $( + impl_atomic_method!( + (atomic64) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? { + $unsafe { call($($arg)*) } + } + ); + )* + } + } +} + +declare_and_impl_atomic_methods!( + /// Basic atomic operations + pub trait AtomicBasicOps { + /// Atomic read (load). + fn read[acquire](a: &AtomicRepr) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast()) } + } + + /// Atomic set (store). + fn set[release](a: &AtomicRepr, v: Self) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Exchange and compare-and-exchange atomic operations + pub trait AtomicExchangeOps { + /// Atomic exchange. + /// + /// Atomically updates `*a` to `v` and returns the old value. + fn xchg[acquire, release, relaxed](a: &AtomicRepr, v: Self) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), v) } + } + + /// Atomic compare and exchange. + /// + /// If `*a` == `*old`, atomically updates `*a` to `new`. Otherwise, `*a` is not + /// modified, `*old` is updated to the current value of `*a`. + /// + /// Return `true` if the update of `*a` occurred, `false` otherwise. + fn try_cmpxchg[acquire, release, relaxed]( + a: &AtomicRepr, old: &mut Self, new: Self + ) -> bool { + // SAFETY: `a.as_ptr()` is valid and properly aligned. `core::ptr::from_mut(old)` + // is valid and properly aligned. + unsafe { bindings::#call(a.as_ptr().cast(), core::ptr::from_mut(old), new) } + } + } +); + +declare_and_impl_atomic_methods!( + /// Atomic arithmetic operations + pub trait AtomicArithmeticOps { + /// Atomic add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`. + fn add[](a: &AtomicRepr, v: Self::Delta) { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + + /// Atomic fetch and add (wrapping). + /// + /// Atomically updates `*a` to `(*a).wrapping_add(v)`, and returns the value of `*a` + /// before the update. + fn fetch_add[acquire, release, relaxed](a: &AtomicRepr, v: Self::Delta) -> Self { + // SAFETY: `a.as_ptr()` is valid and properly aligned. + unsafe { bindings::#call(v, a.as_ptr().cast()) } + } + } +); diff --git a/rust/kernel/sync/atomic/ordering.rs b/rust/kernel/sync/atomic/ordering.rs new file mode 100644 index 00000000000000..3f103aa8db99b1 --- /dev/null +++ b/rust/kernel/sync/atomic/ordering.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory orderings. +//! +//! The semantics of these orderings follows the [`LKMM`] definitions and rules. +//! +//! - [`Acquire`] provides ordering between the load part of the annotated operation and all the +//! following memory accesses, and if there is a store part, the store part has the [`Relaxed`] +//! ordering. +//! - [`Release`] provides ordering between all the preceding memory accesses and the store part of +//! the annotated operation, and if there is a load part, the load part has the [`Relaxed`] +//! ordering. +//! - [`Full`] means "fully-ordered", that is: +//! - It provides ordering between all the preceding memory accesses and the annotated operation. +//! - It provides ordering between the annotated operation and all the following memory accesses. +//! - It provides ordering between all the preceding memory accesses and all the following memory +//! accesses. +//! 
- All the orderings are the same strength as a full memory barrier (i.e. `smp_mb()`). +//! - [`Relaxed`] provides no ordering except the dependency orderings. Dependency orderings are +//! described in "DEPENDENCY RELATIONS" in [`LKMM`]'s [`explanation`]. +//! +//! [`LKMM`]: srctree/tools/memory-model/ +//! [`explanation`]: srctree/tools/memory-model/Documentation/explanation.txt + +/// The annotation type for relaxed memory ordering, for the description of relaxed memory +/// ordering, see [module-level documentation]. +/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Relaxed; + +/// The annotation type for acquire memory ordering, for the description of acquire memory +/// ordering, see [module-level documentation]. +/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Acquire; + +/// The annotation type for release memory ordering, for the description of release memory +/// ordering, see [module-level documentation]. +/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Release; + +/// The annotation type for fully-ordered memory ordering, for the description fully-ordered memory +/// ordering, see [module-level documentation]. +/// +/// [module-level documentation]: crate::sync::atomic::ordering +pub struct Full; + +/// Describes the exact memory ordering. +#[doc(hidden)] +pub enum OrderingType { + /// Relaxed ordering. + Relaxed, + /// Acquire ordering. + Acquire, + /// Release ordering. + Release, + /// Fully-ordered. + Full, +} + +mod internal { + /// Sealed trait, can be only implemented inside atomic mod. + pub trait Sealed {} + + impl Sealed for super::Relaxed {} + impl Sealed for super::Acquire {} + impl Sealed for super::Release {} + impl Sealed for super::Full {} +} + +/// The trait bound for annotating operations that support any ordering. +pub trait Ordering: internal::Sealed { + /// Describes the exact memory ordering. + const TYPE: OrderingType; +} + +impl Ordering for Relaxed { + const TYPE: OrderingType = OrderingType::Relaxed; +} + +impl Ordering for Acquire { + const TYPE: OrderingType = OrderingType::Acquire; +} + +impl Ordering for Release { + const TYPE: OrderingType = OrderingType::Release; +} + +impl Ordering for Full { + const TYPE: OrderingType = OrderingType::Full; +} + +/// The trait bound for operations that only support acquire or relaxed ordering. +pub trait AcquireOrRelaxed: Ordering {} + +impl AcquireOrRelaxed for Acquire {} +impl AcquireOrRelaxed for Relaxed {} + +/// The trait bound for operations that only support release or relaxed ordering. +pub trait ReleaseOrRelaxed: Ordering {} + +impl ReleaseOrRelaxed for Release {} +impl ReleaseOrRelaxed for Relaxed {} diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs new file mode 100644 index 00000000000000..45a17985cda45e --- /dev/null +++ b/rust/kernel/sync/atomic/predefine.rs @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Pre-defined atomic types + +use crate::static_assert; +use core::mem::{align_of, size_of}; + +// SAFETY: `i32` has the same size and alignment with itself, and is round-trip transmutable to +// itself. +unsafe impl super::AtomicType for i32 { + type Repr = i32; +} + +// SAFETY: The wrapping add result of two `i32`s is a valid `i32`. +unsafe impl super::AtomicAdd for i32 { + fn rhs_into_delta(rhs: i32) -> i32 { + rhs + } +} + +// SAFETY: `i64` has the same size and alignment with itself, and is round-trip transmutable to +// itself. 
+unsafe impl super::AtomicType for i64 {
+    type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `i64`.
+unsafe impl super::AtomicAdd for i64 {
+    fn rhs_into_delta(rhs: i64) -> i64 {
+        rhs
+    }
+}
+
+// Defines an internal type that always maps to the integer type which has the same size and
+// alignment as `isize` and `usize`, where `isize` and `usize` are always bi-directionally
+// transmutable to `isize_atomic_repr`, which also always implements `AtomicImpl`.
+#[allow(non_camel_case_types)]
+#[cfg(not(CONFIG_64BIT))]
+type isize_atomic_repr = i32;
+#[allow(non_camel_case_types)]
+#[cfg(CONFIG_64BIT)]
+type isize_atomic_repr = i64;
+
+// Ensure size and alignment requirements are checked.
+static_assert!(size_of::<isize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<isize>() == align_of::<isize_atomic_repr>());
+static_assert!(size_of::<usize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<usize>() == align_of::<isize_atomic_repr>());
+
+// SAFETY: `isize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for isize {
+    type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `isize`.
+unsafe impl super::AtomicAdd for isize {
+    fn rhs_into_delta(rhs: isize) -> isize_atomic_repr {
+        rhs as isize_atomic_repr
+    }
+}
+
+// SAFETY: `u32` and `i32` have the same size and alignment, and `u32` is round-trip transmutable
+// to `i32`.
+unsafe impl super::AtomicType for u32 {
+    type Repr = i32;
+}
+
+// SAFETY: The wrapping add result of two `i32`s is a valid `u32`.
+unsafe impl super::AtomicAdd for u32 {
+    fn rhs_into_delta(rhs: u32) -> i32 {
+        rhs as i32
+    }
+}
+
+// SAFETY: `u64` and `i64` have the same size and alignment, and `u64` is round-trip transmutable
+// to `i64`.
+unsafe impl super::AtomicType for u64 {
+    type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `u64`.
+unsafe impl super::AtomicAdd for u64 {
+    fn rhs_into_delta(rhs: u64) -> i64 {
+        rhs as i64
+    }
+}
+
+// SAFETY: `usize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for usize {
+    type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `usize`.
+unsafe impl super::AtomicAdd for usize {
+    fn rhs_into_delta(rhs: usize) -> isize_atomic_repr {
+        rhs as isize_atomic_repr
+    }
+}
+
+use crate::macros::kunit_tests;
+
+#[kunit_tests(rust_atomics)]
+mod tests {
+    use super::super::*;
+
+    // Call $fn($val) with each $type of $val.
+    macro_rules!
for_each_type { + ($val:literal in [$($type:ty),*] $fn:expr) => { + $({ + let v: $type = $val; + + $fn(v); + })* + } + } + + #[test] + fn atomic_basic_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_xchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(old, x.xchg(new, Full)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_cmpxchg_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + let old = v; + let new = v + 1; + + assert_eq!(Err(old), x.cmpxchg(new, new, Full)); + assert_eq!(old, x.load(Relaxed)); + assert_eq!(Ok(old), x.cmpxchg(old, new, Relaxed)); + assert_eq!(new, x.load(Relaxed)); + }); + } + + #[test] + fn atomic_arithmetic_tests() { + for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| { + let x = Atomic::new(v); + + assert_eq!(v, x.fetch_add(12, Full)); + assert_eq!(v + 12, x.load(Relaxed)); + + x.add(13, Relaxed); + + assert_eq!(v + 25, x.load(Relaxed)); + }); + } +} diff --git a/rust/kernel/sync/barrier.rs b/rust/kernel/sync/barrier.rs new file mode 100644 index 00000000000000..8f2d435fcd9444 --- /dev/null +++ b/rust/kernel/sync/barrier.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory barriers. +//! +//! These primitives have the same semantics as their C counterparts: and the precise definitions +//! of semantics can be found at [`LKMM`]. +//! +//! [`LKMM`]: srctree/tools/memory-model/ + +/// A compiler barrier. +/// +/// A barrier that prevents compiler from reordering memory accesses across the barrier. +#[inline(always)] +pub(crate) fn barrier() { + // By default, Rust inline asms are treated as being able to access any memory or flags, hence + // it suffices as a compiler barrier. + // + // SAFETY: An empty asm block. + unsafe { core::arch::asm!("") }; +} + +/// A full memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory accesses across the barrier. +#[inline(always)] +pub fn smp_mb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_mb()` is safe to call. + unsafe { bindings::smp_mb() }; + } else { + barrier(); + } +} + +/// A write-write memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory write accesses across the +/// barrier. +#[inline(always)] +pub fn smp_wmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_wmb()` is safe to call. + unsafe { bindings::smp_wmb() }; + } else { + barrier(); + } +} + +/// A read-read memory barrier. +/// +/// A barrier that prevents compiler and CPU from reordering memory read accesses across the +/// barrier. +#[inline(always)] +pub fn smp_rmb() { + if cfg!(CONFIG_SMP) { + // SAFETY: `smp_rmb()` is safe to call. + unsafe { bindings::smp_rmb() }; + } else { + barrier(); + } +} diff --git a/rust/kernel/sync/refcount.rs b/rust/kernel/sync/refcount.rs new file mode 100644 index 00000000000000..19236a5bccdeb0 --- /dev/null +++ b/rust/kernel/sync/refcount.rs @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Atomic reference counting. +//! +//! C header: [`include/linux/refcount.h`](srctree/include/linux/refcount.h) + +use crate::build_assert; +use crate::sync::atomic::Atomic; +use crate::types::Opaque; + +/// Atomic reference counter. 
+///
+/// This type is conceptually an atomic integer, but provides saturation semantics compared to
+/// normal atomic integers. Values in the negative range when viewed as a signed integer are
+/// saturation (bad) values. For details about the saturation semantics, please refer to the top
+/// of [`include/linux/refcount.h`](srctree/include/linux/refcount.h).
+///
+/// Wraps the kernel's C `refcount_t`.
+#[repr(transparent)]
+pub struct Refcount(Opaque<bindings::refcount_t>);
+
+impl Refcount {
+    /// Constructs a new [`Refcount`] from an initial value.
+    ///
+    /// The initial value should be non-saturated.
+    #[inline]
+    pub fn new(value: i32) -> Self {
+        build_assert!(value >= 0, "initial value saturated");
+        // SAFETY: There are no safety requirements for this FFI call.
+        Self(Opaque::new(unsafe { bindings::REFCOUNT_INIT(value) }))
+    }
+
+    #[inline]
+    fn as_ptr(&self) -> *mut bindings::refcount_t {
+        self.0.get()
+    }
+
+    /// Gets the underlying atomic counter that backs the refcount.
+    ///
+    /// NOTE: Usage of this function is discouraged as it can circumvent the protections offered by
+    /// `refcount.h`. If there is no way to achieve the result using APIs in `refcount.h`, then
+    /// this function can be used. Otherwise, consider adding a binding for the required API.
+    #[inline]
+    pub fn as_atomic(&self) -> &Atomic<i32> {
+        let ptr = self.0.get().cast();
+        // SAFETY: `refcount_t` is a transparent wrapper of `atomic_t`, which is an atomic 32-bit
+        // integer that is layout-wise compatible with `Atomic<i32>`. All values are valid for
+        // `refcount_t`, despite some of the values being considered saturated and "bad".
+        unsafe { &*ptr }
+    }
+
+    /// Sets the refcount's value.
+    #[inline]
+    pub fn set(&self, value: i32) {
+        // SAFETY: `self.as_ptr()` is valid.
+        unsafe { bindings::refcount_set(self.as_ptr(), value) }
+    }
+
+    /// Increments the refcount.
+    ///
+    /// It will saturate on overflow and `WARN`. It will also `WARN` if the refcount is 0, as this
+    /// represents a possible use-after-free condition.
+    ///
+    /// Provides no memory ordering; it is assumed that the caller already has a reference on the
+    /// object.
+    #[inline]
+    pub fn inc(&self) {
+        // SAFETY: `self.as_ptr()` is valid.
+        unsafe { bindings::refcount_inc(self.as_ptr()) }
+    }
+
+    /// Decrements the refcount.
+    ///
+    /// It will `WARN` on underflow and fail to decrement when saturated.
+    ///
+    /// Provides release memory ordering, such that prior loads and stores are done
+    /// before.
+    #[inline]
+    pub fn dec(&self) {
+        // SAFETY: `self.as_ptr()` is valid.
+        unsafe { bindings::refcount_dec(self.as_ptr()) }
+    }
+
+    /// Decrements the refcount and tests whether it dropped to 0.
+    ///
+    /// It will `WARN` on underflow and fail to decrement when saturated.
+    ///
+    /// Provides release memory ordering, such that prior loads and stores are done
+    /// before, and provides an acquire ordering on success such that memory deallocation
+    /// must come after.
+    ///
+    /// Returns `true` if the resulting refcount is 0, `false` otherwise.
+    ///
+    /// # Notes
+    ///
+    /// A common pattern of using `Refcount` is to free memory when the reference count reaches
+    /// zero. This means that the reference to `Refcount` could become invalid after calling this
+    /// function. This is fine as long as the reference to `Refcount` is no longer used when this
+    /// function returns `false`. It is not necessary to use raw pointers in this scenario, see
+    /// .
+    #[inline]
+    #[must_use = "use `dec` instead if you do not need to test if it is 0"]
+    pub fn dec_and_test(&self) -> bool {
+        // SAFETY: `self.as_ptr()` is valid.
+ unsafe { bindings::refcount_dec_and_test(self.as_ptr()) } + } +} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Send for Refcount {} + +// SAFETY: `refcount_t` is thread-safe. +unsafe impl Sync for Refcount {} diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 7d0935bc325cb8..49fad6de06740a 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -9,7 +9,8 @@ use crate::{ ffi::{c_int, c_long, c_uint}, mm::MmWithUser, pid_namespace::PidNamespace, - types::{ARef, NotThreadSafe, Opaque}, + sync::aref::ARef, + types::{NotThreadSafe, Opaque}, }; use core::{ cmp::{Eq, PartialEq}, @@ -76,7 +77,7 @@ macro_rules! current { /// incremented when creating `State` and decremented when it is dropped: /// /// ``` -/// use kernel::{task::Task, types::ARef}; +/// use kernel::{task::Task, sync::aref::ARef}; /// /// struct State { /// creator: ARef, @@ -347,7 +348,7 @@ impl CurrentTask { } // SAFETY: The type invariants guarantee that `Task` is always refcounted. -unsafe impl crate::types::AlwaysRefCounted for Task { +unsafe impl crate::sync::aref::AlwaysRefCounted for Task { #[inline] fn inc_ref(&self) { // SAFETY: The existence of a shared reference means that the refcount is nonzero. diff --git a/rust/kernel/time.rs b/rust/kernel/time.rs index 64c8dcf548d630..6ea98dfcd02787 100644 --- a/rust/kernel/time.rs +++ b/rust/kernel/time.rs @@ -25,6 +25,7 @@ //! C header: [`include/linux/ktime.h`](srctree/include/linux/ktime.h). use core::marker::PhantomData; +use core::ops; pub mod delay; pub mod hrtimer; @@ -200,9 +201,31 @@ impl Instant { pub(crate) fn as_nanos(&self) -> i64 { self.inner } + + /// Create an [`Instant`] from a `ktime_t` without checking if it is non-negative. + /// + /// # Panics + /// + /// On debug builds, this function will panic if `ktime` is not in the range from 0 to + /// `KTIME_MAX`. + /// + /// # Safety + /// + /// The caller promises that `ktime` is in the range from 0 to `KTIME_MAX`. + #[inline] + pub(crate) unsafe fn from_ktime(ktime: bindings::ktime_t) -> Self { + debug_assert!(ktime >= 0); + + // INVARIANT: Our safety contract ensures that `ktime` is in the range from 0 to + // `KTIME_MAX`. + Self { + inner: ktime, + _c: PhantomData, + } + } } -impl core::ops::Sub for Instant { +impl ops::Sub for Instant { type Output = Delta; // By the type invariant, it never overflows. @@ -214,6 +237,46 @@ impl core::ops::Sub for Instant { } } +impl ops::Add for Instant { + type Output = Self; + + #[inline] + fn add(self, rhs: Delta) -> Self::Output { + // INVARIANT: With arithmetic over/underflow checks enabled, this will panic if we overflow + // (e.g. go above `KTIME_MAX`) + let res = self.inner + rhs.nanos; + + // INVARIANT: With overflow checks enabled, we verify here that the value is >= 0 + #[cfg(CONFIG_RUST_OVERFLOW_CHECKS)] + assert!(res >= 0); + + Self { + inner: res, + _c: PhantomData, + } + } +} + +impl ops::Sub for Instant { + type Output = Self; + + #[inline] + fn sub(self, rhs: Delta) -> Self::Output { + // INVARIANT: With arithmetic over/underflow checks enabled, this will panic if we overflow + // (e.g. go above `KTIME_MAX`) + let res = self.inner - rhs.nanos; + + // INVARIANT: With overflow checks enabled, we verify here that the value is >= 0 + #[cfg(CONFIG_RUST_OVERFLOW_CHECKS)] + assert!(res >= 0); + + Self { + inner: res, + _c: PhantomData, + } + } +} + /// A span of time. /// /// This struct represents a span of time, with its value stored as nanoseconds. 
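The operator impls this patch adds to `Instant` (above) and `Delta` (below) make expiry arithmetic read naturally. A minimal sketch of typical use, assuming the existing `Monotonic` clock source and `Delta::from_millis()` constructor (the `deadline_in` and `periods_elapsed` helpers are illustrative, not part of this patch):

```
use kernel::time::{Delta, Instant, Monotonic};

fn deadline_in(ms: i64) -> Instant<Monotonic> {
    // `Instant + Delta` panics under CONFIG_RUST_OVERFLOW_CHECKS instead of
    // silently leaving the 0..=KTIME_MAX range.
    Instant::<Monotonic>::now() + Delta::from_millis(ms)
}

fn periods_elapsed(start: Instant<Monotonic>, period: Delta) -> i64 {
    // `Instant - Instant` yields a `Delta`, and `Delta / Delta` divides the two
    // nanosecond counts; on 32-bit targets the division goes through `div64_s64()`.
    (Instant::<Monotonic>::now() - start) / period
}
```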
@@ -224,6 +287,78 @@ pub struct Delta { nanos: i64, } +impl ops::Add for Delta { + type Output = Self; + + #[inline] + fn add(self, rhs: Self) -> Self { + Self { + nanos: self.nanos + rhs.nanos, + } + } +} + +impl ops::AddAssign for Delta { + #[inline] + fn add_assign(&mut self, rhs: Self) { + self.nanos += rhs.nanos; + } +} + +impl ops::Sub for Delta { + type Output = Self; + + #[inline] + fn sub(self, rhs: Self) -> Self::Output { + Self { + nanos: self.nanos - rhs.nanos, + } + } +} + +impl ops::SubAssign for Delta { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + self.nanos -= rhs.nanos; + } +} + +impl ops::Mul for Delta { + type Output = Self; + + #[inline] + fn mul(self, rhs: i64) -> Self::Output { + Self { + nanos: self.nanos * rhs, + } + } +} + +impl ops::MulAssign for Delta { + #[inline] + fn mul_assign(&mut self, rhs: i64) { + self.nanos *= rhs; + } +} + +impl ops::Div for Delta { + type Output = i64; + + #[inline] + fn div(self, rhs: Self) -> Self::Output { + #[cfg(CONFIG_64BIT)] + { + self.nanos / rhs.nanos + } + + #[cfg(not(CONFIG_64BIT))] + { + // SAFETY: This function is always safe to call regardless of the input values + unsafe { bindings::div64_s64(self.nanos, rhs.nanos) } + } + } +} + impl Delta { /// A span of time equal to zero. pub const ZERO: Self = Self { nanos: 0 }; @@ -312,4 +447,30 @@ impl Delta { bindings::ktime_to_ms(self.as_nanos()) } } + + /// Return `self % dividend` where `dividend` is in nanoseconds. + /// + /// The kernel doesn't have any emulation for `s64 % s64` on 32 bit platforms, so this is + /// limited to 32 bit dividends. + #[inline] + pub fn rem_nanos(self, dividend: i32) -> Self { + #[cfg(CONFIG_64BIT)] + { + Self { + nanos: self.as_nanos() % i64::from(dividend), + } + } + + #[cfg(not(CONFIG_64BIT))] + { + let mut rem = 0; + + // SAFETY: `rem` is in the stack, so we can always provide a valid pointer to it. + unsafe { bindings::div_s64_rem(self.as_nanos(), dividend, &mut rem) }; + + Self { + nanos: i64::from(rem), + } + } + } } diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs index 144e3b57cc7800..856d2d929a0089 100644 --- a/rust/kernel/time/hrtimer.rs +++ b/rust/kernel/time/hrtimer.rs @@ -69,9 +69,14 @@ use super::{ClockSource, Delta, Instant}; use crate::{prelude::*, types::Opaque}; -use core::marker::PhantomData; +use core::{marker::PhantomData, ptr::NonNull}; use pin_init::PinInit; +/// A type-alias to refer to the [`Instant`] for a given `T` from [`HrTimer`]. +/// +/// Where `C` is the [`ClockSource`] of the [`HrTimer`]. +pub type HrTimerInstant = Instant<<>::TimerMode as HrTimerMode>::Clock>; + /// A timer backed by a C `struct hrtimer`. /// /// # Invariants @@ -163,6 +168,84 @@ impl HrTimer { // handled on the C side. unsafe { bindings::hrtimer_cancel(c_timer_ptr) != 0 } } + + /// Forward the timer expiry for a given timer pointer. + /// + /// # Safety + /// + /// - `self_ptr` must point to a valid `Self`. + /// - The caller must either have exclusive access to the data pointed at by `self_ptr`, or be + /// within the context of the timer callback. + #[inline] + unsafe fn raw_forward(self_ptr: *mut Self, now: HrTimerInstant, interval: Delta) -> u64 + where + T: HasHrTimer, + { + // SAFETY: + // * The C API requirements for this function are fulfilled by our safety contract. + // * `self_ptr` is guaranteed to point to a valid `Self` via our safety contract + unsafe { + bindings::hrtimer_forward(Self::raw_get(self_ptr), now.as_nanos(), interval.as_nanos()) + } + } + + /// Conditionally forward the timer. 
+ /// + /// If the timer expires after `now`, this function does nothing and returns 0. If the timer + /// expired at or before `now`, this function forwards the timer by `interval` until the timer + /// expires after `now` and then returns the number of times the timer was forwarded by + /// `interval`. + /// + /// This function is mainly useful for timer types which can provide exclusive access to the + /// timer when the timer is not running. For forwarding the timer from within the timer callback + /// context, see [`HrTimerCallbackContext::forward()`]. + /// + /// Returns the number of overruns that occurred as a result of the timer expiry change. + pub fn forward(self: Pin<&mut Self>, now: HrTimerInstant, interval: Delta) -> u64 + where + T: HasHrTimer, + { + // SAFETY: `raw_forward` does not move `Self` + let this = unsafe { self.get_unchecked_mut() }; + + // SAFETY: By existence of `Pin<&mut Self>`, the pointer passed to `raw_forward` points to a + // valid `Self` that we have exclusive access to. + unsafe { Self::raw_forward(this, now, interval) } + } + + /// Conditionally forward the timer. + /// + /// This is a variant of [`forward()`](Self::forward) that uses an interval after the current + /// time of the base clock for the [`HrTimer`]. + pub fn forward_now(self: Pin<&mut Self>, interval: Delta) -> u64 + where + T: HasHrTimer, + { + self.forward(HrTimerInstant::::now(), interval) + } + + /// Return the time expiry for this [`HrTimer`]. + /// + /// This value should only be used as a snapshot, as the actual expiry time could change after + /// this function is called. + pub fn expires(&self) -> HrTimerInstant + where + T: HasHrTimer, + { + // SAFETY: `self` is an immutable reference and thus always points to a valid `HrTimer`. + let c_timer_ptr = unsafe { HrTimer::raw_get(self) }; + + // SAFETY: + // - Timers cannot have negative ktime_t values as their expiration time. + // - There's no actual locking here, a racy read is fine and expected + unsafe { + Instant::from_ktime( + // This `read_volatile` is intended to correspond to a READ_ONCE call. + // FIXME(read_once): Replace with `read_once` when available on the Rust side. + core::ptr::read_volatile(&raw const ((*c_timer_ptr).node.expires)), + ) + } + } } /// Implemented by pointer types that point to structs that contain a [`HrTimer`]. @@ -300,9 +383,13 @@ pub trait HrTimerCallback { type Pointer<'a>: RawHrTimerCallback; /// Called by the timer logic when the timer fires. - fn run(this: as RawHrTimerCallback>::CallbackTarget<'_>) -> HrTimerRestart + fn run( + this: as RawHrTimerCallback>::CallbackTarget<'_>, + ctx: HrTimerCallbackContext<'_, Self>, + ) -> HrTimerRestart where - Self: Sized; + Self: Sized, + Self: HasHrTimer; } /// A handle representing a potentially running timer. @@ -324,6 +411,8 @@ pub unsafe trait HrTimerHandle { /// Note that the timer might be started by a concurrent start operation. If /// so, the timer might not be in the **stopped** state when this function /// returns. + /// + /// Returns `true` if the timer was running. fn cancel(&mut self) -> bool; } @@ -585,6 +674,63 @@ impl HrTimerMode for RelativePinnedHardMode { type Expires = Delta; } +/// Privileged smart-pointer for a [`HrTimer`] callback context. +/// +/// Many [`HrTimer`] methods can only be called in two situations: +/// +/// * When the caller has exclusive access to the `HrTimer` and the `HrTimer` is guaranteed not to +/// be running. +/// * From within the context of an `HrTimer`'s callback method. 
+/// +/// This type provides access to said methods from within a timer callback context. +/// +/// # Invariants +/// +/// * The existence of this type means the caller is currently within the callback for an +/// [`HrTimer`]. +/// * `self.0` always points to a live instance of [`HrTimer`]. +pub struct HrTimerCallbackContext<'a, T: HasHrTimer>(NonNull>, PhantomData<&'a ()>); + +impl<'a, T: HasHrTimer> HrTimerCallbackContext<'a, T> { + /// Create a new [`HrTimerCallbackContext`]. + /// + /// # Safety + /// + /// This function relies on the caller being within the context of a timer callback, so it must + /// not be used anywhere except for within implementations of [`RawHrTimerCallback::run`]. The + /// caller promises that `timer` points to a valid initialized instance of + /// [`bindings::hrtimer`]. + /// + /// The returned `Self` must not outlive the function context of [`RawHrTimerCallback::run`] + /// where this function is called. + pub(crate) unsafe fn from_raw(timer: *mut HrTimer) -> Self { + // SAFETY: The caller guarantees `timer` is a valid pointer to an initialized + // `bindings::hrtimer` + // INVARIANT: Our safety contract ensures that we're within the context of a timer callback + // and that `timer` points to a live instance of `HrTimer`. + Self(unsafe { NonNull::new_unchecked(timer) }, PhantomData) + } + + /// Conditionally forward the timer. + /// + /// This function is identical to [`HrTimer::forward()`] except that it may only be used from + /// within the context of a [`HrTimer`] callback. + pub fn forward(&mut self, now: HrTimerInstant, interval: Delta) -> u64 { + // SAFETY: + // - We are guaranteed to be within the context of a timer callback by our type invariants + // - By our type invariants, `self.0` always points to a valid `HrTimer` + unsafe { HrTimer::::raw_forward(self.0.as_ptr(), now, interval) } + } + + /// Conditionally forward the timer. + /// + /// This is a variant of [`HrTimerCallbackContext::forward()`] that uses an interval after the + /// current time of the base clock for the [`HrTimer`]. + pub fn forward_now(&mut self, duration: Delta) -> u64 { + self.forward(HrTimerInstant::::now(), duration) + } +} + /// Use to implement the [`HasHrTimer`] trait. /// /// See [`module`] documentation for an example. diff --git a/rust/kernel/time/hrtimer/arc.rs b/rust/kernel/time/hrtimer/arc.rs index ed490a7a895038..7be82bcb352ac4 100644 --- a/rust/kernel/time/hrtimer/arc.rs +++ b/rust/kernel/time/hrtimer/arc.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::HrTimerPointer; @@ -99,6 +100,12 @@ where // allocation from other `Arc` clones. let receiver = unsafe { ArcBorrow::from_raw(data_ptr) }; - T::run(receiver).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. 
+ // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/pin.rs b/rust/kernel/time/hrtimer/pin.rs index aef16d9ee2f0c5..4d39ef7816971b 100644 --- a/rust/kernel/time/hrtimer/pin.rs +++ b/rust/kernel/time/hrtimer/pin.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::RawHrTimerCallback; @@ -103,6 +104,12 @@ where // here. let receiver_pin = unsafe { Pin::new_unchecked(receiver_ref) }; - T::run(receiver_pin).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver_pin, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/pin_mut.rs b/rust/kernel/time/hrtimer/pin_mut.rs index 767d0a4e8a2c11..9d9447d4d57e83 100644 --- a/rust/kernel/time/hrtimer/pin_mut.rs +++ b/rust/kernel/time/hrtimer/pin_mut.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 use super::{ - HasHrTimer, HrTimer, HrTimerCallback, HrTimerHandle, HrTimerMode, RawHrTimerCallback, - UnsafeHrTimerPointer, + HasHrTimer, HrTimer, HrTimerCallback, HrTimerCallbackContext, HrTimerHandle, HrTimerMode, + RawHrTimerCallback, UnsafeHrTimerPointer, }; use core::{marker::PhantomData, pin::Pin, ptr::NonNull}; @@ -107,6 +107,12 @@ where // here. let receiver_pin = unsafe { Pin::new_unchecked(receiver_ref) }; - T::run(receiver_pin).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(receiver_pin, context).into_c() } } diff --git a/rust/kernel/time/hrtimer/tbox.rs b/rust/kernel/time/hrtimer/tbox.rs index ec08303315f280..aa1ee31a71953c 100644 --- a/rust/kernel/time/hrtimer/tbox.rs +++ b/rust/kernel/time/hrtimer/tbox.rs @@ -3,6 +3,7 @@ use super::HasHrTimer; use super::HrTimer; use super::HrTimerCallback; +use super::HrTimerCallbackContext; use super::HrTimerHandle; use super::HrTimerMode; use super::HrTimerPointer; @@ -119,6 +120,12 @@ where // `data_ptr` exist. let data_mut_ref = unsafe { Pin::new_unchecked(&mut *data_ptr) }; - T::run(data_mut_ref).into_c() + // SAFETY: + // - By C API contract `timer_ptr` is the pointer that we passed when queuing the timer, so + // it is a valid pointer to a `HrTimer` embedded in a `T`. + // - We are within `RawHrTimerCallback::run` + let context = unsafe { HrTimerCallbackContext::from_raw(timer_ptr) }; + + T::run(data_mut_ref, context).into_c() } } diff --git a/rust/macros/quote.rs b/rust/macros/quote.rs index 92cacc4067c9ac..acc140c186538d 100644 --- a/rust/macros/quote.rs +++ b/rust/macros/quote.rs @@ -2,7 +2,6 @@ use proc_macro::{TokenStream, TokenTree}; -#[allow(dead_code)] pub(crate) trait ToTokens { fn to_tokens(&self, tokens: &mut TokenStream); } @@ -47,121 +46,116 @@ impl ToTokens for TokenStream { /// `quote` crate but provides only just enough functionality needed by the current `macros` crate. macro_rules! 
quote_spanned { ($span:expr => $($tt:tt)*) => {{ - let mut tokens: ::std::vec::Vec<::proc_macro::TokenTree>; - #[allow(clippy::vec_init_then_push)] + let mut tokens = ::proc_macro::TokenStream::new(); { - tokens = ::std::vec::Vec::new(); let span = $span; quote_spanned!(@proc tokens span $($tt)*); } - ::proc_macro::TokenStream::from_iter(tokens) + tokens }}; (@proc $v:ident $span:ident) => {}; (@proc $v:ident $span:ident #$id:ident $($tt:tt)*) => { - let mut ts = ::proc_macro::TokenStream::new(); - $crate::quote::ToTokens::to_tokens(&$id, &mut ts); - $v.extend(ts); + $crate::quote::ToTokens::to_tokens(&$id, &mut $v); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident #(#$id:ident)* $($tt:tt)*) => { for token in $id { - let mut ts = ::proc_macro::TokenStream::new(); - $crate::quote::ToTokens::to_tokens(&token, &mut ts); - $v.extend(ts); + $crate::quote::ToTokens::to_tokens(&token, &mut $v); } quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ( $($inner:tt)* ) $($tt:tt)*) => { #[allow(unused_mut)] - let mut tokens = ::std::vec::Vec::<::proc_macro::TokenTree>::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Parenthesis, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident [ $($inner:tt)* ] $($tt:tt)*) => { - let mut tokens = ::std::vec::Vec::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Bracket, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident { $($inner:tt)* } $($tt:tt)*) => { - let mut tokens = ::std::vec::Vec::new(); + let mut tokens = ::proc_macro::TokenStream::new(); quote_spanned!(@proc tokens $span $($inner)*); - $v.push(::proc_macro::TokenTree::Group(::proc_macro::Group::new( + $v.extend([::proc_macro::TokenTree::Group(::proc_macro::Group::new( ::proc_macro::Delimiter::Brace, - ::proc_macro::TokenStream::from_iter(tokens) - ))); + tokens, + ))]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident :: $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Joint) - )); - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::Spacing::Joint, ::proc_macro::Spacing::Alone].map(|spacing| { + ::proc_macro::TokenTree::Punct(::proc_macro::Punct::new(':', spacing)) + })); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident : $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(':', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident , $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(',', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(',', ::proc_macro::Spacing::Alone), 
+ )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident @ $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('@', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('@', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ! $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('!', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('!', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident ; $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new(';', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new(';', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident + $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('+', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('+', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident = $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('=', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('=', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident # $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Punct( - ::proc_macro::Punct::new('#', ::proc_macro::Spacing::Alone) - )); + $v.extend([::proc_macro::TokenTree::Punct( + ::proc_macro::Punct::new('#', ::proc_macro::Spacing::Alone), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident _ $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Ident(::proc_macro::Ident::new("_", $span))); + $v.extend([::proc_macro::TokenTree::Ident( + ::proc_macro::Ident::new("_", $span), + )]); quote_spanned!(@proc $v $span $($tt)*); }; (@proc $v:ident $span:ident $id:ident $($tt:tt)*) => { - $v.push(::proc_macro::TokenTree::Ident(::proc_macro::Ident::new(stringify!($id), $span))); + $v.extend([::proc_macro::TokenTree::Ident( + ::proc_macro::Ident::new(stringify!($id), $span), + )]); quote_spanned!(@proc $v $span $($tt)*); }; } diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index 31c2f713313fe3..1d5fd9efb93e9d 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -34,4 +34,6 @@ type __kernel_size_t = usize; type __kernel_ssize_t = isize; type __kernel_ptrdiff_t = isize; +use pin_init::MaybeZeroable; + include!(concat!(env!("OBJTREE"), "/rust/uapi/uapi_generated.rs")); diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c index 7ebd352138e4f2..beaf36657deacc 100644 --- a/samples/damon/mtier.c +++ b/samples/damon/mtier.c @@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_mtier_start(); if (err) diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c index 1b839c06a612f2..0226652f94d55e 100644 --- a/samples/damon/prcl.c +++ b/samples/damon/prcl.c @@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_prcl_start(); if (err) diff 
--git a/samples/damon/wsse.c b/samples/damon/wsse.c index da052023b09907..21eaf15f987d47 100644 --- a/samples/damon/wsse.c +++ b/samples/damon/wsse.c @@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store( return 0; if (enabled) { + if (!init_called) + return 0; + err = damon_sample_wsse_start(); if (err) enabled = false; diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index cfea7a38befb05..da3a9f2091f55b 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -75,8 +75,8 @@ asm ( CALL_DEPTH_ACCOUNT " call my_direct_func1\n" " leave\n" -" .size my_tramp1, .-my_tramp1\n" ASM_RET +" .size my_tramp1, .-my_tramp1\n" " .type my_tramp2, @function\n" " .globl my_tramp2\n" diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig index 7f7371a004ee0a..66360cdf048ff2 100644 --- a/samples/rust/Kconfig +++ b/samples/rust/Kconfig @@ -62,6 +62,28 @@ config SAMPLE_RUST_DMA If unsure, say N. +config SAMPLE_RUST_DEBUGFS + tristate "DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs. + + If unsure, say N. + +config SAMPLE_RUST_DEBUGFS_SCOPED + tristate "Scoped DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust Scoped DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs_scoped. + + If unsure, say N. + config SAMPLE_RUST_DRIVER_PCI tristate "PCI Driver" depends on PCI diff --git a/samples/rust/Makefile b/samples/rust/Makefile index bd2faad63b4f3b..69ca01497b589e 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -4,6 +4,8 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS) += rust_debugfs.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS_SCOPED) += rust_debugfs_scoped.o obj-$(CONFIG_SAMPLE_RUST_DMA) += rust_dma.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs index af04bfa35cb28e..5005453f874da0 100644 --- a/samples/rust/rust_configfs.rs +++ b/samples/rust/rust_configfs.rs @@ -94,7 +94,7 @@ impl configfs::AttributeOperations<0> for Configuration { fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result { pr_info!("Show message\n"); - let data = container.message; + let data = container.message.to_bytes(); page[0..data.len()].copy_from_slice(data); Ok(data.len()) } diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs new file mode 100644 index 00000000000000..82b61a15a34b95 --- /dev/null +++ b/samples/rust/rust_debugfs.rs @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Sample DebugFS exporting platform driver +//! +//! To successfully probe this driver with ACPI, use an ssdt that looks like +//! +//! ```dsl +//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001) +//! { +//! Scope (\_SB) +//! { +//! Device (T432) +//! { +//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match +//! Name (_UID, 1) +//! Name (_STA, 0x0F) // Device present, enabled +//! Name (_DSD, Package () { // Sample attribute +//! 
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+//!             Package() {
+//!                 Package(2) {"compatible", "sample-debugfs"}
+//!             }
+//!         })
+//!         Name (_CRS, ResourceTemplate ()
+//!         {
+//!             Memory32Fixed (ReadWrite, 0xFED00000, 0x1000)
+//!         })
+//!     }
+//! }
+//! }
+//! ```
+
+use core::str::FromStr;
+use core::sync::atomic::AtomicUsize;
+use core::sync::atomic::Ordering;
+use kernel::c_str;
+use kernel::debugfs::{Dir, File};
+use kernel::new_mutex;
+use kernel::prelude::*;
+use kernel::sync::Mutex;
+
+use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef};
+
+kernel::module_platform_driver! {
+    type: RustDebugFs,
+    name: "rust_debugfs",
+    authors: ["Matthew Maurer"],
+    description: "Rust DebugFS usage sample",
+    license: "GPL",
+}
+
+#[pin_data]
+struct RustDebugFs {
+    pdev: ARef<platform::Device>,
+    // As we only hold these for drop effect (to remove the directory/files) we have a leading
+    // underscore to indicate to the compiler that we don't expect to use this field directly.
+    _debugfs: Dir,
+    #[pin]
+    _compatible: File<CString>,
+    #[pin]
+    counter: File<AtomicUsize>,
+    #[pin]
+    inner: File<Mutex<Inner>>,
+}
+
+#[derive(Debug)]
+struct Inner {
+    x: u32,
+    y: u32,
+}
+
+impl FromStr for Inner {
+    type Err = Error;
+    fn from_str(s: &str) -> Result<Self> {
+        let mut parts = s.split_whitespace();
+        let x = parts
+            .next()
+            .ok_or(EINVAL)?
+            .parse::<u32>()
+            .map_err(|_| EINVAL)?;
+        let y = parts
+            .next()
+            .ok_or(EINVAL)?
+            .parse::<u32>()
+            .map_err(|_| EINVAL)?;
+        if parts.next().is_some() {
+            return Err(EINVAL);
+        }
+        Ok(Inner { x, y })
+    }
+}
+
+kernel::acpi_device_table!(
+    ACPI_TABLE,
+    MODULE_ACPI_TABLE,
+    <RustDebugFs as platform::Driver>::IdInfo,
+    [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())]
+);
+
+impl platform::Driver for RustDebugFs {
+    type IdInfo = ();
+    const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = None;
+    const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE);
+
+    fn probe(
+        pdev: &platform::Device<Core>,
+        _info: Option<&Self::IdInfo>,
+    ) -> Result<Pin<KBox<Self>>> {
+        let result = KBox::try_pin_init(RustDebugFs::new(pdev), GFP_KERNEL)?;
+        // We can still mutate fields through the files which are atomic or mutexed:
+        result.counter.store(91, Ordering::Relaxed);
+        {
+            let mut guard = result.inner.lock();
+            guard.x = guard.y;
+            guard.y = 42;
+        }
+        Ok(result)
+    }
+}
+
+impl RustDebugFs {
+    fn build_counter(dir: &Dir) -> impl PinInit<File<AtomicUsize>> + '_ {
+        dir.read_write_file(c_str!("counter"), AtomicUsize::new(0))
+    }
+
+    fn build_inner(dir: &Dir) -> impl PinInit<File<Mutex<Inner>>> + '_ {
+        dir.read_write_file(c_str!("pair"), new_mutex!(Inner { x: 3, y: 10 }))
+    }
+
+    fn new(pdev: &platform::Device) -> impl PinInit<Self, Error> + '_ {
+        let debugfs = Dir::new(c_str!("sample_debugfs"));
+        let dev = pdev.as_ref();
+
+        try_pin_init! {
+            Self {
+                _compatible <- debugfs.read_only_file(
+                    c_str!("compatible"),
+                    dev.fwnode()
+                        .ok_or(ENOENT)?
+                        .property_read::<CString>(c_str!("compatible"))
+                        .required_by(dev)?,
+                ),
+                counter <- Self::build_counter(&debugfs),
+                inner <- Self::build_inner(&debugfs),
+                _debugfs: debugfs,
+                pdev: pdev.into(),
+            }
+        }
+    }
+}
diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs
new file mode 100644
index 00000000000000..b0c4e76b123eaa
--- /dev/null
+++ b/samples/rust/rust_debugfs_scoped.rs
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! Sample DebugFS exporting platform driver that demonstrates the use of
+//! `Scope::dir` to create a variety of files without the need to separately
+//! track them all.
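Writes to the `pair` file above are funneled through that `FromStr` implementation. The accepted "x y" format can be sanity-checked in plain Rust outside the kernel (a sketch with hypothetical names, no kernel types):

```rust
// Standalone sketch mirroring the sample's `FromStr for Inner` parsing.
#[derive(Debug, PartialEq)]
struct Pair {
    x: u32,
    y: u32,
}

fn parse_pair(s: &str) -> Option<Pair> {
    let mut parts = s.split_whitespace();
    let x = parts.next()?.parse().ok()?;
    let y = parts.next()?.parse().ok()?;
    // Trailing tokens are rejected, just as the sample returns EINVAL.
    if parts.next().is_some() {
        return None;
    }
    Some(Pair { x, y })
}

fn main() {
    assert_eq!(parse_pair("3 10"), Some(Pair { x: 3, y: 10 }));
    assert_eq!(parse_pair("3 10 junk"), None);
}
```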
+
+use core::sync::atomic::AtomicUsize;
+use kernel::debugfs::{Dir, Scope};
+use kernel::prelude::*;
+use kernel::sync::Mutex;
+use kernel::{c_str, new_mutex, str::CString};
+
+module! {
+    type: RustScopedDebugFs,
+    name: "rust_debugfs_scoped",
+    authors: ["Matthew Maurer"],
+    description: "Rust Scoped DebugFS usage sample",
+    license: "GPL",
+}
+
+fn remove_file_write(
+    mod_data: &ModuleData,
+    reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+    let mut buf = [0u8; 128];
+    if reader.len() >= buf.len() {
+        return Err(EINVAL);
+    }
+    let n = reader.len();
+    reader.read_slice(&mut buf[..n])?;
+
+    let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim();
+    let nul_idx = s.len();
+    buf[nul_idx] = 0;
+    let to_remove = CStr::from_bytes_with_nul(&buf[..nul_idx + 1]).map_err(|_| EINVAL)?;
+    mod_data
+        .devices
+        .lock()
+        .retain(|device| device.name.as_bytes() != to_remove.as_bytes());
+    Ok(())
+}
+
+fn create_file_write(
+    mod_data: &ModuleData,
+    reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+    let mut buf = [0u8; 128];
+    if reader.len() > buf.len() {
+        return Err(EINVAL);
+    }
+    let n = reader.len();
+    reader.read_slice(&mut buf[..n])?;
+
+    let mut nums = KVec::new();
+
+    let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim();
+    let mut items = s.split_whitespace();
+    let name_str = items.next().ok_or(EINVAL)?;
+    let name = CString::try_from_fmt(fmt!("{name_str}"))?;
+    let file_name = CString::try_from_fmt(fmt!("{name_str}"))?;
+    for sub in items {
+        nums.push(
+            AtomicUsize::new(sub.parse().map_err(|_| EINVAL)?),
+            GFP_KERNEL,
+        )?;
+    }
+
+    let scope = KBox::pin_init(
+        mod_data
+            .device_dir
+            .scope(DeviceData { name, nums }, &file_name, |dev_data, dir| {
+                for (idx, val) in dev_data.nums.iter().enumerate() {
+                    let Ok(name) = CString::try_from_fmt(fmt!("{idx}")) else {
+                        return;
+                    };
+                    dir.read_write_file(&name, val);
+                }
+            }),
+        GFP_KERNEL,
+    )?;
+    (*mod_data.devices.lock()).push(scope, GFP_KERNEL)?;
+
+    Ok(())
+}
+
+struct RustScopedDebugFs {
+    _data: Pin<KBox<Scope<ModuleData>>>,
+}
+
+#[pin_data]
+struct ModuleData {
+    device_dir: Dir,
+    #[pin]
+    devices: Mutex<KVec<Pin<KBox<Scope<DeviceData>>>>>,
+}
+
+impl ModuleData {
+    fn init(device_dir: Dir) -> impl PinInit<Self> {
+        pin_init! {
+            Self {
+                device_dir: device_dir,
+                devices <- new_mutex!(KVec::new())
+            }
+        }
+    }
+}
+
+struct DeviceData {
+    name: CString,
+    nums: KVec<AtomicUsize>,
+}
+
+fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ {
+    base_dir.scope(
+        ModuleData::init(dyn_dirs),
+        c_str!("control"),
+        |data, dir| {
+            dir.write_only_callback_file(c_str!("create"), data, &create_file_write);
+            dir.write_only_callback_file(c_str!("remove"), data, &remove_file_write);
+        },
+    )
+}
+
+impl kernel::Module for RustScopedDebugFs {
+    fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
+        let base_dir = Dir::new(c_str!("rust_scoped_debugfs"));
+        let dyn_dirs = base_dir.subdir(c_str!("dynamic"));
+        Ok(Self {
+            _data: KBox::pin_init(init_control(&base_dir, dyn_dirs), GFP_KERNEL)?,
+        })
+    }
+}
diff --git a/samples/rust/rust_dma.rs b/samples/rust/rust_dma.rs
index c5e7cce6865402..ead8b542bb4aed 100644
--- a/samples/rust/rust_dma.rs
+++ b/samples/rust/rust_dma.rs
@@ -5,12 +5,11 @@
 //! To make this driver probe, QEMU must be run with `-device pci-testdev`.
 
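The scoped sample never stores individual `File` handles: one pinned `Scope` owns the backing data and every file created against it, so dropping the scope tears the whole directory down. Condensed to its core (a sketch assembled from the calls in the sample above; the `stats`/`count` names are made up):

```rust
// Sketch: the returned scope owns both the data and the files; dropping it
// removes the directory and everything created inside the closure.
fn make_stats_dir(parent: &Dir) -> Result<Pin<KBox<Scope<AtomicUsize>>>> {
    KBox::pin_init(
        parent.scope(AtomicUsize::new(0), c_str!("stats"), |count, dir| {
            // Files borrow from the scope-owned data, so there are no
            // separate handles to keep alive.
            dir.read_write_file(c_str!("count"), count);
        }),
        GFP_KERNEL,
    )
}
```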
 use kernel::{
-    bindings,
     device::Core,
     dma::{CoherentAllocation, Device, DmaMask},
     pci,
     prelude::*,
-    types::ARef,
+    sync::aref::ARef,
 };
 
 struct DmaSampleDriver {
@@ -45,10 +44,7 @@ kernel::pci_device_table!(
     PCI_TABLE,
     MODULE_PCI_TABLE,
     <DmaSampleDriver as pci::Driver>::IdInfo,
-    [(
-        pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5),
-        ()
-    )]
+    [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())]
 );
 
 impl pci::Driver for DmaSampleDriver {
diff --git a/samples/rust/rust_driver_auxiliary.rs b/samples/rust/rust_driver_auxiliary.rs
index f2a820683fc39e..55ece336ee45ae 100644
--- a/samples/rust/rust_driver_auxiliary.rs
+++ b/samples/rust/rust_driver_auxiliary.rs
@@ -5,7 +5,7 @@
 //! To make this driver probe, QEMU must be run with `-device pci-testdev`.
 
 use kernel::{
-    auxiliary, bindings, c_str, device::Core, driver, error::Error, pci, prelude::*, InPlaceModule,
+    auxiliary, c_str, device::Core, driver, error::Error, pci, prelude::*, InPlaceModule,
 };
 use pin_init::PinInit;
@@ -50,10 +50,7 @@ kernel::pci_device_table!(
     PCI_TABLE,
     MODULE_PCI_TABLE,
     <ParentDriver as pci::Driver>::IdInfo,
-    [(
-        pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5),
-        ()
-    )]
+    [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())]
 );
 
 impl pci::Driver for ParentDriver {
@@ -81,11 +78,12 @@ impl ParentDriver {
         let parent = adev.parent().ok_or(EINVAL)?;
         let pdev: &pci::Device = parent.try_into()?;
 
+        let vendor = pdev.vendor_id();
         dev_info!(
             adev.as_ref(),
-            "Connect auxiliary {} with parent: VendorID={:#x}, DeviceID={:#x}\n",
+            "Connect auxiliary {} with parent: VendorID={}, DeviceID={:#x}\n",
             adev.id(),
-            pdev.vendor_id(),
+            vendor,
             pdev.device_id()
         );
diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs
index 606946ff4d7fd9..97baec8df9bcdb 100644
--- a/samples/rust/rust_driver_pci.rs
+++ b/samples/rust/rust_driver_pci.rs
@@ -4,7 +4,7 @@
 //!
 //! To make this driver probe, QEMU must be run with `-device pci-testdev`.
 
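The `bindings::PCI_VENDOR_ID_REDHAT` to `pci::Vendor::REDHAT` switch above also changes formatting: the typed vendor prints through `Display` rather than as a raw hex `u16`, which is why the format strings drop `0x{:x}` for the vendor field. A sketch of the resulting idiom (hypothetical helper, calls as used in the samples):

```rust
// Sketch: `vendor_id()` now yields a typed `pci::Vendor` with a `Display`
// impl, while the device ID is still formatted as a raw hex integer.
fn log_ids(pdev: &pci::Device) {
    let vendor = pdev.vendor_id();
    dev_info!(pdev.as_ref(), "vendor: {}, device: {:#x}\n", vendor, pdev.device_id());
}
```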
-use kernel::{bindings, c_str, device::Core, devres::Devres, pci, prelude::*, types::ARef};
+use kernel::{c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef};
 
 struct Regs;
 
@@ -38,7 +38,7 @@ kernel::pci_device_table!(
     MODULE_PCI_TABLE,
     <SampleDriver as pci::Driver>::IdInfo,
     [(
-        pci::DeviceId::from_id(bindings::PCI_VENDOR_ID_REDHAT, 0x5),
+        pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5),
         TestIndex::NO_EVENTFD
     )]
 );
@@ -66,10 +66,11 @@ impl pci::Driver for SampleDriver {
     const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE;
 
     fn probe(pdev: &pci::Device<Core>, info: &Self::IdInfo) -> Result<Pin<KBox<Self>>> {
+        let vendor = pdev.vendor_id();
         dev_dbg!(
             pdev.as_ref(),
-            "Probe Rust PCI driver sample (PCI ID: 0x{:x}, 0x{:x}).\n",
-            pdev.vendor_id(),
+            "Probe Rust PCI driver sample (PCI ID: {}, 0x{:x}).\n",
+            vendor,
             pdev.device_id()
         );
diff --git a/samples/rust/rust_driver_platform.rs b/samples/rust/rust_driver_platform.rs
index 69ed55b7b0faad..6473baf4f1206a 100644
--- a/samples/rust/rust_driver_platform.rs
+++ b/samples/rust/rust_driver_platform.rs
@@ -72,7 +72,7 @@ use kernel::{
     of, platform,
     prelude::*,
     str::CString,
-    types::ARef,
+    sync::aref::ARef,
 };
 
 struct SampleDriver {
diff --git a/scripts/atomic/gen-atomics.sh b/scripts/atomic/gen-atomics.sh
index 5b98a83076932f..02508d0d6fe45e 100755
--- a/scripts/atomic/gen-atomics.sh
+++ b/scripts/atomic/gen-atomics.sh
@@ -11,6 +11,7 @@ cat <<EOF |
 gen-atomic-instrumented.sh      linux/atomic/atomic-instrumented.h
 gen-atomic-long.sh              linux/atomic/atomic-long.h
 gen-atomic-fallback.sh          linux/atomic/atomic-arch-fallback.h
+gen-rust-atomic-helpers.sh      ../rust/helpers/atomic.c
 EOF
 while read script header args; do
 	/bin/sh ${ATOMICDIR}/${script} ${ATOMICTBL} ${args} > ${LINUXDIR}/include/${header}
diff --git a/scripts/atomic/gen-rust-atomic-helpers.sh b/scripts/atomic/gen-rust-atomic-helpers.sh
new file mode 100755
index 00000000000000..45b1e100ed7c63
--- /dev/null
+++ b/scripts/atomic/gen-rust-atomic-helpers.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+ATOMICDIR=$(dirname $0)
+
+. ${ATOMICDIR}/atomic-tbl.sh
+
+#gen_proto_order_variant(meta, pfx, name, sfx, order, atomic, int, arg...)
+gen_proto_order_variant()
+{
+	local meta="$1"; shift
+	local pfx="$1"; shift
+	local name="$1"; shift
+	local sfx="$1"; shift
+	local order="$1"; shift
+	local atomic="$1"; shift
+	local int="$1"; shift
+
+	local atomicname="${atomic}_${pfx}${name}${sfx}${order}"
+
+	local ret="$(gen_ret_type "${meta}" "${int}")"
+	local params="$(gen_params "${int}" "${atomic}" "$@")"
+	local args="$(gen_args "$@")"
+	local retstmt="$(gen_ret_stmt "${meta}")"
+
+cat <<EOF
+__rust_helper ${ret}
+rust_helper_${atomicname}(${params})
+{
+	${retstmt}${atomicname}(${args});
+}
+
+EOF
+}
+
+cat <<EOF
+// SPDX-License-Identifier: GPL-2.0
+
+// Generated by $0
+// DO NOT MODIFY THIS FILE DIRECTLY
+
+/*
+ * This file provides helpers for the various atomic functions for Rust.
+ */
+#ifndef _RUST_ATOMIC_API_H
+#define _RUST_ATOMIC_API_H
+
+#include <linux/atomic.h>
+
+// TODO: Remove this after INLINE_HELPERS support is added.
+#ifndef __rust_helper
+#define __rust_helper
+#endif
+
+EOF
+
+grep '^[a-z]' "$1" | while read name meta args; do
+	gen_proto "${meta}" "${name}" "atomic" "int" ${args}
+done
+
+grep '^[a-z]' "$1" | while read name meta args; do
+	gen_proto "${meta}" "${name}" "atomic64" "s64" ${args}
+done
+
+cat <<EOF
+#endif /* _RUST_ATOMIC_API_H */
+EOF
diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
--- a/scripts/gcc-plugins/gcc-common.h
+++ b/scripts/gcc-plugins/gcc-common.h
@@ ... @@
 	return g->get_passes()->get_pass_for_id(id);
 }
 
+#if BUILDING_GCC_VERSION < 16000
 #define TODO_verify_ssa TODO_verify_il
 #define TODO_verify_flow TODO_verify_il
 #define TODO_verify_stmts TODO_verify_il
 #define TODO_verify_rtl_sharing TODO_verify_il
+#else
+#define TODO_verify_ssa 0
+#define TODO_verify_flow 0
+#define TODO_verify_stmts 0
+#define TODO_verify_rtl_sharing 0
+#endif
 
 #define INSN_DELETED_P(insn) (insn)->deleted()
diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py
index 98445671fe8389..ccc24d30de8063 100644
--- a/scripts/gdb/linux/timerlist.py
+++ b/scripts/gdb/linux/timerlist.py
@@ -56,8 +56,6 @@ def print_base(base):
     text += "  .index:      {}\n".format(base['index'])
 
     text += "  .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution)
-
-    text += "  .get_time:   {}\n".format(base['get_time'])
 
     if constants.LX_CONFIG_HIGH_RES_TIMERS:
         text += "  .offset:     {} nsecs\n".format(base['offset'])
     text += "active timers:\n"
diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py
index 7c3ea2b55041f8..fc27f0cca752d3 100755
--- a/scripts/generate_rust_analyzer.py
+++ b/scripts/generate_rust_analyzer.py
@@ -139,8 +139,8 @@ def append_crate_with_generated(
         "exclude_dirs": [],
     }
 
-    append_crate_with_generated("bindings", ["core", "ffi"])
-    append_crate_with_generated("uapi", ["core", "ffi"])
+    append_crate_with_generated("bindings", ["core", "ffi", "pin_init"])
+    append_crate_with_generated("uapi", ["core", "ffi", "pin_init"])
     append_crate_with_generated("kernel", ["core", "macros", "build_error", "pin_init", "ffi", "bindings", "uapi"])
 
 def is_root_crate(build_file, target):
diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h
index fe2231e0e6a4cd..5f900d18dae019 100644
--- a/scripts/kconfig/expr.h
+++ b/scripts/kconfig/expr.h
@@ -145,6 +145,7 @@ struct symbol {
 #define SYMBOL_CONST      0x0001  /* symbol is const */
 #define SYMBOL_CHECK      0x0008  /* used during dependency checking */
 #define SYMBOL_VALID      0x0080  /* set when symbol.curr is calculated */
+#define SYMBOL_TRANS      0x0100  /* symbol is transitional only (not visible) */
 #define SYMBOL_WRITE      0x0200  /* write symbol to file (KCONFIG_CONFIG) */
 #define SYMBOL_WRITTEN    0x0800  /* track info to avoid double-write to .config */
 #define SYMBOL_CHECKED    0x2000  /* used during dependency checking */
diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l
index 9c2cdfc33c6f06..6d2c92c6095dd8 100644
--- a/scripts/kconfig/lexer.l
+++ b/scripts/kconfig/lexer.l
@@ -126,6 +126,7 @@ n	[A-Za-z0-9_-]
 "select"		return T_SELECT;
 "source"		return T_SOURCE;
 "string"		return T_STRING;
+"transitional"		return T_TRANSITIONAL;
 "tristate"		return T_TRISTATE;
 "visible"		return T_VISIBLE;
 "||"			return T_OR;
diff --git a/scripts/kconfig/parser.y b/scripts/kconfig/parser.y
index e9c3c664e92511..49b79dde1725ab 100644
--- a/scripts/kconfig/parser.y
+++ b/scripts/kconfig/parser.y
@@ -75,6 +75,7 @@ struct menu *current_menu, *current_entry, *current_choice;
 %token T_SELECT
 %token T_SOURCE
 %token T_STRING
+%token T_TRANSITIONAL
 %token T_TRISTATE
 %token T_VISIBLE
 %token T_EOL
@@ -205,6 +206,12 @@ config_option: T_PROMPT T_WORD_QUOTE if_expr T_EOL
 	printd(DEBUG_PARSE, "%s:%d:prompt\n", cur_filename, cur_lineno);
 };
 
+config_option: T_TRANSITIONAL
T_EOL +{ + current_entry->sym->flags |= SYMBOL_TRANS; + printd(DEBUG_PARSE, "%s:%d:transitional\n", cur_filename, cur_lineno); +}; + config_option: default expr if_expr T_EOL { menu_add_expr(P_DEFAULT, $2, $3); @@ -482,6 +489,43 @@ assign_val: %% +/** + * transitional_check_sanity - check transitional symbols have no other + * properties + * + * @menu: menu of the potentially transitional symbol + * + * Return: -1 if an error is found, 0 otherwise. + */ +static int transitional_check_sanity(const struct menu *menu) +{ + struct property *prop; + + if (!menu->sym || !(menu->sym->flags & SYMBOL_TRANS)) + return 0; + + /* Check for depends and visible conditions. */ + if ((menu->dep && !expr_is_yes(menu->dep)) || + (menu->visibility && !expr_is_yes(menu->visibility))) { + fprintf(stderr, "%s:%d: error: %s", + menu->filename, menu->lineno, + "transitional symbols can only have help sections\n"); + return -1; + } + + /* Check for any property other than "help". */ + for (prop = menu->sym->prop; prop; prop = prop->next) { + if (prop->type != P_COMMENT) { + fprintf(stderr, "%s:%d: error: %s", + prop->filename, prop->lineno, + "transitional symbols can only have help sections\n"); + return -1; + } + } + + return 0; +} + /** * choice_check_sanity - check sanity of a choice member * @@ -558,6 +602,9 @@ void conf_parse(const char *name) if (menu->sym && sym_check_deps(menu->sym)) yynerrs++; + if (transitional_check_sanity(menu)) + yynerrs++; + if (menu->sym && sym_is_choice(menu->sym)) { menu_for_each_sub_entry(child, menu) if (child->sym && choice_check_sanity(child)) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 26ab10c0fd768f..760cac99838196 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -214,6 +214,11 @@ static void sym_calc_visibility(struct symbol *sym) struct property *prop; tristate tri; + if (sym->flags & SYMBOL_TRANS) { + sym->visible = yes; + return; + } + /* any prompt visible? */ tri = no; for_all_prompts(sym, prop) { @@ -526,7 +531,7 @@ void sym_calc_value(struct symbol *sym) } } - if (sym_is_choice(sym)) + if (sym_is_choice(sym) || sym->flags & SYMBOL_TRANS) sym->flags &= ~SYMBOL_WRITE; } diff --git a/scripts/kconfig/tests/conftest.py b/scripts/kconfig/tests/conftest.py index 2a2a7e2da06087..d94b79e012c04b 100644 --- a/scripts/kconfig/tests/conftest.py +++ b/scripts/kconfig/tests/conftest.py @@ -81,7 +81,22 @@ def _run_conf(self, mode, dot_config=None, out_file='.config', # For interactive modes such as oldaskconfig, oldconfig, # send 'Enter' key until the program finishes. if interactive: - ps.stdin.write(b'\n') + try: + ps.stdin.write(b'\n') + ps.stdin.flush() + except (BrokenPipeError, OSError): + # Process has exited, stop sending input + break + + # Close stdin gracefully + try: + ps.stdin.close() + except (BrokenPipeError, OSError): + # Ignore broken pipe on close + pass + + # Wait for process to complete + ps.wait() self.retcode = ps.returncode self.stdout = ps.stdout.read().decode() diff --git a/scripts/kconfig/tests/err_transitional/Kconfig b/scripts/kconfig/tests/err_transitional/Kconfig new file mode 100644 index 00000000000000..a75ed3b2fe5e43 --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/Kconfig @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 +# Test that transitional symbols cannot have properties other than help + +config BAD_DEFAULT + bool + transitional + default y + help + This transitional symbol illegally has a default property. 
+ +config BAD_PROMPT + bool + transitional + prompt "Bad prompt" + help + This transitional symbol illegally has a prompt. + +config BAD_SELECT + bool + transitional + select OTHER_SYMBOL + help + This transitional symbol illegally has a select. + +config BAD_IMPLY + bool + transitional + imply OTHER_SYMBOL + help + This transitional symbol illegally has an imply. + +config BAD_DEPENDS + bool + transitional + depends on OTHER_SYMBOL + help + This transitional symbol illegally has a depends. + +config BAD_RANGE + int + transitional + range 1 10 + help + This transitional symbol illegally has a range. + +config BAD_NO_TYPE + transitional + help + This transitional symbol illegally has no type specified. + +config OTHER_SYMBOL + bool diff --git a/scripts/kconfig/tests/err_transitional/__init__.py b/scripts/kconfig/tests/err_transitional/__init__.py new file mode 100644 index 00000000000000..7dffb5b0833f0e --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/__init__.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +""" +Test that transitional symbols with invalid properties are rejected. + +Transitional symbols can only have help sections. Any other properties +(default, select, depends, etc.) should cause a parser error. +""" + +def test(conf): + # This should fail with exit code 1 due to invalid transitional symbol + assert conf.olddefconfig() == 1 + + # Check that the error message is about transitional symbols + assert conf.stderr_contains('expected_stderr') diff --git a/scripts/kconfig/tests/err_transitional/expected_stderr b/scripts/kconfig/tests/err_transitional/expected_stderr new file mode 100644 index 00000000000000..b52db4f680f430 --- /dev/null +++ b/scripts/kconfig/tests/err_transitional/expected_stderr @@ -0,0 +1,7 @@ +Kconfig:46:warning: config symbol defined without type +Kconfig:7: error: transitional symbols can only have help sections +Kconfig:14: error: transitional symbols can only have help sections +Kconfig:21: error: transitional symbols can only have help sections +Kconfig:28: error: transitional symbols can only have help sections +Kconfig:32: error: transitional symbols can only have help sections +Kconfig:42: error: transitional symbols can only have help sections diff --git a/scripts/kconfig/tests/transitional/Kconfig b/scripts/kconfig/tests/transitional/Kconfig new file mode 100644 index 00000000000000..62c3b24665b992 --- /dev/null +++ b/scripts/kconfig/tests/transitional/Kconfig @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: GPL-2.0 +# Test transitional symbols for config migration with all Kconfig types + +# Enable module support for tristate testing +config MODULES + bool "Enable loadable module support" + modules + default y + +# Basic migration tests for all types +config NEW_BOOL + bool "New bool option" + default OLD_BOOL + +config OLD_BOOL + bool + transitional + +config NEW_TRISTATE + tristate "New tristate option" + default OLD_TRISTATE + +config OLD_TRISTATE + tristate + transitional + +config NEW_STRING + string "New string option" + default OLD_STRING + +config OLD_STRING + string + transitional + +config NEW_HEX + hex "New hex option" + default OLD_HEX + +config OLD_HEX + hex + transitional + +config NEW_INT + int "New int option" + default OLD_INT + +config OLD_INT + int + transitional + +# Precedence tests for all types +config NEW_BOOL_PRECEDENCE + bool "New bool option with precedence" + default OLD_BOOL_PRECEDENCE + +config OLD_BOOL_PRECEDENCE + bool + transitional + +config NEW_STRING_PRECEDENCE + string "New string option with 
precedence" + default OLD_STRING_PRECEDENCE + +config OLD_STRING_PRECEDENCE + string + transitional + +config NEW_TRISTATE_PRECEDENCE + tristate "New tristate option with precedence" + default OLD_TRISTATE_PRECEDENCE + +config OLD_TRISTATE_PRECEDENCE + tristate + transitional + +config NEW_HEX_PRECEDENCE + hex "New hex option with precedence" + default OLD_HEX_PRECEDENCE + +config OLD_HEX_PRECEDENCE + hex + transitional + +config NEW_INT_PRECEDENCE + int "New int option with precedence" + default OLD_INT_PRECEDENCE + +config OLD_INT_PRECEDENCE + int + transitional + +# Test that help sections are allowed for transitional symbols +config OLD_WITH_HELP + bool + transitional + help + This transitional symbol has a help section to validate that help is allowed. + +config REGULAR_OPTION + bool "Regular option" diff --git a/scripts/kconfig/tests/transitional/__init__.py b/scripts/kconfig/tests/transitional/__init__.py new file mode 100644 index 00000000000000..61937d10edf1ee --- /dev/null +++ b/scripts/kconfig/tests/transitional/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +""" +Test transitional symbol migration functionality for all Kconfig types. + +This tests that: +- OLD_* options in existing .config cause NEW_* options to be set +- OLD_* options are not written to the new .config file +- NEW_* options appear in the new .config file with correct values +- All Kconfig types work correctly: bool, tristate, string, hex, int +- User-set NEW values take precedence over conflicting OLD transitional values +""" + +def test(conf): + # Run olddefconfig to process the migration with the initial config + assert conf.olddefconfig(dot_config='initial_config') == 0 + + # Check that the configuration matches expected output + assert conf.config_contains('expected_config') diff --git a/scripts/kconfig/tests/transitional/expected_config b/scripts/kconfig/tests/transitional/expected_config new file mode 100644 index 00000000000000..846e9ddcab9103 --- /dev/null +++ b/scripts/kconfig/tests/transitional/expected_config @@ -0,0 +1,12 @@ +CONFIG_MODULES=y +CONFIG_NEW_BOOL=y +CONFIG_NEW_TRISTATE=m +CONFIG_NEW_STRING="test string" +CONFIG_NEW_HEX=0x1234 +CONFIG_NEW_INT=42 +# CONFIG_NEW_BOOL_PRECEDENCE is not set +CONFIG_NEW_STRING_PRECEDENCE="user value" +CONFIG_NEW_TRISTATE_PRECEDENCE=y +CONFIG_NEW_HEX_PRECEDENCE=0xABCD +CONFIG_NEW_INT_PRECEDENCE=100 +# CONFIG_REGULAR_OPTION is not set diff --git a/scripts/kconfig/tests/transitional/initial_config b/scripts/kconfig/tests/transitional/initial_config new file mode 100644 index 00000000000000..e648a65e504c08 --- /dev/null +++ b/scripts/kconfig/tests/transitional/initial_config @@ -0,0 +1,16 @@ +CONFIG_MODULES=y +CONFIG_OLD_BOOL=y +CONFIG_OLD_TRISTATE=m +CONFIG_OLD_STRING="test string" +CONFIG_OLD_HEX=0x1234 +CONFIG_OLD_INT=42 +# CONFIG_NEW_BOOL_PRECEDENCE is not set +CONFIG_OLD_BOOL_PRECEDENCE=y +CONFIG_NEW_STRING_PRECEDENCE="user value" +CONFIG_OLD_STRING_PRECEDENCE="old value" +CONFIG_NEW_TRISTATE_PRECEDENCE=y +CONFIG_OLD_TRISTATE_PRECEDENCE=m +CONFIG_NEW_HEX_PRECEDENCE=0xABCD +CONFIG_OLD_HEX_PRECEDENCE=0x5678 +CONFIG_NEW_INT_PRECEDENCE=100 +CONFIG_OLD_INT_PRECEDENCE=200 diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs index abb34ada25082c..c8f9dc2ab976c2 100644 --- a/scripts/rustdoc_test_gen.rs +++ b/scripts/rustdoc_test_gen.rs @@ -202,7 +202,7 @@ pub extern "C" fn {kunit_name}(__kunit_test: *mut ::kernel::bindings::kunit) {{ // This follows the syntax for declaring test metadata in the proposed KTAP v2 spec, which may // 
be used for the proposed KUnit test attributes API. Thus hopefully this will make migration // easier later on. - ::kernel::kunit::info(format_args!(" # {kunit_name}.location: {real_path}:{line}\n")); + ::kernel::kunit::info(fmt!(" # {kunit_name}.location: {real_path}:{line}\n")); /// The anchor where the test code body starts. #[allow(unused)] diff --git a/security/Kconfig b/security/Kconfig index 4816fc74f81ebe..285f284dfcac44 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -269,6 +269,7 @@ endchoice config LSM string "Ordered list of enabled LSMs" + depends on SECURITY default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,ipe,bpf" if DEFAULT_SECURITY_SMACK default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,ipe,bpf" if DEFAULT_SECURITY_APPARMOR default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,ipe,bpf" if DEFAULT_SECURITY_TOMOYO diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b39..b3f7a3258a2cf7 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c04f8879ad03ce..0bade2c5aa1d00 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. */ - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) continue; /* diff --git a/security/min_addr.c b/security/min_addr.c index df1bc643d886bd..c55bb84b863209 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -3,6 +3,7 @@ #include #include #include +#include /* amount of vm to protect from userspace access by both DAC and the LSM*/ unsigned long mmap_min_addr; @@ -16,10 +17,7 @@ unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; static void update_mmap_min_addr(void) { #ifdef CONFIG_LSM_MMAP_MIN_ADDR - if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) - mmap_min_addr = dac_mmap_min_addr; - else - mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; + mmap_min_addr = umax(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR); #else mmap_min_addr = dac_mmap_min_addr; #endif diff --git a/security/security.c b/security/security.c index ad163f06bf7abc..301104d63fdedb 100644 --- a/security/security.c +++ b/security/security.c @@ -283,6 +283,9 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); } /* Prepare LSM for initialization. 
 */
@@ -480,6 +483,9 @@ static void __init ordered_lsm_init(void)
 	init_debug("tun device blob size   = %d\n", blob_sizes.lbs_tun_dev);
 	init_debug("xattr slots            = %d\n", blob_sizes.lbs_xattr_count);
 	init_debug("bdev blob size         = %d\n", blob_sizes.lbs_bdev);
+	init_debug("bpf map blob size      = %d\n", blob_sizes.lbs_bpf_map);
+	init_debug("bpf prog blob size     = %d\n", blob_sizes.lbs_bpf_prog);
+	init_debug("bpf token blob size    = %d\n", blob_sizes.lbs_bpf_token);
 
 	/*
 	 * Create any kmem_caches needed for blobs
@@ -823,17 +829,50 @@ static int lsm_msg_msg_alloc(struct msg_msg *mp)
  */
 static int lsm_bdev_alloc(struct block_device *bdev)
 {
-	if (blob_sizes.lbs_bdev == 0) {
-		bdev->bd_security = NULL;
-		return 0;
-	}
+	return lsm_blob_alloc(&bdev->bd_security, blob_sizes.lbs_bdev,
+			      GFP_KERNEL);
+}
 
-	bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL);
-	if (!bdev->bd_security)
-		return -ENOMEM;
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * lsm_bpf_map_alloc - allocate a composite bpf_map blob
+ * @map: the bpf_map that needs a blob
+ *
+ * Allocate the bpf_map blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_map_alloc(struct bpf_map *map)
+{
+	return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL);
+}
 
-	return 0;
+/**
+ * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob
+ * @prog: the bpf_prog that needs a blob
+ *
+ * Allocate the bpf_prog blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_prog_alloc(struct bpf_prog *prog)
+{
+	return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL);
+}
+
+/**
+ * lsm_bpf_token_alloc - allocate a composite bpf_token blob
+ * @token: the bpf_token that needs a blob
+ *
+ * Allocate the bpf_token blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_token_alloc(struct bpf_token *token)
+{
+	return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL);
 }
+#endif /* CONFIG_BPF_SYSCALL */
 
 /**
  * lsm_early_task - during initialization allocate a composite task blob
@@ -3185,7 +3224,7 @@ int security_file_truncate(struct file *file)
  *
  * Return: Returns a zero on success, negative values on failure.
  */
-int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
+int security_task_alloc(struct task_struct *task, u64 clone_flags)
 {
 	int rc = lsm_task_alloc(task);
 
@@ -4342,17 +4381,31 @@ EXPORT_SYMBOL(security_secid_to_secctx);
 * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
 * @prop: lsm specific information
 * @cp: the LSM context
+ * @lsmid: which security module to report
 *
 * Convert a @prop entry to security context. If @cp is NULL the
 * length of the result will be returned. This does mean that the
 * length could change between calls to check the length and the
 * next call which actually allocates and returns the @cp.
 *
+ * @lsmid identifies which LSM should supply the context.
+ * A value of LSM_ID_UNDEF indicates that the first LSM supplying
+ * the hook should be used. This is used in cases where the
+ * ID of the supplying LSM is unambiguous.
+ *
 * Return: Return length of data on success, error on failure.
*/ -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid) { - return call_int_hook(lsmprop_to_secctx, prop, cp); + struct lsm_static_call *scall; + + lsm_for_each_hook(scall, lsmprop_to_secctx) { + if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) + continue; + return scall->hl->hook.lsmprop_to_secctx(prop, cp); + } + return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); @@ -5714,7 +5767,16 @@ int security_bpf_prog(struct bpf_prog *prog) int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token, kernel); + int rc; + + rc = lsm_bpf_map_alloc(map); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_map_create, map, attr, token, kernel); + if (unlikely(rc)) + security_bpf_map_free(map); + return rc; } /** @@ -5733,7 +5795,16 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token, kernel); + int rc; + + rc = lsm_bpf_prog_alloc(prog); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel); + if (unlikely(rc)) + security_bpf_prog_free(prog); + return rc; } /** @@ -5750,7 +5821,16 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { - return call_int_hook(bpf_token_create, token, attr, path); + int rc; + + rc = lsm_bpf_token_alloc(token); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_token_create, token, attr, path); + if (unlikely(rc)) + security_bpf_token_free(token); + return rc; } /** @@ -5794,6 +5874,8 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap) void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); + kfree(map->security); + map->security = NULL; } /** @@ -5805,6 +5887,8 @@ void security_bpf_map_free(struct bpf_map *map) void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); + kfree(prog->aux->security); + prog->aux->security = NULL; } /** @@ -5816,6 +5900,8 @@ void security_bpf_prog_free(struct bpf_prog *prog) void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); + kfree(token->security); + token->security = NULL; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 4b4837a20225bc..430b0e23ee00dc 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -292,27 +292,26 @@ static struct avc_xperms_decision_node struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; - xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, - GFP_NOWAIT | __GFP_NOWARN); + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { 
xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!xpd->dontaudit) goto error; } @@ -340,7 +339,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; - xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN); + xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); @@ -495,7 +494,7 @@ static struct avc_node *avc_alloc_node(void) { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN); + node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT); if (!node) goto out; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d40..76b66845a1c343 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -476,7 +476,9 @@ static int selinux_is_genfs_special_handling(struct super_block *sb) !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || - !strcmp(sb->s_type->name, "cgroup2"))); + !strcmp(sb->s_type->name, "cgroup2"))) || + (selinux_policycap_functionfs_seclabel() && + !strcmp(sb->s_type->name, "functionfs")); } static int selinux_is_sblabel_mnt(struct super_block *sb) @@ -741,7 +743,9 @@ static int selinux_set_mnt_opts(struct super_block *sb, !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore") || - !strcmp(sb->s_type->name, "securityfs")) + !strcmp(sb->s_type->name, "securityfs") || + (selinux_policycap_functionfs_seclabel() && + !strcmp(sb->s_type->name, "functionfs"))) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || @@ -4144,7 +4148,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); @@ -5885,7 +5889,7 @@ static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb, /* we do this in the LOCAL_OUT path and not the POST_ROUTING path * because we want to make sure we apply the necessary labeling * before IPsec is applied so we can leverage AH protection */ - sk = sk_to_full_sk(skb->sk); + sk = skb_to_full_sk(skb); if (sk) { struct sk_security_struct *sksec; @@ -7062,14 +7066,14 @@ static int bpf_fd_pass(const struct file *file, u32 sid) if (file->f_op == &bpf_map_fops) { map = file->private_data; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) @@ -7083,7 +7087,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } @@ -7093,7 +7097,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } @@ -7103,69 +7107,33 @@ static int 
selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_map_security(map); bpfsec->sid = current_sid(); - map->security = bpfsec; return 0; } -static void selinux_bpf_map_free(struct bpf_map *map) -{ - struct bpf_security_struct *bpfsec = map->security; - - map->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_prog_security(prog); bpfsec->sid = current_sid(); - prog->aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog *prog) -{ - struct bpf_security_struct *bpfsec = prog->aux->security; - - prog->aux->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_token_security(token); bpfsec->sid = current_sid(); - token->security = bpfsec; return 0; } - -static void selinux_bpf_token_free(struct bpf_token *token) -{ - struct bpf_security_struct *bpfsec = token->security; - - token->security = NULL; - kfree(bpfsec); -} #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { @@ -7183,6 +7151,9 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS, .lbs_tun_dev = sizeof(struct tun_security_struct), .lbs_ib = sizeof(struct ib_security_struct), + .lbs_bpf_map = sizeof(struct bpf_security_struct), + .lbs_bpf_prog = sizeof(struct bpf_security_struct), + .lbs_bpf_token = sizeof(struct bpf_security_struct), }; #ifdef CONFIG_PERF_EVENTS @@ -7536,9 +7507,6 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), - LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free), #endif #ifdef CONFIG_PERF_EVENTS @@ -7618,6 +7586,11 @@ static __init int selinux_init(void) /* Set the security state for the initial task. 
*/ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 1d7ac59015a12d..2d5139c6d45b30 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "flask.h" #include "avc.h" @@ -245,4 +246,23 @@ selinux_perf_event(void *perf_event) return perf_event + selinux_blob_sizes.lbs_perf_event; } +#ifdef CONFIG_BPF_SYSCALL +static inline struct bpf_security_struct * +selinux_bpf_map_security(struct bpf_map *map) +{ + return map->security + selinux_blob_sizes.lbs_bpf_map; +} + +static inline struct bpf_security_struct * +selinux_bpf_prog_security(struct bpf_prog *prog) +{ + return prog->aux->security + selinux_blob_sizes.lbs_bpf_prog; +} + +static inline struct bpf_security_struct * +selinux_bpf_token_security(struct bpf_token *token) +{ + return token->security + selinux_blob_sizes.lbs_bpf_token; +} +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _SELINUX_OBJSEC_H_ */ diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h index 7405154e6c42c1..135a969f873cad 100644 --- a/security/selinux/include/policycap.h +++ b/security/selinux/include/policycap.h @@ -17,6 +17,7 @@ enum { POLICYDB_CAP_NETLINK_XPERM, POLICYDB_CAP_NETIF_WILDCARD, POLICYDB_CAP_GENFS_SECLABEL_WILDCARD, + POLICYDB_CAP_FUNCTIONFS_SECLABEL, __POLICYDB_CAP_MAX }; #define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1) diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h index d8962fcf2ff900..ff888288765171 100644 --- a/security/selinux/include/policycap_names.h +++ b/security/selinux/include/policycap_names.h @@ -20,6 +20,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = { "netlink_xperm", "netif_wildcard", "genfs_seclabel_wildcard", + "functionfs_seclabel", }; /* clang-format on */ diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 8201e6a3ac0fc8..0f954a40d3fc74 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -203,10 +203,10 @@ static inline bool selinux_policycap_netlink_xperm(void) selinux_state.policycap[POLICYDB_CAP_NETLINK_XPERM]); } -static inline bool selinux_policycap_netif_wildcard(void) +static inline bool selinux_policycap_functionfs_seclabel(void) { return READ_ONCE( - selinux_state.policycap[POLICYDB_CAP_NETIF_WILDCARD]); + selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]); } struct selinux_policy_convert_data; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 9aa1d03ab6120a..232e087bce3eea 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1203,7 +1203,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info; - char *page = NULL; + char buffer[4]; ssize_t length; ssize_t ret; int cur_enforcing; @@ -1217,27 +1217,19 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, fsi->bool_pending_names[index])) goto out_unlock; - ret = -ENOMEM; - page = (char *)get_zeroed_page(GFP_KERNEL); - if (!page) - goto out_unlock; - cur_enforcing = 
security_get_bool_value(index); if (cur_enforcing < 0) { ret = cur_enforcing; goto out_unlock; } - length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, - fsi->bool_pending_values[index]); + length = scnprintf(buffer, sizeof(buffer), "%d %d", !!cur_enforcing, + !!fsi->bool_pending_values[index]); mutex_unlock(&selinux_state.policy_mutex); - ret = simple_read_from_buffer(buf, count, ppos, page, length); -out_free: - free_page((unsigned long)page); - return ret; + return simple_read_from_buffer(buf, count, ppos, buffer, length); out_unlock: mutex_unlock(&selinux_state.policy_mutex); - goto out_free; + return ret; } static ssize_t sel_write_bool(struct file *filep, const char __user *buf, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0ddea8..fdf2f193a2910e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,11 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); + return 0; } diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a37..48fc59d38ab217 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index c364bd126ac8b3..2d5f4d47071f79 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -44,7 +44,7 @@ static enum hrtimer_restart snd_hrtimer_callback(struct hrtimer *hrt) } /* calculate the drift */ - delta = ktime_sub(hrt->base->get_time(), hrtimer_get_expires(hrt)); + delta = ktime_sub(hrtimer_cb_get_time(hrt), hrtimer_get_expires(hrt)); if (delta > 0) ticks += ktime_divns(delta, ticks * resolution); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 1eab940fa2e5ac..68bee40c9adafd 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -84,19 +84,24 @@ void snd_pcm_group_init(struct snd_pcm_group *group) } /* define group lock helpers */ -#define DEFINE_PCM_GROUP_LOCK(action, mutex_action) \ +#define DEFINE_PCM_GROUP_LOCK(action, bh_lock, bh_unlock, mutex_action) \ static void snd_pcm_group_ ## action(struct snd_pcm_group *group, bool nonatomic) \ { \ - if (nonatomic) \ + if (nonatomic) { \ mutex_ ## mutex_action(&group->mutex); \ - else \ - spin_ ## action(&group->lock); \ -} - -DEFINE_PCM_GROUP_LOCK(lock, lock); -DEFINE_PCM_GROUP_LOCK(unlock, unlock); -DEFINE_PCM_GROUP_LOCK(lock_irq, lock); -DEFINE_PCM_GROUP_LOCK(unlock_irq, unlock); + } else { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && bh_lock) \ + local_bh_disable(); \ + spin_ ## action(&group->lock); \ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && bh_unlock) \ + local_bh_enable(); \ + } \ +} + +DEFINE_PCM_GROUP_LOCK(lock, false, false, lock); +DEFINE_PCM_GROUP_LOCK(unlock, false, false, unlock); +DEFINE_PCM_GROUP_LOCK(lock_irq, true, false, lock); +DEFINE_PCM_GROUP_LOCK(unlock_irq, false, true, unlock); /** * snd_pcm_stream_lock - Lock the PCM stream diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index f23c6b7ae2403c..91cce18901114b 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -210,6 +210,7 @@ 
int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file, poll_table *wait) { poll_wait(file, &f->input_sleep, wait); + guard(spinlock_irq)(&f->lock); return (f->cells > 0); } diff --git a/sound/drivers/serial-generic.c b/sound/drivers/serial-generic.c index 21ae053c057671..766206c6ca75a8 100644 --- a/sound/drivers/serial-generic.c +++ b/sound/drivers/serial-generic.c @@ -37,6 +37,8 @@ MODULE_LICENSE("GPL"); #define SERIAL_TX_STATE_ACTIVE 1 #define SERIAL_TX_STATE_WAKEUP 2 +#define INTERNAL_BUF_SIZE 256 + struct snd_serial_generic { struct serdev_device *serdev; @@ -51,6 +53,7 @@ struct snd_serial_generic { struct work_struct tx_work; unsigned long tx_state; + char tx_buf[INTERNAL_BUF_SIZE]; }; static void snd_serial_generic_tx_wakeup(struct snd_serial_generic *drvdata) @@ -61,11 +64,8 @@ static void snd_serial_generic_tx_wakeup(struct snd_serial_generic *drvdata) schedule_work(&drvdata->tx_work); } -#define INTERNAL_BUF_SIZE 256 - static void snd_serial_generic_tx_work(struct work_struct *work) { - static char buf[INTERNAL_BUF_SIZE]; int num_bytes; struct snd_serial_generic *drvdata = container_of(work, struct snd_serial_generic, tx_work); @@ -78,8 +78,10 @@ static void snd_serial_generic_tx_work(struct work_struct *work) if (!test_bit(SERIAL_MODE_OUTPUT_OPEN, &drvdata->filemode)) break; - num_bytes = snd_rawmidi_transmit_peek(substream, buf, INTERNAL_BUF_SIZE); - num_bytes = serdev_device_write_buf(drvdata->serdev, buf, num_bytes); + num_bytes = snd_rawmidi_transmit_peek(substream, drvdata->tx_buf, + INTERNAL_BUF_SIZE); + num_bytes = serdev_device_write_buf(drvdata->serdev, drvdata->tx_buf, + num_bytes); if (!num_bytes) break; diff --git a/sound/hda/codecs/hdmi/hdmi.c b/sound/hda/codecs/hdmi/hdmi.c index dc38bfd9dba598..111c9b5335afcc 100644 --- a/sound/hda/codecs/hdmi/hdmi.c +++ b/sound/hda/codecs/hdmi/hdmi.c @@ -1549,6 +1549,7 @@ static const struct snd_pci_quirk force_connect_list[] = { SND_PCI_QUIRK(0x103c, 0x83e2, "HP EliteDesk 800 G4", 1), SND_PCI_QUIRK(0x103c, 0x83ef, "HP MP9 G4 Retail System AMS", 1), SND_PCI_QUIRK(0x103c, 0x845a, "HP EliteDesk 800 G4 DM 65W", 1), + SND_PCI_QUIRK(0x103c, 0x83f3, "HP ProDesk 400", 1), SND_PCI_QUIRK(0x103c, 0x870f, "HP", 1), SND_PCI_QUIRK(0x103c, 0x871a, "HP", 1), SND_PCI_QUIRK(0x103c, 0x8711, "HP", 1), diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index e82957453abca7..8620b045023b3f 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -3700,6 +3700,7 @@ enum { ALC236_FIXUP_DELL_DUAL_CODECS, ALC287_FIXUP_CS35L41_I2C_2_THINKPAD_ACPI, ALC287_FIXUP_TAS2781_I2C, + ALC295_FIXUP_DELL_TAS2781_I2C, ALC245_FIXUP_TAS2781_SPI_2, ALC287_FIXUP_TXNW2781_I2C, ALC287_FIXUP_YOGA7_14ARB7_I2C, @@ -5165,6 +5166,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc294_fixup_gx502_hp, }, + [ALC295_FIXUP_DELL_TAS2781_I2C] = { + .type = HDA_FIXUP_FUNC, + .v.func = tas2781_fixup_tias_i2c, + .chained = true, + .chain_id = ALC289_FIXUP_DUAL_SPK + }, [ALC294_FIXUP_ASUS_GU502_PINS] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -6287,8 +6294,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x0c28, "Dell Inspiron 16 Plus 7630", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0c4d, "Dell", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC287_FIXUP_TAS2781_I2C), 
- SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC295_FIXUP_DELL_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC295_FIXUP_DELL_TAS2781_I2C), SND_PCI_QUIRK(0x1028, 0x0cbd, "Dell Oasis 13 CS MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbe, "Dell Oasis 13 2-IN-1 MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbf, "Dell Oasis 13 Low Weight MTU-L", ALC289_FIXUP_DELL_CS35L41_SPI_2), @@ -6467,6 +6474,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8992, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x89a0, "HP Laptop 15-dw4xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89aa, "HP EliteBook 630 G9", ALC236_FIXUP_HP_GPIO_LED), @@ -6477,6 +6485,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x89c6, "Zbook Fury 17 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ca, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x89d3, "HP EliteBook 645 G9 (MB 89D2)", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x89da, "HP Spectre x360 14t-ea100", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x89e7, "HP Elite x2 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a0f, "HP Pavilion 14-ec1xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a20, "HP Laptop 15s-fq5xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), @@ -7070,8 +7079,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), + SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), @@ -7092,6 +7101,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC), SND_PCI_QUIRK(0x17aa, 0x391f, "Yoga S990-16 pro Quad YC Quad", ALC287_FIXUP_TXNW2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3920, "Yoga S990-16 pro Quad VECO Quad", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x3929, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x392b, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), 
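A note for readers on how these SND_PCI_QUIRK() tables are consumed: each entry keys a fixup value on the PCI subsystem vendor/device ID (SSID) of the machine, and the driver resolves the running machine with the generic ALSA lookup helper (the Realtek driver wraps this in its own hda_quirk handling, but the matching logic is the same). Below is a minimal sketch of the pattern, assuming the generic snd_pci_quirk_lookup() API; the table name, fixup IDs and SSIDs are illustrative, not taken from this patch:

#include <linux/pci.h>
#include <sound/core.h>	/* struct snd_pci_quirk, SND_PCI_QUIRK(), snd_pci_quirk_lookup() */

enum { MY_FIXUP_NONE, MY_FIXUP_MUTE_LED, MY_FIXUP_DUAL_SPK };

/* Keyed on PCI SSID: subsystem vendor (e.g. 0x103c = HP), then board ID. */
static const struct snd_pci_quirk my_fixup_tbl[] = {
	SND_PCI_QUIRK(0x103c, 0x1234, "Example HP board", MY_FIXUP_MUTE_LED),
	SND_PCI_QUIRK(0x1028, 0x5678, "Example Dell board", MY_FIXUP_DUAL_SPK),
	{} /* terminator */
};

static int my_pick_fixup(struct pci_dev *pci)
{
	const struct snd_pci_quirk *q = snd_pci_quirk_lookup(pci, my_fixup_tbl);

	return q ? q->value : MY_FIXUP_NONE;
}

This is why the entries in the table only need an SSID pair, a human-readable name and a fixup enum value; the first matching entry in table order wins.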
SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), diff --git a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c index 9ba14c09c07ffe..3cca750857b689 100644 --- a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c +++ b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c @@ -69,7 +69,7 @@ static int cirrus_scodec_test_gpio_set_config(struct gpio_chip *gc, unsigned long config) { switch (pinconf_to_config_param(config)) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_OUTPUT_ENABLE: return -EOPNOTSUPP; default: diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c index d8249d997c2a0b..16d5ea77192f04 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c @@ -135,6 +135,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38C8", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, + { "17AA3929", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA392B", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, {} }; @@ -558,6 +560,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38C8", generic_dsd_config }, { "CSC3551", "17AA38F9", generic_dsd_config }, { "CSC3551", "17AA38FA", generic_dsd_config }, + { "CSC3551", "17AA3929", generic_dsd_config }, + { "CSC3551", "17AA392B", generic_dsd_config }, {} }; diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index 36fa62a41984b9..5bb1c4ebeaf3cb 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -1049,6 +1049,7 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int hid, int id) goto err; } + cs35l56->base.type = hid & 0xff; cs35l56->base.cal_index = -1; cs35l56_init_cs_dsp(&cs35l56->base, &cs35l56->cs_dsp); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c index d10209e4eddd5b..1072f17385ac52 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_i2c.c @@ -27,8 +27,6 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c index f57533d3d728f5..f802c83c57b4ef 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda_spi.c @@ -30,8 +30,6 @@ static int cs35l56_hda_spi_probe(struct spi_device *spi) cs35l56->base.can_hibernate = true; #endif - cs35l56->base.fw_reg = &cs35l56_fw_reg; - cs35l56->base.regmap = devm_regmap_init_spi(spi, &cs35l56_regmap_spi); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c index 
f46d2e06c64f34..96e6d82dc69eb1 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda.c @@ -33,6 +33,23 @@ const efi_guid_t tasdev_fct_efi_guid[] = { }; EXPORT_SYMBOL_NS_GPL(tasdev_fct_efi_guid, "SND_HDA_SCODEC_TAS2781"); +/* + * The calibrated-data writing function expects the values in a slightly + * different order than the one stored in UEFI. Convert the data here to + * match the order expected by the writing function. + */ +static void cali_cnv(unsigned char *data, unsigned int base, int offset) +{ + struct cali_reg reg_data; + + memcpy(&reg_data, &data[base], sizeof(reg_data)); + /* the data order has to be swapped between r0_low_reg and invr0_reg */ + swap(reg_data.r0_low_reg, reg_data.invr0_reg); + + cpu_to_be32_array((__force __be32 *)(data + offset + 1), + (u32 *)&reg_data, TASDEV_CALIB_N); +} + static void tas2781_apply_calib(struct tasdevice_priv *p) { struct calidata *cali_data = &p->cali_data; @@ -103,8 +120,7 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) data[l] = k; oft++; - for (i = 0; i < TASDEV_CALIB_N * 4; i++) - data[l + i + 1] = data[4 * oft + i]; + cali_cnv(data, 4 * oft, l); k++; } } @@ -130,9 +146,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) for (j = p->ndev - 1; j >= 0; j--) { l = j * (cali_data->cali_dat_sz_per_dev + 1); - for (i = TASDEV_CALIB_N * 4; i > 0 ; i--) - data[l + i] = data[p->index * 5 + i]; - data[l+i] = j; + cali_cnv(data, cali_data->cali_dat_sz_per_dev * j, l); + data[l] = j; } } @@ -178,6 +193,11 @@ int tas2781_save_calibration(struct tas2781_hda *hda) efi_status_t status; int i; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + if (hda->catlog_id < LENOVO) efi_guid = tasdev_fct_efi_guid[hda->catlog_id]; diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c index 008dbe1490a7f5..4dea442d8c30e4 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c @@ -317,6 +317,11 @@ static int tas2563_save_calibration(struct tas2781_hda *h) unsigned int attr; int ret, i, j, k; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + cd->cali_dat_sz_per_dev = TAS2563_CAL_DATA_SIZE * TASDEV_CALIB_N; /* extra byte for each device is the device number */ diff --git a/sound/hda/core/intel-dsp-config.c b/sound/hda/core/intel-dsp-config.c index b5917637089f9a..f961e8d8db9792 100644 --- a/sound/hda/core/intel-dsp-config.c +++ b/sound/hda/core/intel-dsp-config.c @@ -673,6 +673,8 @@ static int snd_intel_dsp_check_soundwire(struct pci_dev *pci) int ret; handle = ACPI_HANDLE(&pci->dev); + if (!handle) + return -ENODEV; ret = sdw_intel_acpi_scan(handle, &info); if (ret < 0) diff --git a/sound/pci/asihpi/asihpi.c b/sound/pci/asihpi/asihpi.c index 8419f2b6e5891e..fd0a67b772d1f0 100644 --- a/sound/pci/asihpi/asihpi.c +++ b/sound/pci/asihpi/asihpi.c @@ -982,12 +982,12 @@ static int snd_card_asihpi_playback_open(struct snd_pcm_substream *substream) err = hpi_outstream_open(card->hpi->adapter->index, substream->number, &dpcm->h_stream); hpi_handle_error(err); - if (err) + if (err) { kfree(dpcm); - if (err == HPI_ERROR_OBJ_ALREADY_OPEN) - return -EBUSY; - if (err) + if (err == HPI_ERROR_OBJ_ALREADY_OPEN) + return -EBUSY; return -EIO; + } /*? also check ASI5000 samplerate source If external, only support external rate. 
@@ -1156,12 +1156,12 @@ static int snd_card_asihpi_capture_open(struct snd_pcm_substream *substream) err = hpi_handle_error( hpi_instream_open(card->hpi->adapter->index, substream->number, &dpcm->h_stream)); - if (err) + if (err) { kfree(dpcm); - if (err == HPI_ERROR_OBJ_ALREADY_OPEN) - return -EBUSY; - if (err) + if (err == HPI_ERROR_OBJ_ALREADY_OPEN) + return -EBUSY; return -EIO; + } timer_setup(&dpcm->timer, snd_card_asihpi_timer_function, 0); dpcm->substream = substream; diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c index 617690362ad75c..4ba0a66981ea9d 100644 --- a/sound/soc/amd/acp/acp-i2s.c +++ b/sound/soc/amd/acp/acp-i2s.c @@ -73,7 +73,7 @@ static int acp_i2s_set_fmt(struct snd_soc_dai *cpu_dai, unsigned int fmt) { struct device *dev = cpu_dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); int mode; mode = fmt & SND_SOC_DAIFMT_FORMAT_MASK; @@ -199,7 +199,7 @@ static int acp_i2s_hwparams(struct snd_pcm_substream *substream, struct snd_pcm_ u32 reg_val, fmt_reg, tdm_fmt; u32 lrclk_div_val, bclk_div_val; - chip = dev_get_platdata(dev); + chip = dev_get_drvdata(dev->parent); rsrc = chip->rsrc; /* These values are as per Hardware Spec */ @@ -386,7 +386,7 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; u32 val, period_bytes, reg_val, ier_val, water_val, buf_size, buf_reg; @@ -516,14 +516,13 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct static int acp_i2s_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; struct acp_stream *stream = substream->runtime->private_data; u32 reg_dma_size = 0, reg_fifo_size = 0, reg_fifo_addr = 0; u32 phy_addr = 0, acp_fifo_addr = 0, ext_int_ctrl; unsigned int dir = substream->stream; - chip = dev_get_platdata(dev); switch (dai->driver->id) { case I2S_SP_INSTANCE: if (dir == SNDRV_PCM_STREAM_PLAYBACK) { @@ -632,7 +631,7 @@ static int acp_i2s_startup(struct snd_pcm_substream *substream, struct snd_soc_d { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; unsigned int dir = substream->stream; unsigned int irq_bit = 0; diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index b8d58d2fe326d4..160c07699a8b72 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -178,6 +178,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_NAU8825 imply SND_SOC_HDMI_CODEC imply SND_SOC_PCM1681 + imply SND_SOC_PCM1754 imply SND_SOC_PCM1789_I2C imply SND_SOC_PCM179X_I2C imply SND_SOC_PCM179X_SPI @@ -193,6 +194,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_PCM512x_SPI imply SND_SOC_PCM6240 imply SND_SOC_PEB2466 + imply SND_SOC_PM4125_SDW imply SND_SOC_RK3308 imply SND_SOC_RK3328 imply SND_SOC_RK817 @@ -266,6 +268,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_TAS2770 imply SND_SOC_TAS2780 imply 
SND_SOC_TAS2781_I2C + imply SND_SOC_TAS2783_SDW imply SND_SOC_TAS5086 imply SND_SOC_TAS571X imply SND_SOC_TAS5720 @@ -1442,6 +1445,10 @@ config SND_SOC_PCM1681 tristate "Texas Instruments PCM1681 CODEC" depends on I2C +config SND_SOC_PCM1754 + tristate "Texas Instruments PCM1754 CODEC" + depends on GPIOLIB + config SND_SOC_PCM1789 tristate @@ -1558,6 +1565,23 @@ config SND_SOC_PEB2466 To compile this driver as a module, choose M here: the module will be called snd-soc-peb2466. +config SND_SOC_PM4125 + depends on SND_SOC_PM4125_SDW + tristate + depends on SOUNDWIRE || !SOUNDWIRE + +config SND_SOC_PM4125_SDW + tristate "PM4125 audio codec - SDW" + select SND_SOC_PM4125 + select SND_SOC_WCD_MBHC + select REGMAP_IRQ + depends on SOUNDWIRE + select REGMAP_SOUNDWIRE + help + The PM4125 PMIC has a built-in audio codec IC used with SoCs + such as the QCM2290; it is connected via SoundWire and SPMI. + To compile this codec driver, say Y or M. + config SND_SOC_RK3308 tristate "Rockchip RK3308 audio CODEC" depends on ARM64 || COMPILE_TEST @@ -1918,6 +1942,7 @@ config SND_SOC_SGTL5000 config SND_SOC_SI476X tristate + depends on MFD_SI476X_CORE config SND_SOC_SIGMADSP tristate @@ -2074,6 +2099,19 @@ config SND_SOC_TAS2781_I2C algo coefficient setting, for one, two or even multiple TAS2781 chips. +config SND_SOC_TAS2783_SDW + tristate "Texas Instruments TAS2783 speaker amplifier (sdw)" + depends on SOUNDWIRE + depends on EFI + select REGMAP_SOUNDWIRE + select REGMAP_SOUNDWIRE_MBQ + select CRC32 + help + Enable support for the Texas Instruments TAS2783A digital-input + mono Class-D audio power amplifier with built-in DSP. The TAS2783 + driver implements a flexible and configurable algorithm + coefficient setting for one, two or multiple TAS2783 chips. + config SND_SOC_TAS5086 tristate "Texas Instruments TAS5086 speaker amplifier" depends on I2C @@ -2251,6 +2289,9 @@ config SND_SOC_UDA1380 config SND_SOC_WCD_CLASSH tristate +config SND_SOC_WCD_COMMON + tristate + config SND_SOC_WCD9335 tristate "WCD9335 Codec" depends on SLIMBUS @@ -2272,6 +2313,7 @@ config SND_SOC_WCD934X select REGMAP_IRQ select REGMAP_SLIMBUS select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON select SND_SOC_WCD_MBHC depends on MFD_WCD934X || COMPILE_TEST help @@ -2283,6 +2325,7 @@ config SND_SOC_WCD937X tristate depends on SOUNDWIRE || !SOUNDWIRE select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON config SND_SOC_WCD937X_SDW tristate "WCD9370/WCD9375 Codec - SDW" @@ -2302,6 +2345,7 @@ config SND_SOC_WCD938X tristate depends on SOUNDWIRE || !SOUNDWIRE select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON select MULTIPLEXER config SND_SOC_WCD938X_SDW @@ -2321,6 +2365,7 @@ config SND_SOC_WCD939X depends on SOUNDWIRE || !SOUNDWIRE depends on TYPEC || !TYPEC select SND_SOC_WCD_CLASSH + select SND_SOC_WCD_COMMON config SND_SOC_WCD939X_SDW tristate "WCD9390/WCD9395 Codec - SDW" diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index a476d6c454516f..bd95a7c911d5c1 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -203,6 +203,7 @@ snd-soc-ntp8918-y := ntp8918.o snd-soc-ntpfw-y := ntpfw.o snd-soc-hdmi-codec-y := hdmi-codec.o snd-soc-pcm1681-y := pcm1681.o +snd-soc-pcm1754-y := pcm1754.o snd-soc-pcm1789-codec-y := pcm1789.o snd-soc-pcm1789-i2c-y := pcm1789-i2c.o snd-soc-pcm179x-codec-y := pcm179x.o @@ -224,6 +225,8 @@ snd-soc-pcm512x-i2c-y := pcm512x-i2c.o snd-soc-pcm512x-spi-y := pcm512x-spi.o snd-soc-pcm6240-y := pcm6240.o snd-soc-peb2466-y := peb2466.o +snd-soc-pm4125-y := pm4125.o +snd-soc-pm4125-sdw-y := 
pm4125-sdw.o snd-soc-rk3308-y := rk3308_codec.o snd-soc-rk3328-y := rk3328_codec.o snd-soc-rk817-y := rk817_codec.o @@ -316,6 +319,7 @@ snd-soc-tas2781-comlib-y := tas2781-comlib.o snd-soc-tas2781-comlib-i2c-y := tas2781-comlib-i2c.o snd-soc-tas2781-fmwlib-y := tas2781-fmwlib.o snd-soc-tas2781-i2c-y := tas2781-i2c.o +snd-soc-tas2783-sdw-y := tas2783-sdw.o snd-soc-tfa9879-y := tfa9879.o snd-soc-tfa989x-y := tfa989x.o snd-soc-tlv320adc3xxx-y := tlv320adc3xxx.o @@ -341,6 +345,7 @@ snd-soc-uda1334-y := uda1334.o snd-soc-uda1342-y := uda1342.o snd-soc-uda1380-y := uda1380.o snd-soc-wcd-classh-y := wcd-clsh-v2.o +snd-soc-wcd-common-y := wcd-common.o snd-soc-wcd-mbhc-y := wcd-mbhc-v2.o snd-soc-wcd9335-y := wcd9335.o snd-soc-wcd934x-y := wcd934x.o @@ -624,6 +629,7 @@ obj-$(CONFIG_SND_SOC_NTP8918) += snd-soc-ntp8918.o obj-$(CONFIG_SND_SOC_NTPFW) += snd-soc-ntpfw.o obj-$(CONFIG_SND_SOC_HDMI_CODEC) += snd-soc-hdmi-codec.o obj-$(CONFIG_SND_SOC_PCM1681) += snd-soc-pcm1681.o +obj-$(CONFIG_SND_SOC_PCM1754) += snd-soc-pcm1754.o obj-$(CONFIG_SND_SOC_PCM179X) += snd-soc-pcm179x-codec.o obj-$(CONFIG_SND_SOC_PCM1789_I2C) += snd-soc-pcm1789-i2c.o obj-$(CONFIG_SND_SOC_PCM1789) += snd-soc-pcm1789-codec.o @@ -645,6 +651,12 @@ obj-$(CONFIG_SND_SOC_PCM512x_I2C) += snd-soc-pcm512x-i2c.o obj-$(CONFIG_SND_SOC_PCM512x_SPI) += snd-soc-pcm512x-spi.o obj-$(CONFIG_SND_SOC_PCM6240) += snd-soc-pcm6240.o obj-$(CONFIG_SND_SOC_PEB2466) += snd-soc-peb2466.o +obj-$(CONFIG_SND_SOC_PM4125_SDW) += snd-soc-pm4125-sdw.o +obj-$(CONFIG_SND_SOC_PM4125) += snd-soc-pm4125.o +ifdef CONFIG_SND_SOC_PM4125_SDW +# avoid link failure by forcing sdw code built-in when needed +obj-$(CONFIG_SND_SOC_PM4125) += snd-soc-pm4125-sdw.o +endif obj-$(CONFIG_SND_SOC_RK3308) += snd-soc-rk3308.o obj-$(CONFIG_SND_SOC_RK3328) += snd-soc-rk3328.o obj-$(CONFIG_SND_SOC_RK817) += snd-soc-rk817.o @@ -732,6 +744,7 @@ obj-$(CONFIG_SND_SOC_TAS2781_COMLIB) += snd-soc-tas2781-comlib.o obj-$(CONFIG_SND_SOC_TAS2781_COMLIB_I2C) += snd-soc-tas2781-comlib-i2c.o obj-$(CONFIG_SND_SOC_TAS2781_FMWLIB) += snd-soc-tas2781-fmwlib.o obj-$(CONFIG_SND_SOC_TAS2781_I2C) += snd-soc-tas2781-i2c.o +obj-$(CONFIG_SND_SOC_TAS2783_SDW) += snd-soc-tas2783-sdw.o obj-$(CONFIG_SND_SOC_TAS5086) += snd-soc-tas5086.o obj-$(CONFIG_SND_SOC_TAS571X) += snd-soc-tas571x.o obj-$(CONFIG_SND_SOC_TAS5720) += snd-soc-tas5720.o @@ -764,6 +777,7 @@ obj-$(CONFIG_SND_SOC_UDA1334) += snd-soc-uda1334.o obj-$(CONFIG_SND_SOC_UDA1342) += snd-soc-uda1342.o obj-$(CONFIG_SND_SOC_UDA1380) += snd-soc-uda1380.o obj-$(CONFIG_SND_SOC_WCD_CLASSH) += snd-soc-wcd-classh.o +obj-$(CONFIG_SND_SOC_WCD_COMMON) += snd-soc-wcd-common.o obj-$(CONFIG_SND_SOC_WCD_MBHC) += snd-soc-wcd-mbhc.o obj-$(CONFIG_SND_SOC_WCD9335) += snd-soc-wcd9335.o obj-$(CONFIG_SND_SOC_WCD934X) += snd-soc-wcd934x.o diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 22f9c431a0e550..6b55610ad535d6 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -461,7 +461,11 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#else + int id_gpio = 0; +#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -472,6 +476,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; +#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ if (id_gpio && 
info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); @@ -489,6 +494,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } +#endif /* OK, got both. Now, compare... */ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -529,7 +535,9 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#endif int ret, reading, state, report; bool mic = false; @@ -585,8 +593,10 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) arizona_extcon_hp_clamp(info, false); +#ifdef CONFIG_GPIOLIB_LEGACY if (id_gpio) gpio_set_value_cansleep(id_gpio, 0); +#endif /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1317,6 +1327,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); +#ifdef CONFIG_GPIOLIB_LEGACY if (pdata->micd_pol_gpio > 0) { if (info->micd_modes[0].gpio) mode = GPIOF_OUT_INIT_HIGH; @@ -1332,7 +1343,9 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else { + } else +#endif + { if (info->micd_modes[0].gpio) mode = GPIOD_OUT_HIGH; else @@ -1353,6 +1366,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } } +#ifdef CONFIG_GPIOLIB_LEGACY if (arizona->pdata.hpdet_id_gpio > 0) { ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, GPIOF_OUT_INIT_LOW, @@ -1364,6 +1378,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) return ret; } } +#endif return 0; } diff --git a/sound/soc/codecs/cs-amp-lib-test.c b/sound/soc/codecs/cs-amp-lib-test.c index f53650128fc3d2..2fde8430933830 100644 --- a/sound/soc/codecs/cs-amp-lib-test.c +++ b/sound/soc/codecs/cs-amp-lib-test.c @@ -19,6 +19,14 @@ #include #include +#define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker" +#define LENOVO_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82) + +#define HP_SPEAKER_ID_EFI_NAME L"HPSpeakerID" +#define HP_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) + KUNIT_DEFINE_ACTION_WRAPPER(faux_device_destroy_wrapper, faux_device_destroy, struct faux_device *) @@ -196,8 +204,40 @@ static efi_status_t cs_amp_lib_test_get_efi_variable(efi_char16_t *name, KUNIT_EXPECT_NOT_ERR_OR_NULL(test, guid); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, size); - KUNIT_EXPECT_MEMEQ(test, name, expected_name, sizeof(expected_name)); - KUNIT_EXPECT_MEMEQ(test, guid, &expected_guid, sizeof(expected_guid)); + if (memcmp(name, expected_name, sizeof(expected_name)) || + efi_guidcmp(*guid, expected_guid)) + return EFI_NOT_FOUND; + + if (!buf) { + *size = priv->cal_blob->size; + return EFI_BUFFER_TOO_SMALL; + } + + KUNIT_ASSERT_GE_MSG(test, ksize(buf), priv->cal_blob->size, "Buffer too small"); + + memcpy(buf, priv->cal_blob, priv->cal_blob->size); + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_hp_cal_efi_variable(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + static const efi_char16_t expected_name[] = L"SmartAmpCalibrationData"; + static const efi_guid_t expected_guid = + EFI_GUID(0x53559579, 0x8753, 0x4f5c, 0x91, 0x30, 0xe8, 
0x2a, 0xcf, 0xb8, 0xd8, 0x93); + struct kunit *test = kunit_get_current_test(); + struct cs_amp_lib_test_priv *priv = test->priv; + + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, name); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, guid); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, size); + + if (memcmp(name, expected_name, sizeof(expected_name)) || + efi_guidcmp(*guid, expected_guid)) + return EFI_NOT_FOUND; if (!buf) { *size = priv->cal_blob->size; @@ -211,6 +251,25 @@ return EFI_SUCCESS; } +/* Get cal data block from HP variable. */ +static void cs_amp_lib_test_get_hp_efi_cal(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct cirrus_amp_cal_data result_data; + int ret; + + cs_amp_lib_test_init_dummy_cal_blob(test, 2); + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_hp_cal_efi_variable); + + ret = cs_amp_get_efi_calibration_data(&priv->amp_dev->dev, 0, 0, &result_data); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_MEMEQ(test, &result_data, &priv->cal_blob->data[0], sizeof(result_data)); +} + /* Get cal data block for a given amp, matched by target UID. */ static void cs_amp_lib_test_get_efi_cal_by_uid_test(struct kunit *test) { @@ -642,6 +701,185 @@ static void cs_amp_lib_test_write_cal_data_test(struct kunit *test) KUNIT_EXPECT_EQ(test, entry->value, data.calStatus); } +static void cs_amp_lib_test_spkid_lenovo_not_present(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_none); + + KUNIT_EXPECT_EQ(test, -ENOENT, cs_amp_get_vendor_spkid(dev)); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d0(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID) || + memcmp(name, LENOVO_SPEAKER_ID_EFI_NAME, sizeof(LENOVO_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0xd0; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_d1(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID) || + memcmp(name, LENOVO_SPEAKER_ID_EFI_NAME, sizeof(LENOVO_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0xd1; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_lenovo_00(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + KUNIT_ASSERT_EQ(test, 0, efi_guidcmp(*guid, LENOVO_SPEAKER_ID_EFI_GUID)); + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0; + + return EFI_SUCCESS; +} + +static void cs_amp_lib_test_spkid_lenovo_d0(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_d0); + + KUNIT_EXPECT_EQ(test, 0, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_lenovo_d1(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = 
&priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_d1); + + KUNIT_EXPECT_EQ(test, 1, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_lenovo_illegal(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_lenovo_00); + + KUNIT_EXPECT_LT(test, cs_amp_get_vendor_spkid(dev), 0); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_buf_too_small(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + return EFI_BUFFER_TOO_SMALL; +} + +static void cs_amp_lib_test_spkid_lenovo_oversize(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_buf_too_small); + + KUNIT_EXPECT_LT(test, cs_amp_get_vendor_spkid(dev), 0); +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_hp_30(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, HP_SPEAKER_ID_EFI_GUID) || + memcmp(name, HP_SPEAKER_ID_EFI_NAME, sizeof(HP_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0x30; + + return EFI_SUCCESS; +} + +static efi_status_t cs_amp_lib_test_get_efi_variable_hp_31(efi_char16_t *name, + efi_guid_t *guid, + unsigned long *size, + void *buf) +{ + struct kunit *test = kunit_get_current_test(); + + if (efi_guidcmp(*guid, HP_SPEAKER_ID_EFI_GUID) || + memcmp(name, HP_SPEAKER_ID_EFI_NAME, sizeof(HP_SPEAKER_ID_EFI_NAME))) + return EFI_NOT_FOUND; + + KUNIT_ASSERT_EQ(test, *size, 1); + *size = 1; + *(u8 *)buf = 0x31; + + return EFI_SUCCESS; +} + +static void cs_amp_lib_test_spkid_hp_30(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_hp_30); + + KUNIT_EXPECT_EQ(test, 0, cs_amp_get_vendor_spkid(dev)); +} + +static void cs_amp_lib_test_spkid_hp_31(struct kunit *test) +{ + struct cs_amp_lib_test_priv *priv = test->priv; + struct device *dev = &priv->amp_dev->dev; + + kunit_activate_static_stub(test, + cs_amp_test_hooks->get_efi_variable, + cs_amp_lib_test_get_efi_variable_hp_31); + + KUNIT_EXPECT_EQ(test, 1, cs_amp_get_vendor_spkid(dev)); +} + static int cs_amp_lib_test_case_init(struct kunit *test) { struct cs_amp_lib_test_priv *priv; @@ -722,6 +960,7 @@ static struct kunit_case cs_amp_lib_test_cases[] = { KUNIT_CASE(cs_amp_lib_test_get_efi_cal_no_uid_index_not_found_test), KUNIT_CASE(cs_amp_lib_test_get_efi_cal_no_uid_no_index_test), KUNIT_CASE(cs_amp_lib_test_get_efi_cal_zero_not_matched_test), + KUNIT_CASE(cs_amp_lib_test_get_hp_efi_cal), KUNIT_CASE_PARAM(cs_amp_lib_test_get_efi_cal_by_uid_test, cs_amp_lib_test_get_cal_gen_params), KUNIT_CASE_PARAM(cs_amp_lib_test_get_efi_cal_by_index_unchecked_test, @@ -737,6 +976,15 @@ static struct kunit_case cs_amp_lib_test_cases[] = { /* Tests for writing calibration data */ KUNIT_CASE(cs_amp_lib_test_write_cal_data_test), + /* Test cases for speaker ID */ + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_not_present), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_d0), + 
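A note on the stubbing used throughout these speaker-ID and calibration tests: kunit_activate_static_stub() can only redirect a function that opts in with KUNIT_STATIC_STUB_REDIRECT(), which is what the cs_amp_test_hooks indirection makes possible here. A minimal sketch of the mechanism, assuming the standard <kunit/static_stub.h> API; the function names below are illustrative, not the ones used by cs-amp-lib:

#include <linux/errno.h>
#include <kunit/static_stub.h>
#include <kunit/test.h>

/* Production function: opts in to redirection when running under KUnit. */
static int read_spk_id(int chan)
{
	KUNIT_STATIC_STUB_REDIRECT(read_spk_id, chan);
	return -ENODEV;	/* the real hardware/EFI access would live here */
}

/* Test double standing in for the hardware/EFI access. */
static int fake_read_spk_id(int chan)
{
	return 1;	/* pretend the vendor variable reports speaker ID 1 */
}

static void spk_id_test(struct kunit *test)
{
	/* Redirect read_spk_id() to the fake for the duration of this test. */
	kunit_activate_static_stub(test, read_spk_id, fake_read_spk_id);
	KUNIT_EXPECT_EQ(test, 1, read_spk_id(0));
	/* The stub is torn down automatically when the test case exits. */
}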
KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_d1), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_illegal), + KUNIT_CASE(cs_amp_lib_test_spkid_lenovo_oversize), + KUNIT_CASE(cs_amp_lib_test_spkid_hp_30), + KUNIT_CASE(cs_amp_lib_test_spkid_hp_31), + { } /* terminator */ }; diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 808e67c90f7c72..8434d5196107e9 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -16,10 +16,35 @@ #include #include -#define CS_AMP_CAL_GUID \ +#define CIRRUS_LOGIC_CALIBRATION_EFI_NAME L"CirrusSmartAmpCalibrationData" +#define CIRRUS_LOGIC_CALIBRATION_EFI_GUID \ EFI_GUID(0x02f9af02, 0x7734, 0x4233, 0xb4, 0x3d, 0x93, 0xfe, 0x5a, 0xa3, 0x5d, 0xb3) -#define CS_AMP_CAL_NAME L"CirrusSmartAmpCalibrationData" +#define LENOVO_SPEAKER_ID_EFI_NAME L"SdwSpeaker" +#define LENOVO_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0x48df970e, 0xe27f, 0x460a, 0xb5, 0x86, 0x77, 0x19, 0x80, 0x1d, 0x92, 0x82) + +#define HP_SPEAKER_ID_EFI_NAME L"HPSpeakerID" +#define HP_SPEAKER_ID_EFI_GUID \ + EFI_GUID(0xc49593a4, 0xd099, 0x419b, 0xa2, 0xc3, 0x67, 0xe9, 0x80, 0xe6, 0x1d, 0x1e) + +#define HP_CALIBRATION_EFI_NAME L"SmartAmpCalibrationData" +#define HP_CALIBRATION_EFI_GUID \ + EFI_GUID(0x53559579, 0x8753, 0x4f5c, 0x91, 0x30, 0xe8, 0x2a, 0xcf, 0xb8, 0xd8, 0x93) + +static const struct cs_amp_lib_cal_efivar { + efi_char16_t *name; + efi_guid_t *guid; +} cs_amp_lib_cal_efivars[] = { + { + .name = HP_CALIBRATION_EFI_NAME, + .guid = &HP_CALIBRATION_EFI_GUID, + }, + { + .name = CIRRUS_LOGIC_CALIBRATION_EFI_NAME, + .guid = &CIRRUS_LOGIC_CALIBRATION_EFI_GUID, + }, +}; static int cs_amp_write_cal_coeff(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, @@ -115,16 +140,41 @@ static efi_status_t cs_amp_get_efi_variable(efi_char16_t *name, return EFI_NOT_FOUND; } +static int cs_amp_convert_efi_status(efi_status_t status) +{ + switch (status) { + case EFI_SUCCESS: + return 0; + case EFI_NOT_FOUND: + return -ENOENT; + case EFI_BUFFER_TOO_SMALL: + return -EFBIG; + case EFI_UNSUPPORTED: + case EFI_ACCESS_DENIED: + case EFI_SECURITY_VIOLATION: + return -EACCES; + default: + return -EIO; + } +} + static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) { struct cirrus_amp_efi_data *efi_data; unsigned long data_size = 0; u8 *data; efi_status_t status; - int ret; + int i, ret; + + /* Find EFI variable and get size */ + for (i = 0; i < ARRAY_SIZE(cs_amp_lib_cal_efivars); i++) { + status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name, + cs_amp_lib_cal_efivars[i].guid, + &data_size, NULL); + if (status == EFI_BUFFER_TOO_SMALL) + break; + } - /* Get real size of UEFI variable */ - status = cs_amp_get_efi_variable(CS_AMP_CAL_NAME, &CS_AMP_CAL_GUID, &data_size, NULL); if (status != EFI_BUFFER_TOO_SMALL) return ERR_PTR(-ENOENT); @@ -138,7 +188,9 @@ static struct cirrus_amp_efi_data *cs_amp_get_cal_efi_buffer(struct device *dev) if (!data) return ERR_PTR(-ENOMEM); - status = cs_amp_get_efi_variable(CS_AMP_CAL_NAME, &CS_AMP_CAL_GUID, &data_size, data); + status = cs_amp_get_efi_variable(cs_amp_lib_cal_efivars[i].name, + cs_amp_lib_cal_efivars[i].guid, + &data_size, data); if (status != EFI_SUCCESS) { ret = -EINVAL; goto err; @@ -273,6 +325,81 @@ int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_ } EXPORT_SYMBOL_NS_GPL(cs_amp_get_efi_calibration_data, "SND_SOC_CS_AMP_LIB"); +struct cs_amp_spkid_efi { + efi_char16_t *name; + efi_guid_t *guid; + u8 values[2]; +}; + +static int cs_amp_get_efi_byte_spkid(struct 
device *dev, const struct cs_amp_spkid_efi *info) +{ + efi_status_t status; + unsigned long size; + u8 spkid; + int i, ret; + + size = sizeof(spkid); + status = cs_amp_get_efi_variable(info->name, info->guid, &size, &spkid); + ret = cs_amp_convert_efi_status(status); + if (ret < 0) + return ret; + + if (size == 0) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(info->values); i++) { + if (info->values[i] == spkid) + return i; + } + + dev_err(dev, "EFI speaker ID bad value %#x\n", spkid); + + return -EINVAL; +} + +static const struct cs_amp_spkid_efi cs_amp_spkid_byte_types[] = { + { + .name = LENOVO_SPEAKER_ID_EFI_NAME, + .guid = &LENOVO_SPEAKER_ID_EFI_GUID, + .values = { 0xd0, 0xd1 }, + }, + { + .name = HP_SPEAKER_ID_EFI_NAME, + .guid = &HP_SPEAKER_ID_EFI_GUID, + .values = { 0x30, 0x31 }, + }, +}; + +/** + * cs_amp_get_vendor_spkid - get a speaker ID from vendor-specific storage + * @dev: pointer to struct device + * + * Known vendor-specific methods of speaker ID are checked and if one is + * found its speaker ID value is returned. + * + * Return: >=0 is a valid speaker ID. -ENOENT if a vendor-specific method + * was not found. -EACCES if the vendor-specific storage could not + * be read. Other error values indicate that the data from the + * vendor-specific storage was found but could not be understood. + */ +int cs_amp_get_vendor_spkid(struct device *dev) +{ + int i, ret; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE) && + !IS_ENABLED(CONFIG_SND_SOC_CS_AMP_LIB_TEST)) + return -ENOENT; + + for (i = 0; i < ARRAY_SIZE(cs_amp_spkid_byte_types); i++) { + ret = cs_amp_get_efi_byte_spkid(dev, &cs_amp_spkid_byte_types[i]); + if (ret != -ENOENT) + return ret; + } + + return -ENOENT; +} +EXPORT_SYMBOL_NS_GPL(cs_amp_get_vendor_spkid, "SND_SOC_CS_AMP_LIB"); + static const struct cs_amp_test_hooks cs_amp_test_hook_ptrs = { .get_efi_variable = cs_amp_get_efi_variable, .write_cal_coeff = cs_amp_write_cal_coeff, diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c index 224d65987a8df3..173d7c59b7254d 100644 --- a/sound/soc/codecs/cs35l41.c +++ b/sound/soc/codecs/cs35l41.c @@ -7,6 +7,7 @@ // Author: David Rhodes #include +#include #include #include #include @@ -1147,45 +1148,55 @@ static int cs35l41_dsp_init(struct cs35l41_private *cs35l41) return ret; } -#ifdef CONFIG_ACPI -static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) +static int cs35l41_get_system_name(struct cs35l41_private *cs35l41) { struct acpi_device *adev = ACPI_COMPANION(cs35l41->dev); - acpi_handle handle = acpi_device_handle(adev); - const char *hid; - const char *sub; - - /* If there is no acpi_device, there is no ACPI for this system, return 0 */ - if (!adev) - return 0; + const char *sub = NULL; + const char *tmp; + int ret = 0; - sub = acpi_get_subsystem_id(handle); - if (IS_ERR(sub)) { - /* If no _SUB, fallback to _HID, otherwise fail */ - if (PTR_ERR(sub) == -ENODATA) { - hid = acpi_device_hid(adev); - /* If dummy hid, return 0 and fallback to legacy firmware path */ - if (!strcmp(hid, "device")) - return 0; - sub = kstrdup(hid, GFP_KERNEL); - if (!sub) - sub = ERR_PTR(-ENOMEM); - - } else - return PTR_ERR(sub); + /* If there is no acpi_device, there is no ACPI for this system, skip checking ACPI */ + if (adev) { + acpi_handle handle = acpi_device_handle(adev); + + sub = acpi_get_subsystem_id(handle); + ret = PTR_ERR_OR_ZERO(sub); + if (ret) { + sub = NULL; + /* If no _SUB, fallback to _HID, otherwise fail */ + if (ret == -ENODATA) { + tmp = acpi_device_hid(adev); + /* If dummy 
hid, return 0 and fallback to legacy firmware path */ + if (!strcmp(tmp, "device")) { + ret = 0; + goto err; + } + sub = kstrdup(tmp, GFP_KERNEL); + if (!sub) { + ret = -ENOMEM; + goto err; + } + } + } + } else { + if (!device_property_read_string(cs35l41->dev, "cirrus,subsystem-id", &tmp)) { + sub = kstrdup(tmp, GFP_KERNEL); + if (!sub) { + ret = -ENOMEM; + goto err; + } + } } - cs35l41->dsp.system_name = sub; - dev_dbg(cs35l41->dev, "Subsystem ID: %s\n", cs35l41->dsp.system_name); +err: + if (sub) { + cs35l41->dsp.system_name = sub; + dev_info(cs35l41->dev, "Subsystem ID: %s\n", cs35l41->dsp.system_name); + } else + dev_warn(cs35l41->dev, "Subsystem ID not found\n"); - return 0; -} -#else -static int cs35l41_acpi_get_name(struct cs35l41_private *cs35l41) -{ - return 0; + return ret; } -#endif /* CONFIG_ACPI */ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg *hw_cfg) { @@ -1317,7 +1328,7 @@ int cs35l41_probe(struct cs35l41_private *cs35l41, const struct cs35l41_hw_cfg * goto err; } - ret = cs35l41_acpi_get_name(cs35l41); + ret = cs35l41_get_system_name(cs35l41); if (ret < 0) goto err; diff --git a/sound/soc/codecs/cs35l56-i2c.c b/sound/soc/codecs/cs35l56-i2c.c index 073f1796ae2910..0492ddc4102d80 100644 --- a/sound/soc/codecs/cs35l56-i2c.c +++ b/sound/soc/codecs/cs35l56-i2c.c @@ -35,11 +35,11 @@ static int cs35l56_i2c_probe(struct i2c_client *client) switch (id) { case 0x3556: regmap_config = &cs35l56_regmap_i2c; - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; break; case 0x3563: regmap_config = &cs35l63_regmap_i2c; - cs35l56->base.fw_reg = &cs35l63_fw_reg; + cs35l56->base.type = 0x63; break; default: return -ENODEV; diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 3905c9cb188a8c..42d24ac2977fc8 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -527,16 +527,16 @@ static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_devi case 0x3556: case 0x3557: regmap_config = &cs35l56_regmap_sdw; - cs35l56->base.fw_reg = &cs35l56_fw_reg; break; case 0x3563: regmap_config = &cs35l63_regmap_sdw; - cs35l56->base.fw_reg = &cs35l63_fw_reg; break; default: return -ENODEV; } + cs35l56->base.type = ((unsigned int)id->driver_data) & 0xff; + cs35l56->base.regmap = devm_regmap_init(dev, &cs35l56_regmap_bus_sdw, peripheral, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 850fcf38599681..9e6b9ca2f3547d 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -309,6 +309,58 @@ static bool cs35l63_volatile_reg(struct device *dev, unsigned int reg) } } +static const struct cs35l56_fw_reg cs35l56_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L56_MAIN_POSTURE_NUMBER, +}; + +static const struct cs35l56_fw_reg cs35l56_b2_fw_reg = { + .fw_ver = CS35L56_DSP1_FW_VER, + .halo_state = CS35L56_B2_DSP1_HALO_STATE, + .pm_cur_stat = CS35L56_B2_DSP1_PM_CUR_STATE, + .prot_sts = CS35L56_PROTECTION_STATUS, + .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, + .posture_number = 
CS35L56_MAIN_POSTURE_NUMBER, +}; + +static const struct cs35l56_fw_reg cs35l63_fw_reg = { + .fw_ver = CS35L63_DSP1_FW_VER, + .halo_state = CS35L63_DSP1_HALO_STATE, + .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, + .prot_sts = CS35L63_PROTECTION_STATUS, + .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, + .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, + .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, + .posture_number = CS35L63_MAIN_POSTURE_NUMBER, +}; + +static void cs35l56_set_fw_reg_table(struct cs35l56_base *cs35l56_base) +{ + switch (cs35l56_base->type) { + default: + switch (cs35l56_base->rev) { + case 0xb0: + cs35l56_base->fw_reg = &cs35l56_fw_reg; + break; + default: + cs35l56_base->fw_reg = &cs35l56_b2_fw_reg; + break; + } + break; + case 0x63: + cs35l56_base->fw_reg = &cs35l63_fw_reg; + break; + } +} + int cs35l56_mbox_send(struct cs35l56_base *cs35l56_base, unsigned int command) { unsigned int val; @@ -468,6 +520,11 @@ static const struct reg_sequence cs35l56_system_reset_seq[] = { REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), }; +static const struct reg_sequence cs35l56_b2_system_reset_seq[] = { + REG_SEQ0(CS35L56_B2_DSP1_HALO_STATE, 0), + REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), +}; + static const struct reg_sequence cs35l63_system_reset_seq[] = { REG_SEQ0(CS35L63_DSP1_HALO_STATE, 0), REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), @@ -490,9 +547,18 @@ void cs35l56_system_reset(struct cs35l56_base *cs35l56_base, bool is_soundwire) case 0x54: case 0x56: case 0x57: - regmap_multi_reg_write_bypassed(cs35l56_base->regmap, - cs35l56_system_reset_seq, - ARRAY_SIZE(cs35l56_system_reset_seq)); + switch (cs35l56_base->rev) { + case 0xb0: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_system_reset_seq, + ARRAY_SIZE(cs35l56_system_reset_seq)); + break; + default: + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, + cs35l56_b2_system_reset_seq, + ARRAY_SIZE(cs35l56_b2_system_reset_seq)); + break; + } break; case 0x63: regmap_multi_reg_write_bypassed(cs35l56_base->regmap, @@ -979,6 +1045,7 @@ int cs35l56_hw_init(struct cs35l56_base *cs35l56_base) return ret; } cs35l56_base->rev = revid & (CS35L56_AREVID_MASK | CS35L56_MTLREVID_MASK); + cs35l56_set_fw_reg_table(cs35l56_base); ret = cs35l56_wait_for_firmware_boot(cs35l56_base); if (ret) @@ -1054,7 +1121,17 @@ int cs35l56_get_speaker_id(struct cs35l56_base *cs35l56_base) u32 speaker_id; int i, ret; - /* Attempt to read the speaker type from a device property first */ + /* Check for vendor-specific speaker ID method */ + ret = cs_amp_get_vendor_spkid(cs35l56_base->dev); + if (ret >= 0) { + dev_dbg(cs35l56_base->dev, "Vendor Speaker ID = %d\n", ret); + return ret; + } else if (ret != -ENOENT) { + dev_err(cs35l56_base->dev, "Error getting vendor Speaker ID: %d\n", ret); + return ret; + } + + /* Attempt to read the speaker type from a device property */ ret = device_property_read_u32(cs35l56_base->dev, "cirrus,speaker-id", &speaker_id); if (!ret) { dev_dbg(cs35l56_base->dev, "Speaker ID = %d\n", speaker_id); @@ -1270,30 +1347,6 @@ const struct regmap_config cs35l63_regmap_sdw = { }; EXPORT_SYMBOL_NS_GPL(cs35l63_regmap_sdw, "SND_SOC_CS35L56_SHARED"); -const struct cs35l56_fw_reg cs35l56_fw_reg = { - .fw_ver = CS35L56_DSP1_FW_VER, - .halo_state = CS35L56_DSP1_HALO_STATE, - .pm_cur_stat = CS35L56_DSP1_PM_CUR_STATE, - .prot_sts = CS35L56_PROTECTION_STATUS, - .transducer_actual_ps = CS35L56_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L56_MAIN_RENDER_USER_MUTE, 
- .user_volume = CS35L56_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L56_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l56_fw_reg, "SND_SOC_CS35L56_SHARED"); - -const struct cs35l56_fw_reg cs35l63_fw_reg = { - .fw_ver = CS35L63_DSP1_FW_VER, - .halo_state = CS35L63_DSP1_HALO_STATE, - .pm_cur_stat = CS35L63_DSP1_PM_CUR_STATE, - .prot_sts = CS35L63_PROTECTION_STATUS, - .transducer_actual_ps = CS35L63_TRANSDUCER_ACTUAL_PS, - .user_mute = CS35L63_MAIN_RENDER_USER_MUTE, - .user_volume = CS35L63_MAIN_RENDER_USER_VOLUME, - .posture_number = CS35L63_MAIN_POSTURE_NUMBER, -}; -EXPORT_SYMBOL_NS_GPL(cs35l63_fw_reg, "SND_SOC_CS35L56_SHARED"); - MODULE_DESCRIPTION("ASoC CS35L56 Shared"); MODULE_AUTHOR("Richard Fitzgerald "); MODULE_AUTHOR("Simon Trimmer "); diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c index c2ddee22cd231b..9bc9b7c98390dc 100644 --- a/sound/soc/codecs/cs35l56-spi.c +++ b/sound/soc/codecs/cs35l56-spi.c @@ -26,7 +26,7 @@ static int cs35l56_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, cs35l56); - cs35l56->base.fw_reg = &cs35l56_fw_reg; + cs35l56->base.type = 0x56; cs35l56->base.regmap = devm_regmap_init_spi(spi, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { diff --git a/sound/soc/codecs/da7213.c b/sound/soc/codecs/da7213.c index a4496cc26902b8..ae89260ca215ff 100644 --- a/sound/soc/codecs/da7213.c +++ b/sound/soc/codecs/da7213.c @@ -2247,10 +2247,8 @@ static int da7213_runtime_resume(struct device *dev) return regcache_sync(da7213->regmap); } -static const struct dev_pm_ops da7213_pm = { - RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL) - SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) -}; +static DEFINE_RUNTIME_DEV_PM_OPS(da7213_pm, da7213_runtime_suspend, + da7213_runtime_resume, NULL); static const struct i2c_device_id da7213_i2c_id[] = { { "da7213" }, diff --git a/sound/soc/codecs/pcm1754.c b/sound/soc/codecs/pcm1754.c new file mode 100644 index 00000000000000..b68a528000be89 --- /dev/null +++ b/sound/soc/codecs/pcm1754.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PCM1754 DAC ASoC codec driver + * + * Copyright (c) 2022 Alvin Šipraga + * Copyright (c) 2025 Stefan Kerkmann + */ + +#include +#include +#include + +#include +#include + +struct pcm1754_priv { + unsigned int format; + struct gpio_desc *gpiod_mute; + struct gpio_desc *gpiod_format; +}; + +static int pcm1754_set_dai_fmt(struct snd_soc_dai *codec_dai, + unsigned int format) +{ + struct snd_soc_component *component = codec_dai->component; + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(component); + + priv->format = format; + + return 0; +} + +static int pcm1754_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *codec_dai) +{ + struct snd_soc_component *component = codec_dai->component; + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(component); + int format; + + switch (priv->format & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_RIGHT_J: + switch (params_width(params)) { + case 16: + format = 1; + break; + default: + return -EINVAL; + } + break; + case SND_SOC_DAIFMT_I2S: + switch (params_width(params)) { + case 16: + fallthrough; + case 24: + format = 0; + break; + default: + return -EINVAL; + } + break; + default: + dev_err(component->dev, "Invalid DAI format\n"); + return -EINVAL; + } + + gpiod_set_value_cansleep(priv->gpiod_format, format); + + return 0; +} + +static int pcm1754_mute_stream(struct snd_soc_dai *dai, 
int mute, int stream) +{ + struct pcm1754_priv *priv = snd_soc_component_get_drvdata(dai->component); + + gpiod_set_value_cansleep(priv->gpiod_mute, mute); + + return 0; +} + +static const struct snd_soc_dai_ops pcm1754_dai_ops = { + .set_fmt = pcm1754_set_dai_fmt, + .hw_params = pcm1754_hw_params, + .mute_stream = pcm1754_mute_stream, +}; + +static const struct snd_soc_dai_driver pcm1754_dai = { + .name = "pcm1754", + .playback = { + .stream_name = "Playback", + .channels_min = 2, + .channels_max = 2, + .rates = SNDRV_PCM_RATE_CONTINUOUS, + .rate_min = 5000, + .rate_max = 200000, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE + }, + .ops = &pcm1754_dai_ops, +}; + +static const struct snd_soc_dapm_widget pcm1754_dapm_widgets[] = { + SND_SOC_DAPM_REGULATOR_SUPPLY("VCC", 0, 0), + + SND_SOC_DAPM_DAC("DAC1", "Channel 1 Playback", SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_DAC("DAC2", "Channel 2 Playback", SND_SOC_NOPM, 0, 0), + + SND_SOC_DAPM_OUTPUT("VOUTL"), + SND_SOC_DAPM_OUTPUT("VOUTR"), +}; + +static const struct snd_soc_dapm_route pcm1754_dapm_routes[] = { + { "DAC1", NULL, "Playback" }, + { "DAC2", NULL, "Playback" }, + + { "DAC1", NULL, "VCC" }, + { "DAC2", NULL, "VCC" }, + + { "VOUTL", NULL, "DAC1" }, + { "VOUTR", NULL, "DAC2" }, +}; + +static const struct snd_soc_component_driver soc_component_dev_pcm1754 = { + .dapm_widgets = pcm1754_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(pcm1754_dapm_widgets), + .dapm_routes = pcm1754_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(pcm1754_dapm_routes), +}; + +static int pcm1754_probe(struct platform_device *pdev) +{ + struct pcm1754_priv *priv; + struct device *dev = &pdev->dev; + struct snd_soc_dai_driver *dai_drv; + int ret; + + dai_drv = devm_kmemdup(dev, &pcm1754_dai, sizeof(*dai_drv), GFP_KERNEL); + if (!dai_drv) + return -ENOMEM; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->gpiod_mute = devm_gpiod_get_optional(dev, "mute", GPIOD_OUT_HIGH); + if (IS_ERR(priv->gpiod_mute)) + return dev_err_probe(dev, PTR_ERR(priv->gpiod_mute), + "failed to get mute gpio"); + + priv->gpiod_format = devm_gpiod_get_optional(dev, "format", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpiod_format)) + return dev_err_probe(dev, PTR_ERR(priv->gpiod_format), + "failed to get format gpio"); + + dev_set_drvdata(dev, priv); + + ret = devm_snd_soc_register_component( + &pdev->dev, &soc_component_dev_pcm1754, dai_drv, 1); + if (ret) + return dev_err_probe(dev, ret, "failed to register"); + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id pcm1754_of_match[] = { + { .compatible = "ti,pcm1754" }, + { } +}; +MODULE_DEVICE_TABLE(of, pcm1754_of_match); +#endif + +static struct platform_driver pcm1754_codec_driver = { + .driver = { + .name = "pcm1754-codec", + .of_match_table = of_match_ptr(pcm1754_of_match), + }, + .probe = pcm1754_probe, +}; + +module_platform_driver(pcm1754_codec_driver); + +MODULE_DESCRIPTION("ASoC PCM1754 driver"); +MODULE_AUTHOR("Alvin Šipraga "); +MODULE_AUTHOR("Stefan Kerkmann "); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/pm4125-sdw.c b/sound/soc/codecs/pm4125-sdw.c new file mode 100644 index 00000000000000..4ed09fbe3f545a --- /dev/null +++ b/sound/soc/codecs/pm4125-sdw.c @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// Copyright, 2025 Linaro Ltd + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pm4125.h" + +static struct pm4125_sdw_ch_info pm4125_sdw_rx_ch_info[] = { + WCD_SDW_CH(PM4125_HPH_L, PM4125_HPH_PORT, BIT(0)), + WCD_SDW_CH(PM4125_HPH_R, PM4125_HPH_PORT, BIT(1)), +}; + +static struct pm4125_sdw_ch_info pm4125_sdw_tx_ch_info[] = { + WCD_SDW_CH(PM4125_ADC1, PM4125_ADC_1_2_DMIC1L_BCS_PORT, BIT(0)), + WCD_SDW_CH(PM4125_ADC2, PM4125_ADC_1_2_DMIC1L_BCS_PORT, BIT(1)), +}; + +static struct sdw_dpn_prop pm4125_dpn_prop[PM4125_MAX_SWR_PORTS] = { + { + .num = 1, + .type = SDW_DPN_SIMPLE, + .min_ch = 1, + .max_ch = 8, + .simple_ch_prep_sm = true, + }, { + .num = 2, + .type = SDW_DPN_SIMPLE, + .min_ch = 1, + .max_ch = 4, + .simple_ch_prep_sm = true, + } +}; + +struct device *pm4125_sdw_device_get(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(pm4125_sdw_device_get); + +int pm4125_sdw_hw_params(struct pm4125_sdw_priv *priv, struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + struct sdw_port_config port_config[PM4125_MAX_SWR_PORTS]; + unsigned long ch_mask; + int i, j; + + priv->sconfig.ch_count = 1; + priv->active_ports = 0; + for (i = 0; i < PM4125_MAX_SWR_PORTS; i++) { + ch_mask = priv->port_config[i].ch_mask; + if (!ch_mask) + continue; + + for_each_set_bit(j, &ch_mask, 4) + priv->sconfig.ch_count++; + + port_config[priv->active_ports] = priv->port_config[i]; + priv->active_ports++; + } + + priv->sconfig.bps = 1; + priv->sconfig.frame_rate = params_rate(params); + priv->sconfig.direction = priv->is_tx ? SDW_DATA_DIR_TX : SDW_DATA_DIR_RX; + priv->sconfig.type = SDW_STREAM_PCM; + + return sdw_stream_add_slave(priv->sdev, &priv->sconfig, &port_config[0], priv->active_ports, + priv->sruntime); +} +EXPORT_SYMBOL_GPL(pm4125_sdw_hw_params); + +static int pm4125_update_status(struct sdw_slave *slave, enum sdw_slave_status status) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(&slave->dev); + + if (priv->regmap && status == SDW_SLAVE_ATTACHED) { + /* Write out any cached changes that happened between probe and attach */ + regcache_cache_only(priv->regmap, false); + return regcache_sync(priv->regmap); + } + + return 0; +} + +/* + * Handle Soundwire out-of-band interrupt event by triggering the first irq of the slave_irq + * irq domain, which then will be handled by the regmap_irq threaded irq. + * Looping is to ensure no interrupts were missed in the process. 
+ */ +static int pm4125_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(&slave->dev); + struct irq_domain *slave_irq = priv->slave_irq; + u32 sts1, sts2, sts3; + + do { + handle_nested_irq(irq_find_mapping(slave_irq, 0)); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_0, &sts1); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_1, &sts2); + regmap_read(priv->regmap, PM4125_DIG_SWR_INTR_STATUS_2, &sts3); + + } while (sts1 || sts2 || sts3); + + return IRQ_HANDLED; +} + +static const struct reg_default pm4125_defaults[] = { + { PM4125_ANA_MICBIAS_MICB_1_2_EN, 0x01 }, + { PM4125_ANA_MICBIAS_MICB_3_EN, 0x00 }, + { PM4125_ANA_MICBIAS_LDO_1_SETTING, 0x21 }, + { PM4125_ANA_MICBIAS_LDO_1_CTRL, 0x01 }, + { PM4125_ANA_TX_AMIC1, 0x00 }, + { PM4125_ANA_TX_AMIC2, 0x00 }, + { PM4125_ANA_MBHC_MECH, 0x39 }, + { PM4125_ANA_MBHC_ELECT, 0x08 }, + { PM4125_ANA_MBHC_ZDET, 0x10 }, + { PM4125_ANA_MBHC_RESULT_1, 0x00 }, + { PM4125_ANA_MBHC_RESULT_2, 0x00 }, + { PM4125_ANA_MBHC_RESULT_3, 0x00 }, + { PM4125_ANA_MBHC_BTN0_ZDET_VREF1, 0x00 }, + { PM4125_ANA_MBHC_BTN1_ZDET_VREF2, 0x10 }, + { PM4125_ANA_MBHC_BTN2_ZDET_VREF3, 0x20 }, + { PM4125_ANA_MBHC_BTN3_ZDET_DBG_400, 0x30 }, + { PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400, 0x40 }, + { PM4125_ANA_MBHC_MICB2_RAMP, 0x00 }, + { PM4125_ANA_MBHC_CTL_1, 0x02 }, + { PM4125_ANA_MBHC_CTL_2, 0x05 }, + { PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0xE9 }, + { PM4125_ANA_MBHC_ZDET_ANA_CTL, 0x0F }, + { PM4125_ANA_MBHC_ZDET_RAMP_CTL, 0x00 }, + { PM4125_ANA_MBHC_FSM_STATUS, 0x00 }, + { PM4125_ANA_MBHC_ADC_RESULT, 0x00 }, + { PM4125_ANA_MBHC_CTL_CLK, 0x30 }, + { PM4125_ANA_MBHC_ZDET_CALIB_RESULT, 0x00 }, + { PM4125_ANA_NCP_EN, 0x00 }, + { PM4125_ANA_NCP_VCTRL, 0xA7 }, + { PM4125_ANA_HPHPA_CNP_CTL_1, 0x54 }, + { PM4125_ANA_HPHPA_CNP_CTL_2, 0x2B }, + { PM4125_ANA_HPHPA_PA_STATUS, 0x00 }, + { PM4125_ANA_HPHPA_FSM_CLK, 0x12 }, + { PM4125_ANA_HPHPA_L_GAIN, 0x00 }, + { PM4125_ANA_HPHPA_R_GAIN, 0x00 }, + { PM4125_SWR_HPHPA_HD2, 0x1B }, + { PM4125_ANA_HPHPA_SPARE_CTL, 0x02 }, + { PM4125_ANA_SURGE_EN, 0x38 }, + { PM4125_ANA_COMBOPA_CTL, 0x35 }, + { PM4125_ANA_COMBOPA_CTL_4, 0x84 }, + { PM4125_ANA_COMBOPA_CTL_5, 0x05 }, + { PM4125_ANA_RXLDO_CTL, 0x86 }, + { PM4125_ANA_MBIAS_EN, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID0, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID1, 0x00 }, + { PM4125_DIG_SWR_CHIP_ID2, 0x0C }, + { PM4125_DIG_SWR_CHIP_ID3, 0x01 }, + { PM4125_DIG_SWR_SWR_TX_CLK_RATE, 0x00 }, + { PM4125_DIG_SWR_CDC_RST_CTL, 0x03 }, + { PM4125_DIG_SWR_TOP_CLK_CFG, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_CLK_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_TX_CLK_CTL, 0x33 }, + { PM4125_DIG_SWR_SWR_RST_EN, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_RX0_CTL, 0xFC }, + { PM4125_DIG_SWR_CDC_RX1_CTL, 0xFC }, + { PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, 0x00 }, + { PM4125_DIG_SWR_CDC_COMP_CTL_0, 0x00 }, + { PM4125_DIG_SWR_CDC_RX_DELAY_CTL, 0x66 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_0, 0x55 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_1, 0xA9 }, + { PM4125_DIG_SWR_CDC_RX_GAIN_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_TX0_CTL, 0x68 }, + { PM4125_DIG_SWR_CDC_TX1_CTL, 0x68 }, + { PM4125_DIG_SWR_CDC_TX_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_REQ0_CTL, 0x01 }, + { PM4125_DIG_SWR_CDC_REQ1_CTL, 0x01 }, + { PM4125_DIG_SWR_CDC_RST, 0x00 }, + { PM4125_DIG_SWR_CDC_AMIC_CTL, 0x02 }, + { PM4125_DIG_SWR_CDC_DMIC_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_DMIC1_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_DMIC1_RATE, 0x01 }, + { PM4125_DIG_SWR_PDM_WD_CTL0, 0x00 }, + { 
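/*
 * Context sketch (assumptions flagged, not part of this patch): these
 * reg_defaults seed the regcache. Because the TX probe path below puts the
 * map into cache-only mode until the slave enumerates, an early write such
 * as
 *
 *	regmap_write(priv->regmap, PM4125_DIG_SWR_INTR_MASK_0, 0xff);
 *
 * only lands in the cache; pm4125_update_status() later clears cache-only
 * mode and regcache_sync() replays whatever differs from the defaults
 * listed here.
 */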
PM4125_DIG_SWR_PDM_WD_CTL1, 0x00 }, + { PM4125_DIG_SWR_INTR_MODE, 0x00 }, + { PM4125_DIG_SWR_INTR_MASK_0, 0xFF }, + { PM4125_DIG_SWR_INTR_MASK_1, 0x7F }, + { PM4125_DIG_SWR_INTR_MASK_2, 0x0C }, + { PM4125_DIG_SWR_INTR_STATUS_0, 0x00 }, + { PM4125_DIG_SWR_INTR_STATUS_1, 0x00 }, + { PM4125_DIG_SWR_INTR_STATUS_2, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_0, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_1, 0x00 }, + { PM4125_DIG_SWR_INTR_CLEAR_2, 0x00 }, + { PM4125_DIG_SWR_INTR_LEVEL_0, 0x00 }, + { PM4125_DIG_SWR_INTR_LEVEL_1, 0x2A }, + { PM4125_DIG_SWR_INTR_LEVEL_2, 0x00 }, + { PM4125_DIG_SWR_CDC_CONN_RX0_CTL, 0x00 }, + { PM4125_DIG_SWR_CDC_CONN_RX1_CTL, 0x00 }, + { PM4125_DIG_SWR_LOOP_BACK_MODE, 0x00 }, + { PM4125_DIG_SWR_DRIVE_STRENGTH_0, 0x00 }, + { PM4125_DIG_SWR_DIG_DEBUG_CTL, 0x00 }, + { PM4125_DIG_SWR_DIG_DEBUG_EN, 0x00 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA0, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA1, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA2, 0x55 }, + { PM4125_DIG_SWR_DEM_BYPASS_DATA3, 0x01 }, +}; + +static bool pm4125_rdwr_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MICBIAS_MICB_1_2_EN: + case PM4125_ANA_MICBIAS_MICB_3_EN: + case PM4125_ANA_MICBIAS_LDO_1_SETTING: + case PM4125_ANA_MICBIAS_LDO_1_CTRL: + case PM4125_ANA_TX_AMIC1: + case PM4125_ANA_TX_AMIC2: + case PM4125_ANA_MBHC_MECH: + case PM4125_ANA_MBHC_ELECT: + case PM4125_ANA_MBHC_ZDET: + case PM4125_ANA_MBHC_BTN0_ZDET_VREF1: + case PM4125_ANA_MBHC_BTN1_ZDET_VREF2: + case PM4125_ANA_MBHC_BTN2_ZDET_VREF3: + case PM4125_ANA_MBHC_BTN3_ZDET_DBG_400: + case PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400: + case PM4125_ANA_MBHC_MICB2_RAMP: + case PM4125_ANA_MBHC_CTL_1: + case PM4125_ANA_MBHC_CTL_2: + case PM4125_ANA_MBHC_PLUG_DETECT_CTL: + case PM4125_ANA_MBHC_ZDET_ANA_CTL: + case PM4125_ANA_MBHC_ZDET_RAMP_CTL: + case PM4125_ANA_MBHC_CTL_CLK: + case PM4125_ANA_NCP_EN: + case PM4125_ANA_NCP_VCTRL: + case PM4125_ANA_HPHPA_CNP_CTL_1: + case PM4125_ANA_HPHPA_CNP_CTL_2: + case PM4125_ANA_HPHPA_FSM_CLK: + case PM4125_ANA_HPHPA_L_GAIN: + case PM4125_ANA_HPHPA_R_GAIN: + case PM4125_ANA_HPHPA_SPARE_CTL: + case PM4125_SWR_HPHPA_HD2: + case PM4125_ANA_SURGE_EN: + case PM4125_ANA_COMBOPA_CTL: + case PM4125_ANA_COMBOPA_CTL_4: + case PM4125_ANA_COMBOPA_CTL_5: + case PM4125_ANA_RXLDO_CTL: + case PM4125_ANA_MBIAS_EN: + case PM4125_DIG_SWR_SWR_TX_CLK_RATE: + case PM4125_DIG_SWR_CDC_RST_CTL: + case PM4125_DIG_SWR_TOP_CLK_CFG: + case PM4125_DIG_SWR_CDC_RX_CLK_CTL: + case PM4125_DIG_SWR_CDC_TX_CLK_CTL: + case PM4125_DIG_SWR_SWR_RST_EN: + case PM4125_DIG_SWR_CDC_RX_RST: + case PM4125_DIG_SWR_CDC_RX0_CTL: + case PM4125_DIG_SWR_CDC_RX1_CTL: + case PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1: + case PM4125_DIG_SWR_CDC_COMP_CTL_0: + case PM4125_DIG_SWR_CDC_RX_DELAY_CTL: + case PM4125_DIG_SWR_CDC_RX_GAIN_0: + case PM4125_DIG_SWR_CDC_RX_GAIN_1: + case PM4125_DIG_SWR_CDC_RX_GAIN_CTL: + case PM4125_DIG_SWR_CDC_TX0_CTL: + case PM4125_DIG_SWR_CDC_TX1_CTL: + case PM4125_DIG_SWR_CDC_TX_RST: + case PM4125_DIG_SWR_CDC_REQ0_CTL: + case PM4125_DIG_SWR_CDC_REQ1_CTL: + case PM4125_DIG_SWR_CDC_RST: + case PM4125_DIG_SWR_CDC_AMIC_CTL: + case PM4125_DIG_SWR_CDC_DMIC_CTL: + case PM4125_DIG_SWR_CDC_DMIC1_CTL: + case PM4125_DIG_SWR_CDC_DMIC1_RATE: + case PM4125_DIG_SWR_PDM_WD_CTL0: + case PM4125_DIG_SWR_PDM_WD_CTL1: + case PM4125_DIG_SWR_INTR_MODE: + case PM4125_DIG_SWR_INTR_MASK_0: + case PM4125_DIG_SWR_INTR_MASK_1: + case PM4125_DIG_SWR_INTR_MASK_2: + case PM4125_DIG_SWR_INTR_CLEAR_0: + case PM4125_DIG_SWR_INTR_CLEAR_1: + case PM4125_DIG_SWR_INTR_CLEAR_2: + case 
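/*
 * How regmap consumes these predicates (stock regmap behaviour, shown for
 * orientation): a register missing from this writeable list is rejected at
 * the API boundary, e.g.
 *
 *	regmap_write(map, PM4125_ANA_MBHC_RESULT_1, 0);	// returns -EIO
 *
 * The MBHC result/status registers appear only in the readable and
 * volatile helpers below, so they always bypass the cache and can never be
 * written.
 */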
PM4125_DIG_SWR_INTR_LEVEL_0: + case PM4125_DIG_SWR_INTR_LEVEL_1: + case PM4125_DIG_SWR_INTR_LEVEL_2: + case PM4125_DIG_SWR_CDC_CONN_RX0_CTL: + case PM4125_DIG_SWR_CDC_CONN_RX1_CTL: + case PM4125_DIG_SWR_LOOP_BACK_MODE: + case PM4125_DIG_SWR_DRIVE_STRENGTH_0: + case PM4125_DIG_SWR_DIG_DEBUG_CTL: + case PM4125_DIG_SWR_DIG_DEBUG_EN: + case PM4125_DIG_SWR_DEM_BYPASS_DATA0: + case PM4125_DIG_SWR_DEM_BYPASS_DATA1: + case PM4125_DIG_SWR_DEM_BYPASS_DATA2: + case PM4125_DIG_SWR_DEM_BYPASS_DATA3: + return true; + } + + return false; +} + +static bool pm4125_readable_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MBHC_RESULT_1: + case PM4125_ANA_MBHC_RESULT_2: + case PM4125_ANA_MBHC_RESULT_3: + case PM4125_ANA_MBHC_FSM_STATUS: + case PM4125_ANA_MBHC_ADC_RESULT: + case PM4125_ANA_MBHC_ZDET_CALIB_RESULT: + case PM4125_ANA_HPHPA_PA_STATUS: + case PM4125_DIG_SWR_CHIP_ID0: + case PM4125_DIG_SWR_CHIP_ID1: + case PM4125_DIG_SWR_CHIP_ID2: + case PM4125_DIG_SWR_CHIP_ID3: + case PM4125_DIG_SWR_INTR_STATUS_0: + case PM4125_DIG_SWR_INTR_STATUS_1: + case PM4125_DIG_SWR_INTR_STATUS_2: + return true; + } + return pm4125_rdwr_register(dev, reg); +} + +static bool pm4125_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case PM4125_ANA_MBHC_RESULT_1: + case PM4125_ANA_MBHC_RESULT_2: + case PM4125_ANA_MBHC_RESULT_3: + case PM4125_ANA_MBHC_FSM_STATUS: + case PM4125_ANA_MBHC_ADC_RESULT: + case PM4125_ANA_MBHC_ZDET_CALIB_RESULT: + case PM4125_ANA_HPHPA_PA_STATUS: + case PM4125_DIG_SWR_CHIP_ID0: + case PM4125_DIG_SWR_CHIP_ID1: + case PM4125_DIG_SWR_CHIP_ID2: + case PM4125_DIG_SWR_CHIP_ID3: + case PM4125_DIG_SWR_INTR_STATUS_0: + case PM4125_DIG_SWR_INTR_STATUS_1: + case PM4125_DIG_SWR_INTR_STATUS_2: + return true; + } + + return false; +} + +static const struct regmap_config pm4125_regmap_config = { + .name = "pm4125_csr", + .reg_bits = 32, + .val_bits = 8, + .cache_type = REGCACHE_MAPLE, + .reg_defaults = pm4125_defaults, + .num_reg_defaults = ARRAY_SIZE(pm4125_defaults), + .max_register = PM4125_MAX_REGISTER, + .readable_reg = pm4125_readable_register, + .writeable_reg = pm4125_rdwr_register, + .volatile_reg = pm4125_volatile_register, +}; + +static const struct sdw_slave_ops pm4125_slave_ops = { + .update_status = pm4125_update_status, + .interrupt_callback = pm4125_interrupt_callback, +}; + +static int pm4125_sdw_component_bind(struct device *dev, struct device *master, void *data) +{ + pm_runtime_set_autosuspend_delay(dev, 3000); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + + return 0; +} + +static void pm4125_sdw_component_unbind(struct device *dev, struct device *master, void *data) +{ + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + pm_runtime_dont_use_autosuspend(dev); +} + +static const struct component_ops pm4125_sdw_component_ops = { + .bind = pm4125_sdw_component_bind, + .unbind = pm4125_sdw_component_unbind, +}; + +static int pm4125_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) +{ + struct device *dev = &pdev->dev; + struct pm4125_sdw_priv *priv; + u8 master_ch_mask[PM4125_MAX_SWR_CH_IDS]; + int master_ch_mask_size = 0; + int ret, i; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + /* Port map index starts at 0, however the data port for this codec starts at index 1 */ + if (of_property_present(dev->of_node, "qcom,tx-port-mapping")) { + priv->is_tx = true; + ret = of_property_read_u32_array(dev->of_node, 
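/*
 * Hypothetical DT fragment (port numbers invented purely for illustration;
 * the binding document is authoritative):
 *
 *	codec {
 *		qcom,rx-port-mapping = <1 2>;
 *	};
 *
 * of_property_read_u32_array() below copies the array into
 * pdev->m_port_map[] starting at index 1, matching the comment above about
 * data ports being numbered from 1.
 */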
"qcom,tx-port-mapping", + &pdev->m_port_map[1], PM4125_MAX_TX_SWR_PORTS); + } else { + ret = of_property_read_u32_array(dev->of_node, "qcom,rx-port-mapping", + &pdev->m_port_map[1], PM4125_MAX_SWR_PORTS); + } + + if (ret < 0) + dev_info(dev, "Error getting static port mapping for %s (%d)\n", + priv->is_tx ? "TX" : "RX", ret); + + priv->sdev = pdev; + dev_set_drvdata(dev, priv); + + pdev->prop.scp_int1_mask = SDW_SCP_INT1_IMPL_DEF | + SDW_SCP_INT1_BUS_CLASH | + SDW_SCP_INT1_PARITY; + pdev->prop.lane_control_support = true; + pdev->prop.simple_clk_stop_capable = true; + + memset(master_ch_mask, 0, PM4125_MAX_SWR_CH_IDS); + + if (priv->is_tx) { + master_ch_mask_size = of_property_count_u8_elems(dev->of_node, + "qcom,tx-channel-mapping"); + + if (master_ch_mask_size) + ret = of_property_read_u8_array(dev->of_node, "qcom,tx-channel-mapping", + master_ch_mask, master_ch_mask_size); + } else { + master_ch_mask_size = of_property_count_u8_elems(dev->of_node, + "qcom,rx-channel-mapping"); + + if (master_ch_mask_size) + ret = of_property_read_u8_array(dev->of_node, "qcom,rx-channel-mapping", + master_ch_mask, master_ch_mask_size); + } + + if (ret < 0) + dev_info(dev, "Static channel mapping not specified using device channel maps\n"); + + if (priv->is_tx) { + pdev->prop.source_ports = GENMASK(PM4125_MAX_TX_SWR_PORTS, 0); + pdev->prop.src_dpn_prop = pm4125_dpn_prop; + priv->ch_info = &pm4125_sdw_tx_ch_info[0]; + + for (i = 0; i < master_ch_mask_size; i++) + priv->ch_info[i].master_ch_mask = PM4125_SWRM_CH_MASK(master_ch_mask[i]); + + pdev->prop.wake_capable = true; + + priv->regmap = devm_regmap_init_sdw(pdev, &pm4125_regmap_config); + if (IS_ERR(priv->regmap)) + return dev_err_probe(dev, PTR_ERR(priv->regmap), "regmap init failed\n"); + + /* Start in cache-only until device is enumerated */ + regcache_cache_only(priv->regmap, true); + } else { + pdev->prop.sink_ports = GENMASK(PM4125_MAX_SWR_PORTS - 1, 0); + pdev->prop.sink_dpn_prop = pm4125_dpn_prop; + priv->ch_info = &pm4125_sdw_rx_ch_info[0]; + + for (i = 0; i < master_ch_mask_size; i++) + priv->ch_info[i].master_ch_mask = PM4125_SWRM_CH_MASK(master_ch_mask[i]); + } + + ret = component_add(dev, &pm4125_sdw_component_ops); + if (ret) + return ret; + + /* Set suspended until aggregate device is bind */ + pm_runtime_set_suspended(dev); + + return 0; +} + +static int pm4125_remove(struct sdw_slave *pdev) +{ + struct device *dev = &pdev->dev; + + component_del(dev, &pm4125_sdw_component_ops); + + return 0; +} + +static const struct sdw_device_id pm4125_slave_id[] = { + SDW_SLAVE_ENTRY(0x0217, 0x10c, 0), /* Soundwire pm4125 RX/TX Device ID */ + { } +}; +MODULE_DEVICE_TABLE(sdw, pm4125_slave_id); + +static int __maybe_unused pm4125_sdw_runtime_suspend(struct device *dev) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(dev); + + if (priv->regmap) { + regcache_cache_only(priv->regmap, true); + regcache_mark_dirty(priv->regmap); + } + + return 0; +} + +static int __maybe_unused pm4125_sdw_runtime_resume(struct device *dev) +{ + struct pm4125_sdw_priv *priv = dev_get_drvdata(dev); + + if (priv->regmap) { + regcache_cache_only(priv->regmap, false); + regcache_sync(priv->regmap); + } + + return 0; +} + +static const struct dev_pm_ops pm4125_sdw_pm_ops = { + SET_RUNTIME_PM_OPS(pm4125_sdw_runtime_suspend, pm4125_sdw_runtime_resume, NULL) +}; + +static struct sdw_driver pm4125_codec_driver = { + .probe = pm4125_probe, + .remove = pm4125_remove, + .ops = &pm4125_slave_ops, + .id_table = pm4125_slave_id, + .driver = { + .name = "pm4125-codec", + .pm = 
&pm4125_sdw_pm_ops, + } +}; +module_sdw_driver(pm4125_codec_driver); + +MODULE_DESCRIPTION("PM4125 SDW codec driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/pm4125.c b/sound/soc/codecs/pm4125.c new file mode 100644 index 00000000000000..706fc668ffe2a3 --- /dev/null +++ b/sound/soc/codecs/pm4125.c @@ -0,0 +1,1780 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. +// Copyright (c) 2025, Linaro Ltd + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pm4125.h" +#include "wcd-mbhc-v2.h" + +#define WCD_MBHC_HS_V_MAX 1600 +#define PM4125_MBHC_MAX_BUTTONS 8 + +#define PM4125_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\ + SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000 |\ + SNDRV_PCM_RATE_96000 | SNDRV_PCM_RATE_192000 |\ + SNDRV_PCM_RATE_384000) + +/* Fractional Rates */ +#define PM4125_FRAC_RATES (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_88200 |\ + SNDRV_PCM_RATE_176400 | SNDRV_PCM_RATE_352800) + +#define PM4125_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE |\ + SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S32_LE) + +/* Registers in SPMI addr space */ +#define PM4125_CODEC_RESET_REG 0xF3DB +#define PM4125_CODEC_OFF 0x1 +#define PM4125_CODEC_ON 0x0 +#define PM4125_CODEC_FOUNDRY_ID_REG 0x7 + +enum { + HPH_COMP_DELAY, + HPH_PA_DELAY, + AMIC2_BCS_ENABLE, +}; + +enum { + AIF1_PB = 0, + AIF1_CAP, + NUM_CODEC_DAIS, +}; + +struct pm4125_priv { + struct sdw_slave *tx_sdw_dev; + struct pm4125_sdw_priv *sdw_priv[NUM_CODEC_DAIS]; + struct device *txdev; + struct device *rxdev; + struct device_node *rxnode; + struct device_node *txnode; + struct regmap *regmap; + struct regmap *spmi_regmap; + /* mbhc module */ + struct wcd_mbhc *wcd_mbhc; + struct wcd_mbhc_config mbhc_cfg; + struct wcd_mbhc_intr intr_ids; + struct irq_domain *virq; + const struct regmap_irq_chip *pm4125_regmap_irq_chip; + struct regmap_irq_chip_data *irq_chip; + struct snd_soc_jack *jack; + unsigned long status_mask; + s32 micb_ref[PM4125_MAX_MICBIAS]; + s32 pullup_ref[PM4125_MAX_MICBIAS]; + u32 micb1_mv; + u32 micb2_mv; + u32 micb3_mv; + + int hphr_pdm_wd_int; + int hphl_pdm_wd_int; + bool comp1_enable; + bool comp2_enable; + + atomic_t gloal_mbias_cnt; +}; + +static const char * const pm4125_power_supplies[] = { + "vdd-io", "vdd-cp", "vdd-mic-bias", "vdd-pa-vpos", +}; + +static const DECLARE_TLV_DB_SCALE(line_gain, 0, 7, 1); +static const DECLARE_TLV_DB_SCALE(analog_gain, 0, 25, 1); + +static const struct wcd_mbhc_field pm4125_mbhc_fields[WCD_MBHC_REG_FUNC_MAX] = { + WCD_MBHC_FIELD(WCD_MBHC_L_DET_EN, PM4125_ANA_MBHC_MECH, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_GND_DET_EN, PM4125_ANA_MBHC_MECH, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_MECH_DETECTION_TYPE, PM4125_ANA_MBHC_MECH, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_MIC_CLAMP_CTL, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x30), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_DETECTION_TYPE, PM4125_ANA_MBHC_ELECT, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_CTRL, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x1F), + WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_COMP_CTRL, PM4125_ANA_MBHC_MECH, 0x04), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_PLUG_TYPE, PM4125_ANA_MBHC_MECH, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_GND_PLUG_TYPE, PM4125_ANA_MBHC_MECH, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_SW_HPH_LP_100K_TO_GND, PM4125_ANA_MBHC_MECH, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_SCHMT_ISRC, PM4125_ANA_MBHC_ELECT, 0x06), + 
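/*
 * What each WCD_MBHC_FIELD entry encodes (restated from the shared
 * wcd-mbhc-v2 code, simplified): a (register, mask) pair that the generic
 * MBHC state machine pokes without knowing the chip layout, roughly
 *
 *	snd_soc_component_write_field(comp, field->reg, field->mask, val);
 *
 * e.g. WCD_MBHC_L_DET_EN above maps the generic "left detect enable" knob
 * onto bit 7 of PM4125_ANA_MBHC_MECH.
 */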
WCD_MBHC_FIELD(WCD_MBHC_FSM_EN, PM4125_ANA_MBHC_ELECT, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_INSREM_DBNC, PM4125_ANA_MBHC_PLUG_DETECT_CTL, 0x0F), + WCD_MBHC_FIELD(WCD_MBHC_BTN_DBNC, PM4125_ANA_MBHC_CTL_1, 0x03), + WCD_MBHC_FIELD(WCD_MBHC_HS_VREF, PM4125_ANA_MBHC_CTL_2, 0x03), + WCD_MBHC_FIELD(WCD_MBHC_HS_COMP_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_IN2P_CLAMP_STATE, PM4125_ANA_MBHC_RESULT_3, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_MIC_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_SCHMT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_BTN_RESULT, PM4125_ANA_MBHC_RESULT_3, 0x07), + WCD_MBHC_FIELD(WCD_MBHC_BTN_ISRC_CTL, PM4125_ANA_MBHC_ELECT, 0x70), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_RESULT, PM4125_ANA_MBHC_RESULT_3, 0xFF), + WCD_MBHC_FIELD(WCD_MBHC_MICB_CTRL, PM4125_ANA_MICBIAS_MICB_1_2_EN, 0xC0), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPH_PA_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0xC0), + WCD_MBHC_FIELD(WCD_MBHC_SWCH_LEVEL_REMOVE, PM4125_ANA_MBHC_RESULT_3, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_FSM_STATUS, PM4125_ANA_MBHC_FSM_STATUS, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_MUX_CTL, PM4125_ANA_MBHC_CTL_2, 0x70), + WCD_MBHC_FIELD(WCD_MBHC_MOISTURE_STATUS, PM4125_ANA_MBHC_FSM_STATUS, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_DET_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_DET_EN, PM4125_ANA_HPHPA_CNP_CTL_2, 0x01), + WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_STATUS, PM4125_DIG_SWR_INTR_STATUS_0, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_STATUS, PM4125_DIG_SWR_INTR_STATUS_0, 0x20), + WCD_MBHC_FIELD(WCD_MBHC_ADC_EN, PM4125_ANA_MBHC_CTL_1, 0x08), + WCD_MBHC_FIELD(WCD_MBHC_ADC_COMPLETE, PM4125_ANA_MBHC_FSM_STATUS, 0x40), + WCD_MBHC_FIELD(WCD_MBHC_ADC_TIMEOUT, PM4125_ANA_MBHC_FSM_STATUS, 0x80), + WCD_MBHC_FIELD(WCD_MBHC_ADC_RESULT, PM4125_ANA_MBHC_ADC_RESULT, 0xFF), + WCD_MBHC_FIELD(WCD_MBHC_MICB2_VOUT, PM4125_ANA_MICBIAS_LDO_1_SETTING, 0x3F), + WCD_MBHC_FIELD(WCD_MBHC_ADC_MODE, PM4125_ANA_MBHC_CTL_1, 0x10), + WCD_MBHC_FIELD(WCD_MBHC_DETECTION_DONE, PM4125_ANA_MBHC_CTL_1, 0x04), + WCD_MBHC_FIELD(WCD_MBHC_ELECT_ISRC_EN, PM4125_ANA_MBHC_ZDET, 0x02), +}; + +static const struct regmap_irq pm4125_irqs[PM4125_NUM_IRQS] = { + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_BUTTON_PRESS_DET, 0, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_BUTTON_RELEASE_DET, 0, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_ELECT_INS_REM_DET, 0, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET, 0, BIT(3)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_SW_DET, 0, BIT(4)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_OCP_INT, 0, BIT(5)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_CNP_INT, 0, BIT(6)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_OCP_INT, 0, BIT(7)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_CNP_INT, 1, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_EAR_CNP_INT, 1, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_EAR_SCD_INT, 1, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_CNP_INT, 1, BIT(3)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_SCD_INT, 1, BIT(4)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_PDM_WD_INT, 1, BIT(5)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_PDM_WD_INT, 1, BIT(6)), + REGMAP_IRQ_REG(PM4125_IRQ_AUX_PDM_WD_INT, 1, BIT(7)), + REGMAP_IRQ_REG(PM4125_IRQ_LDORT_SCD_INT, 2, BIT(0)), + REGMAP_IRQ_REG(PM4125_IRQ_MBHC_MOISTURE_INT, 2, BIT(1)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHL_SURGE_DET_INT, 2, BIT(2)), + REGMAP_IRQ_REG(PM4125_IRQ_HPHR_SURGE_DET_INT, 2, 
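/*
 * Consumer-side sketch for this irq chip (the regmap-irq calls are real
 * API; the handler name is hypothetical): every REGMAP_IRQ_REG(hwirq, reg,
 * bit) entry becomes a virq that can be fetched and requested as
 *
 *	int virq = regmap_irq_get_virq(pm4125->irq_chip,
 *				       PM4125_IRQ_MBHC_SW_DET);
 *	devm_request_threaded_irq(dev, virq, NULL, my_handler,
 *				  IRQF_ONESHOT, "mbhc-sw", pm4125);
 *
 * which is exactly how the MBHC and PDM watchdog interrupts are hooked up
 * later in this file.
 */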
BIT(3)), +}; + +static int pm4125_handle_post_irq(void *data) +{ + struct pm4125_priv *pm4125 = (struct pm4125_priv *)data; + + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_0, 0); + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_1, 0); + regmap_write(pm4125->regmap, PM4125_DIG_SWR_INTR_CLEAR_2, 0); + + return IRQ_HANDLED; +} + +static const u32 pm4125_config_regs[] = { + PM4125_DIG_SWR_INTR_LEVEL_0, +}; + +static struct regmap_irq_chip pm4125_regmap_irq_chip = { + .name = "pm4125", + .irqs = pm4125_irqs, + .num_irqs = ARRAY_SIZE(pm4125_irqs), + .num_regs = 3, + .status_base = PM4125_DIG_SWR_INTR_STATUS_0, + .mask_base = PM4125_DIG_SWR_INTR_MASK_0, + .ack_base = PM4125_DIG_SWR_INTR_CLEAR_0, + .use_ack = 1, + .clear_ack = 1, + .config_base = pm4125_config_regs, + .num_config_bases = ARRAY_SIZE(pm4125_config_regs), + .num_config_regs = 1, + .runtime_pm = true, + .handle_post_irq = pm4125_handle_post_irq, +}; + +static void pm4125_reset(struct pm4125_priv *pm4125) +{ + regmap_write(pm4125->spmi_regmap, PM4125_CODEC_RESET_REG, PM4125_CODEC_OFF); + usleep_range(20, 30); + regmap_write(pm4125->spmi_regmap, PM4125_CODEC_RESET_REG, PM4125_CODEC_ON); + usleep_range(5000, 5010); +} + +static void pm4125_io_init(struct regmap *regmap) +{ + /* Disable HPH OCP */ + regmap_update_bits(regmap, PM4125_ANA_HPHPA_CNP_CTL_2, + PM4125_ANA_HPHPA_CNP_OCP_EN_L_MASK | PM4125_ANA_HPHPA_CNP_OCP_EN_R_MASK, + PM4125_ANA_HPHPA_CNP_OCP_DISABLE); + + /* Enable surge protection */ + regmap_update_bits(regmap, PM4125_ANA_SURGE_EN, PM4125_ANA_SURGE_PROTECTION_HPHL_MASK, + FIELD_PREP(PM4125_ANA_SURGE_PROTECTION_HPHL_MASK, + PM4125_ANA_SURGE_PROTECTION_ENABLE)); + regmap_update_bits(regmap, PM4125_ANA_SURGE_EN, PM4125_ANA_SURGE_PROTECTION_HPHR_MASK, + FIELD_PREP(PM4125_ANA_SURGE_PROTECTION_HPHR_MASK, + PM4125_ANA_SURGE_PROTECTION_ENABLE)); + + /* Disable mic bias 2 pull down */ + regmap_update_bits(regmap, PM4125_ANA_MICBIAS_MICB_1_2_EN, + PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK, + FIELD_PREP(PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK, + PM4125_ANA_MICBIAS_MICB_PULL_DISABLE)); +} + +static int pm4125_global_mbias_disable(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + if (atomic_dec_and_test(&pm4125->gloal_mbias_cnt)) { + + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_V2I_MASK, + PM4125_ANA_MBIAS_EN_DISABLE); + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_GLOBAL_MASK, + PM4125_ANA_MBIAS_EN_DISABLE); + } + + return 0; +} + +static int pm4125_global_mbias_enable(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + if (atomic_inc_return(&pm4125->gloal_mbias_cnt) == 1) { + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_GLOBAL_MASK, + PM4125_ANA_MBIAS_EN_ENABLE); + snd_soc_component_write_field(component, PM4125_ANA_MBIAS_EN, + PM4125_ANA_MBIAS_EN_V2I_MASK, + PM4125_ANA_MBIAS_EN_ENABLE); + usleep_range(1000, 1100); + } + + return 0; +} + +static int pm4125_rx_clk_enable(struct snd_soc_component *component) +{ + pm4125_global_mbias_enable(component); + + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + usleep_range(5000, 5100); + + 
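/*
 * The mbias helpers above use a plain atomic refcount; shape of the
 * pattern (restated, no new API):
 *
 *	if (atomic_inc_return(&cnt) == 1)	// first user powers up
 *		...enable global bias...
 *	if (atomic_dec_and_test(&cnt))		// last user powers down
 *		...disable global bias...
 *
 * so the RX, ADC and micbias paths can share the bias without a lock, as
 * long as every enable is paired with exactly one disable.
 */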
snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_68); + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_ENABLE); + snd_soc_component_update_bits(component, PM4125_ANA_NCP_VCTRL, 0x07, 0x06); + snd_soc_component_write_field(component, PM4125_ANA_NCP_EN, + PM4125_ANA_NCP_ENABLE_MASK, + PM4125_ANA_NCP_ENABLE); + usleep_range(500, 510); + + return 0; +} + +static int pm4125_rx_clk_disable(struct snd_soc_component *component) +{ + + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK, + PM4125_ANA_HPHPA_FSM_CLK_DIV_DISABLE); + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_FSM_CLK, + PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK, + 0x00); + snd_soc_component_write_field(component, PM4125_ANA_NCP_EN, + PM4125_ANA_NCP_ENABLE_MASK, + PM4125_ANA_NCP_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + + pm4125_global_mbias_disable(component); + + return 0; +} + + +static int pm4125_codec_enable_rxclk(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, + int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + pm4125_rx_clk_enable(component); + break; + case SND_SOC_DAPM_POST_PMD: + pm4125_rx_clk_disable(component); + break; + } + + return 0; +} + +static int pm4125_codec_hphl_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_CNP_CTL_1, + PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK, + PM4125_ANA_HPHPA_CNP_CTL_1_EN); + snd_soc_component_write_field(component, PM4125_SWR_HPHPA_HD2, + PM4125_SWR_HPHPA_HD2_LEFT_MASK, + PM4125_SWR_HPHPA_HD2_ENABLE); + break; + case SND_SOC_DAPM_POST_PMU: + if (pm4125->comp1_enable) { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + + if (pm4125->comp2_enable) + snd_soc_component_write_field(component, + PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + /* + * 5ms sleep is required after COMP is enabled as per + * HW requirement + */ + usleep_range(5000, 5100); + } else { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + } + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + 
PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + if (pm4125->comp1_enable) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + break; + } + + return 0; +} + +static int pm4125_codec_hphr_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_ANA_HPHPA_CNP_CTL_1, + PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK, + PM4125_ANA_HPHPA_CNP_CTL_1_EN); + snd_soc_component_write_field(component, PM4125_SWR_HPHPA_HD2, + PM4125_SWR_HPHPA_HD2_RIGHT_MASK, + PM4125_SWR_HPHPA_HD2_ENABLE); + break; + case SND_SOC_DAPM_POST_PMU: + if (pm4125->comp2_enable) { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + if (pm4125->comp1_enable) + snd_soc_component_write_field(component, + PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHL_EN_MASK, + PM4125_DIG_SWR_COMP_ENABLE); + /* + * 5ms sleep is required after COMP is enabled + * as per HW requirement + */ + usleep_range(5000, 5100); + } else { + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + } + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX1_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX1_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX1_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX1_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX1_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX1_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + if (pm4125->comp2_enable) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_COMP_CTL_0, + PM4125_DIG_SWR_COMP_HPHR_EN_MASK, + PM4125_DIG_SWR_COMP_DISABLE); + break; + } + + return 0; +} + +static int pm4125_codec_ear_lo_dac_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_ENABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + 
PM4125_DIG_SWR_RX_INPUT_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_CLK_CTL, + PM4125_DIG_SWR_RX0_CLK_EN_MASK, + PM4125_DIG_SWR_RX_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX_GAIN_CTL, + PM4125_DIG_SWR_RX0_EN_MASK, + PM4125_DIG_SWR_RX_INPUT_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_RX0_CTL, + PM4125_DIG_SWR_DSM_DITHER_EN_MASK, + PM4125_DIG_SWR_DSM_DITHER_ENABLE); + break; + } + + return 0; +} + + +static int pm4125_codec_enable_hphl_wdt_irq(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5100); + enable_irq(pm4125->hphl_pdm_wd_int); + break; + case SND_SOC_DAPM_PRE_PMD: + disable_irq_nosync(pm4125->hphl_pdm_wd_int); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphr_wdt_irq(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5100); + enable_irq(pm4125->hphr_pdm_wd_int); + break; + case SND_SOC_DAPM_PRE_PMD: + disable_irq_nosync(pm4125->hphr_pdm_wd_int); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphr_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + usleep_range(200, 210); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL1, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX1_M | PM4125_WDT_ENABLE_RX1_L)); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5100); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL1, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_hphl_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + usleep_range(200, 210); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5100); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_lo_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_5, 0x04, 0x00); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x0F); + usleep_range(1000, 1010); + snd_soc_component_write_field(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_LO); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + 
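/*
 * All the PA/DAC handlers in this file share one skeleton; a minimal
 * sketch of a DAPM power-event callback (generic, names hypothetical):
 *
 *	static int my_event(struct snd_soc_dapm_widget *w,
 *			    struct snd_kcontrol *kc, int event)
 *	{
 *		switch (event) {
 *		case SND_SOC_DAPM_PRE_PMU:	// before power up
 *		case SND_SOC_DAPM_POST_PMD:	// after power down
 *			break;
 *		}
 *		return 0;
 *	}
 *
 * DAPM only delivers the events listed in the widget's flags mask.
 */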
break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x04); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(2000, 2010); + snd_soc_component_write_field(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_EAR); + usleep_range(5000, 5100); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_codec_enable_ear_pa(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_5, 0x04, 0x00); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x0F); + usleep_range(1000, 1010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL, + PM4125_ANA_COMBO_PA_SELECT_MASK, + PM4125_ANA_COMBO_PA_SELECT_EAR); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, + (PM4125_WDT_ENABLE_RX0_M | PM4125_WDT_ENABLE_RX0_L)); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(5000, 5010); + snd_soc_component_update_bits(component, PM4125_ANA_COMBOPA_CTL_4, 0x0F, 0x04); + break; + case SND_SOC_DAPM_POST_PMD: + usleep_range(5000, 5010); + snd_soc_component_write_field(component, PM4125_DIG_SWR_PDM_WD_CTL0, + PM4125_WDT_ENABLE_MASK, 0x00); + break; + } + + return 0; +} + +static int pm4125_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv) +{ + if (micb_mv < 1600 || micb_mv > 2850) { + dev_err(dev, "%s: unsupported micbias voltage (%u mV)\n", __func__, micb_mv); + return -EINVAL; + } + + return (micb_mv - 1600) / 50; +} + +static int pm4125_codec_enable_adc(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + /* Enable BCS for Headset mic */ + if (w->shift == 1 && + !(snd_soc_component_read(component, PM4125_ANA_TX_AMIC2) & 0x10)) { + set_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask); + } + pm4125_global_mbias_enable(component); + if (w->shift) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK, + PM4125_DIG_SWR_TXD_MODE_NORMAL); + else + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK, + PM4125_DIG_SWR_TXD_MODE_NORMAL); + break; + case SND_SOC_DAPM_POST_PMD: + if (w->shift == 1 && test_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask)) + clear_bit(AMIC2_BCS_ENABLE, &pm4125->status_mask); + + if (w->shift) + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK, + 0x00); + else + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1, + PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK, + 0x00); + pm4125_global_mbias_disable(component); + break; + }; + + return 0; +} + +static int pm4125_codec_enable_dmic(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + u16 dmic_clk_reg = w->reg; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + 
snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_AMIC_CTL, + PM4125_DIG_SWR_AMIC_SELECT_MASK, + PM4125_DIG_SWR_AMIC_SELECT_DMIC1); + snd_soc_component_update_bits(component, dmic_clk_reg, + PM4125_DIG_SWR_DMIC1_CLK_EN_MASK, + PM4125_DIG_SWR_DMIC1_CLK_ENABLE); + break; + case SND_SOC_DAPM_POST_PMD: + snd_soc_component_update_bits(component, dmic_clk_reg, + PM4125_DIG_SWR_DMIC1_CLK_EN_MASK, + PM4125_DIG_SWR_DMIC1_CLK_DISABLE); + snd_soc_component_write_field(component, PM4125_DIG_SWR_CDC_AMIC_CTL, + PM4125_DIG_SWR_AMIC_SELECT_MASK, + PM4125_DIG_SWR_AMIC_SELECT_AMIC3); + break; + } + + return 0; +} + +static int pm4125_micbias_control(struct snd_soc_component *component, int micb_num, int req, + bool is_dapm) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + int micb_index = micb_num - 1; + u16 micb_reg; + u8 pullup_mask = 0, enable_mask = 0; + + if ((micb_index < 0) || (micb_index > PM4125_MAX_MICBIAS - 1)) { + dev_err(component->dev, "%s: Invalid micbias index, micb_ind:%d\n", + __func__, micb_index); + return -EINVAL; + } + switch (micb_num) { + case MIC_BIAS_1: + micb_reg = PM4125_ANA_MICBIAS_MICB_1_2_EN; + pullup_mask = PM4125_ANA_MICBIAS_MICB1_PULL_UP_MASK; + enable_mask = 0x40; + break; + case MIC_BIAS_2: + micb_reg = PM4125_ANA_MICBIAS_MICB_1_2_EN; + pullup_mask = PM4125_ANA_MICBIAS_MICB2_PULL_UP_MASK; + enable_mask = 0x04; + break; + case MIC_BIAS_3: + micb_reg = PM4125_ANA_MICBIAS_MICB_3_EN; + pullup_mask = 0x02; + break; + default: + dev_err(component->dev, "%s: Invalid micbias number: %d\n", + __func__, micb_num); + return -EINVAL; + }; + + switch (req) { + case MICB_PULLUP_ENABLE: + pm4125->pullup_ref[micb_index]++; + if ((pm4125->pullup_ref[micb_index] == 1) && + (pm4125->micb_ref[micb_index] == 0)) + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, pullup_mask); + break; + case MICB_PULLUP_DISABLE: + if (pm4125->pullup_ref[micb_index] > 0) + pm4125->pullup_ref[micb_index]--; + if ((pm4125->pullup_ref[micb_index] == 0) && + (pm4125->micb_ref[micb_index] == 0)) + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, 0x00); + break; + case MICB_ENABLE: + pm4125->micb_ref[micb_index]++; + if (pm4125->micb_ref[micb_index] == 1) { + pm4125_global_mbias_enable(component); + snd_soc_component_update_bits(component, micb_reg, + enable_mask, enable_mask); + } + break; + case MICB_DISABLE: + if (pm4125->micb_ref[micb_index] > 0) + pm4125->micb_ref[micb_index]--; + if ((pm4125->micb_ref[micb_index] == 0) && + (pm4125->pullup_ref[micb_index] > 0)) { + snd_soc_component_update_bits(component, micb_reg, + pullup_mask, pullup_mask); + snd_soc_component_update_bits(component, micb_reg, + enable_mask, 0x00); + pm4125_global_mbias_disable(component); + } else if ((pm4125->micb_ref[micb_index] == 0) && + (pm4125->pullup_ref[micb_index] == 0)) { + snd_soc_component_update_bits(component, micb_reg, + enable_mask, 0x00); + pm4125_global_mbias_disable(component); + } + break; + }; + + return 0; +} + +static int pm4125_codec_enable_micbias(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, + int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + int micb_num = w->shift; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + if (micb_num == MIC_BIAS_3) + pm4125_micbias_control(component, micb_num, MICB_PULLUP_ENABLE, true); + else + pm4125_micbias_control(component, micb_num, MICB_ENABLE, true); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(1000, 1100); + break; + case 
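/*
 * Worked example for the micbias maths used above (1800 mV is a
 * hypothetical DT-provided value): pm4125_get_micb_vout_ctl_val() maps
 * 1600..2850 mV linearly in 50 mV steps, so
 *
 *	vout_ctl = (1800 - 1600) / 50 = 4
 *
 * and anything outside that window is rejected with -EINVAL. The
 * micb_ref/pullup_ref counters in pm4125_micbias_control() keep
 * full-enable and pull-up-only users from fighting over the same enable
 * bits.
 */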
SND_SOC_DAPM_POST_PMD: + if (micb_num == MIC_BIAS_3) + pm4125_micbias_control(component, micb_num, MICB_PULLUP_DISABLE, true); + else + pm4125_micbias_control(component, micb_num, MICB_DISABLE, true); + break; + } + + return 0; +} + +static int pm4125_codec_enable_micbias_pullup(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + int micb_num = w->shift; + + switch (event) { + case SND_SOC_DAPM_PRE_PMU: + pm4125_micbias_control(component, micb_num, MICB_PULLUP_ENABLE, true); + break; + case SND_SOC_DAPM_POST_PMU: + usleep_range(1000, 1100); + break; + case SND_SOC_DAPM_POST_PMD: + pm4125_micbias_control(component, micb_num, MICB_PULLUP_DISABLE, true); + break; + } + + return 0; +} + +static int pm4125_connect_port(struct pm4125_sdw_priv *sdw_priv, u8 port_idx, u8 ch_id, bool enable) +{ + struct sdw_port_config *port_config = &sdw_priv->port_config[port_idx - 1]; + const struct pm4125_sdw_ch_info *ch_info = &sdw_priv->ch_info[ch_id]; + struct sdw_slave *sdev = sdw_priv->sdev; + u8 port_num = ch_info->port_num; + u8 ch_mask = ch_info->ch_mask; + u8 mstr_port_num, mstr_ch_mask; + + port_config->num = port_num; + + mstr_port_num = sdev->m_port_map[port_num]; + mstr_ch_mask = ch_info->master_ch_mask; + + if (enable) { + port_config->ch_mask |= ch_mask; + sdw_priv->master_channel_map[mstr_port_num] |= mstr_ch_mask; + } else { + port_config->ch_mask &= ~ch_mask; + sdw_priv->master_channel_map[mstr_port_num] &= ~mstr_ch_mask; + } + + return 0; +} + +static int pm4125_get_compander(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct soc_mixer_control *mc; + bool hphr; + + mc = (struct soc_mixer_control *)(kcontrol->private_value); + hphr = mc->shift; + + ucontrol->value.integer.value[0] = hphr ? pm4125->comp2_enable : pm4125->comp1_enable; + return 0; +} + +static int pm4125_set_compander(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[AIF1_PB]; + int value = ucontrol->value.integer.value[0]; + struct soc_mixer_control *mc; + int portidx; + bool hphr; + + mc = (struct soc_mixer_control *)(kcontrol->private_value); + hphr = mc->shift; + + if (hphr) { + if (value == pm4125->comp2_enable) + return 0; + + pm4125->comp2_enable = value; + } else { + if (value == pm4125->comp1_enable) + return 0; + + pm4125->comp1_enable = value; + } + + portidx = sdw_priv->ch_info[mc->reg].port_num; + + pm4125_connect_port(sdw_priv, portidx, mc->reg, value ? 
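/*
 * Return-value note for the setters above (standard ALSA kcontrol
 * contract, not behaviour new to this driver): a .put handler returns 0
 * when the written value matches the current state and 1 when it actually
 * changed something, so user space only sees change notifications for
 * real updates:
 *
 *	if (value == pm4125->comp2_enable)
 *		return 0;	// no change, no notification
 *	...
 *	return 1;		// state changed
 */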
true : false); + + return 1; +} + +static int pm4125_get_swr_port(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(comp); + struct pm4125_sdw_priv *sdw_priv; + int dai_id = mixer->shift; + int ch_idx = mixer->reg; + int portidx; + + sdw_priv = pm4125->sdw_priv[dai_id]; + portidx = sdw_priv->ch_info[ch_idx].port_num; + + ucontrol->value.integer.value[0] = sdw_priv->port_enable[portidx]; + + return 0; +} + +static int pm4125_set_swr_port(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) +{ + struct soc_mixer_control *mixer = (struct soc_mixer_control *)kcontrol->private_value; + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(comp); + struct pm4125_sdw_priv *sdw_priv; + int dai_id = mixer->shift; + int ch_idx = mixer->reg; + int portidx; + bool enable; + + sdw_priv = pm4125->sdw_priv[dai_id]; + + portidx = sdw_priv->ch_info[ch_idx].port_num; + + enable = ucontrol->value.integer.value[0]; + + if (enable == sdw_priv->port_enable[portidx]) { + pm4125_connect_port(sdw_priv, portidx, ch_idx, enable); + return 0; + } + + sdw_priv->port_enable[portidx] = enable; + pm4125_connect_port(sdw_priv, portidx, ch_idx, enable); + + return 1; +} + +static void pm4125_mbhc_bias_control(struct snd_soc_component *component, bool enable) +{ + snd_soc_component_write_field(component, PM4125_ANA_MBHC_ELECT, + PM4125_ANA_MBHC_ELECT_BIAS_EN_MASK, + enable ? PM4125_ANA_MBHC_ELECT_BIAS_ENABLE : + PM4125_ANA_MBHC_ELECT_BIAS_DISABLE); +} + +static void pm4125_mbhc_program_btn_thr(struct snd_soc_component *component, + int *btn_low, int *btn_high, + int num_btn, bool is_micbias) +{ + int i, vth; + + if (num_btn > WCD_MBHC_DEF_BUTTONS) { + dev_err(component->dev, "%s: invalid number of buttons: %d\n", + __func__, num_btn); + return; + } + + for (i = 0; i < num_btn; i++) { + vth = ((btn_high[i] * 2) / 25) & 0x3F; + snd_soc_component_write_field(component, PM4125_ANA_MBHC_BTN0_ZDET_VREF1 + i, + PM4125_ANA_MBHC_BTN0_THRESHOLD_MASK, vth << 2); + } +} + +static const struct wcd_mbhc_cb mbhc_cb = { + .mbhc_bias = pm4125_mbhc_bias_control, + .set_btn_thr = pm4125_mbhc_program_btn_thr, +}; + +static int pm4125_mbhc_init(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct wcd_mbhc_intr *intr_ids = &pm4125->intr_ids; + + intr_ids->mbhc_sw_intr = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_MBHC_SW_DET); + + intr_ids->mbhc_btn_press_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_BUTTON_PRESS_DET); + + intr_ids->mbhc_btn_release_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_BUTTON_RELEASE_DET); + + intr_ids->mbhc_hs_ins_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET); + + intr_ids->mbhc_hs_rem_intr = regmap_irq_get_virq(pm4125->irq_chip, + PM4125_IRQ_MBHC_ELECT_INS_REM_DET); + + intr_ids->hph_left_ocp = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHL_OCP_INT); + + intr_ids->hph_right_ocp = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHR_OCP_INT); + + pm4125->wcd_mbhc = wcd_mbhc_init(component, &mbhc_cb, intr_ids, pm4125_mbhc_fields, false); + if (IS_ERR(pm4125->wcd_mbhc)) + return PTR_ERR(pm4125->wcd_mbhc); + + return 0; +} + +static void 
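/*
 * pm4125_mbhc_init() above only translates hardware irq numbers into
 * virqs for the common MBHC core; sketch of one mapping (real API):
 *
 *	intr_ids->mbhc_sw_intr =
 *		regmap_irq_get_virq(pm4125->irq_chip,
 *				    PM4125_IRQ_MBHC_SW_DET);
 *
 * wcd_mbhc_init() then requests those virqs itself, which is why no
 * request_irq() calls for them appear in this file.
 */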
pm4125_mbhc_deinit(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + wcd_mbhc_deinit(pm4125->wcd_mbhc); +} + +static const struct snd_kcontrol_new pm4125_snd_controls[] = { + SOC_SINGLE_EXT("HPHL_COMP Switch", PM4125_COMP_L, 0, 1, 0, + pm4125_get_compander, pm4125_set_compander), + SOC_SINGLE_EXT("HPHR_COMP Switch", PM4125_COMP_R, 1, 1, 0, + pm4125_get_compander, pm4125_set_compander), + + SOC_SINGLE_TLV("HPHL Volume", PM4125_ANA_HPHPA_L_GAIN, 0, 20, 1, + line_gain), + SOC_SINGLE_TLV("HPHR Volume", PM4125_ANA_HPHPA_R_GAIN, 0, 20, 1, + line_gain), + SOC_SINGLE_TLV("ADC1 Volume", PM4125_ANA_TX_AMIC1, 0, 8, 0, + analog_gain), + SOC_SINGLE_TLV("ADC2 Volume", PM4125_ANA_TX_AMIC2, 0, 8, 0, + analog_gain), + + SOC_SINGLE_EXT("HPHL Switch", PM4125_HPH_L, 0, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + SOC_SINGLE_EXT("HPHR Switch", PM4125_HPH_R, 0, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + + SOC_SINGLE_EXT("ADC1 Switch", PM4125_ADC1, 1, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), + SOC_SINGLE_EXT("ADC2 Switch", PM4125_ADC2, 1, 1, 0, + pm4125_get_swr_port, pm4125_set_swr_port), +}; + +static const struct snd_kcontrol_new adc1_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new adc2_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new dmic1_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new dmic2_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new ear_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new lo_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new hphl_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const struct snd_kcontrol_new hphr_rdac_switch[] = { + SOC_DAPM_SINGLE("Switch", SND_SOC_NOPM, 0, 1, 0) +}; + +static const char * const adc2_mux_text[] = { + "INP2", "INP3" +}; + +static const struct soc_enum adc2_enum = SOC_ENUM_SINGLE(PM4125_ANA_TX_AMIC2, 4, + ARRAY_SIZE(adc2_mux_text), adc2_mux_text); + +static const struct snd_kcontrol_new tx_adc2_mux = SOC_DAPM_ENUM("ADC2 MUX Mux", adc2_enum); + +static const struct snd_soc_dapm_widget pm4125_dapm_widgets[] = { + /* Input widgets */ + SND_SOC_DAPM_INPUT("AMIC1"), + SND_SOC_DAPM_INPUT("AMIC2"), + SND_SOC_DAPM_INPUT("AMIC3"), + SND_SOC_DAPM_INPUT("IN1_HPHL"), + SND_SOC_DAPM_INPUT("IN2_HPHR"), + + /* TX widgets */ + SND_SOC_DAPM_ADC_E("ADC1", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_enable_adc, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_ADC_E("ADC2", NULL, SND_SOC_NOPM, 1, 0, pm4125_codec_enable_adc, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_MUX("ADC2 MUX", SND_SOC_NOPM, 0, 0, &tx_adc2_mux), + + /* TX mixers */ + SND_SOC_DAPM_MIXER("ADC1_MIXER", SND_SOC_NOPM, 0, 0, adc1_switch, ARRAY_SIZE(adc1_switch)), + SND_SOC_DAPM_MIXER("ADC2_MIXER", SND_SOC_NOPM, 1, 0, adc2_switch, ARRAY_SIZE(adc2_switch)), + + /* MIC_BIAS widgets */ + SND_SOC_DAPM_SUPPLY("MIC BIAS1", SND_SOC_NOPM, MIC_BIAS_1, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("MIC BIAS2", SND_SOC_NOPM, MIC_BIAS_2, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | 
SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("MIC BIAS3", SND_SOC_NOPM, MIC_BIAS_3, 0, pm4125_codec_enable_micbias, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_SUPPLY("PA_VPOS", SND_SOC_NOPM, 0, 0, NULL, 0), + + /* RX widgets */ + SND_SOC_DAPM_PGA_E("EAR PGA", PM4125_ANA_COMBOPA_CTL, 7, 0, NULL, 0, + pm4125_codec_enable_ear_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("LO PGA", PM4125_ANA_COMBOPA_CTL, 7, 0, NULL, 0, + pm4125_codec_enable_lo_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("HPHL PGA", PM4125_ANA_HPHPA_CNP_CTL_2, 7, 0, NULL, 0, + pm4125_codec_enable_hphl_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_PGA_E("HPHR PGA", PM4125_ANA_HPHPA_CNP_CTL_2, 6, 0, NULL, 0, + pm4125_codec_enable_hphr_pa, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_DAC_E("RDAC1", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_hphl_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_DAC_E("RDAC2", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_hphr_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_DAC_E("RDAC3", NULL, SND_SOC_NOPM, 0, 0, pm4125_codec_ear_lo_dac_event, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | + SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMD), + + + SND_SOC_DAPM_SUPPLY("HPHL_WDT_IRQ", SND_SOC_NOPM, 0, 0, pm4125_codec_enable_hphl_wdt_irq, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_SUPPLY("HPHR_WDT_IRQ", SND_SOC_NOPM, 0, 0, pm4125_codec_enable_hphr_wdt_irq, + SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_SUPPLY("RXCLK", SND_SOC_NOPM, 0, 0, pm4125_codec_enable_rxclk, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_MIXER_E("RX1", SND_SOC_NOPM, 0, 0, NULL, 0, NULL, 0), + SND_SOC_DAPM_MIXER_E("RX2", SND_SOC_NOPM, 0, 0, NULL, 0, NULL, 0), + + /* RX mixer widgets */ + SND_SOC_DAPM_MIXER("EAR_RDAC", SND_SOC_NOPM, 0, 0, ear_rdac_switch, + ARRAY_SIZE(ear_rdac_switch)), + SND_SOC_DAPM_MIXER("LO_RDAC", SND_SOC_NOPM, 0, 0, lo_rdac_switch, + ARRAY_SIZE(lo_rdac_switch)), + SND_SOC_DAPM_MIXER("HPHL_RDAC", SND_SOC_NOPM, 0, 0, hphl_rdac_switch, + ARRAY_SIZE(hphl_rdac_switch)), + SND_SOC_DAPM_MIXER("HPHR_RDAC", SND_SOC_NOPM, 0, 0, hphr_rdac_switch, + ARRAY_SIZE(hphr_rdac_switch)), + + /* TX output widgets */ + SND_SOC_DAPM_OUTPUT("ADC1_OUTPUT"), + SND_SOC_DAPM_OUTPUT("ADC2_OUTPUT"), + + /* RX output widgets */ + SND_SOC_DAPM_OUTPUT("EAR"), + SND_SOC_DAPM_OUTPUT("LO"), + SND_SOC_DAPM_OUTPUT("HPHL"), + SND_SOC_DAPM_OUTPUT("HPHR"), + + /* MIC_BIAS pull up widgets */ + SND_SOC_DAPM_SUPPLY("VA MIC BIAS1", SND_SOC_NOPM, MIC_BIAS_1, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("VA MIC BIAS2", SND_SOC_NOPM, MIC_BIAS_2, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_SUPPLY("VA MIC BIAS3", SND_SOC_NOPM, MIC_BIAS_3, 0, + pm4125_codec_enable_micbias_pullup, + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD), + + /* TX widgets */ + SND_SOC_DAPM_ADC_E("DMIC1", NULL, PM4125_DIG_SWR_CDC_DMIC1_CTL, 0, 0, + pm4125_codec_enable_dmic, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + SND_SOC_DAPM_ADC_E("DMIC2", NULL, PM4125_DIG_SWR_CDC_DMIC1_CTL, 1, 0, + 
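/*
 * How the route table below is read (standard DAPM triple, restated):
 * { sink, control, source }. A NULL control is an unconditional wire,
 * e.g.
 *
 *	{ "HPHL PGA", NULL, "HPHL_RDAC" },
 *
 * while { "HPHL_RDAC", "Switch", "RDAC1" } only conducts when the
 * "Switch" kcontrol of the HPHL_RDAC mixer widget is on.
 */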
pm4125_codec_enable_dmic, SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD),
+
+	/* TX mixer widgets */
+	SND_SOC_DAPM_MIXER("DMIC1_MIXER", SND_SOC_NOPM, 0, 0, dmic1_switch,
+			   ARRAY_SIZE(dmic1_switch)),
+	SND_SOC_DAPM_MIXER("DMIC2_MIXER", SND_SOC_NOPM, 1, 0, dmic2_switch,
+			   ARRAY_SIZE(dmic2_switch)),
+
+	/* Output widgets */
+	SND_SOC_DAPM_OUTPUT("DMIC1_OUTPUT"),
+	SND_SOC_DAPM_OUTPUT("DMIC2_OUTPUT"),
+};
+
+static const struct snd_soc_dapm_route pm4125_audio_map[] = {
+	{ "ADC1_OUTPUT", NULL, "ADC1_MIXER" },
+	{ "ADC1_MIXER", "Switch", "ADC1" },
+	{ "ADC1", NULL, "AMIC1" },
+
+	{ "ADC2_OUTPUT", NULL, "ADC2_MIXER" },
+	{ "ADC2_MIXER", "Switch", "ADC2" },
+	{ "ADC2", NULL, "ADC2 MUX" },
+	{ "ADC2 MUX", "INP3", "AMIC3" },
+	{ "ADC2 MUX", "INP2", "AMIC2" },
+
+	{ "IN1_HPHL", NULL, "PA_VPOS" },
+	{ "RX1", NULL, "IN1_HPHL" },
+	{ "RX1", NULL, "RXCLK" },
+	{ "RX1", NULL, "HPHL_WDT_IRQ" },
+	{ "RDAC1", NULL, "RX1" },
+	{ "HPHL_RDAC", "Switch", "RDAC1" },
+	{ "HPHL PGA", NULL, "HPHL_RDAC" },
+	{ "HPHL", NULL, "HPHL PGA" },
+
+	{ "IN2_HPHR", NULL, "PA_VPOS" },
+	{ "RX2", NULL, "IN2_HPHR" },
+	{ "RX2", NULL, "RXCLK" },
+	{ "RX2", NULL, "HPHR_WDT_IRQ" },
+	{ "RDAC2", NULL, "RX2" },
+	{ "HPHR_RDAC", "Switch", "RDAC2" },
+	{ "HPHR PGA", NULL, "HPHR_RDAC" },
+	{ "HPHR", NULL, "HPHR PGA" },
+
+	{ "RDAC3", NULL, "RX1" },
+	{ "EAR_RDAC", "Switch", "RDAC3" },
+	{ "EAR PGA", NULL, "EAR_RDAC" },
+	{ "EAR", NULL, "EAR PGA" },
+
+	{ "LO_RDAC", "Switch", "RDAC3" },
+	{ "LO PGA", NULL, "LO_RDAC" },
+	{ "LO", NULL, "LO PGA" },
+
+	{ "DMIC1_OUTPUT", NULL, "DMIC1_MIXER" },
+	{ "DMIC1_MIXER", "Switch", "DMIC1" },
+
+	{ "DMIC2_OUTPUT", NULL, "DMIC2_MIXER" },
+	{ "DMIC2_MIXER", "Switch", "DMIC2" },
+};
+
+static int pm4125_set_micbias_data(struct device *dev, struct pm4125_priv *pm4125)
+{
+	int vout_ctl;
+
+	/* Set micbias voltage */
+	vout_ctl = pm4125_get_micb_vout_ctl_val(dev, pm4125->micb1_mv);
+	if (vout_ctl < 0)
+		return -EINVAL;
+
+	regmap_update_bits(pm4125->regmap, PM4125_ANA_MICBIAS_LDO_1_SETTING,
+			   PM4125_ANA_MICBIAS_MICB_OUT_VAL_MASK, vout_ctl << 3);
+	return 0;
+}
+
+static irqreturn_t pm4125_wd_handle_irq(int irq, void *data)
+{
+	/*
+	 * HPHR/HPHL Watchdog interrupt threaded handler
+	 * Watchdog interrupts are expected to be enabled when switching on the HPHL/R
+	 * in order to make sure the interrupts are acked by the regmap_irq handler
+	 * to allow PDM sync. We could leave those interrupts masked but we would
+	 * not have any valid way to enable/disable them without violating irq layers.
+	 *
+	 * The HPHR/HPHL Watchdog interrupts are handled by regmap_irq, so requesting
+	 * a threaded handler is the safest way to be able to ack those interrupts
+	 * without colliding with the regmap_irq setup.
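+	 *
+	 * The threaded handler therefore does no work of its own and simply
+	 * returns IRQ_HANDLED once the interrupt has been acked.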
+ */ + return IRQ_HANDLED; +} + +static const struct irq_chip pm4125_codec_irq_chip = { + .name = "pm4125_codec", +}; + +static int pm4125_codec_irq_chip_map(struct irq_domain *irqd, unsigned int virq, + irq_hw_number_t hw) +{ + irq_set_chip_and_handler(virq, &pm4125_codec_irq_chip, handle_simple_irq); + irq_set_nested_thread(virq, 1); + irq_set_noprobe(virq); + + return 0; +} + +static const struct irq_domain_ops pm4125_domain_ops = { + .map = pm4125_codec_irq_chip_map, +}; + +static int pm4125_irq_init(struct pm4125_priv *pm4125, struct device *dev) +{ + pm4125->virq = irq_domain_add_linear(NULL, 1, &pm4125_domain_ops, NULL); + if (!(pm4125->virq)) { + dev_err(dev, "%s: Failed to add IRQ domain\n", __func__); + return -EINVAL; + } + + pm4125_regmap_irq_chip.irq_drv_data = pm4125; + + return devm_regmap_add_irq_chip(dev, pm4125->regmap, irq_create_mapping(pm4125->virq, 0), + IRQF_ONESHOT, 0, &pm4125_regmap_irq_chip, + &pm4125->irq_chip); +} + +static int pm4125_soc_codec_probe(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + struct sdw_slave *tx_sdw_dev = pm4125->tx_sdw_dev; + struct device *dev = component->dev; + unsigned long time_left; + int i, ret; + + time_left = wait_for_completion_timeout(&tx_sdw_dev->initialization_complete, + msecs_to_jiffies(5000)); + if (!time_left) { + dev_err(dev, "soundwire device init timeout\n"); + return -ETIMEDOUT; + } + + snd_soc_component_init_regmap(component, pm4125->regmap); + ret = pm_runtime_resume_and_get(dev); + if (ret < 0) + return ret; + + pm4125_io_init(pm4125->regmap); + + /* Set all interrupts as edge triggered */ + for (i = 0; i < pm4125_regmap_irq_chip.num_regs; i++) + regmap_write(pm4125->regmap, (PM4125_DIG_SWR_INTR_LEVEL_0 + i), 0); + + pm_runtime_put(dev); + + pm4125->hphr_pdm_wd_int = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHR_PDM_WD_INT); + pm4125->hphl_pdm_wd_int = regmap_irq_get_virq(pm4125->irq_chip, PM4125_IRQ_HPHL_PDM_WD_INT); + + /* Request for watchdog interrupts */ + ret = devm_request_threaded_irq(dev, pm4125->hphr_pdm_wd_int, NULL, pm4125_wd_handle_irq, + IRQF_ONESHOT | IRQF_TRIGGER_RISING, + "HPHR PDM WDOG INT", pm4125); + if (ret) + dev_err(dev, "Failed to request HPHR wdt interrupt: %d\n", ret); + + ret = devm_request_threaded_irq(dev, pm4125->hphl_pdm_wd_int, NULL, pm4125_wd_handle_irq, + IRQF_ONESHOT | IRQF_TRIGGER_RISING, + "HPHL PDM WDOG INT", pm4125); + if (ret) + dev_err(dev, "Failed to request HPHL wdt interrupt: %d\n", ret); + + disable_irq_nosync(pm4125->hphr_pdm_wd_int); + disable_irq_nosync(pm4125->hphl_pdm_wd_int); + + ret = pm4125_mbhc_init(component); + if (ret) + dev_err(component->dev, "mbhc initialization failed\n"); + + return ret; +} + +static void pm4125_soc_codec_remove(struct snd_soc_component *component) +{ + struct pm4125_priv *pm4125 = snd_soc_component_get_drvdata(component); + + pm4125_mbhc_deinit(component); + free_irq(pm4125->hphl_pdm_wd_int, pm4125); + free_irq(pm4125->hphr_pdm_wd_int, pm4125); +} + +static int pm4125_codec_set_jack(struct snd_soc_component *comp, struct snd_soc_jack *jack, + void *data) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(comp->dev); + int ret = 0; + + if (jack) + ret = wcd_mbhc_start(pm4125->wcd_mbhc, &pm4125->mbhc_cfg, jack); + else + wcd_mbhc_stop(pm4125->wcd_mbhc); + + return ret; +} + +static const struct snd_soc_component_driver soc_codec_dev_pm4125 = { + .name = "pm4125_codec", + .probe = pm4125_soc_codec_probe, + .remove = pm4125_soc_codec_remove, + .controls = 
pm4125_snd_controls, + .num_controls = ARRAY_SIZE(pm4125_snd_controls), + .dapm_widgets = pm4125_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(pm4125_dapm_widgets), + .dapm_routes = pm4125_audio_map, + .num_dapm_routes = ARRAY_SIZE(pm4125_audio_map), + .set_jack = pm4125_codec_set_jack, + .endianness = 1, +}; + +static void pm4125_dt_parse_micbias_info(struct device *dev, struct pm4125_priv *priv) +{ + struct device_node *np = dev->of_node; + u32 prop_val = 0; + int ret; + + ret = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); + if (!ret) + priv->micb1_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias1 DT property not found\n"); + + ret = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); + if (!ret) + priv->micb2_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias2 DT property not found\n"); + + ret = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); + if (!ret) + priv->micb3_mv = prop_val / 1000; + else + dev_warn(dev, "Micbias3 DT property not found\n"); +} + +static int pm4125_codec_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + return pm4125_sdw_hw_params(sdw_priv, substream, params, dai); +} + +static int pm4125_codec_free(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + return sdw_stream_remove_slave(sdw_priv->sdev, sdw_priv->sruntime); +} + +static int pm4125_codec_set_sdw_stream(struct snd_soc_dai *dai, void *stream, int direction) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + + sdw_priv->sruntime = stream; + + return 0; +} + +static int pm4125_get_channel_map(const struct snd_soc_dai *dai, + unsigned int *tx_num, unsigned int *tx_slot, + unsigned int *rx_num, unsigned int *rx_slot) +{ + struct pm4125_priv *pm4125 = dev_get_drvdata(dai->dev); + struct pm4125_sdw_priv *sdw_priv = pm4125->sdw_priv[dai->id]; + int i; + + switch (dai->id) { + case AIF1_PB: + if (!rx_slot || !rx_num) { + dev_err(dai->dev, "Invalid rx_slot %p or rx_num %p\n", rx_slot, rx_num); + return -EINVAL; + } + + for (i = 0; i < SDW_MAX_PORTS; i++) + rx_slot[i] = sdw_priv->master_channel_map[i]; + + *rx_num = i; + break; + case AIF1_CAP: + if (!tx_slot || !tx_num) { + dev_err(dai->dev, "Invalid tx_slot %p or tx_num %p\n", tx_slot, tx_num); + return -EINVAL; + } + + for (i = 0; i < SDW_MAX_PORTS; i++) + tx_slot[i] = sdw_priv->master_channel_map[i]; + + *tx_num = i; + break; + default: + break; + } + + return 0; +} + +static const struct snd_soc_dai_ops pm4125_sdw_dai_ops = { + .hw_params = pm4125_codec_hw_params, + .hw_free = pm4125_codec_free, + .set_stream = pm4125_codec_set_sdw_stream, + .get_channel_map = pm4125_get_channel_map, +}; + +static struct snd_soc_dai_driver pm4125_dais[] = { + [0] = { + .name = "pm4125-sdw-rx", + .playback = { + .stream_name = "PM4125 AIF Playback", + .rates = PM4125_RATES | PM4125_FRAC_RATES, + .formats = PM4125_FORMATS, + .rate_min = 8000, + .rate_max = 384000, + .channels_min = 1, + .channels_max = 4, + }, + .ops = &pm4125_sdw_dai_ops, + }, + [1] = { + .name = "pm4125-sdw-tx", + .capture = { + .stream_name = "PM4125 AIF Capture", + .rates = PM4125_RATES, + .formats = PM4125_FORMATS, + .rate_min = 8000, + .rate_max = 192000, 
+			.channels_min = 1,
+			.channels_max = 4,
+		},
+		.ops = &pm4125_sdw_dai_ops,
+	},
+};
+
+static int pm4125_bind(struct device *dev)
+{
+	struct pm4125_priv *pm4125 = dev_get_drvdata(dev);
+	struct device_link *devlink;
+	int ret;
+
+	/* Give the soundwire subdevices some more time to settle */
+	usleep_range(15000, 15010);
+
+	ret = component_bind_all(dev, pm4125);
+	if (ret) {
+		dev_err(dev, "Slave bind failed, ret = %d\n", ret);
+		return ret;
+	}
+
+	pm4125->rxdev = pm4125_sdw_device_get(pm4125->rxnode);
+	if (!pm4125->rxdev) {
+		dev_err(dev, "could not find rxslave with matching of node\n");
+		ret = -EINVAL;
+		goto error_unbind_all;
+	}
+
+	pm4125->sdw_priv[AIF1_PB] = dev_get_drvdata(pm4125->rxdev);
+	pm4125->sdw_priv[AIF1_PB]->pm4125 = pm4125;
+
+	pm4125->txdev = pm4125_sdw_device_get(pm4125->txnode);
+	if (!pm4125->txdev) {
+		dev_err(dev, "could not find txslave with matching of node\n");
+		ret = -EINVAL;
+		goto error_unbind_all;
+	}
+
+	pm4125->sdw_priv[AIF1_CAP] = dev_get_drvdata(pm4125->txdev);
+	pm4125->sdw_priv[AIF1_CAP]->pm4125 = pm4125;
+
+	pm4125->tx_sdw_dev = dev_to_sdw_dev(pm4125->txdev);
+	if (!pm4125->tx_sdw_dev) {
+		dev_err(dev, "could not get txslave with matching of dev\n");
+		ret = -EINVAL;
+		goto error_unbind_all;
+	}
+
+	/*
+	 * As TX is the main CSR reg interface, it should not be suspended
+	 * first; explicitly add the dependency link.
+	 */
+	devlink = device_link_add(pm4125->rxdev, pm4125->txdev,
+				  DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
+	if (!devlink) {
+		dev_err(dev, "Could not devlink TX and RX\n");
+		ret = -EINVAL;
+		goto error_unbind_all;
+	}
+
+	devlink = device_link_add(dev, pm4125->txdev,
+				  DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
+	if (!devlink) {
+		dev_err(dev, "Could not devlink PM4125 and TX\n");
+		ret = -EINVAL;
+		goto link_remove_rx_tx;
+	}
+
+	devlink = device_link_add(dev, pm4125->rxdev,
+				  DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
+	if (!devlink) {
+		dev_err(dev, "Could not devlink PM4125 and RX\n");
+		ret = -EINVAL;
+		goto link_remove_dev_tx;
+	}
+
+	pm4125->regmap = dev_get_regmap(&pm4125->tx_sdw_dev->dev, NULL);
+	if (!pm4125->regmap) {
+		dev_err(dev, "could not get TX device regmap\n");
+		ret = -EINVAL;
+		goto link_remove_dev_rx;
+	}
+
+	ret = pm4125_irq_init(pm4125, dev);
+	if (ret) {
+		dev_err(dev, "IRQ init failed: %d\n", ret);
+		goto link_remove_dev_rx;
+	}
+
+	pm4125->sdw_priv[AIF1_PB]->slave_irq = pm4125->virq;
+	pm4125->sdw_priv[AIF1_CAP]->slave_irq = pm4125->virq;
+
+	ret = pm4125_set_micbias_data(dev, pm4125);
+	if (ret < 0) {
+		dev_err(dev, "Bad micbias pdata\n");
+		goto link_remove_dev_rx;
+	}
+
+	ret = snd_soc_register_component(dev, &soc_codec_dev_pm4125,
+					 pm4125_dais, ARRAY_SIZE(pm4125_dais));
+	if (!ret)
+		return ret;
+
+	dev_err(dev, "Codec registration failed\n");
+
+link_remove_dev_rx:
+	device_link_remove(dev, pm4125->rxdev);
+link_remove_dev_tx:
+	device_link_remove(dev, pm4125->txdev);
+link_remove_rx_tx:
+	device_link_remove(pm4125->rxdev, pm4125->txdev);
+error_unbind_all:
+	component_unbind_all(dev, pm4125);
+	return ret;
+}
+
+static void pm4125_unbind(struct device *dev)
+{
+	struct pm4125_priv *pm4125 = dev_get_drvdata(dev);
+
+	snd_soc_unregister_component(dev);
+	device_link_remove(dev, pm4125->txdev);
+	device_link_remove(dev, pm4125->rxdev);
+	device_link_remove(pm4125->rxdev, pm4125->txdev);
+	component_unbind_all(dev, pm4125);
+}
+
+static const struct component_master_ops pm4125_comp_ops = {
+	.bind = pm4125_bind,
+	.unbind = pm4125_unbind,
+};
+
+static int pm4125_add_slave_components(struct pm4125_priv
*pm4125, struct device *dev, + struct component_match **matchptr) +{ + struct device_node *np = dev->of_node; + + pm4125->rxnode = of_parse_phandle(np, "qcom,rx-device", 0); + if (!pm4125->rxnode) + return dev_err_probe(dev, -ENODEV, "Couldn't parse phandle to qcom,rx-device\n"); + component_match_add_release(dev, matchptr, component_release_of, component_compare_of, + pm4125->rxnode); + + pm4125->txnode = of_parse_phandle(np, "qcom,tx-device", 0); + if (!pm4125->txnode) + return dev_err_probe(dev, -ENODEV, "Couldn't parse phandle to qcom,tx-device\n"); + component_match_add_release(dev, matchptr, component_release_of, component_compare_of, + pm4125->txnode); + + return 0; +} + +static int pm4125_probe(struct platform_device *pdev) +{ + struct component_match *match = NULL; + struct device *dev = &pdev->dev; + struct pm4125_priv *pm4125; + struct wcd_mbhc_config *cfg; + int ret; + + pm4125 = devm_kzalloc(dev, sizeof(*pm4125), GFP_KERNEL); + if (!pm4125) + return -ENOMEM; + + dev_set_drvdata(dev, pm4125); + + ret = devm_regulator_bulk_get_enable(dev, ARRAY_SIZE(pm4125_power_supplies), + pm4125_power_supplies); + if (ret) + return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); + + pm4125->spmi_regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!pm4125->spmi_regmap) + return -ENXIO; + + pm4125_reset(pm4125); + + pm4125_dt_parse_micbias_info(dev, pm4125); + atomic_set(&pm4125->gloal_mbias_cnt, 0); + + cfg = &pm4125->mbhc_cfg; + cfg->mbhc_micbias = MIC_BIAS_2; + cfg->anc_micbias = MIC_BIAS_2; + cfg->v_hs_max = WCD_MBHC_HS_V_MAX; + cfg->num_btn = PM4125_MBHC_MAX_BUTTONS; + cfg->micb_mv = pm4125->micb2_mv; + cfg->linein_th = 5000; + cfg->hs_thr = 1700; + cfg->hph_thr = 50; + + wcd_dt_parse_mbhc_data(dev, &pm4125->mbhc_cfg); + + ret = pm4125_add_slave_components(pm4125, dev, &match); + if (ret) + return ret; + + ret = component_master_add_with_match(dev, &pm4125_comp_ops, match); + if (ret) + return ret; + + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_use_autosuspend(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + pm_runtime_idle(dev); + + return 0; +} + +static void pm4125_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + + component_master_del(&pdev->dev, &pm4125_comp_ops); + + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + pm_runtime_dont_use_autosuspend(dev); +} + +static const struct of_device_id pm4125_of_match[] = { + { .compatible = "qcom,pm4125-codec" }, + { } +}; +MODULE_DEVICE_TABLE(of, pm4125_of_match); + +static struct platform_driver pm4125_codec_driver = { + .probe = pm4125_probe, + .remove = pm4125_remove, + .driver = { + .name = "pm4125_codec", + .of_match_table = pm4125_of_match, + .suppress_bind_attrs = true, + }, +}; + +module_platform_driver(pm4125_codec_driver); +MODULE_DESCRIPTION("PM4125 audio codec driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/pm4125.h b/sound/soc/codecs/pm4125.h new file mode 100644 index 00000000000000..3520c711b744a9 --- /dev/null +++ b/sound/soc/codecs/pm4125.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#ifndef _PM4125_REGISTERS_H +#define _PM4125_REGISTERS_H + +#include +#include + +#define PM4125_ANA_BASE_ADDR 0x3000 +#define PM4125_DIG_BASE_ADDR 0x3400 + +#define PM4125_ANA_MICBIAS_MICB_1_2_EN (PM4125_ANA_BASE_ADDR+0x040) +#define PM4125_ANA_MICBIAS_MICB1_PULL_UP_MASK BIT(5) +#define PM4125_ANA_MICBIAS_MICB2_PULL_UP_MASK BIT(1) +#define PM4125_ANA_MICBIAS_MICB2_PULL_DN_MASK BIT(0) +#define PM4125_ANA_MICBIAS_MICB_PULL_ENABLE 1 +#define PM4125_ANA_MICBIAS_MICB_PULL_DISABLE 0 +#define PM4125_ANA_MICBIAS_MICB_3_EN (PM4125_ANA_BASE_ADDR+0x041) +#define PM4125_ANA_MICBIAS_LDO_1_SETTING (PM4125_ANA_BASE_ADDR+0x042) +#define PM4125_ANA_MICBIAS_MICB_OUT_VAL_MASK GENMASK(7, 3) +#define PM4125_ANA_MICBIAS_LDO_1_CTRL (PM4125_ANA_BASE_ADDR+0x043) +#define PM4125_ANA_TX_AMIC1 (PM4125_ANA_BASE_ADDR+0x047) +#define PM4125_ANA_TX_AMIC2 (PM4125_ANA_BASE_ADDR+0x048) +#define PM4125_ANA_MBHC_MECH (PM4125_ANA_BASE_ADDR+0x05A) +#define PM4125_ANA_MBHC_ELECT (PM4125_ANA_BASE_ADDR+0x05B) +#define PM4125_ANA_MBHC_ELECT_BIAS_EN_MASK BIT(0) +#define PM4125_ANA_MBHC_ELECT_BIAS_ENABLE 1 +#define PM4125_ANA_MBHC_ELECT_BIAS_DISABLE 0 +#define PM4125_ANA_MBHC_ZDET (PM4125_ANA_BASE_ADDR+0x05C) +#define PM4125_ANA_MBHC_RESULT_1 (PM4125_ANA_BASE_ADDR+0x05D) +#define PM4125_ANA_MBHC_RESULT_2 (PM4125_ANA_BASE_ADDR+0x05E) +#define PM4125_ANA_MBHC_RESULT_3 (PM4125_ANA_BASE_ADDR+0x05F) +#define PM4125_ANA_MBHC_BTN0_ZDET_VREF1 (PM4125_ANA_BASE_ADDR+0x060) +#define PM4125_ANA_MBHC_BTN0_THRESHOLD_MASK GENMASK(7, 2) +#define PM4125_ANA_MBHC_BTN1_ZDET_VREF2 (PM4125_ANA_BASE_ADDR+0x061) +#define PM4125_ANA_MBHC_BTN2_ZDET_VREF3 (PM4125_ANA_BASE_ADDR+0x062) +#define PM4125_ANA_MBHC_BTN3_ZDET_DBG_400 (PM4125_ANA_BASE_ADDR+0x063) +#define PM4125_ANA_MBHC_BTN4_ZDET_DBG_1400 (PM4125_ANA_BASE_ADDR+0x064) +#define PM4125_ANA_MBHC_MICB2_RAMP (PM4125_ANA_BASE_ADDR+0x065) +#define PM4125_ANA_MBHC_CTL_1 (PM4125_ANA_BASE_ADDR+0x066) +#define PM4125_ANA_MBHC_CTL_2 (PM4125_ANA_BASE_ADDR+0x067) +#define PM4125_ANA_MBHC_PLUG_DETECT_CTL (PM4125_ANA_BASE_ADDR+0x068) +#define PM4125_ANA_MBHC_ZDET_ANA_CTL (PM4125_ANA_BASE_ADDR+0x069) +#define PM4125_ANA_MBHC_ZDET_RAMP_CTL (PM4125_ANA_BASE_ADDR+0x06A) +#define PM4125_ANA_MBHC_FSM_STATUS (PM4125_ANA_BASE_ADDR+0x06B) +#define PM4125_ANA_MBHC_ADC_RESULT (PM4125_ANA_BASE_ADDR+0x06C) +#define PM4125_ANA_MBHC_CTL_CLK (PM4125_ANA_BASE_ADDR+0x06D) +#define PM4125_ANA_MBHC_ZDET_CALIB_RESULT (PM4125_ANA_BASE_ADDR+0x072) +#define PM4125_ANA_NCP_EN (PM4125_ANA_BASE_ADDR+0x077) +#define PM4125_ANA_NCP_ENABLE_MASK BIT(0) +#define PM4125_ANA_NCP_ENABLE 1 +#define PM4125_ANA_NCP_DISABLE 0 +#define PM4125_ANA_NCP_VCTRL (PM4125_ANA_BASE_ADDR+0x07C) +#define PM4125_ANA_HPHPA_CNP_CTL_1 (PM4125_ANA_BASE_ADDR+0x083) +#define PM4125_ANA_HPHPA_CNP_CTL_1_EN_MASK BIT(1) +#define PM4125_ANA_HPHPA_CNP_CTL_1_EN 1 +#define PM4125_ANA_HPHPA_CNP_CTL_2 (PM4125_ANA_BASE_ADDR+0x084) +#define PM4125_ANA_HPHPA_CNP_OCP_EN_L_MASK BIT(1) +#define PM4125_ANA_HPHPA_CNP_OCP_EN_R_MASK BIT(0) +#define PM4125_ANA_HPHPA_CNP_OCP_ENABLE 1 +#define PM4125_ANA_HPHPA_CNP_OCP_DISABLE 0 +#define PM4125_ANA_HPHPA_PA_STATUS (PM4125_ANA_BASE_ADDR+0x087) +#define PM4125_ANA_HPHPA_FSM_CLK (PM4125_ANA_BASE_ADDR+0x088) +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_EN_MASK BIT(7) +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_ENABLE 1 +#define PM4125_ANA_HPHPA_FSM_CLK_DIV_DISABLE 0 +#define PM4125_ANA_HPHPA_FSM_DIV_RATIO_MASK GENMASK(6, 0) +#define PM4125_ANA_HPHPA_FSM_DIV_RATIO_68 (0x11) +#define PM4125_ANA_HPHPA_L_GAIN (PM4125_ANA_BASE_ADDR+0x08B) +#define 
PM4125_ANA_HPHPA_R_GAIN (PM4125_ANA_BASE_ADDR+0x08C) +#define PM4125_ANA_HPHPA_SPARE_CTL (PM4125_ANA_BASE_ADDR+0x08E) +#define PM4125_SWR_HPHPA_HD2 (PM4125_ANA_BASE_ADDR+0x090) +#define PM4125_SWR_HPHPA_HD2_LEFT_MASK GENMASK(5, 3) +#define PM4125_SWR_HPHPA_HD2_RIGHT_MASK GENMASK(2, 0) +#define PM4125_SWR_HPHPA_HD2_ENABLE (BIT(2) | BIT(1) | BIT(0)) +#define PM4125_ANA_SURGE_EN (PM4125_ANA_BASE_ADDR+0x097) +#define PM4125_ANA_SURGE_PROTECTION_HPHL_MASK BIT(7) +#define PM4125_ANA_SURGE_PROTECTION_HPHR_MASK BIT(6) +#define PM4125_ANA_SURGE_PROTECTION_ENABLE 1 +#define PM4125_ANA_SURGE_PROTECTION_DISABLE 0 +#define PM4125_ANA_COMBOPA_CTL (PM4125_ANA_BASE_ADDR+0x09B) +#define PM4125_ANA_COMBO_PA_SELECT_MASK BIT(6) +#define PM4125_ANA_COMBO_PA_SELECT_EAR 0 +#define PM4125_ANA_COMBO_PA_SELECT_LO 1 +#define PM4125_ANA_COMBOPA_CTL_4 (PM4125_ANA_BASE_ADDR+0x09F) +#define PM4125_ANA_COMBOPA_CTL_5 (PM4125_ANA_BASE_ADDR+0x0A0) +#define PM4125_ANA_RXLDO_CTL (PM4125_ANA_BASE_ADDR+0x0B2) +#define PM4125_ANA_MBIAS_EN (PM4125_ANA_BASE_ADDR+0x0B4) +#define PM4125_ANA_MBIAS_EN_GLOBAL_MASK BIT(5) +#define PM4125_ANA_MBIAS_EN_V2I_MASK BIT(4) +#define PM4125_ANA_MBIAS_EN_ENABLE 1 +#define PM4125_ANA_MBIAS_EN_DISABLE 0 + +#define PM4125_DIG_SWR_CHIP_ID0 (PM4125_DIG_BASE_ADDR+0x001) +#define PM4125_DIG_SWR_CHIP_ID1 (PM4125_DIG_BASE_ADDR+0x002) +#define PM4125_DIG_SWR_CHIP_ID2 (PM4125_DIG_BASE_ADDR+0x003) +#define PM4125_DIG_SWR_CHIP_ID3 (PM4125_DIG_BASE_ADDR+0x004) +#define PM4125_DIG_SWR_SWR_TX_CLK_RATE (PM4125_DIG_BASE_ADDR+0x040) +#define PM4125_DIG_SWR_CDC_RST_CTL (PM4125_DIG_BASE_ADDR+0x041) +#define PM4125_DIG_SWR_TOP_CLK_CFG (PM4125_DIG_BASE_ADDR+0x042) +#define PM4125_DIG_SWR_CDC_RX_CLK_CTL (PM4125_DIG_BASE_ADDR+0x043) +#define PM4125_DIG_SWR_ANA_RX_DIV2_CLK_EN_MASK BIT(5) +#define PM4125_DIG_SWR_ANA_RX_CLK_EN_MASK BIT(4) +#define PM4125_DIG_SWR_RX1_CLK_EN_MASK BIT(1) +#define PM4125_DIG_SWR_RX0_CLK_EN_MASK BIT(0) +#define PM4125_DIG_SWR_RX_CLK_ENABLE 1 +#define PM4125_DIG_SWR_RX_CLK_DISABLE 0 +#define PM4125_DIG_SWR_CDC_TX_CLK_CTL (PM4125_DIG_BASE_ADDR+0x044) +#define PM4125_DIG_SWR_SWR_RST_EN (PM4125_DIG_BASE_ADDR+0x045) +#define PM4125_DIG_SWR_CDC_RX_RST (PM4125_DIG_BASE_ADDR+0x047) +#define PM4125_DIG_SWR_CDC_RX0_CTL (PM4125_DIG_BASE_ADDR+0x048) +#define PM4125_DIG_SWR_DSM_DITHER_EN_MASK BIT(7) +#define PM4125_DIG_SWR_DSM_DITHER_DISABLE 0 +#define PM4125_DIG_SWR_DSM_DITHER_ENABLE 1 +#define PM4125_DIG_SWR_CDC_RX1_CTL (PM4125_DIG_BASE_ADDR+0x049) +#define PM4125_DIG_SWR_CDC_TX_ANA_MODE_0_1 (PM4125_DIG_BASE_ADDR+0x04B) +#define PM4125_DIG_SWR_TX_ANA_TXD1_MODE_MASK GENMASK(7, 4) +#define PM4125_DIG_SWR_TX_ANA_TXD0_MODE_MASK GENMASK(3, 0) +#define PM4125_DIG_SWR_TXD_MODE_ULPI (0x9) +#define PM4125_DIG_SWR_TXD_MODE_NORMAL (0x3) +#define PM4125_DIG_SWR_CDC_COMP_CTL_0 (PM4125_DIG_BASE_ADDR+0x04F) +#define PM4125_DIG_SWR_COMP_HPHL_EN_MASK BIT(1) +#define PM4125_DIG_SWR_COMP_HPHR_EN_MASK BIT(0) +#define PM4125_DIG_SWR_COMP_ENABLE 1 +#define PM4125_DIG_SWR_COMP_DISABLE 0 +#define PM4125_DIG_SWR_CDC_RX_DELAY_CTL (PM4125_DIG_BASE_ADDR+0x052) +#define PM4125_DIG_SWR_CDC_RX_GAIN_0 (PM4125_DIG_BASE_ADDR+0x053) +#define PM4125_DIG_SWR_CDC_RX_GAIN_1 (PM4125_DIG_BASE_ADDR+0x054) +#define PM4125_DIG_SWR_CDC_RX_GAIN_CTL (PM4125_DIG_BASE_ADDR+0x057) +#define PM4125_DIG_SWR_RX1_EN_MASK BIT(3) +#define PM4125_DIG_SWR_RX0_EN_MASK BIT(2) +#define PM4125_DIG_SWR_RX_INPUT_DISABLE 0 +#define PM4125_DIG_SWR_RX_INPUT_ENABLE 1 +#define PM4125_DIG_SWR_CDC_TX0_CTL (PM4125_DIG_BASE_ADDR+0x060) +#define PM4125_DIG_SWR_CDC_TX1_CTL 
(PM4125_DIG_BASE_ADDR+0x061) +#define PM4125_DIG_SWR_CDC_TX_RST (PM4125_DIG_BASE_ADDR+0x063) +#define PM4125_DIG_SWR_CDC_REQ0_CTL (PM4125_DIG_BASE_ADDR+0x064) +#define PM4125_DIG_SWR_CDC_REQ1_CTL (PM4125_DIG_BASE_ADDR+0x065) +#define PM4125_DIG_SWR_CDC_RST (PM4125_DIG_BASE_ADDR+0x067) +#define PM4125_DIG_SWR_CDC_AMIC_CTL (PM4125_DIG_BASE_ADDR+0x06A) +#define PM4125_DIG_SWR_AMIC_SELECT_MASK BIT(1) +#define PM4125_DIG_SWR_AMIC_SELECT_DMIC1 0 +#define PM4125_DIG_SWR_AMIC_SELECT_AMIC3 1 +#define PM4125_DIG_SWR_CDC_DMIC_CTL (PM4125_DIG_BASE_ADDR+0x06B) +#define PM4125_DIG_SWR_CDC_DMIC1_CTL (PM4125_DIG_BASE_ADDR+0x06C) +#define PM4125_DIG_SWR_DMIC1_CLK_EN_MASK BIT(3) +#define PM4125_DIG_SWR_DMIC1_CLK_ENABLE 1 +#define PM4125_DIG_SWR_DMIC1_CLK_DISABLE 0 +#define PM4125_DIG_SWR_CDC_DMIC1_RATE (PM4125_DIG_BASE_ADDR+0x06D) +#define PM4125_DIG_SWR_PDM_WD_CTL0 (PM4125_DIG_BASE_ADDR+0x070) +#define PM4125_WDT_ENABLE_MASK GENMASK(1, 0) +#define PM4125_WDT_ENABLE_RX0_L BIT(0) +#define PM4125_WDT_ENABLE_RX0_M BIT(1) +#define PM4125_DIG_SWR_PDM_WD_CTL1 (PM4125_DIG_BASE_ADDR+0x071) +#define PM4125_WDT_ENABLE_RX1_L BIT(0) +#define PM4125_WDT_ENABLE_RX1_M BIT(1) +#define PM4125_DIG_SWR_INTR_MODE (PM4125_DIG_BASE_ADDR+0x080) +#define PM4125_DIG_SWR_INTR_MASK_0 (PM4125_DIG_BASE_ADDR+0x081) +#define PM4125_DIG_SWR_INTR_MASK_1 (PM4125_DIG_BASE_ADDR+0x082) +#define PM4125_DIG_SWR_INTR_MASK_2 (PM4125_DIG_BASE_ADDR+0x083) +#define PM4125_DIG_SWR_INTR_STATUS_0 (PM4125_DIG_BASE_ADDR+0x084) +#define PM4125_DIG_SWR_INTR_STATUS_1 (PM4125_DIG_BASE_ADDR+0x085) +#define PM4125_DIG_SWR_INTR_STATUS_2 (PM4125_DIG_BASE_ADDR+0x086) +#define PM4125_DIG_SWR_INTR_CLEAR_0 (PM4125_DIG_BASE_ADDR+0x087) +#define PM4125_DIG_SWR_INTR_CLEAR_1 (PM4125_DIG_BASE_ADDR+0x088) +#define PM4125_DIG_SWR_INTR_CLEAR_2 (PM4125_DIG_BASE_ADDR+0x089) +#define PM4125_DIG_SWR_INTR_LEVEL_0 (PM4125_DIG_BASE_ADDR+0x08A) +#define PM4125_DIG_SWR_INTR_LEVEL_1 (PM4125_DIG_BASE_ADDR+0x08B) +#define PM4125_DIG_SWR_INTR_LEVEL_2 (PM4125_DIG_BASE_ADDR+0x08C) +#define PM4125_DIG_SWR_CDC_CONN_RX0_CTL (PM4125_DIG_BASE_ADDR+0x093) +#define PM4125_DIG_SWR_CDC_CONN_RX1_CTL (PM4125_DIG_BASE_ADDR+0x094) +#define PM4125_DIG_SWR_LOOP_BACK_MODE (PM4125_DIG_BASE_ADDR+0x097) +#define PM4125_DIG_SWR_DRIVE_STRENGTH_0 (PM4125_DIG_BASE_ADDR+0x0A0) +#define PM4125_DIG_SWR_DIG_DEBUG_CTL (PM4125_DIG_BASE_ADDR+0x0AB) +#define PM4125_DIG_SWR_DIG_DEBUG_EN (PM4125_DIG_BASE_ADDR+0x0AC) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA0 (PM4125_DIG_BASE_ADDR+0x0B0) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA1 (PM4125_DIG_BASE_ADDR+0x0B1) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA2 (PM4125_DIG_BASE_ADDR+0x0B2) +#define PM4125_DIG_SWR_DEM_BYPASS_DATA3 (PM4125_DIG_BASE_ADDR+0x0B3) + +#define PM4125_ANALOG_REGISTERS_MAX_SIZE (PM4125_ANA_BASE_ADDR+0x0B5) +#define PM4125_DIGITAL_REGISTERS_MAX_SIZE (PM4125_DIG_BASE_ADDR+0x0B4) +#define PM4125_ANALOG_MAX_REGISTER (PM4125_ANALOG_REGISTERS_MAX_SIZE - 1) +#define PM4125_DIGITAL_MAX_REGISTER (PM4125_DIGITAL_REGISTERS_MAX_SIZE - 1) +#define PM4125_MAX_REGISTER PM4125_DIGITAL_MAX_REGISTER + +#define PM4125_MAX_MICBIAS 3 +#define PM4125_MAX_SWR_CH_IDS 15 +#define PM4125_SWRM_CH_MASK(ch_idx) BIT(ch_idx - 1) + +enum pm4125_tx_sdw_ports { + PM4125_ADC_1_2_DMIC1L_BCS_PORT = 1, + PM4125_DMIC_1L_1R_ADC1_BCS_PORT, + PM4125_MAX_TX_SWR_PORTS = PM4125_DMIC_1L_1R_ADC1_BCS_PORT, +}; + +enum pm4125_rx_sdw_ports { + PM4125_HPH_PORT = 1, + PM4125_COMP_PORT, + PM4125_MAX_SWR_PORTS = PM4125_COMP_PORT, +}; + +struct pm4125_sdw_ch_info { + int port_num; + unsigned int ch_mask; + unsigned int 
master_ch_mask; +}; + +#define WCD_SDW_CH(id, pn, cmask) \ + [id] = { \ + .port_num = pn, \ + .ch_mask = cmask, \ + .master_ch_mask = cmask, \ + } + +struct pm4125_priv; +struct pm4125_sdw_priv { + struct sdw_slave *sdev; + struct sdw_stream_config sconfig; + struct sdw_stream_runtime *sruntime; + struct sdw_port_config port_config[PM4125_MAX_SWR_PORTS]; + struct pm4125_sdw_ch_info *ch_info; + bool port_enable[PM4125_MAX_SWR_CH_IDS]; + unsigned int master_channel_map[SDW_MAX_PORTS]; + int active_ports; + int num_ports; + bool is_tx; + struct pm4125_priv *pm4125; + struct irq_domain *slave_irq; + struct regmap *regmap; +}; + +#if IS_ENABLED(CONFIG_SND_SOC_PM4125_SDW) +int pm4125_sdw_free(struct pm4125_sdw_priv *pm4125, struct snd_pcm_substream *substream, + struct snd_soc_dai *dai); +int pm4125_sdw_set_sdw_stream(struct pm4125_sdw_priv *pm4125, struct snd_soc_dai *dai, void *stream, + int direction); +int pm4125_sdw_hw_params(struct pm4125_sdw_priv *pm4125, struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); + +struct device *pm4125_sdw_device_get(struct device_node *np); + +#else +static inline int pm4125_sdw_free(struct pm4125_sdw_priv *pm4125, + struct snd_pcm_substream *substream, struct snd_soc_dai *dai) +{ + return -EOPNOTSUPP; +} + +static inline int pm4125_sdw_set_sdw_stream(struct pm4125_sdw_priv *pm4125, + struct snd_soc_dai *dai, void *stream, int direction) +{ + return -EOPNOTSUPP; +} + +static inline int pm4125_sdw_hw_params(struct pm4125_sdw_priv *pm4125, + struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) +{ + return -EOPNOTSUPP; +} +#endif + +enum { + /* INTR_CTRL_INT_MASK_0 */ + PM4125_IRQ_MBHC_BUTTON_PRESS_DET = 0, + PM4125_IRQ_MBHC_BUTTON_RELEASE_DET, + PM4125_IRQ_MBHC_ELECT_INS_REM_DET, + PM4125_IRQ_MBHC_ELECT_INS_REM_LEG_DET, + PM4125_IRQ_MBHC_SW_DET, + PM4125_IRQ_HPHR_OCP_INT, + PM4125_IRQ_HPHR_CNP_INT, + PM4125_IRQ_HPHL_OCP_INT, + + /* INTR_CTRL_INT_MASK_1 */ + PM4125_IRQ_HPHL_CNP_INT, + PM4125_IRQ_EAR_CNP_INT, + PM4125_IRQ_EAR_SCD_INT, + PM4125_IRQ_AUX_CNP_INT, + PM4125_IRQ_AUX_SCD_INT, + PM4125_IRQ_HPHL_PDM_WD_INT, + PM4125_IRQ_HPHR_PDM_WD_INT, + PM4125_IRQ_AUX_PDM_WD_INT, + + /* INTR_CTRL_INT_MASK_2 */ + PM4125_IRQ_LDORT_SCD_INT, + PM4125_IRQ_MBHC_MOISTURE_INT, + PM4125_IRQ_HPHL_SURGE_DET_INT, + PM4125_IRQ_HPHR_SURGE_DET_INT, + PM4125_NUM_IRQS, +}; + +enum pm4125_tx_sdw_channels { + PM4125_ADC1, + PM4125_ADC2, +}; + +enum pm4125_rx_sdw_channels { + PM4125_HPH_L, + PM4125_HPH_R, + PM4125_COMP_L, + PM4125_COMP_R, +}; + +#endif /* _PM4125_REGISTERS_H */ diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c index 80b921695e7d1c..1d80a4b862e276 100644 --- a/sound/soc/codecs/rt5682s.c +++ b/sound/soc/codecs/rt5682s.c @@ -653,14 +653,15 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode switch (mode) { case SAR_PWR_SAVING: snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_3, - RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_DIS); + RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_EN); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK, - RT5682S_CTRL_MB1_REG | RT5682S_CTRL_MB2_REG); + RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK | + RT5682S_VREF_POW_MASK, RT5682S_CTRL_MB1_FSM | + RT5682S_CTRL_MB2_FSM | RT5682S_VREF_POW_FSM); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, 
RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); usleep_range(5000, 5500); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK, RT5682S_SAR_BUTDET_EN); @@ -688,7 +689,7 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); break; default: dev_err(component->dev, "Invalid SAR Power mode: %d\n", mode); @@ -725,7 +726,7 @@ static void rt5682s_disable_push_button_irq(struct snd_soc_component *component) snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); } /** @@ -786,7 +787,7 @@ static int rt5682s_headset_detect(struct snd_soc_component *component, int jack_ jack_type = SND_JACK_HEADSET; snd_soc_component_write(component, RT5682S_SAR_IL_CMD_3, 0x024c); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_EN); + RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_DIS); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_SEL_MB1_2_MASK, val << RT5682S_SAR_SEL_MB1_2_SFT); rt5682s_enable_push_button_irq(component); @@ -966,7 +967,7 @@ static int rt5682s_set_jack_detect(struct snd_soc_component *component, RT5682S_EMB_JD_MASK | RT5682S_DET_TYPE | RT5682S_POL_FAST_OFF_MASK | RT5682S_MIC_CAP_MASK, RT5682S_EMB_JD_EN | RT5682S_DET_TYPE | - RT5682S_POL_FAST_OFF_HIGH | RT5682S_MIC_CAP_HS); + RT5682S_POL_FAST_OFF_LOW | RT5682S_MIC_CAP_HS); regmap_update_bits(rt5682s->regmap, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_POW_MASK, RT5682S_SAR_POW_EN); regmap_update_bits(rt5682s->regmap, RT5682S_GPIO_CTRL_1, diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c new file mode 100644 index 00000000000000..1fb4227b711e6b --- /dev/null +++ b/sound/soc/codecs/tas2783-sdw.c @@ -0,0 +1,1331 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// ALSA SoC Texas Instruments TAS2783 Audio Smart Amplifier +// +// Copyright (C) 2025 Texas Instruments Incorporated +// https://www.ti.com +// +// The TAS2783 driver implements a flexible and configurable +// algo coefficient setting for single TAS2783 chips. 
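+//
+// Firmware (PRAM and YRAM coefficient images) is downloaded to the
+// device over SoundWire, and speaker calibration data is applied from
+// a UEFI variable when one is present.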
+// +// Author: Niranjan H Y +// Author: Baojun Xu +// Author: Kevin Lu + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tas2783.h" + +#define TIMEOUT_FW_DL_MS (3000) +#define FW_DL_OFFSET 36 +#define FW_FL_HDR 12 +#define TAS2783_PROBE_TIMEOUT 5000 +#define TAS2783_CALI_GUID EFI_GUID(0x1f52d2a1, 0xbb3a, 0x457d, 0xbc, \ + 0x09, 0x43, 0xa3, 0xf4, 0x31, 0x0a, 0x92) + +static const u32 tas2783_cali_reg[] = { + TAS2783_CAL_R0, + TAS2783_CAL_INVR0, + TAS2783_CAL_R0LOW, + TAS2783_CAL_POWER, + TAS2783_CAL_TLIM, +}; + +struct bin_header_t { + u16 vendor_id; + u16 version; + u32 file_id; + u32 length; +}; + +struct calibration_data { + u32 is_valid; + unsigned long read_sz; + u8 data[TAS2783_CALIB_DATA_SZ]; +}; + +struct tas2783_prv { + struct snd_soc_component *component; + struct calibration_data cali_data; + struct sdw_slave *sdw_peripheral; + enum sdw_slave_status status; + /* calibration */ + struct mutex calib_lock; + /* pde and firmware download */ + struct mutex pde_lock; + struct regmap *regmap; + struct device *dev; + struct class *class; + struct attribute_group *cal_attr_groups; + struct tm tm; + u8 rca_binaryname[64]; + u8 dev_name[32]; + bool hw_init; + /* wq for firmware download */ + wait_queue_head_t fw_wait; + bool fw_dl_task_done; + bool fw_dl_success; +}; + +static const struct reg_default tas2783_reg_default[] = { + {TAS2783_AMP_LEVEL, 0x28}, + {TASDEV_REG_SDW(0, 0, 0x03), 0x28}, + {TASDEV_REG_SDW(0, 0, 0x04), 0x21}, + {TASDEV_REG_SDW(0, 0, 0x05), 0x41}, + {TASDEV_REG_SDW(0, 0, 0x06), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x07), 0x20}, + {TASDEV_REG_SDW(0, 0, 0x08), 0x09}, + {TASDEV_REG_SDW(0, 0, 0x09), 0x02}, + {TASDEV_REG_SDW(0, 0, 0x0a), 0x0a}, + {TASDEV_REG_SDW(0, 0, 0x0c), 0x10}, + {TASDEV_REG_SDW(0, 0, 0x0d), 0x13}, + {TASDEV_REG_SDW(0, 0, 0x0e), 0xc2}, + {TASDEV_REG_SDW(0, 0, 0x0f), 0x40}, + {TASDEV_REG_SDW(0, 0, 0x10), 0x04}, + {TASDEV_REG_SDW(0, 0, 0x13), 0x13}, + {TASDEV_REG_SDW(0, 0, 0x14), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x15), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x16), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x17), 0x80}, + {TAS2783_DVC_LVL, 0x00}, + {TASDEV_REG_SDW(0, 0, 0x1b), 0x61}, + {TASDEV_REG_SDW(0, 0, 0x1c), 0x36}, + {TASDEV_REG_SDW(0, 0, 0x1d), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x1f), 0x01}, + {TASDEV_REG_SDW(0, 0, 0x20), 0x2e}, + {TASDEV_REG_SDW(0, 0, 0x21), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x34), 0x06}, + {TASDEV_REG_SDW(0, 0, 0x35), 0xbd}, + {TASDEV_REG_SDW(0, 0, 0x36), 0xad}, + {TASDEV_REG_SDW(0, 0, 0x37), 0xa8}, + {TASDEV_REG_SDW(0, 0, 0x38), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x3b), 0xfc}, + {TASDEV_REG_SDW(0, 0, 0x3d), 0xdd}, + {TASDEV_REG_SDW(0, 0, 0x40), 0xf6}, + {TASDEV_REG_SDW(0, 0, 0x41), 0x14}, + {TASDEV_REG_SDW(0, 0, 0x5c), 0x19}, + {TASDEV_REG_SDW(0, 0, 0x5d), 0x80}, + {TASDEV_REG_SDW(0, 0, 0x63), 0x48}, + {TASDEV_REG_SDW(0, 0, 0x65), 0x08}, + {TASDEV_REG_SDW(0, 0, 0x66), 0xb2}, + {TASDEV_REG_SDW(0, 0, 0x67), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6a), 0x12}, + {TASDEV_REG_SDW(0, 0, 0x6b), 0xfb}, + {TASDEV_REG_SDW(0, 0, 0x6c), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6d), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x6e), 0x1a}, + {TASDEV_REG_SDW(0, 0, 0x6f), 0x00}, + {TASDEV_REG_SDW(0, 0, 0x70), 0x96}, + {TASDEV_REG_SDW(0, 0, 0x71), 0x02}, + {TASDEV_REG_SDW(0, 0, 0x73), 0x08}, + {TASDEV_REG_SDW(0, 0, 0x75), 0xe0}, + {TASDEV_REG_SDW(0, 0, 0x7a), 0x60}, + {TASDEV_REG_SDW(0, 0, 0x60), 0x21}, + {TASDEV_REG_SDW(0, 1, 0x02), 0x00}, + 
{TASDEV_REG_SDW(0, 1, 0x17), 0xc0}, + {TASDEV_REG_SDW(0, 1, 0x19), 0x60}, + {TASDEV_REG_SDW(0, 1, 0x35), 0x75}, + {TASDEV_REG_SDW(0, 1, 0x3d), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x3e), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x3f), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x40), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x41), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x42), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x43), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x44), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x45), 0x00}, + {TASDEV_REG_SDW(0, 1, 0x47), 0xab}, + {TASDEV_REG_SDW(0, 0xfd, 0x0d), 0x0d}, + {TASDEV_REG_SDW(0, 0xfd, 0x39), 0x00}, + {TASDEV_REG_SDW(0, 0xfd, 0x3e), 0x00}, + {TASDEV_REG_SDW(0, 0xfd, 0x45), 0x00}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x02, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x02, 1), 0x9c00}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 1), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 2), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x05, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, 
TAS2783_SDCA_ENT_MFPU21, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x05, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 3), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 4), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 5), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 6), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 7), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x04, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 1), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 2), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 3), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 4), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 5), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 6), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 7), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 8), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 9), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xa), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xb), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xc), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xd), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xe), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xf), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x1, 0), 0x3}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x10, 0), 0x3}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x05, 0), 0x0}, + {SDW_SDCA_CTL(1, 
TAS2783_SDCA_ENT_SAPU29, 0x10, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x11, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_TG23, 0x10, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x01, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x06, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x07, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x08, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x09, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x0a, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x10, 0), 0x1}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x12, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x13, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x14, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x15, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x16, 0), 0x0}, + {SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_UDMPU23, 0x10, 0), 0x0}, +}; + +static const struct reg_sequence tas2783_init_seq[] = { + REG_SEQ0(SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0x00), 0x04), + REG_SEQ0(0x00800418, 0x00), + REG_SEQ0(0x00800419, 0x00), + REG_SEQ0(0x0080041a, 0x00), + REG_SEQ0(0x0080041b, 0x00), + REG_SEQ0(0x00800428, 0x40), + REG_SEQ0(0x00800429, 0x00), + REG_SEQ0(0x0080042a, 0x00), + REG_SEQ0(0x0080042b, 0x00), + REG_SEQ0(SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x1, 0x00), 0x00), + REG_SEQ0(0x0080005c, 0xD9), + REG_SEQ0(0x00800082, 0x20), + REG_SEQ0(0x008000a1, 0x00), + REG_SEQ0(0x00800097, 0xc8), + REG_SEQ0(0x00800099, 0x20), + REG_SEQ0(0x008000c7, 0xaa), + REG_SEQ0(0x008000b5, 0x74), + REG_SEQ0(0x00800082, 0x20), + REG_SEQ0(0x00807e8d, 0x0d), + REG_SEQ0(0x00807eb9, 0x53), + REG_SEQ0(0x00807ebe, 0x42), + REG_SEQ0(0x00807ec5, 0x37), + REG_SEQ0(0x00800066, 0x92), + REG_SEQ0(0x00800003, 0x28), + REG_SEQ0(0x00800004, 0x21), + REG_SEQ0(0x00800005, 0x41), + REG_SEQ0(0x00800006, 0x00), + REG_SEQ0(0x00800007, 0x20), + REG_SEQ0(0x0080000c, 0x10), + REG_SEQ0(0x00800013, 0x08), + REG_SEQ0(0x00800015, 0x00), + REG_SEQ0(0x00800017, 0x80), + REG_SEQ0(0x0080001a, 0x00), + REG_SEQ0(0x0080001b, 0x22), + REG_SEQ0(0x0080001c, 0x36), + REG_SEQ0(0x0080001d, 0x01), + REG_SEQ0(0x0080001f, 0x00), + REG_SEQ0(0x00800020, 0x2e), + REG_SEQ0(0x00800034, 0x06), + REG_SEQ0(0x00800035, 0xb9), + REG_SEQ0(0x00800036, 0xad), + REG_SEQ0(0x00800037, 0xa8), + REG_SEQ0(0x00800038, 0x00), + REG_SEQ0(0x0080003b, 0xfc), + REG_SEQ0(0x0080003d, 0xdd), + REG_SEQ0(0x00800040, 0xf6), + REG_SEQ0(0x00800041, 0x14), + REG_SEQ0(0x0080005c, 0x19), + REG_SEQ0(0x0080005d, 0x80), + REG_SEQ0(0x00800063, 0x48), + REG_SEQ0(0x00800065, 0x08), + REG_SEQ0(0x00800067, 0x00), + REG_SEQ0(0x0080006a, 0x12), + REG_SEQ0(0x0080006b, 0x7b), + REG_SEQ0(0x0080006c, 0x00), + REG_SEQ0(0x0080006d, 0x00), + REG_SEQ0(0x0080006e, 0x1a), + REG_SEQ0(0x0080006f, 0x00), + REG_SEQ0(0x00800070, 0x96), + REG_SEQ0(0x00800071, 0x02), + REG_SEQ0(0x00800073, 0x08), + REG_SEQ0(0x00800075, 0xe0), + REG_SEQ0(0x0080007a, 0x60), + REG_SEQ0(0x008000bd, 0x00), + REG_SEQ0(0x008000be, 0x00), + REG_SEQ0(0x008000bf, 0x00), + REG_SEQ0(0x008000c0, 0x00), + REG_SEQ0(0x008000c1, 0x00), + REG_SEQ0(0x008000c2, 0x00), + REG_SEQ0(0x008000c3, 0x00), + REG_SEQ0(0x008000c4, 0x00), + REG_SEQ0(0x008000c5, 0x00), + REG_SEQ0(0x00800008, 0x49), + REG_SEQ0(0x00800009, 0x02), + REG_SEQ0(0x0080000a, 0x1a), + REG_SEQ0(0x0080000d, 0x93), + REG_SEQ0(0x0080000e, 0x82), + REG_SEQ0(0x0080000f, 0x42), + REG_SEQ0(0x00800010, 0x84), + REG_SEQ0(0x00800014, 0x0a), + REG_SEQ0(0x00800016, 0x00), + 
REG_SEQ0(0x00800060, 0x21), +}; + +static int tas2783_sdca_mbq_size(struct device *dev, u32 reg) +{ + switch (reg) { + case 0x000 ... 0x080: /* Data port 0. */ + case 0x100 ... 0x140: /* Data port 1. */ + case 0x200 ... 0x240: /* Data port 2. */ + case 0x300 ... 0x340: /* Data port 3. */ + case 0x400 ... 0x440: /* Data port 4. */ + case 0x500 ... 0x540: /* Data port 5. */ + case 0x800000 ... 0x803fff: /* Page 0 ~ 127. */ + case 0x807e80 ... 0x807eff: /* Page 253. */ + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_UDMPU23, + TAS2783_SDCA_CTL_UDMPU_CLUSTER, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, TAS2783_SDCA_CTL_FU_MUTE, + TAS2783_DEVICE_CHANNEL_LEFT): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x1, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_TG23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x0a, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x14, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x15, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x16, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 3): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 4): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 5): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 6): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 7): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 8): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 9): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xa): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xb): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xc): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xd): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xe): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x12, 0xf): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS24, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS25, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS25, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS127, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS26, 
0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x02, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_CS28, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x04, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x04, 0): + return 1; + + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x11, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 3): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 4): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 5): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 6): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x01, 7): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21, 0x02, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 2): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x0b, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x0b, 1): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x07, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x09, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x11, 0): + return 2; + + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT21, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT26, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT28, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_IT29, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT23, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT24, 0x08, 0): + case 
SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT25, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT28, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_OT127, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MU26, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU127, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU26, 0x10, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x12, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_XU22, 0x13, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU21, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_MFPU26, 0x08, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_SAPU29, 0x05, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU21, 0x06, 0): + case SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PPU26, 0x06, 0): + return 4; + + default: + return 0; + } +} + +static bool tas2783_readable_register(struct device *dev, unsigned int reg) +{ + return tas2783_sdca_mbq_size(dev, reg) > 0; +} + +static bool tas2783_volatile_register(struct device *dev, u32 reg) +{ + switch (reg) { + case 0x000 ... 0x080: /* Data port 0. */ + case 0x100 ... 0x140: /* Data port 1. */ + case 0x200 ... 0x240: /* Data port 2. */ + case 0x300 ... 0x340: /* Data port 3. */ + case 0x400 ... 0x440: /* Data port 4. */ + case 0x500 ... 0x540: /* Data port 5. */ + case 0x800001: + return true; + + default: + return false; + } +} + +static const struct regmap_config tas_regmap = { + .reg_bits = 32, + .val_bits = 8, + .readable_reg = tas2783_readable_register, + .volatile_reg = tas2783_volatile_register, + .reg_defaults = tas2783_reg_default, + .num_reg_defaults = ARRAY_SIZE(tas2783_reg_default), + .max_register = 0x41008000 + TASDEV_REG_SDW(0xa1, 0x60, 0x7f), + .cache_type = REGCACHE_MAPLE, + .use_single_read = true, + .use_single_write = true, +}; + +static const struct regmap_sdw_mbq_cfg tas2783_mbq_cfg = { + .mbq_size = tas2783_sdca_mbq_size, +}; + +static s32 tas2783_digital_getvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_get_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_digital_putvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_put_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_amp_getvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_get_volsw(kcontrol, ucontrol); +} + +static s32 tas2783_amp_putvol(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return snd_soc_put_volsw(kcontrol, ucontrol); +} + +static const struct snd_kcontrol_new tas2783_snd_controls[] = { + SOC_SINGLE_RANGE_EXT_TLV("Amp Volume", TAS2783_AMP_LEVEL, + 1, 0, 20, 0, tas2783_amp_getvol, + tas2783_amp_putvol, tas2781_amp_tlv), + SOC_SINGLE_RANGE_EXT_TLV("Speaker Volume", TAS2783_DVC_LVL, + 0, 0, 200, 1, tas2783_digital_getvol, + tas2783_digital_putvol, tas2781_dvc_tlv), +}; + +static s32 tas2783_validate_calibdata(struct tas2783_prv *tas_dev, + u8 *data, u32 size) +{ + u32 ts, spk_count, size_calculated; + u32 crc_calculated, crc_read, i; + u32 *tmp_val; + struct tm tm; + + i = 0; + tmp_val = (u32 *)data; + if (tmp_val[i++] != 2783) { + dev_err(tas_dev->dev, "cal data magic number mismatch"); + return -EINVAL; + } + + spk_count = tmp_val[i++]; + if (spk_count > TAS2783_CALIB_MAX_SPK_COUNT) { + dev_err(tas_dev->dev, "cal data spk_count too large"); + return -EINVAL; + } + + ts = tmp_val[i++]; + time64_to_tm(ts, 0, &tm); + dev_dbg(tas_dev->dev, "cal data timestamp: %ld-%d-%d %d:%d:%d", + tm.tm_year + 1900, tm.tm_mon + 1, 
tm.tm_mday,
+		tm.tm_hour, tm.tm_min, tm.tm_sec);
+
+	size_calculated =
+		(spk_count * TAS2783_CALIB_PARAMS * sizeof(u32)) +
+		TAS2783_CALIB_HDR_SZ + TAS2783_CALIB_CRC_SZ;
+	if (size_calculated > TAS2783_CALIB_DATA_SZ) {
+		dev_err(tas_dev->dev, "cal data size too large");
+		return -EINVAL;
+	} else if (size < size_calculated) {
+		dev_err(tas_dev->dev, "cal data size too small: %u < %u\n",
+			size, size_calculated);
+		return -EINVAL;
+	}
+
+	crc_calculated = crc32(~0, data,
+			       size_calculated - TAS2783_CALIB_CRC_SZ) ^ ~0;
+	crc_read = tmp_val[(size_calculated - TAS2783_CALIB_CRC_SZ) / sizeof(u32)];
+	if (crc_calculated != crc_read) {
+		dev_err(tas_dev->dev,
+			"cal data integrity check failed: 0x%08x vs 0x%08x\n",
+			crc_calculated, crc_read);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void tas2783_set_calib_params_to_device(struct tas2783_prv *tas_dev, u32 *cali_data)
+{
+	u32 dev_count, offset, i, device_num;
+	u32 reg_value;
+	u8 buf[4];
+
+	dev_count = cali_data[1];
+	offset = 3;
+
+	for (device_num = 0; device_num < dev_count; device_num++) {
+		if (cali_data[offset] != tas_dev->sdw_peripheral->id.unique_id) {
+			offset += TAS2783_CALIB_PARAMS;
+			continue;
+		}
+		offset++;
+
+		for (i = 0; i < ARRAY_SIZE(tas2783_cali_reg); i++) {
+			reg_value = cali_data[offset + i];
+			buf[0] = reg_value >> 24;
+			buf[1] = reg_value >> 16;
+			buf[2] = reg_value >> 8;
+			buf[3] = reg_value & 0xff;
+			regmap_bulk_write(tas_dev->regmap, tas2783_cali_reg[i],
+					  buf, sizeof(u32));
+		}
+		break;
+	}
+
+	if (device_num == dev_count)
+		dev_err(tas_dev->dev, "device not found in calibration data\n");
+	else
+		dev_dbg(tas_dev->dev, "calibration data update done\n");
+}
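+
+/*
+ * For reference, the calibration blob validated and applied above is a
+ * flat array of u32 words. An illustrative view of the layout (field
+ * names are hypothetical, derived from the parsing code rather than TI
+ * documentation):
+ *
+ *	u32 magic;		must be 2783
+ *	u32 spk_count;		at most TAS2783_CALIB_MAX_SPK_COUNT (8)
+ *	u32 timestamp;		UNIX time when calibration was run
+ *	struct {
+ *		u32 unique_id;	SoundWire unique ID of one amplifier
+ *		u32 value[5];	written to tas2783_cali_reg[0..4]
+ *	} spk[spk_count];
+ *	u32 crc;		CRC32 over all preceding bytes
+ */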
+
+static s32 tas2783_update_calibdata(struct tas2783_prv *tas_dev)
+{
+	efi_guid_t efi_guid = TAS2783_CALI_GUID;
+	u32 attr, i, *tmp_val;
+	unsigned long size;
+	s32 ret;
+	efi_status_t status;
+	static efi_char16_t efi_names[][32] = {
+		L"SmartAmpCalibrationData", L"CALI_DATA"};
+
+	tmp_val = (u32 *)tas_dev->cali_data.data;
+	attr = 0;
+
+	/*
+	 * The calibration may have been performed in Windows and stored
+	 * as a UEFI variable; Linux can read it from there.
+	 */
+	for (i = 0; i < ARRAY_SIZE(efi_names); i++) {
+		size = 0;
+		status = efi.get_variable(efi_names[i], &efi_guid, &attr,
+					  &size, NULL);
+		if (size > TAS2783_CALIB_DATA_SZ) {
+			dev_err(tas_dev->dev, "cal data too large\n");
+			break;
+		}
+
+		tas_dev->cali_data.read_sz = size;
+		if (status == EFI_BUFFER_TOO_SMALL) {
+			status = efi.get_variable(efi_names[i], &efi_guid, &attr,
+						  &tas_dev->cali_data.read_sz,
+						  tas_dev->cali_data.data);
+			dev_dbg(tas_dev->dev, "cal data: read %lu bytes, status %ld\n",
+				tas_dev->cali_data.read_sz, status);
+		}
+		if (status == EFI_SUCCESS)
+			break;
+	}
+
+	if (status != EFI_SUCCESS) {
+		/* Failed to get calibration data from EFI. */
+		dev_dbg(tas_dev->dev, "No calibration data in UEFI.");
+		return 0;
+	}
+
+	mutex_lock(&tas_dev->calib_lock);
+	ret = tas2783_validate_calibdata(tas_dev, tas_dev->cali_data.data,
+					 tas_dev->cali_data.read_sz);
+	if (!ret)
+		tas2783_set_calib_params_to_device(tas_dev, tmp_val);
+	mutex_unlock(&tas_dev->calib_lock);
+
+	return ret;
+}
+
+static s32 read_header(const u8 *data, struct bin_header_t *hdr)
+{
+	hdr->vendor_id = get_unaligned_le16(&data[0]);
+	hdr->file_id = get_unaligned_le32(&data[2]);
+	hdr->version = get_unaligned_le16(&data[6]);
+	hdr->length = get_unaligned_le32(&data[8]);
+	return 12;
+}
+
+static void tas2783_fw_ready(const struct firmware *fmw, void *context)
+{
+	struct tas2783_prv *tas_dev =
+		(struct tas2783_prv *)context;
+	const u8 *buf = NULL;
+	/* an image with no file blocks is treated as a successful download */
+	s32 offset = 0, img_sz, file_blk_size, ret = 0;
+	struct bin_header_t hdr;
+
+	if (!fmw || !fmw->data) {
+		/* No firmware binary, devices will work in ROM mode. */
+		dev_err(tas_dev->dev,
+			"Failed to read %s; device stays in ROM mode\n",
+			tas_dev->rca_binaryname);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&tas_dev->pde_lock);
+	img_sz = fmw->size;
+	buf = fmw->data;
+	offset += FW_DL_OFFSET;
+	while (offset < (img_sz - FW_FL_HDR)) {
+		memset(&hdr, 0, sizeof(hdr));
+		offset += read_header(&buf[offset], &hdr);
+		dev_dbg(tas_dev->dev,
+			"vndr=%d, file=%d, version=%d, len=%d, off=%d\n",
+			hdr.vendor_id, hdr.file_id, hdr.version,
+			hdr.length, offset);
+		/* size also includes the header */
+		file_blk_size = hdr.length - FW_FL_HDR;
+
+		switch (hdr.file_id) {
+		case 0:
+			ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral,
+					       PRAM_ADDR_START, file_blk_size,
+					       &buf[offset]);
+			if (ret < 0)
+				dev_err(tas_dev->dev,
+					"PRAM update failed: %d", ret);
+			break;
+
+		case 1:
+			ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral,
+					       YRAM_ADDR_START, file_blk_size,
+					       &buf[offset]);
+			if (ret < 0)
+				dev_err(tas_dev->dev,
+					"YRAM update failed: %d", ret);
+			break;
+
+		default:
+			ret = -EINVAL;
+			dev_err(tas_dev->dev, "Unsupported file id: %d",
+				hdr.file_id);
+			break;
+		}
+
+		if (ret == 0)
+			offset += file_blk_size;
+		else
+			break;
+	}
+	mutex_unlock(&tas_dev->pde_lock);
+	tas2783_update_calibdata(tas_dev);
+
+out:
+	if (!ret)
+		tas_dev->fw_dl_success = true;
+	tas_dev->fw_dl_task_done = true;
+	wake_up(&tas_dev->fw_wait);
+	if (fmw)
+		release_firmware(fmw);
+}
+
+static inline s32 tas_clear_latch(struct tas2783_prv *priv)
+{
+	return regmap_update_bits(priv->regmap,
+				  TASDEV_REG_SDW(0, 0, 0x5c),
+				  0x04, 0x04);
+}
+
+static s32 tas_fu21_event(struct snd_soc_dapm_widget *w,
+			  struct snd_kcontrol *k, s32 event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct tas2783_prv *tas_dev = snd_soc_component_get_drvdata(component);
+	s32 mute;
+
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		mute = 0;
+		break;
+
+	case SND_SOC_DAPM_PRE_PMD:
+		mute = 1;
+		break;
+
+	default:
+		/* only PMU/PMD are registered for this widget */
+		return 0;
+	}
+
+	return sdw_write_no_pm(tas_dev->sdw_peripheral,
+			       SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU21,
+					    TAS2783_SDCA_CTL_FU_MUTE, 1), mute);
+}
+
+static s32 tas_fu23_event(struct snd_soc_dapm_widget *w,
+			  struct snd_kcontrol *k, s32 event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	struct tas2783_prv *tas_dev = snd_soc_component_get_drvdata(component);
+	s32 mute;
+
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		mute = 0;
+		break;
+
+	case SND_SOC_DAPM_PRE_PMD:
+		mute = 1;
+		break;
+
+	default:
+		/* only PMU/PMD are registered for this widget */
+		return 0;
+	}
+
+	return sdw_write_no_pm(tas_dev->sdw_peripheral,
+			       SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_FU23,
+					    TAS2783_SDCA_CTL_FU_MUTE, 1), mute);
+}
+
+static const struct snd_soc_dapm_widget tas_dapm_widgets[] = {
+	SND_SOC_DAPM_AIF_IN("ASI", "ASI Playback", 0, SND_SOC_NOPM, 0, 0),
+	SND_SOC_DAPM_AIF_OUT("ASI OUT", "ASI Capture", 0, SND_SOC_NOPM,
+			     0, 0),
+	SND_SOC_DAPM_DAC_E("FU21", NULL, SND_SOC_NOPM, 0, 0, tas_fu21_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+	SND_SOC_DAPM_DAC_E("FU23", NULL, SND_SOC_NOPM, 0, 0, tas_fu23_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+	SND_SOC_DAPM_OUTPUT("SPK"),
+	SND_SOC_DAPM_INPUT("DMIC"),
+};
+
+static const struct snd_soc_dapm_route tas_audio_map[] = {
+	{"FU21", NULL, "ASI"},
+	{"SPK", NULL, "FU21"},
+	{"FU23", NULL, "ASI"},
+	{"SPK", NULL, "FU23"},
+	{"ASI OUT", NULL, "DMIC"},
+};
+
+static s32 tas_set_sdw_stream(struct snd_soc_dai *dai,
+			      void *sdw_stream, s32 direction)
+{
+	if (!sdw_stream)
+		return 0;
+
+	snd_soc_dai_dma_data_set(dai, direction, sdw_stream);
+
+	return 0;
+}
+
+static void tas_sdw_shutdown(struct snd_pcm_substream *substream,
+			     struct snd_soc_dai *dai)
+{
+	snd_soc_dai_set_dma_data(dai, substream, NULL);
+}
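+
+/*
+ * Data port mapping: DP1 is the sink port used for playback and DP2 the
+ * source port used for capture, matching the sink_ports/source_ports
+ * bitmaps advertised in tas_read_prop() below.
+ */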
+
+static s32 tas_sdw_hw_params(struct snd_pcm_substream *substream,
+			     struct snd_pcm_hw_params *params,
+			     struct snd_soc_dai *dai)
+{
+	struct snd_soc_component *component = dai->component;
+	struct tas2783_prv *tas_dev =
+		snd_soc_component_get_drvdata(component);
+	struct sdw_stream_config stream_config = {0};
+	struct sdw_port_config port_config = {0};
+	struct sdw_stream_runtime *sdw_stream;
+	struct sdw_slave *sdw_peripheral = tas_dev->sdw_peripheral;
+	s32 ret, retry = 3;
+
+	if (!tas_dev->fw_dl_success) {
+		dev_err(tas_dev->dev, "playback not allowed before firmware download completes");
+		return -EINVAL;
+	}
+
+	sdw_stream = snd_soc_dai_get_dma_data(dai, substream);
+	if (!sdw_stream)
+		return -EINVAL;
+
+	ret = tas_clear_latch(tas_dev);
+	if (ret)
+		dev_err(tas_dev->dev,
+			"clear latch failed, err=%d", ret);
+
+	mutex_lock(&tas_dev->pde_lock);
+	/*
+	 * The power-on request occasionally returns an error, so retry
+	 * a few times to make sure the device is powered up before the
+	 * port prepare step.
+	 */
+	do {
+		ret = regmap_write(tas_dev->regmap,
+				   SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23,
+						TAS2783_SDCA_CTL_REQ_POW_STATE, 0),
+				   TAS2783_SDCA_POW_STATE_ON);
+		if (!ret)
+			break;
+		usleep_range(2000, 2200);
+	} while (retry--);
+	mutex_unlock(&tas_dev->pde_lock);
+	if (ret)
+		return ret;
+
+	/* SoundWire specific configuration */
+	snd_sdw_params_to_config(substream, params,
+				 &stream_config, &port_config);
+	/* DP1 for playback, DP2 for capture */
+	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
+		port_config.num = 1;
+	else
+		port_config.num = 2;
+
+	ret = sdw_stream_add_slave(sdw_peripheral,
+				   &stream_config, &port_config, 1, sdw_stream);
+	if (ret)
+		dev_err(dai->dev, "Unable to configure port\n");
+
+	return ret;
+}
+
+static s32 tas_sdw_pcm_hw_free(struct snd_pcm_substream *substream,
+			       struct snd_soc_dai *dai)
+{
+	s32 ret;
+	struct snd_soc_component *component = dai->component;
+	struct tas2783_prv *tas_dev =
+		snd_soc_component_get_drvdata(component);
+	struct sdw_stream_runtime *sdw_stream =
+		snd_soc_dai_get_dma_data(dai, substream);
+
+	sdw_stream_remove_slave(tas_dev->sdw_peripheral, sdw_stream);
+
+	mutex_lock(&tas_dev->pde_lock);
+	ret = regmap_write(tas_dev->regmap,
+			   SDW_SDCA_CTL(1, TAS2783_SDCA_ENT_PDE23,
+					TAS2783_SDCA_CTL_REQ_POW_STATE, 0),
+			   TAS2783_SDCA_POW_STATE_OFF);
+	mutex_unlock(&tas_dev->pde_lock);
+
+	return ret;
+}
+
+static const struct snd_soc_dai_ops tas_dai_ops = {
+	.hw_params = tas_sdw_hw_params,
+	.hw_free = tas_sdw_pcm_hw_free,
+	.set_stream = tas_set_sdw_stream,
+	.shutdown = tas_sdw_shutdown,
+};
+
+static struct snd_soc_dai_driver tas_dai_driver[] = {
+	{
+		.name = "tas2783-codec",
+		.id = 0,
+		.playback = {
+			.stream_name = "Playback",
+			.channels_min = 1,
+			.channels_max = 4,
+			.rates = TAS2783_DEVICE_RATES,
+			.formats = TAS2783_DEVICE_FORMATS,
+		},
+		.capture = {
+			.stream_name = "Capture",
+			.channels_min = 1,
+			.channels_max = 4,
+			.rates = TAS2783_DEVICE_RATES,
+			.formats = TAS2783_DEVICE_FORMATS,
+		},
+		.ops = &tas_dai_ops,
+		.symmetric_rate = 1,
+	},
+};
+
+static s32 tas_component_probe(struct snd_soc_component *component)
+{
+	struct tas2783_prv *tas_dev =
+		snd_soc_component_get_drvdata(component);
+
+	tas_dev->component = component;
+	tas25xx_register_misc(tas_dev->sdw_peripheral);
+
+	return 0;
+}
+
+static void tas_component_remove(struct snd_soc_component *codec)
+{
+	struct tas2783_prv *tas_dev =
+		snd_soc_component_get_drvdata(codec);
+	tas25xx_deregister_misc();
+	tas_dev->component = NULL;
+}
+
+static const struct snd_soc_component_driver soc_codec_driver_tasdevice = {
+	.probe = tas_component_probe,
+	.remove = tas_component_remove,
+	.controls = tas2783_snd_controls,
+	.num_controls = ARRAY_SIZE(tas2783_snd_controls),
+	.dapm_widgets = tas_dapm_widgets,
+	.num_dapm_widgets = ARRAY_SIZE(tas_dapm_widgets),
+	.dapm_routes = tas_audio_map,
+	.num_dapm_routes = ARRAY_SIZE(tas_audio_map),
+	.idle_bias_on = 1,
+	.endianness = 1,
+};
+
+static s32 tas_init(struct tas2783_prv *tas_dev)
+{
+	s32 ret;
+
+	dev_set_drvdata(tas_dev->dev, tas_dev);
+	ret = devm_snd_soc_register_component(tas_dev->dev,
+					      &soc_codec_driver_tasdevice,
+					      tas_dai_driver,
+					      ARRAY_SIZE(tas_dai_driver));
+	if (ret) {
+		dev_err(tas_dev->dev, "%s: codec register error:%d.\n",
+			__func__, ret);
+		return ret;
+	}
+
+	/* set autosuspend parameters */
+	pm_runtime_set_autosuspend_delay(tas_dev->dev, 3000);
+	pm_runtime_use_autosuspend(tas_dev->dev);
+	/* make sure the device does not suspend immediately
*/ + pm_runtime_mark_last_busy(tas_dev->dev); + pm_runtime_enable(tas_dev->dev); + + return ret; +} + +static s32 tas_read_prop(struct sdw_slave *slave) +{ + struct sdw_slave_prop *prop = &slave->prop; + s32 nval; + s32 i, j; + u32 bit; + unsigned long addr; + struct sdw_dpn_prop *dpn; + + prop->scp_int1_mask = + SDW_SCP_INT1_BUS_CLASH | SDW_SCP_INT1_PARITY; + prop->quirks = SDW_SLAVE_QUIRKS_INVALID_INITIAL_PARITY; + + prop->paging_support = true; + + /* first we need to allocate memory for set bits in port lists */ + prop->source_ports = 0x04; /* BITMAP: 00000100 */ + prop->sink_ports = 0x2; /* BITMAP: 00000010 */ + + nval = hweight32(prop->source_ports); + prop->src_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->src_dpn_prop), GFP_KERNEL); + if (!prop->src_dpn_prop) + return -ENOMEM; + + i = 0; + dpn = prop->src_dpn_prop; + addr = prop->source_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[i].num = bit; + dpn[i].type = SDW_DPN_FULL; + dpn[i].simple_ch_prep_sm = false; + dpn[i].ch_prep_timeout = 10; + i++; + } + + /* do this again for sink now */ + nval = hweight32(prop->sink_ports); + prop->sink_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->sink_dpn_prop), GFP_KERNEL); + if (!prop->sink_dpn_prop) + return -ENOMEM; + + j = 0; + dpn = prop->sink_dpn_prop; + addr = prop->sink_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[j].num = bit; + dpn[j].type = SDW_DPN_FULL; + dpn[j].simple_ch_prep_sm = false; + dpn[j].ch_prep_timeout = 10; + j++; + } + + /* set the timeout values */ + prop->clk_stop_timeout = 200; + + return 0; +} + +static s32 tas2783_sdca_dev_suspend(struct device *dev) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + + if (!tas_dev->hw_init) + return 0; + + regcache_cache_only(tas_dev->regmap, true); + return 0; +} + +static s32 tas2783_sdca_dev_system_suspend(struct device *dev) +{ + return tas2783_sdca_dev_suspend(dev); +} + +static s32 tas2783_sdca_dev_resume(struct device *dev) +{ + struct sdw_slave *slave = dev_to_sdw_dev(dev); + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + unsigned long t; + + if (!slave->unattach_request) + goto regmap_sync; + + t = wait_for_completion_timeout(&slave->initialization_complete, + msecs_to_jiffies(TAS2783_PROBE_TIMEOUT)); + if (!t) { + dev_err(&slave->dev, "resume: initialization timed out\n"); + sdw_show_ping_status(slave->bus, true); + return -ETIMEDOUT; + } + + slave->unattach_request = 0; + +regmap_sync: + regcache_cache_only(tas_dev->regmap, false); + regcache_sync(tas_dev->regmap); + return 0; +} + +static const struct dev_pm_ops tas2783_sdca_pm = { + SYSTEM_SLEEP_PM_OPS(tas2783_sdca_dev_system_suspend, tas2783_sdca_dev_resume) + RUNTIME_PM_OPS(tas2783_sdca_dev_suspend, tas2783_sdca_dev_resume, NULL) +}; + +static s32 tas_io_init(struct device *dev, struct sdw_slave *slave) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(dev); + s32 ret; + u8 unique_id = tas_dev->sdw_peripheral->id.unique_id; + + if (tas_dev->hw_init) + return 0; + + tas_dev->fw_dl_task_done = false; + tas_dev->fw_dl_success = false; + scnprintf(tas_dev->rca_binaryname, sizeof(tas_dev->rca_binaryname), + "tas2783-%01x.bin", unique_id); + + ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT, + tas_dev->rca_binaryname, tas_dev->dev, + GFP_KERNEL, tas_dev, tas2783_fw_ready); + if (ret) { + dev_err(tas_dev->dev, + "firmware request failed for uid=%d, ret=%d\n", + unique_id, ret); + return ret; + } + + ret = wait_event_timeout(tas_dev->fw_wait, tas_dev->fw_dl_task_done, + msecs_to_jiffies(TIMEOUT_FW_DL_MS)); + if 
(!ret) { + dev_err(tas_dev->dev, "fw request, wait_event timeout\n"); + ret = -EAGAIN; + } else { + ret = regmap_multi_reg_write(tas_dev->regmap, tas2783_init_seq, + ARRAY_SIZE(tas2783_init_seq)); + tas_dev->hw_init = true; + } + + return ret; +} + +static s32 tas_update_status(struct sdw_slave *slave, + enum sdw_slave_status status) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(&slave->dev); + struct device *dev = &slave->dev; + + dev_dbg(dev, "Peripheral status = %s", + status == SDW_SLAVE_UNATTACHED ? "unattached" : + status == SDW_SLAVE_ATTACHED ? "attached" : "alert"); + + tas_dev->status = status; + if (status == SDW_SLAVE_UNATTACHED) + tas_dev->hw_init = false; + + /* Perform initialization only if slave status + * is present and hw_init flag is false + */ + if (tas_dev->hw_init || tas_dev->status != SDW_SLAVE_ATTACHED) + return 0; + + /* updated the cache data to device */ + regcache_cache_only(tas_dev->regmap, false); + regcache_sync(tas_dev->regmap); + + /* perform I/O transfers required for Slave initialization */ + return tas_io_init(&slave->dev, slave); +} + +static const struct sdw_slave_ops tas_sdw_ops = { + .read_prop = tas_read_prop, + .update_status = tas_update_status, +}; + +static void tas_remove(struct tas2783_prv *tas_dev) +{ + snd_soc_unregister_component(tas_dev->dev); +} + +static s32 tas_sdw_probe(struct sdw_slave *peripheral, + const struct sdw_device_id *id) +{ + struct regmap *regmap; + struct device *dev = &peripheral->dev; + struct tas2783_prv *tas_dev; + + tas_dev = devm_kzalloc(dev, sizeof(*tas_dev), GFP_KERNEL); + if (!tas_dev) + return dev_err_probe(dev, -ENOMEM, + "Failed devm_kzalloc"); + + tas_dev->dev = dev; + tas_dev->sdw_peripheral = peripheral; + tas_dev->hw_init = false; + mutex_init(&tas_dev->calib_lock); + mutex_init(&tas_dev->pde_lock); + + init_waitqueue_head(&tas_dev->fw_wait); + dev_set_drvdata(dev, tas_dev); + regmap = devm_regmap_init_sdw_mbq_cfg(peripheral, + &tas_regmap, + &tas2783_mbq_cfg); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), + "Failed devm_regmap_init_sdw."); + + /* keep in cache until the device is fully initialized */ + regcache_cache_only(regmap, true); + tas_dev->regmap = regmap; + return tas_init(tas_dev); +} + +static s32 tas_sdw_remove(struct sdw_slave *peripheral) +{ + struct tas2783_prv *tas_dev = dev_get_drvdata(&peripheral->dev); + + pm_runtime_disable(tas_dev->dev); + tas_remove(tas_dev); + mutex_destroy(&tas_dev->calib_lock); + mutex_destroy(&tas_dev->pde_lock); + dev_set_drvdata(&peripheral->dev, NULL); + + return 0; +} + +static const struct sdw_device_id tas_sdw_id[] = { + /* chipid for the TAS2783 is 0x0000 */ + SDW_SLAVE_ENTRY(0x0102, 0x0000, 0), + {}, +}; +MODULE_DEVICE_TABLE(sdw, tas_sdw_id); + +static struct sdw_driver tas_sdw_driver = { + .driver = { + .name = "slave-tas2783", + .pm = pm_ptr(&tas2783_sdca_pm), + }, + .probe = tas_sdw_probe, + .remove = tas_sdw_remove, + .ops = &tas_sdw_ops, + .id_table = tas_sdw_id, +}; +module_sdw_driver(tas_sdw_driver); + +MODULE_AUTHOR("Texas Instruments Inc."); +MODULE_DESCRIPTION("ASoC TAS2783 SoundWire Driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/tas2783.h b/sound/soc/codecs/tas2783.h new file mode 100644 index 00000000000000..794333e0a35029 --- /dev/null +++ b/sound/soc/codecs/tas2783.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ALSA SoC Texas Instruments TAS2783 Audio Smart Amplifier + * + * Copyright (C) 2025 Texas Instruments Incorporated + * https://www.ti.com + * + * The TAS2783 driver 
implements a flexible and configurable + * algo coefficient setting for single TAS2783 chips. + * + * Author: Niranjan H Y + * Author: Baojun Xu + */ +#include + +#ifndef __TAS2783_H__ +#define __TAS2783_H__ + +#define TAS2783_DEVICE_RATES (SNDRV_PCM_RATE_44100 | \ + SNDRV_PCM_RATE_48000 | \ + SNDRV_PCM_RATE_96000 | \ + SNDRV_PCM_RATE_88200) +#define TAS2783_DEVICE_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \ + SNDRV_PCM_FMTBIT_S24_LE | \ + SNDRV_PCM_FMTBIT_S32_LE) + +/* book, page, register */ +#define TASDEV_REG_SDW(book, page, reg) (((book) * 256 * 128) + \ + 0x800000 + ((page) * 128) + (reg)) + +/* Volume control */ +#define TAS2783_DVC_LVL TASDEV_REG_SDW(0x0, 0x00, 0x1A) +#define TAS2783_AMP_LEVEL TASDEV_REG_SDW(0x0, 0x00, 0x03) +#define TAS2783_AMP_LEVEL_MASK GENMASK(5, 1) + +#define PRAM_ADDR_START TASDEV_REG_SDW(0x8c, 0x01, 0x8) +#define PRAM_ADDR_END TASDEV_REG_SDW(0x8c, 0xff, 0x7f) +#define YRAM_ADDR_START TASDEV_REG_SDW(0x00, 0x02, 0x8) +#define YRAM_ADDR_END TASDEV_REG_SDW(0x00, 0x37, 0x7f) + +/* Calibration data */ +#define TAS2783_CAL_R0 TASDEV_REG_SDW(0, 0x16, 0x4C) +#define TAS2783_CAL_INVR0 TASDEV_REG_SDW(0, 0x16, 0x5C) +#define TAS2783_CAL_R0LOW TASDEV_REG_SDW(0, 0x16, 0x64) +#define TAS2783_CAL_POWER TASDEV_REG_SDW(0, 0x15, 0x44) +#define TAS2783_CAL_TLIM TASDEV_REG_SDW(0, 0x17, 0x58) + +/* TAS2783 SDCA Control - function number */ +#define FUNC_NUM_SMART_AMP 0x01 + +/* TAS2783 SDCA entity */ + +#define TAS2783_SDCA_ENT_FU21 0x01 +#define TAS2783_SDCA_ENT_FU23 0x02 +#define TAS2783_SDCA_ENT_FU26 0x03 +#define TAS2783_SDCA_ENT_XU22 0x04 +#define TAS2783_SDCA_ENT_CS24 0x05 +#define TAS2783_SDCA_ENT_CS21 0x06 +#define TAS2783_SDCA_ENT_CS25 0x07 +#define TAS2783_SDCA_ENT_CS26 0x08 +#define TAS2783_SDCA_ENT_CS28 0x09 +#define TAS2783_SDCA_ENT_PDE23 0x0C +#define TAS2783_SDCA_ENT_UDMPU23 0x0E +#define TAS2783_SDCA_ENT_SAPU29 0x0F +#define TAS2783_SDCA_ENT_PPU21 0x10 +#define TAS2783_SDCA_ENT_PPU26 0x11 +#define TAS2783_SDCA_ENT_TG23 0x12 +#define TAS2783_SDCA_ENT_IT21 0x13 +#define TAS2783_SDCA_ENT_IT29 0x14 +#define TAS2783_SDCA_ENT_IT26 0x15 +#define TAS2783_SDCA_ENT_IT28 0x16 +#define TAS2783_SDCA_ENT_OT24 0x17 +#define TAS2783_SDCA_ENT_OT23 0x18 +#define TAS2783_SDCA_ENT_OT25 0x19 +#define TAS2783_SDCA_ENT_OT28 0x1A +#define TAS2783_SDCA_ENT_MU26 0x1b +#define TAS2783_SDCA_ENT_OT127 0x1E +#define TAS2783_SDCA_ENT_FU127 0x1F +#define TAS2783_SDCA_ENT_CS127 0x20 +#define TAS2783_SDCA_ENT_MFPU21 0x22 +#define TAS2783_SDCA_ENT_MFPU26 0x23 + +/* TAS2783 SDCA control */ +#define TAS2783_SDCA_CTL_REQ_POW_STATE 0x01 +#define TAS2783_SDCA_CTL_FU_MUTE 0x01 +#define TAS2783_SDCA_CTL_UDMPU_CLUSTER 0x10 + +#define TAS2783_DEVICE_CHANNEL_LEFT 1 +#define TAS2783_DEVICE_CHANNEL_RIGHT 2 + +#define TAS2783_SDCA_POW_STATE_ON 0 +#define TAS2783_SDCA_POW_STATE_OFF 3 + +/* calibration data */ +#define TAS2783_CALIB_PARAMS 6 /* 5 + 1 unique id */ +#define TAS2783_CALIB_MAX_SPK_COUNT 8 +#define TAS2783_CALIB_HDR_SZ 12 +#define TAS2783_CALIB_CRC_SZ 4 +#define TAS2783_CALIB_DATA_SZ ((TAS2783_CALIB_HDR_SZ) + TAS2783_CALIB_CRC_SZ + \ + ((TAS2783_CALIB_PARAMS) * 4 * (TAS2783_CALIB_MAX_SPK_COUNT))) + +#if IS_ENABLED(CONFIG_SND_SOC_TAS2783_UTIL) +int32_t tas25xx_register_misc(struct sdw_slave *peripheral); +int32_t tas25xx_deregister_misc(void); +#else +static void tas25xx_register_misc(struct sdw_slave *peripheral) {} +static void tas25xx_deregister_misc(void) {} +#endif + +#endif /*__TAS2783_H__ */ diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c index 
f1649df197389d..eea8ca285f8e0f 100644 --- a/sound/soc/codecs/tlv320aic3x.c +++ b/sound/soc/codecs/tlv320aic3x.c @@ -121,6 +121,16 @@ static const struct reg_default aic3x_reg[] = { { 108, 0x00 }, { 109, 0x00 }, }; +static const struct reg_sequence aic3007_class_d[] = { + /* Class-D speaker driver init; datasheet p. 46 */ + { AIC3X_PAGE_SELECT, 0x0D }, + { 0xD, 0x0D }, + { 0x8, 0x5C }, + { 0x8, 0x5D }, + { 0x8, 0x5C }, + { AIC3X_PAGE_SELECT, 0x00 }, +}; + static bool aic3x_volatile_reg(struct device *dev, unsigned int reg) { switch (reg) { @@ -1393,6 +1403,10 @@ static int aic3x_set_power(struct snd_soc_component *component, int power) gpiod_set_value(aic3x->gpio_reset, 0); } + if (aic3x->model == AIC3X_MODEL_3007) + regmap_multi_reg_write_bypassed(aic3x->regmap, aic3007_class_d, + ARRAY_SIZE(aic3007_class_d)); + /* Sync reg_cache with the hardware */ regcache_cache_only(aic3x->regmap, false); regcache_sync(aic3x->regmap); @@ -1723,17 +1737,6 @@ static void aic3x_configure_ocmv(struct device *dev, struct aic3x_priv *aic3x) } } - -static const struct reg_sequence aic3007_class_d[] = { - /* Class-D speaker driver init; datasheet p. 46 */ - { AIC3X_PAGE_SELECT, 0x0D }, - { 0xD, 0x0D }, - { 0x8, 0x5C }, - { 0x8, 0x5D }, - { 0x8, 0x5C }, - { AIC3X_PAGE_SELECT, 0x00 }, -}; - int aic3x_probe(struct device *dev, struct regmap *regmap, kernel_ulong_t driver_data) { struct aic3x_priv *aic3x; @@ -1823,13 +1826,6 @@ int aic3x_probe(struct device *dev, struct regmap *regmap, kernel_ulong_t driver aic3x_configure_ocmv(dev, aic3x); - if (aic3x->model == AIC3X_MODEL_3007) { - ret = regmap_register_patch(aic3x->regmap, aic3007_class_d, - ARRAY_SIZE(aic3007_class_d)); - if (ret != 0) - dev_err(dev, "Failed to init class D: %d\n", ret); - } - ret = devm_snd_soc_register_component(dev, &soc_component_dev_aic3x, &aic3x_dai, 1); if (ret) return ret; diff --git a/sound/soc/codecs/wcd-common.c b/sound/soc/codecs/wcd-common.c new file mode 100644 index 00000000000000..9016e974582f55 --- /dev/null +++ b/sound/soc/codecs/wcd-common.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. 
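+/*
+ * Helpers shared by the Qualcomm WCD93xx codec drivers: micbias voltage
+ * conversion and devicetree parsing, SoundWire component bind/unbind
+ * (runtime PM setup), slave status and bus-config callbacks, and the
+ * nested interrupt handler.
+ */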
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "wcd-common.h"
+
+#define WCD_MIN_MICBIAS_MV	1000
+#define WCD_DEF_MICBIAS_MV	1800
+#define WCD_MAX_MICBIAS_MV	2850
+
+#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m))
+
+int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv)
+{
+	/* min micbias voltage is 1V and maximum is 2.85V */
+	if (micb_mv < WCD_MIN_MICBIAS_MV || micb_mv > WCD_MAX_MICBIAS_MV) {
+		dev_err(dev, "Unsupported micbias voltage (%u mV)\n", micb_mv);
+		return -EINVAL;
+	}
+
+	return (micb_mv - WCD_MIN_MICBIAS_MV) / 50;
+}
+EXPORT_SYMBOL_GPL(wcd_get_micb_vout_ctl_val);
+
+static int wcd_get_micbias_val(struct device *dev, int micb_num, u32 *micb_mv)
+{
+	char micbias[64];
+	int mv, ret;
+
+	sprintf(micbias, "qcom,micbias%d-microvolt", micb_num);
+
+	if (of_property_read_u32(dev->of_node, micbias, &mv)) {
+		dev_dbg(dev, "%s value not found, using default\n", micbias);
+		mv = WCD_DEF_MICBIAS_MV;
+	} else {
+		/* convert microvolts to millivolts */
+		mv = mv / 1000;
+	}
+
+	ret = wcd_get_micb_vout_ctl_val(dev, mv);
+	if (ret < 0) {
+		dev_err(dev, "Unsupported %s voltage (%d mV), falling back to default (%d mV)\n",
+			micbias, mv, WCD_DEF_MICBIAS_MV);
+		mv = WCD_DEF_MICBIAS_MV;
+		ret = wcd_get_micb_vout_ctl_val(dev, mv);
+	}
+
+	if (micb_mv)
+		*micb_mv = mv;
+
+	return ret;
+}
+
+int wcd_dt_parse_micbias_info(struct wcd_common *common)
+{
+	int ret, i;
+
+	for (i = 0; i < common->max_bias; i++) {
+		ret = wcd_get_micbias_val(common->dev, i + 1, &common->micb_mv[i]);
+		if (ret < 0)
+			return ret;
+		common->micb_vout[i] = ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wcd_dt_parse_micbias_info);
+
+static int wcd_sdw_component_bind(struct device *dev, struct device *master, void *data)
+{
+	pm_runtime_set_autosuspend_delay(dev, 3000);
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_mark_last_busy(dev);
+	pm_runtime_set_active(dev);
+	pm_runtime_enable(dev);
+
+	return 0;
+}
+
+static void wcd_sdw_component_unbind(struct device *dev, struct device *master, void *data)
+{
+	pm_runtime_disable(dev);
+	pm_runtime_set_suspended(dev);
+	pm_runtime_dont_use_autosuspend(dev);
+}
+
+const struct component_ops wcd_sdw_component_ops = {
+	.bind = wcd_sdw_component_bind,
+	.unbind = wcd_sdw_component_unbind,
+};
+EXPORT_SYMBOL_GPL(wcd_sdw_component_ops);
+
+int wcd_update_status(struct sdw_slave *slave, enum sdw_slave_status status)
+{
+	struct regmap *regmap = dev_get_regmap(&slave->dev, NULL);
+
+	if (regmap && status == SDW_SLAVE_ATTACHED) {
+		/* Write out any cached changes that happened between probe and attach */
+		regcache_cache_only(regmap, false);
+		return regcache_sync(regmap);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wcd_update_status);
+
+int wcd_bus_config(struct sdw_slave *slave, struct sdw_bus_params *params)
+{
+	sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), 0x01);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wcd_bus_config);
+
+int wcd_interrupt_callback(struct sdw_slave *slave, struct irq_domain *slave_irq,
+			   unsigned int wcd_intr_status0, unsigned int wcd_intr_status1,
+			   unsigned int wcd_intr_status2)
+{
+	struct regmap *regmap = dev_get_regmap(&slave->dev, NULL);
+	u32 sts1, sts2, sts3;
+
+	do {
+		handle_nested_irq(irq_find_mapping(slave_irq, 0));
+		regmap_read(regmap, wcd_intr_status0, &sts1);
+		regmap_read(regmap, wcd_intr_status1, &sts2);
+		regmap_read(regmap, wcd_intr_status2, &sts3);
+	} while (sts1 || sts2 || sts3);
+
+	return IRQ_HANDLED;
+}
+EXPORT_SYMBOL_GPL(wcd_interrupt_callback);
+
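+/*
+ * Minimal usage sketch for these helpers (mirrors what the WCD937x and
+ * WCD938x SoundWire drivers do; the "my_wcd" names are hypothetical):
+ *
+ *	static int my_wcd_interrupt_callback(struct sdw_slave *slave,
+ *					     struct sdw_slave_intr_status *status)
+ *	{
+ *		struct my_wcd_priv *wcd = dev_get_drvdata(&slave->dev);
+ *
+ *		return wcd_interrupt_callback(slave, wcd->slave_irq,
+ *					      MY_WCD_INTR_STATUS_0,
+ *					      MY_WCD_INTR_STATUS_1,
+ *					      MY_WCD_INTR_STATUS_2);
+ *	}
+ *
+ *	static const struct sdw_slave_ops my_wcd_slave_ops = {
+ *		.update_status = wcd_update_status,
+ *		.interrupt_callback = my_wcd_interrupt_callback,
+ *		.bus_config = wcd_bus_config,
+ *	};
+ *
+ * wcd_get_micb_vout_ctl_val() maps 1000..2850 mV to 50 mV register steps,
+ * e.g. the 1800 mV default becomes (1800 - 1000) / 50 = 16.
+ */
+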
+MODULE_DESCRIPTION("Common Qualcomm WCD Codec helpers driver"); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/wcd-common.h b/sound/soc/codecs/wcd-common.h new file mode 100644 index 00000000000000..d5c156e641fc33 --- /dev/null +++ b/sound/soc/codecs/wcd-common.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef __WCD_COMMON_H__ +#define __WCD_COMMON_H__ + +struct device; +struct sdw_slave; +struct sdw_bus_params; +struct irq_domain; +enum sdw_slave_status; + +#define WCD_MAX_MICBIAS 4 + +struct wcd_sdw_ch_info { + int port_num; + unsigned int ch_mask; + unsigned int master_ch_mask; +}; + +#define WCD_SDW_CH(id, pn, cmask) \ + [id] = { \ + .port_num = pn, \ + .ch_mask = cmask, \ + .master_ch_mask = cmask, \ + } + +struct wcd_common { + struct device *dev; + int max_bias; + u32 micb_mv[WCD_MAX_MICBIAS]; + u32 micb_vout[WCD_MAX_MICBIAS]; +}; + +extern const struct component_ops wcd_sdw_component_ops; +int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv); +int wcd_dt_parse_micbias_info(struct wcd_common *common); +int wcd_update_status(struct sdw_slave *slave, enum sdw_slave_status status); +int wcd_bus_config(struct sdw_slave *slave, struct sdw_bus_params *params); +int wcd_interrupt_callback(struct sdw_slave *slave, struct irq_domain *slave_irq, + unsigned int wcd_intr_status0, unsigned int wcd_intr_status1, + unsigned int wcd_intr_status2); + +#endif /* __WCD_COMMON_H__ */ diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c index 1bb7e1dc7e6b0a..3c22f7149af81a 100644 --- a/sound/soc/codecs/wcd934x.c +++ b/sound/soc/codecs/wcd934x.c @@ -21,6 +21,7 @@ #include #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include @@ -116,9 +117,6 @@ #define WCD934X_DEC_PWR_LVL_DF 0x00 #define WCD934X_DEC_PWR_LVL_HYBRID WCD934X_DEC_PWR_LVL_DF -#define WCD934X_DEF_MICBIAS_MV 1800 -#define WCD934X_MAX_MICBIAS_MV 2850 - #define WCD_IIR_FILTER_SIZE (sizeof(u32) * BAND_MAX) #define WCD_IIR_FILTER_CTL(xname, iidx, bidx) \ @@ -530,6 +528,7 @@ struct wcd934x_codec { struct slim_device *sdev; struct slim_device *sidev; struct wcd_clsh_ctrl *clsh_ctrl; + struct wcd_common common; struct snd_soc_component *component; struct wcd934x_slim_ch rx_chs[WCD934X_RX_MAX]; struct wcd934x_slim_ch tx_chs[WCD934X_TX_MAX]; @@ -555,7 +554,6 @@ struct wcd934x_codec { struct mutex micb_lock; u32 micb_ref[WCD934X_MAX_MICBIAS]; u32 pullup_ref[WCD934X_MAX_MICBIAS]; - u32 micb2_mv; }; #define to_wcd934x_codec(_hw) container_of(_hw, struct wcd934x_codec, hw) @@ -2168,55 +2166,24 @@ static struct clk *wcd934x_register_mclk_output(struct wcd934x_codec *wcd) return NULL; } -static int wcd934x_get_micbias_val(struct device *dev, const char *micbias, - u32 *micb_mv) -{ - int mv; - - if (of_property_read_u32(dev->parent->of_node, micbias, &mv)) { - dev_err(dev, "%s value not found, using default\n", micbias); - mv = WCD934X_DEF_MICBIAS_MV; - } else { - /* convert it to milli volts */ - mv = mv/1000; - } - - if (mv < 1000 || mv > 2850) { - dev_err(dev, "%s value not in valid range, using default\n", - micbias); - mv = WCD934X_DEF_MICBIAS_MV; - } - - if (micb_mv) - *micb_mv = mv; - - return (mv - 1000) / 50; -} - static int wcd934x_init_dmic(struct snd_soc_component *comp) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; struct wcd934x_codec *wcd = dev_get_drvdata(comp->dev); u32 def_dmic_rate, dmic_clk_drv; + int ret; - vout_ctl_1 = 
wcd934x_get_micbias_val(comp->dev, - "qcom,micbias1-microvolt", NULL); - vout_ctl_2 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias2-microvolt", - &wcd->micb2_mv); - vout_ctl_3 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias3-microvolt", NULL); - vout_ctl_4 = wcd934x_get_micbias_val(comp->dev, - "qcom,micbias4-microvolt", NULL); + ret = wcd_dt_parse_mbhc_data(comp->dev, &wcd->mbhc_cfg); + if (ret) + return ret; snd_soc_component_update_bits(comp, WCD934X_ANA_MICB1, - WCD934X_MICB_VAL_MASK, vout_ctl_1); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[0]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB2, - WCD934X_MICB_VAL_MASK, vout_ctl_2); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[1]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB3, - WCD934X_MICB_VAL_MASK, vout_ctl_3); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[2]); snd_soc_component_update_bits(comp, WCD934X_ANA_MICB4, - WCD934X_MICB_VAL_MASK, vout_ctl_4); + WCD934X_MICB_VAL_MASK, wcd->common.micb_vout[3]); if (wcd->rate == WCD934X_MCLK_CLK_9P6MHZ) def_dmic_rate = WCD9XXX_DMIC_SAMPLE_RATE_4P8MHZ; @@ -2517,15 +2484,6 @@ static void wcd934x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd934x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) - return -EINVAL; - - return (micb_mv - 1000) / 50; -} - static int wcd934x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -2562,7 +2520,7 @@ static int wcd934x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD934X_MICB_VAL_MASK); - req_vout_ctl = wcd934x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -2610,10 +2568,10 @@ static int wcd934x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd934x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd934x->common.micb_mv[1] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd934x->micb2_mv; + micb_mv = req_en ? 
WCD_MBHC_THR_HS_MICB_MV : wcd934x->common.micb_mv[1]; rc = wcd934x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); @@ -3036,7 +2994,7 @@ static void wcd934x_mbhc_deinit(struct snd_soc_component *component) static int wcd934x_comp_probe(struct snd_soc_component *component) { struct wcd934x_codec *wcd = dev_get_drvdata(component->dev); - int i; + int i, ret; snd_soc_component_init_regmap(component, wcd->regmap); wcd->component = component; @@ -3054,7 +3012,12 @@ static int wcd934x_comp_probe(struct snd_soc_component *component) for (i = 0; i < NUM_CODEC_DAIS; i++) INIT_LIST_HEAD(&wcd->dai[i].slim_ch_list); - wcd934x_init_dmic(component); + + ret = wcd934x_init_dmic(component); + if (ret) { + dev_err(component->dev, "Failed to Initialize micbias\n"); + return ret; + } if (wcd934x_mbhc_init(component)) dev_err(component->dev, "Failed to Initialize MBHC\n"); @@ -5831,6 +5794,13 @@ static const struct snd_soc_component_driver wcd934x_component_drv = { .endianness = 1, }; +static void wcd934x_put_device_action(void *data) +{ + struct device *dev = data; + + put_device(dev); +} + static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) { struct device *dev = &wcd->sdev->dev; @@ -5847,11 +5817,13 @@ static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) return dev_err_probe(dev, -EINVAL, "Unable to get SLIM Interface device\n"); slim_get_logical_addr(wcd->sidev); - wcd->if_regmap = regmap_init_slimbus(wcd->sidev, + wcd->if_regmap = devm_regmap_init_slimbus(wcd->sidev, &wcd934x_ifc_regmap_config); - if (IS_ERR(wcd->if_regmap)) + if (IS_ERR(wcd->if_regmap)) { + put_device(&wcd->sidev->dev); return dev_err_probe(dev, PTR_ERR(wcd->if_regmap), "Failed to allocate ifc register map\n"); + } of_property_read_u32(dev->parent->of_node, "qcom,dmic-sample-rate", &wcd->dmic_sample_rate); @@ -5860,14 +5832,13 @@ static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd) cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD934X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd->micb2_mv; + cfg->micb_mv = wcd->common.micb_mv[1]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; wcd_dt_parse_mbhc_data(dev, cfg); - return 0; } @@ -5888,11 +5859,17 @@ static int wcd934x_codec_probe(struct platform_device *pdev) wcd->sdev = to_slim_device(data->dev); mutex_init(&wcd->sysclk_mutex); mutex_init(&wcd->micb_lock); + wcd->common.dev = dev->parent; + wcd->common.max_bias = 4; ret = wcd934x_codec_parse_data(wcd); if (ret) return ret; + ret = devm_add_action_or_reset(dev, wcd934x_put_device_action, &wcd->sidev->dev); + if (ret) + return ret; + /* set default rate 9P6MHz */ regmap_update_bits(wcd->regmap, WCD934X_CODEC_RPM_CLK_MCLK_CFG, WCD934X_CODEC_RPM_CLK_MCLK_CFG_MCLK_MASK, diff --git a/sound/soc/codecs/wcd937x-sdw.c b/sound/soc/codecs/wcd937x-sdw.c index 1bfe7383b31117..1878d67e3fa109 100644 --- a/sound/soc/codecs/wcd937x-sdw.c +++ b/sound/soc/codecs/wcd937x-sdw.c @@ -19,7 +19,7 @@ #include #include "wcd937x.h" -static struct wcd937x_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { +static struct wcd_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD937X_HPH_L, WCD937X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD937X_HPH_R, WCD937X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD937X_CLSH, WCD937X_CLSH_PORT, BIT(0)), @@ -30,7 +30,7 @@ static struct wcd937x_sdw_ch_info wcd937x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD937X_DSD_R, WCD937X_DSD_PORT, BIT(1)), }; -static struct wcd937x_sdw_ch_info wcd937x_sdw_tx_ch_info[] = { +static struct wcd_sdw_ch_info wcd937x_sdw_tx_ch_info[] = { 
WCD_SDW_CH(WCD937X_ADC1, WCD937X_ADC_1_PORT, BIT(0)), WCD_SDW_CH(WCD937X_ADC2, WCD937X_ADC_2_3_PORT, BIT(0)), WCD_SDW_CH(WCD937X_ADC3, WCD937X_ADC_2_3_PORT, BIT(0)), @@ -78,12 +78,6 @@ static struct sdw_dpn_prop wcd937x_dpn_prop[WCD937X_MAX_SWR_PORTS] = { } }; -struct device *wcd937x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); -} -EXPORT_SYMBOL_GPL(wcd937x_sdw_device_get); - int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, @@ -118,19 +112,6 @@ int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd937x_sdw_hw_params); -static int wcd9370_update_status(struct sdw_slave *slave, enum sdw_slave_status status) -{ - struct wcd937x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && status == SDW_SLAVE_ATTACHED) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - /* * Handle Soundwire out-of-band interrupt event by triggering * the first irq of the slave_irq irq domain, which then will @@ -141,18 +122,9 @@ static int wcd9370_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd937x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD937X_DIGITAL_INTR_STATUS_2, &sts3); - } while (sts1 || sts2 || sts3); - - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD937X_DIGITAL_INTR_STATUS_0, + WCD937X_DIGITAL_INTR_STATUS_1, WCD937X_DIGITAL_INTR_STATUS_2); } static const struct reg_default wcd937x_defaults[] = { @@ -985,35 +957,10 @@ static const struct regmap_config wcd937x_regmap_config = { }; static const struct sdw_slave_ops wcd9370_slave_ops = { - .update_status = wcd9370_update_status, + .update_status = wcd_update_status, .interrupt_callback = wcd9370_interrupt_callback, }; -static int wcd937x_sdw_component_bind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_set_autosuspend_delay(dev, 3000); - pm_runtime_use_autosuspend(dev); - pm_runtime_mark_last_busy(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - - return 0; -} - -static void wcd937x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_disable(dev); - pm_runtime_set_suspended(dev); - pm_runtime_dont_use_autosuspend(dev); -} - -static const struct component_ops wcd937x_sdw_component_ops = { - .bind = wcd937x_sdw_component_bind, - .unbind = wcd937x_sdw_component_unbind, -}; - static int wcd9370_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) { @@ -1099,7 +1046,7 @@ static int wcd9370_probe(struct sdw_slave *pdev, } - ret = component_add(dev, &wcd937x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) return ret; @@ -1113,7 +1060,7 @@ static int wcd9370_remove(struct sdw_slave *pdev) { struct device *dev = &pdev->dev; - component_del(dev, &wcd937x_sdw_component_ops); + component_del(dev, &wcd_sdw_component_ops); return 0; } diff --git a/sound/soc/codecs/wcd937x.c b/sound/soc/codecs/wcd937x.c index 3b0a8cc314e059..421ec7a2d6bdc8 100644 --- a/sound/soc/codecs/wcd937x.c +++ 
b/sound/soc/codecs/wcd937x.c @@ -21,6 +21,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd937x.h" @@ -85,6 +86,7 @@ struct wcd937x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -93,9 +95,6 @@ struct wcd937x_priv { s32 pullup_ref[WCD937X_MAX_MICBIAS]; u32 hph_mode; int ear_rx_path; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int aux_pdm_wd_int; @@ -872,15 +871,6 @@ static int wcd937x_enable_rx3(struct snd_soc_dapm_widget *w, return 0; } -static int wcd937x_get_micb_vout_ctl_val(u32 micb_mv) -{ - if (micb_mv < 1000 || micb_mv > 2850) { - pr_err("Unsupported micbias voltage (%u mV)\n", micb_mv); - return -EINVAL; - } - - return (micb_mv - 1000) / 50; -} static int wcd937x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) @@ -1193,7 +1183,7 @@ static int wcd937x_codec_enable_micbias_pullup(struct snd_soc_dapm_widget *w, static int wcd937x_connect_port(struct wcd937x_sdw_priv *wcd, u8 port_idx, u8 ch_id, bool enable) { struct sdw_port_config *port_config = &wcd->port_config[port_idx - 1]; - const struct wcd937x_sdw_ch_info *ch_info = &wcd->ch_info[ch_id]; + const struct wcd_sdw_ch_info *ch_info = &wcd->ch_info[ch_id]; u8 port_num = ch_info->port_num; u8 ch_mask = ch_info->ch_mask; u8 mstr_port_num, mstr_ch_mask; @@ -1481,7 +1471,7 @@ static int wcd937x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD937X_MICB_VOUT_MASK); - req_vout_ctl = wcd937x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -1529,10 +1519,10 @@ static int wcd937x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd937x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd937x->common.micb_mv[2] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd937x->micb2_mv; + micb_mv = req_en ? 
WCD_MBHC_THR_HS_MICB_MV : wcd937x->common.micb_mv[2]; return wcd937x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2046,9 +2036,9 @@ static const struct snd_kcontrol_new wcd937x_snd_controls[] = { SOC_ENUM_EXT("RX HPH Mode", rx_hph_mode_mux_enum, wcd937x_rx_hph_mode_get, wcd937x_rx_hph_mode_put), - SOC_SINGLE_EXT("HPHL_COMP Switch", SND_SOC_NOPM, 0, 1, 0, + SOC_SINGLE_EXT("HPHL_COMP Switch", WCD937X_COMP_L, 0, 1, 0, wcd937x_get_compander, wcd937x_set_compander), - SOC_SINGLE_EXT("HPHR_COMP Switch", SND_SOC_NOPM, 1, 1, 0, + SOC_SINGLE_EXT("HPHR_COMP Switch", WCD937X_COMP_R, 1, 1, 0, wcd937x_get_compander, wcd937x_set_compander), SOC_SINGLE_TLV("HPHL Volume", WCD937X_HPH_L_EN, 0, 20, 1, line_gain), @@ -2436,22 +2426,14 @@ static const struct snd_soc_dapm_route wcd9375_audio_map[] = { { "DMIC6_MIXER", "Switch", "DMIC6" }, }; -static int wcd937x_set_micbias_data(struct wcd937x_priv *wcd937x) +static void wcd937x_set_micbias_data(struct device *dev, struct wcd937x_priv *wcd937x) { - int vout_ctl[3]; - - /* Set micbias voltage */ - vout_ctl[0] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb1_mv); - vout_ctl[1] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb2_mv); - vout_ctl[2] = wcd937x_get_micb_vout_ctl_val(wcd937x->micb3_mv); - if ((vout_ctl[0] | vout_ctl[1] | vout_ctl[2]) < 0) - return -EINVAL; - - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB1, WCD937X_ANA_MICB_VOUT, vout_ctl[0]); - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB2, WCD937X_ANA_MICB_VOUT, vout_ctl[1]); - regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB3, WCD937X_ANA_MICB_VOUT, vout_ctl[2]); - - return 0; + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB1, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[0]); + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB2, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[1]); + regmap_update_bits(wcd937x->regmap, WCD937X_ANA_MICB3, WCD937X_ANA_MICB_VOUT, + wcd937x->common.micb_vout[2]); } static irqreturn_t wcd937x_wd_handle_irq(int irq, void *data) @@ -2630,31 +2612,6 @@ static const struct snd_soc_component_driver soc_codec_dev_wcd937x = { .endianness = 1, }; -static void wcd937x_dt_parse_micbias_info(struct device *dev, struct wcd937x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int ret = 0; - - ret = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!ret) - wcd->micb1_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias1 DT property not found\n"); - - ret = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!ret) - wcd->micb2_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias2 DT property not found\n"); - - ret = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!ret) - wcd->micb3_mv = prop_val / 1000; - else - dev_warn(dev, "Micbias3 DT property not found\n"); -} - static bool wcd937x_swap_gnd_mic(struct snd_soc_component *component) { int value; @@ -2788,7 +2745,7 @@ static int wcd937x_bind(struct device *dev) return ret; } - wcd937x->rxdev = wcd937x_sdw_device_get(wcd937x->rxnode); + wcd937x->rxdev = of_sdw_find_device_by_node(wcd937x->rxnode); if (!wcd937x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); return -EINVAL; @@ -2797,7 +2754,7 @@ static int wcd937x_bind(struct device *dev) wcd937x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd937x->rxdev); wcd937x->sdw_priv[AIF1_PB]->wcd937x = wcd937x; - wcd937x->txdev = wcd937x_sdw_device_get(wcd937x->txnode); + wcd937x->txdev = of_sdw_find_device_by_node(wcd937x->txnode); if 
(!wcd937x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); return -EINVAL; @@ -2833,7 +2790,7 @@ static int wcd937x_bind(struct device *dev) return -EINVAL; } - wcd937x->regmap = dev_get_regmap(&wcd937x->tx_sdw_dev->dev, NULL); + wcd937x->regmap = wcd937x->sdw_priv[AIF1_CAP]->regmap; if (!wcd937x->regmap) { dev_err(dev, "could not get TX device regmap\n"); return -EINVAL; @@ -2848,11 +2805,7 @@ static int wcd937x_bind(struct device *dev) wcd937x->sdw_priv[AIF1_PB]->slave_irq = wcd937x->virq; wcd937x->sdw_priv[AIF1_CAP]->slave_irq = wcd937x->virq; - ret = wcd937x_set_micbias_data(wcd937x); - if (ret < 0) { - dev_err(dev, "Bad micbias pdata\n"); - return ret; - } + wcd937x_set_micbias_data(dev, wcd937x); ret = snd_soc_register_component(dev, &soc_codec_dev_wcd937x, wcd937x_dais, ARRAY_SIZE(wcd937x_dais)); @@ -2920,6 +2873,8 @@ static int wcd937x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd937x); mutex_init(&wcd937x->micb_lock); + wcd937x->common.dev = dev; + wcd937x->common.max_bias = 3; wcd937x->reset_gpio = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); if (IS_ERR(wcd937x->reset_gpio)) @@ -2939,13 +2894,15 @@ static int wcd937x_probe(struct platform_device *pdev) if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd937x_dt_parse_micbias_info(dev, wcd937x); + ret = wcd_dt_parse_micbias_info(&wcd937x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get micbias\n"); cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD937X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd937x->micb2_mv; + cfg->micb_mv = wcd937x->common.micb_mv[2]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; diff --git a/sound/soc/codecs/wcd937x.h b/sound/soc/codecs/wcd937x.h index 3ab21bb5846e2c..3d0ba3cc0ee614 100644 --- a/sound/soc/codecs/wcd937x.h +++ b/sound/soc/codecs/wcd937x.h @@ -7,6 +7,7 @@ #include #include +#include "wcd-common.h" #define WCD937X_BASE_ADDRESS 0x3000 #define WCD937X_ANA_BIAS 0x3001 @@ -507,26 +508,13 @@ enum wcd937x_rx_sdw_ports { WCD937X_MAX_SWR_PORTS = WCD937X_DSD_PORT, }; -struct wcd937x_sdw_ch_info { - int port_num; - unsigned int ch_mask; - unsigned int master_ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - .master_ch_mask = cmask, \ - } - struct wcd937x_priv; struct wcd937x_sdw_priv { struct sdw_slave *sdev; struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD937X_MAX_SWR_PORTS]; - struct wcd937x_sdw_ch_info *ch_info; + struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD937X_MAX_SWR_CH_IDS]; unsigned int master_channel_map[SDW_MAX_PORTS]; int active_ports; @@ -549,24 +537,22 @@ int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); -struct device *wcd937x_sdw_device_get(struct device_node *np); - #else -int wcd937x_sdw_free(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_free(struct wcd937x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { return -EOPNOTSUPP; } -int wcd937x_sdw_set_sdw_stream(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_set_sdw_stream(struct wcd937x_sdw_priv *wcd, struct snd_soc_dai *dai, void *stream, int direction) { return -EOPNOTSUPP; } -int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, +static inline int wcd937x_sdw_hw_params(struct wcd937x_sdw_priv *wcd, struct 
snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) diff --git a/sound/soc/codecs/wcd938x-sdw.c b/sound/soc/codecs/wcd938x-sdw.c index e822cc14525066..add907cb270629 100644 --- a/sound/soc/codecs/wcd938x-sdw.c +++ b/sound/soc/codecs/wcd938x-sdw.c @@ -18,10 +18,9 @@ #include #include #include "wcd938x.h" +#include "wcd-common.h" -#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) - -static const struct wcd938x_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD938X_HPH_L, WCD938X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD938X_HPH_R, WCD938X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD938X_CLSH, WCD938X_CLSH_PORT, BIT(0)), @@ -32,7 +31,7 @@ static const struct wcd938x_sdw_ch_info wcd938x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD938X_DSD_R, WCD938X_DSD_PORT, BIT(1)), }; -static const struct wcd938x_sdw_ch_info wcd938x_sdw_tx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd938x_sdw_tx_ch_info[] = { WCD_SDW_CH(WCD938X_ADC1, WCD938X_ADC_1_2_PORT, BIT(0)), WCD_SDW_CH(WCD938X_ADC2, WCD938X_ADC_1_2_PORT, BIT(1)), WCD_SDW_CH(WCD938X_ADC3, WCD938X_ADC_3_4_PORT, BIT(0)), @@ -82,23 +81,6 @@ static struct sdw_dpn_prop wcd938x_dpn_prop[WCD938X_MAX_SWR_PORTS] = { } }; -struct device *wcd938x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); - -} -EXPORT_SYMBOL_GPL(wcd938x_sdw_device_get); - -int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) -{ - int bank; - - bank = sdw_read(sdev, SDW_SCP_CTRL); - - return ((bank & 0x40) ? 1 : 0); -} -EXPORT_SYMBOL_GPL(wcd938x_swr_get_current_bank); - int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, @@ -158,44 +140,13 @@ int wcd938x_sdw_set_sdw_stream(struct wcd938x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd938x_sdw_set_sdw_stream); -static int wcd9380_update_status(struct sdw_slave *slave, - enum sdw_slave_status status) -{ - struct wcd938x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && (status == SDW_SLAVE_ATTACHED)) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - -static int wcd9380_bus_config(struct sdw_slave *slave, - struct sdw_bus_params *params) -{ - sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), 0x01); - - return 0; -} - static int wcd9380_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd938x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD938X_DIGITAL_INTR_STATUS_2, &sts3); - } while (sts1 || sts2 || sts3); - - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD938X_DIGITAL_INTR_STATUS_0, + WCD938X_DIGITAL_INTR_STATUS_1, WCD938X_DIGITAL_INTR_STATUS_2); } static const struct reg_default wcd938x_defaults[] = { @@ -1193,25 +1144,9 @@ static const struct regmap_config wcd938x_regmap_config = { }; static const struct sdw_slave_ops wcd9380_slave_ops = { - .update_status = wcd9380_update_status, + .update_status = wcd_update_status, .interrupt_callback = wcd9380_interrupt_callback, - .bus_config = 
wcd9380_bus_config, -}; - -static int wcd938x_sdw_component_bind(struct device *dev, - struct device *master, void *data) -{ - return 0; -} - -static void wcd938x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ -} - -static const struct component_ops wcd938x_sdw_component_ops = { - .bind = wcd938x_sdw_component_bind, - .unbind = wcd938x_sdw_component_unbind, + .bus_config = wcd_bus_config, }; static int wcd9380_probe(struct sdw_slave *pdev, @@ -1278,7 +1213,7 @@ static int wcd9380_probe(struct sdw_slave *pdev, pm_runtime_set_active(dev); pm_runtime_enable(dev); - ret = component_add(dev, &wcd938x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) goto err_disable_rpm; @@ -1296,7 +1231,7 @@ static int wcd9380_remove(struct sdw_slave *pdev) { struct device *dev = &pdev->dev; - component_del(dev, &wcd938x_sdw_component_ops); + component_del(dev, &wcd_sdw_component_ops); pm_runtime_disable(dev); pm_runtime_set_suspended(dev); diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c index 711f373ece24cf..e1a4783b984c17 100644 --- a/sound/soc/codecs/wcd938x.c +++ b/sound/soc/codecs/wcd938x.c @@ -22,6 +22,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd938x.h" @@ -155,6 +156,7 @@ struct wcd938x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -169,10 +171,6 @@ struct wcd938x_priv { struct gpio_desc *us_euro_gpio; struct mux_control *us_euro_mux; unsigned int mux_state; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; - u32 micb4_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int aux_pdm_wd_int; @@ -396,7 +394,7 @@ static int wcd938x_io_init(struct wcd938x_priv *wcd938x) } -static int wcd938x_sdw_connect_port(const struct wcd938x_sdw_ch_info *ch_info, +static int wcd938x_sdw_connect_port(const struct wcd_sdw_ch_info *ch_info, struct sdw_port_config *port_config, u8 enable) { @@ -1094,8 +1092,7 @@ static int wcd938x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, int bank; int rate; - bank = (wcd938x_swr_get_current_bank(wcd938x->sdw_priv[AIF1_CAP]->sdev)) ? 0 : 1; - bank = bank ? 0 : 1; + bank = sdw_slave_get_current_bank(wcd938x->sdw_priv[AIF1_CAP]->sdev); switch (event) { case SND_SOC_DAPM_PRE_PMU: @@ -1975,15 +1972,6 @@ static void wcd938x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd938x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) - return -EINVAL; - - return (micb_mv - 1000) / 50; -} - static int wcd938x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -2020,7 +2008,7 @@ static int wcd938x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD938X_MICB_VOUT_MASK); - req_vout_ctl = wcd938x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = -EINVAL; goto exit; @@ -2068,10 +2056,10 @@ static int wcd938x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. 
*/ - if (wcd938x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd938x->common.micb_mv[1] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd938x->micb2_mv; + micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd938x->common.micb_mv[1]; return wcd938x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2976,28 +2964,16 @@ static const struct snd_soc_dapm_route wcd938x_audio_map[] = { {"EAR", NULL, "EAR PGA"}, }; -static int wcd938x_set_micbias_data(struct wcd938x_priv *wcd938x) +static void wcd938x_set_micbias_data(struct device *dev, struct wcd938x_priv *wcd938x) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; - - /* set micbias voltage */ - vout_ctl_1 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb1_mv); - vout_ctl_2 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb2_mv); - vout_ctl_3 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb3_mv); - vout_ctl_4 = wcd938x_get_micb_vout_ctl_val(wcd938x->micb4_mv); - if (vout_ctl_1 < 0 || vout_ctl_2 < 0 || vout_ctl_3 < 0 || vout_ctl_4 < 0) - return -EINVAL; - regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB1, - WCD938X_MICB_VOUT_MASK, vout_ctl_1); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[0]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB2, - WCD938X_MICB_VOUT_MASK, vout_ctl_2); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[1]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB3, - WCD938X_MICB_VOUT_MASK, vout_ctl_3); + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[2]); regmap_update_bits(wcd938x->regmap, WCD938X_ANA_MICB4, - WCD938X_MICB_VOUT_MASK, vout_ctl_4); - - return 0; + WCD938X_MICB_VOUT_MASK, wcd938x->common.micb_vout[3]); } static irqreturn_t wcd938x_wd_handle_irq(int irq, void *data) @@ -3201,37 +3177,6 @@ static const struct snd_soc_component_driver soc_codec_dev_wcd938x = { .endianness = 1, }; -static void wcd938x_dt_parse_micbias_info(struct device *dev, struct wcd938x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int rc = 0; - - rc = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!rc) - wcd->micb1_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias1 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!rc) - wcd->micb2_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias2 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!rc) - wcd->micb3_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias3 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias4-microvolt", &prop_val); - if (!rc) - wcd->micb4_mv = prop_val/1000; - else - dev_info(dev, "%s: Micbias4 DT property not found\n", __func__); -} - static bool wcd938x_swap_gnd_mic(struct snd_soc_component *component) { struct wcd938x_priv *wcd938x = snd_soc_component_get_drvdata(component); @@ -3296,13 +3241,15 @@ static int wcd938x_populate_dt_data(struct wcd938x_priv *wcd938x, struct device if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd938x_dt_parse_micbias_info(dev, wcd938x); + ret = wcd_dt_parse_micbias_info(&wcd938x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get micbias\n"); cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD938X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd938x->micb2_mv; + cfg->micb_mv = wcd938x->common.micb_mv[1]; cfg->linein_th = 5000; cfg->hs_thr
= 1700; cfg->hph_thr = 50; @@ -3400,7 +3347,7 @@ static int wcd938x_bind(struct device *dev) return ret; } - wcd938x->rxdev = wcd938x_sdw_device_get(wcd938x->rxnode); + wcd938x->rxdev = of_sdw_find_device_by_node(wcd938x->rxnode); if (!wcd938x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); ret = -EINVAL; @@ -3409,7 +3356,7 @@ static int wcd938x_bind(struct device *dev) wcd938x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd938x->rxdev); wcd938x->sdw_priv[AIF1_PB]->wcd938x = wcd938x; - wcd938x->txdev = wcd938x_sdw_device_get(wcd938x->txnode); + wcd938x->txdev = of_sdw_find_device_by_node(wcd938x->txnode); if (!wcd938x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); ret = -EINVAL; @@ -3442,7 +3389,7 @@ static int wcd938x_bind(struct device *dev) goto err_remove_tx_link; } - wcd938x->regmap = dev_get_regmap(&wcd938x->tx_sdw_dev->dev, NULL); + wcd938x->regmap = wcd938x->sdw_priv[AIF1_CAP]->regmap; if (!wcd938x->regmap) { dev_err(dev, "could not get TX device regmap\n"); ret = -EINVAL; @@ -3458,11 +3405,7 @@ static int wcd938x_bind(struct device *dev) wcd938x->sdw_priv[AIF1_PB]->slave_irq = wcd938x->virq; wcd938x->sdw_priv[AIF1_CAP]->slave_irq = wcd938x->virq; - ret = wcd938x_set_micbias_data(wcd938x); - if (ret < 0) { - dev_err(dev, "%s: bad micbias pdata\n", __func__); - goto err_remove_rx_link; - } + wcd938x_set_micbias_data(dev, wcd938x); ret = snd_soc_register_component(dev, &soc_codec_dev_wcd938x, wcd938x_dais, ARRAY_SIZE(wcd938x_dais)); @@ -3551,6 +3494,8 @@ static int wcd938x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd938x); mutex_init(&wcd938x->micb_lock); + wcd938x->common.dev = dev; + wcd938x->common.max_bias = 4; ret = wcd938x_populate_dt_data(wcd938x, dev); if (ret) diff --git a/sound/soc/codecs/wcd938x.h b/sound/soc/codecs/wcd938x.h index fb6a0e4ef33774..c18610466d7d83 100644 --- a/sound/soc/codecs/wcd938x.h +++ b/sound/soc/codecs/wcd938x.h @@ -587,17 +587,6 @@ #define WCD938X_MAX_SWR_CH_IDS 15 -struct wcd938x_sdw_ch_info { - int port_num; - unsigned int ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - } - enum wcd938x_tx_sdw_ports { WCD938X_ADC_1_2_PORT = 1, WCD938X_ADC_3_4_PORT, @@ -649,7 +638,7 @@ struct wcd938x_sdw_priv { struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD938X_MAX_SWR_PORTS]; - const struct wcd938x_sdw_ch_info *ch_info; + const struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD938X_MAX_SWR_CH_IDS]; int active_ports; bool is_tx; @@ -669,10 +658,6 @@ int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); - -struct device *wcd938x_sdw_device_get(struct device_node *np); -int wcd938x_swr_get_current_bank(struct sdw_slave *sdev); - #else static inline int wcd938x_sdw_free(struct wcd938x_sdw_priv *wcd, @@ -697,14 +682,5 @@ static inline int wcd938x_sdw_hw_params(struct wcd938x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline struct device *wcd938x_sdw_device_get(struct device_node *np) -{ - return NULL; -} - -static inline int wcd938x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return 0; -} #endif /* CONFIG_SND_SOC_WCD938X_SDW */ #endif /* __WCD938X_H__ */ diff --git a/sound/soc/codecs/wcd939x-sdw.c b/sound/soc/codecs/wcd939x-sdw.c index f7a9323a9feadb..d369100a245742 100644 --- a/sound/soc/codecs/wcd939x-sdw.c +++ b/sound/soc/codecs/wcd939x-sdw.c @@ -20,10 +20,9 @@ #include 
#include #include "wcd939x.h" +#include "wcd-common.h" -#define SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(m) (0xE0 + 0x10 * (m)) - -static const struct wcd939x_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD939X_HPH_L, WCD939X_HPH_PORT, BIT(0)), WCD_SDW_CH(WCD939X_HPH_R, WCD939X_HPH_PORT, BIT(1)), WCD_SDW_CH(WCD939X_CLSH, WCD939X_CLSH_PORT, BIT(0)), @@ -36,7 +35,7 @@ static const struct wcd939x_sdw_ch_info wcd939x_sdw_rx_ch_info[] = { WCD_SDW_CH(WCD939X_HIFI_PCM_R, WCD939X_HIFI_PCM_PORT, BIT(1)), }; -static const struct wcd939x_sdw_ch_info wcd939x_sdw_tx_ch_info[] = { +static const struct wcd_sdw_ch_info wcd939x_sdw_tx_ch_info[] = { WCD_SDW_CH(WCD939X_ADC1, WCD939X_ADC_1_4_PORT, BIT(0)), WCD_SDW_CH(WCD939X_ADC2, WCD939X_ADC_1_4_PORT, BIT(1)), WCD_SDW_CH(WCD939X_ADC3, WCD939X_ADC_1_4_PORT, BIT(2)), @@ -128,19 +127,6 @@ static struct sdw_dpn_prop wcd939x_tx_dpn_prop[WCD939X_MAX_TX_SWR_PORTS] = { } }; -struct device *wcd939x_sdw_device_get(struct device_node *np) -{ - return bus_find_device_by_of_node(&sdw_bus_type, np); -} -EXPORT_SYMBOL_GPL(wcd939x_sdw_device_get); - -unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return FIELD_GET(SDW_SCP_STAT_CURR_BANK, - sdw_read(sdev, SDW_SCP_CTRL)); -} -EXPORT_SYMBOL_GPL(wcd939x_swr_get_current_bank); - int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, @@ -199,38 +185,6 @@ int wcd939x_sdw_set_sdw_stream(struct wcd939x_sdw_priv *wcd, } EXPORT_SYMBOL_GPL(wcd939x_sdw_set_sdw_stream); -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd) -{ - if (wcd->regmap) - return wcd->regmap; - - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL_GPL(wcd939x_swr_get_regmap); - -static int wcd9390_update_status(struct sdw_slave *slave, - enum sdw_slave_status status) -{ - struct wcd939x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - - if (wcd->regmap && status == SDW_SLAVE_ATTACHED) { - /* Write out any cached changes that happened between probe and attach */ - regcache_cache_only(wcd->regmap, false); - return regcache_sync(wcd->regmap); - } - - return 0; -} - -static int wcd9390_bus_config(struct sdw_slave *slave, - struct sdw_bus_params *params) -{ - sdw_write(slave, SWRS_SCP_HOST_CLK_DIV2_CTL_BANK(params->next_bank), - 0x01); - - return 0; -} - /* * Handle Soundwire out-of-band interrupt event by triggering * the first irq of the slave_irq irq domain, which then will @@ -241,18 +195,9 @@ static int wcd9390_interrupt_callback(struct sdw_slave *slave, struct sdw_slave_intr_status *status) { struct wcd939x_sdw_priv *wcd = dev_get_drvdata(&slave->dev); - struct irq_domain *slave_irq = wcd->slave_irq; - u32 sts1, sts2, sts3; - - do { - handle_nested_irq(irq_find_mapping(slave_irq, 0)); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_0, &sts1); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_1, &sts2); - regmap_read(wcd->regmap, WCD939X_DIGITAL_INTR_STATUS_2, &sts3); - - } while (sts1 || sts2 || sts3); - return IRQ_HANDLED; + return wcd_interrupt_callback(slave, wcd->slave_irq, WCD939X_DIGITAL_INTR_STATUS_0, + WCD939X_DIGITAL_INTR_STATUS_1, WCD939X_DIGITAL_INTR_STATUS_2); } static const struct reg_default wcd939x_defaults[] = { @@ -1385,34 +1330,9 @@ static const struct regmap_config wcd939x_regmap_config = { }; static const struct sdw_slave_ops wcd9390_slave_ops = { - .update_status = wcd9390_update_status, + .update_status = wcd_update_status, .interrupt_callback = 
wcd9390_interrupt_callback, - .bus_config = wcd9390_bus_config, -}; - -static int wcd939x_sdw_component_bind(struct device *dev, struct device *master, - void *data) -{ - pm_runtime_set_autosuspend_delay(dev, 3000); - pm_runtime_use_autosuspend(dev); - pm_runtime_mark_last_busy(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - - return 0; -} - -static void wcd939x_sdw_component_unbind(struct device *dev, - struct device *master, void *data) -{ - pm_runtime_disable(dev); - pm_runtime_set_suspended(dev); - pm_runtime_dont_use_autosuspend(dev); -} - -static const struct component_ops wcd939x_sdw_component_ops = { - .bind = wcd939x_sdw_component_bind, - .unbind = wcd939x_sdw_component_unbind, + .bus_config = wcd_bus_config, }; static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) @@ -1478,7 +1398,7 @@ static int wcd9390_probe(struct sdw_slave *pdev, const struct sdw_device_id *id) regcache_cache_only(wcd->regmap, true); } - ret = component_add(dev, &wcd939x_sdw_component_ops); + ret = component_add(dev, &wcd_sdw_component_ops); if (ret) return ret; @@ -1493,7 +1413,7 @@ static int wcd9390_remove(struct sdw_slave *pdev) struct device *dev = &pdev->dev; struct wcd939x_sdw_priv *wcd = dev_get_drvdata(dev); - component_del(dev, &wcd939x_sdw_component_ops); + component_del(dev, &wcd_sdw_component_ops); if (wcd->regmap) regmap_exit(wcd->regmap); diff --git a/sound/soc/codecs/wcd939x.c b/sound/soc/codecs/wcd939x.c index 64f082e474c1d4..e74e6f0131318c 100644 --- a/sound/soc/codecs/wcd939x.c +++ b/sound/soc/codecs/wcd939x.c @@ -28,6 +28,7 @@ #include #include "wcd-clsh-v2.h" +#include "wcd-common.h" #include "wcd-mbhc-v2.h" #include "wcd939x.h" @@ -191,6 +192,7 @@ struct wcd939x_priv { struct wcd_mbhc_config mbhc_cfg; struct wcd_mbhc_intr intr_ids; struct wcd_clsh_ctrl *clsh_info; + struct wcd_common common; struct irq_domain *virq; struct regmap_irq_chip_data *irq_chip; struct snd_soc_jack *jack; @@ -201,10 +203,6 @@ struct wcd939x_priv { u32 tx_mode[TX_ADC_MAX]; int variant; struct gpio_desc *reset_gpio; - u32 micb1_mv; - u32 micb2_mv; - u32 micb3_mv; - u32 micb4_mv; int hphr_pdm_wd_int; int hphl_pdm_wd_int; int ear_pdm_wd_int; @@ -415,7 +413,7 @@ static int wcd939x_io_init(struct snd_soc_component *component) return 0; } -static int wcd939x_sdw_connect_port(const struct wcd939x_sdw_ch_info *ch_info, +static int wcd939x_sdw_connect_port(const struct wcd_sdw_ch_info *ch_info, struct sdw_port_config *port_config, u8 enable) { @@ -1017,7 +1015,7 @@ static int wcd939x_tx_swr_ctrl(struct snd_soc_dapm_widget *w, int bank; int rate; - bank = wcd939x_swr_get_current_bank(wcd939x->sdw_priv[AIF1_CAP]->sdev); + bank = sdw_slave_get_current_bank(wcd939x->sdw_priv[AIF1_CAP]->sdev); switch (event) { case SND_SOC_DAPM_PRE_PMU: @@ -1919,17 +1917,6 @@ static void wcd939x_mbhc_micb_ramp_control(struct snd_soc_component *component, } } -static int wcd939x_get_micb_vout_ctl_val(u32 micb_mv) -{ - /* min micbias voltage is 1V and maximum is 2.85V */ - if (micb_mv < 1000 || micb_mv > 2850) { - pr_err("%s: unsupported micbias voltage\n", __func__); - return -EINVAL; - } - - return (micb_mv - 1000) / 50; -} - static int wcd939x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, int req_volt, int micb_num) { @@ -1969,7 +1956,7 @@ static int wcd939x_mbhc_micb_adjust_voltage(struct snd_soc_component *component, cur_vout_ctl = snd_soc_component_read_field(component, micb_reg, WCD939X_MICB_VOUT_CTL); - req_vout_ctl = wcd939x_get_micb_vout_ctl_val(req_volt); + req_vout_ctl = 
wcd_get_micb_vout_ctl_val(component->dev, req_volt); if (req_vout_ctl < 0) { ret = req_vout_ctl; goto exit; @@ -2021,10 +2008,10 @@ static int wcd939x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *compon * voltage needed to detect threshold microphone, then do * not change the micbias, just return. */ - if (wcd939x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV) + if (wcd939x->common.micb_mv[1] >= WCD_MBHC_THR_HS_MICB_MV) return 0; - micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd939x->micb2_mv; + micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd939x->common.micb_mv[1]; return wcd939x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2); } @@ -2895,28 +2882,16 @@ static const struct snd_soc_dapm_route wcd939x_audio_map[] = { {"EAR", NULL, "EAR PGA"}, }; -static int wcd939x_set_micbias_data(struct wcd939x_priv *wcd939x) +static void wcd939x_set_micbias_data(struct device *dev, struct wcd939x_priv *wcd939x) { - int vout_ctl_1, vout_ctl_2, vout_ctl_3, vout_ctl_4; - - /* set micbias voltage */ - vout_ctl_1 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb1_mv); - vout_ctl_2 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb2_mv); - vout_ctl_3 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb3_mv); - vout_ctl_4 = wcd939x_get_micb_vout_ctl_val(wcd939x->micb4_mv); - if (vout_ctl_1 < 0 || vout_ctl_2 < 0 || vout_ctl_3 < 0 || vout_ctl_4 < 0) - return -EINVAL; - regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB1, - WCD939X_MICB_VOUT_CTL, vout_ctl_1); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[0]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB2, - WCD939X_MICB_VOUT_CTL, vout_ctl_2); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[1]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB3, - WCD939X_MICB_VOUT_CTL, vout_ctl_3); + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[2]); regmap_update_bits(wcd939x->regmap, WCD939X_ANA_MICB4, - WCD939X_MICB_VOUT_CTL, vout_ctl_4); - - return 0; + WCD939X_MICB_VOUT_CTL, wcd939x->common.micb_vout[3]); } static irqreturn_t wcd939x_wd_handle_irq(int irq, void *data) @@ -3186,37 +3161,6 @@ static int wcd939x_typec_mux_set(struct typec_mux_dev *mux, } #endif /* CONFIG_TYPEC */ -static void wcd939x_dt_parse_micbias_info(struct device *dev, struct wcd939x_priv *wcd) -{ - struct device_node *np = dev->of_node; - u32 prop_val = 0; - int rc = 0; - - rc = of_property_read_u32(np, "qcom,micbias1-microvolt", &prop_val); - if (!rc) - wcd->micb1_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias1 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias2-microvolt", &prop_val); - if (!rc) - wcd->micb2_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias2 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias3-microvolt", &prop_val); - if (!rc) - wcd->micb3_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias3 DT property not found\n", __func__); - - rc = of_property_read_u32(np, "qcom,micbias4-microvolt", &prop_val); - if (!rc) - wcd->micb4_mv = prop_val / 1000; - else - dev_info(dev, "%s: Micbias4 DT property not found\n", __func__); -} - #if IS_ENABLED(CONFIG_TYPEC) static bool wcd939x_swap_gnd_mic(struct snd_soc_component *component) { @@ -3252,13 +3196,15 @@ static int wcd939x_populate_dt_data(struct wcd939x_priv *wcd939x, struct device if (ret) return dev_err_probe(dev, ret, "Failed to get and enable supplies\n"); - wcd939x_dt_parse_micbias_info(dev, wcd939x); + ret = wcd_dt_parse_micbias_info(&wcd939x->common); + if (ret) + return dev_err_probe(dev, ret, "Failed to get micbias\n"); 
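The wcd-common helpers that both drivers above now call are not part of this diff. The following is a rough sketch reconstructed from the per-driver copies removed above; the layout of struct wcd_common and anything not visible at the call sites (dev, max_bias, micb_mv[], micb_vout[]) are assumptions.

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/of.h>

/* Assumed layout, inferred from the fields the call sites touch */
struct wcd_common {
	struct device *dev;
	int max_bias;		/* set to 4 by both probe functions */
	u32 micb_mv[4];		/* parsed micbias voltages, in mV */
	u32 micb_vout[4];	/* precomputed VOUT_CTL register values */
};

/* min micbias voltage is 1V, max is 2.85V, one register step per 50 mV */
int wcd_get_micb_vout_ctl_val(struct device *dev, u32 micb_mv)
{
	if (micb_mv < 1000 || micb_mv > 2850) {
		dev_err(dev, "unsupported micbias voltage: %u mV\n", micb_mv);
		return -EINVAL;
	}

	return (micb_mv - 1000) / 50;
}

int wcd_dt_parse_micbias_info(struct wcd_common *common)
{
	struct device_node *np = common->dev->of_node;
	char prop[32];
	u32 prop_val;
	int vout;
	int i;

	for (i = 0; i < common->max_bias; i++) {
		snprintf(prop, sizeof(prop), "qcom,micbias%d-microvolt", i + 1);
		if (of_property_read_u32(np, prop, &prop_val)) {
			dev_info(common->dev, "%s DT property not found\n", prop);
			continue;
		}

		common->micb_mv[i] = prop_val / 1000;
		vout = wcd_get_micb_vout_ctl_val(common->dev, common->micb_mv[i]);
		if (vout < 0)
			return vout;
		common->micb_vout[i] = vout;
	}

	return 0;
}

Doing the range check once at parse time is what lets the set_micbias_data() functions above drop their error paths and become plain register writes.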
cfg->mbhc_micbias = MIC_BIAS_2; cfg->anc_micbias = MIC_BIAS_2; cfg->v_hs_max = WCD_MBHC_HS_V_MAX; cfg->num_btn = WCD939X_MBHC_MAX_BUTTONS; - cfg->micb_mv = wcd939x->micb2_mv; + cfg->micb_mv = wcd939x->common.micb_mv[1]; cfg->linein_th = 5000; cfg->hs_thr = 1700; cfg->hph_thr = 50; @@ -3383,7 +3329,7 @@ static int wcd939x_bind(struct device *dev) goto err_put_typec_switch; } - wcd939x->rxdev = wcd939x_sdw_device_get(wcd939x->rxnode); + wcd939x->rxdev = of_sdw_find_device_by_node(wcd939x->rxnode); if (!wcd939x->rxdev) { dev_err(dev, "could not find slave with matching of node\n"); ret = -EINVAL; @@ -3392,7 +3338,7 @@ static int wcd939x_bind(struct device *dev) wcd939x->sdw_priv[AIF1_PB] = dev_get_drvdata(wcd939x->rxdev); wcd939x->sdw_priv[AIF1_PB]->wcd939x = wcd939x; - wcd939x->txdev = wcd939x_sdw_device_get(wcd939x->txnode); + wcd939x->txdev = of_sdw_find_device_by_node(wcd939x->txnode); if (!wcd939x->txdev) { dev_err(dev, "could not find txslave with matching of node\n"); ret = -EINVAL; @@ -3428,10 +3374,10 @@ static int wcd939x_bind(struct device *dev) } /* Get regmap from TX SoundWire device */ - wcd939x->regmap = wcd939x_swr_get_regmap(wcd939x->sdw_priv[AIF1_CAP]); - if (IS_ERR(wcd939x->regmap)) { + wcd939x->regmap = wcd939x->sdw_priv[AIF1_CAP]->regmap; + if (!wcd939x->regmap) { dev_err(dev, "could not get TX device regmap\n"); - ret = PTR_ERR(wcd939x->regmap); + ret = -ENODEV; goto err_remove_rx_link; } @@ -3444,11 +3390,7 @@ static int wcd939x_bind(struct device *dev) wcd939x->sdw_priv[AIF1_PB]->slave_irq = wcd939x->virq; wcd939x->sdw_priv[AIF1_CAP]->slave_irq = wcd939x->virq; - ret = wcd939x_set_micbias_data(wcd939x); - if (ret < 0) { - dev_err(dev, "%s: bad micbias pdata\n", __func__); - goto err_remove_rx_link; - } + wcd939x_set_micbias_data(dev, wcd939x); /* Check WCD9395 version */ regmap_read(wcd939x->regmap, WCD939X_DIGITAL_CHIP_ID1, &id1); @@ -3613,6 +3555,8 @@ static int wcd939x_probe(struct platform_device *pdev) dev_set_drvdata(dev, wcd939x); mutex_init(&wcd939x->micb_lock); + wcd939x->common.dev = dev; + wcd939x->common.max_bias = 4; ret = wcd939x_populate_dt_data(wcd939x, dev); if (ret) { diff --git a/sound/soc/codecs/wcd939x.h b/sound/soc/codecs/wcd939x.h index 3204fb10b58d7d..6bd2366587a8f1 100644 --- a/sound/soc/codecs/wcd939x.h +++ b/sound/soc/codecs/wcd939x.h @@ -844,17 +844,6 @@ #define WCD939X_MAX_SWR_CH_IDS (15) -struct wcd939x_sdw_ch_info { - int port_num; - unsigned int ch_mask; -}; - -#define WCD_SDW_CH(id, pn, cmask) \ - [id] = { \ - .port_num = pn, \ - .ch_mask = cmask, \ - } - enum wcd939x_tx_sdw_ports { WCD939X_ADC_1_4_PORT = 1, WCD939X_ADC_DMIC_1_2_PORT, @@ -909,7 +898,7 @@ struct wcd939x_sdw_priv { struct sdw_stream_config sconfig; struct sdw_stream_runtime *sruntime; struct sdw_port_config port_config[WCD939X_MAX_SWR_PORTS]; - const struct wcd939x_sdw_ch_info *ch_info; + const struct wcd_sdw_ch_info *ch_info; bool port_enable[WCD939X_MAX_SWR_CH_IDS]; int active_ports; bool is_tx; @@ -929,11 +918,6 @@ int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai); - -struct device *wcd939x_sdw_device_get(struct device_node *np); -unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev); - -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd); #else static inline int wcd939x_sdw_free(struct wcd939x_sdw_priv *wcd, @@ -958,20 +942,6 @@ static inline int wcd939x_sdw_hw_params(struct wcd939x_sdw_priv *wcd, return -EOPNOTSUPP; } -static inline struct 
device *wcd939x_sdw_device_get(struct device_node *np) -{ - return NULL; -} - -static inline unsigned int wcd939x_swr_get_current_bank(struct sdw_slave *sdev) -{ - return 0; -} - -struct regmap *wcd939x_swr_get_regmap(struct wcd939x_sdw_priv *wcd) -{ - return PTR_ERR(-EINVAL); -} #endif /* CONFIG_SND_SOC_WCD939X_SDW */ #endif /* __WCD939X_H__ */ diff --git a/sound/soc/codecs/wm8993.c b/sound/soc/codecs/wm8993.c index 9be4f6cadba39e..75d923c2c9cac8 100644 --- a/sound/soc/codecs/wm8993.c +++ b/sound/soc/codecs/wm8993.c @@ -1536,7 +1536,7 @@ static int wm8993_probe(struct snd_soc_component *component) * VMID as an output and can disable it. */ if (wm8993->pdata.lineout1_diff && wm8993->pdata.lineout2_diff) - dapm->idle_bias_off = 1; + dapm->idle_bias = false; return 0; diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 240ec1bed23470..128c3a59beac3d 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -4182,8 +4182,8 @@ static int wm8994_component_probe(struct snd_soc_component *component) wm8994->micdet_irq = control->pdata.micdet_irq; - /* By default use idle_bias_off, will override for WM8994 */ - dapm->idle_bias_off = 1; + /* By default use idle_bias false, will override for WM8994 */ + dapm->idle_bias = false; /* Set revision-specific configuration */ switch (control->type) { @@ -4191,7 +4191,7 @@ static int wm8994_component_probe(struct snd_soc_component *component) /* Single ended line outputs should have VMID on. */ if (!control->pdata.lineout1_diff || !control->pdata.lineout2_diff) - dapm->idle_bias_off = 0; + dapm->idle_bias = true; switch (control->revision) { case 2: diff --git a/sound/soc/fsl/fsl_qmc_audio.c b/sound/soc/fsl/fsl_qmc_audio.c index 5614a8b909edf8..3de448ef724cb3 100644 --- a/sound/soc/fsl/fsl_qmc_audio.c +++ b/sound/soc/fsl/fsl_qmc_audio.c @@ -17,12 +17,6 @@ #include #include -struct qmc_dai_chan { - struct qmc_dai_prtd *prtd_tx; - struct qmc_dai_prtd *prtd_rx; - struct qmc_chan *qmc_chan; -}; - struct qmc_dai { char *name; int id; @@ -33,7 +27,7 @@ struct qmc_dai { unsigned int nb_chans_avail; unsigned int nb_chans_used_tx; unsigned int nb_chans_used_rx; - struct qmc_dai_chan *chans; + struct qmc_chan **qmc_chans; }; struct qmc_audio { @@ -57,7 +51,6 @@ struct qmc_dai_prtd { size_t ch_dma_offset; unsigned int channels; - DECLARE_BITMAP(chans_pending, 64); struct snd_pcm_substream *substream; }; @@ -126,17 +119,14 @@ static int qmc_audio_pcm_write_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - bitmap_set(prtd->chans_pending, i, 1); - - ret = qmc_chan_write_submit(prtd->qmc_dai->chans[i].qmc_chan, + ret = qmc_chan_write_submit(prtd->qmc_dai->qmc_chans[i], prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, - qmc_audio_pcm_write_complete, - &prtd->qmc_dai->chans[i]); + i == prtd->channels - 1 ? qmc_audio_pcm_write_complete : + NULL, prtd); if (ret) { dev_err(prtd->qmc_dai->dev, "write_submit %u failed %d\n", i, ret); - bitmap_clear(prtd->chans_pending, i, 1); return ret; } } @@ -146,20 +136,7 @@ static int qmc_audio_pcm_write_submit(struct qmc_dai_prtd *prtd) static void qmc_audio_pcm_write_complete(void *context) { - struct qmc_dai_chan *chan = context; - struct qmc_dai_prtd *prtd; - - prtd = chan->prtd_tx; - - /* Mark the current channel as completed */ - bitmap_clear(prtd->chans_pending, chan - prtd->qmc_dai->chans, 1); - - /* - * All QMC channels involved must have completed their transfer before - * submitting a new one. 
- */ - if (!bitmap_empty(prtd->chans_pending, 64)) - return; + struct qmc_dai_prtd *prtd = context; prtd->buffer_ended += prtd->period_size; if (prtd->buffer_ended >= prtd->buffer_size) @@ -182,17 +159,14 @@ static int qmc_audio_pcm_read_submit(struct qmc_dai_prtd *prtd) int ret; for (i = 0; i < prtd->channels; i++) { - bitmap_set(prtd->chans_pending, i, 1); - - ret = qmc_chan_read_submit(prtd->qmc_dai->chans[i].qmc_chan, + ret = qmc_chan_read_submit(prtd->qmc_dai->qmc_chans[i], prtd->ch_dma_addr_current + i * prtd->ch_dma_offset, prtd->ch_dma_size, - qmc_audio_pcm_read_complete, - &prtd->qmc_dai->chans[i]); + i == prtd->channels - 1 ? qmc_audio_pcm_read_complete : + NULL, prtd); if (ret) { dev_err(prtd->qmc_dai->dev, "read_submit %u failed %d\n", i, ret); - bitmap_clear(prtd->chans_pending, i, 1); return ret; } } @@ -202,26 +176,13 @@ static int qmc_audio_pcm_read_submit(struct qmc_dai_prtd *prtd) static void qmc_audio_pcm_read_complete(void *context, size_t length, unsigned int flags) { - struct qmc_dai_chan *chan = context; - struct qmc_dai_prtd *prtd; - - prtd = chan->prtd_rx; - - /* Mark the current channel as completed */ - bitmap_clear(prtd->chans_pending, chan - prtd->qmc_dai->chans, 1); + struct qmc_dai_prtd *prtd = context; if (length != prtd->ch_dma_size) { dev_err(prtd->qmc_dai->dev, "read complete length = %zu, exp %zu\n", length, prtd->ch_dma_size); } - /* - * All QMC channels involved must have completed their transfer before - * submitting a new one. - */ - if (!bitmap_empty(prtd->chans_pending, 64)) - return; - prtd->buffer_ended += prtd->period_size; if (prtd->buffer_ended >= prtd->buffer_size) prtd->buffer_ended = 0; @@ -239,7 +200,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, struct snd_pcm_substream *substream, int cmd) { struct qmc_dai_prtd *prtd = substream->runtime->private_data; - unsigned int i; int ret; if (!prtd->qmc_dai) { @@ -249,14 +209,10 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, switch (cmd) { case SNDRV_PCM_TRIGGER_START: - bitmap_zero(prtd->chans_pending, 64); prtd->buffer_ended = 0; prtd->ch_dma_addr_current = prtd->ch_dma_addr_start; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - for (i = 0; i < prtd->channels; i++) - prtd->qmc_dai->chans[i].prtd_tx = prtd; - /* Submit first chunk ... */ ret = qmc_audio_pcm_write_submit(prtd); if (ret) @@ -272,9 +228,6 @@ static int qmc_audio_pcm_trigger(struct snd_soc_component *component, if (ret) return ret; } else { - for (i = 0; i < prtd->channels; i++) - prtd->qmc_dai->chans[i].prtd_rx = prtd; - /* Submit first chunk ... 
*/ ret = qmc_audio_pcm_read_submit(prtd); if (ret) @@ -644,9 +597,9 @@ static int qmc_dai_hw_params(struct snd_pcm_substream *substream, chan_param.mode = QMC_TRANSPARENT; chan_param.transp.max_rx_buf_size = params_period_bytes(params) / nb_chans_used; for (i = 0; i < nb_chans_used; i++) { - ret = qmc_chan_set_param(qmc_dai->chans[i].qmc_chan, &chan_param); + ret = qmc_chan_set_param(qmc_dai->qmc_chans[i], &chan_param); if (ret) { - dev_err(dai->dev, "chans[%u], set param failed %d\n", + dev_err(dai->dev, "qmc_chans[%u], set param failed %d\n", i, ret); return ret; } @@ -688,7 +641,7 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: for (i = 0; i < nb_chans_used; i++) { - ret = qmc_chan_start(qmc_dai->chans[i].qmc_chan, direction); + ret = qmc_chan_start(qmc_dai->qmc_chans[i], direction); if (ret) goto err_stop; } @@ -697,13 +650,13 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_STOP: /* Stop and reset all QMC channels and return the first error encountered */ for (i = 0; i < nb_chans_used; i++) { - ret_tmp = qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_stop(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; if (ret_tmp) continue; - ret_tmp = qmc_chan_reset(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_reset(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; } @@ -715,7 +668,7 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, case SNDRV_PCM_TRIGGER_PAUSE_PUSH: /* Stop all QMC channels and return the first error encountered */ for (i = 0; i < nb_chans_used; i++) { - ret_tmp = qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); + ret_tmp = qmc_chan_stop(qmc_dai->qmc_chans[i], direction); if (!ret) ret = ret_tmp; } @@ -731,8 +684,8 @@ static int qmc_dai_trigger(struct snd_pcm_substream *substream, int cmd, err_stop: while (i--) { - qmc_chan_stop(qmc_dai->chans[i].qmc_chan, direction); - qmc_chan_reset(qmc_dai->chans[i].qmc_chan, direction); + qmc_chan_stop(qmc_dai->qmc_chans[i], direction); + qmc_chan_reset(qmc_dai->qmc_chans[i], direction); } return ret; } @@ -791,12 +744,17 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * struct qmc_dai *qmc_dai, struct snd_soc_dai_driver *qmc_soc_dai_driver) { + struct qmc_chan_ts_info ts_info; struct qmc_chan_info info; unsigned long rx_fs_rate; unsigned long tx_fs_rate; + int prev_last_rx_ts = 0; + int prev_last_tx_ts = 0; unsigned int nb_tx_ts; unsigned int nb_rx_ts; unsigned int i; + int last_rx_ts; + int last_tx_ts; int count; u32 val; int ret; @@ -823,19 +781,20 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * return dev_err_probe(qmc_audio->dev, -EINVAL, "dai %d no QMC channel defined\n", qmc_dai->id); - qmc_dai->chans = devm_kcalloc(qmc_audio->dev, count, sizeof(*qmc_dai->chans), GFP_KERNEL); - if (!qmc_dai->chans) + qmc_dai->qmc_chans = devm_kcalloc(qmc_audio->dev, count, sizeof(*qmc_dai->qmc_chans), + GFP_KERNEL); + if (!qmc_dai->qmc_chans) return -ENOMEM; for (i = 0; i < count; i++) { - qmc_dai->chans[i].qmc_chan = devm_qmc_chan_get_byphandles_index(qmc_audio->dev, np, - "fsl,qmc-chan", i); - if (IS_ERR(qmc_dai->chans[i].qmc_chan)) { - return dev_err_probe(qmc_audio->dev, PTR_ERR(qmc_dai->chans[i].qmc_chan), + qmc_dai->qmc_chans[i] = devm_qmc_chan_get_byphandles_index(qmc_audio->dev, np, + "fsl,qmc-chan", i); + if 
(IS_ERR(qmc_dai->qmc_chans[i])) { + return dev_err_probe(qmc_audio->dev, PTR_ERR(qmc_dai->qmc_chans[i]), "dai %d get QMC channel %d failed\n", qmc_dai->id, i); } - ret = qmc_chan_get_info(qmc_dai->chans[i].qmc_chan, &info); + ret = qmc_chan_get_info(qmc_dai->qmc_chans[i], &info); if (ret) { dev_err(qmc_audio->dev, "dai %d get QMC %d channel info failed %d\n", qmc_dai->id, i, ret); @@ -879,6 +838,30 @@ static int qmc_audio_dai_parse(struct qmc_audio *qmc_audio, struct device_node * return -EINVAL; } } + + ret = qmc_chan_get_ts_info(qmc_dai->qmc_chans[i], &ts_info); + if (ret) { + dev_err(qmc_audio->dev, "dai %d get QMC %d channel TS info failed %d\n", + qmc_dai->id, i, ret); + return ret; + } + + last_rx_ts = fls64(ts_info.rx_ts_mask); + last_tx_ts = fls64(ts_info.tx_ts_mask); + + if (prev_last_rx_ts > last_rx_ts) { + dev_err(qmc_audio->dev, "dai %d QMC chan %d unordered channels (RX timeslot %d before %d)\n", + qmc_dai->id, i, prev_last_rx_ts, last_rx_ts); + return -EINVAL; + } + if (prev_last_tx_ts > last_tx_ts) { + dev_err(qmc_audio->dev, "dai %d QMC chan %d unordered channels (TX timeslot %d before %d)\n", + qmc_dai->id, i, prev_last_tx_ts, last_tx_ts); + return -EINVAL; + } + + prev_last_rx_ts = last_rx_ts; + prev_last_tx_ts = last_tx_ts; } qmc_dai->nb_chans_avail = count; diff --git a/sound/soc/intel/avs/boards/es8336.c b/sound/soc/intel/avs/boards/es8336.c index 12e4e0aba5fa24..eb2b40894e3f20 100644 --- a/sound/soc/intel/avs/boards/es8336.c +++ b/sound/soc/intel/avs/boards/es8336.c @@ -132,7 +132,7 @@ static int avs_es8336_codec_init(struct snd_soc_pcm_runtime *runtime) snd_jack_set_key(data->jack.jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); snd_soc_component_set_jack(component, &data->jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt274.c b/sound/soc/intel/avs/boards/rt274.c index 67d2c4585cddc4..4055ecc60838f3 100644 --- a/sound/soc/intel/avs/boards/rt274.c +++ b/sound/soc/intel/avs/boards/rt274.c @@ -117,7 +117,7 @@ static int avs_rt274_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; } - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/avs/boards/rt5640.c b/sound/soc/intel/avs/boards/rt5640.c index 706b84ffe1ef02..97d1fa944188fa 100644 --- a/sound/soc/intel/avs/boards/rt5640.c +++ b/sound/soc/intel/avs/boards/rt5640.c @@ -67,7 +67,7 @@ static int avs_rt5640_codec_init(struct snd_soc_pcm_runtime *runtime) return ret; snd_soc_component_set_jack(codec_dai->component, jack, NULL); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; return 0; } diff --git a/sound/soc/intel/boards/bytcht_cx2072x.c b/sound/soc/intel/boards/bytcht_cx2072x.c index 68a3d345dc25df..27b63a853a48e4 100644 --- a/sound/soc/intel/boards/bytcht_cx2072x.c +++ b/sound/soc/intel/boards/bytcht_cx2072x.c @@ -77,7 +77,7 @@ static int byt_cht_cx2072x_init(struct snd_soc_pcm_runtime *rtd) byt_cht_cx2072x_acpi_gpios)) dev_warn(rtd->dev, "Unable to add GPIO mapping table\n"); - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* set the default PLL rate, the clock is handled by the codec driver */ ret = snd_soc_dai_set_sysclk(snd_soc_rtd_to_codec(rtd, 0), CX2072X_MCLK_EXTERNAL_PLL, diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index b384d38654e658..3b5f63112237ea 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -179,7 +179,7 @@ static int 
byt_cht_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; switch (BYT_CHT_ES8316_MAP(quirk)) { case BYT_CHT_ES8316_INTMIC_IN1_MAP: diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index bc846558480e41..1e9b1903fae810 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -1324,7 +1324,7 @@ static int byt_rt5640_init(struct snd_soc_pcm_runtime *runtime) int num_routes = 0; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; jack_data->use_platform_clock = true; /* Start with RC clk for jack-detect (we disable MCLK below) */ diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 604a35d380e9ab..ca540a66f22ce0 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -586,7 +586,7 @@ static int byt_rt5651_init(struct snd_soc_pcm_runtime *runtime) int report; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; /* Start with RC clk for jack-detect (we disable MCLK below) */ if (byt_rt5651_quirk & BYT_RT5651_MCLK_EN) diff --git a/sound/soc/intel/boards/bytcr_wm5102.c b/sound/soc/intel/boards/bytcr_wm5102.c index a6dfbcfdf74e31..72c0e5941ae840 100644 --- a/sound/soc/intel/boards/bytcr_wm5102.c +++ b/sound/soc/intel/boards/bytcr_wm5102.c @@ -288,7 +288,7 @@ static int byt_wm5102_init(struct snd_soc_pcm_runtime *runtime) const struct snd_soc_dapm_route *custom_map = NULL; int ret, jack_type, num_routes = 0; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; ret = snd_soc_add_card_controls(card, byt_wm5102_controls, ARRAY_SIZE(byt_wm5102_controls)); diff --git a/sound/soc/intel/boards/sof_es8336.c b/sound/soc/intel/boards/sof_es8336.c index 1211a2b8a2a2c7..10b189ea88dba8 100644 --- a/sound/soc/intel/boards/sof_es8336.c +++ b/sound/soc/intel/boards/sof_es8336.c @@ -276,7 +276,7 @@ static int sof_es8316_init(struct snd_soc_pcm_runtime *runtime) int num_routes; int ret; - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; if (quirk & SOC_ES8336_HEADSET_MIC1) { custom_map = sof_es8316_headset_mic1_map; diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 36cf7e51b72dfa..a2897f0237836e 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -873,7 +873,7 @@ static int create_sdw_dailink(struct snd_soc_card *card, (*codec_conf)++; } - if (sof_end->include_sidecar) { + if (sof_end->include_sidecar && sof_end->codec_info->add_sidecar) { ret = sof_end->codec_info->add_sidecar(card, dai_links, codec_conf); if (ret) return ret; diff --git a/sound/soc/intel/catpt/pcm.c b/sound/soc/intel/catpt/pcm.c index 46acb7fdc547d8..bf734c69c4e095 100644 --- a/sound/soc/intel/catpt/pcm.c +++ b/sound/soc/intel/catpt/pcm.c @@ -568,8 +568,9 @@ static const struct snd_pcm_hardware catpt_pcm_hardware = { SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_NO_PERIOD_WAKEUP, .formats = SNDRV_PCM_FMTBIT_S16_LE | - SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, .period_bytes_min = PAGE_SIZE, .period_bytes_max = CATPT_BUFFER_MAX_SIZE / CATPT_PCM_PERIODS_MIN, .periods_min = CATPT_PCM_PERIODS_MIN, @@ -698,14 +699,18 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = 
SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, .capture = { .stream_name = "Analog Capture", .channels_min = 2, .channels_max = 4, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -717,7 +722,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -729,7 +736,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -741,7 +750,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { diff --git a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c index 75dc8935a79460..ec9fd8486c0534 100644 --- a/sound/soc/intel/common/soc-acpi-intel-mtl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-mtl-match.c @@ -948,6 +948,30 @@ static const struct snd_soc_acpi_adr_device cs42l42_0_adr[] = { } }; +static const struct snd_soc_acpi_adr_device tas2783_0_adr[] = { + { + .adr = 0x0000380102000001ull, + .num_endpoints = 1, + .endpoints = &spk_l_endpoint, + .name_prefix = "tas2783-1" + }, + { + .adr = 0x0000390102000001ull, + .num_endpoints = 1, + .endpoints = &spk_r_endpoint, + .name_prefix = "tas2783-2" + } +}; + +static const struct snd_soc_acpi_link_adr tas2783_link0[] = { + { + .mask = BIT(0), + .num_adr = ARRAY_SIZE(tas2783_0_adr), + .adr_d = tas2783_0_adr, + }, + {} +}; + static const struct snd_soc_acpi_link_adr cs42l42_link0_max98363_link2[] = { /* Expected order: jack -> amp */ { @@ -1080,6 +1104,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[] = { .drv_name = "sof_sdw", .sof_tplg_filename = "sof-mtl-rt715-rt711-rt1308-mono.tplg", }, + { + .link_mask = BIT(0), + .links = tas2783_link0, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-mtl-tas2783.tplg", + }, { .link_mask = GENMASK(3, 0), .links = mtl_rt713_l0_rt1316_l12_rt1713_l3, diff --git a/sound/soc/qcom/qdsp6/audioreach.c b/sound/soc/qcom/qdsp6/audioreach.c index bbfd51db879766..2365424a9b42bf 100644 --- a/sound/soc/qcom/qdsp6/audioreach.c +++ b/sound/soc/qcom/qdsp6/audioreach.c @@ -883,6 +883,7 @@ static int audioreach_set_compr_media_format(struct media_format *media_fmt_hdr, struct payload_media_fmt_aac_t *aac_cfg; struct payload_media_fmt_pcm *mp3_cfg; struct payload_media_fmt_flac_t *flac_cfg; + struct payload_media_fmt_opus_t *opus_cfg; switch (mcfg->fmt) { case SND_AUDIOCODEC_MP3: @@ -925,6 +926,32 @@ static int audioreach_set_compr_media_format(struct 
media_format *media_fmt_hdr, flac_cfg->min_frame_size = mcfg->codec.options.flac_d.min_frame_size; flac_cfg->max_frame_size = mcfg->codec.options.flac_d.max_frame_size; break; + case SND_AUDIOCODEC_OPUS_RAW: + media_fmt_hdr->data_format = DATA_FORMAT_RAW_COMPRESSED; + media_fmt_hdr->fmt_id = MEDIA_FMT_ID_OPUS; + media_fmt_hdr->payload_size = sizeof(*opus_cfg); + p = p + sizeof(*media_fmt_hdr); + opus_cfg = p; + /* raw Opus packets are prepended with a 4-byte length field */ + opus_cfg->bitstream_format = 1; + /* + * payload_type: + * 0 -- read metadata from opus stream; + * 1 -- metadata is provided by filling in the struct here. + */ + opus_cfg->payload_type = 1; + opus_cfg->version = mcfg->codec.options.opus_d.version; + opus_cfg->num_channels = mcfg->codec.options.opus_d.num_channels; + opus_cfg->pre_skip = mcfg->codec.options.opus_d.pre_skip; + opus_cfg->sample_rate = mcfg->codec.options.opus_d.sample_rate; + opus_cfg->output_gain = mcfg->codec.options.opus_d.output_gain; + opus_cfg->mapping_family = mcfg->codec.options.opus_d.mapping_family; + opus_cfg->stream_count = mcfg->codec.options.opus_d.chan_map.stream_count; + opus_cfg->coupled_count = mcfg->codec.options.opus_d.chan_map.coupled_count; + memcpy(opus_cfg->channel_mapping, mcfg->codec.options.opus_d.chan_map.channel_map, + sizeof(opus_cfg->channel_mapping)); + opus_cfg->reserved[0] = opus_cfg->reserved[1] = opus_cfg->reserved[2] = 0; + break; default: return -EINVAL; } @@ -995,6 +1022,7 @@ static int audioreach_i2s_set_media_format(struct q6apm_graph *graph, param_data->param_id = PARAM_ID_I2S_INTF_CFG; param_data->param_size = ic_sz - APM_MODULE_PARAM_DATA_SIZE; + intf_cfg->cfg.lpaif_type = module->hw_interface_type; intf_cfg->cfg.intf_idx = module->hw_interface_idx; intf_cfg->cfg.sd_line_idx = module->sd_line_idx; diff --git a/sound/soc/qcom/qdsp6/audioreach.h b/sound/soc/qcom/qdsp6/audioreach.h index 790fba96e34db0..d1b60b36468a86 100644 --- a/sound/soc/qcom/qdsp6/audioreach.h +++ b/sound/soc/qcom/qdsp6/audioreach.h @@ -31,6 +31,7 @@ struct q6apm_graph; #define MODULE_ID_MP3_DECODE 0x0700103B #define MODULE_ID_GAPLESS 0x0700104D #define MODULE_ID_DISPLAY_PORT_SINK 0x07001069 +#define MODULE_ID_OPUS_DEC 0x07001174 #define APM_CMD_GET_SPF_STATE 0x01001021 #define APM_CMD_RSP_GET_SPF_STATE 0x02001007 @@ -257,6 +258,22 @@ struct payload_media_fmt_aac_t { uint32_t sample_rate; } __packed; +#define MEDIA_FMT_ID_OPUS 0x09001039 +struct payload_media_fmt_opus_t { + uint16_t bitstream_format; + uint16_t payload_type; + uint8_t version; + uint8_t num_channels; + uint16_t pre_skip; + uint32_t sample_rate; + uint16_t output_gain; + uint8_t mapping_family; + uint8_t stream_count; + uint8_t coupled_count; + uint8_t channel_mapping[8]; + uint8_t reserved[3]; +} __packed; + #define DATA_CMD_WR_SH_MEM_EP_EOS 0x04001002 #define WR_SH_MEM_EP_EOS_POLICY_LAST 1 #define WR_SH_MEM_EP_EOS_POLICY_EACH 2 diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index 09da26f712a6ad..4ecaff45c51860 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -551,10 +551,11 @@ static int q6apm_dai_compr_get_caps(struct snd_soc_component *component, caps->max_fragment_size = COMPR_PLAYBACK_MAX_FRAGMENT_SIZE; caps->min_fragments = COMPR_PLAYBACK_MIN_NUM_FRAGMENTS; caps->max_fragments = COMPR_PLAYBACK_MAX_NUM_FRAGMENTS; - caps->num_codecs = 3; + caps->num_codecs = 4; caps->codecs[0] = SND_AUDIOCODEC_MP3; caps->codecs[1] = SND_AUDIOCODEC_AAC; caps->codecs[2] = SND_AUDIOCODEC_FLAC; + caps->codecs[3] = SND_AUDIOCODEC_OPUS_RAW; return 0; }
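For reference, the bitstream_format = 1 comment above means each raw Opus packet handed to the DSP carries a 4-byte length header. Below is a minimal userspace-side sketch of that framing; the little-endian byte order is an assumption (the patch does not spell out the header's endianness), chosen here to match the other LE-packed QDSP payloads.

#include <stdint.h>
#include <string.h>

/*
 * Prepend one raw Opus packet with a 4-byte length header, as expected
 * when bitstream_format = 1. Returns the framed size, or 0 if the
 * destination buffer is too small.
 */
static size_t opus_frame_packet(uint8_t *dst, size_t dst_len,
				const uint8_t *pkt, uint32_t pkt_len)
{
	if (dst_len < 4 + (size_t)pkt_len)
		return 0;

	/* assumed little-endian length prefix */
	dst[0] = pkt_len & 0xff;
	dst[1] = (pkt_len >> 8) & 0xff;
	dst[2] = (pkt_len >> 16) & 0xff;
	dst[3] = (pkt_len >> 24) & 0xff;
	memcpy(dst + 4, pkt, pkt_len);

	return 4 + (size_t)pkt_len;
}

With payload_type = 1, the decoder takes the OggOpus-style ID-header fields from the struct filled in above rather than parsing them out of the stream itself.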
diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index 20974f10406bfe..528756f1332bcf 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -262,6 +262,7 @@ static const struct snd_soc_dai_ops q6i2s_ops = { .shutdown = q6apm_lpass_dai_shutdown, .set_channel_map = q6dma_set_channel_map, .hw_params = q6dma_hw_params, + .set_fmt = q6i2s_set_fmt, }; static const struct snd_soc_dai_ops q6hdmi_ops = { diff --git a/sound/soc/qcom/qdsp6/q6apm.c b/sound/soc/qcom/qdsp6/q6apm.c index b4ffa0f0b188e2..0e667a7eb5467b 100644 --- a/sound/soc/qcom/qdsp6/q6apm.c +++ b/sound/soc/qcom/qdsp6/q6apm.c @@ -354,6 +354,9 @@ int q6apm_set_real_module_id(struct device *dev, struct q6apm_graph *graph, case SND_AUDIOCODEC_FLAC: module_id = MODULE_ID_FLAC_DEC; break; + case SND_AUDIOCODEC_OPUS_RAW: + module_id = MODULE_ID_OPUS_DEC; + break; default: return -EINVAL; } diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e2581..78e327bc2f0776 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,10 @@ static int sc8280xp_snd_init(struct snd_soc_pcm_runtime *rtd) int dp_pcm_id = 0; switch (cpu_dai->id) { + case PRIMARY_MI2S_RX...QUATERNARY_MI2S_TX: + case QUINARY_MI2S_RX...QUINARY_MI2S_TX: + snd_soc_dai_set_fmt(cpu_dai, SND_SOC_DAIFMT_BP_FP); + break; case WSA_CODEC_DMA_RX_0: case WSA_CODEC_DMA_RX_1: /* @@ -82,8 +87,10 @@ static int sc8280xp_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, SNDRV_PCM_HW_PARAM_RATE); struct snd_interval *channels = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); + struct snd_mask *fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); rate->min = rate->max = 48000; + snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S16_LE); channels->min = 2; channels->max = 2; switch (cpu_dai->id) { @@ -186,9 +193,9 @@ static int sc8280xp_platform_probe(struct platform_device *pdev) static const struct of_device_id snd_sc8280xp_dt_match[] = { {.compatible = "qcom,qcm6490-idp-sndcard", "qcm6490"}, {.compatible = "qcom,qcs6490-rb3gen2-sndcard", "qcs6490"}, - {.compatible = "qcom,qcs8275-sndcard", "qcs8275"}, - {.compatible = "qcom,qcs9075-sndcard", "qcs9075"}, - {.compatible = "qcom,qcs9100-sndcard", "qcs9100"}, + {.compatible = "qcom,qcs8275-sndcard", "qcs8300"}, + {.compatible = "qcom,qcs9075-sndcard", "sa8775p"}, + {.compatible = "qcom,qcs9100-sndcard", "sa8775p"}, {.compatible = "qcom,sc8280xp-sndcard", "sc8280xp"}, {.compatible = "qcom,sm8450-sndcard", "sm8450"}, {.compatible = "qcom,sm8550-sndcard", "sm8550"}, diff --git a/sound/soc/renesas/rcar/msiof.c b/sound/soc/renesas/rcar/msiof.c index f5338bbb037c55..f2addfbac92374 100644 --- a/sound/soc/renesas/rcar/msiof.c +++ b/sound/soc/renesas/rcar/msiof.c @@ -7,7 +7,7 @@ // /* - * [NOTE] + * [NOTE-CLOCK-MODE] * * This driver doesn't support Clock/Frame Provider Mode * @@ -24,12 +24,64 @@ * Clock/Frame Consumer Mode. */ +/* + * [NOTE-RESET] + * + * MSIOF has TXRST/RXRST bits to reset its FIFOs, but they must not be used while the SYNC signal + * is asserted, because that triggers a HW issue. + * + * When MSIOF is used as a sound driver, it is assumed to run in clock consumer mode + * (= the Codec is the clock provider). This means it can't control the SYNC signal by itself. + * + * We need to use SW reset (= reset_control_xxx()) instead of TXRST/RXRST. + */
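As a minimal sketch of the count-guarded shared-reset pattern this note motivates, and which msiof_hw_start()/msiof_hw_stop() below implement: locking and error handling are elided, and the private struct is reduced to the two relevant fields.

#include <linux/reset.h>

struct shared_reset {
	struct reset_control *reset;	/* asserted while idle */
	int count;			/* number of active streams */
};

static void shared_reset_start(struct shared_reset *s)
{
	/* first user releases the block from reset */
	if (!s->count)
		reset_control_deassert(s->reset);
	s->count++;
}

static void shared_reset_stop(struct shared_reset *s)
{
	/* last user puts the block back into reset */
	if (!--s->count)
		reset_control_assert(s->reset);
}

Keeping the line asserted whenever the count is zero mirrors the probe path below, which asserts the reset immediately after acquiring it.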
+ +/* + * [NOTE-BOTH-SETTING] + * + * SITMDRn / SIRMDRn and some other registers must not be updated while the unit is working, even + * if the update is not related to the target direction (for example, applying TX settings while RX + * is running); otherwise it causes an FSERR. + * + * Set up both directions (Playback/Capture) at the same time. + */ + +/* + * [NOTE-R/L] + * + * Captured data might have R/L swapped. + * + * This driver assumes MSIOF is used in Clock/Frame Consumer Mode, and some Codecs (= Clock/Frame + * Providers) might output Clock/Frame before MSIOF is set up. It depends on the Codec driver + * implementation. + * + * MSIOF captures data without checking whether the SYNC signal is Hi/Low (= R/L). + * + * This means that if the MSIOF RXE bit is set to 1 while the SYNC signal is Hi (= R), capture + * starts at the next SYNC low signal (= L). Because Linux assumes sound data is ordered as + * R->L->R->L->..., the captured data will have R/L swapped. + * + * The only solution in this case is to start CLK/SYNC *after* the MSIOF setup, but when and how + * that happens depends on the Codec driver. + */ + +/* + * [NOTE-FSERR] + * + * We can't remove all FSERR occurrences. + * + * Renesas has tried to minimize the occurrence of FSERR as much as possible, but unfortunately it + * cannot be removed completely, because MSIOF might set up its registers while CLK/SYNC are being + * input. This can happen because MSIOF is working as Clock/Frame Consumer. + */ + #include #include #include #include #include #include +#include #include #include #include @@ -60,10 +112,13 @@ struct msiof_priv { struct device *dev; struct snd_pcm_substream *substream[SNDRV_PCM_STREAM_LAST + 1]; + struct reset_control *reset; spinlock_t lock; void __iomem *base; resource_size_t phy_addr; + int count; + /* for error */ int err_syc[SNDRV_PCM_STREAM_LAST + 1]; int err_ovf[SNDRV_PCM_STREAM_LAST + 1]; @@ -121,7 +176,7 @@ static int msiof_hw_start(struct snd_soc_component *component, /* * see - * [NOTE] on top of this driver + * [NOTE-CLOCK-MODE] on top of this driver */ /* * see @@ -131,42 +186,63 @@ * RX: Fig 109.15 */ - /* reset errors */ - priv->err_syc[substream->stream] = + /* + * Use reset_control_xx() instead of TXRST/RXRST. + * see + * [NOTE-RESET] + */ + if (!priv->count) + reset_control_deassert(priv->reset); + + priv->count++; + + /* + * Reset errors. Ignore the 1st FSERR. + * + * see + * [NOTE-FSERR] + */ + priv->err_syc[substream->stream] = -1; priv->err_ovf[substream->stream] = priv->err_udf[substream->stream] = 0; /* Start DMAC */ snd_dmaengine_pcm_trigger(substream, cmd); + /* + * Set up both directions (Playback/Capture) at the same time.
+ * see + * above [NOTE-BOTH-SETTING] + */ + /* SITMDRx */ - if (is_play) { - val = SITMDR1_PCON | - FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR) | - SIMDR1_SYNCAC | SIMDR1_XXSTP; - if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) - val |= FIELD_PREP(SIMDR1_DTDL, 1); + val = SITMDR1_PCON | SIMDR1_SYNCAC | SIMDR1_XXSTP | + FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR); + if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) + val |= FIELD_PREP(SIMDR1_DTDL, 1); - msiof_write(priv, SITMDR1, val); + msiof_write(priv, SITMDR1, val); - val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); - msiof_write(priv, SITMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); - msiof_write(priv, SITMDR3, val); + val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); + msiof_write(priv, SITMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); + msiof_write(priv, SITMDR3, val); - } /* SIRMDRx */ - else { - val = FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR) | - SIMDR1_SYNCAC; - if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) - val |= FIELD_PREP(SIMDR1_DTDL, 1); + val = SIMDR1_SYNCAC | + FIELD_PREP(SIMDR1_SYNCMD, SIMDR1_SYNCMD_LR); + if (msiof_flag_has(priv, MSIOF_FLAGS_NEED_DELAY)) + val |= FIELD_PREP(SIMDR1_DTDL, 1); - msiof_write(priv, SIRMDR1, val); + msiof_write(priv, SIRMDR1, val); - val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); - msiof_write(priv, SIRMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); - msiof_write(priv, SIRMDR3, val); - } + val = FIELD_PREP(SIMDR2_BITLEN1, width - 1); + msiof_write(priv, SIRMDR2, val | FIELD_PREP(SIMDR2_GRP, 1)); + msiof_write(priv, SIRMDR3, val); + + /* SIFCTR */ + msiof_write(priv, SIFCTR, + FIELD_PREP(SIFCTR_TFWM, SIFCTR_TFWM_1) | + FIELD_PREP(SIFCTR_RFWM, SIFCTR_RFWM_1)); /* SIIER */ if (is_play) @@ -183,10 +259,11 @@ static int msiof_hw_start(struct snd_soc_component *component, msiof_update(priv, SISTR, val, val); /* SICTR */ + val = SICTR_TEDG | SICTR_REDG; if (is_play) - val = SICTR_TXE | SICTR_TEDG; + val |= SICTR_TXE; else - val = SICTR_RXE | SICTR_REDG; + val |= SICTR_RXE; msiof_update_and_wait(priv, SICTR, val, val, val); return 0; @@ -207,9 +284,6 @@ static int msiof_hw_stop(struct snd_soc_component *component, val = SIIER_RDREQE | SIIER_RDMAE | SISTR_ERR_RX; msiof_update(priv, SIIER, val, 0); - /* Stop DMAC */ - snd_dmaengine_pcm_trigger(substream, cmd); - /* SICTR */ if (is_play) val = SICTR_TXE; @@ -217,6 +291,18 @@ static int msiof_hw_stop(struct snd_soc_component *component, val = SICTR_RXE; msiof_update_and_wait(priv, SICTR, val, 0, 0); + /* Stop DMAC */ + snd_dmaengine_pcm_trigger(substream, cmd); + + /* + * Ignore 1st FSERR + * + * see + * [NOTE-FSERR] + */ + if (priv->err_syc[substream->stream] < 0) + priv->err_syc[substream->stream] = 0; + /* indicate error status if exist */ if (priv->err_syc[substream->stream] || priv->err_ovf[substream->stream] || @@ -227,6 +313,11 @@ static int msiof_hw_stop(struct snd_soc_component *component, priv->err_ovf[substream->stream], priv->err_udf[substream->stream]); + priv->count--; + + if (!priv->count) + reset_control_assert(priv->reset); + return 0; } @@ -302,6 +393,9 @@ static struct snd_soc_dai_driver msiof_dai_driver = { .channels_max = 2, }, .ops = &msiof_dai_ops, + .symmetric_rate = 1, + .symmetric_channels = 1, + .symmetric_sample_bits = 1, }; static struct snd_pcm_hardware msiof_pcm_hardware = { @@ -490,12 +584,19 @@ static int msiof_probe(struct platform_device *pdev) if (IS_ERR(priv->base)) return PTR_ERR(priv->base); + priv->reset = devm_reset_control_get_exclusive(dev, NULL); + if (IS_ERR(priv->reset)) + return PTR_ERR(priv->reset); + + 
reset_control_assert(priv->reset); + ret = devm_request_irq(dev, irq, msiof_interrupt, 0, dev_name(dev), priv); if (ret) return ret; priv->dev = dev; priv->phy_addr = res->start; + priv->count = 0; spin_lock_init(&priv->lock); platform_set_drvdata(pdev, priv); diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile index daf01911355371..a87c53e1a2c18e 100644 --- a/sound/soc/sdw_utils/Makefile +++ b/sound/soc/sdw_utils/Makefile @@ -6,5 +6,6 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \ soc_sdw_bridge_cs35l56.o \ soc_sdw_cs42l42.o soc_sdw_cs42l43.o \ soc_sdw_cs_amp.o \ - soc_sdw_maxim.o + soc_sdw_maxim.o \ + soc_sdw_ti_amp.o obj-$(CONFIG_SND_SOC_SDW_UTILS) += snd-soc-sdw-utils.o diff --git a/sound/soc/sdw_utils/soc_sdw_ti_amp.c b/sound/soc/sdw_utils/soc_sdw_ti_amp.c new file mode 100644 index 00000000000000..f0011360ae9b6c --- /dev/null +++ b/sound/soc/sdw_utils/soc_sdw_ti_amp.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2025 Texas Instruments Inc. + +/* + * soc_sdw_ti_amp - Helpers to handle TI's soundwire based codecs + */ + +#include +#include +#include +#include +#include +#include + +#define TIAMP_SPK_VOLUME_0DB 200 + +int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, + const char *name_prefix) +{ + char *volume_ctl_name; + int ret; + + volume_ctl_name = kasprintf(GFP_KERNEL, "%s Speaker Volume", + name_prefix); + if (!volume_ctl_name) + return -ENOMEM; + + ret = snd_soc_limit_volume(card, volume_ctl_name, + TIAMP_SPK_VOLUME_0DB); + if (ret) + dev_err(card->dev, + "%s update failed %d\n", + volume_ctl_name, ret); + + kfree(volume_ctl_name); + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_initial_settings, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, + struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + char widget_name[16]; + char speaker[16]; + struct snd_soc_dapm_route route = {speaker, NULL, widget_name}; + struct snd_soc_dai *codec_dai; + const char *prefix; + int i, ret = 0; + + for_each_rtd_codec_dais(rtd, i, codec_dai) { + if (!strstr(codec_dai->name, "tas2783")) + continue; + + prefix = codec_dai->component->name_prefix; + if (!strncmp(prefix, "tas2783-1", strlen("tas2783-1"))) { + strscpy(speaker, "Left Spk", sizeof(speaker)); + } else if (!strncmp(prefix, "tas2783-2", strlen("tas2783-2"))) { + strscpy(speaker, "Right Spk", sizeof(speaker)); + } else { + ret = -EINVAL; + dev_err(card->dev, "unhandled prefix %s", prefix); + break; + } + + snprintf(widget_name, sizeof(widget_name), "%s SPK", prefix); + ret = asoc_sdw_ti_amp_initial_settings(card, prefix); + if (ret) + return ret; + + ret = snd_soc_dapm_add_routes(&card->dapm, &route, 1); + if (ret) + return ret; + } + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_spk_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_ti_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback) +{ + if (!playback) + return 0; + + info->amp_num++; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_ti_amp_init, "SND_SOC_SDW_UTILS"); diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index ba20177a2405a1..0460e2a8c50a67 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -35,12 +35,12 @@ static const struct snd_kcontrol_new generic_spk_controls[] = { SOC_DAPM_PIN_SWITCH("Speaker"), }; -static const struct snd_soc_dapm_widget maxim_widgets[] = { +static const 
struct snd_soc_dapm_widget lr_spk_widgets[] = { SND_SOC_DAPM_SPK("Left Spk", NULL), SND_SOC_DAPM_SPK("Right Spk", NULL), }; -static const struct snd_kcontrol_new maxim_controls[] = { +static const struct snd_kcontrol_new lr_spk_controls[] = { SOC_DAPM_PIN_SWITCH("Left Spk"), SOC_DAPM_PIN_SWITCH("Right Spk"), }; @@ -58,6 +58,24 @@ static const struct snd_kcontrol_new rt700_controls[] = { }; struct asoc_sdw_codec_info codec_info_list[] = { + { + .part_id = 0x0000, /* TAS2783A */ + .dais = { + { + .direction = {true, true}, + .dai_name = "tas2783-codec", + .dai_type = SOC_SDW_DAI_TYPE_AMP, + .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_AMP_IN_DAI_ID}, + .init = asoc_sdw_ti_amp_init, + .rtd_init = asoc_sdw_ti_spk_rtd_init, + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), + }, + }, + .dai_num = 1, + }, { .part_id = 0x700, .name_prefix = "rt700", @@ -468,10 +486,10 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_AMP_IN_DAI_ID}, .init = asoc_sdw_maxim_init, .rtd_init = asoc_sdw_maxim_spk_rtd_init, - .controls = maxim_controls, - .num_controls = ARRAY_SIZE(maxim_controls), - .widgets = maxim_widgets, - .num_widgets = ARRAY_SIZE(maxim_widgets), + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), }, }, .dai_num = 1, @@ -488,10 +506,10 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID}, .init = asoc_sdw_maxim_init, .rtd_init = asoc_sdw_maxim_spk_rtd_init, - .controls = maxim_controls, - .num_controls = ARRAY_SIZE(maxim_controls), - .widgets = maxim_widgets, - .num_widgets = ARRAY_SIZE(maxim_widgets), + .controls = lr_spk_controls, + .num_controls = ARRAY_SIZE(lr_spk_controls), + .widgets = lr_spk_widgets, + .num_widgets = ARRAY_SIZE(lr_spk_widgets), }, }, .dai_num = 1, diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index acdead21378610..9f323bfafbe836 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -717,7 +717,7 @@ int snd_soc_suspend(struct device *dev) * means it's doing something, * otherwise fall through. */ - if (dapm->idle_bias_off) { + if (!dapm->idle_bias) { dev_dbg(component->dev, "ASoC: idle_bias_off CODEC on over suspend\n"); break; @@ -1652,7 +1652,7 @@ static int soc_probe_component(struct snd_soc_card *card, if (ret < 0) goto err_probe; - WARN(dapm->idle_bias_off && + WARN(!dapm->idle_bias && dapm->bias_level != SND_SOC_BIAS_OFF, "codec %s can not start from non-off bias with idle_bias_off==1\n", component->name); diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index d74c096cc20840..51fb09d56c5a17 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -165,6 +165,27 @@ static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) 
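A minimal sketch of the idle_bias polarity change, not part of the patch: the old idle_bias_off boolean is replaced by idle_bias with the opposite sense, so idle_bias == true now means "hold STANDBY bias while idle", and the driver's suspend_bias_off special case moves into the new getter. Call sites flip their tests accordingly; with `target` standing in for a context's target_bias_level:

	/* before: this context may drop to SND_SOC_BIAS_OFF while idle */
	if (dapm->idle_bias_off)
		target = SND_SOC_BIAS_OFF;

	/* after: the same decision with the flag's polarity inverted */
	if (!snd_soc_dapm_get_idle_bias(dapm))
		target = SND_SOC_BIAS_OFF;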
kfree(buf); } +struct device *snd_soc_dapm_to_dev(struct snd_soc_dapm_context *dapm) +{ + if (dapm->component) + return dapm->component->dev; + + return dapm->card->dev; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_dev); + +struct snd_soc_card *snd_soc_dapm_to_card(struct snd_soc_dapm_context *dapm) +{ + return dapm->card; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_card); + +struct snd_soc_component *snd_soc_dapm_to_component(struct snd_soc_dapm_context *dapm) +{ + return dapm->component; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_to_component); + static bool dapm_dirty_widget(struct snd_soc_dapm_widget *w) { return !list_empty(&w->dirty); @@ -2159,21 +2180,26 @@ static void dapm_power_one_widget(struct snd_soc_dapm_widget *w, dapm_seq_insert(w, down_list, false); } -static bool dapm_idle_bias_off(struct snd_soc_dapm_context *dapm) +bool snd_soc_dapm_get_idle_bias(struct snd_soc_dapm_context *dapm) { - if (dapm->idle_bias_off) - return true; + if (dapm->idle_bias) { + struct snd_soc_component *component = snd_soc_dapm_to_component(dapm); + unsigned int state = snd_power_get_state(dapm->card->snd_card); - switch (snd_power_get_state(dapm->card->snd_card)) { - case SNDRV_CTL_POWER_D3hot: - case SNDRV_CTL_POWER_D3cold: - return dapm->suspend_bias_off; - default: - break; + if ((state == SNDRV_CTL_POWER_D3hot || (state == SNDRV_CTL_POWER_D3cold)) && + component) + return !component->driver->suspend_bias_off; } - return false; + return dapm->idle_bias; } +EXPORT_SYMBOL_GPL(snd_soc_dapm_get_idle_bias); + +void snd_soc_dapm_set_idle_bias(struct snd_soc_dapm_context *dapm, bool on) +{ + dapm->idle_bias = on; +} +EXPORT_SYMBOL_GPL(snd_soc_dapm_set_idle_bias); /* * Scan each dapm widget for complete audio path. @@ -2200,10 +2226,10 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, trace_snd_soc_dapm_start(card, event); for_each_card_dapms(card, d) { - if (dapm_idle_bias_off(d)) - d->target_bias_level = SND_SOC_BIAS_OFF; - else + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = SND_SOC_BIAS_STANDBY; + else + d->target_bias_level = SND_SOC_BIAS_OFF; } dapm_reset(card); @@ -2267,7 +2293,7 @@ static int dapm_power_widgets(struct snd_soc_card *card, int event, if (d->target_bias_level > bias) bias = d->target_bias_level; for_each_card_dapms(card, d) - if (!dapm_idle_bias_off(d)) + if (snd_soc_dapm_get_idle_bias(d)) d->target_bias_level = bias; trace_snd_soc_dapm_walk_done(card); @@ -4801,8 +4827,7 @@ void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm, if (component) { dapm->dev = component->dev; - dapm->idle_bias_off = !component->driver->idle_bias_on; - dapm->suspend_bias_off = component->driver->suspend_bias_off; + dapm->idle_bias = component->driver->idle_bias_on; } else { dapm->dev = card->dev; } diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index a629e0eacb20eb..d2b6fb8e0b6c69 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -118,6 +118,7 @@ static int soc_mixer_reg_to_ctl(struct soc_mixer_control *mc, unsigned int reg_v if (mc->sign_bit) val = sign_extend32(val, mc->sign_bit); + val = clamp(val, mc->min, mc->max); val -= mc->min; if (mc->invert) diff --git a/sound/soc/sof/ipc3-dtrace.c b/sound/soc/sof/ipc3-dtrace.c index 049b6204f5cb64..50700f5cb0effd 100644 --- a/sound/soc/sof/ipc3-dtrace.c +++ b/sound/soc/sof/ipc3-dtrace.c @@ -126,7 +126,7 @@ static int trace_filter_parse(struct snd_sof_dev *sdev, char *string, capacity += TRACE_FILTER_ELEMENTS_PER_ENTRY; entry = strchr(entry + 1, entry_delimiter[0]); } - *out = kmalloc(capacity * sizeof(**out), GFP_KERNEL); 
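(A note on the kmalloc() to kmalloc_array() conversion below: kmalloc_array(n, size, flags) is the overflow-checked form of kmalloc(n * size, flags). It returns NULL when n * size would overflow instead of allocating a truncated buffer, so the change hardens the allocation without altering behaviour for valid sizes.)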
+ *out = kmalloc_array(capacity, sizeof(**out), GFP_KERNEL); if (!*out) return -ENOMEM; diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index c700972d32ed38..6d81969e181c84 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -652,14 +652,14 @@ static int ipc4_ssp_dai_config_pcm_params_match(struct snd_sof_dev *sdev, if (params_rate(params) == le32_to_cpu(hw_config->fsync_rate) && params_width(params) == le32_to_cpu(hw_config->tdm_slot_width) && - params_channels(params) == le32_to_cpu(hw_config->tdm_slots)) { + params_channels(params) <= le32_to_cpu(hw_config->tdm_slots)) { current_config = le32_to_cpu(hw_config->id); partial_match = false; /* best match found */ break; } else if (current_config < 0 && params_rate(params) == le32_to_cpu(hw_config->fsync_rate) && - params_channels(params) == le32_to_cpu(hw_config->tdm_slots)) { + params_channels(params) <= le32_to_cpu(hw_config->tdm_slots)) { current_config = le32_to_cpu(hw_config->id); partial_match = true; /* keep looking for better match */ diff --git a/sound/soc/sof/sof-client-probes.c b/sound/soc/sof/sof-client-probes.c index a3785a76fc18e1..fc7975fb6e6f1c 100644 --- a/sound/soc/sof/sof-client-probes.c +++ b/sound/soc/sof/sof-client-probes.c @@ -539,7 +539,7 @@ static int sof_probes_client_probe(struct auxiliary_device *auxdev, card->dai_link = links; /* set idle_bias_off to prevent the core from resuming the card->dev */ - card->dapm.idle_bias_off = true; + card->dapm.idle_bias = false; snd_soc_card_set_drvdata(card, cdev); diff --git a/sound/soc/stm/stm32_sai_sub.c b/sound/soc/stm/stm32_sai_sub.c index 463a2b7d023b9c..0ae1eae2a59e2f 100644 --- a/sound/soc/stm/stm32_sai_sub.c +++ b/sound/soc/stm/stm32_sai_sub.c @@ -672,6 +672,14 @@ static int stm32_sai_set_sysclk(struct snd_soc_dai *cpu_dai, struct stm32_sai_sub_data *sai = snd_soc_dai_get_drvdata(cpu_dai); int ret; + /* + * The mclk rate is determined at runtime from the audio stream rate. + * Skip calls to the set_sysclk callback that are not relevant during the + * initialization phase. 
+ */ + if (!snd_soc_card_is_instantiated(cpu_dai->component->card)) + return 0; + if (dir == SND_SOC_CLOCK_OUT && sai->sai_mclk) { ret = stm32_sai_sub_reg_up(sai, STM_SAI_CR1_REGX, SAI_XCR1_NODIV, diff --git a/sound/spi/at73c213.c b/sound/spi/at73c213.c index 0ece7ccbd55fae..0c2394733dc4bb 100644 --- a/sound/spi/at73c213.c +++ b/sound/spi/at73c213.c @@ -273,9 +273,8 @@ static int snd_at73c213_pcm_trigger(struct snd_pcm_substream *substream, int cmd) { struct snd_at73c213 *chip = snd_pcm_substream_chip(substream); - int retval = 0; - spin_lock(&chip->lock); + guard(spinlock)(&chip->lock); switch (cmd) { case SNDRV_PCM_TRIGGER_START: @@ -288,13 +287,11 @@ static int snd_at73c213_pcm_trigger(struct snd_pcm_substream *substream, break; default: dev_dbg(&chip->spi->dev, "spurious command %x\n", cmd); - retval = -EINVAL; + return -EINVAL; break; } - spin_unlock(&chip->lock); - - return retval; + return 0; } static snd_pcm_uframes_t @@ -358,30 +355,29 @@ static irqreturn_t snd_at73c213_interrupt(int irq, void *dev_id) int next_period; int retval = IRQ_NONE; - spin_lock(&chip->lock); + scoped_guard(spinlock, &chip->lock) { + block_size = frames_to_bytes(runtime, runtime->period_size); + status = ssc_readl(chip->ssc->regs, IMR); - block_size = frames_to_bytes(runtime, runtime->period_size); - status = ssc_readl(chip->ssc->regs, IMR); - - if (status & SSC_BIT(IMR_ENDTX)) { - chip->period++; - if (chip->period == runtime->periods) - chip->period = 0; - next_period = chip->period + 1; - if (next_period == runtime->periods) - next_period = 0; - - offset = block_size * next_period; - - ssc_writel(chip->ssc->regs, PDC_TNPR, - (long)runtime->dma_addr + offset); - ssc_writel(chip->ssc->regs, PDC_TNCR, - runtime->period_size * runtime->channels); - retval = IRQ_HANDLED; - } + if (status & SSC_BIT(IMR_ENDTX)) { + chip->period++; + if (chip->period == runtime->periods) + chip->period = 0; + next_period = chip->period + 1; + if (next_period == runtime->periods) + next_period = 0; + + offset = block_size * next_period; - ssc_readl(chip->ssc->regs, IMR); - spin_unlock(&chip->lock); + ssc_writel(chip->ssc->regs, PDC_TNPR, + (long)runtime->dma_addr + offset); + ssc_writel(chip->ssc->regs, PDC_TNCR, + runtime->period_size * runtime->channels); + retval = IRQ_HANDLED; + } + + ssc_readl(chip->ssc->regs, IMR); + } if (status & SSC_BIT(IMR_ENDTX)) snd_pcm_period_elapsed(chip->substream); diff --git a/sound/usb/card.c b/sound/usb/card.c index 0265206a8e8cf3..1d5a65eac93350 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -73,7 +73,7 @@ static bool lowlatency = true; static char *quirk_alias[SNDRV_CARDS]; static char *delayed_register[SNDRV_CARDS]; static bool implicit_fb[SNDRV_CARDS]; -static unsigned int quirk_flags[SNDRV_CARDS]; +static char *quirk_flags[SNDRV_CARDS]; bool snd_usb_use_vmalloc = true; bool snd_usb_skip_validation; @@ -103,13 +103,32 @@ module_param_array(delayed_register, charp, NULL, 0444); MODULE_PARM_DESC(delayed_register, "Quirk for delayed registration, given by id:iface, e.g. 
0123abcd:4."); module_param_array(implicit_fb, bool, NULL, 0444); MODULE_PARM_DESC(implicit_fb, "Apply generic implicit feedback sync mode."); -module_param_array(quirk_flags, uint, NULL, 0444); -MODULE_PARM_DESC(quirk_flags, "Driver quirk bit flags."); module_param_named(use_vmalloc, snd_usb_use_vmalloc, bool, 0444); MODULE_PARM_DESC(use_vmalloc, "Use vmalloc for PCM intermediate buffers (default: yes)."); module_param_named(skip_validation, snd_usb_skip_validation, bool, 0444); MODULE_PARM_DESC(skip_validation, "Skip unit descriptor validation (default: no)."); +/* protects quirk_flags */ +static DEFINE_MUTEX(quirk_flags_mutex); + +static int param_set_quirkp(const char *val, + const struct kernel_param *kp) +{ + guard(mutex)(&quirk_flags_mutex); + return param_set_charp(val, kp); +} + +static const struct kernel_param_ops param_ops_quirkp = { + .set = param_set_quirkp, + .get = param_get_charp, + .free = param_free_charp, +}; + +#define param_check_quirkp param_check_charp + +module_param_array(quirk_flags, quirkp, NULL, 0644); +MODULE_PARM_DESC(quirk_flags, "Add/modify USB audio quirks"); + /* * we keep the snd_usb_audio_t instances by ourselves for merging * the all interfaces on the same card as one sound device. @@ -692,6 +711,31 @@ static void usb_audio_make_longname(struct usb_device *dev, } } +static void snd_usb_init_quirk_flags(int idx, struct snd_usb_audio *chip) +{ + size_t i; + + guard(mutex)(&quirk_flags_mutex); + + /* old style option found: the position-based integer value */ + if (quirk_flags[idx] && + !kstrtou32(quirk_flags[idx], 0, &chip->quirk_flags)) { + snd_usb_apply_flag_dbg("module param", chip, chip->quirk_flags); + return; + } + + /* take the default quirk from the quirk table */ + snd_usb_init_quirk_flags_table(chip); + + /* add or correct quirk bits from options */ + for (i = 0; i < ARRAY_SIZE(quirk_flags); i++) { + if (!quirk_flags[i] || !*quirk_flags[i]) + break; + + snd_usb_init_quirk_flags_parse_string(chip, quirk_flags[i]); + } +} + /* * create a chip instance and set its names. 
*/ @@ -750,10 +794,7 @@ static int snd_usb_audio_create(struct usb_interface *intf, INIT_LIST_HEAD(&chip->midi_v2_list); INIT_LIST_HEAD(&chip->mixer_list); - if (quirk_flags[idx]) - chip->quirk_flags = quirk_flags[idx]; - else - snd_usb_init_quirk_flags(chip); + snd_usb_init_quirk_flags(idx, chip); card->private_free = snd_usb_audio_free; diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 1912d59627bddc..dd8249e7597009 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1511,15 +1511,14 @@ static void snd_usbmidi_free(struct snd_usb_midi *umidi) { int i; + if (!umidi->disconnected) + snd_usbmidi_disconnect(&umidi->list); + for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; - if (ep->out) - snd_usbmidi_out_endpoint_delete(ep->out); - if (ep->in) - snd_usbmidi_in_endpoint_delete(ep->in); + kfree(ep->out); } mutex_destroy(&umidi->mutex); - timer_shutdown_sync(&umidi->error_timer); kfree(umidi); } diff --git a/sound/usb/mixer_s1810c.c b/sound/usb/mixer_s1810c.c index bee2a65fd3c9bf..15960d25e74882 100644 --- a/sound/usb/mixer_s1810c.c +++ b/sound/usb/mixer_s1810c.c @@ -93,6 +93,7 @@ struct s1810c_ctl_packet { #define SC1810C_CTL_LINE_SW 0 #define SC1810C_CTL_MUTE_SW 1 +#define SC1824C_CTL_MONO_SW 2 #define SC1810C_CTL_AB_SW 3 #define SC1810C_CTL_48V_SW 4 @@ -123,6 +124,7 @@ struct s1810c_state_packet { #define SC1810C_STATE_48V_SW 58 #define SC1810C_STATE_LINE_SW 59 #define SC1810C_STATE_MUTE_SW 60 +#define SC1824C_STATE_MONO_SW 61 #define SC1810C_STATE_AB_SW 62 struct s1810_mixer_state { @@ -145,12 +147,7 @@ snd_s1810c_send_ctl_packet(struct usb_device *dev, u32 a, pkt.b = b; pkt.c = c; pkt.d = d; - /* - * Value for settings 0/1 for this - * output channel is always 0 (probably because - * there is no ADAT output on 1810c) - */ - pkt.e = (c == 4) ? 0 : e; + pkt.e = e; ret = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), SC1810C_CMD_REQ, @@ -213,115 +210,164 @@ snd_sc1810c_get_status_field(struct usb_device *dev, */ static int snd_s1810c_init_mixer_maps(struct snd_usb_audio *chip) { - u32 a, b, c, e, n, off; + u32 a, b, c, e, n, off, left, right; struct usb_device *dev = chip->dev; - /* Set initial volume levels ? */ - a = 0x64; - e = 0xbc; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { - /* This channel to all outputs ? */ - for (c = 0; c <= 8; c++) { + switch (chip->usb_id) { + case USB_ID(0x194f, 0x010c): /* 1810c */ + /* Set initial volume levels ? */ + a = 0x64; + e = 0xbc; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { + /* This channel to all outputs ? */ + for (c = 0; c <= 8; c++) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + } + /* This channel to main output (again) */ + snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); + } + /* + * I noticed on UC that DAW channels have different + * initial volumes, so this makes sense. + */ + e = 0xb53bf0; + } + + /* Connect analog outputs ? */ + a = 0x65; + e = 0x01000000; + for (b = 1; b < 3; b++) { + snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); + } + snd_s1810c_send_ctl_packet(dev, a, 0, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 0, 0, 1, e); + + /* Set initial volume levels for S/PDIF mappings ? 
*/ + a = 0x64; + e = 0xbc; + c = 3; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); } - /* This channel to main output (again) */ + e = 0xb53bf0; + } + + /* Connect S/PDIF output ? */ + a = 0x65; + e = 0x01000000; + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + + /* Connect all outputs (again) ? */ + a = 0x65; + e = 0x01000000; + for (b = 0; b < 4; b++) { snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); } - /* - * I noticed on UC that DAW channels have different - * initial volumes, so this makes sense. - */ - e = 0xb53bf0; - } - /* Connect analog outputs ? */ - a = 0x65; - e = 0x01000000; - for (b = 1; b < 3; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); - } - snd_s1810c_send_ctl_packet(dev, a, 0, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 0, 0, 1, e); - - /* Set initial volume levels for S/PDIF mappings ? */ - a = 0x64; - e = 0xbc; - c = 3; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + /* Basic routing to get sound out of the device */ + a = 0x64; + e = 0x01000000; + for (c = 0; c < 4; c++) { + for (b = 0; b < 36; b++) { + if ((c == 0 && b == 18) || /* DAW1/2 -> Main */ + (c == 1 && b == 20) || /* DAW3/4 -> Line3/4 */ + (c == 2 && b == 22) || /* DAW4/5 -> Line5/6 */ + (c == 3 && b == 24)) { /* DAW5/6 -> S/PDIF */ + /* Left */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); + b++; + /* Right */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); + } else { + /* Leave the rest disconnected */ + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); + } + } } - e = 0xb53bf0; - } - /* Connect S/PDIF output ? */ - a = 0x65; - e = 0x01000000; - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); - - /* Connect all outputs (again) ? */ - a = 0x65; - e = 0x01000000; - for (b = 0; b < 4; b++) { - snd_s1810c_send_ctl_packet(dev, a, b, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, 0, 1, e); - } - - /* Basic routing to get sound out of the device */ - a = 0x64; - e = 0x01000000; - for (c = 0; c < 4; c++) { - for (b = 0; b < 36; b++) { - if ((c == 0 && b == 18) || /* DAW1/2 -> Main */ - (c == 1 && b == 20) || /* DAW3/4 -> Line3/4 */ - (c == 2 && b == 22) || /* DAW4/5 -> Line5/6 */ - (c == 3 && b == 24)) { /* DAW5/6 -> S/PDIF */ - /* Left */ + /* Set initial volume levels for S/PDIF (again) ? */ + a = 0x64; + e = 0xbc; + c = 3; + for (n = 0; n < 2; n++) { + off = n * 18; + for (b = off; b < 18 + off; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); - b++; - /* Right */ - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); - } else { - /* Leave the rest disconnected */ - snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0); - snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0); } + e = 0xb53bf0; } - } - /* Set initial volume levels for S/PDIF (again) ? 
*/ - a = 0x64; - e = 0xbc; - c = 3; - for (n = 0; n < 2; n++) { - off = n * 18; - for (b = off; b < 18 + off; b++) { + /* Connect S/PDIF outputs (again) ? */ + a = 0x65; + e = 0x01000000; + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + + /* Again ? */ + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); + snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + break; + + case USB_ID(0x194f, 0x010d): /* 1824c */ + /* Set all output faders to unity gain */ + a = 0x65; + c = 0x00; + e = 0x01000000; + + for (b = 0; b < 9; b++) { snd_s1810c_send_ctl_packet(dev, a, b, c, 0, e); snd_s1810c_send_ctl_packet(dev, a, b, c, 1, e); } - e = 0xb53bf0; - } - - /* Connect S/PDIF outputs (again) ? */ - a = 0x65; - e = 0x01000000; - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); - - /* Again ? */ - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 0, e); - snd_s1810c_send_ctl_packet(dev, a, 3, 0, 1, e); + /* Set + * Daw 1 -> Line out 1 (left), Daw 2 -> Line out 2 (right) + * Daw 3 -> Line out 3, (left) Daw 4 -> Line out 4 (right) + * Daw 5 -> Line out 5, (left) Daw 6 -> Line out 6 (right) + * Daw 7 -> Line out 7, (left) Daw 8 -> Line out 8 (right) + * Daw 9 -> SPDIF out 1, (left) Daw 10 -> SPDIF out 2 (right) + * Daw 11 -> ADAT out 1, (left) Daw 12 -> ADAT out 2 (right) + * Daw 13 -> ADAT out 3, (left) Daw 14 -> ADAT out 4 (right) + * Daw 15 -> ADAT out 5, (left) Daw 16 -> ADAT out 6 (right) + * Daw 17 -> ADAT out 7, (left) Daw 18 -> ADAT out 8 (right) + * Everything else muted + */ + a = 0x64; + /* The first Daw channel is channel 18 */ + left = 18; + + for (c = 0; c < 9; c++) { + right = left + 1; + + for (b = 0; b < 36; b++) { + if (b == left) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x01000000); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x00); + } else if (b == right) { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x00); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x01000000); + } else { + snd_s1810c_send_ctl_packet(dev, a, b, c, 0, 0x00); + snd_s1810c_send_ctl_packet(dev, a, b, c, 1, 0x00); + } + } + left += 2; + } + break; + } return 0; } @@ -493,6 +539,15 @@ static const struct snd_kcontrol_new snd_s1810c_mute_sw = { .private_value = (SC1810C_STATE_MUTE_SW | SC1810C_CTL_MUTE_SW << 8) }; +static const struct snd_kcontrol_new snd_s1824c_mono_sw = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "Mono Main Out Switch", + .info = snd_ctl_boolean_mono_info, + .get = snd_s1810c_switch_get, + .put = snd_s1810c_switch_set, + .private_value = (SC1824C_STATE_MONO_SW | SC1824C_CTL_MONO_SW << 8) +}; + static const struct snd_kcontrol_new snd_s1810c_48v_sw = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "48V Phantom Power On Mic Inputs Switch", @@ -579,8 +634,17 @@ int snd_sc1810_init_mixer(struct usb_mixer_interface *mixer) if (ret < 0) return ret; - ret = snd_s1810c_switch_init(mixer, &snd_s1810c_ab_sw); - if (ret < 0) - return ret; + // The 1824c has a Mono Main switch instead of a + // A/B select switch. 
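A standalone sketch of the 1824c routing set up above, not part of the patch: the nested loops program a 9x36 crosspoint matrix in which output bus c takes mixer channel 18 + 2*c as its left input and channel 18 + 2*c + 1 as its right input, and every other crosspoint is muted:

	#include <stdio.h>

	int main(void)
	{
		/* DAW returns occupy mixer channels 18..35, two per output bus */
		for (unsigned int c = 0, left = 18; c < 9; c++, left += 2)
			printf("bus %u: left = ch %u, right = ch %u\n",
			       c, left, left + 1);
		return 0;
	}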
+ if (mixer->chip->usb_id == USB_ID(0x194f, 0x010d)) { + ret = snd_s1810c_switch_init(mixer, &snd_s1824c_mono_sw); + if (ret < 0) + return ret; + } else if (mixer->chip->usb_id == USB_ID(0x194f, 0x010c)) { + ret = snd_s1810c_switch_init(mixer, &snd_s1810c_ab_sw); + if (ret < 0) + return ret; + } + return ret; } diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index a17fb63f605e6b..cfb30a195364a4 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -538,38 +538,33 @@ static void uaudio_iommu_unmap(enum mem_type mtype, unsigned long iova, umap_size, iova, mapped_iova_size); } +static int uaudio_iommu_map_prot(bool dma_coherent) +{ + int prot = IOMMU_READ | IOMMU_WRITE; + + if (dma_coherent) + prot |= IOMMU_CACHE; + return prot; +} + /** - * uaudio_iommu_map() - maps iommu memory for adsp + * uaudio_iommu_map_pa() - maps iommu memory for adsp * @mtype: ring type * @dma_coherent: dma coherent * @pa: physical address for ring/buffer * @size: size of memory region - * @sgt: sg table for memory region * * Maps the XHCI related resources to a memory region that is assigned to be * used by the adsp. This will be mapped to the domain, which is created by * the ASoC USB backend driver. * */ -static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, - phys_addr_t pa, size_t size, - struct sg_table *sgt) +static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, + phys_addr_t pa, size_t size) { - struct scatterlist *sg; unsigned long iova = 0; - size_t total_len = 0; - unsigned long iova_sg; - phys_addr_t pa_sg; bool map = true; - size_t sg_len; - int prot; - int ret; - int i; - - prot = IOMMU_READ | IOMMU_WRITE; - - if (dma_coherent) - prot |= IOMMU_CACHE; + int prot = uaudio_iommu_map_prot(dma_coherent); switch (mtype) { case MEM_EVENT_RING: @@ -583,20 +578,41 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, &uaudio_qdev->xfer_ring_iova_size, &uaudio_qdev->xfer_ring_list, size); break; - case MEM_XFER_BUF: - iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, - &uaudio_qdev->xfer_buf_iova_size, - &uaudio_qdev->xfer_buf_list, size); - break; default: dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype); } if (!iova || !map) - goto done; + return 0; + + iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); - if (!sgt) - goto skip_sgt_map; + return iova; +} + +static unsigned long uaudio_iommu_map_xfer_buf(bool dma_coherent, size_t size, + struct sg_table *sgt) +{ + struct scatterlist *sg; + unsigned long iova = 0; + size_t total_len = 0; + unsigned long iova_sg; + phys_addr_t pa_sg; + size_t sg_len; + int prot = uaudio_iommu_map_prot(dma_coherent); + int ret; + int i; + + prot = IOMMU_READ | IOMMU_WRITE; + + if (dma_coherent) + prot |= IOMMU_CACHE; + + iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, + &uaudio_qdev->xfer_buf_iova_size, + &uaudio_qdev->xfer_buf_list, size); + if (!iova) + goto done; iova_sg = iova; for_each_sg(sgt->sgl, sg, sgt->nents, i) { @@ -618,11 +634,6 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, uaudio_iommu_unmap(MEM_XFER_BUF, iova, size, total_len); iova = 0; } - return iova; - -skip_sgt_map: - iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); - done: return iova; } @@ -1012,7 +1023,6 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, struct sg_table xfer_buf_sgt; dma_addr_t xfer_buf_dma; void *xfer_buf; - 
phys_addr_t xfer_buf_pa; u32 len = xfer_buf_len; bool dma_coherent; dma_addr_t xfer_buf_dma_sysdev; @@ -1043,18 +1053,12 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, if (!xfer_buf) return -ENOMEM; - /* Remapping is not possible if xfer_buf is outside of linear map */ - xfer_buf_pa = virt_to_phys(xfer_buf); - if (WARN_ON(!page_is_ram(PFN_DOWN(xfer_buf_pa)))) { - ret = -ENXIO; - goto unmap_sync; - } dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, xfer_buf_dma, len); /* map the physical buffer into sysdev as well */ - xfer_buf_dma_sysdev = uaudio_iommu_map(MEM_XFER_BUF, dma_coherent, - xfer_buf_pa, len, &xfer_buf_sgt); + xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent, + len, &xfer_buf_sgt); if (!xfer_buf_dma_sysdev) { ret = -ENOMEM; goto unmap_sync; @@ -1135,8 +1139,8 @@ uaudio_endpoint_setup(struct snd_usb_substream *subs, sg_free_table(sgt); /* data transfer ring */ - iova = uaudio_iommu_map(MEM_XFER_RING, dma_coherent, tr_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_XFER_RING, dma_coherent, tr_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; @@ -1199,8 +1203,8 @@ static int uaudio_event_ring_setup(struct snd_usb_substream *subs, mem_info->dma = sg_dma_address(sgt->sgl); sg_free_table(sgt); - iova = uaudio_iommu_map(MEM_EVENT_RING, dma_coherent, er_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_EVENT_RING, dma_coherent, er_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index d736a475035659..634cb4fb586f91 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1599,9 +1599,6 @@ int snd_usb_apply_interface_quirk(struct snd_usb_audio *chip, /* presonus studio 1810c: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x194f, 0x010c)) return s1810c_skip_setting_quirk(chip, iface, altno); - /* presonus studio 1824c: skip altsets incompatible with device_setup */ - if (chip->usb_id == USB_ID(0x194f, 0x010d)) - return s1810c_skip_setting_quirk(chip, iface, altno); return 0; } @@ -2446,7 +2443,85 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { {} /* terminator */ }; -void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) +#define QUIRK_STRING_ENTRY(x) \ + [QUIRK_TYPE_ ## x] = __stringify(x) + +static const char *const snd_usb_audio_quirk_flag_names[] = { + QUIRK_STRING_ENTRY(GET_SAMPLE_RATE), + QUIRK_STRING_ENTRY(SHARE_MEDIA_DEVICE), + QUIRK_STRING_ENTRY(ALIGN_TRANSFER), + QUIRK_STRING_ENTRY(TX_LENGTH), + QUIRK_STRING_ENTRY(PLAYBACK_FIRST), + QUIRK_STRING_ENTRY(SKIP_CLOCK_SELECTOR), + QUIRK_STRING_ENTRY(IGNORE_CLOCK_SOURCE), + QUIRK_STRING_ENTRY(ITF_USB_DSD_DAC), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY_1M), + QUIRK_STRING_ENTRY(CTL_MSG_DELAY_5M), + QUIRK_STRING_ENTRY(IFACE_DELAY), + QUIRK_STRING_ENTRY(VALIDATE_RATES), + QUIRK_STRING_ENTRY(DISABLE_AUTOSUSPEND), + QUIRK_STRING_ENTRY(IGNORE_CTL_ERROR), + QUIRK_STRING_ENTRY(DSD_RAW), + QUIRK_STRING_ENTRY(SET_IFACE_FIRST), + QUIRK_STRING_ENTRY(GENERIC_IMPLICIT_FB), + QUIRK_STRING_ENTRY(SKIP_IMPLICIT_FB), + QUIRK_STRING_ENTRY(IFACE_SKIP_CLOSE), + QUIRK_STRING_ENTRY(FORCE_IFACE_RESET), + QUIRK_STRING_ENTRY(FIXED_RATE), + QUIRK_STRING_ENTRY(MIC_RES_16), + QUIRK_STRING_ENTRY(MIC_RES_384), + QUIRK_STRING_ENTRY(MIXER_PLAYBACK_MIN_MUTE), + QUIRK_STRING_ENTRY(MIXER_CAPTURE_MIN_MUTE), + NULL +}; + +const char *snd_usb_quirk_flag_find_name(unsigned long index) +{ + if (index >= 
ARRAY_SIZE(snd_usb_audio_quirk_flag_names)) + return NULL; + + return snd_usb_audio_quirk_flag_names[index]; +} + +u32 snd_usb_quirk_flags_from_name(const char *name) +{ + int i; + + if (!name || !*name) + return 0; + + for (i = 0; snd_usb_audio_quirk_flag_names[i]; i++) { + if (strcasecmp(name, snd_usb_audio_quirk_flag_names[i]) == 0) + return BIT_U32(i); + } + + return 0; +} + +void snd_usb_apply_flag_dbg(const char *reason, + struct snd_usb_audio *chip, + unsigned long flag) +{ + unsigned long bit; + + for_each_set_bit(bit, &flag, BYTES_TO_BITS(sizeof(flag))) { + const char *name = snd_usb_audio_quirk_flag_names[bit]; + + if (name) + usb_audio_dbg(chip, + "From %s apply quirk flag %s for device %04x:%04x\n", + reason, name, USB_ID_VENDOR(chip->usb_id), + USB_ID_PRODUCT(chip->usb_id)); + else + usb_audio_warn(chip, + "From %s apply unknown quirk flag 0x%lx for device %04x:%04x\n", + reason, bit, USB_ID_VENDOR(chip->usb_id), + USB_ID_PRODUCT(chip->usb_id)); + } +} + +void snd_usb_init_quirk_flags_table(struct snd_usb_audio *chip) { const struct usb_audio_quirk_flags_table *p; @@ -2454,12 +2529,97 @@ void snd_usb_init_quirk_flags(struct snd_usb_audio *chip) if (chip->usb_id == p->id || (!USB_ID_PRODUCT(p->id) && USB_ID_VENDOR(chip->usb_id) == USB_ID_VENDOR(p->id))) { - usb_audio_dbg(chip, - "Set quirk_flags 0x%x for device %04x:%04x\n", - p->flags, USB_ID_VENDOR(chip->usb_id), - USB_ID_PRODUCT(chip->usb_id)); + snd_usb_apply_flag_dbg("builtin table", chip, p->flags); chip->quirk_flags |= p->flags; return; } } } + +void snd_usb_init_quirk_flags_parse_string(struct snd_usb_audio *chip, + const char *str) +{ + u16 chip_vid = USB_ID_VENDOR(chip->usb_id); + u16 chip_pid = USB_ID_PRODUCT(chip->usb_id); + u32 mask_flags, unmask_flags, bit; + char *p, *field, *flag; + bool is_unmask; + u16 vid, pid; + + char *val __free(kfree) = kstrdup(str, GFP_KERNEL); + + if (!val) + return; + + for (p = val; p && *p;) { + /* Each entry consists of VID:PID:flags */ + field = strsep(&p, ":"); + if (!field) + break; + + if (strcmp(field, "*") == 0) + vid = 0; + else if (kstrtou16(field, 16, &vid)) + break; + + field = strsep(&p, ":"); + if (!field) + break; + + if (strcmp(field, "*") == 0) + pid = 0; + else if (kstrtou16(field, 16, &pid)) + break; + + field = strsep(&p, ";"); + if (!field || !*field) + break; + + if ((vid != 0 && vid != chip_vid) || + (pid != 0 && pid != chip_pid)) + continue; + + /* Collect the flags */ + mask_flags = 0; + unmask_flags = 0; + while (field && *field) { + flag = strsep(&field, "|"); + + if (!flag) + break; + + if (*flag == '!') { + is_unmask = true; + flag++; + } else { + is_unmask = false; + } + + if (!kstrtou32(flag, 16, &bit)) { + if (is_unmask) + unmask_flags |= bit; + else + mask_flags |= bit; + + break; + } + + bit = snd_usb_quirk_flags_from_name(flag); + + if (bit) { + if (is_unmask) + unmask_flags |= bit; + else + mask_flags |= bit; + } else { + pr_warn("snd_usb_audio: unknown flag %s while parsing param quirk_flags\n", + flag); + } + } + + chip->quirk_flags &= ~unmask_flags; + chip->quirk_flags |= mask_flags; + snd_usb_apply_flag_dbg("module param", chip, + chip->quirk_flags); + } +} diff --git a/sound/usb/quirks.h b/sound/usb/quirks.h index f9bfd5ac7bab01..f24d6a5a197a64 100644 --- a/sound/usb/quirks.h +++ b/sound/usb/quirks.h @@ -48,6 +48,15 @@ void snd_usb_audioformat_attributes_quirk(struct snd_usb_audio *chip, struct audioformat *fp, int stream); -void snd_usb_init_quirk_flags(struct snd_usb_audio *chip); +void snd_usb_apply_flag_dbg(const char *reason, + struct 
snd_usb_audio *chip, + unsigned long flag); + +void snd_usb_init_quirk_flags_table(struct snd_usb_audio *chip); +void snd_usb_init_quirk_flags_parse_string(struct snd_usb_audio *chip, + const char *str); + +const char *snd_usb_quirk_flag_find_name(unsigned long flag); +u32 snd_usb_quirk_flags_from_name(const char *name); #endif /* __USBAUDIO_QUIRKS_H */ diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index 30b5102e3caed0..79978cae9799cd 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -226,31 +226,63 @@ extern bool snd_usb_skip_validation; * Similar to QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE, but for capture streams */ -#define QUIRK_FLAG_GET_SAMPLE_RATE (1U << 0) -#define QUIRK_FLAG_SHARE_MEDIA_DEVICE (1U << 1) -#define QUIRK_FLAG_ALIGN_TRANSFER (1U << 2) -#define QUIRK_FLAG_TX_LENGTH (1U << 3) -#define QUIRK_FLAG_PLAYBACK_FIRST (1U << 4) -#define QUIRK_FLAG_SKIP_CLOCK_SELECTOR (1U << 5) -#define QUIRK_FLAG_IGNORE_CLOCK_SOURCE (1U << 6) -#define QUIRK_FLAG_ITF_USB_DSD_DAC (1U << 7) -#define QUIRK_FLAG_CTL_MSG_DELAY (1U << 8) -#define QUIRK_FLAG_CTL_MSG_DELAY_1M (1U << 9) -#define QUIRK_FLAG_CTL_MSG_DELAY_5M (1U << 10) -#define QUIRK_FLAG_IFACE_DELAY (1U << 11) -#define QUIRK_FLAG_VALIDATE_RATES (1U << 12) -#define QUIRK_FLAG_DISABLE_AUTOSUSPEND (1U << 13) -#define QUIRK_FLAG_IGNORE_CTL_ERROR (1U << 14) -#define QUIRK_FLAG_DSD_RAW (1U << 15) -#define QUIRK_FLAG_SET_IFACE_FIRST (1U << 16) -#define QUIRK_FLAG_GENERIC_IMPLICIT_FB (1U << 17) -#define QUIRK_FLAG_SKIP_IMPLICIT_FB (1U << 18) -#define QUIRK_FLAG_IFACE_SKIP_CLOSE (1U << 19) -#define QUIRK_FLAG_FORCE_IFACE_RESET (1U << 20) -#define QUIRK_FLAG_FIXED_RATE (1U << 21) -#define QUIRK_FLAG_MIC_RES_16 (1U << 22) -#define QUIRK_FLAG_MIC_RES_384 (1U << 23) -#define QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE (1U << 24) -#define QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE (1U << 25) +enum { + QUIRK_TYPE_GET_SAMPLE_RATE = 0, + QUIRK_TYPE_SHARE_MEDIA_DEVICE = 1, + QUIRK_TYPE_ALIGN_TRANSFER = 2, + QUIRK_TYPE_TX_LENGTH = 3, + QUIRK_TYPE_PLAYBACK_FIRST = 4, + QUIRK_TYPE_SKIP_CLOCK_SELECTOR = 5, + QUIRK_TYPE_IGNORE_CLOCK_SOURCE = 6, + QUIRK_TYPE_ITF_USB_DSD_DAC = 7, + QUIRK_TYPE_CTL_MSG_DELAY = 8, + QUIRK_TYPE_CTL_MSG_DELAY_1M = 9, + QUIRK_TYPE_CTL_MSG_DELAY_5M = 10, + QUIRK_TYPE_IFACE_DELAY = 11, + QUIRK_TYPE_VALIDATE_RATES = 12, + QUIRK_TYPE_DISABLE_AUTOSUSPEND = 13, + QUIRK_TYPE_IGNORE_CTL_ERROR = 14, + QUIRK_TYPE_DSD_RAW = 15, + QUIRK_TYPE_SET_IFACE_FIRST = 16, + QUIRK_TYPE_GENERIC_IMPLICIT_FB = 17, + QUIRK_TYPE_SKIP_IMPLICIT_FB = 18, + QUIRK_TYPE_IFACE_SKIP_CLOSE = 19, + QUIRK_TYPE_FORCE_IFACE_RESET = 20, + QUIRK_TYPE_FIXED_RATE = 21, + QUIRK_TYPE_MIC_RES_16 = 22, + QUIRK_TYPE_MIC_RES_384 = 23, + QUIRK_TYPE_MIXER_PLAYBACK_MIN_MUTE = 24, + QUIRK_TYPE_MIXER_CAPTURE_MIN_MUTE = 25, +/* Please also edit snd_usb_audio_quirk_flag_names */ +}; + +#define QUIRK_FLAG(x) BIT_U32(QUIRK_TYPE_ ## x) + +#define QUIRK_FLAG_GET_SAMPLE_RATE QUIRK_FLAG(GET_SAMPLE_RATE) +#define QUIRK_FLAG_SHARE_MEDIA_DEVICE QUIRK_FLAG(SHARE_MEDIA_DEVICE) +#define QUIRK_FLAG_ALIGN_TRANSFER QUIRK_FLAG(ALIGN_TRANSFER) +#define QUIRK_FLAG_TX_LENGTH QUIRK_FLAG(TX_LENGTH) +#define QUIRK_FLAG_PLAYBACK_FIRST QUIRK_FLAG(PLAYBACK_FIRST) +#define QUIRK_FLAG_SKIP_CLOCK_SELECTOR QUIRK_FLAG(SKIP_CLOCK_SELECTOR) +#define QUIRK_FLAG_IGNORE_CLOCK_SOURCE QUIRK_FLAG(IGNORE_CLOCK_SOURCE) +#define QUIRK_FLAG_ITF_USB_DSD_DAC QUIRK_FLAG(ITF_USB_DSD_DAC) +#define QUIRK_FLAG_CTL_MSG_DELAY QUIRK_FLAG(CTL_MSG_DELAY) +#define QUIRK_FLAG_CTL_MSG_DELAY_1M QUIRK_FLAG(CTL_MSG_DELAY_1M) +#define 
QUIRK_FLAG_CTL_MSG_DELAY_5M QUIRK_FLAG(CTL_MSG_DELAY_5M) +#define QUIRK_FLAG_IFACE_DELAY QUIRK_FLAG(IFACE_DELAY) +#define QUIRK_FLAG_VALIDATE_RATES QUIRK_FLAG(VALIDATE_RATES) +#define QUIRK_FLAG_DISABLE_AUTOSUSPEND QUIRK_FLAG(DISABLE_AUTOSUSPEND) +#define QUIRK_FLAG_IGNORE_CTL_ERROR QUIRK_FLAG(IGNORE_CTL_ERROR) +#define QUIRK_FLAG_DSD_RAW QUIRK_FLAG(DSD_RAW) +#define QUIRK_FLAG_SET_IFACE_FIRST QUIRK_FLAG(SET_IFACE_FIRST) +#define QUIRK_FLAG_GENERIC_IMPLICIT_FB QUIRK_FLAG(GENERIC_IMPLICIT_FB) +#define QUIRK_FLAG_SKIP_IMPLICIT_FB QUIRK_FLAG(SKIP_IMPLICIT_FB) +#define QUIRK_FLAG_IFACE_SKIP_CLOSE QUIRK_FLAG(IFACE_SKIP_CLOSE) +#define QUIRK_FLAG_FORCE_IFACE_RESET QUIRK_FLAG(FORCE_IFACE_RESET) +#define QUIRK_FLAG_FIXED_RATE QUIRK_FLAG(FIXED_RATE) +#define QUIRK_FLAG_MIC_RES_16 QUIRK_FLAG(MIC_RES_16) +#define QUIRK_FLAG_MIC_RES_384 QUIRK_FLAG(MIC_RES_384) +#define QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE QUIRK_FLAG(MIXER_PLAYBACK_MIN_MUTE) +#define QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE QUIRK_FLAG(MIXER_CAPTURE_MIN_MUTE) #endif /* __USBAUDIO_H */ diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h index c25b5853181dba..d68fad63c8b732 100644 --- a/tools/arch/loongarch/include/asm/inst.h +++ b/tools/arch/loongarch/include/asm/inst.h @@ -51,6 +51,10 @@ enum reg2i16_op { bgeu_op = 0x1b, }; +enum reg3_op { + amswapw_op = 0x70c0, +}; + struct reg0i15_format { unsigned int immediate : 15; unsigned int opcode : 17; @@ -96,6 +100,13 @@ struct reg2i16_format { unsigned int opcode : 6; }; +struct reg3_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int opcode : 17; +}; + union loongarch_instruction { unsigned int word; struct reg0i15_format reg0i15_format; @@ -105,6 +116,7 @@ union loongarch_instruction { struct reg2i12_format reg2i12_format; struct reg2i14_format reg2i14_format; struct reg2i16_format reg2i16_format; + struct reg3_format reg3_format; }; #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 0dfc09254f99af..56d7367ee344c1 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -468,13 +468,13 @@ #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define csr_swap(csr, val) \ ({ \ @@ -536,6 +536,6 @@ : "memory"); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_RISCV_CSR_H */ diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h index 662aca03984817..0665b117f30f27 100644 --- a/tools/arch/riscv/include/asm/vdso/processor.h +++ b/tools/arch/riscv/include/asm/vdso/processor.h @@ -2,7 +2,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -27,6 +27,6 @@ static inline void cpu_relax(void) barrier(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 183aa662b16523..099e926595bd98 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 
/* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 0e5abd896ad42d..c683d609934b79 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if 
(!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index b65c3ba5fa1410..f627196eb79662 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index dfbcc64059412a..ffcb0e27453b8e 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index bce69c6bfa6972..1d1c57c74d1fc2 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ int insn_get_prefixes(struct insn *insn) } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ int insn_get_prefixes(struct insn *insn) if 
(insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb9527..2a4e69ecc2de0f 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
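A note on the (W=0)/(W=1) superscripts defined just below, illustrative only: one XOP opcode can encode two operand orders, and the decoder selects the form matching the XOP.W bit, which X86_XOP_W() masks out of the third prefix byte. Assuming an already-decoded struct insn:

	insn_byte_t w = X86_XOP_W(insn->xop_prefix.bytes[2]);	/* prefix byte 3 */
	/* w != 0 selects the (W=1) order, e.g. VPCMOV Vx,Hx,Lx,Wx */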
+#
+# XOP Superscripts
+# (W=0): this opcode requires XOP.W == 0
+# (W=1): this opcode requires XOP.W == 1
 #
 # Last Prefix Superscripts
 # - (66): the last prefix is 0x66
@@ -194,7 +199,7 @@ AVXcode:
 8c: MOV Ev,Sw
 8d: LEA Gv,M
 8e: MOV Sw,Ew
-8f: Grp1A (1A) | POP Ev (d64)
+8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix)
 # 0x90 - 0x9f
 90: NOP | PAUSE (F3) | XCHG r8,rAX
 91: XCHG rCX/r9,rAX
@@ -1106,6 +1111,84 @@ AVXcode: 7
 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B)
 EndTable
 
+# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5
+Table: XOP map 8h
+Referrer:
+XOPcode: 0
+85: VPMACSSWW Vo,Ho,Wo,Lo
+86: VPMACSSWD Vo,Ho,Wo,Lo
+87: VPMACSSDQL Vo,Ho,Wo,Lo
+8e: VPMACSSDD Vo,Ho,Wo,Lo
+8f: VPMACSSDQH Vo,Ho,Wo,Lo
+95: VPMACSWW Vo,Ho,Wo,Lo
+96: VPMACSWD Vo,Ho,Wo,Lo
+97: VPMACSDQL Vo,Ho,Wo,Lo
+9e: VPMACSDD Vo,Ho,Wo,Lo
+9f: VPMACSDQH Vo,Ho,Wo,Lo
+a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1)
+a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1)
+a6: VPMADCSSWD Vo,Ho,Wo,Lo
+b6: VPMADCSWD Vo,Ho,Wo,Lo
+c0: VPROTB Vo,Wo,Ib
+c1: VPROTW Vo,Wo,Ib
+c2: VPROTD Vo,Wo,Ib
+c3: VPROTQ Vo,Wo,Ib
+cc: VPCOMccB Vo,Ho,Wo,Ib
+cd: VPCOMccW Vo,Ho,Wo,Ib
+ce: VPCOMccD Vo,Ho,Wo,Ib
+cf: VPCOMccQ Vo,Ho,Wo,Ib
+ec: VPCOMccUB Vo,Ho,Wo,Ib
+ed: VPCOMccUW Vo,Ho,Wo,Ib
+ee: VPCOMccUD Vo,Ho,Wo,Ib
+ef: VPCOMccUQ Vo,Ho,Wo,Ib
+EndTable
+
+Table: XOP map 9h
+Referrer:
+XOPcode: 1
+01: GrpXOP1
+02: GrpXOP2
+12: GrpXOP3
+80: VFRCZPS Vx,Wx
+81: VFRCZPD Vx,Wx
+82: VFRCZSS Vq,Wss
+83: VFRCZSD Vq,Wsd
+90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+91: VPROTW Vo,Wo,Ho (W=0) | VPROTW Vo,Ho,Wo (W=1)
+92: VPROTD Vo,Wo,Ho (W=0) | VPROTD Vo,Ho,Wo (W=1)
+93: VPROTQ Vo,Wo,Ho (W=0) | VPROTQ Vo,Ho,Wo (W=1)
+94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1)
+95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1)
+96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1)
+97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1)
+98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1)
+99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1)
+9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1)
+9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1)
+c1: VPHADDBW Vo,Wo
+c2: VPHADDBD Vo,Wo
+c3: VPHADDBQ Vo,Wo
+c6: VPHADDWD Vo,Wo
+c7: VPHADDWQ Vo,Wo
+cb: VPHADDDQ Vo,Wo
+d1: VPHADDUBWD Vo,Wo
+d2: VPHADDUBD Vo,Wo
+d3: VPHADDUBQ Vo,Wo
+d6: VPHADDUWD Vo,Wo
+d7: VPHADDUWQ Vo,Wo
+db: VPHADDUDQ Vo,Wo
+e1: VPHSUBBW Vo,Wo
+e2: VPHSUBWD Vo,Wo
+e3: VPHSUBDQ Vo,Wo
+EndTable
+
+Table: XOP map Ah
+Referrer:
+XOPcode: 2
+10: BEXTR Gy,Ey,Id
+12: GrpXOP4
+EndTable
+
 GrpTable: Grp1
 0: ADD
 1: OR
@@ -1320,3 +1403,29 @@ GrpTable: GrpRNG
 4: xcrypt-cfb
 5: xcrypt-ofb
 EndTable
+
+# GrpXOP1-4 are shown in AMD APM Vol.3 Appendix A as XOP group #1-4
+GrpTable: GrpXOP1
+1: BLCFILL By,Ey (xop)
+2: BLSFILL By,Ey (xop)
+3: BLCS By,Ey (xop)
+4: TZMSK By,Ey (xop)
+5: BLCIC By,Ey (xop)
+6: BLSIC By,Ey (xop)
+7: T1MSKC By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP2
+1: BLCMSK By,Ey (xop)
+6: BLCI By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP3
+0: LLWPCB Ry (xop)
+1: SLWPCB Ry (xop)
+EndTable
+
+GrpTable: GrpXOP4
+0: LWPINS By,Ed,Id (xop)
+1: LWPVAL By,Ed,Id (xop)
+EndTable
diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk
index 2c19d7fc8a8559..7ea1b75e59b742 100644
--- a/tools/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk
@@ -21,6 +21,7 @@ function clear_vars() {
 	eid = -1 # escape id
 	gid = -1 # group id
 	aid = -1 # AVX id
+	xopid = -1 # XOP id
 	tname = ""
 }
 
@@ -39,9 +40,11 @@ BEGIN {
 	ggid = 1
 	geid = 1
 	gaid = 0
+
gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index ca860fd97d8dfc..d0a36f442db72f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* -*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } +*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | 
**--use-loader** } | [ { **-S** | **--sign** } {**-k** } **-i** ] }
 
 *COMMAND* := { **object** | **skeleton** | **help** }
 
@@ -186,6 +186,17 @@ OPTIONS
 	skeleton). A light skeleton contains a loader eBPF program. It does not use
 	the majority of the libbpf infrastructure, and does not need libelf.
 
+-S, --sign
+	For skeletons, generate a signed skeleton. This option must be used with
+	**-k** and **-i**. Using this flag implicitly enables **--use-loader**.
+
+-k
+	Path to the private key file in PEM format, required for signing.
+
+-i
+	Path to the X.509 certificate file in PEM or DER format, required for
+	signing.
+
 EXAMPLES
 ========
 **$ cat example1.bpf.c**
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index f69fd92df8d89d..009633294b0934 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -18,7 +18,7 @@ SYNOPSIS
 *OPTIONS* := { |COMMON_OPTIONS| |
 { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
-{ **-L** | **--use-loader** } }
+{ **-L** | **--use-loader** } | [ { **-S** | **--sign** } **-k** **-i** ] }
 
 *COMMANDS* :=
 { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** |
@@ -248,6 +248,18 @@ OPTIONS
 	creating the maps, and loading the programs (see **bpftool prog tracelog**
 	as a way to dump those messages).
 
+-S, --sign
+	Enable signing of the BPF program before loading. This option must be
+	used with **-k** and **-i**. Using this flag implicitly enables
+	**--use-loader**.
+
+-k
+	Path to the private key file in PEM format, required when signing.
+
+-i
+	Path to the X.509 certificate file in PEM or DER format, required when
+	signing.
+
 EXAMPLES
 ========
 **# bpftool prog show**
diff --git a/tools/bpf/bpftool/Documentation/bpftool-token.rst b/tools/bpf/bpftool/Documentation/bpftool-token.rst
new file mode 100644
index 00000000000000..d082c499cfe370
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-token.rst
@@ -0,0 +1,64 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+================
+bpftool-token
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF tokens
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+.. include:: substitutions.rst
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **token** *COMMAND*
+
+	*OPTIONS* := { |COMMON_OPTIONS| }
+
+	*COMMANDS* := { **show** | **list** | **help** }
+
+TOKEN COMMANDS
+===============
+
+|	**bpftool** **token** { **show** | **list** }
+|	**bpftool** **token help**
+|
+
+DESCRIPTION
+===========
+bpftool token { show | list }
+	List BPF token information for each *bpffs* mount point containing token
+	information on the system. Information includes mount point path, allowed
+	**bpf**\ () system call commands, maps, programs, and attach types for the
+	token.
+
+bpftool token help
+	Print short help message.
+
+OPTIONS
+========
+.. 
include:: common_options.rst + +EXAMPLES +======== +| +| **# mkdir -p /sys/fs/bpf/token** +| **# mount -t bpf bpffs /sys/fs/bpf/token** \ +| **-o delegate_cmds=prog_load:map_create** \ +| **-o delegate_progs=kprobe** \ +| **-o delegate_attachs=xdp** +| **# bpftool token list** + +:: + + token_info /sys/fs/bpf/token + allowed_cmds: + map_create prog_load + allowed_maps: + allowed_progs: + kprobe + allowed_attachs: + xdp diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 9e9a5f006cd2aa..586d1b2595d16b 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz +LIBS = $(LIBBPF) -lelf -lz -lcrypto +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd @@ -194,7 +194,7 @@ endif BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o sign.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index a759ba24471d67..53bcfeb1a76e62 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -262,7 +262,7 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then local c='--version --json --pretty --bpffs --mapcompat --debug \ - --use-loader --base-btf' + --use-loader --base-btf --sign -i -k' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -283,7 +283,7 @@ _bpftool() _sysfs_get_netdevs return 0 ;; - file|pinned|-B|--base-btf) + file|pinned|-B|--base-btf|-i|-k) _filedir return 0 ;; @@ -296,13 +296,21 @@ _bpftool() # Remove all options so completions don't have to deal with them. 
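    # Options that take an argument (-B/--base-btf and the signing
    # options -i/-k) are stripped together with their argument, shifting
    # cword by two; argument-less flags are stripped alone.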
local i pprev for (( i=1; i < ${#words[@]}; )); do - if [[ ${words[i]::1} == - ]] && - [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then - words=( "${words[@]:0:i}" "${words[@]:i+1}" ) - [[ $i -le $cword ]] && cword=$(( cword - 1 )) - else - i=$(( ++i )) - fi + case ${words[i]} in + # Remove option and its argument + -B|--base-btf|-i|-k) + words=( "${words[@]:0:i}" "${words[@]:i+2}" ) + [[ $i -le $(($cword + 1)) ]] && cword=$(( cword - 2 )) + ;; + # No argument, remove option only + -*) + words=( "${words[@]:0:i}" "${words[@]:i+1}" ) + [[ $i -le $cword ]] && cword=$(( cword - 1 )) + ;; + *) + i=$(( ++i )) + ;; + esac done cur=${words[cword]} prev=${words[cword - 1]} @@ -1215,6 +1223,17 @@ _bpftool() ;; esac ;; + token) + case $command in + show|list) + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 4e896d8a2416e9..ff12628593aecd 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -38,7 +38,7 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, __u32 info_len = sizeof(info); const char *prog_name = NULL; struct btf *prog_btf = NULL; - struct bpf_func_info finfo; + struct bpf_func_info finfo = {}; __u32 finfo_rec_size; char prog_str[1024]; int err; diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 944ebe21a2169a..ec356deb27c9ec 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -2,6 +2,10 @@ // Copyright (C) 2017 Facebook // Author: Roman Gushchin +#undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #define _XOPEN_SOURCE 500 #include #include diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index b07317d2842fe8..e8daf963ecef4d 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ #include #include /* libbpf_num_possible_cpus */ #include +#include #include "main.h" @@ -1208,3 +1210,94 @@ int pathname_concat(char *buf, int buf_sz, const char *path, return 0; } + +static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, + char **value) +{ + char *sep; + + while (gzgets(file, buf, n)) { + if (strncmp(buf, "CONFIG_", 7)) + continue; + + sep = strchr(buf, '='); + if (!sep) + continue; + + /* Trim ending '\n' */ + buf[strlen(buf) - 1] = '\0'; + + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) + continue; + + *value = sep + 1; + return true; + } + + return false; +} + +int read_kernel_config(const struct kernel_config_option *requested_options, + size_t num_options, char **out_values, + const char *define_prefix) +{ + struct utsname utsn; + char path[PATH_MAX]; + gzFile file = NULL; + char buf[4096]; + char *value; + size_t i; + int ret = 0; + + if (!requested_options || !out_values || num_options == 0) + return -1; + + if (!uname(&utsn)) { + snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); + + /* gzopen also accepts uncompressed files. */ + file = gzopen(path, "r"); + } + + if (!file) { + /* Some distributions build with CONFIG_IKCONFIG=y and put the + * config file at /proc/config.gz. 
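+		 * Exposing it there additionally requires CONFIG_IKCONFIG_PROC=y.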
+ */ + file = gzopen("/proc/config.gz", "r"); + } + + if (!file) { + p_info("skipping kernel config, can't open file: %s", + strerror(errno)); + return -1; + } + + if (!gzgets(file, buf, sizeof(buf)) || !gzgets(file, buf, sizeof(buf))) { + p_info("skipping kernel config, can't read from file: %s", + strerror(errno)); + ret = -1; + goto end_parse; + } + + if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { + p_info("skipping kernel config, can't find correct file"); + ret = -1; + goto end_parse; + } + + while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { + for (i = 0; i < num_options; i++) { + if ((define_prefix && !requested_options[i].macro_dump) || + out_values[i] || strcmp(buf, requested_options[i].name)) + continue; + + out_values[i] = strdup(value); + } + } + +end_parse: + gzclose(file); + return ret; +} diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 24fecdf8e43078..0f6070a0c8e717 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -10,7 +10,6 @@ #ifdef USE_LIBCAP #include #endif -#include #include #include @@ -18,7 +17,6 @@ #include #include -#include #include "main.h" @@ -327,40 +325,9 @@ static void probe_jit_limit(void) } } -static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, - char **value) -{ - char *sep; - - while (gzgets(file, buf, n)) { - if (strncmp(buf, "CONFIG_", 7)) - continue; - - sep = strchr(buf, '='); - if (!sep) - continue; - - /* Trim ending '\n' */ - buf[strlen(buf) - 1] = '\0'; - - /* Split on '=' and ensure that a value is present. */ - *sep = '\0'; - if (!sep[1]) - continue; - - *value = sep + 1; - return true; - } - - return false; -} - static void probe_kernel_image_config(const char *define_prefix) { - static const struct { - const char * const name; - bool macro_dump; - } options[] = { + struct kernel_config_option options[] = { /* Enable BPF */ { "CONFIG_BPF", }, /* Enable bpf() syscall */ @@ -435,52 +402,11 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; - struct utsname utsn; - char path[PATH_MAX]; - gzFile file = NULL; - char buf[4096]; - char *value; size_t i; - if (!uname(&utsn)) { - snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); - - /* gzopen also accepts uncompressed files. */ - file = gzopen(path, "r"); - } - - if (!file) { - /* Some distributions build with CONFIG_IKCONFIG=y and put the - * config file at /proc/config.gz. 
- */ - file = gzopen("/proc/config.gz", "r"); - } - if (!file) { - p_info("skipping kernel config, can't open file: %s", - strerror(errno)); - goto end_parse; - } - /* Sanity checks */ - if (!gzgets(file, buf, sizeof(buf)) || - !gzgets(file, buf, sizeof(buf))) { - p_info("skipping kernel config, can't read from file: %s", - strerror(errno)); - goto end_parse; - } - if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { - p_info("skipping kernel config, can't find correct file"); - goto end_parse; - } - - while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { - for (i = 0; i < ARRAY_SIZE(options); i++) { - if ((define_prefix && !options[i].macro_dump) || - values[i] || strcmp(buf, options[i].name)) - continue; - - values[i] = strdup(value); - } - } + if (read_kernel_config(options, ARRAY_SIZE(options), values, + define_prefix)) + return; for (i = 0; i < ARRAY_SIZE(options); i++) { if (define_prefix && !options[i].macro_dump) @@ -488,10 +414,6 @@ static void probe_kernel_image_config(const char *define_prefix) print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } - -end_parse: - if (file) - gzclose(file); } static bool probe_bpf_syscall(const char *define_prefix) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 67a60114368f50..993c7d9484a463 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -688,10 +688,17 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name) static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard) { DECLARE_LIBBPF_OPTS(gen_loader_opts, opts); + struct bpf_load_and_run_opts sopts = {}; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; struct bpf_map *map; + char ident[256]; int err = 0; + if (sign_progs) + opts.gen_hash = true; + err = bpf_object__gen_loader(obj, &opts); if (err) return err; @@ -701,6 +708,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h p_err("failed to load object file"); goto out; } + /* If there was no error during load then gen_loader_opts * are populated with the loader program. 
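	 * The loader program is itself a BPF program; its instructions and
	 * context data are emitted below into the skeleton as hex blobs.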
*/ @@ -780,8 +788,52 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h print_hex(opts.insns, opts.insns_sz); codegen("\ \n\ - \"; \n\ - \n\ + \";\n"); + + if (sign_progs) { + sopts.insns = opts.insns; + sopts.insns_sz = opts.insns_sz; + sopts.excl_prog_hash = prog_sha; + sopts.excl_prog_hash_sz = sizeof(prog_sha); + sopts.signature = sig_buf; + sopts.signature_sz = MAX_SIG_SIZE; + + err = bpftool_prog_sign(&sopts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + codegen("\ + \n\ + static const char opts_sig[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)sig_buf, sopts.signature_sz); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + static const char opts_excl_hash[] __attribute__((__aligned__(8))) = \"\\\n\ + "); + print_hex((const void *)prog_sha, sizeof(prog_sha)); + codegen("\ + \n\ + \";\n"); + + codegen("\ + \n\ + opts.signature = (void *)opts_sig; \n\ + opts.signature_sz = sizeof(opts_sig) - 1; \n\ + opts.excl_prog_hash = (void *)opts_excl_hash; \n\ + opts.excl_prog_hash_sz = sizeof(opts_excl_hash) - 1; \n\ + opts.keyring_id = skel->keyring_id; \n\ + "); + } + + codegen("\ + \n\ opts.ctx = (struct bpf_loader_ctx *)skel; \n\ opts.data_sz = sizeof(opts_data) - 1; \n\ opts.data = (void *)opts_data; \n\ @@ -1240,7 +1292,7 @@ static int do_skeleton(int argc, char **argv) err = -errno; libbpf_strerror(err, err_buf, sizeof(err_buf)); p_err("failed to open BPF object file: %s", err_buf); - goto out; + goto out_obj; } bpf_object__for_each_map(map, obj) { @@ -1355,6 +1407,13 @@ static int do_skeleton(int argc, char **argv) printf("\t} links;\n"); } + if (sign_progs) { + codegen("\ + \n\ + __s32 keyring_id; \n\ + "); + } + if (btf) { err = codegen_datasecs(obj, obj_name); if (err) @@ -1552,6 +1611,7 @@ static int do_skeleton(int argc, char **argv) err = 0; out: bpf_object__close(obj); +out_obj: if (obj_data) munmap(obj_data, mmap_sz); close(fd); @@ -1930,7 +1990,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS " |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} {-i} ]}\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index a773e05d5ade44..bdcd717b0348f8 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -282,11 +282,52 @@ get_addr_cookie_array(__u64 *addrs, __u64 *cookies, __u32 count) return data; } +static bool is_x86_ibt_enabled(void) +{ +#if defined(__x86_64__) + struct kernel_config_option options[] = { + { "CONFIG_X86_KERNEL_IBT", }, + }; + char *values[ARRAY_SIZE(options)] = { }; + bool ret; + + if (read_kernel_config(options, ARRAY_SIZE(options), values, NULL)) + return false; + + ret = !!values[0]; + free(values[0]); + return ret; +#else + return false; +#endif +} + +static bool +symbol_matches_target(__u64 sym_addr, __u64 target_addr, bool is_ibt_enabled) +{ + if (sym_addr == target_addr) + return true; + + /* + * On x86_64 architectures with CET (Control-flow Enforcement Technology), + * function entry points have a 4-byte 'endbr' instruction prefix. + * This causes kprobe hooks to target the address *after* 'endbr' + * (symbol address + 4), preserving the CET instruction. + * Here we check if the symbol address matches the hook target address + * minus 4, indicating a CET-enabled function entry point. 
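+	 * For example, a symbol at 0xffffffff81234560 may be reported as a
+	 * kprobe target of 0xffffffff81234564 (illustrative addresses).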
+ */ + if (is_ibt_enabled && sym_addr == target_addr - 4) + return true; + + return false; +} + static void show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; jsonw_bool_field(json_wtr, "retprobe", info->kprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN); @@ -306,11 +347,13 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; jsonw_start_object(json_wtr); - jsonw_uint_field(json_wtr, "addr", dd.sym_mapping[i].address); + jsonw_uint_field(json_wtr, "addr", (unsigned long)data[j].addr); jsonw_string_field(json_wtr, "func", dd.sym_mapping[i].name); /* Print null if it is vmlinux */ if (dd.sym_mapping[i].module[0] == '\0') { @@ -719,6 +762,7 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) { struct addr_cookie *data; __u32 i, j = 0; + bool is_ibt_enabled; if (!info->kprobe_multi.count) return; @@ -742,12 +786,14 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) if (!dd.sym_count) goto error; + is_ibt_enabled = is_x86_ibt_enabled(); printf("\n\t%-16s %-16s %s", "addr", "cookie", "func [module]"); for (i = 0; i < dd.sym_count; i++) { - if (dd.sym_mapping[i].address != data[j].addr) + if (!symbol_matches_target(dd.sym_mapping[i].address, + data[j].addr, is_ibt_enabled)) continue; printf("\n\t%016lx %-16llx %s", - dd.sym_mapping[i].address, data[j].cookie, dd.sym_mapping[i].name); + (unsigned long)data[j].addr, data[j].cookie, dd.sym_mapping[i].name); if (dd.sym_mapping[i].module[0] != '\0') printf(" [%s] ", dd.sym_mapping[i].module); else diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 2b7f2bd3a7dbc7..a829a6a49037ad 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -33,6 +33,9 @@ bool relaxed_maps; bool use_loader; struct btf *base_btf; struct hashmap *refs_table; +bool sign_progs; +const char *private_key_path; +const char *cert_path; static void __noreturn clean_and_exit(int i) { @@ -61,7 +64,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter | token }\n" " " HELP_SPEC_OPTIONS " |\n" " {-V|--version} }\n" "", @@ -87,6 +90,7 @@ static const struct cmd commands[] = { { "gen", do_gen }, { "struct_ops", do_struct_ops }, { "iter", do_iter }, + { "token", do_token }, { "version", do_version }, { 0 } }; @@ -447,6 +451,7 @@ int main(int argc, char **argv) { "nomount", no_argument, NULL, 'n' }, { "debug", no_argument, NULL, 'd' }, { "use-loader", no_argument, NULL, 'L' }, + { "sign", no_argument, NULL, 'S' }, { "base-btf", required_argument, NULL, 'B' }, { 0 } }; @@ -473,7 +478,7 @@ int main(int argc, char **argv) bin_name = "bpftool"; opterr = 0; - while ((opt = getopt_long(argc, argv, "VhpjfLmndB:l", + while ((opt = getopt_long(argc, argv, "VhpjfLmndSi:k:B:l", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -519,6 +524,16 @@ int main(int argc, char **argv) case 'L': use_loader = true; break; + case 'S': + sign_progs = true; + use_loader = true; + break; + case 'k': + private_key_path = optarg; + break; + case 'i': + cert_path 
= optarg; + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) @@ -533,6 +548,16 @@ int main(int argc, char **argv) if (argc < 0) usage(); + if (sign_progs && (private_key_path == NULL || cert_path == NULL)) { + p_err("-i and -k must be supplied with -S for signing"); + return -EINVAL; + } + + if (!sign_progs && (private_key_path != NULL || cert_path != NULL)) { + p_err("--sign (or -S) must be explicitly passed with -i and -k to sign the programs"); + return -EINVAL; + } + if (version_requested) ret = do_version(argc, argv); else diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 6db704fda5c00f..1130299cede0b8 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -6,9 +6,14 @@ /* BFD and kernel.h both define GCC_VERSION, differently */ #undef GCC_VERSION +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include #include +#include #include +#include #include #include #include @@ -52,6 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr) }) #define ERR_MAX_LEN 1024 +#define MAX_SIG_SIZE 4096 #define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx" @@ -85,6 +91,9 @@ extern bool relaxed_maps; extern bool use_loader; extern struct btf *base_btf; extern struct hashmap *refs_table; +extern bool sign_progs; +extern const char *private_key_path; +extern const char *cert_path; void __printf(1, 2) p_err(const char *fmt, ...); void __printf(1, 2) p_info(const char *fmt, ...); @@ -166,6 +175,7 @@ int do_tracelog(int argc, char **arg) __weak; int do_feature(int argc, char **argv) __weak; int do_struct_ops(int argc, char **argv) __weak; int do_iter(int argc, char **argv) __weak; +int do_token(int argc, char **argv) __weak; int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); @@ -274,4 +284,15 @@ int pathname_concat(char *buf, int buf_sz, const char *path, /* print netfilter bpf_link info */ void netfilter_dump_plain(const struct bpf_link_info *info); void netfilter_dump_json(const struct bpf_link_info *info, json_writer_t *wtr); + +struct kernel_config_option { + const char *name; + bool macro_dump; +}; + +int read_kernel_config(const struct kernel_config_option *requested_options, + size_t num_options, char **out_values, + const char *define_prefix); +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts); +__u32 register_session_key(const char *key_der_path); #endif diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9722d841abc05d..6daf19809ca4a3 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -714,7 +715,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, if (mode == DUMP_JITED) { if (info->jited_prog_len == 0 || !info->jited_prog_insns) { - p_info("no instructions returned"); + p_err("error retrieving jit dump: no instructions returned or kernel.kptr_restrict set?"); return -1; } buf = u64_to_ptr(info->jited_prog_insns); @@ -1930,6 +1931,8 @@ static int try_loader(struct gen_loader_opts *gen) { struct bpf_load_and_run_opts opts = {}; struct bpf_loader_ctx *ctx; + char sig_buf[MAX_SIG_SIZE]; + __u8 prog_sha[SHA256_DIGEST_LENGTH]; int ctx_sz = sizeof(*ctx) + 64 * max(sizeof(struct bpf_map_desc), sizeof(struct bpf_prog_desc)); int log_buf_sz = (1u << 24) - 1; @@ -1953,6 +1956,26 @@ static int try_loader(struct gen_loader_opts *gen) opts.insns = gen->insns; opts.insns_sz = gen->insns_sz; fds_before = count_open_fds(); + + if 
(sign_progs) { + opts.excl_prog_hash = prog_sha; + opts.excl_prog_hash_sz = sizeof(prog_sha); + opts.signature = sig_buf; + opts.signature_sz = MAX_SIG_SIZE; + opts.keyring_id = KEY_SPEC_SESSION_KEYRING; + + err = bpftool_prog_sign(&opts); + if (err < 0) { + p_err("failed to sign program"); + goto out; + } + + err = register_session_key(cert_path); + if (err < 0) { + p_err("failed to add session key"); + goto out; + } + } err = bpf_load_and_run(&opts); fd_delta = count_open_fds() - fds_before; if (err < 0 || verifier_logs) { @@ -1961,6 +1984,7 @@ static int try_loader(struct gen_loader_opts *gen) fprintf(stderr, "loader prog leaked %d FDs\n", fd_delta); } +out: free(log_buf); return err; } @@ -1988,6 +2012,9 @@ static int do_loader(int argc, char **argv) goto err_close_obj; } + if (sign_progs) + gen.gen_hash = true; + err = bpf_object__gen_loader(obj, &gen); if (err) goto err_close_obj; @@ -2262,7 +2289,7 @@ static void profile_print_readings(void) static char *profile_target_name(int tgt_fd) { - struct bpf_func_info func_info; + struct bpf_func_info func_info = {}; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); const struct btf_type *t; @@ -2562,7 +2589,7 @@ static int do_help(int argc, char **argv) " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" - " {-L|--use-loader} }\n" + " {-L|--use-loader} | [ {-S|--sign } {-k} {-i} ] \n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c new file mode 100644 index 00000000000000..b34f74d210e9cd --- /dev/null +++ b/tools/bpf/bpftool/sign.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * Copyright (C) 2025 Google LLC. 
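+ *
+ * Signing produces a detached CMS (RFC 5652) signature over the loader
+ * program bytes via OpenSSL; the kernel verifies it at load time
+ * against the keyring passed in keyring_id.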
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +#define OPEN_SSL_ERR_BUF_LEN 256 + +static void display_openssl_errors(int l) +{ + char buf[OPEN_SSL_ERR_BUF_LEN]; + const char *file; + const char *data; + unsigned long e; + int flags; + int line; + + while ((e = ERR_get_error_all(&file, &line, NULL, &data, &flags))) { + ERR_error_string_n(e, buf, sizeof(buf)); + if (data && (flags & ERR_TXT_STRING)) { + p_err("OpenSSL %s: %s:%d: %s", buf, file, line, data); + } else { + p_err("OpenSSL %s: %s:%d", buf, file, line); + } + } +} + +#define DISPLAY_OSSL_ERR(cond) \ + do { \ + bool __cond = (cond); \ + if (__cond && ERR_peek_error()) \ + display_openssl_errors(__LINE__);\ + } while (0) + +static EVP_PKEY *read_private_key(const char *pkey_path) +{ + EVP_PKEY *private_key = NULL; + BIO *b; + + b = BIO_new_file(pkey_path, "rb"); + private_key = PEM_read_bio_PrivateKey(b, NULL, NULL, NULL); + BIO_free(b); + DISPLAY_OSSL_ERR(!private_key); + return private_key; +} + +static X509 *read_x509(const char *x509_name) +{ + unsigned char buf[2]; + X509 *x509 = NULL; + BIO *b; + int n; + + b = BIO_new_file(x509_name, "rb"); + if (!b) + goto cleanup; + + /* Look at the first two bytes of the file to determine the encoding */ + n = BIO_read(b, buf, 2); + if (n != 2) + goto cleanup; + + if (BIO_reset(b) != 0) + goto cleanup; + + if (buf[0] == 0x30 && buf[1] >= 0x81 && buf[1] <= 0x84) + /* Assume raw DER encoded X.509 */ + x509 = d2i_X509_bio(b, NULL); + else + /* Assume PEM encoded X.509 */ + x509 = PEM_read_bio_X509(b, NULL, NULL, NULL); + +cleanup: + BIO_free(b); + DISPLAY_OSSL_ERR(!x509); + return x509; +} + +__u32 register_session_key(const char *key_der_path) +{ + unsigned char *der_buf = NULL; + X509 *x509 = NULL; + int key_id = -1; + int der_len; + + if (!key_der_path) + return key_id; + x509 = read_x509(key_der_path); + if (!x509) + goto cleanup; + der_len = i2d_X509(x509, &der_buf); + if (der_len < 0) + goto cleanup; + key_id = syscall(__NR_add_key, "asymmetric", key_der_path, der_buf, + (size_t)der_len, KEY_SPEC_SESSION_KEYRING); +cleanup: + X509_free(x509); + OPENSSL_free(der_buf); + DISPLAY_OSSL_ERR(key_id == -1); + return key_id; +} + +int bpftool_prog_sign(struct bpf_load_and_run_opts *opts) +{ + BIO *bd_in = NULL, *bd_out = NULL; + EVP_PKEY *private_key = NULL; + CMS_ContentInfo *cms = NULL; + long actual_sig_len = 0; + X509 *x509 = NULL; + int err = 0; + + bd_in = BIO_new_mem_buf(opts->insns, opts->insns_sz); + if (!bd_in) { + err = -ENOMEM; + goto cleanup; + } + + private_key = read_private_key(private_key_path); + if (!private_key) { + err = -EINVAL; + goto cleanup; + } + + x509 = read_x509(cert_path); + if (!x509) { + err = -EINVAL; + goto cleanup; + } + + cms = CMS_sign(NULL, NULL, NULL, NULL, + CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | + CMS_STREAM); + if (!cms) { + err = -EINVAL; + goto cleanup; + } + + if (!CMS_add1_signer(cms, x509, private_key, EVP_sha256(), + CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP | + CMS_USE_KEYID | CMS_NOATTR)) { + err = -EINVAL; + goto cleanup; + } + + if (CMS_final(cms, bd_in, NULL, CMS_NOCERTS | CMS_BINARY) != 1) { + err = -EIO; + goto cleanup; + } + + EVP_Digest(opts->insns, opts->insns_sz, opts->excl_prog_hash, + &opts->excl_prog_hash_sz, EVP_sha256(), NULL); + + bd_out = BIO_new(BIO_s_mem()); + if (!bd_out) { + err = -ENOMEM; + goto cleanup; + } + + if 
(!i2d_CMS_bio_stream(bd_out, cms, NULL, 0)) { + err = -EIO; + goto cleanup; + } + + actual_sig_len = BIO_get_mem_data(bd_out, NULL); + if (actual_sig_len <= 0) { + err = -EIO; + goto cleanup; + } + + if ((size_t)actual_sig_len > opts->signature_sz) { + err = -ENOSPC; + goto cleanup; + } + + if (BIO_read(bd_out, opts->signature, actual_sig_len) != actual_sig_len) { + err = -EIO; + goto cleanup; + } + + opts->signature_sz = actual_sig_len; +cleanup: + BIO_free(bd_out); + CMS_ContentInfo_free(cms); + X509_free(x509); + EVP_PKEY_free(private_key); + BIO_free(bd_in); + DISPLAY_OSSL_ERR(err < 0); + return err; +} diff --git a/tools/bpf/bpftool/token.c b/tools/bpf/bpftool/token.c new file mode 100644 index 00000000000000..c08f34b9d51b58 --- /dev/null +++ b/tools/bpf/bpftool/token.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2025 Didi Technology Co., Tao Chen */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +#define MOUNTS_FILE "/proc/mounts" + +static struct { + const char *header; + const char *key; +} sets[] = { + {"allowed_cmds", "delegate_cmds"}, + {"allowed_maps", "delegate_maps"}, + {"allowed_progs", "delegate_progs"}, + {"allowed_attachs", "delegate_attachs"}, +}; + +static bool has_delegate_options(const char *mnt_ops) +{ + return strstr(mnt_ops, "delegate_cmds") || + strstr(mnt_ops, "delegate_maps") || + strstr(mnt_ops, "delegate_progs") || + strstr(mnt_ops, "delegate_attachs"); +} + +static char *get_delegate_value(char *opts, const char *key) +{ + char *token, *rest, *ret = NULL; + + if (!opts) + return NULL; + + for (token = strtok_r(opts, ",", &rest); token; + token = strtok_r(NULL, ",", &rest)) { + if (strncmp(token, key, strlen(key)) == 0 && + token[strlen(key)] == '=') { + ret = token + strlen(key) + 1; + break; + } + } + + return ret; +} + +static void print_items_per_line(char *input, int items_per_line) +{ + char *str, *rest; + int cnt = 0; + + if (!input) + return; + + for (str = strtok_r(input, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + if (cnt % items_per_line == 0) + printf("\n\t "); + + printf("%-20s", str); + cnt++; + } +} + +#define ITEMS_PER_LINE 4 +static void show_token_info_plain(struct mntent *mntent) +{ + size_t i; + + printf("token_info %s", mntent->mnt_dir); + + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; + + printf("\n\t%s:", sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + print_items_per_line(value, ITEMS_PER_LINE); + free(opts); + } + + printf("\n"); +} + +static void split_json_array_str(char *input) +{ + char *str, *rest; + + if (!input) { + jsonw_start_array(json_wtr); + jsonw_end_array(json_wtr); + return; + } + + jsonw_start_array(json_wtr); + for (str = strtok_r(input, ":", &rest); str; + str = strtok_r(NULL, ":", &rest)) { + jsonw_string(json_wtr, str); + } + jsonw_end_array(json_wtr); +} + +static void show_token_info_json(struct mntent *mntent) +{ + size_t i; + + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "token_info", mntent->mnt_dir); + + for (i = 0; i < ARRAY_SIZE(sets); i++) { + char *opts, *value; + + jsonw_name(json_wtr, sets[i].header); + opts = strdup(mntent->mnt_opts); + value = get_delegate_value(opts, sets[i].key); + split_json_array_str(value); + free(opts); + } + + jsonw_end_object(json_wtr); +} + +static int 
__show_token_info(struct mntent *mntent) +{ + if (json_output) + show_token_info_json(mntent); + else + show_token_info_plain(mntent); + + return 0; +} + +static int show_token_info(void) +{ + FILE *fp; + struct mntent *ent; + + fp = setmntent(MOUNTS_FILE, "r"); + if (!fp) { + p_err("Failed to open: %s", MOUNTS_FILE); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); + + while ((ent = getmntent(fp)) != NULL) { + if (strncmp(ent->mnt_type, "bpf", 3) == 0) { + if (has_delegate_options(ent->mnt_opts)) + __show_token_info(ent); + } + } + + if (json_output) + jsonw_end_array(json_wtr); + + endmntent(fp); + + return 0; +} + +static int do_show(int argc, char **argv) +{ + if (argc) + return BAD_ARG(); + + return show_token_info(); +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list }\n" + " %1$s %2$s help\n" + " " HELP_SPEC_OPTIONS " }\n" + "\n" + "", + bin_name, argv[-2]); + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_token(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/tracelog.c b/tools/bpf/bpftool/tracelog.c index 31d806e3bdaaa9..573a8d99f009a4 100644 --- a/tools/bpf/bpftool/tracelog.c +++ b/tools/bpf/bpftool/tracelog.c @@ -57,10 +57,8 @@ find_tracefs_mnt_single(unsigned long magic, char *mnt, const char *mntpt) static bool get_tracefs_pipe(char *mnt) { static const char * const known_mnts[] = { - "/sys/kernel/debug/tracing", "/sys/kernel/tracing", - "/tracing", - "/trace", + "/sys/kernel/debug/tracing", }; const char *pipe_name = "/trace_pipe"; const char *fstype = "tracefs"; @@ -95,12 +93,7 @@ static bool get_tracefs_pipe(char *mnt) return false; p_info("could not find tracefs, attempting to mount it now"); - /* Most of the time, tracefs is automatically mounted by debugfs at - * /sys/kernel/debug/tracing when we try to access it. If we could not - * find it, it is likely that debugfs is not mounted. Let's give one - * attempt at mounting just tracefs at /sys/kernel/tracing. - */ - strcpy(mnt, known_mnts[1]); + strcpy(mnt, known_mnts[0]); if (mount_tracefs(mnt)) return false; diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 685f7181780f92..fb8d90bff92e0c 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) 
\ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index c199ade200c240..d2f5aa085f8e36 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -116,6 +116,7 @@ #include "sched.h" #include "signal.h" #include "unistd.h" +#include "stdbool.h" #include "stdio.h" #include "stdlib.h" #include "string.h" diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index 1765acb17ea01f..0d053f93ea99b0 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -39,10 +39,8 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout) t.tv_nsec = (timeout % 1000) * 1000000; } return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#elif defined(__NR_poll) - return my_syscall3(__NR_poll, fds, nfds, timeout); #else - return __nolibc_enosys(__func__, fds, nfds, timeout); + return my_syscall3(__NR_poll, fds, nfds, timeout); #endif } diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index ba950f0e733843..2c1ad23b9b5c17 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -29,6 +29,6 @@ typedef unsigned long nlink_t; typedef signed long off_t; typedef signed long blksize_t; typedef signed long blkcnt_t; -typedef __kernel_old_time_t time_t; +typedef __kernel_time_t time_t; #endif /* _NOLIBC_STD_H */ diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 295e71d34abadb..c5564f57deec88 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -142,10 +142,8 @@ int sys_chmod(const char *path, mode_t mode) { #if defined(__NR_fchmodat) return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#elif defined(__NR_chmod) - return my_syscall2(__NR_chmod, path, mode); #else - return __nolibc_enosys(__func__, path, mode); + return my_syscall2(__NR_chmod, path, mode); #endif } @@ -165,10 +163,8 @@ int sys_chown(const char *path, uid_t owner, gid_t group) { #if defined(__NR_fchownat) return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#elif defined(__NR_chown) - return my_syscall3(__NR_chown, path, owner, group); #else - return __nolibc_enosys(__func__, path, owner, group); + return my_syscall3(__NR_chown, path, owner, group); #endif } @@ -238,11 +234,22 @@ static __attribute__((unused)) int sys_dup2(int old, int new) { #if defined(__NR_dup3) + int ret, nr_fcntl; + +#ifdef __NR_fcntl64 + nr_fcntl = __NR_fcntl64; +#else + nr_fcntl = __NR_fcntl; +#endif + + if (old == new) { + ret = my_syscall2(nr_fcntl, old, F_GETFD); + return ret < 0 ? ret : old; + } + return my_syscall3(__NR_dup3, old, new, 0); -#elif defined(__NR_dup2) - return my_syscall2(__NR_dup2, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_dup2, old, new); #endif } @@ -327,10 +334,8 @@ pid_t sys_fork(void) * will not use the rest with no other flag. */ return my_syscall5(__NR_clone, SIGCHLD, 0, 0, 0, 0); -#elif defined(__NR_fork) - return my_syscall0(__NR_fork); #else - return __nolibc_enosys(__func__); + return my_syscall0(__NR_fork); #endif } #endif @@ -347,7 +352,7 @@ pid_t sys_vfork(void) { #if defined(__NR_vfork) return my_syscall0(__NR_vfork); -#elif defined(__NR_clone3) +#else /* * clone() could be used but has different argument orders per * architecture. 
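	 * clone3() takes a pointer to struct clone_args instead, so its
	 * calling convention is uniform across architectures.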
@@ -358,8 +363,6 @@ pid_t sys_vfork(void) }; return my_syscall2(__NR_clone3, &args, sizeof(args)); -#else - return __nolibc_enosys(__func__); #endif } #endif @@ -569,10 +572,8 @@ int sys_link(const char *old, const char *new) { #if defined(__NR_linkat) return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#elif defined(__NR_link) - return my_syscall2(__NR_link, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_link, old, new); #endif } @@ -593,41 +594,27 @@ off_t sys_lseek(int fd, off_t offset, int whence) #if defined(__NR_lseek) return my_syscall3(__NR_lseek, fd, offset, whence); #else - return __nolibc_enosys(__func__, fd, offset, whence); -#endif -} + __kernel_loff_t loff = 0; + off_t result; + int ret; -static __attribute__((unused)) -int sys_llseek(int fd, unsigned long offset_high, unsigned long offset_low, - __kernel_loff_t *result, int whence) -{ -#if defined(__NR_llseek) - return my_syscall5(__NR_llseek, fd, offset_high, offset_low, result, whence); -#else - return __nolibc_enosys(__func__, fd, offset_high, offset_low, result, whence); + /* Only exists on 32bit where nolibc off_t is also 32bit */ + ret = my_syscall5(__NR_llseek, fd, 0, offset, &loff, whence); + if (ret < 0) + result = ret; + else if (loff != (off_t)loff) + result = -EOVERFLOW; + else + result = loff; + + return result; #endif } static __attribute__((unused)) off_t lseek(int fd, off_t offset, int whence) { - __kernel_loff_t loff = 0; - off_t result; - int ret; - - result = sys_lseek(fd, offset, whence); - if (result == -ENOSYS) { - /* Only exists on 32bit where nolibc off_t is also 32bit */ - ret = sys_llseek(fd, 0, offset, &loff, whence); - if (ret < 0) - result = ret; - else if (loff != (off_t)loff) - result = -EOVERFLOW; - else - result = loff; - } - - return __sysret(result); + return __sysret(sys_lseek(fd, offset, whence)); } @@ -640,10 +627,8 @@ int sys_mkdir(const char *path, mode_t mode) { #if defined(__NR_mkdirat) return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#elif defined(__NR_mkdir) - return my_syscall2(__NR_mkdir, path, mode); #else - return __nolibc_enosys(__func__, path, mode); + return my_syscall2(__NR_mkdir, path, mode); #endif } @@ -662,10 +647,8 @@ int sys_rmdir(const char *path) { #if defined(__NR_rmdir) return my_syscall1(__NR_rmdir, path); -#elif defined(__NR_unlinkat) - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, AT_REMOVEDIR); #else - return __nolibc_enosys(__func__, path); + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, AT_REMOVEDIR); #endif } @@ -685,10 +668,8 @@ long sys_mknod(const char *path, mode_t mode, dev_t dev) { #if defined(__NR_mknodat) return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#elif defined(__NR_mknod) - return my_syscall3(__NR_mknod, path, mode, dev); #else - return __nolibc_enosys(__func__, path, mode, dev); + return my_syscall3(__NR_mknod, path, mode, dev); #endif } @@ -801,7 +782,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva t.tv_nsec = timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#elif defined(__NR_pselect6_time64) +#else struct __kernel_timespec t; if (timeout) { @@ -809,8 +790,6 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva t.tv_nsec = timeout->tv_usec * 1000; } return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? 
&t : NULL, NULL); -#else - return __nolibc_enosys(__func__, nfds, rfds, wfds, efds, timeout); #endif } @@ -874,10 +853,8 @@ int sys_symlink(const char *old, const char *new) { #if defined(__NR_symlinkat) return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#elif defined(__NR_symlink) - return my_syscall2(__NR_symlink, old, new); #else - return __nolibc_enosys(__func__, old, new); + return my_syscall2(__NR_symlink, old, new); #endif } @@ -931,10 +908,8 @@ int sys_unlink(const char *path) { #if defined(__NR_unlinkat) return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#elif defined(__NR_unlink) - return my_syscall1(__NR_unlink, path); #else - return __nolibc_enosys(__func__, path); + return my_syscall1(__NR_unlink, path); #endif } diff --git a/tools/include/nolibc/sys/random.h b/tools/include/nolibc/sys/random.h index 8d9749f1c84572..cd5d25c571a8b7 100644 --- a/tools/include/nolibc/sys/random.h +++ b/tools/include/nolibc/sys/random.h @@ -22,13 +22,13 @@ static __attribute__((unused)) ssize_t sys_getrandom(void *buf, size_t buflen, unsigned int flags) { - return my_syscall3(__NR_getrandom, buf, buflen, flags); + return my_syscall3(__NR_getrandom, buf, buflen, flags); } static __attribute__((unused)) ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) { - return __sysret(sys_getrandom(buf, buflen, flags)); + return __sysret(sys_getrandom(buf, buflen, flags)); } #endif /* _NOLIBC_SYS_RANDOM_H */ diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 4375d546ba58f8..5dd61030c9914d 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -34,7 +34,7 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { #if defined(__NR_timerfd_gettime) return my_syscall2(__NR_timerfd_gettime, fd, curr_value); -#elif defined(__NR_timerfd_gettime64) +#else struct __kernel_itimerspec kcurr_value; int ret; @@ -42,8 +42,6 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); return ret; -#else - return __nolibc_enosys(__func__, fd, curr_value); #endif } @@ -60,7 +58,7 @@ int sys_timerfd_settime(int fd, int flags, { #if defined(__NR_timerfd_settime) return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); -#elif defined(__NR_timerfd_settime64) +#else struct __kernel_itimerspec knew_value, kold_value; int ret; @@ -72,8 +70,6 @@ int sys_timerfd_settime(int fd, int flags, __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); } return ret; -#else - return __nolibc_enosys(__func__, fd, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/sys/wait.h b/tools/include/nolibc/sys/wait.h index 56ddb806da7f24..4e66e1f7a03e45 100644 --- a/tools/include/nolibc/sys/wait.h +++ b/tools/include/nolibc/sys/wait.h @@ -16,27 +16,10 @@ /* * pid_t wait(int *status); - * pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage); * pid_t waitpid(pid_t pid, int *status, int options); * int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); */ -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ -#ifdef __NR_wait4 - return my_syscall4(__NR_wait4, pid, status, options, rusage); -#else - return __nolibc_enosys(__func__, pid, status, options, rusage); -#endif -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int 
*status, int options, struct rusage *rusage) -{ - return __sysret(sys_wait4(pid, status, options, rusage)); -} - static __attribute__((unused)) int sys_waitid(int which, pid_t pid, siginfo_t *infop, int options, struct rusage *rusage) { diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index d02bc44d2643a5..6c276b8d646a4e 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -45,7 +45,7 @@ int sys_clock_getres(clockid_t clockid, struct timespec *res) { #if defined(__NR_clock_getres) return my_syscall2(__NR_clock_getres, clockid, res); -#elif defined(__NR_clock_getres_time64) +#else struct __kernel_timespec kres; int ret; @@ -53,8 +53,6 @@ int sys_clock_getres(clockid_t clockid, struct timespec *res) if (res) __nolibc_timespec_kernel_to_user(&kres, res); return ret; -#else - return __nolibc_enosys(__func__, clockid, res); #endif } @@ -69,7 +67,7 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { #if defined(__NR_clock_gettime) return my_syscall2(__NR_clock_gettime, clockid, tp); -#elif defined(__NR_clock_gettime64) +#else struct __kernel_timespec ktp; int ret; @@ -77,8 +75,6 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp) if (tp) __nolibc_timespec_kernel_to_user(&ktp, tp); return ret; -#else - return __nolibc_enosys(__func__, clockid, tp); #endif } @@ -133,7 +129,8 @@ static __attribute__((unused)) int clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqtp, struct timespec *rmtp) { - return __sysret(sys_clock_nanosleep(clockid, flags, rqtp, rmtp)); + /* Directly return a positive error number */ + return -sys_clock_nanosleep(clockid, flags, rqtp, rmtp); } static __inline__ @@ -145,7 +142,7 @@ double difftime(time_t time1, time_t time2) static __inline__ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) { - return clock_nanosleep(CLOCK_REALTIME, 0, rqtp, rmtp); + return __sysret(sys_clock_nanosleep(CLOCK_REALTIME, 0, rqtp, rmtp)); } diff --git a/tools/include/nolibc/unistd.h b/tools/include/nolibc/unistd.h index 25bfc7732ec7e7..7405fa2b89baa8 100644 --- a/tools/include/nolibc/unistd.h +++ b/tools/include/nolibc/unistd.h @@ -33,7 +33,7 @@ static __attribute__((unused)) int sys_faccessat(int fd, const char *path, int amode, int flag) { - return my_syscall4(__NR_faccessat, fd, path, amode, flag); + return my_syscall4(__NR_faccessat, fd, path, amode, flag); } static __attribute__((unused)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382ec..ae83d8649ef1cd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ @@ -1605,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
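+	 * Special keyring IDs (e.g. KEY_SPEC_SESSION_KEYRING from
+	 * <linux/keyctl.h>) may be passed as well.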
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6666,6 +6682,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -7418,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 34127653fd0070..33c9b578b3b243 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,4 +40,19 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, +}; + #endif /* __LINUX_NSFS_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index ab40dbf9f020fb..339b197972374f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -172,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; int fd; @@ -203,6 +203,8 @@ int bpf_map_create(enum bpf_map_type map_type, attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); attr.map_token_fd = OPTS_GET(opts, token_fd, 0); + attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL)); + attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0); fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); @@ -238,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 7252150e7ad357..e983a3e40d6120 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -54,9 +54,12 @@ struct bpf_map_create_opts { __s32 value_type_btf_obj_fd; __u32 token_fd; + + const void *excl_prog_hash; + __u32 excl_prog_hash_size; size_t :0; }; -#define bpf_map_create_opts__last_field token_fd +#define bpf_map_create_opts__last_field excl_prog_hash_size LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, diff --git 
a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index 6ff963a491d972..49af4260b8e6b7 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -4,6 +4,7 @@ #define __BPF_GEN_INTERNAL_H #include "bpf.h" +#include "libbpf_internal.h" struct ksym_relo_desc { const char *name; @@ -50,6 +51,7 @@ struct bpf_gen { __u32 nr_ksyms; int fd_array; int nr_fd_array; + int hash_insn_offset[SHA256_DWORD_SIZE]; }; void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 113ae4abd345f2..6945dd99a84693 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -110,6 +110,7 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in static int add_data(struct bpf_gen *gen, const void *data, __u32 size); static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); +static void emit_signature_match(struct bpf_gen *gen); void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) { @@ -152,6 +153,8 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */ emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) + emit_signature_match(gen); } static int add_data(struct bpf_gen *gen, const void *data, __u32 size) @@ -368,6 +371,8 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) __emit_sys_close(gen); } +static void compute_sha_update_offsets(struct bpf_gen *gen); + int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) { int i; @@ -394,6 +399,9 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) blob_fd_array_off(gen, i)); emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); emit(gen, BPF_EXIT_INSN()); + if (OPTS_GET(gen->opts, gen_hash, false)) + compute_sha_update_offsets(gen); + pr_debug("gen: finish %s\n", errstr(gen->error)); if (!gen->error) { struct gen_loader_opts *opts = gen->opts; @@ -446,6 +454,22 @@ void bpf_gen__free(struct bpf_gen *gen) _val; \ }) +static void compute_sha_update_offsets(struct bpf_gen *gen) +{ + __u64 sha[SHA256_DWORD_SIZE]; + __u64 sha_dw; + int i; + + libbpf_sha256(gen->data_start, gen->data_cur - gen->data_start, (__u8 *)sha); + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + struct bpf_insn *insn = + (struct bpf_insn *)(gen->insn_start + gen->hash_insn_offset[i]); + sha_dw = tgt_endian(sha[i]); + insn[0].imm = (__u32)sha_dw; + insn[1].imm = sha_dw >> 32; + } +} + void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size) { @@ -557,6 +581,29 @@ void bpf_gen__map_create(struct bpf_gen *gen, emit_sys_close_stack(gen, stack_off(inner_map_fd)); } +static void emit_signature_match(struct bpf_gen *gen) +{ + __s64 off; + int i; + + for (i = 0; i < SHA256_DWORD_SIZE; i++) { + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, + 0, 0, 0, 0)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, i * sizeof(__u64))); + gen->hash_insn_offset[i] = gen->insn_cur - gen->insn_start; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_3, 0, 0, 0, 0, 0)); + + off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + if (is_simm16(off)) { + emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL)); + emit(gen, BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } + } +} 
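+
+/*
+ * Rough shape of the emitted check, as a C sketch (not the exact BPF
+ * encoding):
+ *
+ *	for (i = 0; i < SHA256_DWORD_SIZE; i++) {
+ *		u64 got  = dword i of the hash stored in loader map 0;
+ *		u64 want = imm64 patched in by compute_sha_update_offsets();
+ *		R7 = -EINVAL;
+ *		if (got != want)
+ *			goto cleanup_label;	// loader exits with R7
+ *	}
+ */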
+ void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, enum bpf_attach_type type) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8f5a81b672e1b8..f92083f51bdb36 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -496,6 +497,7 @@ struct bpf_program { __u32 line_info_rec_size; __u32 line_info_cnt; __u32 prog_flags; + __u8 hash[SHA256_DIGEST_LENGTH]; }; struct bpf_struct_ops { @@ -575,6 +577,7 @@ struct bpf_map { bool autocreate; bool autoattach; __u64 map_extra; + struct bpf_program *excl_prog; }; enum extern_type { @@ -1013,35 +1016,33 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, const struct btf_member *kern_data_member; struct btf *btf = NULL; __s32 kern_vtype_id, kern_type_id; - char tname[256]; + char tname[192], stname[256]; __u32 i; snprintf(tname, sizeof(tname), "%.*s", (int)bpf_core_essential_name_len(tname_raw), tname_raw); - kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT, - &btf, mod_btf); - if (kern_type_id < 0) { - pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", - tname); - return kern_type_id; - } - kern_type = btf__type_by_id(btf, kern_type_id); + snprintf(stname, sizeof(stname), "%s%s", STRUCT_OPS_VALUE_PREFIX, tname); - /* Find the corresponding "map_value" type that will be used - * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, - * find "struct bpf_struct_ops_tcp_congestion_ops" from the - * btf_vmlinux. + /* Look for the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS) first, figure out the btf + * and the mod_btf. + * For example, find "struct bpf_struct_ops_tcp_congestion_ops". */ - kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, - tname, BTF_KIND_STRUCT); + kern_vtype_id = find_ksym_btf_id(obj, stname, BTF_KIND_STRUCT, &btf, mod_btf); if (kern_vtype_id < 0) { - pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", - STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", stname); return kern_vtype_id; } kern_vtype = btf__type_by_id(btf, kern_vtype_id); + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + /* Find "struct tcp_congestion_ops" from * struct bpf_struct_ops_tcp_congestion_ops { * [ ... ] @@ -1054,8 +1055,8 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, break; } if (i == btf_vlen(kern_vtype)) { - pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", - tname, STRUCT_OPS_VALUE_PREFIX, tname); + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s\n", + tname, stname); return -EINVAL; } @@ -4485,6 +4486,44 @@ bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) } } +static int bpf_prog_compute_hash(struct bpf_program *prog) +{ + struct bpf_insn *purged; + int i, err = 0; + + purged = calloc(prog->insns_cnt, BPF_INSN_SZ); + if (!purged) + return -ENOMEM; + + /* If relocations have been done, the map_fd needs to be + * discarded for the digest calculation. 
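+	 * Map fds are process-local and vary between runs; hashing them
+	 * would make the digest non-reproducible and signature verification
+	 * would never match.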
+ */ + for (i = 0; i < prog->insns_cnt; i++) { + purged[i] = prog->insns[i]; + if (purged[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + (purged[i].src_reg == BPF_PSEUDO_MAP_FD || + purged[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { + purged[i].imm = 0; + i++; + if (i >= prog->insns_cnt || + prog->insns[i].code != 0 || + prog->insns[i].dst_reg != 0 || + prog->insns[i].src_reg != 0 || + prog->insns[i].off != 0) { + err = -EINVAL; + goto out; + } + purged[i] = prog->insns[i]; + purged[i].imm = 0; + } + } + libbpf_sha256(purged, prog->insns_cnt * sizeof(struct bpf_insn), + prog->hash); +out: + free(purged); + return err; +} + static int bpf_program__record_reloc(struct bpf_program *prog, struct reloc_desc *reloc_desc, __u32 insn_idx, const char *sym_name, @@ -5093,6 +5132,16 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) return false; } + /* + * bpf_get_map_info_by_fd() for DEVMAP will always return flags with + * BPF_F_RDONLY_PROG set, but it generally is not set at map creation time. + * Thus, ignore the BPF_F_RDONLY_PROG flag in the flags returned from + * bpf_get_map_info_by_fd() when checking for compatibility with an + * existing DEVMAP. + */ + if (map->def.type == BPF_MAP_TYPE_DEVMAP || map->def.type == BPF_MAP_TYPE_DEVMAP_HASH) + map_info.map_flags &= ~BPF_F_RDONLY_PROG; + return (map_info.type == map->def.type && map_info.key_size == map->def.key_size && map_info.value_size == map->def.value_size && @@ -5224,6 +5273,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.token_fd = obj->token_fd; if (obj->token_fd) create_attr.map_flags |= BPF_F_TOKEN_FD; + if (map->excl_prog) { + err = bpf_prog_compute_hash(map->excl_prog); + if (err) + return err; + + create_attr.excl_prog_hash = map->excl_prog->hash; + create_attr.excl_prog_hash_size = SHA256_DIGEST_LENGTH; + } if (bpf_map__is_struct_ops(map)) { create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; @@ -10514,6 +10571,27 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) return 0; } +int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog) +{ + if (map_is_created(map)) { + pr_warn("exclusive programs must be set before map creation\n"); + return libbpf_err(-EINVAL); + } + + if (map->obj != prog->obj) { + pr_warn("excl_prog and map must be from the same bpf object\n"); + return libbpf_err(-EINVAL); + } + + map->excl_prog = prog; + return 0; +} + +struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map) +{ + return map->excl_prog; +} + static struct bpf_map * __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) { @@ -14207,3 +14285,100 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) free(s->progs); free(s); } + +static inline __u32 ror32(__u32 v, int bits) +{ + return (v >> bits) | (v << (32 - bits)); +} + +#define SHA256_BLOCK_LENGTH 64 +#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) +#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) + +static const __u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 
0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + { \ + __u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \ + d += tmp; \ + h = tmp + Sigma_0(a) + Maj(a, b, c); \ + } + +static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks) +{ + while (nblocks--) { + __u32 a = state[0]; + __u32 b = state[1]; + __u32 c = state[2]; + __u32 d = state[3]; + __u32 e = state[4]; + __u32 f = state[5]; + __u32 g = state[6]; + __u32 h = state[7]; + __u32 w[64]; + int i; + + for (i = 0; i < 16; i++) + w[i] = get_unaligned_be32(&data[4 * i]); + for (; i < ARRAY_SIZE(w); i++) + w[i] = sigma_1(w[i - 2]) + w[i - 7] + + sigma_0(w[i - 15]) + w[i - 16]; + for (i = 0; i < ARRAY_SIZE(w); i += 8) { + SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); + SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); + SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); + SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); + SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); + SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); + SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); + SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); + } + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + data += SHA256_BLOCK_LENGTH; + } +} + +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]) +{ + __u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; + const __be64 bitcount = cpu_to_be64((__u64)len * 8); + __u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 }; + size_t final_len = len % SHA256_BLOCK_LENGTH; + int i; + + sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH); + + memcpy(final_data, data + len - final_len, final_len); + final_data[final_len] = 0x80; + final_len = round_up(final_len + 9, SHA256_BLOCK_LENGTH); + memcpy(&final_data[final_len - 8], &bitcount, 8); + + sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH); + + for (i = 0; i < ARRAY_SIZE(state); i++) + put_unaligned_be32(state[i], &out[4 * i]); +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 455a957cb702ca..5118d0a90e243a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -24,8 +24,25 @@ extern "C" { #endif +/** + * @brief **libbpf_major_version()** provides the major version of libbpf. + * @return An integer, the major version number + */ LIBBPF_API __u32 libbpf_major_version(void); + +/** + * @brief **libbpf_minor_version()** provides the minor version of libbpf. + * @return An integer, the minor version number + */ LIBBPF_API __u32 libbpf_minor_version(void); + +/** + * @brief **libbpf_version_string()** provides the version of libbpf in a + * human-readable form, e.g., "v1.7". + * @return Pointer to a static string containing the version + * + * The format is *not* a part of a stable API and may change in the future. 
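+ * Example: printf("using libbpf %s\n", libbpf_version_string());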
+ */ LIBBPF_API const char *libbpf_version_string(void); enum libbpf_errno { @@ -49,6 +66,14 @@ enum libbpf_errno { __LIBBPF_ERRNO__END, }; +/** + * @brief **libbpf_strerror()** converts the provided error code into a + * human-readable string. + * @param err The error code to convert + * @param buf Pointer to a buffer where the error message will be stored + * @param size The number of bytes in the buffer + * @return 0, on success; negative error code, otherwise + */ LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); /** @@ -252,7 +277,7 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, * @return 0, on success; negative error code, otherwise, error code is * stored in errno */ -int bpf_object__prepare(struct bpf_object *obj); +LIBBPF_API int bpf_object__prepare(struct bpf_object *obj); /** * @brief **bpf_object__load()** loads BPF object into kernel. @@ -1266,6 +1291,28 @@ LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, */ LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, const void *cur_key, void *next_key, size_t key_sz); +/** + * @brief **bpf_map__set_exclusive_program()** sets a map to be exclusive to the + * specified program. This must be called *before* the map is created. + * + * @param map BPF map to make exclusive. + * @param prog BPF program to be the exclusive user of the map. Must belong + * to the same bpf_object as the map. + * @return 0 on success; a negative error code otherwise. + * + * This function must be called after the BPF object is opened but before + * it is loaded. Once the object is loaded, only the specified program + * will be able to access the map's contents. + */ +LIBBPF_API int bpf_map__set_exclusive_program(struct bpf_map *map, struct bpf_program *prog); + +/** + * @brief **bpf_map__exclusive_program()** returns the exclusive program + * that is registered with the map (if any). + * @param map BPF map to which the exclusive program is registered. + * @return the registered exclusive program. 
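+ *
+ * A sketch of the intended open-to-load flow (object path, map and
+ * program names below are hypothetical):
+ *
+ *	struct bpf_object *obj = bpf_object__open("prog.bpf.o");
+ *	struct bpf_map *map = bpf_object__find_map_by_name(obj, "stats");
+ *	struct bpf_program *p = bpf_object__find_program_by_name(obj, "handler");
+ *
+ *	bpf_map__set_exclusive_program(map, p);
+ *	bpf_object__load(obj);
+ *
+ * after which only @p may access the map's contents.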
+ */ +LIBBPF_API struct bpf_program *bpf_map__exclusive_program(struct bpf_map *map); struct bpf_xdp_set_link_opts { size_t sz; @@ -1810,9 +1857,10 @@ struct gen_loader_opts { const char *insns; __u32 data_sz; __u32 insns_sz; + bool gen_hash; }; -#define gen_loader_opts__last_field insns_sz +#define gen_loader_opts__last_field gen_hash LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d7bd463e7017e7..8ed8749907d472 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -448,4 +448,7 @@ LIBBPF_1.6.0 { } LIBBPF_1.5.0; LIBBPF_1.7.0 { + global: + bpf_map__set_exclusive_program; + bpf_map__exclusive_program; } LIBBPF_1.6.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 477a3b3389a091..c93797dcaf5bcf 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -736,4 +736,8 @@ int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, int probe_fd(int fd); +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64) + +void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]); #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 4d5fa079b5d626..6a8f5c7a02eb97 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -13,10 +13,15 @@ #include #include #include +#include #include #include "bpf.h" #endif +#ifndef SHA256_DIGEST_LENGTH +#define SHA256_DIGEST_LENGTH 32 +#endif + #ifndef __NR_bpf # if defined(__mips__) && defined(_ABIO32) # define __NR_bpf 4355 @@ -64,6 +69,11 @@ struct bpf_load_and_run_opts { __u32 data_sz; __u32 insns_sz; const char *errstr; + void *signature; + __u32 signature_sz; + __s32 keyring_id; + void *excl_prog_hash; + __u32 excl_prog_hash_sz; }; long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); @@ -220,14 +230,19 @@ static inline int skel_map_create(enum bpf_map_type map_type, const char *map_name, __u32 key_size, __u32 value_size, - __u32 max_entries) + __u32 max_entries, + const void *excl_prog_hash, + __u32 excl_prog_hash_sz) { - const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); union bpf_attr attr; memset(&attr, 0, attr_sz); attr.map_type = map_type; + attr.excl_prog_hash = (unsigned long) excl_prog_hash; + attr.excl_prog_hash_size = excl_prog_hash_sz; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; @@ -300,6 +315,35 @@ static inline int skel_link_create(int prog_fd, int target_fd, return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); } +static inline int skel_obj_get_info_by_fd(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + __u8 sha[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 info_len = sizeof(info); + union bpf_attr attr; + + memset(&info, 0, sizeof(info)); + info.hash = (long) &sha; + info.hash_size = SHA256_DIGEST_LENGTH; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = fd; + attr.info.info = (long) &info; + attr.info.info_len = info_len; + return skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); +} + +static inline int skel_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + return 
skel_sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); +} #ifdef __KERNEL__ #define set_err #else @@ -308,12 +352,13 @@ static inline int skel_link_create(int prog_fd, int target_fd, static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) { - const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, keyring_id); const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); int map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; - err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, + opts->excl_prog_hash, opts->excl_prog_hash_sz); if (map_fd < 0) { opts->errstr = "failed to create loader map"; set_err; @@ -327,11 +372,34 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) goto out; } +#ifndef __KERNEL__ + err = skel_map_freeze(map_fd); + if (err < 0) { + opts->errstr = "failed to freeze map"; + set_err; + goto out; + } + err = skel_obj_get_info_by_fd(map_fd); + if (err < 0) { + opts->errstr = "failed to fetch obj info"; + set_err; + goto out; + } +#endif + memset(&attr, 0, prog_load_attr_sz); attr.prog_type = BPF_PROG_TYPE_SYSCALL; attr.insns = (long) opts->insns; attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); attr.license = (long) "Dual BSD/GPL"; +#ifndef __KERNEL__ + attr.signature = (long) opts->signature; + attr.signature_size = opts->signature_sz; +#else + if (opts->signature || opts->signature_sz) + pr_warn("signatures are not supported from bpf_preload\n"); +#endif + attr.keyring_id = opts->keyring_id; memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); attr.fd_array = (long) &map_fd; attr.log_level = opts->ctx->log_level; diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 2a7865c8e3fe3c..43deb05a51970d 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -34,13 +34,32 @@ enum __bpf_usdt_arg_type { BPF_USDT_ARG_CONST, BPF_USDT_ARG_REG, BPF_USDT_ARG_REG_DEREF, + BPF_USDT_ARG_SIB, }; +/* + * This struct layout is designed specifically to be backwards/forward + * compatible between libbpf versions for ARG_CONST, ARG_REG, and + * ARG_REG_DEREF modes. ARG_SIB requires libbpf v1.7+. 
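+ *
+ * For example, on x86-64 the USDT arg spec "-1@-96(%rbp,%rax,8)" would be
+ * encoded (illustratively) as arg_type = BPF_USDT_ARG_SIB, val_off = -96,
+ * reg_off = the pt_regs offset of rbp, idx_reg_off = the pt_regs offset
+ * of rax, and scale_bitshift = 3, so the target address is computed as
+ * rbp + (rax << 3) - 96.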
+ */ struct __bpf_usdt_arg_spec { /* u64 scalar interpreted depending on arg_type, see below */ __u64 val_off; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* arg location case, see bpf_usdt_arg() for details */ - enum __bpf_usdt_arg_type arg_type; + enum __bpf_usdt_arg_type arg_type: 8; + /* index register offset within struct pt_regs */ + __u16 idx_reg_off: 12; + /* scale factor for index register (1, 2, 4, or 8) */ + __u16 scale_bitshift: 4; + /* reserved for future use, keeps reg_off offset stable */ + __u8 __reserved: 8; +#else + __u8 __reserved: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum __bpf_usdt_arg_type arg_type: 8; +#endif /* offset of referenced register within struct pt_regs */ short reg_off; /* whether arg should be interpreted as signed value */ @@ -149,7 +168,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) { struct __bpf_usdt_spec *spec; struct __bpf_usdt_arg_spec *arg_spec; - unsigned long val; + unsigned long val, idx; int err, spec_id; *res = 0; @@ -202,6 +221,27 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) return err; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ val >>= arg_spec->arg_bitshift; +#endif + break; + case BPF_USDT_ARG_SIB: + /* Arg is in memory addressed by SIB (Scale-Index-Base) mode + * (e.g., "-1@-96(%rbp,%rax,8)" in USDT arg spec). We first + * fetch the base register contents and the index register + * contents from pt_regs. Then we calculate the final address + * as base + (index * scale) + offset, and do a user-space + * probe read to fetch the argument value. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_kernel(&idx, sizeof(idx), (void *)ctx + arg_spec->idx_reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)(val + (idx << arg_spec->scale_bitshift) + arg_spec->val_off)); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; #endif break; default: diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 3373b9d45ac448..fc2785eecc1776 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -200,12 +200,23 @@ enum usdt_arg_type { USDT_ARG_CONST, USDT_ARG_REG, USDT_ARG_REG_DEREF, + USDT_ARG_SIB, }; /* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ struct usdt_arg_spec { __u64 val_off; - enum usdt_arg_type arg_type; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + enum usdt_arg_type arg_type: 8; + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + __u8 __reserved: 8; /* keep reg_off offset stable */ +#else + __u8 __reserved: 8; /* keep reg_off offset stable */ + __u16 idx_reg_off: 12; + __u16 scale_bitshift: 4; + enum usdt_arg_type arg_type: 8; +#endif short reg_off; bool arg_signed; char arg_bitshift; @@ -570,9 +581,8 @@ static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long o return NULL; } -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, - struct usdt_note *usdt_note); +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, + size_t desc_off, struct usdt_note *usdt_note); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); @@ -626,7 +636,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * struct elf_seg *seg = NULL; void *tmp; - err = parse_usdt_note(elf, path, &nhdr, 
data->d_buf, name_off, desc_off, ¬e); + err = parse_usdt_note(&nhdr, data->d_buf, name_off, desc_off, ¬e); if (err) goto err_out; @@ -1132,8 +1142,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct /* Parse out USDT ELF note from '.note.stapsdt' section. * Logic inspired by perf's code. */ -static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, - const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { const char *provider, *name, *args; @@ -1283,11 +1292,51 @@ static int calc_pt_regs_off(const char *reg_name) static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char reg_name[16]; - int len, reg_off; - long off; + char reg_name[16] = {0}, idx_reg_name[16] = {0}; + int len, reg_off, idx_reg_off, scale = 1; + long off = 0; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, &off, reg_name, idx_reg_name, &scale, &len) == 5 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^,] , %d ) %n", + arg_sz, reg_name, idx_reg_name, &scale, &len) == 4 || + sscanf(arg_str, " %d @ %ld ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, &off, reg_name, idx_reg_name, &len) == 4 || + sscanf(arg_str, " %d @ ( %%%15[^,] , %%%15[^)] ) %n", + arg_sz, reg_name, idx_reg_name, &len) == 3 + ) { + /* + * Scale Index Base case: + * 1@-96(%rbp,%rax,8) + * 1@(%rbp,%rax,8) + * 1@-96(%rbp,%rax) + * 1@(%rbp,%rax) + */ + arg->arg_type = USDT_ARG_SIB; + arg->val_off = off; - if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + + idx_reg_off = calc_pt_regs_off(idx_reg_name); + if (idx_reg_off < 0) + return idx_reg_off; + arg->idx_reg_off = idx_reg_off; + + /* validate scale factor and set fields directly */ + switch (scale) { + case 1: arg->scale_bitshift = 0; break; + case 2: arg->scale_bitshift = 1; break; + case 4: arg->scale_bitshift = 2; break; + case 8: arg->scale_bitshift = 3; break; + default: + pr_warn("usdt: invalid SIB scale %d, expected 1, 2, 4, 8\n", scale); + return -EINVAL; + } + } else if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", + arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -4@-20(%rbp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1306,6 +1355,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; + /* register read has no memory offset */ arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 9ef569492560ef..ddaeb4eb3e2497 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) size_t ci, cj, ei; int cmp; + if (!excludes->cnt) + return; + ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index b6fdc68053cc4e..2e555c4060c5e4 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union 
loongarch_instruction inst, return true; } +static bool decode_insn_reg3_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg3_format.opcode) { + case amswapw_op: + if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO && + inst.reg3_format.rk == LOONGARCH_GPR_RA && + inst.reg3_format.rj == LOONGARCH_GPR_ZERO) { + /* amswap.w $zero, $ra, $zero */ + insn->type = INSN_BUG; + } + break; + default: + return false; + } + + return true; +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, struct instruction *insn) @@ -309,11 +328,19 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec return 0; if (decode_insn_reg2i16_fomat(inst, insn)) return 0; + if (decode_insn_reg3_fomat(inst, insn)) + return 0; - if (inst.word == 0) + if (inst.word == 0) { + /* andi $zero, $zero, 0x0 */ insn->type = INSN_NOP; - else if (inst.reg0i15_format.opcode == break_op) { - /* break */ + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x0) { + /* break 0x0 */ + insn->type = INSN_TRAP; + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x1) { + /* break 0x1 */ insn->type = INSN_BUG; } else if (inst.reg2_format.opcode == ertn_op) { /* ertn */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 98c4713c1b091b..0ad5cc70ecbe74 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_64: + return true; + default: + return false; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 80239843e9f02c..0f6b197cfcb032 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -87,6 +87,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), @@ -162,6 +163,7 @@ static bool opts_valid(void) opts.hack_noinstr || opts.ibt || opts.mcount || + opts.noabs || opts.noinstr || opts.orc || opts.retpoline || diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db13f..093fcd01dd6e06 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3564,7 +3564,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { /* Ignore KCFI type preambles, which always fall through */ if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6)) + !strncmp(func->name, "__pfx_", 6) || + !strncmp(func->name, "__pi___cfi_", 11) || + !strncmp(func->name, "__pi___pfx_", 11)) return 0; if (file->ignore_unreachables) @@ -4644,6 +4646,47 @@ static void disas_warned_funcs(struct objtool_file *file) disas_funcs(funcs); } +__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) +{ + unsigned int type = 
reloc_type(reloc); + size_t sz = elf_addr_size(elf); + + return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32); +} + +static int check_abs_references(struct objtool_file *file) +{ + struct section *sec; + struct reloc *reloc; + int ret = 0; + + for_each_sec(file, sec) { + /* absolute references in non-loadable sections are fine */ + if (!(sec->sh.sh_flags & SHF_ALLOC)) + continue; + + /* section must have an associated .rela section */ + if (!sec->rsec) + continue; + + /* + * Special case for compiler generated metadata that is not + * consumed until after boot. + */ + if (!strcmp(sec->name, "__patchable_function_entries")) + continue; + + for_each_reloc(sec->rsec, reloc) { + if (arch_absolute_reloc(file->elf, reloc)) { + WARN("section %s has absolute relocation at offset 0x%lx", + sec->name, reloc_offset(reloc)); + ret++; + } + } + } + return ret; +} + struct insn_chunk { void *addr; struct insn_chunk *next; @@ -4777,6 +4820,9 @@ int check(struct objtool_file *file) goto out; } + if (opts.noabs) + warnings += check_abs_references(file); + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 01ef6f415adf64..be33c7b43180aa 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 6b08666fa69d64..ab22673862e1b1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -26,6 +26,7 @@ struct opts { bool uaccess; int prefix; bool cfi; + bool noabs; /* options: */ bool backtrace; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6a922d046b8e2c..802895fae3cac0 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead) NORETURN(rust_begin_unwind) NORETURN(rust_helper_BUG) NORETURN(sev_es_terminate) -NORETURN(snp_abort) NORETURN(start_kernel) NORETURN(stop_this_cpu) NORETURN(usercopy_abort) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd49703021fd26..078634461df270 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, .cgroups = RB_ROOT, }; + struct perf_env host_env; lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); if (!lockhash_table) @@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv) eops.mmap = perf_event__process_mmap; eops.tracing_data = perf_event__process_tracing_data; - session = perf_session__new(use_bpf ? NULL : &data, &eops); + perf_env__init(&host_env); + session = __perf_session__new(use_bpf ? 
NULL : &data, &eops, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); err = PTR_ERR(session); @@ -2142,6 +2146,7 @@ static int __cmd_contention(int argc, const char **argv) evlist__delete(con.evlist); lock_contention_finish(&con); perf_session__delete(session); + perf_env__exit(&host_env); zfree(&lockhash_table); return err; } diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f1c..34e2fdfe7300ea 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different -// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang +// definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) \ diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 8fabddc1c0dad2..72c7a4e15d617b 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn, intel_pt_insn->rel = 0; intel_pt_insn->emulated_ptwrite = false; - if (insn_is_avx(insn)) { + if (insn_is_avx_or_xop(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; intel_pt_insn->length = insn->length; diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 85b2a93a59ac65..779f6230130af2 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } /* Insert the value at the end. 
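* For kernel maps, the kmap's kmaps back-pointer is refreshed right here
* via map__set_kmap_maps(), so the map is fully initialized as soon as it
* is stored in the array.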
*/ maps_by_address[nr_maps] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) maps_by_name[nr_maps] = map__get(new); @@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new) if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - map__set_kmap_maps(new, maps); - return 0; } @@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) if (before) { map__put(maps_by_address[i]); maps_by_address[i] = before; + map__set_kmap_maps(before, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); maps_by_name[ni] = map__get(new); } - map__set_kmap_maps(new, maps); - check_invariants(maps); return err; } @@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent) err = unwind__prepare_access(dest, new, NULL); if (!err) { dest_maps_by_address[i] = new; + map__set_kmap_maps(new, dest); if (dest_maps_by_name) dest_maps_by_name[i] = map__get(new); RC_CHK_ACCESS(dest)->nr_maps = i + 1; diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index 9741e7503591c1..de93067a5da320 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -995,7 +995,7 @@ static acpi_status osl_list_customized_tables(char *directory) { void *table_dir; u32 instance; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char *filename; acpi_status status = AE_OK; @@ -1312,7 +1312,7 @@ osl_get_customized_table(char *pathname, { void *table_dir; u32 current_instance = 0; - char temp_name[ACPI_NAMESEG_SIZE]; + char temp_name[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; char table_filename[PATH_MAX]; char *filename; acpi_status status; diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index bf30143efbdcb1..7a6223aa703c39 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -86,9 +86,10 @@ u8 ap_is_valid_checksum(struct acpi_table_header *table) if (ACPI_FAILURE(status)) { fprintf(stderr, "%4.4s: Warning: wrong checksum in table\n", table->signature); + return (FALSE); } - return (AE_OK); + return (TRUE); } /****************************************************************************** diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c index 75db0091e2758a..d6b8a201480b75 100644 --- a/tools/power/acpi/tools/acpidump/apfiles.c +++ b/tools/power/acpi/tools/acpidump/apfiles.c @@ -103,7 +103,7 @@ int ap_open_output_file(char *pathname) int ap_write_to_binary_file(struct acpi_table_header *table, u32 instance) { - char filename[ACPI_NAMESEG_SIZE + 16] ACPI_NONSTRING; + char filename[ACPI_NAMESEG_SIZE + 16]; char instance_str[16]; ACPI_FILE file; acpi_size actual; diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index 0ecac009273ce8..f2c1139adf7169 100644 --- 
a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -233,6 +233,7 @@ int cpuidle_state_disable(unsigned int cpu, { char value[SYSFS_PATH_MAX]; int bytes_written; + int len; if (cpuidle_state_count(cpu) <= idlestate) return -1; @@ -241,10 +242,10 @@ int cpuidle_state_disable(unsigned int cpu, idlestate_value_files[IDLESTATE_DISABLE])) return -2; - snprintf(value, SYSFS_PATH_MAX, "%u", disable); + len = snprintf(value, SYSFS_PATH_MAX, "%u", disable); bytes_written = cpuidle_state_write_file(cpu, idlestate, "disable", - value, sizeof(disable)); + value, len); if (bytes_written) return 0; return -3; diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index ce8dfb8e46abda..d7f7ec6f151c27 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -56,7 +56,7 @@ unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen) if (numwritten < 1) { perror(path); close(fd); - return -1; + return 0; } close(fd); diff --git a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py index feb9f9421c7bcd..875b086550d1ba 100755 --- a/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py +++ b/tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py @@ -11,7 +11,7 @@ gnuplot 5.0 or higher gnuplot-py 1.8 or higher (Most of the distributions have these required packages. They may be called - gnuplot-py, phython-gnuplot or phython3-gnuplot, gnuplot-nox, ... ) + gnuplot-py, python-gnuplot or python3-gnuplot, gnuplot-nox, ... ) Kernel config for Linux trace is enabled diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d594..f2512d78bcbd8f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, @@ -1890,7 +1890,7 @@ int pmt_telemdir_sort(const struct dirent **a, const struct dirent **b) sscanf((*a)->d_name, "telem%u", &aidx); sscanf((*b)->d_name, "telem%u", &bidx); - return aidx >= bidx; + return (aidx > bidx) ? 1 : (aidx < bidx) ? 
-1 : 0; } const struct dirent *pmt_diriter_next(struct pmt_diriter_t *iter) diff --git a/tools/power/x86/x86_energy_perf_policy/Makefile b/tools/power/x86/x86_energy_perf_policy/Makefile index 666b325a62a229..d182846674008e 100644 --- a/tools/power/x86/x86_energy_perf_policy/Makefile +++ b/tools/power/x86/x86_energy_perf_policy/Makefile @@ -1,8 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 CC = $(CROSS_COMPILE)gcc -BUILD_OUTPUT := $(CURDIR) +BUILD_OUTPUT := $(CURDIR) PREFIX := /usr DESTDIR := +DAY := $(shell date +%Y.%m.%d) +SNAPSHOT = x86_energy_perf_policy-$(DAY) + + ifeq ("$(origin O)", "command line") BUILD_OUTPUT := $(O) @@ -27,3 +31,26 @@ install : x86_energy_perf_policy install -d $(DESTDIR)$(PREFIX)/share/man/man8 install -m 644 x86_energy_perf_policy.8 $(DESTDIR)$(PREFIX)/share/man/man8 +snapshot: x86_energy_perf_policy + @rm -rf $(SNAPSHOT) + @mkdir $(SNAPSHOT) + @cp x86_energy_perf_policy Makefile x86_energy_perf_policy.c x86_energy_perf_policy.8 $(SNAPSHOT) + + @sed -e 's/^#include /#include "bits.h"/' -e 's/u64/unsigned long long/' ../../../../arch/x86/include/asm/msr-index.h > $(SNAPSHOT)/msr-index.h + @echo '#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))' >> $(SNAPSHOT)/msr-index.h + @echo "#define BIT(x) (1 << (x))" > $(SNAPSHOT)/bits.h + @echo "#define BIT_ULL(nr) (1ULL << (nr))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (sizeof(long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h + + @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h + + @echo PWD=. > $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile + @echo "CFLAGS += -DBUILD_BUG_HEADER='\"build_bug.h\"'" >> $(SNAPSHOT)/Makefile + @sed -e's/.*MSRHEADER.*//' Makefile >> $(SNAPSHOT)/Makefile + + @rm -f $(SNAPSHOT).tar.gz + tar cvzf $(SNAPSHOT).tar.gz $(SNAPSHOT) + diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 index 78c6361898b189..0aa981c18e56f7 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8 @@ -2,7 +2,7 @@ .\" Distributed under the GPL, Copyleft 1994. .TH X86_ENERGY_PERF_POLICY 8 .SH NAME -x86_energy_perf_policy \- Manage Energy vs. Performance Policy via x86 Model Specific Registers +x86_energy_perf_policy \- Manage Energy vs. Performance Policy .SH SYNOPSIS .B x86_energy_perf_policy .RB "[ options ] [ scope ] [field \ value]" @@ -19,9 +19,14 @@ x86_energy_perf_policy \- Manage Energy vs. Performance Policy via x86 Model Spe .SH DESCRIPTION \fBx86_energy_perf_policy\fP displays and updates energy-performance policy settings specific to -Intel Architecture Processors. Settings are accessed via Model Specific Register (MSR) -updates, no matter if the Linux cpufreq sub-system is enabled or not. +Intel Architecture Processors. It summarizes settings available +in standard Linux interfaces (eg. cpufreq), +and also decodes underlying Model Specific Register (MSRs). +While \fBx86_energy_perf_policy\fP can manage energy-performance policy +using only MSR access, it prefers standard +Linux kernel interfaces, when they are available. 
+.SH BACKGROUND Policy in MSR_IA32_ENERGY_PERF_BIAS (EPB) may affect a wide range of hardware decisions, such as how aggressively the hardware enters and exits CPU idle states (C-states) @@ -200,7 +205,9 @@ runs only as root. .SH FILES .ta .nf -/dev/cpu/*/msr +EPB: /sys/devices/system/cpu/cpu*/power/energy_perf_bias +EPP: /sys/devices/system/cpu/cpu*/cpufreq/energy_performance_preference +MSR: /dev/cpu/*/msr .fi .SH "SEE ALSO" .nf diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index ebda9c366b2ba3..884a4c746f32e8 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -4,7 +4,7 @@ * policy preference bias on recent X86 processors. */ /* - * Copyright (c) 2010 - 2017 Intel Corporation. + * Copyright (c) 2010 - 2025 Intel Corporation. * Len Brown */ @@ -62,6 +62,7 @@ unsigned char turbo_update_value; unsigned char update_hwp_epp; unsigned char update_hwp_min; unsigned char update_hwp_max; +unsigned char hwp_limits_done_via_sysfs; unsigned char update_hwp_desired; unsigned char update_hwp_window; unsigned char update_hwp_use_pkg; @@ -517,7 +518,7 @@ void for_packages(unsigned long long pkg_set, int (func)(int)) void print_version(void) { - printf("x86_energy_perf_policy 17.05.11 (C) Len Brown \n"); + printf("x86_energy_perf_policy 2025.9.19 Len Brown \n"); } void cmdline(int argc, char **argv) @@ -630,7 +631,7 @@ void cmdline(int argc, char **argv) */ FILE *fopen_or_die(const char *path, const char *mode) { - FILE *filep = fopen(path, "r"); + FILE *filep = fopen(path, mode); if (!filep) err(1, "%s: open failed", path); @@ -644,7 +645,7 @@ void err_on_hypervisor(void) char *buffer; /* On VMs /proc/cpuinfo contains a "flags" entry for hypervisor */ - cpuinfo = fopen_or_die("/proc/cpuinfo", "ro"); + cpuinfo = fopen_or_die("/proc/cpuinfo", "r"); buffer = malloc(4096); if (!buffer) { @@ -809,7 +810,7 @@ void print_hwp_request_pkg(int pkg, struct msr_hwp_request *h, char *str) h->hwp_min, h->hwp_max, h->hwp_desired, h->hwp_epp, h->hwp_window, h->hwp_window & 0x7F, (h->hwp_window >> 7) & 0x7); } -void read_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) +void read_hwp_request_msr(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) { unsigned long long msr; @@ -823,7 +824,7 @@ void read_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr hwp_req->hwp_use_pkg = (((msr) >> 42) & 0x1); } -void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) +void write_hwp_request_msr(int cpu, struct msr_hwp_request *hwp_req, unsigned int msr_offset) { unsigned long long msr = 0; @@ -843,7 +844,7 @@ void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int ms put_msr(cpu, msr_offset, msr); } -static int get_epb(int cpu) +static int get_epb_sysfs(int cpu) { char path[SYSFS_PATH_MAX]; char linebuf[3]; @@ -865,7 +866,7 @@ static int get_epb(int cpu) return (int)val; } -static int set_epb(int cpu, int val) +static int set_epb_sysfs(int cpu, int val) { char path[SYSFS_PATH_MAX]; char linebuf[3]; @@ -895,14 +896,14 @@ int print_cpu_msrs(int cpu) struct msr_hwp_cap cap; int epb; - epb = get_epb(cpu); + epb = get_epb_sysfs(cpu); if (epb >= 0) printf("cpu%d: EPB %u\n", cpu, (unsigned int) epb); if (!has_hwp) return 0; - read_hwp_request(cpu, &req, MSR_HWP_REQUEST); + read_hwp_request_msr(cpu, &req, MSR_HWP_REQUEST); 
print_hwp_request(cpu, &req, ""); read_hwp_cap(cpu, &cap, MSR_HWP_CAPABILITIES); @@ -919,7 +920,7 @@ int print_pkg_msrs(int pkg) if (!has_hwp) return 0; - read_hwp_request(first_cpu_in_pkg[pkg], &req, MSR_HWP_REQUEST_PKG); + read_hwp_request_msr(first_cpu_in_pkg[pkg], &req, MSR_HWP_REQUEST_PKG); print_hwp_request_pkg(pkg, &req, ""); if (has_hwp_notify) { @@ -951,8 +952,10 @@ int ratio_2_sysfs_khz(int ratio) } /* * If HWP is enabled and cpufreq sysfs attributes are present, - * then update sysfs, so that it will not become - * stale when we write to MSRs. + * then update via sysfs. The intel_pstate driver may modify (clip) + * this request, say, when HWP_CAP is outside of PLATFORM_INFO limits, + * and the driver-chosen value takes precedence. + * * (intel_pstate's max_perf_pct and min_perf_pct will follow cpufreq, * so we don't have to touch that.) */ @@ -1007,6 +1010,8 @@ int update_sysfs(int cpu) if (update_hwp_max) update_cpufreq_scaling_freq(1, cpu, req_update.hwp_max); + hwp_limits_done_via_sysfs = 1; + return 0; } @@ -1074,21 +1079,21 @@ int check_hwp_request_v_hwp_capabilities(int cpu, struct msr_hwp_request *req, s return 0; } -int update_hwp_request(int cpu) +int update_hwp_request_msr(int cpu) { struct msr_hwp_request req; struct msr_hwp_cap cap; int msr_offset = MSR_HWP_REQUEST; - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); if (debug) print_hwp_request(cpu, &req, "old: "); - if (update_hwp_min) + if (update_hwp_min && !hwp_limits_done_via_sysfs) req.hwp_min = req_update.hwp_min; - if (update_hwp_max) + if (update_hwp_max && !hwp_limits_done_via_sysfs) req.hwp_max = req_update.hwp_max; if (update_hwp_desired) @@ -1111,15 +1116,15 @@ verify_hwp_req_self_consistency(cpu, &req); - write_hwp_request(cpu, &req, msr_offset); + write_hwp_request_msr(cpu, &req, msr_offset); if (debug) { - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); print_hwp_request(cpu, &req, "new: "); } return 0; } -int update_hwp_request_pkg(int pkg) +int update_hwp_request_pkg_msr(int pkg) { struct msr_hwp_request req; struct msr_hwp_cap cap; @@ -1127,7 +1132,7 @@ int msr_offset = MSR_HWP_REQUEST_PKG; - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); if (debug) print_hwp_request_pkg(pkg, &req, "old: "); @@ -1155,10 +1160,10 @@ verify_hwp_req_self_consistency(cpu, &req); - write_hwp_request(cpu, &req, msr_offset); + write_hwp_request_msr(cpu, &req, msr_offset); if (debug) { - read_hwp_request(cpu, &req, msr_offset); + read_hwp_request_msr(cpu, &req, msr_offset); print_hwp_request_pkg(pkg, &req, "new: "); } return 0; @@ -1166,30 +1171,39 @@ int enable_hwp_on_cpu(int cpu) { - unsigned long long msr; + unsigned long long old_msr, new_msr; + + get_msr(cpu, MSR_PM_ENABLE, &old_msr); + + if (old_msr & 1) + return 0; /* already enabled */ - get_msr(cpu, MSR_PM_ENABLE, &msr); - put_msr(cpu, MSR_PM_ENABLE, 1); + new_msr = old_msr | 1; + put_msr(cpu, MSR_PM_ENABLE, new_msr); if (verbose) - printf("cpu%d: MSR_PM_ENABLE old: %d new: %d\n", cpu, (unsigned int) msr, 1); + printf("cpu%d: MSR_PM_ENABLE old: %llX new: %llX\n", cpu, old_msr, new_msr); return 0; } -int update_cpu_msrs(int cpu) +int update_cpu_epb_sysfs(int cpu) { - unsigned long long msr; int epb; - if (update_epb) { - epb = get_epb(cpu); - set_epb(cpu, new_epb); + epb = get_epb_sysfs(cpu); +
set_epb_sysfs(cpu, new_epb); - if (verbose) - printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", - cpu, epb, (unsigned int) new_epb); - } + if (verbose) + printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", + cpu, epb, (unsigned int) new_epb); + + return 0; +} + +int update_cpu_msrs(int cpu) +{ + unsigned long long msr; if (update_turbo) { int turbo_is_present_and_disabled; @@ -1224,7 +1238,7 @@ int update_cpu_msrs(int cpu) if (!hwp_update_enabled()) return 0; - update_hwp_request(cpu); + update_hwp_request_msr(cpu); return 0; } @@ -1312,6 +1326,17 @@ void for_all_cpus_in_set(size_t set_size, cpu_set_t *cpu_set, int (func)(int)) if (CPU_ISSET_S(cpu_num, set_size, cpu_set)) func(cpu_num); } +int for_all_cpus_in_set_and(size_t set_size, cpu_set_t *cpu_set, int (func)(int)) +{ + int cpu_num; + int retval = 1; + + for (cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) + if (CPU_ISSET_S(cpu_num, set_size, cpu_set)) + retval &= func(cpu_num); + + return retval; +} void init_data_structures(void) { @@ -1326,21 +1351,38 @@ void init_data_structures(void) for_all_proc_cpus(mark_cpu_present); } -/* clear has_hwp if it is not enable (or being enabled) */ +int is_hwp_enabled_on_cpu(int cpu_num) +{ + unsigned long long msr; + int retval; + + /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ + get_msr(cpu_num, MSR_PM_ENABLE, &msr); + retval = (msr & 1); + + if (verbose) + fprintf(stderr, "cpu%d: %sHWP\n", cpu_num, retval ? "" : "No-"); + + return retval; +} +/* + * verify_hwp_is_enabled() + * + * Set (has_hwp=0) if no HWP feature or any of selected CPU set does not have HWP enabled + */ void verify_hwp_is_enabled(void) { - unsigned long long msr; + int retval; if (!has_hwp) /* set in early_cpuid() */ return; - /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ - get_msr(base_cpu, MSR_PM_ENABLE, &msr); - if ((msr & 1) == 0) { + retval = for_all_cpus_in_set_and(cpu_setsize, cpu_selected_set, is_hwp_enabled_on_cpu); + + if (retval == 0) { fprintf(stderr, "HWP can be enabled using '--hwp-enable'\n"); has_hwp = 0; - return; } } @@ -1551,10 +1593,13 @@ int main(int argc, char **argv) /* update CPU set */ if (cpu_selected_set) { + if (update_epb) + for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_cpu_epb_sysfs); for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_sysfs); for_all_cpus_in_set(cpu_setsize, cpu_selected_set, update_cpu_msrs); + } else if (pkg_selected_set) - for_packages(pkg_selected_set, update_hwp_request_pkg); + for_packages(pkg_selected_set, update_hwp_request_pkg_msr); return 0; } diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 00000000000000..4366fb3c91ce8a --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. 
emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
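+ *
+ * Illustrative usage (loop bodies and helpers are hypothetical):
+ *
+ *	while (can_loop)
+ *		total += consume_one();
+ *
+ *	bpf_repeat(BPF_MAX_LOOPS) {
+ *		if (try_work())
+ *			break;
+ *		cond_break;
+ *	}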
+ */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 00000000000000..10141db0b59d02 --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. 
*/ +#include + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d4e21558e98269..06e2551033cb19 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include #include #include -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; @@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) \ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -229,6 +248,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. 
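*
* Example: given `struct task_ctx *taskc` (an illustrative type),
* `u64 *vt = MEMBER_VPTR(*taskc, .vtime);` yields a pointer the verifier
* accepts, or NULL if the offset were to fall outside the object.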
*/ +#ifndef MEMBER_VPTR #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ({ \ u64 __base = (u64)&(base); \ @@ -245,6 +265,7 @@ BPF_PROG(name, ##args) [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) +#endif /* MEMBER_VPTR */ /** * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element @@ -260,6 +281,7 @@ BPF_PROG(name, ##args) * size of the array to compute the max, which will result in rejection by * the verifier. */ +#ifndef ARRAY_ELEM_PTR #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ({ \ u64 __base = (u64)arr; \ @@ -274,7 +296,7 @@ BPF_PROG(name, ##args) [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ __addr; \ }) - +#endif /* ARRAY_ELEM_PTR */ /* * BPF declarations and helpers @@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) */ static inline bool is_migration_disabled(const struct task_struct *p) { - if (bpf_core_field_exists(p->migration_disabled)) - return p->migration_disabled; + /* + * Testing p->migration_disabled in a BPF code is tricky because the + * migration is _always_ disabled while running the BPF code. + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF + * code execution disable and re-enable the migration of the current + * task, respectively. So, the _current_ task of the sched_ext ops is + * always migration-disabled. Moreover, p->migration_disabled could be + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is + * executed in the middle of the other BPF code execution. + * + * Therefore, we should decide that the _current_ task is + * migration-disabled only when its migration_disabled count is greater + * than one. In other words, when p->migration_disabled == 1, there is + * an ambiguity, so we should check if @p is the current task or not. + */ + if (bpf_core_field_exists(p->migration_disabled)) { + if (p->migration_disabled == 1) + return bpf_get_current_task_btf() != p; + else + return p->migration_disabled; + } return false; } @@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before) */ static inline bool time_after(u64 a, u64 b) { - return (s64)(b - a) < 0; + return (s64)(b - a) < 0; } /** @@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b) */ static inline bool time_after_eq(u64 a, u64 b) { - return (s64)(a - b) >= 0; + return (s64)(a - b) >= 0; } /** @@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c) */ /* useful compiler attributes */ +#ifndef likely #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) +#endif /* * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They @@ -632,6 +679,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +/* + * __calc_avg - Calculate exponential weighted moving average (EWMA) with + * @old and @new values. @decay represents how large the @old value remains. + * With a larger @decay value, the moving average changes slowly, exhibiting + * fewer fluctuations. 
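+ *
+ * Editor's note: in the steady state (old and new both >= 1 << decay),
+ * the macro below computes
+ *
+ *	ret = old - (old >> decay) + (new >> decay)
+ *
+ * i.e. the old average keeps a (1 - 2^-decay) weight. A quick check
+ * with made-up numbers: decay = 2, old = 1000, new = 2000 gives
+ * 1000 - 250 + 500 = 1250. When either value drops below 1 << decay,
+ * it falls back to halving (an effective decay of 1), with a special
+ * case so the average can reach exactly zero.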
+ */ +#define __calc_avg(old, new, decay) ({ \ + typeof(decay) thr = 1 << (decay); \ + typeof(old) ret; \ + if (((old) < thr) || ((new) < thr)) { \ + if (((old) == 1) && ((new) == 0)) \ + ret = 0; \ + else \ + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ + } else { \ + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ + } \ + ret; \ +}) + /* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. @@ -662,6 +729,25 @@ static inline u32 log2_u64(u64 v) return log2_u32(v) + 1; } +/* + * sqrt_u64 - Calculate the square root of value @x using Newton's method. + */ +static inline u64 __sqrt_u64(u64 x) +{ + if (x == 0 || x == 1) + return x; + + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); + + for (int i = 0; i < 8; ++i) { + u64 q = x / r; + if (r <= q) + break; + r = (r + q) >> 1; + } + return r; +} + /* * Return a value proportionally scaled to the task's weight. */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 1dc76bd8429660..b3c6372bcf810b 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -75,8 +75,9 @@ typedef int64_t s64; #include "enums.h" /* not available when building kernel tools/sched_ext */ -#if __has_include() -#include +#if __has_include() +#include "bpf_arena_common.h" +#include #endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 9252e1a00556f5..dd9144624dc99e 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ + (bpf_ksym_exists(bpf_cpumask_populate) ? \ + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") @@ -225,6 +230,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ scx_bpf_pick_any_cpu(cpus_allowed, flags)) +/* + * v6.18: Add a helper to retrieve the current task running on a CPU. + * + * Keep this helper available until v6.20 for compatibility. + */ +static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) +{ + struct rq *rq; + + if (bpf_ksym_exists(scx_bpf_cpu_curr)) + return scx_bpf_cpu_curr(cpu); + + rq = scx_bpf_cpu_rq(cpu); + + return rq ? rq->curr : NULL; +} + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. 
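 *
 * Editor's note on __COMPAT_scx_bpf_cpu_curr() above: the returned
 * task is only valid under RCU, so a hypothetical caller (cpu and the
 * printk are made up) would look like:
 *
 *	struct task_struct *p;
 *
 *	bpf_rcu_read_lock();
 *	p = __COMPAT_scx_bpf_cpu_curr(cpu);
 *	if (p)
 *		bpf_printk("cpu%d runs %s", cpu, p->comm);
 *	bpf_rcu_read_unlock();
 *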
See compat.h::SCX_OPS_LOAD/ATTACH(). diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 00000000000000..e7ac6611a99014 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7e2..399697fa372fb1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include #include +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 
00000000000000..2d0981aedd8981 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167a1..55df8b7988657b 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A central FIFO sched_ext scheduler which demonstrates the followings: + * A central FIFO sched_ext scheduler which demonstrates the following: * * a. Making all scheduling decisions from one CPU: * diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 6ba6e610eeaa03..55931a4cd71c7c 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -61,6 +61,7 @@ int main(int argc, char **argv) skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index fdc7170639e604..2c720e3ecad593 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6dd423eeb4ff98..cd85eb4011793c 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -137,6 +138,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 69d877501cb727..3072b593f89816 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile bool highpri_boosting; -const volatile bool print_shared_dsq; +const volatile bool print_dsqs_and_events; +const volatile bool print_msgs; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -56,7 +57,8 @@ struct qmap { queue1 SEC(".maps"), queue2 SEC(".maps"), queue3 SEC(".maps"), - queue4 SEC(".maps"); + queue4 SEC(".maps"), + dump_store 
SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); @@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) return; scx_bpf_dump("QMAP FIFO[%d]:", i); + + /* + * Dump can be invoked anytime and there is no way to iterate in + * a non-destructive way. Pop and store in dump_store and then + * restore afterwards. If racing against new enqueues, ordering + * can get mixed up. + */ bpf_repeat(4096) { if (bpf_map_pop_elem(fifo, &pid)) break; + bpf_map_push_elem(&dump_store, &pid, 0); scx_bpf_dump(" %d", pid); } + + bpf_repeat(4096) { + if (bpf_map_pop_elem(&dump_store, &pid)) + break; + bpf_map_push_elem(fifo, &pid, 0); + } + scx_bpf_dump("\n"); } } @@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) { - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", - cgrp->kn->id, args->weight, args->bw_period_us, - args->bw_quota_us, args->bw_burst_us); + if (print_msgs) + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); return 0; } void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) { - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); + if (print_msgs) + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); } void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) { - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, - period_us, quota_us, burst_us); + if (print_msgs) + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", + cgrp->kn->id, period_us, quota_us, burst_us); } /* @@ -676,16 +696,20 @@ static void print_cpus(void) void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); + } } void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); + } } struct monitor_timer { @@ -783,35 +807,36 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { - struct scx_event_stats events; - bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); monitor_cpuperf(); - if (print_shared_dsq) + if (print_dsqs_and_events) { + struct scx_event_stats events; + dump_shared_dsq(); - __COMPAT_scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - 
bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; @@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - print_cpus(); + if (print_msgs) + print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c4912ab2e76f21..ef701d45ba4358 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,7 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" -" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -P Print out DSQ content and event counters to trace_pipe every second\n" +" -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -66,7 +67,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -87,7 +88,10 @@ int main(int argc, char **argv) skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; case 'P': - skel->rodata->print_shared_dsq = true; + skel->rodata->print_dsqs_and_events = true; + break; + case 'M': + skel->rodata->print_msgs = true; break; case 'H': skel->rodata->highpri_boosting = true; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 76d83199545cb2..06d4b13bf76bcc 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ static void sigint_handler(int simple) static void 
read_stats(struct scx_simple *skel, __u64 *stats) { int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); __u64 cnts[2][nr_cpus]; __u32 idx; diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 002ec38a8bbbf1..3b96d090c5ebe7 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -17,6 +17,8 @@ #include #include +#include + #include "../../kselftest.h" #define TESTS_PER_HWCAP 3 @@ -55,7 +57,6 @@ static void cmpbr_sigill(void) /* Not implemented, too complicated and unreliable anyway */ } - static void crc32_sigill(void) { /* CRC32W W0, W0, W1 */ @@ -169,6 +170,18 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void lsfe_sigill(void) +{ + float __attribute__ ((aligned (16))) mem; + register float *memp asm ("x0") = &mem; + + /* STFADD H0, [X0] */ + asm volatile(".inst 0x7c20801f" + : "+r" (memp) + : + : "memory"); +} + static void lut_sigill(void) { /* LUTI2 V0.16B, { V0.16B }, V[0] */ @@ -762,6 +775,13 @@ static const struct hwcap_data { .cpuinfo = "lse128", .sigill_fn = lse128_sigill, }, + { + .name = "LSFE", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LSFE, + .cpuinfo = "lsfe", + .sigill_fn = lsfe_sigill, + }, { .name = "LUT", .at_hwcap = AT_HWCAP2, diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index f58a9f89b952c4..1703543fb7c761 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -182,16 +182,16 @@ static int write_clone_read(void) } for (;;) { - waiting = wait4(ret, &status, __WCLONE, NULL); + waiting = waitpid(ret, &status, __WCLONE); if (waiting < 0) { if (errno == EINTR) continue; - ksft_print_msg("wait4() failed: %d\n", errno); + ksft_print_msg("waitpid() failed: %d\n", errno); return 0; } if (waiting != ret) { - ksft_print_msg("wait4() returned wrong PID %d\n", + ksft_print_msg("waitpid() returned wrong PID %d\n", waiting); return 0; } @@ -227,10 +227,10 @@ int main(int argc, char **argv) ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { ksft_test_result(default_value(), "default_value\n"); - ksft_test_result(write_read, "write_read\n"); - ksft_test_result(write_sleep_read, "write_sleep_read\n"); - ksft_test_result(write_fork_read, "write_fork_read\n"); - ksft_test_result(write_clone_read, "write_clone_read\n"); + ksft_test_result(write_read(), "write_read\n"); + ksft_test_result(write_sleep_read(), "write_sleep_read\n"); + ksft_test_result(write_fork_read(), "write_fork_read\n"); + ksft_test_result(write_clone_read(), "write_clone_read\n"); } else { ksft_print_msg("SME support not present\n"); diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h index 04e7b72880ef99..141cdcbf0b8fac 100644 --- a/tools/testing/selftests/arm64/bti/assembler.h +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -14,7 +14,6 @@ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) - .macro startfn name:req .globl \name \name: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index cdd7a45c045d5f..a85c19e9524e1d 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1568,7 +1568,6 @@ static void run_sve_tests(void) &test_config); } } - } static void run_sme_tests(void) diff --git 
a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 74e23208b94cab..9349aa630c8419 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program) /* * Read from the startup pipe, there should be no data - * and we should block until it is closed. We just - * carry on on error since this isn't super critical. + * and we should block until it is closed. We just + * carry-on on error since this isn't super critical. */ ret = read(3, &i, sizeof(i)); if (ret < 0) @@ -549,7 +549,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < cpus; i++) { diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e3cec3723ffa96..0c40007d128214 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -188,13 +188,13 @@ static bool create_socket(void) ref = malloc(digest_len); if (!ref) { - printf("Failed to allocated %d byte reference\n", digest_len); + printf("Failed to allocate %d byte reference\n", digest_len); return false; } digest = malloc(digest_len); if (!digest) { - printf("Failed to allocated %d byte digest\n", digest_len); + printf("Failed to allocate %d byte digest\n", digest_len); return false; } diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b22303778fb0ae..e0fc3a001e2830 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = { }; #define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4) -#define FLAG_TESTS 2 +#define FLAG_TESTS 4 #define FPSIMD_TESTS 2 #define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types)) @@ -95,19 +95,27 @@ static int do_child(void) static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_GETREGSET)"); + return ret; } static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, @@ -115,8 +123,9 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, { struct user_sve_header *sve; void *p; - size_t sz = sizeof *sve; + size_t sz = sizeof(*sve); struct iovec iov; + int ret; while (1) { if (*size < sz) { @@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, iov.iov_base = *buf; iov.iov_len = sz; - if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov)) + ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov); + if (ret) { + ksft_perror("ptrace(PTRACE_GETREGSET)"); goto error; + } sve = *buf; if (sve->size <= sz) @@ -152,10 
+164,46 @@ static int set_sve(pid_t pid, const struct vec_type *type, const struct user_sve_header *sve) { struct iovec iov; + int ret; iov.iov_base = (void *)sve; iov.iov_len = sve->size; - return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; +} + +/* A read operation fails */ +static void read_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header *new_sve = NULL; + size_t new_sve_size = 0; + void *ret; + + ret = get_sve(child, type, (void **)&new_sve, &new_sve_size); + + ksft_test_result(ret == NULL, "%s unsupported read fails\n", + type->name); + + free(new_sve); +} + +/* A write operation fails */ +static void write_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + /* Just the header, no data */ + memset(&sve, 0, sizeof(sve)); + sve.size = sizeof(sve); + sve.flags = SVE_PT_REGS_SVE; + sve.vl = SVE_VL_MIN; + ret = set_sve(child, type, &sve); + + ksft_test_result(ret != 0, "%s unsupported write fails\n", + type->name); } /* Validate setting and getting the inherit flag */ @@ -270,6 +318,25 @@ static void check_u32(unsigned int vl, const char *reg, } } +/* Set out of range VLs */ +static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + memset(&sve, 0, sizeof(sve)); + sve.flags = SVE_PT_REGS_SVE; + sve.size = sizeof(sve); + + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name); + + sve.vl = SVE_VL_MAX + SVE_VQ_BYTES; + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name, + SVE_VL_MAX + SVE_VQ_BYTES); +} + /* Access the FPSIMD registers via the SVE regset */ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) { @@ -683,6 +750,20 @@ static int do_parent(pid_t child) } for (i = 0; i < ARRAY_SIZE(vec_types); i++) { + /* + * If the vector type isn't supported reads and writes + * should fail. + */ + if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) { + read_fails(child, &vec_types[i]); + write_fails(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s unsupported read fails\n", + vec_types[i].name); + ksft_test_result_skip("%s unsupported write fails\n", + vec_types[i].name); + } + /* FPSIMD via SVE regset */ if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { ptrace_sve_fpsimd(child, &vec_types[i]); @@ -703,6 +784,17 @@ static int do_parent(pid_t child) vec_types[i].name); } + /* Setting out of bounds VLs should fail */ + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_vl_ranges(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s Set invalid VL 0\n", + vec_types[i].name); + ksft_test_result_skip("%s Set invalid VL %d\n", + vec_types[i].name, + SVE_VL_MAX + SVE_VQ_BYTES); + } + /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index ea9c7d47790f9d..2d75d342eeb9d4 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -690,7 +690,6 @@ static inline void smstop(void) asm volatile("msr S0_3_C4_C6_3, xzr"); } - /* * Verify we can change the SVE vector length while SME is active and * continue to use SME afterwards. 
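Editor's aside: the get/set helpers touched above all follow the same regset convention: the NT_* constant selects the register set and a struct iovec describes the buffer, with the kernel trimming iov_len to the bytes actually written. A self-contained sketch of the read side, with assumed includes and not taken from the patch:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>
	#include <linux/elf.h>		/* NT_PRFPREG */
	#include <asm/ptrace.h>		/* struct user_fpsimd_state (arm64) */

	static int read_fpsimd(pid_t pid, struct user_fpsimd_state *st)
	{
		struct iovec iov = { .iov_base = st, .iov_len = sizeof(*st) };

		/* on success the kernel shrinks iov.iov_len to what it wrote */
		if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) == -1) {
			perror("PTRACE_GETREGSET");
			return -1;
		}
		return 0;
	}
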
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 584b8d59b7ea1c..a7f34040fbf10b 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES]) return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov); } - static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES]) { struct iovec iov; diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d2f3497a9103fc..1fbbf0ca1f0291 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -14,11 +14,11 @@ LDLIBS+=-lpthread include ../../lib.mk $(OUTPUT)/basic-gcs: basic-gcs.c - $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - -static -include ../../../../include/nolibc/nolibc.h \ + $(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \ + -static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \ -I../../../../../usr/include \ -std=gnu99 -I../.. -g \ - -ffreestanding -Wall $^ -o $@ -lgcc + -ffreestanding $^ -o $@ -lgcc $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 54f9c888249d74..250977abc39886 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -10,6 +10,7 @@ #include #include +#include #include #include "kselftest.h" @@ -386,14 +387,13 @@ int main(void) ksft_print_header(); - /* - * We don't have getauxval() with nolibc so treat a failure to - * read GCS state as a lack of support and skip. 
- */ + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_skip("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; @@ -410,7 +410,7 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) ksft_print_msg("Failed to disable GCS: %d\n", ret); diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c index 989f75a491b757..1e6abb136ffd07 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-locking.c +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others) ASSERT_EQ(ret, 0); ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); - ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, variant->mode); ASSERT_EQ(ret, 0); diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bbc7f4950c13ed..cf316d78ea97c7 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -433,7 +433,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < gcs_threads; i++) diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c index 4435600ca400dd..e597861b26d6bf 100644 --- a/tools/testing/selftests/arm64/pauth/exec_target.c +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -13,7 +13,12 @@ int main(void) unsigned long hwcaps; size_t val; - fread(&val, sizeof(size_t), 1, stdin); + size_t size = fread(&val, sizeof(size_t), 1, stdin); + + if (size != 1) { + fprintf(stderr, "Could not read input from stdin\n"); + return EXIT_FAILURE; + } /* don't try to execute illegal (unimplemented) instructions) caller * should have checked this and keep worker simple diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3d8378972d26cc..be1ee7ba7ce031 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -44,3 +44,4 @@ xdp_redirect_multi xdp_synproxy xdp_hw_metadata xdp_features +verification_cert.h diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 3ebd77206f98f9..a17baf8c6fd75a 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -2,4 +2,3 @@ # Alphabetical order get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) 
-verifier_iterating_callbacks diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4863106034dfbc..f00587d4ede68e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -120,7 +120,7 @@ TEST_PROGS_EXTENDED := \ test_bpftool.py TEST_KMODS := bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS)) # Compile but not part of 'make run_tests' @@ -137,7 +137,7 @@ TEST_GEN_PROGS_EXTENDED = \ xdping \ xskxceiver -TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi +TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi ifneq ($(V),1) submake_extras := feature_display=0 @@ -398,7 +398,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -# vmlinux.h is first dumped to a temprorary file and then compared to +# vmlinux.h is first dumped to a temporary file and then compared to # the previous version. This helps to avoid unnecessary re-builds of # $(TRUNNER_BPF_OBJS) $(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) @@ -496,15 +496,16 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h -LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ - trace_printk.c trace_vprintk.c map_ptr_kern.c \ +LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c +LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c + # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ kfunc_call_test_subprog.c -SKEL_BLACKLIST += $$(LSKELS) +SKEL_BLACKLIST += $$(LSKELS) $$(LSKELS_SIGNED) test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o linked_funcs.skel.h-deps := linked_funcs1.bpf.o linked_funcs2.bpf.o @@ -535,6 +536,7 @@ HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER +LSKEL_SIGN := -S -k $(PRIVATE_KEY) -i $(VERIFICATION_CERT) TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 TRUNNER_BINARY := $1$(if $2,-)$2 TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \ @@ -550,6 +552,7 @@ TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_LSKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS) $$(LSKELS_EXTRA)) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) +TRUNNER_BPF_LSKELS_SIGNED := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.lskel.h, $$(LSKELS_SIGNED)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -604,6 +607,15 @@ $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) +$(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< + $(Q)$$(BPFTOOL) gen object 
$$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) + $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ + $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites @@ -653,6 +665,7 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_SKELS) \ $(TRUNNER_BPF_LSKELS) \ + $(TRUNNER_BPF_LSKELS_SIGNED) \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) @@ -667,6 +680,7 @@ $(foreach N,$(patsubst $(TRUNNER_OUTPUT)/%.o,%,$(TRUNNER_EXTRA_OBJS)), \ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ $(TRUNNER_EXTRA_HDRS) \ + $(VERIFY_SIG_HDR) \ $(TRUNNER_TESTS_HDR) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) @@ -697,6 +711,18 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ endef +VERIFY_SIG_SETUP := $(CURDIR)/verify_sig_setup.sh +VERIFY_SIG_HDR := verification_cert.h +VERIFICATION_CERT := $(BUILD_DIR)/signing_key.der +PRIVATE_KEY := $(BUILD_DIR)/signing_key.pem + +$(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) + $(Q)mkdir -p $(BUILD_DIR) + $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) + +$(VERIFY_SIG_HDR): $(VERIFICATION_CERT) + $(Q)xxd -i -n test_progs_verification_cert $< > $@ + # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -716,6 +742,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ disasm.c \ disasm_helpers.c \ json_writer.c \ + $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ ip_check_defrag_frags.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ @@ -725,7 +752,7 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/uprobe_multi \ $(TEST_KMOD_TARGETS) \ ima_setup.sh \ - verify_sig_setup.sh \ + $(VERIFY_SIG_SETUP) \ $(wildcard progs/btf_dump_test_case_*.c) \ $(wildcard progs/*.bpf.o) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE @@ -816,6 +843,7 @@ $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h +$(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -837,6 +865,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_htab_mem.o \ $(OUTPUT)/bench_bpf_crypto.o \ $(OUTPUT)/bench_sockmap.o \ + $(OUTPUT)/bench_lpm_trie_map.o \ # $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index ddd73d06a1eb27..bd29bb2e6cb529 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -284,6 +284,7 @@ extern struct argp bench_htab_mem_argp; extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; extern struct argp bench_sockmap_argp; +extern struct argp bench_lpm_trie_map_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -299,6 +300,7 @@ static const struct argp_child bench_parsers[] = { { &bench_trigger_batch_argp, 0, "BPF 
triggering benchmark", 0 }, { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, + { &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 }, {}, }; @@ -499,7 +501,7 @@ extern const struct bench bench_rename_rawtp; extern const struct bench bench_rename_fentry; extern const struct bench bench_rename_fexit; -/* pure counting benchmarks to establish theoretical lmits */ +/* pure counting benchmarks to establish theoretical limits */ extern const struct bench bench_trig_usermode_count; extern const struct bench bench_trig_syscall_count; extern const struct bench bench_trig_kernel_count; @@ -510,6 +512,8 @@ extern const struct bench bench_trig_kretprobe; extern const struct bench bench_trig_kprobe_multi; extern const struct bench bench_trig_kretprobe_multi; extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_kprobe_multi_all; +extern const struct bench bench_trig_kretprobe_multi_all; extern const struct bench bench_trig_fexit; extern const struct bench bench_trig_fmodret; extern const struct bench bench_trig_tp; @@ -558,6 +562,13 @@ extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; +extern const struct bench bench_lpm_trie_noop; +extern const struct bench bench_lpm_trie_baseline; +extern const struct bench bench_lpm_trie_lookup; +extern const struct bench bench_lpm_trie_insert; +extern const struct bench bench_lpm_trie_update; +extern const struct bench bench_lpm_trie_delete; +extern const struct bench bench_lpm_trie_free; static const struct bench *benchs[] = { &bench_count_global, @@ -578,6 +589,8 @@ static const struct bench *benchs[] = { &bench_trig_kprobe_multi, &bench_trig_kretprobe_multi, &bench_trig_fentry, + &bench_trig_kprobe_multi_all, + &bench_trig_kretprobe_multi_all, &bench_trig_fexit, &bench_trig_fmodret, &bench_trig_tp, @@ -625,6 +638,13 @@ static const struct bench *benchs[] = { &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, + &bench_lpm_trie_noop, + &bench_lpm_trie_baseline, + &bench_lpm_trie_lookup, + &bench_lpm_trie_insert, + &bench_lpm_trie_update, + &bench_lpm_trie_delete, + &bench_lpm_trie_free, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 005c401b3e2275..bea323820ffb88 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -46,6 +46,7 @@ struct bench_res { unsigned long gp_ns; unsigned long gp_ct; unsigned int stime; + unsigned long duration_ns; }; struct bench { diff --git a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c new file mode 100644 index 00000000000000..246f6cb3387dda --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +/* + * All of these benchmarks operate on tries with keys in the range + * [0, args.nr_entries), i.e. there are no gaps or partially filled + * branches of the trie for any key < args.nr_entries. + * + * This gives an idea of worst-case behaviour. 
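+ *
+ * Editor's note (illustration only, not part of the patch): with
+ * --nr_entries=4 and --prefix_len=32 the setup code below inserts
+ * exactly the keys
+ *
+ *	{ .prefixlen = 32, .data = 0 }
+ *	{ .prefixlen = 32, .data = 1 }
+ *	{ .prefixlen = 32, .data = 2 }
+ *	{ .prefixlen = 32, .data = 3 }
+ *
+ * so every key below nr_entries sits in a fully-populated subtree and
+ * each lookup walks the maximum depth for that prefix length.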
+ */ + +#include +#include +#include +#include "lpm_trie_bench.skel.h" +#include "lpm_trie_map.skel.h" +#include "bench.h" +#include "testing_helpers.h" +#include "progs/lpm_trie.h" + +static struct ctx { + struct lpm_trie_bench *bench; +} ctx; + +static struct { + __u32 nr_entries; + __u32 prefixlen; + bool random; +} args = { + .nr_entries = 0, + .prefixlen = 32, + .random = false, +}; + +enum { + ARG_NR_ENTRIES = 9000, + ARG_PREFIX_LEN, + ARG_RANDOM, +}; + +static const struct argp_option opts[] = { + { "nr_entries", ARG_NR_ENTRIES, "NR_ENTRIES", 0, + "Number of unique entries in the LPM trie" }, + { "prefix_len", ARG_PREFIX_LEN, "PREFIX_LEN", 0, + "Number of prefix bits to use in the LPM trie" }, + { "random", ARG_RANDOM, NULL, 0, "Access random keys during op" }, + {}, +}; + +static error_t lpm_parse_arg(int key, char *arg, struct argp_state *state) +{ + long ret; + + switch (key) { + case ARG_NR_ENTRIES: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid nr_entries count."); + argp_usage(state); + } + args.nr_entries = ret; + break; + case ARG_PREFIX_LEN: + ret = strtol(arg, NULL, 10); + if (ret < 1 || ret > UINT_MAX) { + fprintf(stderr, "Invalid prefix_len value."); + argp_usage(state); + } + args.prefixlen = ret; + break; + case ARG_RANDOM: + args.random = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +const struct argp bench_lpm_trie_map_argp = { + .options = opts, + .parser = lpm_parse_arg, +}; + +static void validate_common(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumer\n"); + exit(1); + } + + if (args.nr_entries == 0) { + fprintf(stderr, "Missing --nr_entries parameter\n"); + exit(1); + } + + if ((1UL << args.prefixlen) < args.nr_entries) { + fprintf(stderr, "prefix_len value too small for nr_entries\n"); + exit(1); + } +} + +static void lpm_insert_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-insert requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-insert does not support --random\n"); + exit(1); + } +} + +static void lpm_delete_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-delete requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-delete does not support --random\n"); + exit(1); + } +} + +static void lpm_free_validate(void) +{ + validate_common(); + + if (env.producer_cnt != 1) { + fprintf(stderr, "lpm-trie-free requires a single producer\n"); + exit(1); + } + + if (args.random) { + fprintf(stderr, "lpm-trie-free does not support --random\n"); + exit(1); + } +} + +static struct trie_key *keys; +static __u32 *vals; + +static void fill_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_update_batch(map_fd, keys, vals, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch update keys to map: %d\n", + -err); + exit(1); + } +} + +static void empty_map(int map_fd) +{ + int err; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + err = bpf_map_delete_batch(map_fd, keys, &args.nr_entries, &opts); + if (err) { + fprintf(stderr, "failed to batch delete keys for map: %d\n", + -err); + exit(1); + } +} + +static void attach_prog(void) +{ + int i; + + ctx.bench = lpm_trie_bench__open_and_load(); + if (!ctx.bench) { + fprintf(stderr, 
"failed to open skeleton\n"); + exit(1); + } + + ctx.bench->bss->nr_entries = args.nr_entries; + ctx.bench->bss->prefixlen = args.prefixlen; + ctx.bench->bss->random = args.random; + + if (lpm_trie_bench__attach(ctx.bench)) { + fprintf(stderr, "failed to attach skeleton\n"); + exit(1); + } + + keys = calloc(args.nr_entries, sizeof(*keys)); + vals = calloc(args.nr_entries, sizeof(*vals)); + + for (i = 0; i < args.nr_entries; i++) { + struct trie_key *k = &keys[i]; + __u32 *v = &vals[i]; + + k->prefixlen = args.prefixlen; + k->data = i; + *v = 1; + } +} + +static void attach_prog_and_fill_map(void) +{ + int fd; + + attach_prog(); + + fd = bpf_map__fd(ctx.bench->maps.trie_map); + fill_map(fd); +} + +static void lpm_noop_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_NOOP; +} + +static void lpm_baseline_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_BASELINE; +} + +static void lpm_lookup_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_LOOKUP; +} + +static void lpm_insert_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_INSERT; +} + +static void lpm_update_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_UPDATE; +} + +static void lpm_delete_setup(void) +{ + attach_prog_and_fill_map(); + ctx.bench->bss->op = LPM_OP_DELETE; +} + +static void lpm_free_setup(void) +{ + attach_prog(); + ctx.bench->bss->op = LPM_OP_FREE; +} + +static void lpm_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.bench->bss->hits, 0); + res->duration_ns = atomic_swap(&ctx.bench->bss->duration_ns, 0); +} + +static void bench_reinit_map(void) +{ + int fd = bpf_map__fd(ctx.bench->maps.trie_map); + + switch (ctx.bench->bss->op) { + case LPM_OP_INSERT: + /* trie_map needs to be emptied */ + empty_map(fd); + break; + case LPM_OP_DELETE: + /* trie_map needs to be refilled */ + fill_map(fd); + break; + default: + fprintf(stderr, "Unexpected REINIT return code for op %d\n", + ctx.bench->bss->op); + exit(1); + } +} + +/* For NOOP, BASELINE, LOOKUP, INSERT, UPDATE, and DELETE */ +static void *lpm_producer(void *unused __always_unused) +{ + int err; + char in[ETH_HLEN]; /* unused */ + + LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = in, + .data_size_in = sizeof(in), .repeat = 1, ); + + while (true) { + int fd = bpf_program__fd(ctx.bench->progs.run_bench); + err = bpf_prog_test_run_opts(fd, &opts); + if (err) { + fprintf(stderr, "failed to run BPF prog: %d\n", err); + exit(1); + } + + /* Check for kernel error code */ + if ((int)opts.retval < 0) { + fprintf(stderr, "BPF prog returned error: %d\n", + opts.retval); + exit(1); + } + + switch (opts.retval) { + case LPM_BENCH_SUCCESS: + break; + case LPM_BENCH_REINIT_MAP: + bench_reinit_map(); + break; + default: + fprintf(stderr, "Unexpected BPF prog return code %d for op %d\n", + opts.retval, ctx.bench->bss->op); + exit(1); + } + } + + return NULL; +} + +static void *lpm_free_producer(void *unused __always_unused) +{ + while (true) { + struct lpm_trie_map *skel; + + skel = lpm_trie_map__open_and_load(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + fill_map(bpf_map__fd(skel->maps.trie_free_map)); + lpm_trie_map__destroy(skel); + } + + return NULL; +} + +/* + * The standard bench op_report_*() functions assume measurements are + * taken over a 1-second interval but operations that modify the map + * (INSERT, DELETE, and FREE) cannot run indefinitely without + * "resetting" the map to the initial state. 
Depending on the size of + * the map, this likely needs to happen before the 1-second timer fires. + * + * Calculate the fraction of a second over which the op measurement was + * taken (to ignore any time spent doing the reset) and report the + * throughput results per second. + */ +static void frac_second_report_progress(int iter, struct bench_res *res, + long delta_ns, double rate_divisor, + char rate) +{ + double hits_per_sec, hits_per_prod; + + hits_per_sec = res->hits / rate_divisor / + (res->duration_ns / (double)NSEC_PER_SEC); + hits_per_prod = hits_per_sec / env.producer_cnt; + + printf("Iter %3d (%7.3lfus): ", iter, + (delta_ns - NSEC_PER_SEC) / 1000.0); + printf("hits %8.3lf%c/s (%7.3lf%c/prod)\n", hits_per_sec, rate, + hits_per_prod, rate); +} + +static void frac_second_report_final(struct bench_res res[], int res_cnt, + double lat_divisor, double rate_divisor, + char rate, const char *unit) +{ + double hits_mean = 0.0, hits_stddev = 0.0; + double latency = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) { + double val = res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_mean += val / (0.0 + res_cnt); + latency += res[i].duration_ns / res[i].hits / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + double val = + res[i].hits / rate_divisor / + (res[i].duration_ns / (double)NSEC_PER_SEC); + hits_stddev += (hits_mean - val) * (hits_mean - val) / + (res_cnt - 1.0); + } + + hits_stddev = sqrt(hits_stddev); + } + printf("Summary: throughput %8.3lf \u00B1 %5.3lf %c ops/s (%7.3lf%c ops/prod), ", + hits_mean, hits_stddev, rate, hits_mean / env.producer_cnt, + rate); + printf("latency %8.3lf %s/op\n", + latency / lat_divisor / env.producer_cnt, unit); +} + +static void insert_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void delete_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000000.0; + char rate = 'M'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void free_ops_report_progress(int iter, struct bench_res *res, + long delta_ns) +{ + double rate_divisor = 1000.0; + char rate = 'K'; + + frac_second_report_progress(iter, res, delta_ns, rate_divisor, rate); +} + +static void insert_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void delete_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1.0; + double rate_divisor = 1000000.0; + const char *unit = "ns"; + char rate = 'M'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +static void free_ops_report_final(struct bench_res res[], int res_cnt) +{ + double lat_divisor = 1000000.0; + double rate_divisor = 1000.0; + const char *unit = "ms"; + char rate = 'K'; + + frac_second_report_final(res, res_cnt, lat_divisor, rate_divisor, rate, + unit); +} + +/* noop bench measures harness-overhead */ +const struct bench bench_lpm_trie_noop = { + .name = "lpm-trie-noop", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_noop_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = 
ops_report_progress, + .report_final = ops_report_final, +}; + +/* baseline overhead for lookup and update */ +const struct bench bench_lpm_trie_baseline = { + .name = "lpm-trie-baseline", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_baseline_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of doing a lookup on existing entries in a full trie */ +const struct bench bench_lpm_trie_lookup = { + .name = "lpm-trie-lookup", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_lookup_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of inserting new entries into an empty trie */ +const struct bench bench_lpm_trie_insert = { + .name = "lpm-trie-insert", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_insert_validate, + .setup = lpm_insert_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = insert_ops_report_progress, + .report_final = insert_ops_report_final, +}; + +/* measure cost of updating existing entries in a full trie */ +const struct bench bench_lpm_trie_update = { + .name = "lpm-trie-update", + .argp = &bench_lpm_trie_map_argp, + .validate = validate_common, + .setup = lpm_update_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* measure cost of deleting existing entries from a full trie */ +const struct bench bench_lpm_trie_delete = { + .name = "lpm-trie-delete", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_delete_validate, + .setup = lpm_delete_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = delete_ops_report_progress, + .report_final = delete_ops_report_final, +}; + +/* measure cost of freeing a full trie */ +const struct bench bench_lpm_trie_free = { + .name = "lpm-trie-free", + .argp = &bench_lpm_trie_map_argp, + .validate = lpm_free_validate, + .setup = lpm_free_setup, + .producer_thread = lpm_free_producer, + .measure = lpm_measure, + .report_progress = free_ops_report_progress, + .report_final = free_ops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c index 8ebf563a67a2b5..cfc072aa7fff7e 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -10,6 +10,7 @@ #include #include "bench.h" #include "bench_sockmap_prog.skel.h" +#include "bpf_util.h" #define FILE_SIZE (128 * 1024) #define DATA_REPEAT_SIZE 10 @@ -124,8 +125,8 @@ static void bench_sockmap_prog_destroy(void) { int i; - for (i = 0; i < sizeof(ctx.fds); i++) { - if (ctx.fds[0] > 0) + for (i = 0; i < ARRAY_SIZE(ctx.fds); i++) { + if (ctx.fds[i] > 0) close(ctx.fds[i]); } diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 82327657846e5f..1e2aff007c2a40 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -226,6 +226,65 @@ static void trigger_fentry_setup(void) attach_bpf(ctx.skel->progs.bench_trigger_fentry); } +static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + char **syms 
= NULL; + size_t cnt = 0; + + /* Some recursive functions will be skipped in + * bpf_get_ksyms -> skip_entry, as they can introduce significant + * overhead. However, it's difficult to skip all the recursive + * functions for a debug kernel. + * + * So, don't run the kprobe-multi-all and kretprobe-multi-all on + * a debug kernel. + */ + if (bpf_get_ksyms(&syms, &cnt, true)) { + fprintf(stderr, "failed to get ksyms\n"); + exit(1); + } + + opts.syms = (const char **) syms; + opts.cnt = cnt; + opts.retprobe = kretprobe; + /* attach empty to all the kernel functions except bpf_get_numa_node_id. */ + if (!bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts)) { + fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n"); + exit(1); + } +} + +static void trigger_kprobe_multi_all_setup(void) +{ + struct bpf_program *prog, *empty; + + setup_ctx(); + empty = ctx.skel->progs.bench_kprobe_multi_empty; + prog = ctx.skel->progs.bench_trigger_kprobe_multi; + bpf_program__set_autoload(empty, true); + bpf_program__set_autoload(prog, true); + load_ctx(); + + attach_ksyms_all(empty, false); + attach_bpf(prog); +} + +static void trigger_kretprobe_multi_all_setup(void) +{ + struct bpf_program *prog, *empty; + + setup_ctx(); + empty = ctx.skel->progs.bench_kretprobe_multi_empty; + prog = ctx.skel->progs.bench_trigger_kretprobe_multi; + bpf_program__set_autoload(empty, true); + bpf_program__set_autoload(prog, true); + load_ctx(); + + attach_ksyms_all(empty, true); + attach_bpf(prog); +} + static void trigger_fexit_setup(void) { setup_ctx(); @@ -512,6 +571,8 @@ BENCH_TRIG_KERNEL(kretprobe, "kretprobe"); BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi"); BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi"); BENCH_TRIG_KERNEL(fentry, "fentry"); +BENCH_TRIG_KERNEL(kprobe_multi_all, "kprobe-multi-all"); +BENCH_TRIG_KERNEL(kretprobe_multi_all, "kretprobe-multi-all"); BENCH_TRIG_KERNEL(fexit, "fexit"); BENCH_TRIG_KERNEL(fmodret, "fmodret"); BENCH_TRIG_KERNEL(tp, "tp"); diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh index a690f5a68b6b02..f7573708a0c33f 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -6,8 +6,8 @@ def_tests=( \ usermode-count kernel-count syscall-count \ fentry fexit fmodret \ rawtp tp \ - kprobe kprobe-multi \ - kretprobe kretprobe-multi \ + kprobe kprobe-multi kprobe-multi-all \ + kretprobe kretprobe-multi kretprobe-multi-all \ ) tests=("$@") diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index da7e230f2781e1..d89eda3fd8a357 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -599,4 +599,58 @@ extern void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) __weak __ksym; extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, struct bpf_dynptr *value_p) __weak __ksym; +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 4 +#define NMI_BITS 4 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + 
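+/* + * A sketch of the layout the masks above assume (mirroring the kernel's + * preempt_count encoding; illustrative, not an authoritative definition): + * bits 0-7 count preempt disables, bits 8-15 softirqs, bits 16-19 hardirqs + * and bits 20-23 NMIs. For example, a preempt count of 0x10001 (one hardirq + * nested over one preempt-disable section) has a non-zero intersection with + * (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK), so bpf_in_interrupt() below + * reports interrupt context. + */ 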
+extern bool CONFIG_PREEMPT_RT __kconfig __weak; +#ifdef bpf_target_x86 +extern const int __preempt_count __ksym; +#endif + +struct task_struct___preempt_rt { + int softirq_disable_cnt; +} __attribute__((preserve_access_index)); + +static inline int get_preempt_count(void) +{ +#if defined(bpf_target_x86) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); +#elif defined(bpf_target_arm64) + return bpf_get_current_task_btf()->thread_info.preempt.count; +#endif + return 0; +} + +/* Description + * Report whether the program is running in interrupt context. Only works + * on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_interrupt(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK); + + tsk = (void *) bpf_get_current_task_btf(); + return (pcnt & (NMI_MASK | HARDIRQ_MASK)) | + (tsk->softirq_disable_cnt & SOFTIRQ_MASK); +} + #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 9386dfe8b8849e..794d44d19c8865 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -19,6 +19,9 @@ extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, struct bpf_dynptr *ptr__uninit) __ksym __weak; +extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, + struct bpf_dynptr *ptr__uninit) __ksym __weak; + /* Description * Obtain a read-only pointer to the dynptr's data * Returns diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 5f6963a320d732..4bc2d25f33e180 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -67,6 +67,9 @@ static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) #define sys_gettid() syscall(SYS_gettid) #endif +/* and poison usage to ensure it does not creep back in. */ +#pragma GCC poison gettid + #ifndef ENOTSUPP #define ENOTSUPP 524 #endif diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 15f6260148728d..20cede4db3cee8 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -412,6 +412,26 @@ void remove_cgroup(const char *relative_path) log_err("rmdiring cgroup %s .. %s", relative_path, cgroup_path); } +/** + * remove_cgroup_pid() - Remove a cgroup set up by the process identified by PID + * @relative_path: The cgroup path, relative to the workdir, to remove + * @pid: PID used to find the cgroup path + * + * This function expects a cgroup to already be created, relative to the cgroup + * work dir. It also expects the cgroup to have no children or live processes, + * and it removes the cgroup. + * + * On failure, it will print an error to stderr. + */ +void remove_cgroup_pid(const char *relative_path, int pid) +{ + char cgroup_path[PATH_MAX + 1]; + + format_cgroup_path_pid(cgroup_path, relative_path, pid); + if (rmdir(cgroup_path)) + log_err("rmdiring cgroup %s .. 
%s", relative_path, cgroup_path); +} + /** * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD * @relative_path: The cgroup path, relative to the workdir, to join diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index 182e1ac36c95dd..3857304be87410 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -19,6 +19,7 @@ int cgroup_setup_and_join(const char *relative_path); int get_root_cgroup(void); int create_and_get_cgroup(const char *relative_path); void remove_cgroup(const char *relative_path); +void remove_cgroup_pid(const char *relative_path, int pid); unsigned long long get_cgroup_id(const char *relative_path); int get_cgroup1_hierarchy_id(const char *subsys_name); diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 8916ab814a3ead..70b28c1e653ead 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -61,6 +61,7 @@ CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y CONFIG_NET_ACT_GACT=y +CONFIG_NET_ACT_MIRRED=y CONFIG_NET_ACT_SKBMOD=y CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64 index e1495a4bbc99ca..7efad36ceb26f2 100644 --- a/tools/testing/selftests/bpf/config.aarch64 +++ b/tools/testing/selftests/bpf/config.aarch64 @@ -31,10 +31,7 @@ CONFIG_COMPAT=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_INFO_REDUCED=n CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y @@ -46,7 +43,6 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEVTMPFS=y CONFIG_DRM=y -CONFIG_DUMMY=y CONFIG_EXPERT=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -70,13 +66,11 @@ CONFIG_HZ_100=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y -CONFIG_INET_ESP=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -97,22 +91,18 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NET=y -CONFIG_NF_TABLES=y CONFIG_NLMON=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_OVERLAY_FS=y CONFIG_PACKET_DIAG=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y @@ -149,7 +139,6 @@ CONFIG_TASK_XACCT=y CONFIG_TCG_TIS=y CONFIG_TCG_TPM=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS=y @@ -161,6 +150,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.ppc64el b/tools/testing/selftests/bpf/config.ppc64el index 9acf389dc4ce67..b53afb5e0b71e2 100644 --- a/tools/testing/selftests/bpf/config.ppc64el +++ b/tools/testing/selftests/bpf/config.ppc64el @@ -54,7 +54,6 @@ CONFIG_NET=y CONFIG_NO_HZ_IDLE=y CONFIG_NONPORTABLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI_HOST_GENERIC=y diff --git 
a/tools/testing/selftests/bpf/config.riscv64 b/tools/testing/selftests/bpf/config.riscv64 index bb7043a80e1ac0..7bee24a79a7103 100644 --- a/tools/testing/selftests/bpf/config.riscv64 +++ b/tools/testing/selftests/bpf/config.riscv64 @@ -48,7 +48,6 @@ CONFIG_NET_VRF=y CONFIG_NONPORTABLE=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x index 26c3bc2ce11d5e..db61878148e425 100644 --- a/tools/testing/selftests/bpf/config.s390x +++ b/tools/testing/selftests/bpf/config.s390x @@ -22,10 +22,7 @@ CONFIG_CHECKPOINT_RESTORE=y CONFIG_CPUSETS=y CONFIG_CRASH_DUMP=y CONFIG_CRYPTO_USER_API_RNG=y -CONFIG_CRYPTO_USER_API_SKCIPHER=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_NOTIFIERS=y @@ -56,11 +53,9 @@ CONFIG_IDLE_PAGE_TRACKING=y CONFIG_IKHEADERS=y CONFIG_INET6_ESP=y CONFIG_INET=y -CONFIG_INET_ESP=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPVLAN=y CONFIG_JUMP_LABEL=y @@ -83,18 +78,14 @@ CONFIG_MEMORY_HOTREMOVE=y CONFIG_NAMESPACES=y CONFIG_NET=y CONFIG_NET_ACT_BPF=y -CONFIG_NET_ACT_GACT=y CONFIG_NET_KEY=y -CONFIG_NET_SCH_FQ=y CONFIG_NET_VRF=y CONFIG_NETDEVICES=y CONFIG_NETFILTER_XT_MATCH_BPF=y CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NF_TABLES=y CONFIG_NO_HZ_IDLE=y CONFIG_NR_CPUS=256 CONFIG_NUMA=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -119,7 +110,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_TASK_XACCT=y CONFIG_TASKSTATS=y CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_DCTCP=y CONFIG_TLS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y @@ -131,6 +121,5 @@ CONFIG_UPROBES=y CONFIG_USER_NS=y CONFIG_VETH=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64 index 5e713ef7caa307..42ad817b00aea5 100644 --- a/tools/testing/selftests/bpf/config.x86_64 +++ b/tools/testing/selftests/bpf/config.x86_64 @@ -44,7 +44,6 @@ CONFIG_CRYPTO_SEQIV=y CONFIG_CRYPTO_XXHASH=y CONFIG_DCB=y CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_INFO_BTF=y CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEFAULT_FQ_CODEL=y @@ -104,12 +103,10 @@ CONFIG_HZ_1000=y CONFIG_INET=y CONFIG_INPUT_EVDEV=y CONFIG_INTEL_POWERCLAMP=y -CONFIG_IP6_NF_IPTABLES=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MROUTE=y CONFIG_IP_MULTICAST=y CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_IPTABLES=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_IP_ROUTE_MULTIPATH=y @@ -162,7 +159,6 @@ CONFIG_NUMA=y CONFIG_NUMA_BALANCING=y CONFIG_NVMEM=y CONFIG_OSF_PARTITION=y -CONFIG_PACKET=y CONFIG_PANIC_ON_OOPS=y CONFIG_PARTITION_ADVANCED=y CONFIG_PCI=y @@ -220,7 +216,6 @@ CONFIG_VALIDATE_FS_PARSER=y CONFIG_VETH=y CONFIG_VIRT_DRIVERS=y CONFIG_VLAN_8021Q=y -CONFIG_VSOCKETS=y CONFIG_VSOCKETS_LOOPBACK=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_X86_CPUID=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 72b5c174ab3bb8..cdf7b664144420 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -457,7 +457,7 @@ int append_tid(char *str, size_t sz) if (end + 8 > sz) return -1; - sprintf(&str[end], "%07d", gettid()); + sprintf(&str[end], "%07ld", 
sys_gettid()); str[end + 7] = '\0'; return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 1d53a8561ee2fc..24c509ce4e5b2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -42,11 +42,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "2"}, - {1, "R3_w", "4"}, - {2, "R3_w", "8"}, - {3, "R3_w", "16"}, - {4, "R3_w", "32"}, + {0, "R3", "2"}, + {1, "R3", "4"}, + {2, "R3", "8"}, + {3, "R3", "16"}, + {4, "R3", "32"}, }, }, { @@ -70,17 +70,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "1"}, - {1, "R3_w", "2"}, - {2, "R3_w", "4"}, - {3, "R3_w", "8"}, - {4, "R3_w", "16"}, - {5, "R3_w", "1"}, - {6, "R4_w", "32"}, - {7, "R4_w", "16"}, - {8, "R4_w", "8"}, - {9, "R4_w", "4"}, - {10, "R4_w", "2"}, + {0, "R3", "1"}, + {1, "R3", "2"}, + {2, "R3", "4"}, + {3, "R3", "8"}, + {4, "R3", "16"}, + {5, "R3", "1"}, + {6, "R4", "32"}, + {7, "R4", "16"}, + {8, "R4", "8"}, + {9, "R4", "4"}, + {10, "R4", "2"}, }, }, { @@ -99,12 +99,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "4"}, - {1, "R3_w", "8"}, - {2, "R3_w", "10"}, - {3, "R4_w", "8"}, - {4, "R4_w", "12"}, - {5, "R4_w", "14"}, + {0, "R3", "4"}, + {1, "R3", "8"}, + {2, "R3", "10"}, + {3, "R4", "8"}, + {4, "R4", "12"}, + {5, "R4", "14"}, }, }, { @@ -121,10 +121,10 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "7"}, - {1, "R3_w", "7"}, - {2, "R3_w", "14"}, - {3, "R3_w", "56"}, + {0, "R3", "7"}, + {1, "R3", "7"}, + {2, "R3", "14"}, + {3, "R3", "56"}, }, }, @@ -162,19 +162,19 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8)"}, - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R3_w", "var_off=(0x0; 0x1fe)"}, - {8, "R3_w", "var_off=(0x0; 0x3fc)"}, - {9, "R3_w", "var_off=(0x0; 0x7f8)"}, - {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", "pkt_end()"}, - {17, "R4_w", "var_off=(0x0; 0xff)"}, - {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, - {19, "R4_w", "var_off=(0x0; 0xff0)"}, - {20, "R4_w", "var_off=(0x0; 0x7f8)"}, - {21, "R4_w", "var_off=(0x0; 0x3fc)"}, - {22, "R4_w", "var_off=(0x0; 0x1fe)"}, + {6, "R0", "pkt(off=8,r=8)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R3", "var_off=(0x0; 0x1fe)"}, + {8, "R3", "var_off=(0x0; 0x3fc)"}, + {9, "R3", "var_off=(0x0; 0x7f8)"}, + {10, "R3", "var_off=(0x0; 0xff0)"}, + {12, "R3", "pkt_end()"}, + {17, "R4", "var_off=(0x0; 0xff)"}, + {18, "R4", "var_off=(0x0; 0x1fe0)"}, + {19, "R4", "var_off=(0x0; 0xff0)"}, + {20, "R4", "var_off=(0x0; 0x7f8)"}, + {21, "R4", "var_off=(0x0; 0x3fc)"}, + {22, "R4", "var_off=(0x0; 0x1fe)"}, }, }, { @@ -195,16 +195,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R4_w", "var_off=(0x0; 0xff)"}, - {8, "R4_w", "var_off=(0x0; 0xff)"}, - {9, "R4_w", "var_off=(0x0; 0xff)"}, - {10, "R4_w", "var_off=(0x0; 0x1fe)"}, - {11, "R4_w", "var_off=(0x0; 0xff)"}, - {12, "R4_w", "var_off=(0x0; 0x3fc)"}, - {13, "R4_w", "var_off=(0x0; 0xff)"}, - {14, "R4_w", "var_off=(0x0; 0x7f8)"}, - {15, "R4_w", "var_off=(0x0; 0xff0)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R4", "var_off=(0x0; 0xff)"}, + {8, "R4", "var_off=(0x0; 0xff)"}, + {9, "R4", "var_off=(0x0; 0xff)"}, + {10, 
"R4", "var_off=(0x0; 0x1fe)"}, + {11, "R4", "var_off=(0x0; 0xff)"}, + {12, "R4", "var_off=(0x0; 0x3fc)"}, + {13, "R4", "var_off=(0x0; 0xff)"}, + {14, "R4", "var_off=(0x0; 0x7f8)"}, + {15, "R4", "var_off=(0x0; 0xff0)"}, }, }, { @@ -235,14 +235,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(r=0)"}, - {4, "R5_w", "pkt(off=14,r=0)"}, - {5, "R4_w", "pkt(off=14,r=0)"}, + {2, "R5", "pkt(r=0)"}, + {4, "R5", "pkt(off=14,r=0)"}, + {5, "R4", "pkt(off=14,r=0)"}, {9, "R2", "pkt(r=18)"}, {10, "R5", "pkt(off=14,r=18)"}, - {10, "R4_w", "var_off=(0x0; 0xff)"}, - {13, "R4_w", "var_off=(0x0; 0xffff)"}, - {14, "R4_w", "var_off=(0x0; 0xffff)"}, + {10, "R4", "var_off=(0x0; 0xff)"}, + {13, "R4", "var_off=(0x0; 0xffff)"}, + {14, "R4", "var_off=(0x0; 0xffff)"}, }, }, { @@ -299,12 +299,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ - {11, "R5_w", "pkt(id=1,off=14,"}, + {11, "R5", "pkt(id=1,off=14,"}, /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable @@ -320,12 +320,12 @@ static struct bpf_align_test tests[] = { * instruction to validate R5 state. We also check * that R4 is what it should be in such case. */ - {18, "R4_w", "var_off=(0x0; 0x3fc)"}, - {18, "R5_w", "var_off=(0x0; 0x3fc)"}, + {18, "R4", "var_off=(0x0; 0x3fc)"}, + {18, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant offset is added to R5, resulting in * reg->off of 14. */ - {19, "R5_w", "pkt(id=2,off=14,"}, + {19, "R5", "pkt(id=2,off=14,"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -337,21 +337,21 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8)"}, + {26, "R5", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ - {28, "R4_w", "var_off=(0x0; 0x3fc)"}, - {28, "R5_w", "var_off=(0x0; 0x3fc)"}, + {28, "R4", "var_off=(0x0; 0x3fc)"}, + {28, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {29, "R5_w", "pkt(id=3,off=18,"}, + {29, "R5", "pkt(id=3,off=18,"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {31, "R4_w", "var_off=(0x0; 0x7fc)"}, - {31, "R5_w", "var_off=(0x0; 0x7fc)"}, + {31, "R4", "var_off=(0x0; 0x7fc)"}, + {31, "R5", "var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -397,12 +397,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. 
*/ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {8, "R6_w", "var_off=(0x2; 0x7fc)"}, + {8, "R6", "var_off=(0x2; 0x7fc)"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w", "var_off=(0x2; 0x7fc)"}, + {11, "R5", "var_off=(0x2; 0x7fc)"}, {12, "R4", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -414,11 +414,11 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {17, "R6_w", "var_off=(0x0; 0x3fc)"}, + {17, "R6", "var_off=(0x0; 0x3fc)"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w", "var_off=(0x2; 0xffc)"}, + {19, "R5", "var_off=(0x2; 0xffc)"}, {20, "R4", "var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -459,18 +459,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end()"}, + {3, "R5", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {6, "R5_w", "var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, - {12, "R4_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -478,7 +478,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {15, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { @@ -513,12 +513,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {8, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {8, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w", "var_off=(0x2; 0x7fc)"}, + {9, "R6", "var_off=(0x2; 0x7fc)"}, /* New unknown value in R7 is (4n) */ - {10, "R7_w", "var_off=(0x0; 0x3fc)"}, + {10, "R7", "var_off=(0x0; 0x3fc)"}, /* Subtracting it from R6 blows our unsigned bounds */ {11, "R6", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ @@ -566,16 +566,16 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. 
*/ - {6, "R2_w", "pkt(r=8)"}, - {9, "R6_w", "var_off=(0x0; 0x3c)"}, + {6, "R2", "pkt(r=8)"}, + {9, "R6", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w", "var_off=(0x2; 0x7c)"}, + {10, "R6", "var_off=(0x2; 0x7c)"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w", "var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5", "var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {14, "R7_w", "var_off=(0x0; 0x7fc)"}, + {14, "R7", "var_off=(0x0; 0x7fc)"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w", "var_off=(0x2; 0x7fc)"}, + {16, "R5", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index 0223fce4db2bc2..693fd86fbde622 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -40,8 +40,13 @@ static void *spin_lock_thread(void *arg) err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run err"); + + if (topts.retval == -EOPNOTSUPP) + goto end; + ASSERT_EQ((int)topts.retval, 0, "test_run retval"); +end: pthread_exit(arg); } @@ -63,6 +68,7 @@ static void test_arena_spin_lock_size(int size) skel = arena_spin_lock__open_and_load(); if (!ASSERT_OK_PTR(skel, "arena_spin_lock__open_and_load")) return; + if (skel->data->test_skip == 2) { test__skip(); goto end; @@ -86,6 +92,13 @@ static void test_arena_spin_lock_size(int size) goto end_barrier; } + if (skel->data->test_skip == 3) { + printf("%s:SKIP: CONFIG_NR_CPUS exceed the maximum supported by arena spinlock\n", + __func__); + test__skip(); + goto end_barrier; + } + ASSERT_EQ(skel->bss->counter, repeat * nthreads, "check counter value"); end_barrier: diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c index 13e101f370a1d3..92b5f378bfb826 100644 --- a/tools/testing/selftests/bpf/prog_tests/atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -165,11 +165,17 @@ static void test_xchg(struct atomics_lskel *skel) void test_atomics(void) { struct atomics_lskel *skel; + int err; - skel = atomics_lskel__open_and_load(); - if (!ASSERT_OK_PTR(skel, "atomics skeleton load")) + skel = atomics_lskel__open(); + if (!ASSERT_OK_PTR(skel, "atomics skeleton open")) return; + skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = atomics_lskel__load(skel); + if (!ASSERT_OK(err, "atomics skeleton load")) + goto cleanup; + if (skel->data->skip_tests) { printf("%s:SKIP:no ENABLE_ATOMICS_TESTS (missing Clang BPF atomics support)", __func__); diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index cabc51c2ca6bd5..9e77e5da7097c3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -3,6 +3,7 @@ #include "test_attach_kprobe_sleepable.skel.h" #include "test_attach_probe_manual.skel.h" #include "test_attach_probe.skel.h" +#include "kprobe_write_ctx.skel.h" /* this is how USDT semaphore is actually defined, except volatile modifier */ volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); @@ -201,6 +202,31 @@ static void test_attach_kprobe_long_event_name(void) 
test_attach_probe_manual__destroy(skel); } +#ifdef __x86_64__ +/* check that a kprobe program that writes to its ctx fails to attach */ +static void test_attach_kprobe_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_kprobe_write_ctx(void) +{ + test__skip(); +} +#endif + static void test_attach_probe_auto(struct test_attach_probe *skel) { struct bpf_link *uprobe_err_link; @@ -406,6 +432,8 @@ void test_attach_probe(void) test_attach_uprobe_long_event_name(); if (test__start_subtest("kprobe-long_name")) test_attach_kprobe_long_event_name(); + if (test__start_subtest("kprobe-write-ctx")) + test_attach_kprobe_write_ctx(); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 4a0670c056bad6..75f4dff7d04220 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -450,8 +450,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) attr.size = sizeof(attr); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CPU_CLOCK; - attr.freq = 1; - attr.sample_freq = 10000; + attr.sample_period = 100000; pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); if (!ASSERT_GE(pfd, 0, "perf_fd")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 82903585c8700c..10cba526d3e631 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -63,7 +63,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) /* tests with t->known_ptr_sz have no "long" or "unsigned long" type, * so it's impossible to determine correct pointer size; but if they - * do, it should be 8 regardless of host architecture, becaues BPF + * do, it should be 8 regardless of host architecture, because BPF * target is always 64-bit */ if (!t->known_ptr_sz) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c index e0dd966e4a3ef3..5ad904e9d15df9 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c @@ -44,7 +44,7 @@ static void test_read_cgroup_xattr(void) if (!ASSERT_OK_PTR(skel, "read_cgroupfs_xattr__open_and_load")) goto out; - skel->bss->target_pid = gettid(); + skel->bss->target_pid = sys_gettid(); if (!ASSERT_OK(read_cgroupfs_xattr__attach(skel), "read_cgroupfs_xattr__attach")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c index adda85f970589e..4b42fbc96efc9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_kfunc.c @@ -4,6 +4,8 @@ #define _GNU_SOURCE #include #include +#include +#include #include "cgrp_kfunc_failure.skel.h" #include "cgrp_kfunc_success.skel.h" @@ -87,6 +89,72 @@ static const char * const success_tests[] = { "test_cgrp_from_id", }; +static void test_cgrp_from_id_ns(void) 
+{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct cgrp_kfunc_success *skel; + struct bpf_program *prog; + int pid, pipe_fd[2]; + + skel = open_load_cgrp_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_mkdir_err")) + goto cleanup; + + prog = skel->progs.test_cgrp_from_id_ns; + + if (!ASSERT_OK(pipe(pipe_fd), "pipe")) + goto cleanup; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork result")) { + close(pipe_fd[0]); + close(pipe_fd[1]); + goto cleanup; + } + + if (pid == 0) { + int ret = 0; + + close(pipe_fd[0]); + + if (!ASSERT_GE(cgroup_setup_and_join("cgrp_from_id_ns"), 0, "join cgroup")) + exit(1); + + if (!ASSERT_OK(unshare(CLONE_NEWCGROUP), "unshare cgns")) + exit(1); + + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(ret, "test run ret")) + exit(1); + + if (!ASSERT_OK(opts.retval, "test run retval")) + exit(1); + + if (!ASSERT_EQ(write(pipe_fd[1], &ret, sizeof(ret)), sizeof(ret), "write pipe")) + exit(1); + + exit(0); + } else { + int res; + + close(pipe_fd[1]); + + ASSERT_EQ(read(pipe_fd[0], &res, sizeof(res)), sizeof(res), "read res"); + ASSERT_EQ(waitpid(pid, NULL, 0), pid, "wait on child"); + + remove_cgroup_pid("cgrp_from_id_ns", pid); + + ASSERT_OK(res, "result from run"); + } + + close(pipe_fd[0]); +cleanup: + cgrp_kfunc_success__destroy(skel); +} + void test_cgrp_kfunc(void) { int i, err; @@ -102,6 +170,9 @@ void test_cgrp_kfunc(void) run_success_test(success_tests[i]); } + if (test__start_subtest("test_cgrp_from_id_ns")) + test_cgrp_from_id_ns(); + RUN_TESTS(cgrp_kfunc_failure); cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index 9b2d9ceda21023..b9f86cb91e81b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -32,6 +32,8 @@ static struct { {"test_ringbuf", SETUP_SYSCALL_SLEEP}, {"test_skb_readonly", SETUP_SKB_PROG}, {"test_dynptr_skb_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_data", SETUP_SKB_PROG}, + {"test_dynptr_skb_meta_flags", SETUP_SKB_PROG}, {"test_adjust", SETUP_SYSCALL_SLEEP}, {"test_adjust_err", SETUP_SYSCALL_SLEEP}, {"test_zero_size_dynptr", SETUP_SYSCALL_SLEEP}, diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index 241b2c8c6e0f15..c534b4d5f9da80 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -293,7 +293,7 @@ static int get_btf_id_by_fd(int btf_fd, __u32 *id) * 1) Create a new btf, it's referenced only by a file descriptor, so refcnt=1 * 2) Load a BPF prog with fd_array[0] = btf_fd; now btf's refcnt=2 * 3) Close the btf_fd, now refcnt=1 - * Wait and check that BTF stil exists. + * Wait and check that BTF still exists. 
*/ static void check_fd_array_cnt__referenced_btfs(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 130f5b82d2e601..5ef1804e44dfd5 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -12,13 +12,24 @@ void test_fentry_fexit(void) int err, prog_fd, i; LIBBPF_OPTS(bpf_test_run_opts, topts); - fentry_skel = fentry_test_lskel__open_and_load(); + fentry_skel = fentry_test_lskel__open(); if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) goto close_prog; - fexit_skel = fexit_test_lskel__open_and_load(); + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) + goto close_prog; + + fexit_skel = fexit_test_lskel__open(); if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) goto close_prog; + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) + goto close_prog; + err = fentry_test_lskel__attach(fentry_skel); if (!ASSERT_OK(err, "fentry_attach")) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index aee1bc77a17f7b..ec882328eb591d 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -43,8 +43,13 @@ static void fentry_test(void) struct fentry_test_lskel *fentry_skel = NULL; int err; - fentry_skel = fentry_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) + fentry_skel = fentry_test_lskel__open(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_open")) + goto cleanup; + + fentry_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fentry_test_lskel__load(fentry_skel); + if (!ASSERT_OK(err, "fentry_skel_load")) goto cleanup; err = fentry_test_common(fentry_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 1c13007e37dd2c..94eed753560c1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -43,8 +43,13 @@ static void fexit_test(void) struct fexit_test_lskel *fexit_skel = NULL; int err; - fexit_skel = fexit_test_lskel__open_and_load(); - if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) + fexit_skel = fexit_test_lskel__open(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_open")) + goto cleanup; + + fexit_skel->keyring_id = KEY_SPEC_SESSION_KEYRING; + err = fexit_test_lskel__load(fexit_skel); + if (!ASSERT_OK(err, "fexit_skel_load")) goto cleanup; err = fexit_test_common(fexit_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c index b7b77a6b29799c..0de8facca4c5bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/free_timer.c +++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c @@ -124,6 +124,10 @@ void test_free_timer(void) int err; skel = free_timer__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "open_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c index a133354ac9bc29..97b00c7efe9431 100644 --- a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c +++ 
b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c @@ -16,7 +16,7 @@ void test_kernel_flag(void) if (!ASSERT_OK_PTR(lsm_skel, "lsm_skel")) return; - lsm_skel->bss->monitored_tid = gettid(); + lsm_skel->bss->monitored_tid = sys_gettid(); ret = test_kernel_flag__attach(lsm_skel); if (!ASSERT_OK(ret, "test_kernel_flag__attach")) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index e19ef509ebf85e..6cfaa978bc9af2 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -7,6 +7,7 @@ #include "kprobe_multi_session.skel.h" #include "kprobe_multi_session_cookie.skel.h" #include "kprobe_multi_verifier.skel.h" +#include "kprobe_write_ctx.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -422,220 +423,6 @@ static void test_unique_match(void) kprobe_multi__destroy(skel); } -static size_t symbol_hash(long key, void *ctx __maybe_unused) -{ - return str_hash((const char *) key); -} - -static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) -{ - return strcmp((const char *) key1, (const char *) key2) == 0; -} - -static bool is_invalid_entry(char *buf, bool kernel) -{ - if (kernel && strchr(buf, '[')) - return true; - if (!kernel && !strchr(buf, '[')) - return true; - return false; -} - -static bool skip_entry(char *name) -{ - /* - * We attach to almost all kernel functions and some of them - * will cause 'suspicious RCU usage' when fprobe is attached - * to them. Filter out the current culprits - arch_cpu_idle - * default_idle and rcu_* functions. - */ - if (!strcmp(name, "arch_cpu_idle")) - return true; - if (!strcmp(name, "default_idle")) - return true; - if (!strncmp(name, "rcu_", 4)) - return true; - if (!strcmp(name, "bpf_dispatcher_xdp_func")) - return true; - if (!strncmp(name, "__ftrace_invalid_address__", - sizeof("__ftrace_invalid_address__") - 1)) - return true; - return false; -} - -/* Do comparision by ignoring '.llvm.' suffixes. */ -static int compare_name(const char *name1, const char *name2) -{ - const char *res1, *res2; - int len1, len2; - - res1 = strstr(name1, ".llvm."); - res2 = strstr(name2, ".llvm."); - len1 = res1 ? res1 - name1 : strlen(name1); - len2 = res2 ? res2 - name2 : strlen(name2); - - if (len1 == len2) - return strncmp(name1, name2, len1); - if (len1 < len2) - return strncmp(name1, name2, len1) <= 0 ? -1 : 1; - return strncmp(name1, name2, len2) >= 0 ? 1 : -1; -} - -static int load_kallsyms_compare(const void *p1, const void *p2) -{ - return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); -} - -static int search_kallsyms_compare(const void *p1, const struct ksym *p2) -{ - return compare_name(p1, p2->name); -} - -static int get_syms(char ***symsp, size_t *cntp, bool kernel) -{ - size_t cap = 0, cnt = 0; - char *name = NULL, *ksym_name, **syms = NULL; - struct hashmap *map; - struct ksyms *ksyms; - struct ksym *ks; - char buf[256]; - FILE *f; - int err = 0; - - ksyms = load_kallsyms_custom_local(load_kallsyms_compare); - if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local")) - return -EINVAL; - - /* - * The available_filter_functions contains many duplicates, - * but other than that all symbols are usable in kprobe multi - * interface. - * Filtering out duplicates by using hashmap__add, which won't - * add existing entry. 
- */ - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); - - if (!f) - return -EINVAL; - - map = hashmap__new(symbol_hash, symbol_equal, NULL); - if (IS_ERR(map)) { - err = libbpf_get_error(map); - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) - continue; - if (skip_entry(name)) - continue; - - ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); - if (!ks) { - err = -EINVAL; - goto error; - } - - ksym_name = ks->name; - err = hashmap__add(map, ksym_name, 0); - if (err == -EEXIST) { - err = 0; - continue; - } - if (err) - goto error; - - err = libbpf_ensure_mem((void **) &syms, &cap, - sizeof(*syms), cnt + 1); - if (err) - goto error; - - syms[cnt++] = ksym_name; - } - - *symsp = syms; - *cntp = cnt; - -error: - free(name); - fclose(f); - hashmap__free(map); - if (err) - free(syms); - return err; -} - -static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) -{ - unsigned long *addr, *addrs, *tmp_addrs; - int err = 0, max_cnt, inc_cnt; - char *name = NULL; - size_t cnt = 0; - char buf[256]; - FILE *f; - - if (access("/sys/kernel/tracing/trace", F_OK) == 0) - f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); - else - f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); - - if (!f) - return -ENOENT; - - /* In my local setup, the number of entries is 50k+ so Let us initially - * allocate space to hold 64k entries. If 64k is not enough, incrementally - * increase 1k each time. - */ - max_cnt = 65536; - inc_cnt = 1024; - addrs = malloc(max_cnt * sizeof(long)); - if (addrs == NULL) { - err = -ENOMEM; - goto error; - } - - while (fgets(buf, sizeof(buf), f)) { - if (is_invalid_entry(buf, kernel)) - continue; - - free(name); - if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) - continue; - if (skip_entry(name)) - continue; - - if (cnt == max_cnt) { - max_cnt += inc_cnt; - tmp_addrs = realloc(addrs, max_cnt); - if (!tmp_addrs) { - err = -ENOMEM; - goto error; - } - addrs = tmp_addrs; - } - - addrs[cnt++] = (unsigned long)addr; - } - - *addrsp = addrs; - *cntp = cnt; - -error: - free(name); - fclose(f); - if (err) - free(addrs); - return err; -} - static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts) { long attach_start_ns, attach_end_ns; @@ -670,7 +457,7 @@ static void test_kprobe_multi_bench_attach(bool kernel) char **syms = NULL; size_t cnt = 0; - if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms")) + if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms")) return; skel = kprobe_multi_empty__open_and_load(); @@ -696,13 +483,13 @@ static void test_kprobe_multi_bench_attach_addr(bool kernel) size_t cnt = 0; int err; - err = get_addrs(&addrs, &cnt, kernel); + err = bpf_get_addrs(&addrs, &cnt, kernel); if (err == -ENOENT) { test__skip(); return; } - if (!ASSERT_OK(err, "get_addrs")) + if (!ASSERT_OK(err, "bpf_get_addrs")) return; skel = kprobe_multi_empty__open_and_load(); @@ -753,6 +540,30 @@ static void test_attach_override(void) kprobe_multi_override__destroy(skel); } +#ifdef __x86_64__ +static void test_attach_write_ctx(void) +{ + struct kprobe_write_ctx *skel = NULL; + struct bpf_link *link = NULL; + + skel = kprobe_write_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, 
"kprobe_write_ctx__open_and_load")) + return; + + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_multi_write_ctx, + "bpf_fentry_test1", NULL); + if (!ASSERT_ERR_PTR(link, "bpf_program__attach_kprobe_opts")) + bpf_link__destroy(link); + + kprobe_write_ctx__destroy(skel); +} +#else +static void test_attach_write_ctx(void) +{ + test__skip(); +} +#endif + void serial_test_kprobe_multi_bench_attach(void) { if (test__start_subtest("kernel")) @@ -792,5 +603,7 @@ void test_kprobe_multi_test(void) test_session_cookie_skel_api(); if (test__start_subtest("unique_match")) test_unique_match(); + if (test__start_subtest("attach_write_ctx")) + test_attach_write_ctx(); RUN_TESTS(kprobe_multi_verifier); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c new file mode 100644 index 00000000000000..6bdc6d6de0daf2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. */ +#define _GNU_SOURCE +#include +#include +#include +#include + +#include "map_excl.skel.h" + +static void test_map_excl_allowed(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__set_exclusive_program")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, true); + bpf_program__set_autoload(skel->progs.should_not_have_access, false); + + err = map_excl__load(skel); + ASSERT_OK(err, "map_excl__load"); +out: + map_excl__destroy(skel); +} + +static void test_map_excl_denied(void) +{ + struct map_excl *skel = map_excl__open(); + int err; + + err = bpf_map__set_exclusive_program(skel->maps.excl_map, skel->progs.should_have_access); + if (!ASSERT_OK(err, "bpf_map__make_exclusive")) + goto out; + + bpf_program__set_autoload(skel->progs.should_have_access, false); + bpf_program__set_autoload(skel->progs.should_not_have_access, true); + + err = map_excl__load(skel); + ASSERT_EQ(err, -EACCES, "exclusive map access not denied\n"); +out: + map_excl__destroy(skel); + +} + +void test_map_excl(void) +{ + if (test__start_subtest("map_excl_allowed")) + test_map_excl_allowed(); + if (test__start_subtest("map_excl_denied")) + test_map_excl_denied(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 6d391d95f96e00..70fa7ae93173b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -90,7 +90,7 @@ void test_module_attach(void) test_module_attach__detach(skel); - /* attach fentry/fexit and make sure it get's module reference */ + /* attach fentry/fexit and make sure it gets module reference */ link = bpf_program__attach(skel->progs.handle_fentry); if (!ASSERT_OK_PTR(link, "attach_fentry")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c new file mode 100644 index 00000000000000..9ae49b587f3e4c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/pinning_devmap_reuse.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + + +#include "test_pinning_devmap.skel.h" + +void test_pinning_devmap_reuse(void) +{ + const char *pinpath1 = "/sys/fs/bpf/pinmap1"; + const char *pinpath2 = "/sys/fs/bpf/pinmap2"; + struct 
test_pinning_devmap *skel1 = NULL, *skel2 = NULL; + int err; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + + /* load the object a first time */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "skel_load1")) + goto out; + + /* load the object a second time, re-using the pinned map */ + skel2 = test_pinning_devmap__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "skel_load2")) + goto out; + + /* we can close the reference safely without + * the map's refcount falling to 0 + */ + test_pinning_devmap__destroy(skel1); + skel1 = NULL; + + /* now, swap the pins */ + err = renameat2(0, pinpath1, 0, pinpath2, RENAME_EXCHANGE); + if (!ASSERT_OK(err, "swap pins")) + goto out; + + /* load the object again, this time the re-use should fail */ + skel1 = test_pinning_devmap__open_and_load(); + if (!ASSERT_ERR_PTR(skel1, "skel_load3")) + goto out; + +out: + unlink(pinpath1); + unlink(pinpath2); + test_pinning_devmap__destroy(skel1); + test_pinning_devmap__destroy(skel2); +} diff --git a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c index 14f2796076e0c2..7607cfc2408c2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_tests_framework.c @@ -54,3 +54,128 @@ void test_prog_tests_framework(void) return; clear_test_state(state); } + +static void dummy_emit(const char *buf, bool force) {} + +void test_prog_tests_framework_expected_msgs(void) +{ + struct expected_msgs msgs; + int i, j, error_cnt; + const struct { + const char *name; + const char *log; + const char *expected; + struct expect_msg *pats; + } cases[] = { + { + .name = "simple-ok", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "simple-fail", + .log = "aaabbbddd", + .expected = "MATCHED SUBSTR: 'aaa'\n" + "EXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-mid", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + { .substr = "bar", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-ok-tail", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "foo", .negative = true }, + {} + } + }, + { + .name = "negative-ok-head", + .log = "aaabbbccc", + .pats = (struct expect_msg[]) { + { .substr = "foo", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-head", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'aaa'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa", .negative = true }, + { .substr = "bbb" }, + {} + } + }, + { + .name = "negative-fail-tail", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'ccc'\n", + .pats = (struct expect_msg[]) { + { .substr = "bbb" }, + { .substr = "ccc", .negative = true }, + {} + } + }, + { + .name = "negative-fail-mid-1", + .log = "aaabbbccc", + .expected = "UNEXPECTED SUBSTR: 'bbb'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "bbb", .negative = true }, + { .substr = "ccc" }, + {} + } + }, + { + .name = "negative-fail-mid-2", + .log = "aaabbb222ccc", + .expected = "UNEXPECTED SUBSTR: '222'\n", + .pats = (struct expect_msg[]) { + { .substr = "aaa" }, + { .substr = "222", .negative = true }, + { .substr = "bbb", .negative = true }, + { .substr 
= "ccc" }, + {} + } + } + }; + + for (i = 0; i < ARRAY_SIZE(cases); i++) { + if (test__start_subtest(cases[i].name)) { + error_cnt = env.subtest_state->error_cnt; + msgs.patterns = cases[i].pats; + msgs.cnt = 0; + for (j = 0; cases[i].pats[j].substr; j++) + msgs.cnt++; + validate_msgs(cases[i].log, &msgs, dummy_emit); + fflush(stderr); + env.subtest_state->error_cnt = error_cnt; + if (cases[i].expected) + ASSERT_HAS_SUBSTR(env.subtest_state->log_buf, cases[i].expected, "expected output"); + else + ASSERT_STREQ(env.subtest_state->log_buf, "", "expected no output"); + test__end_subtest(); + } + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index e261b0e872dbba..d93a0c7b1786f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -623,7 +623,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a, x.b); *newy = range(t, y.a + 1, y.b); } else if (x.a == x.b && x.b == y.b) { - /* X is a constant matching rigth side of Y */ + /* X is a constant matching right side of Y */ *newx = range(t, x.a, x.b); *newy = range(t, y.a, y.b - 1); } else if (y.a == y.b && x.a == y.a) { @@ -631,7 +631,7 @@ static void range_cond(enum num_t t, struct range x, struct range y, *newx = range(t, x.a + 1, x.b); *newy = range(t, y.a, y.b); } else if (y.a == y.b && x.b == y.b) { - /* Y is a constant matching rigth side of X */ + /* Y is a constant matching right side of X */ *newx = range(t, x.a, x.b - 1); *newy = range(t, y.a, y.b); } else { diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 0703e987df8997..8c6c2043a43275 100644 --- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -99,3 +99,19 @@ void test_res_spin_lock_success(void) res_spin_lock__destroy(skel); return; } + +void serial_test_res_spin_lock_stress(void) +{ + if (libbpf_num_possible_cpus() < 3) { + test__skip(); + return; + } + + ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); + + ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA"); + sleep(5); + unload_module("bpf_test_rqspinlock", false); +} diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index e3ea5dc2f697c4..254fbfeab06a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " - "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2) " + "R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" + " 
R0=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" " R0=map_value(id=2,ks=4,vs=8)" - " R1_w=map_value(id=2,ks=4,vs=8)\n" + " R1=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c index b7ba5cd47d96fa..271b5cc9fc0153 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c @@ -39,7 +39,7 @@ void test_stacktrace_build_id(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c index 0832fd7874575c..b277dddd5af7ff 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c @@ -66,7 +66,7 @@ void test_stacktrace_build_id_nmi(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c index df59e4ae295100..c23b97414813ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c @@ -1,46 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "stacktrace_map.skel.h" void test_stacktrace_map(void) { + struct stacktrace_map *skel; int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; - const char *prog_name = "oncpu"; - int err, prog_fd, stack_trace_len; - const char *file = "./test_stacktrace_map.bpf.o"; - __u32 key, val, duration = 0; - struct bpf_program *prog; - struct bpf_object *obj; - struct bpf_link *link; + int err, stack_trace_len; + __u32 key, val, stack_id, duration = 0; + __u64 stack[PERF_MAX_STACK_DEPTH]; - err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); - if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) + skel = stacktrace_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) return; - prog = bpf_object__find_program_by_name(obj, prog_name); - if (CHECK(!prog, "find_prog", "prog '%s' not found\n", prog_name)) - goto close_prog; - - link = bpf_program__attach_tracepoint(prog, "sched", "sched_switch"); - if (!ASSERT_OK_PTR(link, "attach_tp")) - goto close_prog; - - /* find map fds */ - control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK_FAIL(control_map_fd < 0)) - goto disable_pmu; - - stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK_FAIL(stackid_hmap_fd < 0)) - goto disable_pmu; - - stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK_FAIL(stackmap_fd < 0)) - goto disable_pmu; - - stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); - if (CHECK_FAIL(stack_amap_fd < 0)) - goto disable_pmu; + control_map_fd = bpf_map__fd(skel->maps.control_map); + stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap); + stackmap_fd = bpf_map__fd(skel->maps.stackmap); + stack_amap_fd = bpf_map__fd(skel->maps.stack_amap); + err = stacktrace_map__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; /* give some time for bpf program run */ sleep(1); @@ -50,26 +31,32 @@ void test_stacktrace_map(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto out; stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64); err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len); if (CHECK(err, "compare_stack_ips stackmap vs. 
stack_amap", "err %d errno %d\n", err, errno)) - goto disable_pmu; - -disable_pmu: - bpf_link__destroy(link); -close_prog: - bpf_object__close(obj); + goto out; + + stack_id = skel->bss->stack_id; + err = bpf_map_lookup_and_delete_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_OK(err, "lookup and delete target stack_id")) + goto out; + + err = bpf_map_lookup_elem(stackmap_fd, &stack_id, stack); + if (!ASSERT_EQ(err, -ENOENT, "lookup deleted stack_id")) + goto out; +out: + stacktrace_map__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c index c6ef06f55cdb46..e985d51d3d4789 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c @@ -5,7 +5,7 @@ void test_stacktrace_map_raw_tp(void) { const char *prog_name = "oncpu"; int control_map_fd, stackid_hmap_fd, stackmap_fd; - const char *file = "./test_stacktrace_map.bpf.o"; + const char *file = "./stacktrace_map.bpf.o"; __u32 key, val, duration = 0; int err, prog_fd; struct bpf_program *prog; @@ -46,7 +46,7 @@ void test_stacktrace_map_raw_tp(void) bpf_map_update_elem(control_map_fd, &key, &val, 0); /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c index 1932b1e0685cfd..dc2ccf6a14d133 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_skip.c @@ -40,7 +40,7 @@ void test_stacktrace_map_skip(void) skel->bss->control = 1; /* for every element in stackid_hmap, we can find a corresponding one - * in stackmap, and vise versa. + * in stackmap, and vice versa. */ err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (!ASSERT_OK(err, "compare_map_keys stackid_hmap vs. stackmap")) diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c index d9f0185dca61b8..c3cce5c292bdda 100644 --- a/tools/testing/selftests/bpf/prog_tests/stream.c +++ b/tools/testing/selftests/bpf/prog_tests/stream.c @@ -2,7 +2,6 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ #include #include -#include #include "stream.skel.h" #include "stream_fail.skel.h" @@ -18,87 +17,6 @@ void test_stream_success(void) return; } -struct { - int prog_off; - const char *errstr; -} stream_error_arr[] = { - { - offsetof(struct stream, progs.stream_cond_break), - "ERROR: Timeout detected for may_goto instruction\n" - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, - { - offsetof(struct stream, progs.stream_deadlock), - "ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock\n" - "Attempted lock = (0x[0-9a-fA-F]+)\n" - "Total held locks = 1\n" - "Held lock\\[ 0\\] = \\1\n" // Lock address must match - "CPU: [0-9]+ UID: 0 PID: [0-9]+ Comm: .*\n" - "Call trace:\n" - "([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" - "|[ \t]+[^\n]+\n)*", - }, -}; - -static int match_regex(const char *pattern, const char *string) -{ - int err, rc; - regex_t re; - - err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE); - if (err) - return -1; - rc = regexec(&re, string, 0, NULL, 0); - regfree(&re); - return rc == 0 ? 1 : 0; -} - -void test_stream_errors(void) -{ - LIBBPF_OPTS(bpf_test_run_opts, opts); - LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); - struct stream *skel; - int ret, prog_fd; - char buf[1024]; - - skel = stream__open_and_load(); - if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) - return; - - for (int i = 0; i < ARRAY_SIZE(stream_error_arr); i++) { - struct bpf_program **prog; - - prog = (struct bpf_program **)(((char *)skel) + stream_error_arr[i].prog_off); - prog_fd = bpf_program__fd(*prog); - ret = bpf_prog_test_run_opts(prog_fd, &opts); - ASSERT_OK(ret, "ret"); - ASSERT_OK(opts.retval, "retval"); - -#if !defined(__x86_64__) - ASSERT_TRUE(1, "Timed may_goto unsupported, skip."); - if (i == 0) { - ret = bpf_prog_stream_read(prog_fd, 2, buf, sizeof(buf), &ropts); - ASSERT_EQ(ret, 0, "stream read"); - continue; - } -#endif - - ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); - ASSERT_GT(ret, 0, "stream read"); - ASSERT_LE(ret, 1023, "len for buf"); - buf[ret] = '\0'; - - ret = match_regex(stream_error_arr[i].errstr, buf); - if (!ASSERT_TRUE(ret == 1, "regex match")) - fprintf(stderr, "Output from stream:\n%s\n", buf); - } - - stream__destroy(skel); -} - void test_stream_syscall(void) { LIBBPF_OPTS(bpf_test_run_opts, opts); @@ -139,3 +57,52 @@ void test_stream_syscall(void) stream__destroy(skel); } + +static void test_address(struct bpf_program *prog, unsigned long *fault_addr_p) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret, prog_fd; + char fault_addr[64]; + char buf[1024]; + + prog_fd = bpf_program__fd(prog); + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(ret, "ret"); + ASSERT_OK(opts.retval, "retval"); + + sprintf(fault_addr, "0x%lx", *fault_addr_p); + + ret = bpf_prog_stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf), &ropts); + ASSERT_GT(ret, 0, "stream read"); + ASSERT_LE(ret, 1023, "len for buf"); + buf[ret] = '\0'; + + if (!ASSERT_HAS_SUBSTR(buf, fault_addr, "fault_addr")) { + fprintf(stderr, "Output from stream:\n%s\n", buf); + fprintf(stderr, "Fault Addr: %s\n", fault_addr); + } +} + +void test_stream_arena_fault_address(void) +{ + struct stream *skel; + +#if !defined(__x86_64__) && !defined(__aarch64__) + printf("%s:SKIP: arena fault reporting not supported\n", __func__); + test__skip(); + return; +#endif + + skel = stream__open_and_load(); + 
if (!ASSERT_OK_PTR(skel, "stream__open_and_load")) + return; + + if (test__start_subtest("read_fault")) + test_address(skel->progs.stream_arena_read_fault, &skel->bss->fault_addr); + if (test__start_subtest("write_fault")) + test_address(skel->progs.stream_arena_write_fault, &skel->bss->fault_addr); + + stream__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 35af8044d0590a..4d66fad3c8bdb9 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -8,6 +8,7 @@ static const char * const test_cases[] = { "strcmp", + "strcasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h new file mode 100644 index 00000000000000..2de38776a2d499 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TASK_LOCAL_DATA_H +#define __TASK_LOCAL_DATA_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT +#include +#endif + +#include + +/* + * OPTIONS + * + * Define the option before including the header + * + * TLD_FREE_DATA_ON_THREAD_EXIT - Frees memory on thread exit automatically + * + * Thread-specific memory for storing TLDs is allocated lazily on the first call to + * tld_get_data(). The thread that calls it must also call tld_free() on thread exit + * to prevent a memory leak. Pthread will be included if the option is defined. A pthread + * key will be registered with a destructor that calls tld_free(). + * + * + * TLD_DYN_DATA_SIZE - The maximum size of memory allocated for TLDs created dynamically + * (default: 64 bytes) + * + * A TLD can be defined statically using TLD_DEFINE_KEY() or created on the fly using + * tld_create_key(). As the total size of TLDs created with tld_create_key() cannot + * possibly be known statically, a memory area of size TLD_DYN_DATA_SIZE will be allocated + * for these TLDs. This additional memory is allocated for every thread that calls + * tld_get_data() even if tld_create_key() is never actually called, so be mindful of + * potential memory wastage. Use TLD_DEFINE_KEY() whenever possible as just enough memory + * will be allocated for TLDs created with it. + * + * + * TLD_NAME_LEN - The maximum length of the name of a TLD (default: 62) + * + * Setting TLD_NAME_LEN will affect the maximum number of TLDs a process can store, + * TLD_MAX_DATA_CNT. + * + * + * TLD_DATA_USE_ALIGNED_ALLOC - Always use aligned_alloc() instead of malloc() + * + * When allocating the memory for storing TLDs, we need to make sure there is a memory + * region of X bytes within a page. This is due to the limit posed by UPTR: memory + * pinned to the kernel cannot exceed a page nor can it cross the page boundary. The + * library normally calls malloc(2*X) given X bytes of total TLDs, and only uses + * aligned_alloc(PAGE_SIZE, X) when X >= PAGE_SIZE / 2. This is to reduce memory wastage + * as not all memory allocators can use the exact amount of memory requested to fulfill + * aligned_alloc(). For example, some may round the size up to the alignment. Enable the + * option to always use aligned_alloc() if the implementation has low memory overhead.
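+ * + * For example, to enable automatic cleanup and a 4096-byte dynamic area (the + * combination used by test_task_local_data.c in this patch), define the options + * before the include: + * + * #define TLD_FREE_DATA_ON_THREAD_EXIT + * #define TLD_DYN_DATA_SIZE 4096 + * #include "task_local_data.h"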
+ */ + +#define TLD_PAGE_SIZE getpagesize() +#define TLD_PAGE_MASK (~(TLD_PAGE_SIZE - 1)) + +#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1)) +#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1) + +#define TLD_READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#ifndef TLD_DYN_DATA_SIZE +#define TLD_DYN_DATA_SIZE 64 +#endif + +#define TLD_MAX_DATA_CNT (TLD_PAGE_SIZE / sizeof(struct tld_metadata) - 1) + +#ifndef TLD_NAME_LEN +#define TLD_NAME_LEN 62 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + __s16 off; +} tld_key_t; + +struct tld_metadata { + char name[TLD_NAME_LEN]; + _Atomic __u16 size; +}; + +struct tld_meta_u { + _Atomic __u8 cnt; + __u16 size; + struct tld_metadata metadata[]; +}; + +struct tld_data_u { + __u64 start; /* offset of tld_data_u->data in a page */ + char data[]; +}; + +struct tld_map_value { + void *data; + struct tld_meta_u *meta; +}; + +struct tld_meta_u * _Atomic tld_meta_p __attribute__((weak)); +__thread struct tld_data_u *tld_data_p __attribute__((weak)); +__thread void *tld_data_alloc_p __attribute__((weak)); + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT +pthread_key_t tld_pthread_key __attribute__((weak)); + +static void tld_free(void); + +static void __tld_thread_exit_handler(void *unused) +{ + tld_free(); +} +#endif + +static int __tld_init_meta_p(void) +{ + struct tld_meta_u *meta, *uninit = NULL; + int err = 0; + + meta = (struct tld_meta_u *)aligned_alloc(TLD_PAGE_SIZE, TLD_PAGE_SIZE); + if (!meta) { + err = -ENOMEM; + goto out; + } + + memset(meta, 0, TLD_PAGE_SIZE); + meta->size = TLD_DYN_DATA_SIZE; + + if (!atomic_compare_exchange_strong(&tld_meta_p, &uninit, meta)) { + free(meta); + goto out; + } + +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT + pthread_key_create(&tld_pthread_key, __tld_thread_exit_handler); +#endif +out: + return err; +} + +static int __tld_init_data_p(int map_fd) +{ + bool use_aligned_alloc = false; + struct tld_map_value map_val; + struct tld_data_u *data; + void *data_alloc = NULL; + int err, tid_fd = -1; + + tid_fd = syscall(SYS_pidfd_open, sys_gettid(), O_EXCL); + if (tid_fd < 0) { + err = -errno; + goto out; + } + +#ifdef TLD_DATA_USE_ALIGNED_ALLOC + use_aligned_alloc = true; +#endif + + /* + * tld_meta_p->size = TLD_DYN_DATA_SIZE + + * total size of TLDs defined via TLD_DEFINE_KEY() + */ + data_alloc = (use_aligned_alloc || tld_meta_p->size * 2 >= TLD_PAGE_SIZE) ? + aligned_alloc(TLD_PAGE_SIZE, tld_meta_p->size) : + malloc(tld_meta_p->size * 2); + if (!data_alloc) { + err = -ENOMEM; + goto out; + } + + /* + * Always pass a page-aligned address to UPTR since the size of tld_map_value::data + * is a page in BTF. If data_alloc spans across two pages, use the page that contains large + * enough memory. 
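+ * + * Illustrative arithmetic, assuming 4KB pages: if malloc(2*X) returns an address + * at in-page offset 0xe00, only 0x200 bytes remain in that page. For X = 0x800, + * the region starting at the next page boundary is pinned instead; for X = 0x100, + * the containing page is pinned and data->start records where the data sits + * within that page.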
+ */ + if (TLD_PAGE_SIZE - (~TLD_PAGE_MASK & (intptr_t)data_alloc) >= tld_meta_p->size) { + map_val.data = (void *)(TLD_PAGE_MASK & (intptr_t)data_alloc); + data = data_alloc; + data->start = (~TLD_PAGE_MASK & (intptr_t)data_alloc) + + offsetof(struct tld_data_u, data); + } else { + map_val.data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE)); + data = (void *)(TLD_ROUND_UP((intptr_t)data_alloc, TLD_PAGE_SIZE)); + data->start = offsetof(struct tld_data_u, data); + } + map_val.meta = TLD_READ_ONCE(tld_meta_p); + + err = bpf_map_update_elem(map_fd, &tid_fd, &map_val, 0); + if (err) { + free(data_alloc); + goto out; + } + + tld_data_p = data; + tld_data_alloc_p = data_alloc; +#ifdef TLD_FREE_DATA_ON_THREAD_EXIT + pthread_setspecific(tld_pthread_key, (void *)1); +#endif +out: + if (tid_fd >= 0) + close(tid_fd); + return err; +} + +static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) +{ + int err, i, sz, off = 0; + __u8 cnt; + + if (!TLD_READ_ONCE(tld_meta_p)) { + err = __tld_init_meta_p(); + if (err) + return (tld_key_t){err}; + } + + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { +retry: + cnt = atomic_load(&tld_meta_p->cnt); + if (i < cnt) { + /* A metadata entry is not ready until its size is updated with a non-zero value */ + while (!(sz = atomic_load(&tld_meta_p->metadata[i].size))) + sched_yield(); + + if (!strncmp(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN)) + return (tld_key_t){-EEXIST}; + + off += TLD_ROUND_UP(sz, 8); + continue; + } + + /* + * TLD_DEFINE_KEY() is given memory up to a page while at most + * TLD_DYN_DATA_SIZE is allocated for tld_create_key(). + */ + if (dyn_data) { + if (off + TLD_ROUND_UP(size, 8) > tld_meta_p->size) + return (tld_key_t){-E2BIG}; + } else { + if (off + TLD_ROUND_UP(size, 8) > TLD_PAGE_SIZE - sizeof(struct tld_data_u)) + return (tld_key_t){-E2BIG}; + tld_meta_p->size += TLD_ROUND_UP(size, 8); + } + + /* + * Only one tld_create_key() can increase the current cnt by one and + * take the latest available slot. Other threads will check again if a new + * TLD can still be added, and then compete for the new slot after the + * succeeding thread updates the size. + */ + if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1)) + goto retry; + + strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN); + atomic_store(&tld_meta_p->metadata[i].size, size); + return (tld_key_t){(__s16)off}; + } + + return (tld_key_t){-ENOSPC}; +} + +/** + * TLD_DEFINE_KEY() - Define a TLD and a global variable key associated with the TLD. + * + * @name: The name of the TLD. Cannot exceed TLD_NAME_LEN + * @size: The size of the TLD + * @key: The variable name of the key + * + * The macro can only be used in file scope. + * + * A global variable key of opaque type, tld_key_t, will be declared and initialized before + * main() starts. Use tld_key_is_err() or tld_key_err_or_zero() later to check if the key + * creation succeeded. Pass the key to tld_get_data() to get a pointer to the TLD. + * bpf programs can also fetch the same key by name. + * + * The total size of TLDs created using TLD_DEFINE_KEY() cannot exceed a page. Just + * enough memory will be allocated for each thread on the first call to tld_get_data(). + */ +#define TLD_DEFINE_KEY(key, name, size) \ +tld_key_t key; \ + \ +__attribute__((constructor)) \ +void __tld_define_key_##key(void) \ +{ \ + key = __tld_create_key(name, size, false); \ +} + +/** + * tld_create_key() - Create a TLD and return a key associated with the TLD. 
+ * + * @name: The name of the TLD + * @size: The size of the TLD + * + * Return an opaque key object. Use tld_key_is_err() or tld_key_err_or_zero() to check + * if the key creation succeeded. Pass the key to tld_get_data() to get a pointer to + * the TLD. bpf programs can also fetch the same key by name. + * + * Use tld_create_key() only when a TLD needs to be created dynamically (e.g., @name is + * not known statically or a TLD needs to be created conditionally). + * + * An additional TLD_DYN_DATA_SIZE bytes are allocated per-thread to accommodate TLDs + * created dynamically with tld_create_key(). Since only a user page is pinned to the + * kernel, when TLDs created with TLD_DEFINE_KEY() use more than TLD_PAGE_SIZE - + * TLD_DYN_DATA_SIZE, the buffer size will be limited to the rest of the page. + */ +__attribute__((unused)) +static tld_key_t tld_create_key(const char *name, size_t size) +{ + return __tld_create_key(name, size, true); +} + +__attribute__((unused)) +static inline bool tld_key_is_err(tld_key_t key) +{ + return key.off < 0; +} + +__attribute__((unused)) +static inline int tld_key_err_or_zero(tld_key_t key) +{ + return tld_key_is_err(key) ? key.off : 0; +} + +/** + * tld_get_data() - Get a pointer to the TLD associated with the given key of the + * calling thread. + * + * @map_fd: A file descriptor of tld_data_map, the underlying BPF task local storage map + * of task local data. + * @key: A key object created by TLD_DEFINE_KEY() or tld_create_key(). + * + * Return a pointer to the TLD if the key is valid; NULL if there is not enough memory + * for this thread's TLD, or the key is invalid. The returned pointer is guaranteed to be 8-byte + * aligned. + * + * Threads that call tld_get_data() must call tld_free() on exit to prevent + * a memory leak if TLD_FREE_DATA_ON_THREAD_EXIT is not defined. + */ +__attribute__((unused)) +static void *tld_get_data(int map_fd, tld_key_t key) +{ + if (!TLD_READ_ONCE(tld_meta_p)) + return NULL; + + /* tld_data_p is allocated on the first invocation of tld_get_data() */ + if (!tld_data_p && __tld_init_data_p(map_fd)) + return NULL; + + return tld_data_p->data + key.off; +} + +/** + * tld_free() - Free task local data memory of the calling thread + * + * For the calling thread, all pointers to TLDs acquired before will become invalid. + * + * Users must call tld_free() on thread exit to prevent a memory leak. Alternatively, + * define TLD_FREE_DATA_ON_THREAD_EXIT and a thread exit handler will be registered + * to free the memory automatically. + */ +__attribute__((unused)) +static void tld_free(void) +{ + if (tld_data_alloc_p) { + free(tld_data_alloc_p); + tld_data_alloc_p = NULL; + tld_data_p = NULL; + } +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __TASK_LOCAL_DATA_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/task_work_stress.c b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c new file mode 100644 index 00000000000000..450d17d91a56fb --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_work_stress.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include +#include "task_work_stress.skel.h" +#include +#include +#include +#include +#include +#include + +struct test_data { + int prog_fd; + atomic_int exit; +}; + +void *runner(void *test_data) +{ + struct test_data *td = test_data; + int err = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + while (!err && !atomic_load(&td->exit)) + err = bpf_prog_test_run_opts(td->prog_fd, &opts); + + return NULL; +} + +static int get_env_int(const char *str, int def) +{ + const char *s = getenv(str); + char *end; + int retval; + + if (!s || !*s) + return def; + errno = 0; + retval = strtol(s, &end, 10); + if (errno || *end || retval < 0) + return def; + return retval; +} + +static void task_work_run(bool enable_delete) +{ + struct task_work_stress *skel; + struct bpf_program *scheduler, *deleter; + int nthreads = 16; + int test_time_s = get_env_int("BPF_TASK_WORK_TEST_TIME", 1); + pthread_t tid[nthreads], tid_del; + bool started[nthreads], started_del = false; + struct test_data td_sched = { .exit = 0 }, td_del = { .exit = 1 }; + int i, err; + + skel = task_work_stress__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + scheduler = bpf_object__find_program_by_name(skel->obj, "schedule_task_work"); + bpf_program__set_autoload(scheduler, true); + + deleter = bpf_object__find_program_by_name(skel->obj, "delete_task_work"); + bpf_program__set_autoload(deleter, true); + + err = task_work_stress__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + for (i = 0; i < nthreads; ++i) + started[i] = false; + + td_sched.prog_fd = bpf_program__fd(scheduler); + for (i = 0; i < nthreads; ++i) { + if (pthread_create(&tid[i], NULL, runner, &td_sched) != 0) { + fprintf(stderr, "could not start thread\n"); + goto cancel; + } + started[i] = true; + } + + if (enable_delete) + atomic_store(&td_del.exit, 0); + + td_del.prog_fd = bpf_program__fd(deleter); + if (pthread_create(&tid_del, NULL, runner, &td_del) != 0) { + fprintf(stderr, "could not start thread\n"); + goto cancel; + } + started_del = true; + + /* Run stress test for some time */ + sleep(test_time_s); + +cancel: + atomic_store(&td_sched.exit, 1); + atomic_store(&td_del.exit, 1); + for (i = 0; i < nthreads; ++i) { + if (started[i]) + pthread_join(tid[i], NULL); + } + + if (started_del) + pthread_join(tid_del, NULL); + + ASSERT_GT(skel->bss->callback_scheduled, 0, "work scheduled"); + /* Some scheduling attempts should have failed due to contention */ + ASSERT_GT(skel->bss->schedule_error, 0, "schedule error"); + + if (enable_delete) { + /* If the delete thread is enabled, it has cancelled some callbacks */ + ASSERT_GT(skel->bss->delete_success, 0, "delete success"); + ASSERT_LT(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } else { + /* Without the delete thread, the number of scheduled callbacks matches the number fired */ + ASSERT_EQ(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); + } + +cleanup: + task_work_stress__destroy(skel); +} + +void test_task_work_stress(void) +{ + if (test__start_subtest("no_delete")) + task_work_run(false); + if (test__start_subtest("with_delete")) + task_work_run(true); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c new file mode 100644 index 00000000000000..fd8762ba4b6732 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_id_ops_mapping.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include 
+#include "struct_ops_id_ops_mapping1.skel.h" +#include "struct_ops_id_ops_mapping2.skel.h" + +static void test_st_ops_id_ops_mapping(void) +{ + struct struct_ops_id_ops_mapping1 *skel1 = NULL; + struct struct_ops_id_ops_mapping2 *skel2 = NULL; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int err, pid, prog1_fd, prog2_fd; + + skel1 = struct_ops_id_ops_mapping1__open_and_load(); + if (!ASSERT_OK_PTR(skel1, "struct_ops_id_ops_mapping1__open")) + goto out; + + skel2 = struct_ops_id_ops_mapping2__open_and_load(); + if (!ASSERT_OK_PTR(skel2, "struct_ops_id_ops_mapping2__open")) + goto out; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel1->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel1->bss->st_ops_id = info.id; + + err = bpf_map_get_info_by_fd(bpf_map__fd(skel2->maps.st_ops_map), + &info, &len); + if (!ASSERT_OK(err, "bpf_map_get_info_by_fd")) + goto out; + + skel2->bss->st_ops_id = info.id; + + err = struct_ops_id_ops_mapping1__attach(skel1); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping1__attach")) + goto out; + + err = struct_ops_id_ops_mapping2__attach(skel2); + if (!ASSERT_OK(err, "struct_ops_id_ops_mapping2__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel1->bss->test_pid = pid; + skel2->bss->test_pid = pid; + sys_gettid(); + skel1->bss->test_pid = 0; + skel2->bss->test_pid = 0; + + /* run syscall_prog that calls .test_1 and checks return */ + prog1_fd = bpf_program__fd(skel1->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog1_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + prog2_fd = bpf_program__fd(skel2->progs.syscall_prog); + err = bpf_prog_test_run_opts(prog2_fd, NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel1->bss->test_err, 0, "skel1->bss->test_err"); + ASSERT_EQ(skel2->bss->test_err, 0, "skel2->bss->test_err"); + +out: + struct_ops_id_ops_mapping1__destroy(skel1); + struct_ops_id_ops_mapping2__destroy(skel2); +} + +void test_struct_ops_id_ops_mapping(void) +{ + if (test__start_subtest("st_ops_id_ops_mapping")) + test_st_ops_id_ops_mapping(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c new file mode 100644 index 00000000000000..9fd6306b455c3c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#define TLD_FREE_DATA_ON_THREAD_EXIT +#define TLD_DYN_DATA_SIZE 4096 +#include "task_local_data.h" + +struct test_tld_struct { + __u64 a; + __u64 b; + __u64 c; + __u64 d; +}; + +#include "test_task_local_data.skel.h" + +TLD_DEFINE_KEY(value0_key, "value0", sizeof(int)); + +/* + * Reset task local data between subtests by clearing metadata other + * than the statically defined value0. This is safe as subtests run + * sequentially. Users of task local data library should not touch + * library internal. 
+ */ +static void reset_tld(void) +{ + if (TLD_READ_ONCE(tld_meta_p)) { + /* Remove TLDs created by tld_create_key() */ + tld_meta_p->cnt = 1; + tld_meta_p->size = TLD_DYN_DATA_SIZE; + memset(&tld_meta_p->metadata[1], 0, + (TLD_MAX_DATA_CNT - 1) * sizeof(struct tld_metadata)); + } +} + +/* Serialize access to bpf program's global variables */ +static pthread_mutex_t global_mutex; + +static tld_key_t *tld_keys; + +#define TEST_BASIC_THREAD_NUM 32 + +void *test_task_local_data_basic_thread(void *arg) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct test_task_local_data *skel = (struct test_task_local_data *)arg; + int fd, err, tid, *value0, *value1; + struct test_tld_struct *value2; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + value0 = tld_get_data(fd, value0_key); + if (!ASSERT_OK_PTR(value0, "tld_get_data")) + goto out; + + value1 = tld_get_data(fd, tld_keys[1]); + if (!ASSERT_OK_PTR(value1, "tld_get_data")) + goto out; + + value2 = tld_get_data(fd, tld_keys[2]); + if (!ASSERT_OK_PTR(value2, "tld_get_data")) + goto out; + + tid = sys_gettid(); + + *value0 = tid + 0; + *value1 = tid + 1; + value2->a = tid + 2; + value2->b = tid + 3; + value2->c = tid + 4; + value2->d = tid + 5; + + pthread_mutex_lock(&global_mutex); + /* Run task_main, which reads task local data and saves it to global variables */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + + ASSERT_EQ(skel->bss->test_value0, tid + 0, "tld_get_data value0"); + ASSERT_EQ(skel->bss->test_value1, tid + 1, "tld_get_data value1"); + ASSERT_EQ(skel->bss->test_value2.a, tid + 2, "tld_get_data value2.a"); + ASSERT_EQ(skel->bss->test_value2.b, tid + 3, "tld_get_data value2.b"); + ASSERT_EQ(skel->bss->test_value2.c, tid + 4, "tld_get_data value2.c"); + ASSERT_EQ(skel->bss->test_value2.d, tid + 5, "tld_get_data value2.d"); + pthread_mutex_unlock(&global_mutex); + + /* Make sure valueX are indeed local to threads */ + ASSERT_EQ(*value0, tid + 0, "value0"); + ASSERT_EQ(*value1, tid + 1, "value1"); + ASSERT_EQ(value2->a, tid + 2, "value2.a"); + ASSERT_EQ(value2->b, tid + 3, "value2.b"); + ASSERT_EQ(value2->c, tid + 4, "value2.c"); + ASSERT_EQ(value2->d, tid + 5, "value2.d"); + + *value0 = tid + 5; + *value1 = tid + 4; + value2->a = tid + 3; + value2->b = tid + 2; + value2->c = tid + 1; + value2->d = tid + 0; + + /* Run task_main again */ + pthread_mutex_lock(&global_mutex); + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + + ASSERT_EQ(skel->bss->test_value0, tid + 5, "tld_get_data value0"); + ASSERT_EQ(skel->bss->test_value1, tid + 4, "tld_get_data value1"); + ASSERT_EQ(skel->bss->test_value2.a, tid + 3, "tld_get_data value2.a"); + ASSERT_EQ(skel->bss->test_value2.b, tid + 2, "tld_get_data value2.b"); + ASSERT_EQ(skel->bss->test_value2.c, tid + 1, "tld_get_data value2.c"); + ASSERT_EQ(skel->bss->test_value2.d, tid + 0, "tld_get_data value2.d"); + pthread_mutex_unlock(&global_mutex); + +out: + pthread_exit(NULL); +} + +static void test_task_local_data_basic(void) +{ + struct test_task_local_data *skel; + pthread_t thread[TEST_BASIC_THREAD_NUM]; + char dummy_key_name[TLD_NAME_LEN]; + tld_key_t key; + int i, err; + + reset_tld(); + + ASSERT_OK(pthread_mutex_init(&global_mutex, NULL), "pthread_mutex_init"); + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY"); + tld_keys[1] = tld_create_key("value1", sizeof(int)); + ASSERT_FALSE(tld_key_is_err(tld_keys[1]), "tld_create_key"); + tld_keys[2] = tld_create_key("value2", sizeof(struct test_tld_struct)); + ASSERT_FALSE(tld_key_is_err(tld_keys[2]), "tld_create_key"); + + /* + * Shouldn't be able to store data exceeding a page. Create a TLD just big + * enough to exceed a page. TLDs already created are int value0, int + * value1, and struct test_tld_struct value2. + */ + key = tld_create_key("value_not_exist", + TLD_PAGE_SIZE - 2 * sizeof(int) - sizeof(struct test_tld_struct) + 1); + ASSERT_EQ(tld_key_err_or_zero(key), -E2BIG, "tld_create_key"); + + key = tld_create_key("value2", sizeof(struct test_tld_struct)); + ASSERT_EQ(tld_key_err_or_zero(key), -EEXIST, "tld_create_key"); + + /* Shouldn't be able to create the (TLD_MAX_DATA_CNT+1)-th TLD */ + for (i = 3; i < TLD_MAX_DATA_CNT; i++) { + snprintf(dummy_key_name, TLD_NAME_LEN, "dummy_value%d", i); + tld_keys[i] = tld_create_key(dummy_key_name, sizeof(int)); + ASSERT_FALSE(tld_key_is_err(tld_keys[i]), "tld_create_key"); + } + key = tld_create_key("value_not_exist", sizeof(struct test_tld_struct)); + ASSERT_EQ(tld_key_err_or_zero(key), -ENOSPC, "tld_create_key"); + + /* Access TLDs from multiple threads and check if they are thread-specific */ + for (i = 0; i < TEST_BASIC_THREAD_NUM; i++) { + err = pthread_create(&thread[i], NULL, test_task_local_data_basic_thread, skel); + if (!ASSERT_OK(err, "pthread_create")) + goto out; + } + +out: + for (i = 0; i < TEST_BASIC_THREAD_NUM; i++) + pthread_join(thread[i], NULL); + + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + +#define TEST_RACE_THREAD_NUM (TLD_MAX_DATA_CNT - 3) + +void *test_task_local_data_race_thread(void *arg) +{ + int err = 0, id = (intptr_t)arg; + char key_name[32]; + tld_key_t key; + + key = tld_create_key("value_not_exist", TLD_PAGE_SIZE + 1); + if (tld_key_err_or_zero(key) != -E2BIG) { + err = 1; + goto out; + } + + /* Only one thread will succeed in creating value1 */ + key = tld_create_key("value1", sizeof(int)); + if (!tld_key_is_err(key)) + tld_keys[1] = key; + + /* Only one thread will succeed in creating value2 */ + key = tld_create_key("value2", sizeof(struct test_tld_struct)); + if (!tld_key_is_err(key)) + tld_keys[2] = key; + + snprintf(key_name, 32, "thread_%d", id); + tld_keys[id] = tld_create_key(key_name, sizeof(int)); + if (tld_key_is_err(tld_keys[id])) + err = 2; +out: + return (void *)(intptr_t)err; +} + +static void test_task_local_data_race(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + pthread_t thread[TEST_RACE_THREAD_NUM]; + struct test_task_local_data *skel; + int fd, i, j, err, *data; + void *ret = NULL; + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if (!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + ASSERT_FALSE(tld_key_is_err(value0_key), "TLD_DEFINE_KEY"); + tld_keys[0] = value0_key; + + for (j = 0; j < 100; j++) { + reset_tld(); + + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + /* + * Try to make tld_create_key() calls race with each other. Call + * tld_create_key(), both valid and invalid, from different threads. 
+ */ + err = pthread_create(&thread[i], NULL, test_task_local_data_race_thread, + (void *)(intptr_t)(i + 3)); + if (CHECK_FAIL(err)) + break; + } + + /* Wait for all tld_create_key() to return */ + for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { + pthread_join(thread[i], &ret); + if (CHECK_FAIL(ret)) + break; + } + + /* Write a unique number to each TLD */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(!data)) + break; + *data = i; + } + + /* Read TLDs and check the value to see if any address collides with another */ + for (i = 0; i < TLD_MAX_DATA_CNT; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (CHECK_FAIL(*data != i)) + break; + } + + /* Run task_main to make sure no invalid TLDs are added */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_OK(opts.retval, "task_main retval"); + } +out: + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + +void test_task_local_data(void) +{ + if (test__start_subtest("task_local_data_basic")) + test_task_local_data_basic(); + if (test__start_subtest("task_local_data_race")) + test_task_local_data_race(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_work.c b/tools/testing/selftests/bpf/prog_tests/test_task_work.c new file mode 100644 index 00000000000000..774b31a5f6ca12 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_work.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "task_work.skel.h" +#include "task_work_fail.skel.h" +#include +#include +#include +#include + +static int perf_event_open(__u32 type, __u64 config, int pid) +{ + struct perf_event_attr attr = { + .type = type, + .config = config, + .size = sizeof(struct perf_event_attr), + .sample_period = 100000, + }; + + return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0); +} + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +static int verify_map(struct bpf_map *map, const char *expected_data) +{ + int err; + struct elem value; + int processed_values = 0; + int k, sz; + + sz = bpf_map__max_entries(map); + for (k = 0; k < sz; ++k) { + err = bpf_map__lookup_elem(map, &k, sizeof(int), &value, sizeof(struct elem), 0); + if (err) + continue; + if (!ASSERT_EQ(strcmp(expected_data, value.data), 0, "map data")) { + fprintf(stderr, "expected '%s', found '%s' in %s map", expected_data, + value.data, bpf_map__name(map)); + return 2; + } + processed_values++; + } + + return processed_values == 0; +} + +static void task_work_run(const char *prog_name, const char *map_name) +{ + struct task_work *skel; + struct bpf_program *prog; + struct bpf_map *map; + struct bpf_link *link = NULL; + int err, pe_fd = -1, pid, status, pipefd[2]; + char user_string[] = "hello world"; + + if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) + return; + + pid = fork(); + if (pid == 0) { + __u64 num = 1; + int i; + char buf; + + close(pipefd[1]); + read(pipefd[0], &buf, sizeof(buf)); + close(pipefd[0]); + + for (i = 0; i < 10000; ++i) + num *= time(0) % 7; + (void)num; + exit(0); + } + if (!ASSERT_GT(pid, 0, "fork() failed")) { + close(pipefd[0]); + close(pipefd[1]); + return; + } + + skel = task_work__open(); + if (!ASSERT_OK_PTR(skel, "task_work__open")) + return; + + bpf_object__for_each_program(prog, skel->obj) { + bpf_program__set_autoload(prog, false); + } + + prog = 
bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "prog_name")) + goto cleanup; + bpf_program__set_autoload(prog, true); + skel->bss->user_ptr = (char *)user_string; + + err = task_work__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, pid); + if (pe_fd == -1 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); + test__skip(); + goto cleanup; + } + if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) { + fprintf(stderr, "perf_event_open errno: %d, pid: %d\n", errno, pid); + goto cleanup; + } + + link = bpf_program__attach_perf_event(prog, pe_fd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto cleanup; + + /* perf event fd ownership is passed to bpf_link */ + pe_fd = -1; + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + /* Wait to collect some samples */ + waitpid(pid, &status, 0); + pid = 0; + map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(map, "find map_name")) + goto cleanup; + if (!ASSERT_OK(verify_map(map, user_string), "verify map")) + goto cleanup; +cleanup: + if (pe_fd >= 0) + close(pe_fd); + bpf_link__destroy(link); + task_work__destroy(skel); + if (pid > 0) { + close(pipefd[0]); + write(pipefd[1], user_string, 1); + close(pipefd[1]); + waitpid(pid, &status, 0); + } +} + +void test_task_work(void) +{ + if (test__start_subtest("test_task_work_hash_map")) + task_work_run("oncpu_hash_map", "hmap"); + + if (test__start_subtest("test_task_work_array_map")) + task_work_run("oncpu_array_map", "arrmap"); + + if (test__start_subtest("test_task_work_lru_map")) + task_work_run("oncpu_lru_map", "lrumap"); + + RUN_TESTS(task_work_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index 367f47e4a936d4..b38c16b4247f77 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -75,26 +75,26 @@ static void test_set_global_vars_succeeds(void) " -vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); - __CHECK_STR("_w=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); - __CHECK_STR("_w=0x80000000 ", "var_s32 = -0x80000000"); - __CHECK_STR("_w=0x76543210 ", "var_u32 = 0x76543210"); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); - __CHECK_STR("_w=128 ", "var_s8 = -128"); - __CHECK_STR("_w=255 ", "var_u8 = 255"); - __CHECK_STR("_w=11 ", "var_ea = EA2"); - __CHECK_STR("_w=12 ", "var_eb = EB2"); - __CHECK_STR("_w=13 ", "var_ec = EC2"); - __CHECK_STR("_w=1 ", "var_b = 1"); - __CHECK_STR("_w=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); - __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); - __CHECK_STR("_w=171 ", "arr[3]= 171"); - __CHECK_STR("_w=172 ", "arr[EA2] =172"); - __CHECK_STR("_w=10 ", "enum_arr[EC2]=EA3"); - __CHECK_STR("_w=173 ", "matrix[31][7][11]=173"); - __CHECK_STR("_w=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); - __CHECK_STR("_w=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); + __CHECK_STR("=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); + __CHECK_STR("=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); + __CHECK_STR("=0x80000000 ", "var_s32 = -0x80000000"); + __CHECK_STR("=0x76543210 ", "var_u32 = 0x76543210"); + 
__CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=128 ", "var_s8 = -128"); + __CHECK_STR("=255 ", "var_u8 = 255"); + __CHECK_STR("=11 ", "var_ea = EA2"); + __CHECK_STR("=12 ", "var_eb = EB2"); + __CHECK_STR("=13 ", "var_ec = EC2"); + __CHECK_STR("=1 ", "var_b = 1"); + __CHECK_STR("=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); + __CHECK_STR("=0xaaaa ", "union1.var_u16 = 0xaaaa"); + __CHECK_STR("=171 ", "arr[3]= 171"); + __CHECK_STR("=172 ", "arr[EA2] =172"); + __CHECK_STR("=10 ", "enum_arr[EC2]=EA3"); + __CHECK_STR("=173 ", "matrix[31][7][11]=173"); + __CHECK_STR("=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); + __CHECK_STR("=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); out: teardown_fixture(fix); @@ -117,8 +117,8 @@ static void test_set_global_vars_from_file_succeeds(void) SYS(out, "%s set_global_vars.bpf.o -G \"@%s\" -vl2 > %s", fix->veristat, input_file, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); out: close(fd); diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index d66687f1ee6a8d..34f9ccce260293 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -3,6 +3,7 @@ #include #include "timer.skel.h" #include "timer_failure.skel.h" +#include "timer_interrupt.skel.h" #define NUM_THR 8 @@ -86,6 +87,10 @@ void serial_test_timer(void) int err; timer_skel = timer__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; @@ -95,3 +100,36 @@ void serial_test_timer(void) RUN_TESTS(timer_failure); } + +void test_timer_interrupt(void) +{ + struct timer_interrupt *skel = NULL; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_interrupt__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } + if (!ASSERT_OK_PTR(skel, "timer_interrupt__open_and_load")) + return; + + err = timer_interrupt__attach(skel); + if (!ASSERT_OK(err, "timer_interrupt__attach")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.test_timer_interrupt); + err = bpf_prog_test_run_opts(prog_fd, &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + + usleep(50); + + ASSERT_EQ(skel->bss->in_interrupt, 0, "in_interrupt"); + if (skel->bss->preempt_count) + ASSERT_NEQ(skel->bss->in_interrupt_cb, 0, "in_interrupt_cb"); + +out: + timer_interrupt__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c index f74b82305da8c8..b841597c8a3a31 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c @@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode) struct timer_crash *skel; skel = timer_crash__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load")) return; skel->bss->pid = getpid(); diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c index 1a2f99596916fb..eb303fa1e09af9 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c +++ 
b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -59,6 +59,10 @@ void test_timer_lockup(void) } skel = timer_lockup__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c index 9ff7843909e7d3..c930c7d7105b9f 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -65,6 +65,10 @@ void serial_test_timer_mim(void) goto cleanup; timer_skel = timer_mim__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c index 19e68d4b353278..6f8c0bfb041559 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c @@ -112,10 +112,39 @@ static void test_struct_many_args(void) tracing_struct_many_args__destroy(skel); } +static void test_union_args(void) +{ + struct tracing_struct *skel; + int err; + + skel = tracing_struct__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_struct__open_and_load")) + return; + + err = tracing_struct__attach(skel); + if (!ASSERT_OK(err, "tracing_struct__attach")) + goto out; + + ASSERT_OK(trigger_module_test_read(256), "trigger_read"); + + ASSERT_EQ(skel->bss->ut1_a_a, 1, "ut1:a.arg.a"); + ASSERT_EQ(skel->bss->ut1_b, 4, "ut1:b"); + ASSERT_EQ(skel->bss->ut1_c, 5, "ut1:c"); + + ASSERT_EQ(skel->bss->ut2_a, 6, "ut2:a"); + ASSERT_EQ(skel->bss->ut2_b_a, 2, "ut2:b.arg.a"); + ASSERT_EQ(skel->bss->ut2_b_b, 3, "ut2:b.arg.b"); + +out: + tracing_struct__destroy(skel); +} + void test_tracing_struct(void) { if (test__start_subtest("struct_args")) test_struct_args(); if (test__start_subtest("struct_many_args")) test_struct_many_args(); + if (test__start_subtest("union_args")) + test_union_args(); } diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe.c b/tools/testing/selftests/bpf/prog_tests/uprobe.c index cf3e0e7a64fae2..86404476c1dafe 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023 Hengqi Chen */ #include +#include #include "test_uprobe.skel.h" static FILE *urand_spawn(int *pid) @@ -33,7 +34,7 @@ static int urand_trigger(FILE **urand_pipe) return exit_code; } -void test_uprobe(void) +static void test_uprobe_attach(void) { LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); struct test_uprobe *skel; @@ -93,3 +94,156 @@ void test_uprobe(void) pclose(urand_pipe); test_uprobe__destroy(skel); } + +#ifdef __x86_64__ +__naked __maybe_unused unsigned long uprobe_regs_change_trigger(void) +{ + asm volatile ( + "ret\n" + ); +} + +static __naked void uprobe_regs_change(struct pt_regs *before, struct pt_regs *after) +{ + asm volatile ( + "movq %r11, 48(%rdi)\n" + "movq %r10, 56(%rdi)\n" + "movq %r9, 64(%rdi)\n" + "movq %r8, 72(%rdi)\n" + "movq %rax, 80(%rdi)\n" + "movq %rcx, 88(%rdi)\n" + "movq %rdx, 96(%rdi)\n" + "movq %rsi, 104(%rdi)\n" + "movq %rdi, 112(%rdi)\n" + + /* save 2nd argument */ + "pushq %rsi\n" + "call uprobe_regs_change_trigger\n" + + /* save return value and load 2nd argument pointer to rax */ + "pushq %rax\n" + "movq 8(%rsp), %rax\n" + + "movq %r11, 48(%rax)\n" + "movq %r10, 56(%rax)\n" + "movq %r9, 
64(%rax)\n" + "movq %r8, 72(%rax)\n" + "movq %rcx, 88(%rax)\n" + "movq %rdx, 96(%rax)\n" + "movq %rsi, 104(%rax)\n" + "movq %rdi, 112(%rax)\n" + + /* restore return value and 2nd argument */ + "pop %rax\n" + "pop %rsi\n" + + "movq %rax, 80(%rsi)\n" + "ret\n" + ); +} + +static void regs_common(void) +{ + struct pt_regs before = {}, after = {}, expected = { + .rax = 0xc0ffe, + .rcx = 0xbad, + .rdx = 0xdead, + .r8 = 0x8, + .r9 = 0x9, + .r10 = 0x10, + .r11 = 0x11, + .rdi = 0x12, + .rsi = 0x13, + }; + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_pid = getpid(); + skel->bss->regs = expected; + + uprobe_opts.func_name = "uprobe_regs_change_trigger"; + skel->links.test_regs_change = bpf_program__attach_uprobe_opts(skel->progs.test_regs_change, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + uprobe_regs_change(&before, &after); + + ASSERT_EQ(after.rax, expected.rax, "ax"); + ASSERT_EQ(after.rcx, expected.rcx, "cx"); + ASSERT_EQ(after.rdx, expected.rdx, "dx"); + ASSERT_EQ(after.r8, expected.r8, "r8"); + ASSERT_EQ(after.r9, expected.r9, "r9"); + ASSERT_EQ(after.r10, expected.r10, "r10"); + ASSERT_EQ(after.r11, expected.r11, "r11"); + ASSERT_EQ(after.rdi, expected.rdi, "rdi"); + ASSERT_EQ(after.rsi, expected.rsi, "rsi"); + +cleanup: + test_uprobe__destroy(skel); +} + +static noinline unsigned long uprobe_regs_change_ip_1(void) +{ + return 0xc0ffee; +} + +static noinline unsigned long uprobe_regs_change_ip_2(void) +{ + return 0xdeadbeef; +} + +static void regs_ip(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + struct test_uprobe *skel; + unsigned long ret; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_pid = getpid(); + skel->bss->ip = (unsigned long) uprobe_regs_change_ip_2; + + uprobe_opts.func_name = "uprobe_regs_change_ip_1"; + skel->links.test_regs_change_ip = bpf_program__attach_uprobe_opts( + skel->progs.test_regs_change_ip, + -1, + "/proc/self/exe", + 0 /* offset */, + &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.test_regs_change_ip, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + ret = uprobe_regs_change_ip_1(); + ASSERT_EQ(ret, 0xdeadbeef, "ret"); + +cleanup: + test_uprobe__destroy(skel); +} + +static void test_uprobe_regs_change(void) +{ + if (test__start_subtest("regs_change_common")) + regs_common(); + if (test__start_subtest("regs_change_ip")) + regs_ip(); +} +#else +static void test_uprobe_regs_change(void) { } +#endif + +void test_uprobe(void) +{ + if (test__start_subtest("attach")) + test_uprobe_attach(); + test_uprobe_regs_change(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23dbfa..6d75ede16e7ced 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,22 +8,31 @@ #include #include #include +#include #include #include #include #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" -__naked unsigned long uretprobe_regs_trigger(void) +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked 
unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). */ switch (offset) { /* @@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? 
(int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); @@ -301,6 +340,256 @@ static void test_uretprobe_syscall_call(void) close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + /* .. and check the trampoline is as expected. 
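+	 *
+	 * (Editor's sketch, not part of the patch: the nop5 at the probe
+	 * site is expected to have been rewritten to a 5-byte "call rel32",
+	 * i.e. opcode 0xe8 plus a signed 32-bit displacement relative to
+	 * the *next* instruction, so the trampoline address works out as
+	 *
+	 *   tramp = addr + 5 + call->raddr;
+	 *
+	 * which is what the pointer arithmetic below computes.)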
*/ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, 
&opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +632,172 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. */ + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; } -static void test_uretprobe_syscall_call(void) +static useconds_t race_msec(void) { - test__skip(); + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; } -static void test_uretprobe_shadow_stack(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = 
libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_error(void) +{ + long err = syscall(__NR_uprobe); + + ASSERT_EQ(err, -1, "error"); + ASSERT_EQ(errno, ENXIO, "errno"); +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_error")) + test_uprobe_error(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc5405..4f7f45e693153d 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,79 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +#if defined(__x86_64__) || defined(__i386__) +/* + * SIB (Scale-Index-Base) addressing format: "size@(base_reg, index_reg, scale)" + * - 'size' is the size in bytes of the array element, and its sign indicates + * whether the type is signed (negative) or unsigned (positive). + * - 'base_reg' is the register holding the base address, normally rdx or edx + * - 'index_reg' is the register holding the index, normally rax or eax + * - 'scale' is the scaling factor (typically 1, 2, 4, or 8), which matches the + * size of the element type. + * + * For example, for an array of 'short' (signed 2-byte elements), the SIB spec would be: + * - size: -2 (negative because 'short' is signed) + * - scale: 2 (since sizeof(short) == 2) + * + * The resulting SIB format: "-2@(%%rdx,%%rax,2)" for x86_64, "-2@(%%edx,%%eax,2)" for i386 + */ +static volatile short array[] = {-1, -2, -3, -4}; + +#if defined(__x86_64__) +#define USDT_SIB_ARG_SPEC -2@(%%rdx,%%rax,2) +#else +#define USDT_SIB_ARG_SPEC -2@(%%edx,%%eax,2) +#endif + +unsigned short test_usdt_sib_semaphore SEC(".probes"); + +static void trigger_sib_spec(void) +{ + /* + * Force SIB addressing with inline assembly. 
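+	 *
+	 * A worked example (editor's sketch, not from the patch): with the
+	 * register constraints below, base = %rdx (or %edx) = &array[0] and
+	 * index = %rax (or %eax) = 0, so a USDT consumer resolves the probe
+	 * argument as
+	 *
+	 *   addr  = base + index * scale;   (== &array[0] for index 0)
+	 *   value = *(short *)addr;         (== -1, sign-extended, size -2)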
+	 *
+	 * You must compile with -std=gnu99 or -std=c99 to use the
+	 * STAP_PROBE_ASM macro.
+	 *
+	 * The STAP_PROBE_ASM macro generates a quoted string that gets
+	 * inserted between the surrounding assembly instructions. In this
+	 * case, USDT_SIB_ARG_SPEC is embedded directly into the instruction
+	 * stream, creating a probe point between the asm statement boundaries.
+	 * It works fine with gcc/clang.
+	 *
+	 * Register constraints:
+	 * - "d"(array): Binds the 'array' variable to %rdx or %edx register
+	 * - "a"(0): Binds the constant 0 to %rax or %eax register
+	 * These ensure that when USDT_SIB_ARG_SPEC references %%rdx(%edx) and
+	 * %%rax(%eax), they contain the expected values for SIB addressing.
+	 *
+	 * The "memory" clobber prevents the compiler from reordering memory
+	 * accesses around the probe point, ensuring that the probe behavior
+	 * is predictable and consistent.
+	 */
+	asm volatile(
+		STAP_PROBE_ASM(test, usdt_sib, USDT_SIB_ARG_SPEC)
+		:
+		: "d"(array), "a"(0)
+		: "memory"
+	);
+}
+#endif
+
+static void subtest_basic_usdt(bool optimized)
 {
 	LIBBPF_OPTS(bpf_usdt_opts, opts);
 	struct test_usdt *skel;
 	struct test_usdt__bss *bss;
-	int err, i;
+	int err, i, called;
+	const __u64 expected_cookie = 0xcafedeadbeeffeed;
+
+#define TRIGGER(x) ({ \
+	trigger_func(x); \
+	if (optimized) \
+		trigger_func(x); \
+	optimized ? 2 : 1; \
+	})
 
 	skel = test_usdt__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
@@ -59,20 +126,29 @@ static void subtest_basic_usdt(void)
 		goto cleanup;
 
 	/* usdt0 won't be auto-attached */
-	opts.usdt_cookie = 0xcafedeadbeeffeed;
+	opts.usdt_cookie = expected_cookie;
 	skel->links.usdt0 = bpf_program__attach_usdt(skel->progs.usdt0,
 						     0 /*self*/, "/proc/self/exe",
 						     "test", "usdt0", &opts);
 	if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link"))
 		goto cleanup;
 
-	trigger_func(1);
+#if defined(__x86_64__) || defined(__i386__)
+	opts.usdt_cookie = expected_cookie;
+	skel->links.usdt_sib = bpf_program__attach_usdt(skel->progs.usdt_sib,
+							0 /*self*/, "/proc/self/exe",
+							"test", "usdt_sib", &opts);
+	if (!ASSERT_OK_PTR(skel->links.usdt_sib, "usdt_sib_link"))
+		goto cleanup;
+#endif
+
+	called = TRIGGER(1);
 
-	ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called");
-	ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called");
-	ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called");
+	ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
+	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
+	ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
 
-	ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie");
+	ASSERT_EQ(bss->usdt0_cookie, expected_cookie, "usdt0_cookie");
 	ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt");
 	ASSERT_EQ(bss->usdt0_arg_ret, -ENOENT, "usdt0_arg_ret");
 	ASSERT_EQ(bss->usdt0_arg_size, -ENOENT, "usdt0_arg_size");
@@ -119,11 +195,11 @@ static void subtest_basic_usdt(void)
 	 * bpf_program__attach_usdt() handles this properly and attaches to
 	 * all possible places of USDT invocation.
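	 *
	 * (Editor's note, an assumption about the optimized variant: each
	 * TRIGGER(x) fires trigger_func(x) twice -- once while the uprobe
	 * is still unoptimized and once more after the kernel has patched
	 * in the optimized probe -- so every *_called counter advances by
	 * 2 rather than 1, which is what the running `called` total below
	 * accounts for.)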
	 */
-	trigger_func(2);
+	called += TRIGGER(2);
 
-	ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
-	ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
-	ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
+	ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
+	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
+	ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
 
 	/* only check values that depend on trigger_func()'s input value */
 	ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
@@ -142,9 +218,9 @@ static void subtest_basic_usdt(void)
 	if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
 		goto cleanup;
 
-	trigger_func(3);
+	called += TRIGGER(3);
 
-	ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
+	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
 	/* this time usdt3 has custom cookie */
 	ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
 	ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
@@ -156,8 +232,19 @@ static void subtest_basic_usdt(void)
 	ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2");
 	ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3");
 
+#if defined(__x86_64__) || defined(__i386__)
+	trigger_sib_spec();
+	ASSERT_EQ(bss->usdt_sib_called, 1, "usdt_sib_called");
+	ASSERT_EQ(bss->usdt_sib_cookie, expected_cookie, "usdt_sib_cookie");
+	ASSERT_EQ(bss->usdt_sib_arg_cnt, 1, "usdt_sib_arg_cnt");
+	ASSERT_EQ(bss->usdt_sib_arg, array[0], "usdt_sib_arg");
+	ASSERT_EQ(bss->usdt_sib_arg_ret, 0, "usdt_sib_arg_ret");
+	ASSERT_EQ(bss->usdt_sib_arg_size, sizeof(array[0]), "usdt_sib_arg_size");
+#endif
+
 cleanup:
 	test_usdt__destroy(skel);
+#undef TRIGGER
 }
 
 unsigned short test_usdt_100_semaphore SEC(".probes");
@@ -425,7 +512,11 @@ static void subtest_urandom_usdt(bool auto_attach)
 void test_usdt(void)
 {
 	if (test__start_subtest("basic"))
-		subtest_basic_usdt();
+		subtest_basic_usdt(false);
+#ifdef __x86_64__
+	if (test__start_subtest("basic_optimized"))
+		subtest_basic_usdt(true);
+#endif
 	if (test__start_subtest("multispec"))
 		subtest_multispec_usdt();
 	if (test__start_subtest("urand_auto_attach"))
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 77ec95d4ffaae9..28e81161e6fca9 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -46,6 +46,7 @@
 #include "verifier_ldsx.skel.h"
 #include "verifier_leak_ptr.skel.h"
 #include "verifier_linked_scalars.skel.h"
+#include "verifier_live_stack.skel.h"
 #include "verifier_load_acquire.skel.h"
 #include "verifier_loops1.skel.h"
 #include "verifier_lwt.skel.h"
@@ -59,6 +60,7 @@
 #include "verifier_meta_access.skel.h"
 #include "verifier_movsx.skel.h"
 #include "verifier_mtu.skel.h"
+#include "verifier_mul.skel.h"
 #include "verifier_netfilter_ctx.skel.h"
 #include "verifier_netfilter_retcode.skel.h"
 #include "verifier_bpf_fastcall.skel.h"
@@ -183,6 +185,7 @@ void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); }
 void test_verifier_ldsx(void) { RUN(verifier_ldsx); }
 void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); }
 void test_verifier_linked_scalars(void) { RUN(verifier_linked_scalars); }
+void test_verifier_live_stack(void) { RUN(verifier_live_stack); }
 void test_verifier_loops1(void) { RUN(verifier_loops1); }
 void test_verifier_lwt(void) { RUN(verifier_lwt); }
 void test_verifier_map_in_map(void) { RUN(verifier_map_in_map); }
@@ -194,6 +197,7 @@ void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); }
 void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); }
 void
test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } +void test_verifier_mul(void) { RUN(verifier_mul); } void test_verifier_netfilter_ctx(void) { RUN(verifier_netfilter_ctx); } void test_verifier_netfilter_retcode(void) { RUN(verifier_netfilter_retcode); } void test_verifier_bpf_fastcall(void) { RUN(verifier_bpf_fastcall); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index b9d9f0a502cea7..178292d1251a10 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -9,6 +9,7 @@ #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" #define TAP_NAME "tap0" +#define DUMMY_NAME "dum0" #define TAP_NETNS "xdp_context_tuntap" #define TEST_PAYLOAD_LEN 32 @@ -96,9 +97,7 @@ void test_xdp_context_test_run(void) /* Meta data must be 255 bytes or smaller */ test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must match data_end - data_meta */ - test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), - sizeof(data) - 1, 0, 0, 0); + /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); @@ -156,15 +155,30 @@ static int send_test_packet(int ifindex) return -1; } -static void assert_test_result(struct test_xdp_meta *skel) +static int write_test_packet(int tap_fd) +{ + __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; + int n; + + /* The ethernet header doesn't need to be valid for this test */ + memset(packet, 0, sizeof(struct ethhdr)); + memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN); + + n = write(tap_fd, packet, sizeof(packet)); + if (!ASSERT_EQ(n, sizeof(packet), "write packet")) + return -1; + + return 0; +} + +static void assert_test_result(const struct bpf_map *result_map) { int err; __u32 map_key = 0; __u8 map_value[TEST_PAYLOAD_LEN]; - err = bpf_map__lookup_elem(skel->maps.test_result, &map_key, - sizeof(map_key), &map_value, - TEST_PAYLOAD_LEN, BPF_ANY); + err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key), + &map_value, TEST_PAYLOAD_LEN, BPF_ANY); if (!ASSERT_OK(err, "lookup test_result")) return; @@ -172,6 +186,18 @@ static void assert_test_result(struct test_xdp_meta *skel) "test_result map contains test payload"); } +static bool clear_test_result(struct bpf_map *result_map) +{ + const __u8 v[sizeof(test_payload)] = {}; + const __u32 k = 0; + int err; + + err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY); + ASSERT_OK(err, "update test_result"); + + return err == 0; +} + void test_xdp_context_veth(void) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); @@ -248,7 +274,7 @@ void test_xdp_context_veth(void) if (!ASSERT_OK(ret, "send_test_packet")) goto close; - assert_test_result(skel); + assert_test_result(skel->maps.test_result); close: close_netns(nstoken); @@ -257,17 +283,21 @@ void test_xdp_context_veth(void) netns_free(tx_ns); } -void test_xdp_context_tuntap(void) +static void test_tuntap(struct bpf_program *xdp_prog, + struct bpf_program *tc_prio_1_prog, + struct bpf_program *tc_prio_2_prog, + struct bpf_map *result_map) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); struct netns_obj *ns = NULL; - struct test_xdp_meta *skel = NULL; 
-	__u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN];
 	int tap_fd = -1;
 	int tap_ifindex;
 	int ret;
 
+	if (!clear_test_result(result_map))
+		return;
+
 	ns = netns_new(TAP_NETNS, true);
 	if (!ASSERT_OK_PTR(ns, "create and open ns"))
 		return;
@@ -278,10 +308,6 @@ void test_xdp_context_tuntap(void)
 
 	SYS(close, "ip link set dev " TAP_NAME " up");
 
-	skel = test_xdp_meta__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "open and load skeleton"))
-		goto close;
-
 	tap_ifindex = if_nametoindex(TAP_NAME);
 	if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex"))
 		goto close;
@@ -291,33 +317,175 @@ void test_xdp_context_tuntap(void)
-	tc_opts.prog_fd = bpf_program__fd(skel->progs.ing_cls);
+	tc_opts.prog_fd = bpf_program__fd(tc_prio_1_prog);
 	ret = bpf_tc_attach(&tc_hook, &tc_opts);
 	if (!ASSERT_OK(ret, "bpf_tc_attach"))
 		goto close;
 
-	ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp),
+	if (tc_prio_2_prog) {
+		LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 2,
+			    .prog_fd = bpf_program__fd(tc_prio_2_prog));
+
+		ret = bpf_tc_attach(&tc_hook, &tc_opts);
+		if (!ASSERT_OK(ret, "bpf_tc_attach"))
+			goto close;
+	}
+
+	ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog),
 			     0, NULL);
 	if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
 		goto close;
 
-	/* The ethernet header is not relevant for this test and doesn't need to
-	 * be meaningful.
-	 */
-	struct ethhdr eth = { 0 };
+	ret = write_test_packet(tap_fd);
+	if (!ASSERT_OK(ret, "write_test_packet"))
+		goto close;
 
-	memcpy(packet, &eth, sizeof(eth));
-	memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN);
+	assert_test_result(result_map);
+
+close:
+	if (tap_fd >= 0)
+		close(tap_fd);
+	netns_free(ns);
+}
+
+/* Write a packet to a tap dev and copy it to ingress of a dummy dev */
+static void test_tuntap_mirred(struct bpf_program *xdp_prog,
+			       struct bpf_program *tc_prog,
+			       bool *test_pass)
+{
+	LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS);
+	LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1);
+	struct netns_obj *ns = NULL;
+	int dummy_ifindex;
+	int tap_fd = -1;
+	int tap_ifindex;
+	int ret;
+
+	*test_pass = false;
+
+	ns = netns_new(TAP_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		return;
 
-	ret = write(tap_fd, packet, sizeof(packet));
-	if (!ASSERT_EQ(ret, sizeof(packet), "write packet"))
+	/* Setup dummy interface */
+	SYS(close, "ip link add name " DUMMY_NAME " type dummy");
+	SYS(close, "ip link set dev " DUMMY_NAME " up");
+
+	dummy_ifindex = if_nametoindex(DUMMY_NAME);
+	if (!ASSERT_GE(dummy_ifindex, 0, "if_nametoindex"))
 		goto close;
 
-	assert_test_result(skel);
+	tc_hook.ifindex = dummy_ifindex;
+	ret = bpf_tc_hook_create(&tc_hook);
+	if (!ASSERT_OK(ret, "bpf_tc_hook_create"))
+		goto close;
+
+	tc_opts.prog_fd = bpf_program__fd(tc_prog);
+	ret = bpf_tc_attach(&tc_hook, &tc_opts);
+	if (!ASSERT_OK(ret, "bpf_tc_attach"))
+		goto close;
+
+	/* Setup TAP interface */
+	tap_fd = open_tuntap(TAP_NAME, true);
+	if (!ASSERT_GE(tap_fd, 0, "open_tuntap"))
+		goto close;
+
+	SYS(close, "ip link set dev " TAP_NAME " up");
+
+	tap_ifindex = if_nametoindex(TAP_NAME);
+	if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex"))
+		goto close;
+
+	ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(xdp_prog), 0, NULL);
+	if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
+		goto close;
+
+	/* Copy all packets received from TAP to dummy ingress */
+	SYS(close, "tc qdisc add dev " TAP_NAME " clsact");
+	SYS(close, "tc filter add dev " TAP_NAME " ingress "
+		   "protocol all matchall "
+
"action mirred ingress mirror dev " DUMMY_NAME); + + /* Receive a packet on TAP */ + ret = write_test_packet(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet")) + goto close; + + ASSERT_TRUE(*test_pass, "test_pass"); close: if (tap_fd >= 0) close(tap_fd); - test_xdp_meta__destroy(skel); netns_free(ns); } + +void test_xdp_context_tuntap(void) +{ + struct test_xdp_meta *skel = NULL; + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + return; + + if (test__start_subtest("data_meta")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_read")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_read, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_slice")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_slice, + NULL, /* tc prio 2 */ + skel->maps.test_result); + if (test__start_subtest("dynptr_write")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_write, + skel->progs.ing_cls_dynptr_read, + skel->maps.test_result); + if (test__start_subtest("dynptr_slice_rdwr")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_slice_rdwr, + skel->progs.ing_cls_dynptr_slice, + skel->maps.test_result); + if (test__start_subtest("dynptr_offset")) + test_tuntap(skel->progs.ing_xdp_zalloc_meta, + skel->progs.ing_cls_dynptr_offset_wr, + skel->progs.ing_cls_dynptr_offset_rd, + skel->maps.test_result); + if (test__start_subtest("dynptr_offset_oob")) + test_tuntap(skel->progs.ing_xdp, + skel->progs.ing_cls_dynptr_offset_oob, + skel->progs.ing_cls, + skel->maps.test_result); + if (test__start_subtest("clone_data_meta_empty_on_data_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_data_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_data_meta_empty_on_meta_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_data_meta_empty_on_meta_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_data_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_data_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_empty_on_meta_slice_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_data_dynptr_write, + &skel->bss->test_pass); + if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write")) + test_tuntap_mirred(skel->progs.ing_xdp, + skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write, + &skel->bss->test_pass); + + test_xdp_meta__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 461ab18705d5c0..a8ab05216c3853 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -7,6 +7,7 @@ #include #include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_devmap_tailcall.skel.h" #include "test_xdp_with_devmap_frags_helpers.skel.h" #include "test_xdp_with_devmap_helpers.skel.h" @@ -107,6 +108,29 @@ static void test_neg_xdp_devmap_helpers(void) } } +static void 
test_xdp_devmap_tailcall(enum bpf_attach_type prog_dev, + enum bpf_attach_type prog_tail, + bool expect_reject) +{ + struct test_xdp_devmap_tailcall *skel; + int err; + + skel = test_xdp_devmap_tailcall__open(); + if (!ASSERT_OK_PTR(skel, "test_xdp_devmap_tailcall__open")) + return; + + bpf_program__set_expected_attach_type(skel->progs.xdp_devmap, prog_dev); + bpf_program__set_expected_attach_type(skel->progs.xdp_entry, prog_tail); + + err = test_xdp_devmap_tailcall__load(skel); + if (expect_reject) + ASSERT_ERR(err, "test_xdp_devmap_tailcall__load"); + else + ASSERT_OK(err, "test_xdp_devmap_tailcall__load"); + + test_xdp_devmap_tailcall__destroy(skel); +} + static void test_xdp_with_devmap_frags_helpers(void) { struct test_xdp_with_devmap_frags_helpers *skel; @@ -238,8 +262,13 @@ void serial_test_xdp_devmap_attach(void) if (test__start_subtest("DEVMAP with frags programs in entries")) test_xdp_with_devmap_frags_helpers(); - if (test__start_subtest("Verifier check of DEVMAP programs")) + if (test__start_subtest("Verifier check of DEVMAP programs")) { test_neg_xdp_devmap_helpers(); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, BPF_XDP_DEVMAP, false); + test_xdp_devmap_tailcall(0, 0, true); + test_xdp_devmap_tailcall(BPF_XDP_DEVMAP, 0, true); + test_xdp_devmap_tailcall(0, BPF_XDP_DEVMAP, true); + } if (test__start_subtest("DEVMAP with programs in entries on veth")) test_xdp_with_devmap_helpers_veth(); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c new file mode 100644 index 00000000000000..efa350d04ec5ff --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_xdp_pull_data.skel.h" + +#define PULL_MAX (1 << 31) +#define PULL_PLUS_ONE (1 << 30) + +#define XDP_PACKET_HEADROOM 256 + +/* Find headroom and tailroom occupied by struct xdp_frame and struct + * skb_shared_info so that we can calculate the maximum pull lengths for + * test cases. They might not be the real size of the structures due to + * cache alignment. 
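+ *
+ * A worked example (editor's sketch with made-up sizes): on a 4096-byte
+ * frame with xdpf_sz = 128 and sinfo_sz = 320, a packet with meta_len = 0
+ * and data_len = 1024 leaves
+ *
+ *   headroom = 256 - 0 - 128           = 128
+ *   tailroom = 4096 - 256 - 1024 - 320 = 2496
+ *
+ * so a PULL_MAX request in run_test() expands to 128 + 2496 + 1024 = 3648
+ * pullable bytes.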
+ */ +static int find_xdp_sizes(struct test_xdp_pull_data *skel, int frame_sz) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(frame_sz, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return -ENOMEM; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = frame_sz; + topts.data_size_out = frame_sz; + /* Pass a data_end larger than the linear space available to make sure + * bpf_prog_test_run_xdp() will fill the linear data area so that + * xdp_find_sizes can infer the size of struct skb_shared_info + */ + ctx.data_end = frame_sz; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + prog_fd = bpf_program__fd(skel->progs.xdp_find_sizes); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + free(buf); + + return err; +} + +/* xdp_pull_data_prog will directly read a marker 0xbb stored at buf[1024] + * so caller expecting XDP_PASS should always pass pull_len no less than 1024 + */ +static void run_test(struct test_xdp_pull_data *skel, int retval, + int frame_sz, int buff_len, int meta_len, int data_len, + int pull_len) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct xdp_md ctx = {}; + int prog_fd, err; + __u8 *buf; + + buf = calloc(buff_len, sizeof(__u8)); + if (!ASSERT_OK_PTR(buf, "calloc buf")) + return; + + buf[meta_len + 1023] = 0xaa; + buf[meta_len + 1024] = 0xbb; + buf[meta_len + 1025] = 0xcc; + + topts.data_in = buf; + topts.data_out = buf; + topts.data_size_in = buff_len; + topts.data_size_out = buff_len; + ctx.data = meta_len; + ctx.data_end = meta_len + data_len; + topts.ctx_in = &ctx; + topts.ctx_out = &ctx; + topts.ctx_size_in = sizeof(ctx); + topts.ctx_size_out = sizeof(ctx); + + skel->bss->data_len = data_len; + if (pull_len & PULL_MAX) { + int headroom = XDP_PACKET_HEADROOM - meta_len - skel->bss->xdpf_sz; + int tailroom = frame_sz - XDP_PACKET_HEADROOM - + data_len - skel->bss->sinfo_sz; + + pull_len = pull_len & PULL_PLUS_ONE ? 
1 : 0; + pull_len += headroom + tailroom + data_len; + } + skel->bss->pull_len = pull_len; + + prog_fd = bpf_program__fd(skel->progs.xdp_pull_data_prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(topts.retval, retval, "xdp_pull_data_prog retval"); + + if (retval == XDP_DROP) + goto out; + + ASSERT_EQ(ctx.data_end, meta_len + pull_len, "linear data size"); + ASSERT_EQ(topts.data_size_out, buff_len, "linear + non-linear data size"); + /* Make sure data around xdp->data_end was not messed up by + * bpf_xdp_pull_data() + */ + ASSERT_EQ(buf[meta_len + 1023], 0xaa, "data[1023]"); + ASSERT_EQ(buf[meta_len + 1024], 0xbb, "data[1024]"); + ASSERT_EQ(buf[meta_len + 1025], 0xcc, "data[1025]"); +out: + free(buf); +} + +static void test_xdp_pull_data_basic(void) +{ + u32 pg_sz, max_meta_len, max_data_len; + struct test_xdp_pull_data *skel; + + skel = test_xdp_pull_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) + return; + + pg_sz = sysconf(_SC_PAGE_SIZE); + + if (find_xdp_sizes(skel, pg_sz)) + goto out; + + max_meta_len = XDP_PACKET_HEADROOM - skel->bss->xdpf_sz; + max_data_len = pg_sz - XDP_PACKET_HEADROOM - skel->bss->sinfo_sz; + + /* linear xdp pkt, pull 0 byte */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 2048, 2048); + + /* multi-buf pkt, pull results in linear xdp pkt */ + run_test(skel, XDP_PASS, pg_sz, 2048, 0, 1024, 2048); + + /* multi-buf pkt, pull 1 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1024, 1025); + + /* multi-buf pkt, pull 0 byte to linear data area */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); + + /* multi-buf pkt, empty linear data area, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + + /* multi-buf pkt, no tailroom, pull requires memmove */ + run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + + /* Test cases with invalid pull length */ + + /* linear xdp pkt, pull more than total data len */ + run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); + + /* multi-buf pkt with no space left in linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, empty linear data area */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no headroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + PULL_MAX | PULL_PLUS_ONE); + + /* multi-buf pkt, no tailroom */ + run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + PULL_MAX | PULL_PLUS_ONE); + +out: + test_xdp_pull_data__destroy(skel); +} + +void test_xdp_pull_data(void) +{ + if (test__start_subtest("xdp_pull_data")) + test_xdp_pull_data_basic(); +} diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index a52feff9811264..d1841aac94a22f 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -28,7 +28,8 @@ bool skip_all_tests = true; #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) bool skip_lacq_srel_tests __attribute((__section__(".data"))) = false; #else bool 
skip_lacq_srel_tests = true; @@ -314,7 +315,8 @@ int load_acquire(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define LOAD_ACQUIRE_ARENA(SIZEOP, SIZE, SRC, DST) \ { asm volatile ( \ @@ -365,7 +367,8 @@ int store_release(const void *ctx) { #if defined(ENABLE_ATOMICS_TESTS) && \ defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) #define STORE_RELEASE_ARENA(SIZEOP, DST, VAL) \ { asm volatile ( \ diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index c4500c37f85e06..086b57a426cf5a 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -37,8 +37,11 @@ int prog(void *ctx) #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) unsigned long flags; - if ((ret = arena_spin_lock_irqsave(&lock, flags))) + if ((ret = arena_spin_lock_irqsave(&lock, flags))) { + if (ret == -EOPNOTSUPP) + test_skip = 3; return ret; + } if (counter != limit) counter++; bpf_repeat(cs_count); diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h index d67466c1ff7754..f90531cf3ee59e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -302,7 +302,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val * barriers. */ if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); /* * take ownership and clear the pending bit. @@ -380,7 +380,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); /* * While waiting for the MCS lock, the next pointer may have diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 1654a530aa3dc6..4e51785e7606e7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -101,7 +101,7 @@ static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, tp->snd_cwnd = pkts_in_flight + sndcnt; } -/* Decide wheather to run the increase function of congestion control. */ +/* Decide whether to run the increase function of congestion control. 
*/
 static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 	if (tcp_sk(sk)->reordering > TCP_REORDERING)
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 7cd73e75f52a2b..32c511bcd60b3a 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019 Facebook */
 
-/* WARNING: This implemenation is not necessarily the same
+/* WARNING: This implementation is not necessarily the same
  * as the tcp_dctcp.c.  The purpose is mainly for testing
  * the kernel BPF logic.
  */
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index c1cfd297aabf11..a7a1a684eed116 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -33,7 +33,20 @@
 *                   e.g. "foo{{[0-9]+}}" matches strings like "foo007".
 *                   Extended POSIX regular expression syntax is allowed
 *                   inside the brackets.
+ * __not_msg        Message not expected to be found in verifier log.
+ *                  If __not_msg is situated between __msg tags, the
+ *                  framework matches the __msg tags first, and then
+ *                  checks that __not_msg is not present in the portion of
+ *                  the log between the bracketing __msg tags (see the
+ *                  example at the end of this comment).
+ *                  Same regex syntax as for __msg is supported.
 * __msg_unpriv     Same as __msg but for unprivileged mode.
+ * __not_msg_unpriv Same as __not_msg but for unprivileged mode.
+ *
+ * __stderr         Message expected to be found in bpf stderr stream. The
+ *                  same regex rules as for __msg apply.
+ * __stderr_unpriv  Same as __stderr but for unprivileged mode.
+ * __stdout         Same as __stderr but for stdout stream.
+ * __stdout_unpriv  Same as __stdout but for unprivileged mode.
 *
 * __xlated          Expect a line in a disassembly log after verifier applies rewrites.
 *                   Multiple __xlated attributes could be specified.
@@ -115,12 +128,14 @@
 * __caps_unpriv     Specify the capabilities that should be set when running the test.
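 *
 * A hypothetical example of the __not_msg bracketing described above
 * (editor's sketch, not from the patch):
 *
 *   __msg("r0 = 42")
 *   __not_msg("invalid access")
 *   __msg("exit")
 *
 * passes only if "invalid access" does not occur in the log between the
 * lines matched by the two __msg tags.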
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg))) #define __xlated(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated=" XSTR(__COUNTER__) "=" msg))) #define __jited(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure __attribute__((btf_decl_tag("comment:test_expect_failure"))) #define __success __attribute__((btf_decl_tag("comment:test_expect_success"))) #define __description(desc) __attribute__((btf_decl_tag("comment:test_description=" desc))) #define __msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __not_msg_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __xlated_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_xlated_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __jited_unpriv(msg) __attribute__((btf_decl_tag("comment:test_jited=" XSTR(__COUNTER__) "=" msg))) #define __failure_unpriv __attribute__((btf_decl_tag("comment:test_expect_failure_unpriv"))) @@ -136,9 +151,14 @@ #define __arch_x86_64 __arch("X86_64") #define __arch_arm64 __arch("ARM64") #define __arch_riscv64 __arch("RISCV64") +#define __arch_s390x __arch("s390x") #define __caps_unpriv(caps) __attribute__((btf_decl_tag("comment:test_caps_unpriv=" EXPAND_QUOTE(caps)))) #define __load_if_JITed() __attribute__((btf_decl_tag("comment:load_mode=jited"))) #define __load_if_no_JITed() __attribute__((btf_decl_tag("comment:load_mode=no_jited"))) +#define __stderr(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr=" XSTR(__COUNTER__) "=" msg))) +#define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __stdout(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg))) +#define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg))) /* Define common capabilities tested using __caps_unpriv */ #define CAP_NET_ADMIN 12 @@ -156,6 +176,10 @@ #define __imm_ptr(name) [name]"r"(&name) #define __imm_insn(name, expr) [name]"i"(*(long *)&(expr)) +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + /* Magic constants used with __retval() */ #define POINTER_VALUE 0xbadcafe #define TEST_DATA_LEN 64 diff --git a/tools/testing/selftests/bpf/progs/bpf_test_utils.h b/tools/testing/selftests/bpf/progs/bpf_test_utils.h new file mode 100644 index 00000000000000..f4e67b492dd2b1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_test_utils.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BPF_TEST_UTILS_H__ +#define __BPF_TEST_UTILS_H__ + +#include +#include "bpf_misc.h" + +/* Clobber as many native registers and stack slots as possible. 
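+ *
+ * (Editor's note, an assumption about why this works: bpf_strtoul() is
+ * a helper call, so the JIT must treat the caller-saved BPF registers
+ * R1-R5 as clobbered and spill anything live across the call to the
+ * stack -- exactly the register and stack pressure these tests want.)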
+ */
+static __always_inline void clobber_regs_stack(void)
+{
+	char tmp_str[] = "123456789";
+	unsigned long tmp;
+
+	bpf_strtoul(tmp_str, sizeof(tmp_str), 0, &tmp);
+	__sink(tmp);
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c
index 092db1d0435e66..88e13e17ec9e97 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_read_xattr.c
@@ -73,7 +73,7 @@ int BPF_PROG(use_css_iter_non_sleepable)
 }
 
 SEC("lsm.s/socket_connect")
-__failure __msg("expected an RCU CS")
+__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection")
 int BPF_PROG(use_css_iter_sleepable_missing_rcu_lock)
 {
 	u64 cgrp_id = bpf_get_current_cgroup_id();
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
index 5354455a01be8a..02d8f160ca0e95 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_success.c
@@ -221,3 +221,15 @@ int BPF_PROG(test_cgrp_from_id, struct cgroup *cgrp, const char *path)
 
 	return 0;
 }
+
+SEC("syscall")
+int test_cgrp_from_id_ns(void *ctx)
+{
+	struct cgroup *cg;
+
+	cg = bpf_cgroup_from_id(1);
+	if (!cg)
+		return 42;
+	bpf_cgroup_release(cg);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c
index 645be6cddf36cd..dfd8a258f14a88 100644
--- a/tools/testing/selftests/bpf/progs/crypto_sanity.c
+++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c
@@ -14,7 +14,7 @@ unsigned char key[256] = {};
 u16 udp_test_port = 7777;
 u32 authsize, key_len;
 char algo[128] = {};
-char dst[16] = {};
+char dst[16] = {}, dst_bad[8] = {};
 int status;
 
 static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc)
@@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx)
 		.authsize = authsize,
 	};
 	struct bpf_crypto_ctx *cctx;
-	int err = 0;
+	int err;
 
 	status = 0;
-
 	if (key_len > 256) {
 		status = -EINVAL;
 		return 0;
@@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx)
 	__builtin_memcpy(&params.algo, algo, sizeof(algo));
 	__builtin_memcpy(&params.key, key, sizeof(key));
-	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
 	if (!cctx) {
 		status = err;
 		return 0;
@@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx)
 	err = crypto_ctx_insert(cctx);
 	if (err && err != -EEXIST)
 		status = err;
-
 	return 0;
 }
 
@@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb)
 	struct bpf_dynptr psrc, pdst;
 	int err;
 
+	status = 0;
 	err = skb_dynptr_validate(skb, &psrc);
 	if (err < 0) {
 		status = err;
@@ -110,13 +109,23 @@
 		return TC_ACT_SHOT;
 	}
 
-	/* dst is a global variable to make testing part easier to check. In real
-	 * production code, a percpu map should be used to store the result.
+	/* Check also bad case where the dst buffer is smaller than the
+	 * skb's linear section.
+	 */
+	bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst);
+	status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL);
+	if (!status)
+		status = -EIO;
+	if (status != -EINVAL)
+		goto err;
+
+	/* dst is a global variable to make testing part easier to check.
+	 * In real production code, a percpu map should be used to store
+	 * the result.
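+	 *
+	 * A minimal sketch of that alternative (editor's addition with
+	 * hypothetical names, not part of this patch):
+	 *
+	 *   struct {
+	 *           __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	 *           __uint(max_entries, 1);
+	 *           __type(key, __u32);
+	 *           __type(value, char[16]);
+	 *   } results SEC(".maps");
+	 *
+	 * where each CPU would write its output into its own slot via
+	 * bpf_map_lookup_elem() instead of sharing one global buffer.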
*/ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } @@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb) int err; status = 0; - err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index bd8f15229f5c73..dda6a8dada826c 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -269,6 +269,26 @@ int data_slice_out_of_bounds_skb(struct __sk_buff *skb) return SK_PASS; } +/* A metadata slice can't be accessed out of bounds */ +SEC("?tc") +__failure __msg("value is outside of the allowed memory range") +int data_slice_out_of_bounds_skb_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *(md + 1) = 42; + + return SK_PASS; +} + SEC("?raw_tp") __failure __msg("value is outside of the allowed memory range") int data_slice_out_of_bounds_map_value(void *ctx) @@ -1089,6 +1109,26 @@ int skb_invalid_slice_write(struct __sk_buff *skb) return SK_PASS; } +/* bpf_dynptr_slice()s are read-only and cannot be written to */ +SEC("?tc") +__failure __msg("R{{[0-9]+}} cannot write into rdonly_mem") +int skb_meta_invalid_slice_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes packet data */ SEC("?tc") __failure __msg("invalid mem access 'scalar'") @@ -1192,6 +1232,188 @@ int skb_invalid_data_slice4(struct __sk_buff *skb) return SK_PASS; } +/* Read-only skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *d; + + return SK_PASS; +} + +/* Read-write skb data slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *d; + + 
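+	/* Editor's note (an assumption about the mechanism): a write
+	 * through the metadata dynptr may unclone or otherwise move the
+	 * skb buffer, so the verifier conservatively downgrades any
+	 * previously obtained packet slice to an untracked scalar --
+	 * hence the expected error at the write below.
+	 */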
bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + d = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*d)); + if (!d) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *d = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb data slice */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_write(struct __sk_buff *skb) +{ + struct bpf_dynptr data, meta; + __u8 *md; + + bpf_dynptr_from_skb(skb, 0, &data); + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&data, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated whenever a helper changes packet data */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_payload_helper(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + +/* Read-only skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int ro_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + val = *md; + + return SK_PASS; +} + +/* Read-write skb metadata slice is invalidated on write to skb metadata */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int rw_skb_meta_slice_invalid_after_metadata_write(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + bpf_dynptr_write(&meta, 0, "x", 1, 0); + + /* this should fail */ + *md = 42; + + return SK_PASS; +} + /* The read-only data slice is invalidated whenever a helper changes packet data */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") @@ -1255,6 +1477,19 @@ int 
skb_invalid_ctx(void *ctx) return 0; } +/* Only supported prog type can create skb_meta-type dynptrs */ +SEC("?raw_tp") +__failure __msg("calling kernel function bpf_dynptr_from_skb_meta is not allowed") +int skb_meta_invalid_ctx(void *ctx) +{ + struct bpf_dynptr meta; + + /* this should fail */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + + return 0; +} + SEC("fentry/skb_tx_error") __failure __msg("must be referenced or trusted") int BPF_PROG(skb_invalid_ctx_fentry, void *skb) @@ -1665,6 +1900,29 @@ int clone_skb_packet_data(struct __sk_buff *skb) return 0; } +/* A skb clone's metadata slice becomes invalid anytime packet data changes */ +SEC("?tc") +__failure __msg("invalid mem access 'scalar'") +int clone_skb_packet_meta(struct __sk_buff *skb) +{ + struct bpf_dynptr clone, meta; + __u8 *md; + + bpf_dynptr_from_skb_meta(skb, 0, &meta); + bpf_dynptr_clone(&meta, &clone); + md = bpf_dynptr_slice_rdwr(&clone, 0, NULL, sizeof(*md)); + if (!md) + return SK_DROP; + + if (bpf_skb_pull_data(skb, skb->len)) + return SK_DROP; + + /* this should fail */ + *md = 42; + + return 0; +} + /* A xdp clone's data slices should be invalid anytime packet data changes */ SEC("?xdp") __failure __msg("invalid mem access 'scalar'") diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 8315273cb900c2..127dea342e5a67 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -211,6 +211,61 @@ int test_dynptr_skb_data(struct __sk_buff *skb) return 1; } +SEC("?tc") +int test_dynptr_skb_meta_data(struct __sk_buff *skb) +{ + struct bpf_dynptr meta; + __u8 *md; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + /* This should return NULL. Must use bpf_dynptr_slice API */ + err = 2; + md = bpf_dynptr_data(&meta, 0, sizeof(*md)); + if (md) + return 1; + + err = 0; + return 1; +} + +/* Check that skb metadata dynptr ops don't accept any flags. 
*/ +SEC("?tc") +int test_dynptr_skb_meta_flags(struct __sk_buff *skb) +{ + const __u64 INVALID_FLAGS = ~0ULL; + struct bpf_dynptr meta; + __u8 buf; + int ret; + + err = 1; + ret = bpf_dynptr_from_skb_meta(skb, INVALID_FLAGS, &meta); + if (ret != -EINVAL) + return 1; + + err = 2; + ret = bpf_dynptr_from_skb_meta(skb, 0, &meta); + if (ret) + return 1; + + err = 3; + ret = bpf_dynptr_read(&buf, 0, &meta, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 4; + ret = bpf_dynptr_write(&meta, 0, &buf, 0, INVALID_FLAGS); + if (ret != -EINVAL) + return 1; + + err = 0; + return 1; +} + SEC("tp/syscalls/sys_enter_nanosleep") int test_adjust(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 5e0a1ca96d4e27..a01c2736890f94 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,43 +18,43 @@ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000") +__msg(": R0=0xffffffff80000000") check_assert(s64, ==, eq_int_min, INT_MIN); -__msg(": R0_w=0x7fffffff") +__msg(": R0=0x7fffffff") check_assert(s64, ==, eq_int_max, INT_MAX); -__msg(": R0_w=0") +__msg(": R0=0") check_assert(s64, ==, eq_zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +__msg(": R0=0x8000000000000000 R1=0x8000000000000000") check_assert(s64, ==, eq_llong_min, LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +__msg(": R0=0x7fffffffffffffff R1=0x7fffffffffffffff") check_assert(s64, ==, eq_llong_max, LLONG_MAX); -__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +__msg(": R0=scalar(id=1,smax=0x7ffffffe)") check_assert(s64, <, lt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, <, lt_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +__msg(": R0=scalar(id=1,smax=0xffffffff7fffffff") check_assert(s64, <, lt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +__msg(": R0=scalar(id=1,smax=0x7fffffff)") check_assert(s64, <=, le_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=0)") +__msg(": R0=scalar(id=1,smax=0)") check_assert(s64, <=, le_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +__msg(": R0=scalar(id=1,smax=0xffffffff80000000") check_assert(s64, <=, le_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +__msg(": R0=scalar(id=1,smin=0xffffffff80000001") check_assert(s64, >, gt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": 
R0=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +__msg(": R0=scalar(id=1,smin=0xffffffff80000000") check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") diff --git a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c index 544e5ac9046106..d09bbd8ae8a85b 100644 --- a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c +++ b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c @@ -12,7 +12,7 @@ SEC("freplace/connect_v4_prog") int new_connect_v4_prog(struct bpf_sock_addr *ctx) { - // return value thats in invalid range + // return value that's in invalid range return 255; } diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index f41257eadbb258..d273b46dfc7c19 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; @@ -345,7 +345,7 @@ int __naked read_from_iter_slot_fail(void) "r3 = 1000;" "call %[bpf_iter_num_new];" - /* attemp to leak bpf_iter_num state */ + /* attempt to leak bpf_iter_num state */ "r7 = *(u64 *)(r6 + 0);" "r8 = *(u64 *)(r6 + 8);" diff --git a/tools/testing/selftests/bpf/progs/iters_task_failure.c b/tools/testing/selftests/bpf/progs/iters_task_failure.c index 6b1588d7065273..fe3663dedbe14d 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_failure.c +++ b/tools/testing/selftests/bpf/progs/iters_task_failure.c @@ -15,7 +15,7 @@ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_task_next") +__failure __msg("kernel func bpf_iter_task_new requires RCU critical section protection") int BPF_PROG(iter_tasks_without_lock) { struct task_struct *pos; @@ -27,7 +27,7 @@ int BPF_PROG(iter_tasks_without_lock) } SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") -__failure __msg("expected an RCU CS when using bpf_iter_css_next") +__failure __msg("kernel func bpf_iter_css_new requires RCU critical section protection") int BPF_PROG(iter_css_without_lock) { u64 cg_id = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 9e4b45201e6927..5379e9960ffd2d 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -123,3 +123,49 @@ int iter_next_ptr_mem_not_trusted(const void *ctx) bpf_iter_num_destroy(&num_it); return 0; } + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test requires RCU critical section protection") +int iter_ret_rcu_test_protected(const void *ctx) +{ + struct task_struct *p; + + p = bpf_kfunc_ret_rcu_test(); + return p->pid; +} + +SEC("?fentry.s/" 
SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rcu_ptr_or_null_ expected=") +int iter_ret_rcu_test_type(const void *ctx) +{ + struct task_struct *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test(); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("kernel func bpf_kfunc_ret_rcu_test_nostruct requires RCU critical section protection") +int iter_ret_rcu_test_protected_nostruct(const void *ctx) +{ + void *p; + + p = bpf_kfunc_ret_rcu_test_nostruct(4); + return *(int *)p; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("R1 type=rdonly_rcu_mem_or_null expected=") +int iter_ret_rcu_test_type_nostruct(const void *ctx) +{ + void *p; + + bpf_rcu_read_lock(); + p = bpf_kfunc_ret_rcu_test_nostruct(4); + bpf_this_cpu_ptr(p); + bpf_rcu_read_unlock(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 6543d5b6e0a976..83791348bed526 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,7 +20,7 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) @@ -38,7 +38,7 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) @@ -58,7 +58,7 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c new file mode 100644 index 00000000000000..f77aef0474d3b6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_write_ctx.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#if defined(__TARGET_ARCH_x86) +SEC("kprobe") +int kprobe_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} + +SEC("kprobe.multi") +int kprobe_multi_write_ctx(struct pt_regs *ctx) +{ + ctx->ax = 0; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 6438982b928bdc..ddd26d1a083f72 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx) SEC("?tc") int obj_new_no_struct(void *ctx) { - - bpf_obj_new(union { int data; unsigned udata; }); + (void)bpf_obj_new(union { int data; unsigned udata; }); return 0; } @@ -252,7 +251,7 @@ int new_null_ret(void *ctx) SEC("?tc") int obj_new_acq(void *ctx) { - bpf_obj_new(struct foo); + (void)bpf_obj_new(struct foo); return 0; } diff 
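The two _protected failures above call the testmod kfuncs outside any RCU read-side critical section, and the two _type failures pass the RCU-tagged (possibly NULL) result straight to bpf_this_cpu_ptr(), which expects a per-CPU pointer. The passing counterpart, sketched against the same testmod kfunc, holds the lock across both the call and the dereference and NULL-checks the result:

	SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
	int ret_rcu_test_ok(const void *ctx)
	{
		struct task_struct *p;
		int pid = 0;

		bpf_rcu_read_lock();
		p = bpf_kfunc_ret_rcu_test();
		if (p)
			pid = p->pid;
		bpf_rcu_read_unlock();
		return pid;
	}
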
--git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c index 50e66772c0467c..b0fa26fb476080 100644 --- a/tools/testing/selftests/bpf/progs/loop1.c +++ b/tools/testing/selftests/bpf/progs/loop1.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c index 947bb7e988c21d..0227409d4b0e0c 100644 --- a/tools/testing/selftests/bpf/progs/loop2.c +++ b/tools/testing/selftests/bpf/progs/loop2.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 717dab14322be5..5d1c9a775e6bbb 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c index e4ff97fbcce184..dd36aff4fba360 100644 --- a/tools/testing/selftests/bpf/progs/loop6.c +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include +#include +#include #include #include #include "bpf_misc.h" @@ -26,12 +25,6 @@ char _license[] SEC("license") = "GPL"; #define SG_CHAIN 0x01UL #define SG_END 0x02UL -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; -}; - #define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) #define sg_is_last(sg) ((sg)->page_link & SG_END) #define sg_chain_ptr(sg) \ @@ -62,7 +55,7 @@ static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) return sgp; } -int config = 0; +int run_once = 0; int result = 0; SEC("kprobe/virtqueue_add_sgs") @@ -73,14 +66,14 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __u64 length1 = 0, length2 = 0; unsigned int i, n, len; - if (config != 0) + if (run_once != 0) return 0; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { __sink(out_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length1 += len; n++; } @@ -90,13 +83,13 @@ int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, __sink(in_sgs); for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); sgp = __sg_next(sgp)) { - bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + len = BPF_CORE_READ(sgp, length); length2 += len; n++; } } - config = 1; + run_once = 1; result = length2 - length1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/lpm_trie.h b/tools/testing/selftests/bpf/progs/lpm_trie.h new file mode 100644 index 00000000000000..76aa5821807f25 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PROGS_LPM_TRIE_H +#define __PROGS_LPM_TRIE_H + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +/* Benchmark operations */ +enum { + LPM_OP_NOOP = 0, + LPM_OP_BASELINE, + LPM_OP_LOOKUP, + LPM_OP_INSERT, + 
LPM_OP_UPDATE, + LPM_OP_DELETE, + LPM_OP_FREE +}; + +/* + * Return values from run_bench. + * + * Negative values are also allowed and represent kernel error codes. + */ +#define LPM_BENCH_SUCCESS 0 +#define LPM_BENCH_REINIT_MAP 1 /* Reset trie to initial state for current op */ + +#endif diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c new file mode 100644 index 00000000000000..a0e6ebd5507a92 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Cloudflare */ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_atomic.h" +#include "progs/lpm_trie.h" + +#define BPF_OBJ_NAME_LEN 16U +#define MAX_ENTRIES 100000000 +#define NR_LOOPS 10000 + +char _license[] SEC("license") = "GPL"; + +/* Filled by userspace. See fill_map() in bench_lpm_trie_map.c */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_map SEC(".maps"); + +long hits; +long duration_ns; + +/* Configured from userspace */ +__u32 nr_entries; +__u32 prefixlen; +bool random; +__u8 op; + +static __u64 latency_free_start; + +SEC("fentry/bpf_map_free_deferred") +int BPF_PROG(trie_free_entry, struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + char name[BPF_OBJ_NAME_LEN]; + u32 map_type; + + map_type = BPF_CORE_READ(map, map_type); + if (map_type != BPF_MAP_TYPE_LPM_TRIE) + return 0; + + /* + * Ideally we'd have access to the map ID but that's already + * freed before we enter trie_free(). + */ + BPF_CORE_READ_STR_INTO(&name, map, name); + if (bpf_strncmp(name, BPF_OBJ_NAME_LEN, "trie_free_map")) + return 0; + + latency_free_start = bpf_ktime_get_ns(); + + return 0; +} + +SEC("fexit/bpf_map_free_deferred") +int BPF_PROG(trie_free_exit, struct work_struct *work) +{ + __u64 val; + + if (!latency_free_start) + return 0; + + val = bpf_ktime_get_ns() - latency_free_start; + latency_free_start = 0; + + __sync_add_and_fetch(&duration_ns, val); + __sync_add_and_fetch(&hits, 1); + + return 0; +} + +static __u32 cur_key; + +static __always_inline void generate_key(struct trie_key *key) +{ + key->prefixlen = prefixlen; + + if (random) + key->data = bpf_get_prandom_u32() % nr_entries; + else + key->data = cur_key++ % nr_entries; +} + +static int noop(__u32 index, __u32 *unused) +{ + return 0; +} + +static int baseline(__u32 index, __u32 *unused) +{ + struct trie_key key; + __u32 blackbox = 0; + + generate_key(&key); + /* Avoid compiler optimizing out the modulo */ + barrier_var(blackbox); + blackbox = READ_ONCE(key.data); + + return 0; +} + +static int lookup(__u32 index, int *retval) +{ + struct trie_key key; + + generate_key(&key); + if (!bpf_map_lookup_elem(&trie_map, &key)) { + *retval = -ENOENT; + return 1; + } + + return 0; +} + +static int insert(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_NOEXIST); + if (err) { + *retval = err; + return 1; + } + + /* Is this the last entry? 
*/ + if (key.data == nr_entries - 1) { + /* For atomicity concerns, see the comment in delete() */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +static int update(__u32 index, int *retval) +{ + struct trie_key key; + u32 val = 1; + int err; + + generate_key(&key); + err = bpf_map_update_elem(&trie_map, &key, &val, BPF_EXIST); + if (err) { + *retval = err; + return 1; + } + + return 0; +} + +static int delete(__u32 index, int *retval) +{ + struct trie_key key; + int err; + + generate_key(&key); + err = bpf_map_delete_elem(&trie_map, &key); + if (err) { + *retval = err; + return 1; + } + + /* Do we need to refill the map? */ + if (key.data == nr_entries - 1) { + /* + * Atomicity isn't required because DELETE only supports + * one producer running concurrently. What we need is a + * way to track how many entries have been deleted from + * the trie between consecutive invocations of the BPF + * prog because a single bpf_loop() call might not + * delete all entries, e.g. when NR_LOOPS < nr_entries. + */ + *retval = LPM_BENCH_REINIT_MAP; + return 1; + } + + return 0; +} + +SEC("xdp") +int BPF_PROG(run_bench) +{ + int err = LPM_BENCH_SUCCESS; + u64 start, delta; + int loops; + + start = bpf_ktime_get_ns(); + + switch (op) { + case LPM_OP_NOOP: + loops = bpf_loop(NR_LOOPS, noop, NULL, 0); + break; + case LPM_OP_BASELINE: + loops = bpf_loop(NR_LOOPS, baseline, NULL, 0); + break; + case LPM_OP_LOOKUP: + loops = bpf_loop(NR_LOOPS, lookup, &err, 0); + break; + case LPM_OP_INSERT: + loops = bpf_loop(NR_LOOPS, insert, &err, 0); + break; + case LPM_OP_UPDATE: + loops = bpf_loop(NR_LOOPS, update, &err, 0); + break; + case LPM_OP_DELETE: + loops = bpf_loop(NR_LOOPS, delete, &err, 0); + break; + default: + bpf_printk("invalid benchmark operation\n"); + return -1; + } + + delta = bpf_ktime_get_ns() - start; + + __sync_add_and_fetch(&duration_ns, delta); + __sync_add_and_fetch(&hits, loops); + + return err; +} diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_map.c b/tools/testing/selftests/bpf/progs/lpm_trie_map.c new file mode 100644 index 00000000000000..6e60d686b664fc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lpm_trie_map.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#define MAX_ENTRIES 100000000 + +struct trie_key { + __u32 prefixlen; + __u32 data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __type(key, struct trie_key); + __type(value, __u32); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, MAX_ENTRIES); +} trie_free_map SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/map_excl.c b/tools/testing/selftests/bpf/progs/map_excl.c new file mode 100644 index 00000000000000..d461684728e4e7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_excl.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Google LLC. 
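The LPM_BENCH_REINIT_MAP return value is consumed by the user-space driver, which refills the trie before rerunning the program. Roughly, that loop looks like this (a sketch around bpf_prog_test_run_opts(); the real logic lives in the bench_lpm_trie_map.c file referenced above, and fill_map() stands in for its refill helper):

	char pkt[64] = {};	/* dummy frame: XDP test runs need input data */
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = pkt,
		.data_size_in = sizeof(pkt),
	);

	for (;;) {
		if (bpf_prog_test_run_opts(prog_fd, &opts))
			break;				/* syscall-level failure */
		if ((int)opts.retval == LPM_BENCH_REINIT_MAP) {
			fill_map();			/* re-insert nr_entries keys */
			continue;
		}
		if ((int)opts.retval < 0)
			break;				/* kernel error code from the op */
	}
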
*/ +#include +#include +#include + +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} excl_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int should_not_have_access(void *ctx) +{ + int key = 0, value = 0xdeadbeef; + + bpf_map_update_elem(&excl_map, &key, &value, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 4f94c971ae8626..3b984b6ae7c0b9 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,8 +8,8 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7_w=ptr_nameidata(off={{[0-9]+}}) R8_w=rdonly_untrusted_mem(sz=0)") -__msg("r9 = *(u8 *)(r8 +0) ; R8_w=rdonly_untrusted_mem(sz=0) R9_w=scalar") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") int btf_id_to_ptr_mem(void *ctx) { struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/rbtree_search.c b/tools/testing/selftests/bpf/progs/rbtree_search.c index 098ef970fac160..b05565d1db0d47 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_search.c +++ b/tools/testing/selftests/bpf/progs/rbtree_search.c @@ -183,7 +183,7 @@ long test_##op##_spinlock_##dolock(void *ctx) \ } /* - * Use a spearate MSG macro instead of passing to TEST_XXX(..., MSG) + * Use a separate MSG macro instead of passing to TEST_XXX(..., MSG) * to ensure the message itself is not in the bpf prog lineinfo * which the verifier includes in its log. 
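The two programs in map_excl.c are deliberately identical; what differs is which one the loader binds to the map before load. The user-space half of the test presumably marks excl_map exclusive to should_have_access, so the other program's use of the map is rejected. A sketch of that flow, with the libbpf setter name taken from this series and to be treated as an assumption rather than settled API:

	struct map_excl *skel = map_excl__open();
	int err;

	if (!skel)
		return;
	/* assumed API: bind excl_map to a single program before load */
	err = bpf_map__make_exclusive(skel->maps.excl_map,
				      skel->progs.should_have_access);
	if (!err)
		err = map_excl__load(skel);
	/* loading should_not_have_access is then expected to fail */
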
* Otherwise, the test_loader will incorrectly match the prog lineinfo diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/stacktrace_map.c similarity index 98% rename from tools/testing/selftests/bpf/progs/test_stacktrace_map.c rename to tools/testing/selftests/bpf/progs/stacktrace_map.c index 47568007b6683b..0c77df05be7fce 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_map.c @@ -50,6 +50,7 @@ struct sched_switch_args { int next_prio; }; +__u32 stack_id; SEC("tracepoint/sched/sched_switch") int oncpu(struct sched_switch_args *ctx) { @@ -64,6 +65,7 @@ int oncpu(struct sched_switch_args *ctx) /* The size of stackmap and stackid_hmap should be the same */ key = bpf_get_stackid(ctx, &stackmap, 0); if ((int)key >= 0) { + stack_id = key; bpf_map_update_elem(&stackid_hmap, &key, &val, 0); stack_p = bpf_map_lookup_elem(&stack_amap, &key); if (stack_p) diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 35790897dc879e..4a5bd852f10c89 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,6 +5,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" +#include "bpf_arena_common.h" struct arr_elem { struct bpf_res_spin_lock lock; @@ -17,10 +18,29 @@ struct { __type(value, struct arr_elem); } arrmap SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); /* number of pages */ +} arena SEC(".maps"); + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + #define ENOSPC 28 #define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" int size; +u64 fault_addr; +void *arena_ptr; SEC("syscall") __success __retval(0) @@ -37,7 +57,15 @@ int stream_exhaust(void *ctx) } SEC("syscall") +__arch_x86_64 +__arch_arm64 +__arch_s390x __success __retval(0) +__stderr("ERROR: Timeout detected for may_goto instruction") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_cond_break(void *ctx) { while (can_loop) @@ -47,6 +75,15 @@ int stream_cond_break(void *ctx) SEC("syscall") __success __retval(0) +__stderr("ERROR: AA or ABBA deadlock detected for bpf_res_spin_lock") +__stderr("{{Attempted lock = (0x[0-9a-fA-F]+)\n" +"Total held locks = 1\n" +"Held lock\\[ 0\\] = \\1}}") +__stderr("...") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") int stream_deadlock(void *ctx) { struct bpf_res_spin_lock *lock, *nlock; @@ -76,4 +113,125 @@ int stream_syscall(void *ctx) return 0; } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_write_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition 
is smaller than struct + * bpf_arena. barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r2 = 1;" + "*(u32 *)(r1 + 0x7fff) = r2;" + : + : "r" (user_vm_start) + : "r1", "r2" + ); + return 0; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena READ access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_read_fault(void *ctx) +{ + struct bpf_arena *ptr = (void *)&arena; + u64 user_vm_start; + + /* Prevent GCC bounds warning: casting &arena to struct bpf_arena * + * triggers bounds checking since the map definition is smaller than struct + * bpf_arena. barrier_var() makes the pointer opaque to GCC, preventing the + * bounds analysis + */ + barrier_var(ptr); + user_vm_start = ptr->user_vm_start; + fault_addr = user_vm_start + 0x7fff; + bpf_addr_space_cast(user_vm_start, 0, 1); + asm volatile ( + "r1 = %0;" + "r1 = *(u32 *)(r1 + 0x7fff);" + : + : "r" (user_vm_start) + : "r1" + ); + return 0; +} + +static __noinline void subprog(void) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_subprog_fault(void *ctx) +{ + subprog(); + return 0; +} + +static __noinline int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + int __arena *addr = (int __arena *)0xdeadbeef; + + arena_ptr = &arena; + *addr = 1; + return 0; +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("ERROR: Arena WRITE access at unmapped address 0x{{.*}}") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_arena_callback_fault(void *ctx) +{ + struct bpf_timer *arr_timer; + + arr_timer = bpf_map_lookup_elem(&array, &(int){0}); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, 1); + bpf_timer_set_callback(arr_timer, timer_cb); + bpf_timer_start(arr_timer, 0, 0); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 53af438bd998e8..99d72c68f76af8 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -31,6 +31,8 @@ char *invalid_kern_ptr = (char *)-1; /* Passing NULL to string kfuncs (treated as a userspace ptr) */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return bpf_strcmp(NULL, "hello"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return 
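All of these __stderr() expectations are matched against the program's BPF stderr stream. User space can dump the same stream after a run with the bpf_prog_stream_read() libbpf API from the streams series (a sketch; the stream id constant is assumed to be BPF_STDERR from the UAPI enum):

	char buf[4096];
	int n;

	n = bpf_prog_stream_read(bpf_program__fd(prog), BPF_STDERR,
				 buf, sizeof(buf), NULL);
	if (n > 0)
		fwrite(buf, 1, n, stderr);
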
bpf_strcasecmp("HELLO", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -49,6 +51,8 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return /* Passing userspace ptr to string kfuncs */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -69,6 +73,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { re /* Passing invalid kernel ptr to string kfuncs should always return -EFAULT */ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return bpf_strcmp(invalid_kern_ptr, "hello"); } SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } +SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 89fb4669b0e943..e41cc560199430 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -7,6 +7,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } +SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f3818789a..2e3498e37b9ce1 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -12,6 +12,11 @@ char str[] = 
"hello world"; /* Functional tests */ __test(0) int test_strcmp_eq(void *ctx) { return bpf_strcmp(str, "hello world"); } __test(1) int test_strcmp_neq(void *ctx) { return bpf_strcmp(str, "hello"); } +__test(0) int test_strcasecmp_eq1(void *ctx) { return bpf_strcasecmp(str, "hello world"); } +__test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO WORLD"); } +__test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } +__test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } +__test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } @@ -30,8 +35,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } -__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); } -__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } +__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } +__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } +__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } +__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c new file mode 100644 index 00000000000000..ad8bb546c9bfff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping1.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP1_MAGIC 1234 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP1_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP1_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git 
a/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c new file mode 100644 index 00000000000000..cea1a2f4b62f6e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_id_ops_mapping2.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define bpf_kfunc_multi_st_ops_test_1(args) bpf_kfunc_multi_st_ops_test_1(args, st_ops_id) +int st_ops_id; + +int test_pid; +int test_err; + +#define MAP2_MAGIC 4567 + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + return MAP2_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1(&args); + if (ret != MAP2_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c index 36386b3c23a1f6..2b98b7710816dc 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return.c @@ -9,7 +9,7 @@ void bpf_task_release(struct task_struct *p) __ksym; /* This test struct_ops BPF programs returning referenced kptr. The verifier should * allow a referenced kptr or a NULL pointer to be returned. A referenced kptr to task - * here is acquried automatically as the task argument is tagged with "__ref". + * here is acquired automatically as the task argument is tagged with "__ref". */ SEC("struct_ops/test_return_ref_kptr") struct task_struct *BPF_PROG(kptr_return, int dummy, diff --git a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c index 76dcb6089d7f8e..9c0a65466356c9 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_refcounted.c @@ -9,7 +9,7 @@ __attribute__((nomerge)) extern void bpf_task_release(struct task_struct *p) __k /* This is a test BPF program that uses struct_ops to access a referenced * kptr argument. This is a test for the verifier to ensure that it - * 1) recongnizes the task as a referenced object (i.e., ref_obj_id > 0), and + * 1) recognizes the task as a referenced object (i.e., ref_obj_id > 0), and * 2) the same reference can be acquired from multiple paths as long as it * has not been released. 
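mapping1.c and mapping2.c above are identical except for their magic value; the #define at the top of each file is what keys every call to that skeleton's own struct_ops instance. Spelled out, in either file the call

	bpf_kfunc_multi_st_ops_test_1(&args);

expands to

	bpf_kfunc_multi_st_ops_test_1(&args, st_ops_id);

so the testmod kfunc dispatches to the struct_ops map whose id user space stored in that skeleton's st_ops_id global, and each program sees only its own test_1 implementation.
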
*/ diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c index 327ca395e8601a..d556b19413d7b7 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -2,6 +2,7 @@ #include #include #include "bpf_legacy.h" +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int entry(struct __sk_buff *skb) { int ret = 1; + clobber_regs_stack(); + count++; subprog_tail(skb); subprog_tail(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index 72fd0d577506a2..ae94c9c70ab7d5 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -2,6 +2,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); int classifier_1(struct __sk_buff *skb); @@ -60,6 +61,8 @@ int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + subprog_tail0(skb); subprog_tail1(skb); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index a7fb91cb05b736..56b6b009984072 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -2,6 +2,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_test_utils.h" int classifier_0(struct __sk_buff *skb); @@ -53,6 +54,8 @@ int tailcall_bpf2bpf_hierarchy_3(struct __sk_buff *skb) { int ret = 0; + clobber_regs_stack(); + bpf_tail_call_static(skb, &jmp_table0, 0); __sink(ret); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c index c87f9ca982d3ee..5261395713cd58 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -4,6 +4,7 @@ #include "vmlinux.h" #include #include +#include "bpf_test_utils.h" struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); @@ -24,6 +25,8 @@ int subprog_tail(void *ctx) SEC("fentry/dummy") int BPF_PROG(fentry, struct sk_buff *skb) { + clobber_regs_stack(); + count++; subprog_tail(ctx); subprog_tail(ctx); diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h new file mode 100644 index 00000000000000..432fff2af8441f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TASK_LOCAL_DATA_BPF_H +#define __TASK_LOCAL_DATA_BPF_H + +/* + * Task local data is a library that facilitates sharing per-task data + * between user space and bpf programs. + * + * + * USAGE + * + * A TLD, an entry of data in task local data, first needs to be created by the + * user space. This is done by calling user space API, TLD_DEFINE_KEY() or + * tld_create_key(), with the name of the TLD and the size. + * + * TLD_DEFINE_KEY(prio, "priority", sizeof(int)); + * + * or + * + * void func_call(...) { + * tld_key_t prio, in_cs; + * + * prio = tld_create_key("priority", sizeof(int)); + * in_cs = tld_create_key("in_critical_section", sizeof(bool)); + * ... 
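Continuing the user-space side sketched in the fragments above: once a key exists, user space obtains a pointer into the task's data page through its own tld_get_data(). The exact user-space signatures live in the non-BPF task_local_data.h counterpart and are assumed here:

	TLD_DEFINE_KEY(prio, "priority", sizeof(int));

	static void bump_priority(void)
	{
		int *p = tld_get_data(prio);	/* assumed: returns void *, NULL on failure */

		if (p)
			*p += 1;
	}
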
+ *
+ * A key associated with the TLD, which has an opaque type tld_key_t, will be
+ * initialized or returned. It can be used to get a pointer to the TLD in the
+ * user space by calling tld_get_data().
+ *
+ * In a bpf program, tld_object_init() first needs to be called to initialize a
+ * tld_object on the stack. Then, TLDs can be accessed by calling tld_get_data().
+ * The API will try to fetch the key by the name and use it to locate the data.
+ * A pointer to the TLD will be returned. It also caches the key in a task local
+ * storage map, tld_key_map, whose value type, struct tld_keys, must be defined
+ * by the developer.
+ *
+ *     struct tld_keys {
+ *         tld_key_t prio;
+ *         tld_key_t in_cs;
+ *     };
+ *
+ *     SEC("struct_ops")
+ *     void prog(struct task_struct *task, ...)
+ *     {
+ *         struct tld_object tld_obj;
+ *         int err, *p;
+ *
+ *         err = tld_object_init(task, &tld_obj);
+ *         if (err)
+ *             return;
+ *
+ *         p = tld_get_data(&tld_obj, prio, "priority", sizeof(int));
+ *         if (p)
+ *             // do something depending on *p
+ */
+#include
+#include
+
+#define TLD_ROUND_MASK(x, y) ((__typeof__(x))((y) - 1))
+#define TLD_ROUND_UP(x, y) ((((x) - 1) | TLD_ROUND_MASK(x, y)) + 1)
+
+#define TLD_MAX_DATA_CNT (__PAGE_SIZE / sizeof(struct tld_metadata) - 1)
+
+#ifndef TLD_NAME_LEN
+#define TLD_NAME_LEN 62
+#endif
+
+#ifndef TLD_KEY_MAP_CREATE_RETRY
+#define TLD_KEY_MAP_CREATE_RETRY 10
+#endif
+
+typedef struct {
+	__s16 off;
+} tld_key_t;
+
+struct tld_metadata {
+	char name[TLD_NAME_LEN];
+	__u16 size;
+};
+
+struct tld_meta_u {
+	__u8 cnt;
+	__u16 size;
+	struct tld_metadata metadata[TLD_MAX_DATA_CNT];
+};
+
+struct tld_data_u {
+	__u64 start; /* offset of tld_data_u->data in a page */
+	char data[__PAGE_SIZE - sizeof(__u64)];
+};
+
+struct tld_map_value {
+	struct tld_data_u __uptr *data;
+	struct tld_meta_u __uptr *meta;
+};
+
+typedef struct tld_uptr_dummy {
+	struct tld_data_u data[0];
+	struct tld_meta_u meta[0];
+} *tld_uptr_dummy_t;
+
+struct tld_object {
+	struct tld_map_value *data_map;
+	struct tld_keys *key_map;
+	/*
+	 * Force the compiler to generate the actual definition of tld_meta_u
+	 * and tld_data_u in BTF. Without it, tld_meta_u and tld_data_u will
+	 * be BTF_KIND_FWD.
+	 */
+	tld_uptr_dummy_t dummy[0];
+};
+
+/*
+ * Map value of tld_key_map for caching keys. Must be defined by the developer.
+ * Members should be tld_key_t and passed as the @key argument of tld_get_data().
+ */
+struct tld_keys;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tld_map_value);
+} tld_data_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tld_keys);
+} tld_key_map SEC(".maps");
+
+/**
+ * tld_object_init() - Initialize a tld_object.
+ *
+ * @task: The task_struct of the target task
+ * @tld_obj: A pointer to a tld_object to be initialized
+ *
+ * Return 0 on success; -ENODATA if the user space did not initialize task local data
+ * for the current task through tld_get_data(); -ENOMEM if the creation of tld_key_map
+ * fails
+ */
+__attribute__((unused))
+static int tld_object_init(struct task_struct *task, struct tld_object *tld_obj)
+{
+	int i;
+
+	tld_obj->data_map = bpf_task_storage_get(&tld_data_map, task, 0, 0);
+	if (!tld_obj->data_map)
+		return -ENODATA;
+
+	bpf_for(i, 0, TLD_KEY_MAP_CREATE_RETRY) {
+		tld_obj->key_map = bpf_task_storage_get(&tld_key_map, task, 0,
+							BPF_LOCAL_STORAGE_GET_F_CREATE);
+		if (likely(tld_obj->key_map))
+			break;
+	}
+	if (!tld_obj->key_map)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Return the offset of the TLD if @name is found. Otherwise, return the current TLD
+ * count using the nonpositive range so that the next tld_get_data() can skip fetching
+ * the key if no new TLD has been added, or start comparing names from the first newly
+ * added TLD.
+ */
+__attribute__((unused))
+static int __tld_fetch_key(struct tld_object *tld_obj, const char *name, int i_start)
+{
+	struct tld_metadata *metadata;
+	int i, cnt, start, off = 0;
+
+	if (!tld_obj->data_map || !tld_obj->data_map->data || !tld_obj->data_map->meta)
+		return 0;
+
+	start = tld_obj->data_map->data->start;
+	cnt = tld_obj->data_map->meta->cnt;
+	metadata = tld_obj->data_map->meta->metadata;
+
+	bpf_for(i, 0, cnt) {
+		if (i >= TLD_MAX_DATA_CNT)
+			break;
+
+		if (i >= i_start && !bpf_strncmp(metadata[i].name, TLD_NAME_LEN, name))
+			return start + off;
+
+		off += TLD_ROUND_UP(metadata[i].size, 8);
+	}
+
+	return -cnt;
+}
+
+/**
+ * tld_get_data() - Retrieve a pointer to the TLD associated with the name.
+ *
+ * @tld_obj: A pointer to a valid tld_object initialized by tld_object_init()
+ * @key: The cached key of the TLD in tld_key_map
+ * @name: The name of the key associated with a TLD
+ * @size: The size of the TLD. Must be a known constant value
+ *
+ * Return a pointer to the TLD associated with @name; NULL if not found or @size is too
+ * big. @key is used to cache the key if the TLD is found to speed up subsequent calls.
+ * It should be defined as a member of struct tld_keys, of type tld_key_t, by the
+ * developer.
+ */
+#define tld_get_data(tld_obj, key, name, size)						\
+	({										\
+		void *data = NULL, *_data = (tld_obj)->data_map->data;			\
+		long off = (tld_obj)->key_map->key.off;					\
+		int cnt;								\
+											\
+		if (likely(_data)) {							\
+			if (likely(off > 0)) {						\
+				barrier_var(off);					\
+				if (likely(off < __PAGE_SIZE - size))			\
+					data = _data + off;				\
+			} else {							\
+				cnt = -(off);						\
+				if (likely((tld_obj)->data_map->meta) &&		\
+				    cnt < (tld_obj)->data_map->meta->cnt) {		\
+					off = __tld_fetch_key(tld_obj, name, cnt);	\
+					(tld_obj)->key_map->key.off = off;		\
+											\
+					if (likely(off < __PAGE_SIZE - size)) {		\
+						barrier_var(off);			\
+						if (off > 0)				\
+							data = _data + off;		\
+					}						\
+				}							\
+			}								\
+		}									\
+		data;									\
+	})
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c
new file mode 100644
index 00000000000000..23217f06a3ece6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_work.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" +#include "errno.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} lrumap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +int oncpu_hash_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = bpf_get_current_task_btf(); + err = bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_array_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +int oncpu_lru_map(struct pt_regs *args) +{ + struct elem empty_work = { .data = { 0 } }; + struct elem *work; + struct task_struct *task; + int err; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&lrumap, &key); + if (work) + return 0; + err = bpf_map_update_elem(&lrumap, &key, &empty_work, BPF_NOEXIST); + if (err) + return 0; + work = bpf_map_lookup_elem(&lrumap, &key); + if (!work || work->data[0]) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c new file mode 100644 index 00000000000000..77fe8f28facdb6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
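The perf_event programs above only fire once user space opens a software event on the target task and attaches them to it; standard libbpf wiring along these lines (a sketch, the event choice is illustrative):

	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.freq = 1,
		.sample_freq = 1000,
	};
	struct bpf_link *link;
	int pefd;

	pefd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		       -1 /* any cpu */, -1, PERF_FLAG_FD_CLOEXEC);
	if (pefd < 0)
		return;
	link = bpf_program__attach_perf_event(skel->progs.oncpu_hash_map, pefd);
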
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +const void *user_ptr = NULL; + +struct elem { + char data[128]; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + struct elem *work = value; + + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); + return 0; +} + +int key = 0; + +SEC("perf_event") +__failure __msg("doesn't match map pointer in R3") +int mismatch_map(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("arg#1 doesn't point to a map value") +int no_map_task_work(struct pt_regs *args) +{ + struct task_struct *task; + struct bpf_task_work tw; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int task_work_null(struct pt_regs *args) +{ + struct task_struct *task; + + task = bpf_get_current_task_btf(); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + return 0; +} + +SEC("perf_event") +__failure __msg("Possibly NULL pointer passed to trusted arg2") +int map_null(struct pt_regs *args) +{ + struct elem *work; + struct task_struct *task; + + task = bpf_get_current_task_btf(); + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) + return 0; + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c new file mode 100644 index 00000000000000..90fca06fff56ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +#define ENTRIES 128 + +char _license[] SEC("license") = "GPL"; + +__u64 callback_scheduled = 0; +__u64 callback_success = 0; +__u64 schedule_error = 0; +__u64 delete_success = 0; + +struct elem { + __u32 count; + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, ENTRIES); + __type(key, int); + __type(value, struct elem); +} hmap SEC(".maps"); + +static int process_work(struct bpf_map *map, void *key, void *value) +{ + __sync_fetch_and_add(&callback_success, 1); + return 0; +} + +SEC("syscall") +int schedule_task_work(void *ctx) +{ + struct elem empty_work = {.count = 0}; + struct elem *work; + int key = 0, err; + + key = bpf_ktime_get_ns() % ENTRIES; + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) { + bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); + work = bpf_map_lookup_elem(&hmap, &key); + if (!work) + return 0; + } + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); + if (err) + __sync_fetch_and_add(&schedule_error, 1); + else + __sync_fetch_and_add(&callback_scheduled, 1); + return 0; +} + +SEC("syscall") +int delete_task_work(void *ctx) +{ + int key = 0, err; + + key = bpf_get_prandom_u32() % ENTRIES; + err = bpf_map_delete_elem(&hmap, &key); + if (!err) + __sync_fetch_and_add(&delete_success, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index f344c6835e84e7..26a53e54b8fa21 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -22,6 +22,7 @@ #include "bpf_compiler.h" #include "test_cls_redirect.h" +#include "bpf_misc.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -31,9 +32,6 @@ #define INLINING __always_inline #endif -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) - #define IP_OFFSET_MASK (0x1FFF) #define IP_MF (0x2000) @@ -129,7 +127,7 @@ typedef uint8_t *net_ptr __attribute__((align_value(8))); typedef struct buf { struct __sk_buff *skb; net_ptr head; - /* NB: tail musn't have alignment other than 1, otherwise + /* NB: tail mustn't have alignment other than 1, otherwise * LLVM will go and eliminate code, e.g. when checking packet lengths. */ uint8_t *const tail; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c index d0f7670351e587..dfd4a2710391d9 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c @@ -494,7 +494,7 @@ static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_header *offset += sizeof(*next_hop); - /* Skip the remainig next hops (may be zero). */ + /* Skip the remaining next hops (may be zero). 
*/ return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1); } diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index abb7344b531f45..5edf3cdc213d04 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ -#include -#include -#include -#include +#include "vmlinux.h" #include #include diff --git a/tools/testing/selftests/bpf/progs/test_pinning_devmap.c b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c new file mode 100644 index 00000000000000..c855f8f87effcb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_pinning_devmap.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u32); + __uint(pinning, LIBBPF_PIN_BY_NAME); +} pinmap2 SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/test_task_local_data.c b/tools/testing/selftests/bpf/progs/test_task_local_data.c new file mode 100644 index 00000000000000..fffafc0130449e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_task_local_data.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "task_local_data.bpf.h" + +struct tld_keys { + tld_key_t value0; + tld_key_t value1; + tld_key_t value2; + tld_key_t value_not_exist; +}; + +struct test_tld_struct { + __u64 a; + __u64 b; + __u64 c; + __u64 d; +}; + +int test_value0; +int test_value1; +struct test_tld_struct test_value2; + +SEC("syscall") +int task_main(void *ctx) +{ + struct tld_object tld_obj; + struct test_tld_struct *struct_p; + struct task_struct *task; + int err, *int_p; + + task = bpf_get_current_task_btf(); + err = tld_object_init(task, &tld_obj); + if (err) + return 1; + + int_p = tld_get_data(&tld_obj, value0, "value0", sizeof(int)); + if (int_p) + test_value0 = *int_p; + else + return 2; + + int_p = tld_get_data(&tld_obj, value1, "value1", sizeof(int)); + if (int_p) + test_value1 = *int_p; + else + return 3; + + struct_p = tld_get_data(&tld_obj, value2, "value2", sizeof(struct test_tld_struct)); + if (struct_p) + test_value2 = *struct_p; + else + return 4; + + int_p = tld_get_data(&tld_obj, value_not_exist, "value_not_exist", sizeof(int)); + if (int_p) + return 5; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c index 5f4e87ee949ad1..1ecdf4c54de41f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c @@ -14,10 +14,7 @@ #include #define BPF_PROG_TEST_TCP_HDR_OPTIONS #include "test_tcp_hdr_options.h" - -#ifndef sizeof_field -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#endif +#include "bpf_misc.h" __u8 test_kind = TCPOPT_EXP; __u16 test_magic = 0xeB9F; diff --git a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c index 540181c115a85a..ef00d38b0a8d24 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c +++ 
b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c @@ -23,7 +23,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(max_entries, 2); __type(key, int); __type(value, __u32); } perf_event_map SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/test_uprobe.c b/tools/testing/selftests/bpf/progs/test_uprobe.c index 896c88a4960df2..12f4065fca2016 100644 --- a/tools/testing/selftests/bpf/progs/test_uprobe.c +++ b/tools/testing/selftests/bpf/progs/test_uprobe.c @@ -59,3 +59,41 @@ int BPF_UPROBE(test4) test4_result = 1; return 0; } + +#if defined(__TARGET_ARCH_x86) +struct pt_regs regs; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ax = regs.ax; + ctx->cx = regs.cx; + ctx->dx = regs.dx; + ctx->r8 = regs.r8; + ctx->r9 = regs.r9; + ctx->r10 = regs.r10; + ctx->r11 = regs.r11; + ctx->di = regs.di; + ctx->si = regs.si; + return 0; +} + +unsigned long ip; + +SEC("uprobe") +int BPF_UPROBE(test_regs_change_ip) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + ctx->ip = ip; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c index 096488f47fbc82..a78c87537b0784 100644 --- a/tools/testing/selftests/bpf/progs/test_usdt.c +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -107,4 +107,35 @@ int BPF_USDT(usdt12, int a1, int a2, long a3, long a4, unsigned a5, return 0; } +int usdt_sib_called; +u64 usdt_sib_cookie; +int usdt_sib_arg_cnt; +int usdt_sib_arg_ret; +short usdt_sib_arg; +int usdt_sib_arg_size; + +/* + * usdt_sib is only tested on x86-related architectures, so it requires + * manual attach since auto-attach will panic tests under other architectures + */ +SEC("usdt") +int usdt_sib(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt_sib_called, 1); + + usdt_sib_cookie = bpf_usdt_cookie(ctx); + usdt_sib_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt_sib_arg_ret = bpf_usdt_arg(ctx, 0, &tmp); + usdt_sib_arg = (short)tmp; + usdt_sib_arg_size = bpf_usdt_arg_size(ctx, 0); + + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c new file mode 100644 index 00000000000000..814e2a980e97e9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_tailcall.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +SEC("xdp") +int xdp_devmap(struct xdp_md *ctx) +{ + return ctx->egress_ifindex; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, int (void *)); +} xdp_map SEC(".maps") = { + .values = { + [0] = (void *)&xdp_devmap, + }, +}; + +SEC("xdp") +int xdp_entry(struct xdp_md *ctx) +{ + bpf_tail_call(ctx, &xdp_map, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fcf6ca14f2ea28..d79cb74b571e72 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -1,8 +1,11 @@ +#include #include +#include #include #include #include +#include "bpf_kfuncs.h" #define META_SIZE 32 @@ -23,6 +26,8 @@ struct { __uint(value_size, META_SIZE); } test_result SEC(".maps"); +bool 
test_pass; + SEC("tc") int ing_cls(struct __sk_buff *ctx) { @@ -40,6 +45,231 @@ int ing_cls(struct __sk_buff *ctx) return TC_ACT_SHOT; } +/* Read from metadata using bpf_dynptr_read helper */ +SEC("tc") +int ing_cls_dynptr_read(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0); + + return TC_ACT_SHOT; +} + +/* Write to metadata using bpf_dynptr_write helper */ +SEC("tc") +int ing_cls_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, META_SIZE, 0); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Read from metadata using read-only dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_SHOT; +} + +/* Write to metadata using writeable dynptr slice */ +SEC("tc") +int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + __u8 *src, *dst; + + bpf_dynptr_from_skb(ctx, 0, &data); + src = bpf_dynptr_slice(&data, sizeof(struct ethhdr), NULL, META_SIZE); + if (!src) + return TC_ACT_SHOT; + + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + dst = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE); + if (!dst) + return TC_ACT_SHOT; + + __builtin_memcpy(dst, src, META_SIZE); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Read skb metadata in chunks from various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + const __u32 chunk_len = META_SIZE / 4; + const __u32 zero = 0; + __u8 *dst, *src; + + dst = bpf_map_lookup_elem(&test_result, &zero); + if (!dst) + return TC_ACT_SHOT; + + /* 1. Regular read */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 2. Read from an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_read(dst, chunk_len, &meta, 0, 0); + dst += chunk_len; + + /* 3. Read at an offset */ + bpf_dynptr_read(dst, chunk_len, &meta, chunk_len, 0); + dst += chunk_len; + + /* 4. Read from a slice starting at an offset */ + src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len); + if (!src) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_SHOT; +} + +/* Write skb metadata in chunks at various offsets in different ways. */ +SEC("tc") +int ing_cls_dynptr_offset_wr(struct __sk_buff *ctx) +{ + const __u32 chunk_len = META_SIZE / 4; + __u8 payload[META_SIZE]; + struct bpf_dynptr meta; + __u8 *dst, *src; + + bpf_skb_load_bytes(ctx, sizeof(struct ethhdr), payload, sizeof(payload)); + src = payload; + + /* 1. Regular write */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 2. 
Write to an offset-adjusted dynptr */ + bpf_dynptr_adjust(&meta, chunk_len, bpf_dynptr_size(&meta)); + bpf_dynptr_write(&meta, 0, src, chunk_len, 0); + src += chunk_len; + + /* 3. Write at an offset */ + bpf_dynptr_write(&meta, chunk_len, src, chunk_len, 0); + src += chunk_len; + + /* 4. Write to a slice starting at an offset */ + dst = bpf_dynptr_slice_rdwr(&meta, 2 * chunk_len, NULL, chunk_len); + if (!dst) + return TC_ACT_SHOT; + __builtin_memcpy(dst, src, chunk_len); + + return TC_ACT_UNSPEC; /* pass */ +} + +/* Pass an OOB offset to dynptr read, write, adjust, slice. */ +SEC("tc") +int ing_cls_dynptr_offset_oob(struct __sk_buff *ctx) +{ + struct bpf_dynptr meta; + __u8 md, *p; + int err; + + err = bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (err) + goto fail; + + /* read offset OOB */ + err = bpf_dynptr_read(&md, sizeof(md), &meta, META_SIZE, 0); + if (err != -E2BIG) + goto fail; + + /* write offset OOB */ + err = bpf_dynptr_write(&meta, META_SIZE, &md, sizeof(md), 0); + if (err != -E2BIG) + goto fail; + + /* adjust end offset OOB */ + err = bpf_dynptr_adjust(&meta, 0, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* adjust start offset OOB */ + err = bpf_dynptr_adjust(&meta, META_SIZE + 1, META_SIZE + 1); + if (err != -ERANGE) + goto fail; + + /* slice offset OOB */ + p = bpf_dynptr_slice(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + /* slice rdwr offset OOB */ + p = bpf_dynptr_slice_rdwr(&meta, META_SIZE, NULL, sizeof(*p)); + if (p) + goto fail; + + return TC_ACT_UNSPEC; +fail: + return TC_ACT_SHOT; +} + +/* Reserve and clear space for metadata but don't populate it */ +SEC("xdp") +int ing_xdp_zalloc_meta(struct xdp_md *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *meta; + int ret; + + /* Drop any non-test packets */ + if (eth + 1 > ctx_ptr(ctx, data_end)) + return XDP_DROP; + if (eth->h_proto != 0) + return XDP_DROP; + + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); + if (ret < 0) + return XDP_DROP; + + meta = ctx_ptr(ctx, data_meta); + if (meta + META_SIZE > ctx_ptr(ctx, data)) + return XDP_DROP; + + __builtin_memset(meta, 0, META_SIZE); + + return XDP_PASS; +} + SEC("xdp") int ing_xdp(struct xdp_md *ctx) { @@ -73,4 +303,193 @@ int ing_xdp(struct xdp_md *ctx) return XDP_PASS; } +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _payload_ using packet pointers. Applies only to cloned skbs. + */ +SEC("tc") +int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + if (ctx->data_meta != ctx->data) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb->data_meta..skb->data is empty if prog writes to packet + * _metadata_ using packet pointers. Applies only to cloned skbs. 
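+ * A clone shares its data buffer with the original skb, so making it
+ * writable forces an unclone in the program prologue, which leaves the
+ * metadata area empty.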
+ */ +SEC("tc") +int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx) +{ + struct ethhdr *eth = ctx_ptr(ctx, data); + __u8 *md = ctx_ptr(ctx, data_meta); + + if (eth + 1 > ctx_ptr(ctx, data_end)) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + if (md + 1 > ctx_ptr(ctx, data)) { + /* Expect no metadata */ + test_pass = true; + } else { + /* Metadata write to trigger unclone in prologue */ + *md = 42; + } +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _payload_ using a dynptr slice. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice_rdwr(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Packet write to trigger unclone in prologue */ + eth->h_proto = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is writable but empty if prog writes to packet + * _metadata_ using a dynptr slice. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + __u8 *md; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect no metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) + goto out; + + /* Metadata write to trigger unclone in prologue */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); + if (md) + *md = 42; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only before prog writes to packet payload + * using dynptr_write helper. Applies only to cloned skbs. + */ +SEC("tc") +int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata before unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Helper write to payload will unclone the packet */ + bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); + + /* Expect no metadata after unclone */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + +/* + * Check that skb_meta dynptr is read-only if prog writes to packet + * metadata using dynptr_write helper. Applies only to cloned skbs. 
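+ * Unlike the payload write in the previous test, the metadata write is
+ * expected to fail with -EINVAL instead of triggering an unclone.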
+ */ +SEC("tc") +int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx) +{ + struct bpf_dynptr data, meta; + const struct ethhdr *eth; + + bpf_dynptr_from_skb(ctx, 0, &data); + eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); + if (!eth) + goto out; + /* Ignore non-test packets */ + if (eth->h_proto != 0) + goto out; + + /* Expect read-only metadata */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) + goto out; + + /* Metadata write. Expect failure. */ + bpf_dynptr_from_skb_meta(ctx, 0, &meta); + if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL) + goto out; + + test_pass = true; +out: + return TC_ACT_SHOT; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c new file mode 100644 index 00000000000000..c41a21413eaa21 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_pull_data.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include + +int xdpf_sz; +int sinfo_sz; +int data_len; +int pull_len; + +#define XDP_PACKET_HEADROOM 256 + +SEC("xdp.frags") +int xdp_find_sizes(struct xdp_md *ctx) +{ + xdpf_sz = sizeof(struct xdp_frame); + sinfo_sz = __PAGE_SIZE - XDP_PACKET_HEADROOM - + (ctx->data_end - ctx->data); + + return XDP_PASS; +} + +SEC("xdp.frags") +int xdp_pull_data_prog(struct xdp_md *ctx) +{ + __u8 *data_end = (void *)(long)ctx->data_end; + __u8 *data = (void *)(long)ctx->data; + __u8 *val_p; + int err; + + if (data_len != data_end - data) + return XDP_DROP; + + err = bpf_xdp_pull_data(ctx, pull_len); + if (err) + return XDP_DROP; + + val_p = (void *)(long)ctx->data + 1024; + if (val_p + 1 > (void *)(long)ctx->data_end) + return XDP_DROP; + + if (*val_p != 0xbb) + return XDP_DROP; + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer_interrupt.c b/tools/testing/selftests/bpf/progs/timer_interrupt.c new file mode 100644 index 00000000000000..19180a455f4095 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_interrupt.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +#define CLOCK_MONOTONIC 1 + +int preempt_count; +int in_interrupt; +int in_interrupt_cb; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array SEC(".maps"); + +static int timer_in_interrupt(void *map, int *key, struct bpf_timer *timer) +{ + preempt_count = get_preempt_count(); + in_interrupt_cb = bpf_in_interrupt(); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test_timer_interrupt) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&array, &key); + if (!timer) + return 0; + + in_interrupt = bpf_in_interrupt(); + bpf_timer_init(timer, &array, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, timer_in_interrupt); + bpf_timer_start(timer, 0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c index c435a3a8328ab1..d460732e202395 100644 --- a/tools/testing/selftests/bpf/progs/tracing_struct.c +++ b/tools/testing/selftests/bpf/progs/tracing_struct.c @@ -18,6 +18,18 @@ struct bpf_testmod_struct_arg_3 { int b[]; }; +union bpf_testmod_union_arg_1 { + char 
a; + short b; + struct bpf_testmod_struct_arg_1 arg; +}; + +union bpf_testmod_union_arg_2 { + int a; + long b; + struct bpf_testmod_struct_arg_2 arg; +}; + long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs; __u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3; long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret; @@ -26,6 +38,9 @@ long t4_a_a, t4_b, t4_c, t4_d, t4_e_a, t4_e_b, t4_ret; long t5_ret; int t6; +long ut1_a_a, ut1_b, ut1_c; +long ut2_a, ut2_b_a, ut2_b_b; + SEC("fentry/bpf_testmod_test_struct_arg_1") int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c) { @@ -130,4 +145,22 @@ int BPF_PROG2(test_struct_arg_11, struct bpf_testmod_struct_arg_3 *, a) return 0; } +SEC("fexit/bpf_testmod_test_union_arg_1") +int BPF_PROG2(test_union_arg_1, union bpf_testmod_union_arg_1, a, int, b, int, c) +{ + ut1_a_a = a.arg.a; + ut1_b = b; + ut1_c = c; + return 0; +} + +SEC("fexit/bpf_testmod_test_union_arg_2") +int BPF_PROG2(test_union_arg_2, int, a, union bpf_testmod_union_arg_2, b) +{ + ut2_a = a; + ut2_b_a = b.arg.a; + ut2_b_b = b.arg.b; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 044a6d78923edf..3d5f30c29ae339 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -97,6 +97,12 @@ int bench_trigger_kprobe_multi(void *ctx) return 0; } +SEC("?kprobe.multi/bpf_get_numa_node_id") +int bench_kprobe_multi_empty(void *ctx) +{ + return 0; +} + SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { @@ -104,6 +110,12 @@ int bench_trigger_kretprobe_multi(void *ctx) return 0; } +SEC("?kretprobe.multi/bpf_get_numa_node_id") +int bench_kretprobe_multi_empty(void *ctx) +{ + return 0; +} + SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef5900..e08c31669e5a76 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2ec..915d38591bf6a4 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -8,10 +10,64 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; + +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) 
+{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } diff --git a/tools/testing/selftests/bpf/progs/uretprobe_stack.c b/tools/testing/selftests/bpf/progs/uretprobe_stack.c index 9fdcf396b8f467..a2951e2f1711b8 100644 --- a/tools/testing/selftests/bpf/progs/uretprobe_stack.c +++ b/tools/testing/selftests/bpf/progs/uretprobe_stack.c @@ -26,8 +26,8 @@ int usdt_len; SEC("uprobe//proc/self/exe:target_1") int BPF_UPROBE(uprobe_1) { - /* target_1 is recursive wit depth of 2, so we capture two separate - * stack traces, depending on which occurence it is + /* target_1 is recursive with depth of 2, so we capture two separate + * stack traces, depending on which occurrence it is */ static bool recur = false; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 9dbdf123542d3b..f19e15400b3e10 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -240,6 +240,7 @@ int big_alloc2(void *ctx) return 5; bpf_arena_free_pages(&arena, (void __arena *)pg, 2); page[i] = NULL; + barrier(); page[i + 1] = NULL; cond_break; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 87a2c60d86e6ee..0a72e0228ea9a2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -926,7 +926,7 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for non const xor src dst") __success __log_level(2) -__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (af) r0 ^= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_xor_src_dst(void) { asm volatile (" \ @@ -947,7 +947,7 @@ __naked void non_const_xor_src_dst(void) SEC("socket") __description("bounds check for non const or src dst") __success __log_level(2) -__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (4f) r0 |= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_or_src_dst(void) { asm volatile (" \ @@ -968,7 +968,7 @@ __naked void non_const_or_src_dst(void) SEC("socket") __description("bounds check for non const mul regs") __success __log_level(2) -__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__msg("5: (2f) r0 *= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") __naked void non_const_mul_regs(void) { asm volatile (" \ @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. 
test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1264,7 +1264,7 @@ __naked void mult_mixed0_sign(void) SEC("tc") __description("multiply mixed sign bounds. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=smin32=-100,smax=smax32=200)") __naked void mult_mixed1_sign(void) { asm volatile ( @@ -1287,7 +1287,7 @@ __naked void mult_mixed1_sign(void) SEC("tc") __description("multiply negative bounds") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") __naked void mult_sign_bounds(void) { asm volatile ( @@ -1311,7 +1311,7 @@ __naked void mult_sign_bounds(void) SEC("tc") __description("multiply bounds that don't cross signed boundary") __success __log_level(2) -__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__msg("r8 *= r6 {{.*}}; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") __naked void mult_no_sign_crossing(void) { asm volatile ( @@ -1331,7 +1331,7 @@ __naked void mult_no_sign_crossing(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_unsign_ovf(void) { asm volatile ( @@ -1353,7 +1353,7 @@ __naked void mult_unsign_ovf(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. 
test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_sign_ovf(void) { asm volatile ( @@ -1376,7 +1376,7 @@ __naked void mult_sign_ovf(void) SEC("socket") __description("64-bit addition, all outcomes overflow") __success __log_level(2) -__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__msg("5: (0f) r3 += r3 {{.*}} R3=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") __retval(0) __naked void add64_full_overflow(void) { @@ -1396,7 +1396,7 @@ __naked void add64_full_overflow(void) SEC("socket") __description("64-bit addition, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__msg("4: (0f) r3 += r3 {{.*}} R3=scalar()") __retval(0) __naked void add64_partial_overflow(void) { @@ -1416,7 +1416,7 @@ __naked void add64_partial_overflow(void) SEC("socket") __description("32-bit addition overflow, all outcomes overflow") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_full_overflow(void) { @@ -1436,7 +1436,7 @@ __naked void add32_full_overflow(void) SEC("socket") __description("32-bit addition, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_partial_overflow(void) { @@ -1456,7 +1456,7 @@ __naked void add32_partial_overflow(void) SEC("socket") __description("64-bit subtraction, all outcomes underflow") __success __log_level(2) -__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__msg("6: (1f) r3 -= r1 {{.*}} R3=scalar(umin=1,umax=0x8000000000000000)") __retval(0) __naked void sub64_full_overflow(void) { @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") __retval(0) __naked void sub64_partial_overflow(void) { @@ -1496,7 +1496,7 @@ __naked void sub64_partial_overflow(void) SEC("socket") __description("32-bit subtraction overflow, all outcomes underflow") __success __log_level(2) -__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__msg("5: (1c) w3 -= w1 {{.*}} R3=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_full_overflow(void) { @@ -1517,7 +1517,7 @@ __naked void sub32_full_overflow(void) SEC("socket") __description("32-bit subtraction, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("3: (1c) w3 -= w2 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("3: (1c) w3 -= w2 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_partial_overflow(void) { @@ -1617,7 +1617,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, positive overlap") __success __log_level(2) 
__flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") __retval(0) __naked void bounds_deduct_positive_overlap(void) { @@ -1650,7 +1650,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { @@ -1668,4 +1668,45 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("dead jne branch due to disagreeing tnums") +__success __log_level(2) +__naked void jne_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 != r0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("dead jeq branch due to disagreeing tnums") +__success __log_level(2) +__naked void jeq_disagreeing_tnums(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w0 = w0; \ + r0 >>= 30; \ + r0 <<= 30; \ + r1 = r0; \ + r1 += 1024; \ + if r1 == r0 goto +1; \ + exit; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index c258b0722e045e..fb4fa465d67c62 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -660,19 +660,24 @@ __naked void may_goto_interaction_x86_64(void) SEC("raw_tp") __arch_arm64 -__log_level(4) __msg("stack depth 16") -/* may_goto counter at -16 */ -__xlated("0: *(u64 *)(r10 -16) =") -__xlated("1: r1 = 1") -__xlated("2: call bpf_get_smp_processor_id") +__log_level(4) __msg("stack depth 24") +/* may_goto counter at -24 */ +__xlated("0: *(u64 *)(r10 -24) =") +/* may_goto timestamp at -16 */ +__xlated("1: *(u64 *)(r10 -16) =") +__xlated("2: r1 = 1") +__xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("3: r11 = *(u64 *)(r10 -16)") -__xlated("4: if r11 == 0x0 goto pc+3") -__xlated("5: r11 -= 1") -__xlated("6: *(u64 *)(r10 -16) = r11") +__xlated("4: r11 = *(u64 *)(r10 -24)") +__xlated("5: if r11 == 0x0 goto pc+6") +__xlated("6: r11 -= 1") +__xlated("7: if r11 != 0x0 goto pc+2") +__xlated("8: r11 = -24") +__xlated("9: call unknown") +__xlated("10: *(u64 *)(r10 -24) = r11") /* may_goto expansion ends */ -__xlated("7: *(u64 *)(r10 -8) = r1") -__xlated("8: exit") +__xlated("11: *(u64 *)(r10 -8) = r1") +__xlated("12: exit") __success __naked void may_goto_interaction_arm64(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_ctx.c b/tools/testing/selftests/bpf/progs/verifier_ctx.c index 424463094760ac..5ebf7d9bcc55e1 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ctx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ctx.c @@ -5,8 +5,6 @@ #include #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) - SEC("tc") __description("context stores via BPF_ATOMIC") __failure 
__msg("BPF_ATOMIC stores into R1 ctx is not allowed") @@ -264,4 +262,34 @@ narrow_load("sockops", bpf_sock_ops, skb_hwtstamp); unaligned_access("flow_dissector", __sk_buff, data); unaligned_access("netfilter", bpf_nf_ctx, skb); +#define padding_access(type, ctx, prev_field, sz) \ + SEC(type) \ + __description("access on " #ctx " padding after " #prev_field) \ + __naked void padding_ctx_access_##ctx(void) \ + { \ + asm volatile (" \ + r1 = *(u%[size] *)(r1 + %[off]); \ + r0 = 0; \ + exit;" \ + : \ + : __imm_const(size, sz * 8), \ + __imm_const(off, offsetofend(struct ctx, prev_field)) \ + : __clobber_all); \ + } + +__failure __msg("invalid bpf_context access") +padding_access("cgroup/bind4", bpf_sock_addr, msg_src_ip6[3], 4); + +__success +padding_access("sk_lookup", bpf_sk_lookup, remote_port, 2); + +__failure __msg("invalid bpf_context access") +padding_access("tc", __sk_buff, tstamp_type, 2); + +__failure __msg("invalid bpf_context access") +padding_access("cgroup/post_bind4", bpf_sock, dst_port, 2); + +__failure __msg("invalid bpf_context access") +padding_access("sk_reuseport", sk_reuseport_md, hash, 4); + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 181da86ba5f040..6630a92b1b47e7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -215,7 +215,7 @@ __weak int subprog_untrusted(const volatile struct task_struct *restrict task __ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_untrusted') is global and assumed valid.") __msg("Validating subprog_untrusted() func#1...") __msg(": R1=untrusted_ptr_task_struct") @@ -278,7 +278,7 @@ __weak int subprog_enum_untrusted(enum bpf_attach_type *p __arg_untrusted) SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_void_untrusted') is global and assumed valid.") __msg("Validating subprog_void_untrusted() func#1...") __msg(": R1=rdonly_untrusted_mem(sz=0)") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 52edee41caf674..c8494b682c3193 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -3,6 +3,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_arena_common.h" #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ @@ -10,6 +11,12 @@ defined(__TARGET_ARCH_loongarch)) && \ __clang_major__ >= 18 +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); +} arena SEC(".maps"); + SEC("socket") __description("LDSX, S8") __success __success_unpriv __retval(-2) @@ -65,7 +72,7 @@ __naked void ldsx_s32(void) SEC("socket") __description("LDSX, S8 range checking, privileged") __log_level(2) __success __retval(1) -__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)") +__msg("R1=scalar(smin=smin32=-128,smax=smax32=127)") __naked void ldsx_s8_range_priv(void) { asm volatile ( @@ -256,6 +263,175 @@ __naked void ldsx_ctx_8(void) : __clobber_all); } +SEC("syscall") +__description("Arena LDSX 
Disasm") +__success +__arch_x86_64 +__jited("movslq 0x10(%rax,%r12), %r14") +__jited("movswq 0x18(%rax,%r12), %r14") +__jited("movsbq 0x20(%rax,%r12), %r14") +__jited("movslq 0x10(%rdi,%r12), %r15") +__jited("movswq 0x18(%rdi,%r12), %r15") +__jited("movsbq 0x20(%rdi,%r12), %r15") +__arch_arm64 +__jited("add x11, x7, x28") +__jited("ldrsw x21, [x11, #0x10]") +__jited("add x11, x7, x28") +__jited("ldrsh x21, [x11, #0x18]") +__jited("add x11, x7, x28") +__jited("ldrsb x21, [x11, #0x20]") +__jited("add x11, x0, x28") +__jited("ldrsw x22, [x11, #0x10]") +__jited("add x11, x0, x28") +__jited("ldrsh x22, [x11, #0x18]") +__jited("add x11, x0, x28") +__jited("ldrsb x22, [x11, #0x20]") +__naked void arena_ldsx_disasm(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = r0;" + "r8 = *(s32 *)(r0 + 16);" + "r8 = *(s16 *)(r0 + 24);" + "r8 = *(s8 *)(r0 + 32);" + "r9 = *(s32 *)(r1 + 16);" + "r9 = *(s16 *)(r1 + 24);" + "r9 = *(s8 *)(r1 + 32);" + "r0 = 0;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX Exception") +__success __retval(0) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_exception(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r0 = 0xdeadbeef;" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" + "r0 = *(s8 *)(r0 + 0);" + "exit;" + : + : __imm_addr(arena) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S8") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s8(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s8 *)(r0 + 0);" +#else + "r0 = *(s8 *)(r0 + 7);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S16") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s16(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0x3fffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s16 *)(r0 + 0);" +#else + "r0 = *(s16 *)(r0 + 6);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +SEC("syscall") +__description("Arena LDSX, S32") +__success __retval(-1) +__arch_x86_64 +__arch_arm64 +__naked void arena_ldsx_s32(void *ctx) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = %[numa_no_node];" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r0 = addr_space_cast(r0, 0x0, 0x1);" + "r1 = 0xfffffffe;" + "*(u64 *)(r0 + 0) = r1;" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "r0 = *(s32 *)(r0 + 0);" +#else + "r0 = *(s32 *)(r0 + 4);" +#endif + "r0 >>= 1;" + "exit;" + :: __imm(bpf_arena_alloc_pages), + __imm_addr(arena), + __imm_const(numa_no_node, NUMA_NO_NODE) + : __clobber_all + ); +} + +/* to retain debug info for BTF 
generation */ +void kfunc_root(void) +{ + bpf_arena_alloc_pages(0, 0, 0, 0, 0); +} + #else SEC("socket") diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c new file mode 100644 index 00000000000000..c0e80850926827 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, long long); +} map SEC(".maps"); + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) frame 0 insn 1 +live -24") +__msg("(0) frame 0 insn 1 +written -8") +__msg("(0) frame 0 insn 0 +live -8,-24") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void simple_read_simple_write(void) +{ + asm volatile ( + "r1 = *(u64 *)(r10 - 8);" + "r2 = *(u64 *)(r10 - 24);" + "*(u64 *)(r10 - 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 1 +live -8") +__not_msg("(0) frame 0 insn 1 +written") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 1 +live -16") +__msg("(0) frame 0 insn 1 +written -32") +__msg("(0) live stack update done in 2 iterations") +__naked void read_write_join(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 > 42 goto 1f;" + "r0 = *(u64 *)(r10 - 8);" + "*(u64 *)(r10 - 32) = r0;" + "*(u64 *)(r10 - 40) = r0;" + "exit;" +"1:" + "r0 = *(u64 *)(r10 - 16);" + "*(u64 *)(r10 - 32) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("2: (25) if r0 > 0x2a goto pc+1") +__msg("7: (95) exit") +__msg("(0) frame 0 insn 2 +written -16") +__msg("(0) live stack update done in 2 iterations") +__msg("7: (95) exit") +__not_msg("(0) frame 0 insn 2") +__msg("(0) live stack update done in 1 iterations") +__naked void must_write_not_same_slot(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = -8;" + "if r0 > 42 goto 1f;" + "r1 = -16;" +"1:" + "r2 = r10;" + "r2 += r1;" + "*(u64 *)(r2 + 0) = r0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(0) frame 0 insn 0 +written -8,-16") +__msg("(0) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 0 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void must_write_not_same_type(void) +{ + asm volatile ( + "*(u64*)(r10 - 8) = 0;" + "r2 = r10;" + "r2 += -8;" + "r1 = %[map] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 != 0 goto 1f;" + "r0 = r10;" + "r0 += -16;" +"1:" + "*(u64 *)(r0 + 0) = 42;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_map_lookup_elem), + __imm_addr(map) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("(2,4) frame 0 insn 4 +written -8") +__msg("(2,4) live stack update done in 2 iterations") +__msg("(0) frame 0 insn 2 +written -8") +__msg("(0) live stack update done in 2 iterations") +__naked void caller_stack_write(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "call write_first_param;" + "exit;" + ::: __clobber_all); +} + +static __used __naked void write_first_param(void) +{ + asm volatile ( + "*(u64 *)(r1 + 0) = 7;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +/* caller_stack_read() 
function */
+__msg("2: .12345.... (85) call pc+4")
+__msg("5: .12345.... (85) call pc+1")
+__msg("6: 0......... (95) exit")
+/* read_first_param() function */
+__msg("7: .1........ (79) r0 = *(u64 *)(r1 +0)")
+__msg("8: 0......... (95) exit")
+/* update for callsite at (2) */
+__msg("(2,7) frame 0 insn 7 +live -8")
+__msg("(2,7) live stack update done in 2 iterations")
+__msg("(0) frame 0 insn 2 +live -8")
+__msg("(0) live stack update done in 2 iterations")
+/* update for callsite at (5) */
+__msg("(5,7) frame 0 insn 7 +live -16")
+__msg("(5,7) live stack update done in 2 iterations")
+__msg("(0) frame 0 insn 5 +live -16")
+__msg("(0) live stack update done in 2 iterations")
+__naked void caller_stack_read(void)
+{
+	asm volatile (
+	"r1 = r10;"
+	"r1 += -8;"
+	"call read_first_param;"
+	"r1 = r10;"
+	"r1 += -16;"
+	"call read_first_param;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void read_first_param(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__log_level(2)
+/* read_first_param2() function */
+__msg(" 9: .1........ (79) r0 = *(u64 *)(r1 +0)")
+__msg("10: .......... (b7) r0 = 0")
+__msg("11: 0......... (05) goto pc+0")
+__msg("12: 0......... (95) exit")
+/*
+ * The purpose of the test is to check that the checkpoint in
+ * read_first_param2() stops path traversal. This will only happen if
+ * the verifier understands that fp[0]-8 at insn (12) is not alive.
+ */
+__msg("12: safe")
+__msg("processed 20 insns")
+__naked void caller_stack_pruning(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 42 goto 1f;"
+	"r0 = %[map] ll;"
+"1:"
+	"*(u64 *)(r10 - 8) = r0;"
+	"r1 = r10;"
+	"r1 += -8;"
+	/*
+	 * fp[0]-8 is either a map pointer or a scalar, preventing
+	 * state pruning at the checkpoint created for the call.
+	 */
+	"call read_first_param2;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+static __used __naked void read_first_param2(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"
+	"r0 = 0;"
+	/*
+	 * The checkpoint at goto +0 should fire,
+	 * as the caller stack slot fp[0]-8 is not alive at this point.
+	 */
+	"goto +0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure
+__msg("R1 type=scalar expected=map_ptr")
+__naked void caller_stack_pruning_callback(void)
+{
+	asm volatile (
+	"r0 = %[map] ll;"
+	"*(u64 *)(r10 - 8) = r0;"
+	"r1 = 2;"
+	"r2 = loop_cb ll;"
+	"r3 = r10;"
+	"r3 += -8;"
+	"r4 = 0;"
+	/*
+	 * fp[0]-8 is either a map pointer or a scalar, preventing
+	 * state pruning at the checkpoint created for the call.
+	 */
+	"call %[bpf_loop];"
+	"r0 = 42;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm(bpf_loop),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+static __used __naked void loop_cb(void)
+{
+	asm volatile (
+	/*
+	 * The checkpoint at function entry should not fire, as the
+	 * caller stack slot fp[0]-8 is alive at this point.
+	 */
+	"r6 = r2;"
+	"r1 = *(u64 *)(r6 + 0);"
+	"*(u64*)(r10 - 8) = 7;"
+	"r2 = r10;"
+	"r2 += -8;"
+	"call %[bpf_map_lookup_elem];"
+	/*
+	 * This should stop the verifier on the second loop iteration,
+	 * but only if the verifier correctly maintains that fp[0]-8
+	 * is still alive.
+	 */
+	"*(u64 *)(r6 + 0) = 0;"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Because of a bug in verifier.c:compute_postorder(),
+ * the program below overflowed the traversal queue in that function.
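+ * The conditional branch below jumps to itself, producing the
+ * single-instruction loop that overflowed the queue.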
+ */
+SEC("socket")
+__naked void syzbot_postorder_bug1(void)
+{
+	asm volatile (
+	"r0 = 0;"
+	"if r0 != 0 goto -1;"
+	"exit;"
+	::: __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c
index e07b43b78fd210..fbdde80e7b9055 100644
--- a/tools/testing/selftests/bpf/progs/verifier_loops1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c
@@ -283,4 +283,25 @@ exit_%=:	\
 	: __clobber_all);
 }
 
+/*
+ * This test case triggered a bug in verifier.c:maybe_exit_scc().
+ * A speculative execution path reaches the stack access instruction,
+ * stops, and triggers maybe_exit_scc() without a matching maybe_enter_scc() call.
+ */
+SEC("socket")
+__arch_x86_64
+__caps_unpriv(CAP_BPF)
+__naked void maybe_exit_scc_bug1(void)
+{
+	asm volatile (
+	"r0 = 100;"
+"1:"
+	/* The speculative execution path reaches and stops here. */
+	"*(u64 *)(r10 - 512) = r0;"
+	/* Condition is always false, but the verifier speculatively executes the true branch. */
+	"if r0 <= 0x0 goto 1b;"
+	"exit;"
+	::: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
index 11a0791459669e..e2767d27d8aaf8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
@@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void)
 	: __clobber_all);
 }
 
+/* The first element of struct bpf_map is a 32-byte SHA256 hash, so accessing
+ * into this array is valid. The ops field now sits at offset 32.
+ */
 SEC("socket")
 __description("bpf_map_ptr: read non-existent field rejected")
 __failure
-__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4")
+__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4")
 __failure_unpriv
 __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN")
 __flag(BPF_F_ANY_ALIGNMENT)
@@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void)
 {
 	asm volatile ("					\
 	r6 = 0;						\
 	r1 = %[map_array_48b] ll;			\
-	r6 = *(u32*)(r1 + 1);				\
+	r6 = *(u32*)(r1 + 33);				\
 	r0 = 1;						\
 	exit;						\
 "	:
diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
index 3966d827f28892..6d1edaef921383 100644
--- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
@@ -9,6 +9,8 @@
 SEC("raw_tp")
 __description("may_goto 0")
 __arch_x86_64
+__arch_s390x
+__arch_arm64
 __xlated("0: r0 = 1")
 __xlated("1: exit")
 __success
@@ -27,6 +29,8 @@ __naked void may_goto_simple(void)
 SEC("raw_tp")
 __description("batch 2 of may_goto 0")
 __arch_x86_64
+__arch_s390x
+__arch_arm64
 __xlated("0: r0 = 1")
 __xlated("1: exit")
 __success
@@ -47,6 +51,8 @@ __naked void may_goto_batch_0(void)
 SEC("raw_tp")
 __description("may_goto batch with offsets 2/1/0")
 __arch_x86_64
+__arch_s390x
+__arch_arm64
 __xlated("0: r0 = 1")
 __xlated("1: exit")
 __success
@@ -69,8 +75,10 @@ __naked void may_goto_batch_1(void)
 }
 
 SEC("raw_tp")
-__description("may_goto batch with offsets 2/0 - x86_64")
+__description("may_goto batch with offsets 2/0")
 __arch_x86_64
+__arch_s390x
+__arch_arm64
 __xlated("0: *(u64 *)(r10 -16) = 65535")
 __xlated("1: *(u64 *)(r10 -8) = 0")
 __xlated("2: r11 = *(u64 *)(r10 -16)")
@@ -84,33 +92,7 @@ __xlated("9: r0 = 1")
 __xlated("10: r0 = 2")
 __xlated("11: exit")
 __success
-__naked void 
may_goto_batch_2_x86_64(void) -{ - asm volatile ( - ".8byte %[may_goto1];" - ".8byte %[may_goto3];" - "r0 = 1;" - "r0 = 2;" - "exit;" - : - : __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)), - __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0)) - : __clobber_all); -} - -SEC("raw_tp") -__description("may_goto batch with offsets 2/0 - arm64") -__arch_arm64 -__xlated("0: *(u64 *)(r10 -8) = 8388608") -__xlated("1: r11 = *(u64 *)(r10 -8)") -__xlated("2: if r11 == 0x0 goto pc+3") -__xlated("3: r11 -= 1") -__xlated("4: *(u64 *)(r10 -8) = r11") -__xlated("5: r0 = 1") -__xlated("6: r0 = 2") -__xlated("7: exit") -__success -__naked void may_goto_batch_2_arm64(void) +__naked void may_goto_batch_2(void) { asm volatile ( ".8byte %[may_goto1];" diff --git a/tools/testing/selftests/bpf/progs/verifier_mul.c b/tools/testing/selftests/bpf/progs/verifier_mul.c new file mode 100644 index 00000000000000..7145fe3351d5c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_mul.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Nandakumar Edamana */ +#include +#include +#include +#include "bpf_misc.h" + +/* Intended to test the abstract multiplication technique(s) used by + * the verifier. Using assembly to avoid compiler optimizations. + */ +SEC("fentry/bpf_fentry_test1") +void BPF_PROG(mul_precise, int x) +{ + /* First, force the verifier to be uncertain about the value: + * unsigned int a = (bpf_get_prandom_u32() & 0x2) | 0x1; + * + * Assuming the verifier is using tnum, a must be tnum{.v=0x1, .m=0x2}. + * Then a * 0x3 would be m0m1 (m for uncertain). Added imprecision + * would cause the following to fail, because the required return value + * is 0: + * return (a * 0x3) & 0x4); + */ + asm volatile ("\ + call %[bpf_get_prandom_u32];\ + r0 &= 0x2;\ + r0 |= 0x1;\ + r0 *= 0x3;\ + r0 &= 0x4;\ + if r0 != 0 goto l0_%=;\ + r0 = 0;\ + goto l1_%=;\ +l0_%=:\ + r0 = 1;\ +l1_%=:\ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 73fee2aec6983c..1fe090cd674495 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -144,21 +144,21 @@ SEC("?raw_tp") __success __log_level(2) /* * Without the bug fix there will be no history between "last_idx 3 first_idx 3" - * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * and "parent state regs=" lines. "R0=6" parts are here to help anchor * expected log messages to the one specific mark_chain_precision operation. * * This is quite fragile: if verifier checkpointing heuristic changes, this * might need adjusting. 
*/ -__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("2: (07) r0 += 1 ; R0=6") __msg("3: (35) if r0 >= 0xa goto pc+1") __msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") -__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") -__msg("3: R0_w=6") +__msg("mark_precise: frame0: parent state regs= stack=: R0=P4") +__msg("3: R0=6") __naked int state_loop_first_last_equal(void) { asm volatile ( @@ -233,8 +233,8 @@ __naked void bpf_cond_op_not_r10(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (84) w0 = -w0 ; R0_w=0xffffffff") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (84) w0 = -w0 ; R0=0xffffffff") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (84) w0 = -w0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") @@ -268,8 +268,8 @@ __naked int bpf_neg_3(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (87) r0 = -r0 ; R0_w=-1") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (87) r0 = -r0 ; R0=-1") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (87) r0 = -r0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 7c5e5e6d10ebc2..c0ce690ddb68a7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -349,11 +349,11 @@ __naked void precision_two_ids(void) SEC("socket") __success __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) -/* check thar r0 and r6 have different IDs after 'if', +/* check that r0 and r6 have different IDs after 'if', * collect_linked_regs() can't tie more than 6 registers for a single insn. */ __msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +__msg("9: (bf) r6 = r6 ; R6=scalar(id=2") /* check that r{0-5} are marked precise after 'if' */ __msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") @@ -779,12 +779,12 @@ __success __retval(0) /* Check that verifier believes r1/r0 are zero at exit */ __log_level(2) -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") __msg("from 3 to 4") -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") /* Verify that statements to randomize upper half of r1 had not been * generated. 
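The tnum reasoning behind verifier_mul.c above can be sanity-checked with a small userspace model. The following is a brute-force sketch for illustration only: tnum_from_set and its two-element domain are invented here, and the kernel's tnum_mul computes the product abstractly rather than enumerating members.

#include <stdio.h>
#include <stdint.h>

/* Tristate number: bits set in 'mask' are unknown; for the known bits,
 * 'value' holds their contents. A concrete x is covered by the tnum
 * iff (x & ~mask) == value. */
struct tnum { uint64_t value, mask; };

/* Tightest tnum covering a set of concrete values (brute force). */
static struct tnum tnum_from_set(const uint64_t *xs, int n)
{
	uint64_t known1 = xs[0], known0 = ~xs[0];
	struct tnum t;

	for (int i = 1; i < n; i++) {
		known1 &= xs[i];	/* bits that are 1 in every member */
		known0 &= ~xs[i];	/* bits that are 0 in every member */
	}
	t.value = known1;
	t.mask = ~(known1 | known0);	/* everything else is unknown */
	return t;
}

int main(void)
{
	/* a = (prandom & 0x2) | 0x1 has concrete values {0x1, 0x3}, i.e. the
	 * tnum {.value = 0x1, .mask = 0x2} from the test comment; its two
	 * possible products with 0x3 are 3 (0b0011) and 9 (0b1001). */
	uint64_t prods[] = { 0x1 * 0x3, 0x3 * 0x3 };
	struct tnum p = tnum_from_set(prods, 2);

	/* Prints value=0x1 mask=0xa, the "m0m1" pattern: bit 2 is known
	 * zero, so (a * 0x3) & 0x4 is provably 0 as the test requires. */
	printf("value=%#llx mask=%#llx\n",
	       (unsigned long long)p.value, (unsigned long long)p.mask);
	return 0;
}

If the abstract multiplication were lossier, bit 2 would become an unknown bit and the verifier could no longer prove the zero result the test demands.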
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 0d5e56dffabb8c..2b4610b53382d0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1,14 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Converted from tools/testing/selftests/bpf/verifier/sock.c */ -#include <linux/bpf.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) - struct { __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); __uint(max_entries, 1); @@ -1072,6 +1068,48 @@ int invalidate_pkt_pointers_from_global_func(struct __sk_buff *sk) return TCX_PASS; } +__noinline +long xdp_pull_data2(struct xdp_md *x, __u32 len) +{ + return bpf_xdp_pull_data(x, len); +} + +__noinline +long xdp_pull_data1(struct xdp_md *x, __u32 len) +{ + return xdp_pull_data2(x, len); +} + +/* A global function calls bpf_xdp_pull_data(), which invalidates packet + * pointers established before the global function call. + */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers_from_global_func(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + xdp_pull_data1(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + +/* XDP packet-changing kfunc calls invalidate packet pointers */ +SEC("xdp") +__failure __msg("invalid mem access") +int invalidate_xdp_pkt_pointers(struct xdp_md *x) +{ + int *p = (void *)(long)x->data; + + if ((void *)(p + 1) > (void *)(long)x->data_end) + return XDP_DROP; + bpf_xdp_pull_data(x, 0); + *p = 42; /* this is unsafe */ + return XDP_PASS; +} + __noinline int tail_call(struct __sk_buff *sk) { diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 1e5a511e8494af..7a13dbd794b2fb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -506,17 +506,17 @@ SEC("raw_tp") __log_level(2) __success /* fp-8 is spilled IMPRECISE value zero (represented by a zero value fake reg) */ -__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=0") +__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8=0") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") +__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0=0 R10=fp0 fp-16=0") /* validate that assigning R2 from STACK_SPILL with zero value doesn't mark register * precise immediately; if necessary, it will be marked precise later */ -__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=0") +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0") /* similarly, when R2 is assigned from spilled register, it is initially * imprecise, but will be marked precise later once it is used in precise context */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 fp-16_w=0") +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2=0 R10=fp0 fp-16=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") @@ -598,7 +598,7 @@ __log_level(2) __success /* fp-4 is STACK_ZERO */ __msg("2: (62) *(u32 *)(r10 -4) = 0 ; R10=fp0 fp-8=0000????") -__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8=0000????") 
+__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0000????") __msg("5: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 4: (71) r2 = *(u8 *)(r10 -1)") @@ -640,25 +640,25 @@ SEC("raw_tp") __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __success /* make sure fp-8 is IMPRECISE fake register spill */ -__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8_w=1") +__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8=1") /* and fp-16 is spilled IMPRECISE const reg */ -__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16_w=1") +__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=1") +__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2=1 R10=fp0 fp-8=1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */ -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_w=1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=1") +__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (79) r2 = *(u64 *)(r10 -16)") @@ -668,12 +668,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") /* now both fp-8 and fp-16 are precise, very good */ -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -719,22 +719,22 @@ __success /* make sure fp-8 is 
32-bit FAKE subregister spill */ __msg("3: (62) *(u32 *)(r10 -8) = 1 ; R10=fp0 fp-8=????1") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16=????1") +__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=????1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=????1") +__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2=1 R10=fp0 fp-8=????1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16=????1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=????1") +__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (61) r2 = *(u32 *)(r10 -16)") @@ -743,12 +743,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16_r=????P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 9d415f7ce599b0..ac3e418c2a9616 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -105,7 +105,7 @@ __msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") __msg("mark_precise: frame0: regs=r0 
stack= before 3: (57) r0 &= 3") __msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") __msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int fp_precise_subprog_result(void) { asm volatile ( @@ -141,7 +141,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") * anyways, at which point we'll break precision chain */ __msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int sneaky_fp_precise_subprog_result(void) { asm volatile ( @@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1") __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") diff --git a/tools/testing/selftests/bpf/progs/verifier_var_off.c b/tools/testing/selftests/bpf/progs/verifier_var_off.c index 1d36d01b746e78..f345466bca6868 100644 --- a/tools/testing/selftests/bpf/progs/verifier_var_off.c +++ b/tools/testing/selftests/bpf/progs/verifier_var_off.c @@ -114,8 +114,8 @@ __naked void stack_write_priv_vs_unpriv(void) } /* Similar to the previous test, but this time also perform a read from the - * address written to with a variable offset. The read is allowed, showing that, - * after a variable-offset write, a priviledged program can read the slots that + * address written to with a variable offset. The read is allowed, showing that, + * after a variable-offset write, a privileged program can read the slots that * were in the range of that write (even if the verifier doesn't actually know if * the slot being read was really written to or not). * @@ -157,7 +157,7 @@ __naked void stack_write_followed_by_read(void) SEC("socket") __description("variable-offset stack write clobbers spilled regs") __failure -/* In the priviledged case, dereferencing a spilled-and-then-filled +/* In the privileged case, dereferencing a spilled-and-then-filled * register is rejected because the previous variable offset stack * write might have overwritten the spilled pointer (i.e. we lose track * of the spilled register when we analyze the write). 
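To make the variable-offset pattern above concrete, here is a minimal sketch in the same naked-asm style as these selftests. It is a hypothetical test, not part of this patch, and assumes the usual bpf_misc.h helpers (__naked, __description, __imm, __clobber_all):

/* A scalar unknown to the verifier but bounded to [0, 7] is added to fp-8,
 * so the resulting pointer covers fp-8 .. fp-1 with a variable offset. */
SEC("socket")
__description("variable-offset stack write then read back, sketch")
__success
__naked void var_off_write_then_read_sketch(void)
{
	asm volatile (
	"call %[bpf_get_prandom_u32];"
	"r2 = r0;"
	"r2 &= 7;"			/* bound the offset */
	"r1 = r10;"
	"r1 += -8;"
	"r1 += r2;"			/* r1 = fp-8 + [0, 7] */
	"r0 = 0;"
	"*(u8 *)(r1 + 0) = r0;"		/* variable-offset stack write */
	"r0 = *(u8 *)(r1 + 0);"		/* privileged programs may read it back */
	"exit;"
	:
	: __imm(bpf_get_prandom_u32)
	: __clobber_all);
}

An unprivileged loader would reject the same variable-offset stack access, which is the privileged/unprivileged split the tests above exercise.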
diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile index d4e50c4509c93a..63c4d3f6a12f6b 100644 --- a/tools/testing/selftests/bpf/test_kmods/Makefile +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -8,7 +8,7 @@ Q = @ endif MODULES = bpf_testmod.ko bpf_test_no_cfi.ko bpf_test_modorder_x.ko \ - bpf_test_modorder_y.ko + bpf_test_modorder_y.ko bpf_test_rqspinlock.ko $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c new file mode 100644 index 00000000000000..769206fc70e485 --- /dev/null +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct perf_event_attr hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, + .sample_period = 100000, +}; + +static rqspinlock_t lock_a; +static rqspinlock_t lock_b; + +static struct perf_event **rqsl_evts; +static int rqsl_nevts; + +static bool test_ab = false; +module_param(test_ab, bool, 0644); +MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA situations"); + +static struct task_struct **rqsl_threads; +static int rqsl_nthreads; +static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0); + +static int pause = 0; + +static bool nmi_locks_a(int cpu) +{ + return (cpu & 1) && test_ab; +} + +static int rqspinlock_worker_fn(void *arg) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (cpu) { + atomic_inc(&rqsl_ready_cpus); + + while (!kthread_should_stop()) { + if (READ_ONCE(pause)) { + msleep(1000); + continue; + } + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_b, flags); + else + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + mdelay(20); + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_b, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + cpu_relax(); + } + return 0; + } + + while (!kthread_should_stop()) { + int expected = rqsl_nthreads > 0 ? rqsl_nthreads - 1 : 0; + int ready = atomic_read(&rqsl_ready_cpus); + + if (ready == expected && !READ_ONCE(pause)) { + for (int i = 0; i < rqsl_nevts; i++) + perf_event_enable(rqsl_evts[i]); + pr_err("Waiting 5 secs to pause the test\n"); + msleep(1000 * 5); + WRITE_ONCE(pause, 1); + pr_err("Paused the test\n"); + } else { + msleep(1000); + cpu_relax(); + } + } + return 0; +} + +static void nmi_cb(struct perf_event *event, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + unsigned long flags; + int ret; + + if (!cpu || READ_ONCE(pause)) + return; + + if (nmi_locks_a(cpu)) + ret = raw_res_spin_lock_irqsave(&lock_a, flags); + else + ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags); + + mdelay(10); + + if (nmi_locks_a(cpu) && !ret) + raw_res_spin_unlock_irqrestore(&lock_a, flags); + else if (!ret) + raw_res_spin_unlock_irqrestore(test_ab ? 
&lock_b : &lock_a, flags); +} + +static void free_rqsl_threads(void) +{ + int i; + + if (rqsl_threads) { + for_each_online_cpu(i) { + if (rqsl_threads[i]) + kthread_stop(rqsl_threads[i]); + } + kfree(rqsl_threads); + } +} + +static void free_rqsl_evts(void) +{ + int i; + + if (rqsl_evts) { + for (i = 0; i < rqsl_nevts; i++) { + if (rqsl_evts[i]) + perf_event_release_kernel(rqsl_evts[i]); + } + kfree(rqsl_evts); + } +} + +static int bpf_test_rqspinlock_init(void) +{ + int i, ret; + int ncpus = num_online_cpus(); + + pr_err("Mode = %s\n", test_ab ? "ABBA" : "AA"); + + if (ncpus < 3) + return -ENOTSUPP; + + raw_res_spin_lock_init(&lock_a); + raw_res_spin_lock_init(&lock_b); + + rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL); + if (!rqsl_evts) + return -ENOMEM; + rqsl_nevts = ncpus - 1; + + for (i = 1; i < ncpus; i++) { + struct perf_event *e; + + e = perf_event_create_kernel_counter(&hw_attr, i, NULL, nmi_cb, NULL); + if (IS_ERR(e)) { + ret = PTR_ERR(e); + goto err_perf_events; + } + rqsl_evts[i - 1] = e; + } + + rqsl_threads = kcalloc(ncpus, sizeof(*rqsl_threads), GFP_KERNEL); + if (!rqsl_threads) { + ret = -ENOMEM; + goto err_perf_events; + } + rqsl_nthreads = ncpus; + + for_each_online_cpu(i) { + struct task_struct *t; + + t = kthread_create(rqspinlock_worker_fn, NULL, "rqsl_w/%d", i); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + goto err_threads_create; + } + kthread_bind(t, i); + rqsl_threads[i] = t; + wake_up_process(t); + } + return 0; + +err_threads_create: + free_rqsl_threads(); +err_perf_events: + free_rqsl_evts(); + return ret; +} + +module_init(bpf_test_rqspinlock_init); + +static void bpf_test_rqspinlock_exit(void) +{ + free_rqsl_threads(); + free_rqsl_evts(); +} + +module_exit(bpf_test_rqspinlock_exit); + +MODULE_AUTHOR("Kumar Kartikeya Dwivedi"); +MODULE_DESCRIPTION("BPF rqspinlock stress test module"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31ff2..8074bc5f6f2004 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -62,6 +62,18 @@ struct bpf_testmod_struct_arg_5 { long d; }; +union bpf_testmod_union_arg_1 { + char a; + short b; + struct bpf_testmod_struct_arg_1 arg; +}; + +union bpf_testmod_union_arg_2 { + int a; + long b; + struct bpf_testmod_struct_arg_2 arg; +}; + __bpf_hook_start(); noinline int @@ -128,6 +140,20 @@ bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char f, return bpf_testmod_test_struct_arg_result; } +noinline int +bpf_testmod_test_union_arg_1(union bpf_testmod_union_arg_1 a, int b, int c) +{ + bpf_testmod_test_struct_arg_result = a.arg.a + b + c; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_union_arg_2(int a, union bpf_testmod_union_arg_2 b) +{ + bpf_testmod_test_struct_arg_result = a + b.arg.a + b.arg.b; + return bpf_testmod_test_struct_arg_result; +} + noinline int bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) { bpf_testmod_test_struct_arg_result = a->a; @@ -218,6 +244,16 @@ __bpf_kfunc void bpf_kfunc_rcu_task_test(struct task_struct *ptr) { } +__bpf_kfunc struct task_struct *bpf_kfunc_ret_rcu_test(void) +{ + return NULL; +} + +__bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) +{ + return NULL; +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -398,6 +434,8 @@ bpf_testmod_test_read(struct file *file, struct kobject 
*kobj, struct bpf_testmod_struct_arg_3 *struct_arg3; struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22}; struct bpf_testmod_struct_arg_5 struct_arg5 = {23, 24, 25, 26}; + union bpf_testmod_union_arg_1 union_arg1 = { .arg = {1} }; + union bpf_testmod_union_arg_2 union_arg2 = { .arg = {2, 3} }; int i = 1; while (bpf_testmod_return_ptr(i)) @@ -415,6 +453,9 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, (void)bpf_testmod_test_struct_arg_9(16, (void *)17, 18, 19, (void *)20, 21, 22, struct_arg5, 27); + (void)bpf_testmod_test_union_arg_1(union_arg1, 4, 5); + (void)bpf_testmod_test_union_arg_2(6, union_arg2); + (void)bpf_testmod_test_arg_ptr_to_struct(&struct_arg1_2); (void)trace_bpf_testmod_test_raw_tp_null_tp(NULL); @@ -500,15 +541,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +567,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; @@ -623,6 +671,8 @@ BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) @@ -1057,6 +1107,8 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) return args->a; } +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1097,6 +1149,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1528,6 +1581,114 @@ static struct bpf_struct_ops testmod_st_ops = { .owner = THIS_MODULE, }; +struct hlist_head multi_st_ops_list; +static DEFINE_SPINLOCK(multi_st_ops_lock); + +static int multi_st_ops_init(struct btf *btf) +{ + spin_lock_init(&multi_st_ops_lock); + INIT_HLIST_HEAD(&multi_st_ops_list); + + return 0; +} + +static int multi_st_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + + hlist_for_each_entry(st_ops, &multi_st_ops_list, node) { + 
if (st_ops->id == id) + return st_ops; + } + + return NULL; +} + +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + int ret = -1; + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + ret = st_ops->test_1(args); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return ret; +} + +static int multi_st_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops = + (struct bpf_testmod_multi_st_ops *)kdata; + unsigned long flags; + int err = 0; + u32 id; + + if (!st_ops->test_1) + return -EINVAL; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + if (multi_st_ops_find_nolock(id)) { + pr_err("multi_st_ops(id:%d) has already been registered\n", id); + err = -EEXIST; + goto unlock; + } + + st_ops->id = id; + hlist_add_head(&st_ops->node, &multi_st_ops_list); +unlock: + spin_unlock_irqrestore(&multi_st_ops_lock, flags); + + return err; +} + +static void multi_st_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_testmod_multi_st_ops *st_ops; + unsigned long flags; + u32 id; + + id = bpf_struct_ops_id(kdata); + + spin_lock_irqsave(&multi_st_ops_lock, flags); + st_ops = multi_st_ops_find_nolock(id); + if (st_ops) + hlist_del(&st_ops->node); + spin_unlock_irqrestore(&multi_st_ops_lock, flags); +} + +static int bpf_testmod_multi_st_ops__test_1(struct st_ops_args *args) +{ + return 0; +} + +static struct bpf_testmod_multi_st_ops multi_st_ops_cfi_stubs = { + .test_1 = bpf_testmod_multi_st_ops__test_1, +}; + +struct bpf_struct_ops testmod_multi_st_ops = { + .verifier_ops = &bpf_testmod_verifier_ops, + .init = multi_st_ops_init, + .init_member = multi_st_ops_init_member, + .reg = multi_st_ops_reg, + .unreg = multi_st_ops_unreg, + .cfi_stubs = &multi_st_ops_cfi_stubs, + .name = "bpf_testmod_multi_st_ops", + .owner = THIS_MODULE, +}; + extern int bpf_fentry_test1(int a); static int bpf_testmod_init(void) @@ -1550,6 +1711,7 @@ static int bpf_testmod_init(void) ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2); ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops3, bpf_testmod_ops3); ret = ret ?: register_bpf_struct_ops(&testmod_st_ops, bpf_testmod_st_ops); + ret = ret ?: register_bpf_struct_ops(&testmod_multi_st_ops, bpf_testmod_multi_st_ops); ret = ret ?: register_btf_id_dtor_kfuncs(bpf_testmod_dtors, ARRAY_SIZE(bpf_testmod_dtors), THIS_MODULE); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h index c9fab51f16e2b7..f6e492f9d04268 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.h @@ -116,4 +116,10 @@ struct bpf_testmod_st_ops { struct module *owner; }; +struct bpf_testmod_multi_st_ops { + int (*test_1)(struct st_ops_args *args); + struct hlist_node node; + int id; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index b58817938debd8..4df6fa6a92cba7 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -158,5 +158,9 @@ void bpf_kfunc_trusted_vma_test(struct vm_area_struct *ptr) __ksym; void bpf_kfunc_trusted_task_test(struct task_struct *ptr) __ksym; void bpf_kfunc_trusted_num_test(int *ptr) __ksym; void 
bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; +struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; +int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; + +int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 4694422aa76c36..88e4aeab21b7bc 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) /* Let's try detach it before it was ever attached */ ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2); - if (ret != -1 || errno != ENOENT) { + if (ret != -ENOENT) { printf("bpf_prog_detach2 not attached should fail: %m\n"); return 1; } diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 78423cf89e01bb..74ecc281bb8c12 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -2,7 +2,6 @@ /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include #include -#include #include #include @@ -20,10 +19,12 @@ #define TEST_TAG_EXPECT_FAILURE "comment:test_expect_failure" #define TEST_TAG_EXPECT_SUCCESS "comment:test_expect_success" #define TEST_TAG_EXPECT_MSG_PFX "comment:test_expect_msg=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX "comment:test_expect_not_msg=" #define TEST_TAG_EXPECT_XLATED_PFX "comment:test_expect_xlated=" #define TEST_TAG_EXPECT_FAILURE_UNPRIV "comment:test_expect_failure_unpriv" #define TEST_TAG_EXPECT_SUCCESS_UNPRIV "comment:test_expect_success_unpriv" #define TEST_TAG_EXPECT_MSG_PFX_UNPRIV "comment:test_expect_msg_unpriv=" +#define TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV "comment:test_expect_not_msg_unpriv=" #define TEST_TAG_EXPECT_XLATED_PFX_UNPRIV "comment:test_expect_xlated_unpriv=" #define TEST_TAG_LOG_LEVEL_PFX "comment:test_log_level=" #define TEST_TAG_PROG_FLAGS_PFX "comment:test_prog_flags=" @@ -38,6 +39,10 @@ #define TEST_TAG_JITED_PFX_UNPRIV "comment:test_jited_unpriv=" #define TEST_TAG_CAPS_UNPRIV "comment:test_caps_unpriv=" #define TEST_TAG_LOAD_MODE_PFX "comment:load_mode=" +#define TEST_TAG_EXPECT_STDERR_PFX "comment:test_expect_stderr=" +#define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv=" +#define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout=" +#define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xbadcafe @@ -61,24 +66,14 @@ enum load_mode { NO_JITED = 1 << 1, }; -struct expect_msg { - const char *substr; /* substring match */ - regex_t regex; - bool is_regex; - bool on_next_line; -}; - -struct expected_msgs { - struct expect_msg *patterns; - size_t cnt; -}; - struct test_subspec { char *name; bool expect_failure; struct expected_msgs expect_msgs; struct expected_msgs expect_xlated; struct expected_msgs jited; + struct expected_msgs stderr; + struct expected_msgs stdout; int retval; bool execute; __u64 caps; @@ -139,6 +134,10 @@ static void free_test_spec(struct test_spec *spec) free_msgs(&spec->unpriv.expect_xlated); free_msgs(&spec->priv.jited); free_msgs(&spec->unpriv.jited); + free_msgs(&spec->unpriv.stderr); + free_msgs(&spec->priv.stderr); + free_msgs(&spec->unpriv.stdout); + free_msgs(&spec->priv.stdout); free(spec->priv.name); free(spec->unpriv.name); @@ -206,7 +205,8 @@ static int compile_regex(const char 
*pattern, regex_t *regex) return 0; } -static int __push_msg(const char *pattern, bool on_next_line, struct expected_msgs *msgs) +static int __push_msg(const char *pattern, bool on_next_line, bool negative, + struct expected_msgs *msgs) { struct expect_msg *msg; void *tmp; @@ -222,6 +222,7 @@ static int __push_msg(const char *pattern, bool on_next_line, struct expected_ms msg = &msgs->patterns[msgs->cnt]; msg->on_next_line = on_next_line; msg->substr = pattern; + msg->negative = negative; msg->is_regex = false; if (strstr(pattern, "{{")) { err = compile_regex(pattern, &msg->regex); @@ -240,16 +241,16 @@ static int clone_msgs(struct expected_msgs *from, struct expected_msgs *to) for (i = 0; i < from->cnt; i++) { msg = &from->patterns[i]; - err = __push_msg(msg->substr, msg->on_next_line, to); + err = __push_msg(msg->substr, msg->on_next_line, msg->negative, to); if (err) return err; } return 0; } -static int push_msg(const char *substr, struct expected_msgs *msgs) +static int push_msg(const char *substr, bool negative, struct expected_msgs *msgs) { - return __push_msg(substr, false, msgs); + return __push_msg(substr, false, negative, msgs); } static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct expected_msgs *msgs) @@ -260,7 +261,7 @@ static int push_disasm_msg(const char *regex_str, bool *on_next_line, struct exp *on_next_line = false; return 0; } - err = __push_msg(regex_str, *on_next_line, msgs); + err = __push_msg(regex_str, *on_next_line, false, msgs); if (err) return err; *on_next_line = true; @@ -374,6 +375,7 @@ enum arch { ARCH_X86_64 = 0x2, ARCH_ARM64 = 0x4, ARCH_RISCV64 = 0x8, + ARCH_S390X = 0x10, }; static int get_current_arch(void) @@ -384,6 +386,8 @@ static int get_current_arch(void) return ARCH_ARM64; #elif defined(__riscv) && __riscv_xlen == 64 return ARCH_RISCV64; +#elif defined(__s390x__) + return ARCH_S390X; #endif return ARCH_UNKNOWN; } @@ -404,6 +408,10 @@ static int parse_test_spec(struct test_loader *tester, bool xlated_on_next_line = true; bool unpriv_jit_on_next_line; bool jit_on_next_line; + bool stderr_on_next_line = true; + bool unpriv_stderr_on_next_line = true; + bool stdout_on_next_line = true; + bool unpriv_stdout_on_next_line = true; bool collect_jit = false; int func_id, i, err = 0; u32 arch_mask = 0; @@ -465,12 +473,22 @@ static int parse_test_spec(struct test_loader *tester, spec->auxiliary = true; spec->mode_mask |= UNPRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX))) { - err = push_msg(msg, &spec->priv.expect_msgs); + err = push_msg(msg, false, &spec->priv.expect_msgs); + if (err) + goto cleanup; + spec->mode_mask |= PRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX))) { + err = push_msg(msg, true, &spec->priv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= PRIV; } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_MSG_PFX_UNPRIV))) { - err = push_msg(msg, &spec->unpriv.expect_msgs); + err = push_msg(msg, false, &spec->unpriv.expect_msgs); + if (err) + goto cleanup; + spec->mode_mask |= UNPRIV; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_NOT_MSG_PFX_UNPRIV))) { + err = push_msg(msg, true, &spec->unpriv.expect_msgs); if (err) goto cleanup; spec->mode_mask |= UNPRIV; @@ -565,8 +583,10 @@ static int parse_test_spec(struct test_loader *tester, arch = ARCH_ARM64; } else if (strcmp(val, "RISCV64") == 0) { arch = ARCH_RISCV64; + } else if (strcmp(val, "s390x") == 0) { + arch = ARCH_S390X; } else { - PRINT_FAIL("bad arch spec: '%s'", val); + PRINT_FAIL("bad arch spec: 
'%s'\n", val); err = -EINVAL; goto cleanup; } @@ -593,6 +613,26 @@ static int parse_test_spec(struct test_loader *tester, err = -EINVAL; goto cleanup; } + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX))) { + err = push_disasm_msg(msg, &stderr_on_next_line, + &spec->priv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDERR_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stderr_on_next_line, + &spec->unpriv.stderr); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX))) { + err = push_disasm_msg(msg, &stdout_on_next_line, + &spec->priv.stdout); + if (err) + goto cleanup; + } else if ((msg = skip_dynamic_pfx(s, TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV))) { + err = push_disasm_msg(msg, &unpriv_stdout_on_next_line, + &spec->unpriv.stdout); + if (err) + goto cleanup; } } @@ -646,6 +686,10 @@ static int parse_test_spec(struct test_loader *tester, clone_msgs(&spec->priv.expect_xlated, &spec->unpriv.expect_xlated); if (spec->unpriv.jited.cnt == 0) clone_msgs(&spec->priv.jited, &spec->unpriv.jited); + if (spec->unpriv.stderr.cnt == 0) + clone_msgs(&spec->priv.stderr, &spec->unpriv.stderr); + if (spec->unpriv.stdout.cnt == 0) + clone_msgs(&spec->priv.stdout, &spec->unpriv.stdout); } spec->valid = true; @@ -707,44 +751,155 @@ static void emit_jited(const char *jited, bool force) fprintf(stdout, "JITED:\n=============\n%s=============\n", jited); } -static void validate_msgs(char *log_buf, struct expected_msgs *msgs, - void (*emit_fn)(const char *buf, bool force)) +static void emit_stderr(const char *stderr, bool force) { - const char *log = log_buf, *prev_match; + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr); +} + +static void emit_stdout(const char *bpf_stdout, bool force) +{ + if (!force && env.verbosity == VERBOSE_NONE) + return; + fprintf(stdout, "STDOUT:\n=============\n%s=============\n", bpf_stdout); +} + +static const char *match_msg(struct expect_msg *msg, const char **log) +{ + const char *match = NULL; regmatch_t reg_match[1]; - int prev_match_line; - int match_line; - int i, j, err; + int err; + + if (!msg->is_regex) { + match = strstr(*log, msg->substr); + if (match) + *log = match + strlen(msg->substr); + } else { + err = regexec(&msg->regex, *log, 1, reg_match, 0); + if (err == 0) { + match = *log + reg_match[0].rm_so; + *log += reg_match[0].rm_eo; + } + } + return match; +} + +static int count_lines(const char *start, const char *end) +{ + const char *tmp; + int n = 0; + + for (tmp = start; tmp < end; ++tmp) + if (*tmp == '\n') + n++; + return n; +} + +struct match { + const char *start; + const char *end; + int line; +}; + +/* + * Positive messages are matched sequentially, each next message + * is looked for starting from the end of a previous matched one. 
+ */ +static void match_positive_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *prev_match; + int i, line; - prev_match_line = -1; - match_line = 0; prev_match = log; + line = 0; for (i = 0; i < msgs->cnt; i++) { struct expect_msg *msg = &msgs->patterns[i]; - const char *match = NULL, *pat_status; - bool wrong_line = false; - - if (!msg->is_regex) { - match = strstr(log, msg->substr); - if (match) - log = match + strlen(msg->substr); - } else { - err = regexec(&msg->regex, log, 1, reg_match, 0); - if (err == 0) { - match = log + reg_match[0].rm_so; - log += reg_match[0].rm_eo; + const char *match = NULL; + + if (msg->negative) + continue; + + match = match_msg(msg, &log); + if (match) { + line += count_lines(prev_match, match); + matches[i].start = match; + matches[i].end = log; + matches[i].line = line; + prev_match = match; + } + } +} + +/* + * Each negative message N located between positive messages P1 and P2 + * is matched in the span P1.end .. P2.start. Consequently, negative messages + * are unordered within the span. + */ +static void match_negative_msgs(const char *log, struct expected_msgs *msgs, struct match *matches) +{ + const char *start = log, *end, *next, *match; + const char *log_end = log + strlen(log); + int i, j, next_positive; + + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + + /* positive message bumps span start */ + if (!msg->negative) { + start = matches[i].end ?: start; + continue; + } + + /* count stride of negative patterns and adjust span end */ + end = log_end; + for (next_positive = i + 1; next_positive < msgs->cnt; next_positive++) { + if (!msgs->patterns[next_positive].negative) { + end = matches[next_positive].start; + break; } } - if (match) { - for (; prev_match < match; ++prev_match) - if (*prev_match == '\n') - ++match_line; - wrong_line = msg->on_next_line && prev_match_line >= 0 && - prev_match_line + 1 != match_line; + /* try matching each negative message within the identified span */ + for (j = i; j < next_positive; j++) { + next = start; + match = match_msg(&msgs->patterns[j], &next); + if (match && next <= end) { + matches[j].start = match; + matches[j].end = next; + } } - if (!match || wrong_line) { + /* -1 to account for i++ */ + i = next_positive - 1; + } +} + +void validate_msgs(const char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)) +{ + struct match matches[msgs->cnt]; + struct match *prev_match = NULL; + int i, j; + + memset(matches, 0, sizeof(*matches) * msgs->cnt); + match_positive_msgs(log_buf, msgs, matches); + match_negative_msgs(log_buf, msgs, matches); + + for (i = 0; i < msgs->cnt; i++) { + struct expect_msg *msg = &msgs->patterns[i]; + struct match *match = &matches[i]; + const char *pat_status; + bool unexpected; + bool wrong_line; + bool no_match; + + no_match = !msg->negative && !match->start; + wrong_line = !msg->negative && + msg->on_next_line && + prev_match && prev_match->line + 1 != match->line; + unexpected = msg->negative && match->start; + if (no_match || wrong_line || unexpected) { PRINT_FAIL("expect_msg\n"); if (env.verbosity == VERBOSE_NONE) emit_fn(log_buf, true /*force*/); @@ -754,8 +909,10 @@ static void validate_msgs(char *log_buf, struct expected_msgs *msgs, pat_status = "MATCHED "; else if (wrong_line) pat_status = "WRONG LINE"; - else + else if (no_match) pat_status = "EXPECTED "; + else + pat_status = "UNEXPECTED"; msg = &msgs->patterns[j]; fprintf(stderr, "%s %s: '%s'\n", pat_status, @@ -765,12 +922,13 @@ 
static void validate_msgs(char *log_buf, struct expected_msgs *msgs, if (wrong_line) { fprintf(stderr, "expecting match at line %d, actual match is at line %d\n", - prev_match_line + 1, match_line); + prev_match->line + 1, match->line); } break; } - prev_match_line = match_line; + if (!msg->negative) + prev_match = match; } } @@ -929,6 +1087,19 @@ static int get_xlated_program_text(int prog_fd, char *text, size_t text_sz) return err; } +/* Read the bpf stream corresponding to the stream_id */ +static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz) +{ + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + int ret; + + ret = bpf_prog_stream_read(prog_fd, stream_id, text, text_sz, &ropts); + ASSERT_GT(ret, 0, "stream read"); + text[ret] = '\0'; + + return ret; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -1083,7 +1254,7 @@ void run_subtest(struct test_loader *tester, link = bpf_map__attach_struct_ops(map); if (!link) { PRINT_FAIL("bpf_map__attach_struct_ops failed for map %s: err=%d\n", - bpf_map__name(map), err); + bpf_map__name(map), -errno); goto tobj_cleanup; } links[links_cnt++] = link; @@ -1103,6 +1274,31 @@ void run_subtest(struct test_loader *tester, PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; } + + if (subspec->stderr.cnt) { + err = get_stream(2, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err <= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + goto tobj_cleanup; + } + emit_stderr(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr); + } + + if (subspec->stdout.cnt) { + err = get_stream(1, bpf_program__fd(tprog), + tester->log_buf, tester->log_buf_sz); + if (err <= 0) { + PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", + err, errno); + goto tobj_cleanup; + } + emit_stdout(tester->log_buf, false /*force*/); + validate_msgs(tester->log_buf, &subspec->stdout, emit_stdout); + } + /* redo bpf_map__attach_struct_ops for each test */ while (links_cnt > 0) bpf_link__destroy(links[--links_cnt]); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 309d9d4a8ace1d..02a85dda30e649 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -14,12 +14,14 @@ #include #include #include +#include #include #include #include #include "json_writer.h" #include "network_helpers.h" +#include "verification_cert.h" /* backtrace() and backtrace_symbols_fd() are glibc specific, * use header file when glibc is available and provide stub @@ -1928,6 +1930,13 @@ static void free_test_states(void) } } +static __u32 register_session_key(const char *key_data, size_t key_data_size) +{ + return syscall(__NR_add_key, "asymmetric", "libbpf_session_key", + (const void *)key_data, key_data_size, + KEY_SPEC_SESSION_KEYRING); +} + int main(int argc, char **argv) { static const struct argp argp = { @@ -1961,6 +1970,10 @@ int main(int argc, char **argv) /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); + err = register_session_key((const char *)test_progs_verification_cert, + test_progs_verification_cert_len); + if (err < 0) + return err; traffic_monitor_set_print(traffic_monitor_print_fn); diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 
df2222a1806fd1..eebfc18cdcd21d 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -546,4 +547,20 @@ extern void test_loader_fini(struct test_loader *tester); test_loader_fini(&tester); \ }) +struct expect_msg { + const char *substr; /* substring match */ + regex_t regex; + bool is_regex; + bool on_next_line; + bool negative; +}; + +struct expected_msgs { + struct expect_msg *patterns; + size_t cnt; +}; + +void validate_msgs(const char *log_buf, struct expected_msgs *msgs, + void (*emit_fn)(const char *buf, bool force)); + #endif /* __TEST_PROGS_H */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index fd2da2234cc9b4..76568db7a66422 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1372,7 +1372,7 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) } else fprintf(stderr, "unknown test\n"); out: - /* Detatch and zero all the maps */ + /* Detach and zero all the maps */ bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); for (i = 0; i < ARRAY_SIZE(links); i++) { diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index 595194453ff8f8..35b4893ccdf8ae 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -15,20 +15,18 @@ #include #include #include -#include #include -#include -#include "bpf_util.h" #include "cgroup_helpers.h" #include "test_tcpnotify.h" -#include "trace_helpers.h" #include "testing_helpers.h" #define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ? getpagesize() : 8192L) pthread_t tid; +static bool exit_thread; + int rx_callbacks; static void dummyfn(void *ctx, int cpu, void *data, __u32 size) @@ -45,7 +43,7 @@ void tcp_notifier_poller(struct perf_buffer *pb) { int err; - while (1) { + while (!exit_thread) { err = perf_buffer__poll(pb, 100); if (err < 0 && err != -EINTR) { printf("failed perf_buffer__poll: %d\n", err); @@ -78,15 +76,10 @@ int main(int argc, char **argv) int error = EXIT_FAILURE; struct bpf_object *obj; char test_script[80]; - cpu_set_t cpuset; __u32 key = 0; libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - cg_fd = cgroup_setup_and_join(cg_path); if (cg_fd < 0) goto err; @@ -151,6 +144,13 @@ int main(int argc, char **argv) sleep(10); + exit_thread = true; + int ret = pthread_join(tid, NULL); + if (ret) { + printf("FAILED: pthread_join\n"); + goto err; + } + if (verify_result(&g)) { printf("FAILED: Wrong stats Expected %d calls, got %d\n", g.ncalls, rx_callbacks); diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 65aafe0003db05..62db060298a4a3 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -241,4 +241,6 @@ done if [ $failures -eq 0 ]; then echo "All tests successful!" 
+else + exit 1 fi diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 5e9f16683be546..16eb37e5bad697 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -399,7 +399,7 @@ int unload_module(const char *name, bool verbose) return 0; } -int load_module(const char *path, bool verbose) +static int __load_module(const char *path, const char *param_values, bool verbose) { int fd; @@ -411,7 +411,7 @@ int load_module(const char *path, bool verbose) fprintf(stdout, "Can't find %s kernel module: %d\n", path, -errno); return -ENOENT; } - if (finit_module(fd, "", 0)) { + if (finit_module(fd, param_values, 0)) { fprintf(stdout, "Failed to load %s into the kernel: %d\n", path, -errno); close(fd); return -EINVAL; @@ -423,6 +423,16 @@ int load_module(const char *path, bool verbose) return 0; } +int load_module_params(const char *path, const char *param_values, bool verbose) +{ + return __load_module(path, param_values, verbose); +} + +int load_module(const char *path, bool verbose) +{ + return __load_module(path, "", verbose); +} + int unload_bpf_testmod(bool verbose) { return unload_module("bpf_testmod", verbose); diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 46d7f7089f636b..eb20d377221854 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -39,6 +39,7 @@ int kern_sync_rcu(void); int finit_module(int fd, const char *param_values, int flags); int delete_module(const char *name, int flags); int load_module(const char *path, bool verbose); +int load_module_params(const char *path, const char *param_values, bool verbose); int unload_module(const char *name, bool verbose); static inline __u64 get_time_ns(void) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 81943c6254e6bc..171987627f3a7b 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -17,7 +17,9 @@ #include #include #include +#include "bpf/hashmap.h" #include "bpf/libbpf_internal.h" +#include "bpf_util.h" #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" #define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" @@ -519,3 +521,235 @@ void read_trace_pipe(void) { read_trace_pipe_iter(trace_pipe_cb, NULL, 0); } + +static size_t symbol_hash(long key, void *ctx __maybe_unused) +{ + return str_hash((const char *) key); +} + +static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return strcmp((const char *) key1, (const char *) key2) == 0; +} + +static bool is_invalid_entry(char *buf, bool kernel) +{ + if (kernel && strchr(buf, '[')) + return true; + if (!kernel && !strchr(buf, '[')) + return true; + return false; +} + +static const char * const trace_blacklist[] = { + "migrate_disable", + "migrate_enable", + "rcu_read_unlock_strict", + "preempt_count_add", + "preempt_count_sub", + "__rcu_read_lock", + "__rcu_read_unlock", + "bpf_get_numa_node_id", +}; + +static bool skip_entry(char *name) +{ + int i; + + /* + * We attach to almost all kernel functions and some of them + * will cause 'suspicious RCU usage' when fprobe is attached + * to them. Filter out the current culprits - arch_cpu_idle + * default_idle and rcu_* functions. 
+ */ + if (!strcmp(name, "arch_cpu_idle")) + return true; + if (!strcmp(name, "default_idle")) + return true; + if (!strncmp(name, "rcu_", 4)) + return true; + if (!strcmp(name, "bpf_dispatcher_xdp_func")) + return true; + if (!strncmp(name, "__ftrace_invalid_address__", + sizeof("__ftrace_invalid_address__") - 1)) + return true; + + for (i = 0; i < ARRAY_SIZE(trace_blacklist); i++) { + if (!strcmp(name, trace_blacklist[i])) + return true; + } + + return false; +} + +/* Do comparison by ignoring '.llvm.' suffixes. */ +static int compare_name(const char *name1, const char *name2) +{ + const char *res1, *res2; + int len1, len2; + + res1 = strstr(name1, ".llvm."); + res2 = strstr(name2, ".llvm."); + len1 = res1 ? res1 - name1 : strlen(name1); + len2 = res2 ? res2 - name2 : strlen(name2); + + if (len1 == len2) + return strncmp(name1, name2, len1); + if (len1 < len2) + return strncmp(name1, name2, len1) <= 0 ? -1 : 1; + return strncmp(name1, name2, len2) >= 0 ? 1 : -1; +} + +static int load_kallsyms_compare(const void *p1, const void *p2) +{ + return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); +} + +static int search_kallsyms_compare(const void *p1, const struct ksym *p2) +{ + return compare_name(p1, p2->name); +} + +int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) +{ + size_t cap = 0, cnt = 0; + char *name = NULL, *ksym_name, **syms = NULL; + struct hashmap *map; + struct ksyms *ksyms; + struct ksym *ks; + char buf[256]; + FILE *f; + int err = 0; + + ksyms = load_kallsyms_custom_local(load_kallsyms_compare); + if (!ksyms) + return -EINVAL; + + /* + * The available_filter_functions file contains many duplicates, + * but other than that all symbols are usable to trace. + * Duplicates are filtered out using hashmap__add, which won't + * add an existing entry. + */ + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); + + if (!f) + return -EINVAL; + + map = hashmap__new(symbol_hash, symbol_equal, NULL); + if (IS_ERR(map)) { + err = libbpf_get_error(map); + goto error; + } + + while (fgets(buf, sizeof(buf), f)) { + if (is_invalid_entry(buf, kernel)) + continue; + + free(name); + if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) + continue; + if (skip_entry(name)) + continue; + + ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); + if (!ks) { + err = -EINVAL; + goto error; + } + + ksym_name = ks->name; + err = hashmap__add(map, ksym_name, 0); + if (err == -EEXIST) { + err = 0; + continue; + } + if (err) + goto error; + + err = libbpf_ensure_mem((void **) &syms, &cap, + sizeof(*syms), cnt + 1); + if (err) + goto error; + + syms[cnt++] = ksym_name; + } + + *symsp = syms; + *cntp = cnt; + +error: + free(name); + fclose(f); + hashmap__free(map); + if (err) + free(syms); + return err; +} + +int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) +{ + unsigned long *addr, *addrs, *tmp_addrs; + int err = 0, max_cnt, inc_cnt; + char *name = NULL; + size_t cnt = 0; + char buf[256]; + FILE *f; + + if (access("/sys/kernel/tracing/trace", F_OK) == 0) + f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); + else + f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); + + if (!f) + return -ENOENT; + + /* In my local setup, the number of entries is 50k+, so let us initially + * allocate space to hold 64k entries. 
If 64k is not enough, incrementally
+	 * increase by 1k each time.
+	 */
+	max_cnt = 65536;
+	inc_cnt = 1024;
+	addrs = malloc(max_cnt * sizeof(long));
+	if (addrs == NULL) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		if (is_invalid_entry(buf, kernel))
+			continue;
+
+		free(name);
+		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
+			continue;
+		if (skip_entry(name))
+			continue;
+
+		if (cnt == max_cnt) {
+			max_cnt += inc_cnt;
+			tmp_addrs = realloc(addrs, max_cnt * sizeof(long));
+			if (!tmp_addrs) {
+				err = -ENOMEM;
+				goto error;
+			}
+			addrs = tmp_addrs;
+		}
+
+		addrs[cnt++] = (unsigned long)addr;
+	}
+
+	*addrsp = addrs;
+	*cntp = cnt;
+
+error:
+	free(name);
+	fclose(f);
+	if (err)
+		free(addrs);
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 2ce873c9f9aad6..9437bdd4afa505 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -41,4 +41,7 @@ ssize_t get_rel_offset(uintptr_t addr);
 
 int read_build_id(const char *path, char *build_id, size_t size);
 
+int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel);
+int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
+
 #endif
diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h
new file mode 100644
index 00000000000000..549d1f77481019
--- /dev/null
+++ b/tools/testing/selftests/bpf/usdt.h
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: BSD-2-Clause
+/*
+ * This single-header library defines a collection of variadic macros for
+ * defining and triggering USDTs (User Statically-Defined Tracepoints):
+ *
+ * - For USDTs without an associated semaphore:
+ *   USDT(group, name, args...)
+ *
+ * - For USDTs with an implicit (transparent to the user) semaphore:
+ *   USDT_WITH_SEMA(group, name, args...)
+ *   USDT_IS_ACTIVE(group, name)
+ *
+ * - For USDTs with an explicit (user-defined and provided) semaphore:
+ *   USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...)
+ *   USDT_SEMA_IS_ACTIVE(sema)
+ *
+ * all of which emit a NOP instruction into the instruction stream, and so
+ * have *zero* overhead for the surrounding code. USDTs are identified by
+ * a combination of `group` and `name` identifiers, which is used by external
+ * tracing tooling (tracers) for identifying the exact USDTs of interest.
+ *
+ * USDTs can have an associated (2-byte) activity counter (USDT semaphore),
+ * automatically maintained by the Linux kernel whenever any correctly written
+ * BPF-based tracer is attached to the USDT. This USDT semaphore can be used
+ * to check whether there is a need to do any extra data collection and
+ * processing for a given USDT (if necessary), and otherwise avoid the extra
+ * work for the common case of the USDT not being traced (not "active").
+ *
+ * See the documentation for the USDT_WITH_SEMA()/USDT_IS_ACTIVE() and
+ * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on
+ * working with USDTs with implicitly or explicitly associated
+ * USDT semaphores, respectively.
+ *
+ * There is also some additional data recorded into an auxiliary note
+ * section. The data in the note section describes the operands, in terms of
+ * size and location, used by tracing tooling to know where to find USDT
+ * arguments. Each location is encoded as an assembler operand string.
+ * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert
+ * breakpoints on top of the nop, and decode the location operand-strings,
+ * like an assembler, to find the values being passed.
+ *
+ * The operand strings are selected by the compiler for each operand.
+ * They are constrained by inline-assembler codes. The default is:
+ *
+ * #define USDT_ARG_CONSTRAINT nor
+ *
+ * This is a good default if the operands tend to be integral and
+ * moderate in number (smaller than the number of registers). In other
+ * cases, the compiler may report "'asm' requires impossible reload" or
+ * similar. In this case, consider simplifying the macro call (fewer
+ * and simpler operands), reducing the optimization level, or overriding
+ * the default constraints string via:
+ *
+ * #define USDT_ARG_CONSTRAINT g
+ * #include <usdt.h>
+ *
+ * For some historical description of the USDT v3 format (the one used by this
+ * library and generally recognized and assumed by BPF-based tracing tools)
+ * see [0]. The more formal specification can be found at [1]. Additional
+ * argument constraints information can be found at [2].
+ *
+ * The original SystemTap sys/sdt.h implementation ([3]) was used as a base
+ * for this USDT library implementation. The current implementation differs
+ * *a lot* in terms of exposed user API and general usability, which was the
+ * main goal and focus of the reimplementation work. Nevertheless, the
+ * underlying recorded USDT definitions are fully binary compatible and any
+ * USDT-based tooling should work equally well with USDTs defined by either
+ * SystemTap's or this library's USDT implementation.
+ *
+ * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html
+ * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+ * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
+ * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h
+ */
+#ifndef __USDT_H
+#define __USDT_H
+
+/*
+ * Changelog:
+ *
+ * 0.1.0
+ * -----
+ * - Initial release
+ */
+#define USDT_MAJOR_VERSION 0
+#define USDT_MINOR_VERSION 1
+#define USDT_PATCH_VERSION 0
+
+/* C++20 and C23 added __VA_OPT__ as a standard replacement for the
+ * non-standard `##__VA_ARGS__` extension */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
+#define __usdt_va_opt 1
+#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
+#else
+#define __usdt_va_args(...) , ##__VA_ARGS__
+#endif
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. No USDT semaphore is
+ * associated with this USDT.
+ *
+ * Such "semaphoreless" USDTs are commonly used when there is no extra data
+ * collection or processing needed to collect and prepare USDT arguments and
+ * they are just available in the surrounding code. The USDT() macro will just
+ * record their locations in CPU registers or in memory for tracing tooling to
+ * be able to access them, if necessary.
+ */
+#ifdef __usdt_va_opt
+#define USDT(group, name, ...)						\
+	__usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT(group, name, ...)						\
+	__usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__)
+#endif
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. The USDT also gets an
+ * implicitly-defined associated USDT semaphore, which will be "activated" by
+ * tracing tooling and can be used to check whether the USDT is being actively
+ * observed.
+ *
+ * USDTs with a semaphore are commonly used when there is a need to perform
+ * additional data collection and processing to prepare USDT arguments, which
+ * otherwise might not be necessary for the rest of the application logic. In
+ * such a case, the USDT semaphore can be used to avoid unnecessary extra
+ * work. If the USDT is not traced (which is presumed to be the common
+ * situation), the associated USDT semaphore is "inactive", and so there is no
+ * need to waste resources to prepare USDT arguments. Use
+ * USDT_IS_ACTIVE(group, name) to check whether the USDT is "active".
+ *
+ * N.B. There is an inherent (albeit short) gap between checking whether a
+ * USDT is active and triggering the corresponding USDT, in which an external
+ * tracer can be attached to the USDT and activate the USDT semaphore after
+ * the activity check. If such a race occurs, tracers might miss one USDT
+ * execution. Tracers are expected to accommodate such a possibility and this
+ * is expected not to be a problem for applications and tracers.
+ *
+ * N.B. The implicit USDT semaphore defined by USDT_WITH_SEMA() is contained
+ * within a single executable or shared library and is not shared outside
+ * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name
+ * identifier across an executable and a shared library, it will work and
+ * won't conflict, per se, but will define independent USDT semaphores, one
+ * for each shared library/executable in which USDT_WITH_SEMA(group, name) is
+ * used. That is, if you attach to this USDT in one shared library (or
+ * executable), then only the USDT semaphore within that shared library (or
+ * executable) will be updated by the kernel, while other libraries (or the
+ * executable) will not see an activated USDT semaphore. In short, it's best
+ * to use unique USDT group:name identifiers across different shared libraries
+ * (and, equivalently, between an executable and a shared library). This is an
+ * advanced consideration and is rarely (if ever) seen in practice, but it is
+ * called out here just to avoid surprises. (Static libraries become a part of
+ * the final executable, once linked by the linker, so the above
+ * considerations don't apply to them.)
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_SEMA(group, name, ...)				\
+	__usdt_probe(group, name,					\
+		     __usdt_sema_implicit, __usdt_sema_name(group, name) \
+		     __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_SEMA(group, name, ...)				\
+	__usdt_probe(group, name,					\
+		     __usdt_sema_implicit, __usdt_sema_name(group, name), \
+		     ##__VA_ARGS__)
+#endif
+
+struct usdt_sema { volatile unsigned short active; };
+
+/*
+ * Check if the USDT with `group`:`name` identifier is "active" (i.e., whether
+ * it is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into the given USDT. It is assumed that the USDT is triggered
+ * with the USDT_WITH_SEMA() macro, which will implicitly define the
+ * associated USDT semaphore. (If one needs more control over the USDT
+ * semaphore, see the USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros
+ * below.)
+ *
+ * N.B. Such checks are necessarily racy and speculative. Between checking
+ * whether a USDT is active and triggering the USDT itself, the tracer can be
+ * detached with no notification. This race should be extremely rare and the
+ * worst case should result in a one-time wasted extra data collection and
+ * processing.
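+ *
+ * As a minimal illustrative sketch (myapp, request_done, req_id and
+ * compute_request_cost() are made-up names, not part of this header):
+ *
+ *	if (USDT_IS_ACTIVE(myapp, request_done)) {
+ *		long cost_us = compute_request_cost(); // only paid when traced
+ *
+ *		USDT_WITH_SEMA(myapp, request_done, req_id, cost_us);
+ *	}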
+ */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which user needs + * an explicit control over (potentially shared across multiple USDTs) USDT + * semaphore instance. This can be used when there is a group of logically + * related USDTs that all need extra data collection and processing whenever + * any of a family of related USDTs are "activated" (i.e., traced). In such + * a case, all such related USDTs will be associated with the same shared USDT + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra + * USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. + */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in non-header source file to let compiler allocate + * space for the semaphore variable. Just like with any other global variable. + * + * This macro can be used anywhere where global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of user-defined USDT semaphore with given `sema` identifier, + * otherwise compiler or linker will complain about duplicate variable + * definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace + * and inside namespaces (including nested namespaces). Just make sure that + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is + * referenced, or any of its parent namespaces, so the C++ language-level + * identifier is visible to the code that needs to reference the semaphore. + * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. Due to USDT metadata being recorded in non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between main executable and shared libraries, or even between multiple + * shared libraries. USDT semaphore has to be contained to individual shared + * library or executable to avoid unpleasant surprises with half-working USDT + * semaphores. We enforce this by marking semaphore ELF symbols as having + * a hidden visibility. This is quite an advanced use case and consideration + * and for most users this should have no consequences whatsoever. 
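+ *
+ * For example (the file names and identifiers below are illustrative only):
+ *
+ *	// stats.c
+ *	USDT_DEFINE_SEMA(stats_sema);
+ *
+ *	// worker.c
+ *	USDT_DECLARE_SEMA(stats_sema);
+ *	...
+ *	if (USDT_SEMA_IS_ACTIVE(stats_sema))
+ *		USDT_WITH_EXPLICIT_SEMA(stats_sema, myapp, worker_stats, jobs_done);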
+ */
+#define USDT_DEFINE_SEMA(sema)						\
+	struct usdt_sema __usdt_sema_sec USDT_SEMA(sema)		\
+	__usdt_asm_name(USDT_SEMA(sema))				\
+	__attribute__((visibility("hidden"))) = { 0 }
+
+/*
+ * Declare an extern reference to user-defined USDT semaphore `sema`.
+ *
+ * Refers to a variable defined in another compilation unit by
+ * USDT_DEFINE_SEMA() and allows the same USDT semaphore to be used across
+ * multiple compilation units (i.e., .c and .cpp files).
+ *
+ * See the USDT_DEFINE_SEMA() notes above for C++ language usage
+ * peculiarities.
+ */
+#define USDT_DECLARE_SEMA(sema)						\
+	extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema))
+
+/*
+ * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it
+ * is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into the USDT(s) associated with USDT semaphore `sema`.
+ *
+ * N.B. Such checks are necessarily racy. Between checking the state of the
+ * USDT semaphore and triggering the associated USDT(s), a tracer might attach
+ * or detach. This race should be extremely rare and the worst case should
+ * result in a one-time missed USDT event or wasted extra data collection and
+ * processing. USDT-using tracers should be written with this in mind; it is
+ * not a concern of the application defining USDTs with an associated
+ * semaphore.
+ */
+#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0)
+
+/*
+ * Invoke the USDT specified by the `group` and `name` identifiers and
+ * associate the explicit user-defined semaphore `sema` with it. Pass through
+ * `args` as USDT arguments. `args` are optional and zero arguments are
+ * acceptable.
+ *
+ * The semaphore is defined with the help of the USDT_DEFINE_SEMA() macro and
+ * can be checked for activity with USDT_SEMA_IS_ACTIVE().
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...)			\
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...)			\
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__)
+#endif
+
+/*
+ * Adjustable implementation aspects
+ */
+#ifndef USDT_ARG_CONSTRAINT
+#if defined __powerpc__
+#define USDT_ARG_CONSTRAINT	nZr
+#elif defined __arm__
+#define USDT_ARG_CONSTRAINT	g
+#elif defined __loongarch__
+#define USDT_ARG_CONSTRAINT	nmr
+#else
+#define USDT_ARG_CONSTRAINT	nor
+#endif
+#endif /* USDT_ARG_CONSTRAINT */
+
+#ifndef USDT_NOP
+#if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
+#define USDT_NOP	nop 0
+#else
+#define USDT_NOP	nop
+#endif
+#endif /* USDT_NOP */
+
+/*
+ * Implementation details
+ */
+/* USDT name for the implicitly-defined USDT semaphore, derived from group:name */
+#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name
+/* ELF section into which USDT semaphores are put */
+#define __usdt_sema_sec __attribute__((section(".probes")))
+
+#define __usdt_concat(a, b) a ## b
+#define __usdt_apply(fn, n) __usdt_concat(fn, n)
+
+#ifndef __usdt_nth
+#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N
+#endif
+
+#ifndef __usdt_narg
+#ifdef __usdt_va_opt
+#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#else
+#define __usdt_narg(...)
__usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) __usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) 
do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. + */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? 
sizeof(void *) : sizeof(x))
+
+/*
+ * We can't use __builtin_choose_expr() in C++, so fall back to table-based
+ * signedness determination for known types, utilizing template magic.
+ */
+#ifdef __cplusplus
+
+#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed)
+
+#include <cstddef>
+
+template<typename T> struct __usdt_t { static const bool is_signed = false; };
+template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {};
+template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {};
+
+#define __usdt_def_signed(T)						\
+template<> struct __usdt_t<T> { static const bool is_signed = true; };	\
+template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \
+template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \
+template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; }
+#define __usdt_maybe_signed(T)						\
+template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; }
+
+__usdt_def_signed(signed char);
+__usdt_def_signed(short);
+__usdt_def_signed(int);
+__usdt_def_signed(long);
+__usdt_def_signed(long long);
+__usdt_maybe_signed(char);
+__usdt_maybe_signed(wchar_t);
+
+#else /* !__cplusplus */
+
+#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4)
+#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U))
+#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1)
+
+#endif /* __cplusplus */
+
+#define __usdt_asm_op(n, x)						\
+	[__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \
+	[__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x)
+
+#define __usdt_asm_ops0()	[__usdt_dummy] "g" (0)
+#define __usdt_asm_ops1(x)	__usdt_asm_op(1, x)
+#define __usdt_asm_ops2(a,x)	__usdt_asm_ops1(a), __usdt_asm_op(2, x)
+#define __usdt_asm_ops3(a,b,x)	__usdt_asm_ops2(a,b), __usdt_asm_op(3, x)
+#define __usdt_asm_ops4(a,b,c,x)	__usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x)
+#define __usdt_asm_ops5(a,b,c,d,x)	__usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x)
+#define __usdt_asm_ops6(a,b,c,d,e,x)	__usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x)
+#define __usdt_asm_ops7(a,b,c,d,e,f,x)	__usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x)
+#define __usdt_asm_ops8(a,b,c,d,e,f,g,x)	__usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x)
+#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x)	__usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x)
+#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x)	__usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x)
+#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x)
+#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x)
+#define __usdt_asm_ops(...)
__usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index b616575c3b00a5..ce13002c7a199b 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -93,7 +93,7 @@ .expected_attach_type = BPF_SK_LOOKUP, .result = VERBOSE_ACCEPT, .runs = -1, - .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8=-44\ 2: (c5) if r0 s< 0x0 goto pc+2\ - R0_w=-44", + R0=-44", }, diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index f3492efc88346e..c8d640802cce41 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1375,7 +1375,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* write into map value */ @@ -1439,7 +1439,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -1493,7 +1493,7 @@ /* second time with fp-16 */ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), - /* fetch secound map_value_ptr from the stack */ + /* fetch second map_value_ptr from the stack */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* write into map value */ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), @@ -2380,7 +2380,7 @@ */ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1), BPF_MOV64_REG(BPF_REG_9, BPF_REG_8), - /* r9 = *r9 ; verifier get's to this point via two paths: + /* r9 = *r9 ; verifier gets to this point via two paths: * ; (I) one including r9 = r8, verified first; * ; (II) one excluding r9 = r8, verified next. * ; After load of *r9 to r9 the frame[0].fp[-24].id == r9.id. 
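To tie together the usdt.h API added above, here is a minimal, self-contained usage sketch (the demo/tick identifiers and the build command are illustrative assumptions, not part of the patch):

	/* usdt_demo.c - build with: cc -I tools/testing/selftests/bpf -o usdt_demo usdt_demo.c */
	#include <unistd.h>
	#include "usdt.h"

	int main(void)
	{
		int i;

		for (i = 0; ; i++) {
			/* Semaphoreless USDT: compiles to a single NOP plus an
			 * ELF note; an external tracer can attach by the
			 * demo:tick identifier and read both arguments.
			 */
			USDT(demo, tick, i, getpid());
			sleep(1);
		}
		return 0;
	}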
diff --git a/tools/testing/selftests/bpf/verify_sig_setup.sh b/tools/testing/selftests/bpf/verify_sig_setup.sh index f2cac42298ba3b..09179fb551f09a 100755 --- a/tools/testing/selftests/bpf/verify_sig_setup.sh +++ b/tools/testing/selftests/bpf/verify_sig_setup.sh @@ -32,7 +32,7 @@ usage() exit 1 } -setup() +genkey() { local tmp_dir="$1" @@ -45,9 +45,14 @@ setup() openssl x509 -in ${tmp_dir}/signing_key.pem -out \ ${tmp_dir}/signing_key.der -outform der +} - key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) +setup() +{ + local tmp_dir="$1" + genkey "${tmp_dir}" + key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s) keyring_id=$(keyctl newring ebpf_testing_keyring @s) keyctl link $key_id $keyring_id } @@ -105,6 +110,8 @@ main() if [[ "${action}" == "setup" ]]; then setup "${tmp_dir}" + elif [[ "${action}" == "genkey" ]]; then + genkey "${tmp_dir}" elif [[ "${action}" == "cleanup" ]]; then cleanup "${tmp_dir}" elif [[ "${action}" == "fsverity-create-sign" ]]; then diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index d532dd82a3a83e..e962f133250c7d 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -181,6 +181,12 @@ struct var_preset { bool applied; }; +enum dump_mode { + DUMP_NONE = 0, + DUMP_XLATED = 1, + DUMP_JITED = 2, +}; + static struct env { char **filenames; int filename_cnt; @@ -227,6 +233,7 @@ static struct env { char orig_cgroup[PATH_MAX]; char stat_cgroup[PATH_MAX]; int memory_peak_fd; + __u32 dump_mode; } env; static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) @@ -271,6 +278,7 @@ const char argp_program_doc[] = enum { OPT_LOG_FIXED = 1000, OPT_LOG_SIZE = 1001, + OPT_DUMP = 1002, }; static const struct argp_option opts[] = { @@ -295,6 +303,7 @@ static const struct argp_option opts[] = { "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" }, { "top-src-lines", 'S', "N", 0, "Emit N most frequent source code lines" }, { "set-global-vars", 'G', "GLOBAL", 0, "Set global variables provided in the expression, for example \"var1 = 1\"" }, + { "dump", OPT_DUMP, "DUMP_MODE", OPTION_ARG_OPTIONAL, "Print BPF program dump (xlated, jited)" }, {}, }; @@ -427,6 +436,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return err; } break; + case OPT_DUMP: + if (!arg || strcasecmp(arg, "xlated") == 0) { + env.dump_mode |= DUMP_XLATED; + } else if (strcasecmp(arg, "jited") == 0) { + env.dump_mode |= DUMP_JITED; + } else { + fprintf(stderr, "Unrecognized dump mode '%s'\n", arg); + return -EINVAL; + } + break; default: return ARGP_ERR_UNKNOWN; } @@ -1554,6 +1573,36 @@ static int parse_rvalue(const char *val, struct rvalue *rvalue) return 0; } +static void dump(__u32 prog_id, enum dump_mode mode, const char *file_name, const char *prog_name) +{ + char command[64], buf[4096]; + FILE *fp; + int status; + + status = system("command -v bpftool > /dev/null 2>&1"); + if (status != 0) { + fprintf(stderr, "bpftool is not available, can't print program dump\n"); + return; + } + snprintf(command, sizeof(command), "bpftool prog dump %s id %u", + mode == DUMP_JITED ? "jited" : "xlated", prog_id); + fp = popen(command, "r"); + if (!fp) { + fprintf(stderr, "bpftool failed with error: %d\n", errno); + return; + } + + printf("DUMP (%s) %s/%s:\n", mode == DUMP_JITED ? 
"JITED" : "XLATED", file_name, prog_name); + while (fgets(buf, sizeof(buf), fp)) + fputs(buf, stdout); + fprintf(stdout, "\n"); + + if (ferror(fp)) + fprintf(stderr, "Failed to dump BPF prog with error: %d\n", errno); + + pclose(fp); +} + static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog) { const char *base_filename = basename(strdupa(filename)); @@ -1630,8 +1679,13 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf memset(&info, 0, info_len); fd = bpf_program__fd(prog); - if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) + if (fd > 0 && bpf_prog_get_info_by_fd(fd, &info, &info_len) == 0) { stats->stats[JITED_SIZE] = info.jited_prog_len; + if (env.dump_mode & DUMP_JITED) + dump(info.id, DUMP_JITED, base_filename, prog_name); + if (env.dump_mode & DUMP_XLATED) + dump(info.id, DUMP_XLATED, base_filename, prog_name); + } parse_verif_log(buf, buf_sz, stats); diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index 1503a1d2faa090..9ed8c796645d00 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -155,7 +155,7 @@ int main(int argc, char **argv) } if (!server) { - /* Only supports IPv4; see hints initiailization above. */ + /* Only supports IPv4; see hints initialization above. */ if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { fprintf(stderr, "Could not resolve %s\n", argv[optind]); return 1; diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 93c2cc413cfcd0..48729da142c249 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -93,8 +93,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) /* Refresh the local tail pointer. * cached_cons is r->size bigger than the real consumer pointer so * that this addition can be avoided in the more frequently - * executed code that computs free_entries in the beginning of - * this function. Without this optimization it whould have been + * executed code that computes free_entries in the beginning of + * this function. Without this optimization it would have been * free_entries = r->cached_prod - r->cached_cons + r->size. 
*/ r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index a29de0713f19f0..352adc8df2d1cd 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -2276,25 +2276,13 @@ static int testapp_xdp_metadata_copy(struct test_spec *test) { struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - struct bpf_map *data_map; - int count = 0; - int key = 0; test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, skel_tx->progs.xsk_xdp_populate_metadata, skel_rx->maps.xsk, skel_tx->maps.xsk); test->ifobj_rx->use_metadata = true; - data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); - if (!data_map || !bpf_map__is_internal(data_map)) { - ksft_print_msg("Error: could not find bss section of XDP program\n"); - return TEST_FAILURE; - } - - if (bpf_map_update_elem(bpf_map__fd(data_map), &key, &count, BPF_ANY)) { - ksft_print_msg("Error: could not update count element\n"); - return TEST_FAILURE; - } + skel_rx->bss->count = 0; return testapp_validate_traffic(test); } diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 0e89fcff4d05d3..44c52f620fda17 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -522,6 +522,18 @@ int proc_mount_contains(const char *option) return strstr(buf, option) != NULL; } +int cgroup_feature(const char *feature) +{ + char buf[PAGE_SIZE]; + ssize_t read; + + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); + if (read < 0) + return read; + + return strstr(buf, feature) != NULL; +} + ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index c69cab66254b41..9dc90a1b386d77 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup, extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); +int cgroup_feature(const char *feature); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); extern pid_t clone_into_cgroup(int cgroup_fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 8730645d363a73..dfb763819581fa 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -804,6 +804,662 @@ static int test_cgfreezer_vfork(const char *root) return ret; } +/* + * Get the current frozen_usec for the cgroup. + */ +static long cg_check_freezetime(const char *cgroup) +{ + return cg_read_key_long(cgroup, "cgroup.stat.local", + "frozen_usec "); +} + +/* + * Test that the freeze time will behave as expected for an empty cgroup. 
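+ * The time is read from the "frozen_usec" key of cgroup.stat.local and is
+ * reported in microseconds (e.g. "frozen_usec 21000"; illustrative value).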
+ */ +static int test_cgfreezer_time_empty(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_empty"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create an empty cgroup and check that its freeze time + * is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 2) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 3) Unfreeze the cgroup. Check that the freeze time is + * larger than at 2). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Check the freeze time again to ensure that it has not + * changed. + */ + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * A simple test for cgroup freezer time accounting. This test follows + * the same flow as test_cgfreezer_time_empty, but with a single process + * in the cgroup. + */ +static int test_cgfreezer_time_simple(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + + cgroup = cg_name(root, "cg_time_test_simple"); + if (!cgroup) + goto cleanup; + + /* + * 1) Create a cgroup and check that its freeze time is 0. + */ + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Populate the cgroup with one child and check that the + * freeze time is still 0. + */ + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr > prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + + /* + * 3) Sleep for 1000 us. Check that the freeze time is at + * least 1000 us. + */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr < 1000) { + debug("Expect time (%ld) to be at least 1000 us\n", + curr); + goto cleanup; + } + + /* + * 4) Unfreeze the cgroup. Check that the freeze time is + * larger than at 3). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Sleep for 1000 us. Check that the freeze time is the + * same as at 4). 
+ */ + usleep(1000); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that freezer time accounting works as expected, even while we're + * populating a cgroup with processes. + */ +static int test_cgfreezer_time_populate(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + long prev, curr; + int i; + + cgroup = cg_name(root, "cg_time_test_populate"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + curr = cg_check_freezetime(cgroup); + if (curr < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + if (curr > 0) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 1) Populate the cgroup with 100 processes. Check that + * the freeze time is 0. + */ + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 2) Wait for the group to become fully populated. Check + * that the freeze time is 0. + */ + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be 0\n", curr); + goto cleanup; + } + + /* + * 3) Freeze the cgroup and then populate it with 100 more + * processes. Check that the freeze time continues to grow. + */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + for (i = 0; i < 100; i++) + cg_run_nowait(cgroup, child_fn, NULL); + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 4) Wait for the group to become fully populated. Check + * that the freeze time is larger than at 3). + */ + if (cg_wait_for_proc_count(cgroup, 200)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 5) Unfreeze the cgroup. Check that the freeze time is + * larger than at 4). + */ + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 6) Kill the processes. Check that the freeze time is the + * same as it was at 5). + */ + if (cg_killall(cgroup)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr != prev) { + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + /* + * 7) Freeze and unfreeze the cgroup. Check that the freeze + * time is larger than it was at 6). 
+ */ + if (cg_freeze_nowait(cgroup, true)) + goto cleanup; + if (cg_freeze_nowait(cgroup, false)) + goto cleanup; + prev = curr; + curr = cg_check_freezetime(cgroup); + if (curr <= prev) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr, prev); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * Test that frozen time for a cgroup continues to work as expected, + * even as processes are migrated. Frozen cgroup A's freeze time should + * continue to increase and running cgroup B's should stay 0. + */ +static int test_cgfreezer_time_migrate(const char *root) +{ + long prev_A, curr_A, curr_B; + char *cgroup[2] = {0}; + int ret = KSFT_FAIL; + int pid; + + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); + if (!cgroup[1]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + pid = cg_run_nowait(cgroup[0], child_fn, NULL); + if (pid < 0) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup[0], 1)) + goto cleanup; + + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A) { + debug("Expect time (%ld) to be 0\n", curr_A); + goto cleanup; + } + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + /* + * Freeze cgroup A. + */ + if (cg_freeze_wait(cgroup[0], true)) + goto cleanup; + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be > 0\n", curr_A); + goto cleanup; + } + + /* + * Migrate from A (frozen) to B (running). + */ + if (cg_enter(cgroup[1], pid)) + goto cleanup; + + usleep(1000); + curr_B = cg_check_freezetime(cgroup[1]); + if (curr_B) { + debug("Expect time (%ld) to be 0\n", curr_B); + goto cleanup; + } + + prev_A = curr_A; + curr_A = cg_check_freezetime(cgroup[0]); + if (curr_A <= prev_A) { + debug("Expect time (%ld) to be more than previous check (%ld)\n", + curr_A, prev_A); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + if (cgroup[0]) + cg_destroy(cgroup[0]); + free(cgroup[0]); + if (cgroup[1]) + cg_destroy(cgroup[1]); + free(cgroup[1]); + return ret; +} + +/* + * The test creates a cgroup and freezes it. Then it creates a child cgroup. + * After that it checks that the child cgroup has a non-zero freeze time + * that is less than the parent's. Next, it freezes the child, unfreezes + * the parent, and sleeps. Finally, it checks that the child's freeze + * time has grown larger than the parent's. + */ +static int test_cgfreezer_time_parent(const char *root) +{ + char *parent, *child = NULL; + int ret = KSFT_FAIL; + long ptime, ctime; + + parent = cg_name(root, "cg_test_parent_A"); + if (!parent) + goto cleanup; + + child = cg_name(parent, "cg_test_parent_B"); + if (!child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_check_freezetime(parent) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_freeze_wait(parent, true)) + goto cleanup; + + usleep(1000); + if (cg_create(child)) + goto cleanup; + + if (cg_check_frozen(child, true)) + goto cleanup; + + /* + * Since the parent was frozen the entire time the child cgroup + * was being created, we expect the parent's freeze time to be + * larger than the child's. 
+ *
+	 * Ideally, we would be able to check both times simultaneously,
+	 * but here we get the child's after we get the parent's.
+	 */
+	ptime = cg_check_freezetime(parent);
+	ctime = cg_check_freezetime(child);
+	if (ptime <= ctime) {
+		debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime);
+		goto cleanup;
+	}
+
+	if (cg_freeze_nowait(child, true))
+		goto cleanup;
+
+	if (cg_freeze_wait(parent, false))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true))
+		goto cleanup;
+
+	usleep(100000);
+
+	ctime = cg_check_freezetime(child);
+	ptime = cg_check_freezetime(parent);
+
+	if (ctime <= ptime) {
+		debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates a parent cgroup and a child cgroup. Then, it freezes
+ * the child and checks that the child's freeze time is greater than the
+ * parent's, which should be zero.
+ */
+static int test_cgfreezer_time_child(const char *root)
+{
+	char *parent, *child = NULL;
+	int ret = KSFT_FAIL;
+	long ptime, ctime;
+
+	parent = cg_name(root, "cg_test_child_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_child_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_check_freezetime(parent) < 0) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_freeze_wait(child, true))
+		goto cleanup;
+
+	ctime = cg_check_freezetime(child);
+	ptime = cg_check_freezetime(parent);
+	if (ptime != 0) {
+		debug("Expect ptime (%ld) to be 0\n", ptime);
+		goto cleanup;
+	}
+
+	if (ctime <= ptime) {
+		debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ *       A
+ *       |
+ *       B
+ *       |
+ *       C
+ *
+ * Then it freezes the cgroups in the order C, B, A.
+ * Then it unfreezes the cgroups in the order A, B, C.
+ * Then it checks that C's freeze time is larger than B's and
+ * that B's is larger than A's.
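+ *
+ * (C is frozen first and unfrozen last, so it accumulates the longest
+ * frozen interval; A is frozen last and unfrozen first, so the shortest.)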
+ */ +static int test_cgfreezer_time_nested(const char *root) +{ + char *cgroup[3] = {0}; + int ret = KSFT_FAIL; + long time[3] = {0}; + int i; + + cgroup[0] = cg_name(root, "cg_test_time_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + if (cg_create(cgroup[0])) + goto cleanup; + + if (cg_check_freezetime(cgroup[0]) < 0) { + ret = KSFT_SKIP; + goto cleanup; + } + + if (cg_create(cgroup[1])) + goto cleanup; + + if (cg_create(cgroup[2])) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], true)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[0], true)) + goto cleanup; + + usleep(1000); + + if (cg_freeze_nowait(cgroup[0], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[1], false)) + goto cleanup; + + if (cg_freeze_nowait(cgroup[2], false)) + goto cleanup; + + time[2] = cg_check_freezetime(cgroup[2]); + time[1] = cg_check_freezetime(cgroup[1]); + time[0] = cg_check_freezetime(cgroup[0]); + + if (time[2] <= time[1]) { + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); + goto cleanup; + } + + if (time[1] <= time[0]) { + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); + goto cleanup; + } + + ret = KSFT_PASS; + +cleanup: + for (i = 2; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + #define T(x) { x, #x } struct cgfreezer_test { int (*fn)(const char *root); @@ -819,6 +1475,13 @@ struct cgfreezer_test { T(test_cgfreezer_stopped), T(test_cgfreezer_ptraced), T(test_cgfreezer_vfork), + T(test_cgfreezer_time_empty), + T(test_cgfreezer_time_simple), + T(test_cgfreezer_time_populate), + T(test_cgfreezer_time_migrate), + T(test_cgfreezer_time_parent), + T(test_cgfreezer_time_child), + T(test_cgfreezer_time_nested), }; #undef T diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c index 9ecb83c6cc5cbf..d8a1d1cd500727 100644 --- a/tools/testing/selftests/cgroup/test_pids.c +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -77,6 +77,9 @@ static int test_pids_events(const char *root) char *cg_parent = NULL, *cg_child = NULL; int pid; + if (cgroup_feature("pids_localevents") <= 0) + return KSFT_SKIP; + cg_parent = cg_name(root, "pids_parent"); cg_child = cg_name(cg_parent, "pids_child"); if (!cg_parent || !cg_child) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca729..187b478d0ddf29 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,8 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -352,8 +354,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +378,197 @@ num_grat_arp() done } 
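+
+# Helper checks for the fail_over_mac test below. The numeric modes used
+# there map to the names documented in Documentation/networking/bonding.rst:
+# 0 = none, 1 = active, 2 = follow.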
+check_all_mac_same()
+{
+	RET=0
+	# All slaves should have the same MAC address (the first port's MAC).
+	local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+	local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+	local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]')
+	local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]')
+	if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \
+	   [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then
+		RET=1
+	fi
+}
+
+check_bond_mac_same_with_first()
+{
+	RET=0
+	# The bond MAC address should be the same as the first added slave's.
+	local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+	if [ "$bond_mac" != "${mac[0]}" ]; then
+		RET=1
+	fi
+}
+
+check_bond_mac_same_with_active()
+{
+	RET=0
+	# The bond MAC address should be the same as the active slave's.
+	local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+	local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+	local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]')
+	if [ "$bond_mac" != "$active_slave_mac" ]; then
+		RET=1
+	fi
+}
+
+check_backup_slave_mac_not_change()
+{
+	RET=0
+	# The backup slaves' MAC addresses must not change.
+	if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[]
+	   | select(.linkinfo.info_slave_data.state=="BACKUP")
+	   | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then
+		RET=1
+	fi
+}
+
+check_backup_slave_mac_inherit()
+{
+	local backup_mac
+	RET=0
+
+	# Backup slaves should use mac[1] or mac[2].
+	local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \
+			    jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address')
+	for backup_mac in $backup_macs; do
+		if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then
+			RET=1
+		fi
+	done
+}
+
+check_first_slave_random_mac()
+{
+	RET=0
+	# Remove the first added slave and add it back.
+	ip -n "$s_ns" link set eth0 nomaster
+	ip -n "$s_ns" link set eth0 master bond0
+
+	# The first slave should now use a random MAC address.
+	eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+	[ "$eth0_mac" = "${mac[0]}" ] && RET=1
+	log_test "bond fail_over_mac follow" "random first slave mac"
+
+	# Remove the first slave; its permanent MAC address should be restored.
+	ip -n "$s_ns" link set eth0 nomaster
+	eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+	[ "$eth0_mac" != "${mac[0]}" ] && RET=1
+}
+
+do_active_backup_failover()
+{
+	local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+	ip -n ${s_ns} link set ${active_slave} down
+	slowwait 2 active_slave_changed $active_slave
+	ip -n ${s_ns} link set ${active_slave} up
+}
+
+fail_over_mac()
+{
+	# Bring down the first interface on the switch to force the bond to
+	# select another active interface instead of the first one that joined.
+ ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" +} + +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" +} + trap cleanup EXIT setup_prepare diff --git 
a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf167..167aa4a4a12a42 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b36..23a2932301cc0d 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 4d16a69ffc6507..832fa1caeb6627 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -10,3 +10,4 @@ CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index fcbdb1297e24e8..64ac0dfa46b7ef 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +fclog file_stressor anon_inode_test kernfs_test diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 73d4650af1a517..85427d7f19b9b5 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test +TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c new file mode 100644 index 00000000000000..912a8b755c3b63 --- /dev/null +++ b/tools/testing/selftests/filesystems/fclog.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai + * Copyright (C) 2025 SUSE LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? 
_tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +FIXTURE(ns) +{ + int host_mntns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); +} + +TEST_F(ns, fscontext_log_enodata) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + /* A brand new fscontext has no log entries. */ + char buf[128] = {}; + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_errorfc_after_fsmount) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NOSUID); + ASSERT_SUCCESS(mfd); + ASSERT_SUCCESS(move_mount(mfd, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)); + + /* + * The fscontext log should still contain data even after + * FSCONFIG_CMD_CREATE and fsmount(). + */ + char buf[128] = {}; + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. */ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_F(ns, fscontext_log_emsgsize) +{ + int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0)); + + char buf[128] = {}; + /* + * Attempting to read a message with too small a buffer should not + * result in the message getting consumed. + */ + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 0)); + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 1)); + for (int i = 0; i < 16; i++) + ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 16)); + + ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf))); + EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf); + + /* The message has been consumed. 
*/ + ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf))); + ASSERT_SUCCESS(close(fsfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 63ce708d93ed06..e4b7c2b457ee7a 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,20 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_cmds[] = { diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 090a5ca65004a0..9f57ca46e3afa0 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,21 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" -#include "../../pidfd/pidfd.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_types[] = { diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 8cfb87f7f7c505..490ace1f017e86 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,12 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 +PKG_CONFIG ?= pkg-config +LIBNUMA_TEST = $(shell sh -c "$(PKG_CONFIG) numa --atleast-version 2.0.16 > /dev/null 2>&1 && echo SUFFICIENT || echo NO") + INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) -CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) +CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread -D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64 $(INCLUDES) $(KHDR_INCLUDES) -DLIBNUMA_VER_$(LIBNUMA_TEST)=1 LDLIBS := -lpthread -lrt -lnuma LOCAL_HDRS := \ ../include/futextest.h \ - ../include/atomic.h \ - ../include/logging.h + ../include/atomic.h TEST_GEN_PROGS := \ futex_wait_timeout \ futex_wait_wouldblock \ diff --git a/tools/testing/selftests/futex/functional/futex_numa.c b/tools/testing/selftests/futex/functional/futex_numa.c index f29e4d627e7942..e0a33510ccb60c 100644 --- a/tools/testing/selftests/futex/functional/futex_numa.c +++ b/tools/testing/selftests/futex/functional/futex_numa.c @@ -5,9 +5,10 @@ #include #include #include +#include +#include #include #include -#include "logging.h" #include "futextest.h" #include "futex2test.h" diff --git 
a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index a9ecfb2d3932ad..d037a3f10ee85d 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -16,9 +16,9 @@ #include #include -#include "logging.h" #include "futextest.h" #include "futex2test.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -77,7 +77,7 @@ static void join_max_threads(void) } } -static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags) +static void __test_futex(void *futex_ptr, int err_value, unsigned int futex_flags) { int to_wake, ret, i, need_exit = 0; @@ -88,11 +88,17 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag do { ret = futex2_wake(futex_ptr, to_wake, futex_flags); - if (must_fail) { - if (ret < 0) - break; - ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", - to_wake, futex_flags); + + if (err_value) { + if (ret >= 0) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", + to_wake, futex_flags); + + if (errno != err_value) + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) expected error was %d, but returned %d (%s)\n", + to_wake, futex_flags, err_value, errno, strerror(errno)); + + break; } if (ret < 0) { ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n", @@ -106,12 +112,12 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag join_max_threads(); for (i = 0; i < MAX_THREADS; i++) { - if (must_fail && thread_args[i].result != -1) { + if (err_value && thread_args[i].result != -1) { ksft_print_msg("Thread %d should fail but succeeded (%d)\n", i, thread_args[i].result); need_exit = 1; } - if (!must_fail && thread_args[i].result != 0) { + if (!err_value && thread_args[i].result != 0) { ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result); need_exit = 1; } @@ -120,58 +126,30 @@ static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flag ksft_exit_fail_msg("Aborting due to earlier errors.\n"); } -static void test_futex(void *futex_ptr, int must_fail) +static void test_futex(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); } -static void test_futex_mpol(void *futex_ptr, int must_fail) +static void test_futex_mpol(void *futex_ptr, int err_value) { - __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); + __test_futex(futex_ptr, err_value, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_numa_mpol) { struct futex32_numa *futex_numa; - int mem_size, i; void *futex_ptr; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); + int mem_size; mem_size = sysconf(_SC_PAGE_SIZE); - futex_ptr = mmap(NULL, mem_size, 
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + futex_ptr = mmap(NULL, mem_size * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (futex_ptr == MAP_FAILED) ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size); + /* Create an invalid memory region for the "Memory out of range" test */ + mprotect(futex_ptr + mem_size, mem_size, PROT_NONE); + futex_numa = futex_ptr; ksft_print_msg("Regular test\n"); @@ -182,27 +160,31 @@ int main(int argc, char *argv[]) if (futex_numa->numa == FUTEX_NO_NODE) ksft_exit_fail_msg("NUMA node is left uninitialized\n"); - ksft_print_msg("Memory too small\n"); - test_futex(futex_ptr + mem_size - 4, 1); + /* FUTEX2_NUMA futex must be 8-byte aligned */ + ksft_print_msg("Mis-aligned futex\n"); + test_futex(futex_ptr + mem_size - 4, EINVAL); ksft_print_msg("Memory out of range\n"); - test_futex(futex_ptr + mem_size, 1); + test_futex(futex_ptr + mem_size, EFAULT); futex_numa->numa = FUTEX_NO_NODE; mprotect(futex_ptr, mem_size, PROT_READ); ksft_print_msg("Memory, RO\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_NONE); ksft_print_msg("Memory, no access\n"); - test_futex(futex_ptr, 1); + test_futex(futex_ptr, EFAULT); mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE); ksft_print_msg("Memory back to RW\n"); test_futex(futex_ptr, 0); + ksft_test_result_pass("futex2 memory boundary tests passed\n"); + /* MPOL test. Does not work as expected */ - for (i = 0; i < 4; i++) { +#ifdef LIBNUMA_VER_SUFFICIENT + for (int i = 0; i < 4; i++) { unsigned long nodemask; int ret; @@ -221,15 +203,17 @@ int main(int argc, char *argv[]) ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); if (ret < 0) ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n"); - if (0) - test_futex_mpol(futex_numa, 0); if (futex_numa->numa != i) { ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n", futex_numa->numa, i); } } } - ksft_test_result_pass("NUMA MPOL tests passed\n"); - ksft_finished(); - return 0; + ksft_test_result_pass("futex2 MPOL hints test passed\n"); +#else + ksft_test_result_skip("futex2 MPOL hints test requires libnuma 2.0.16+\n"); +#endif + munmap(futex_ptr, mem_size * 2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index aea001ac494604..3b7b5851f290fc 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -14,7 +14,7 @@ #include #include -#include "logging.h" +#include "../../kselftest_harness.h" #define MAX_THREADS 64 @@ -128,46 +128,14 @@ static void futex_dummy_op(void) ksft_exit_fail_msg("pthread_mutex_timedlock() did not timeout: %d.\n", ret); } -static void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -g Test global hash instead intead local immutable \n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n"; static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n"; -int main(int argc, char *argv[]) +TEST(priv_hash) { int futex_slots1, futex_slotsn, online_cpus; pthread_mutexattr_t mutex_attr_pi; int ret, retry = 20; - int c; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - 
switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(21); ret = pthread_mutexattr_init(&mutex_attr_pi); ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT); @@ -189,14 +157,14 @@ int main(int argc, char *argv[]) if (ret != 0) ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret); - /* First thread, has to initialiaze private hash */ + /* First thread, has to initialize private hash */ futex_slots1 = futex_hash_slots_get(); if (futex_slots1 <= 0) { ksft_print_msg("Current hash buckets: %d\n", futex_slots1); - ksft_exit_fail_msg(test_msg_auto_create); + ksft_exit_fail_msg("%s", test_msg_auto_create); } - ksft_test_result_pass(test_msg_auto_create); + ksft_test_result_pass("%s", test_msg_auto_create); online_cpus = sysconf(_SC_NPROCESSORS_ONLN); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); @@ -237,11 +205,11 @@ int main(int argc, char *argv[]) } ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n", futex_slots1, futex_slotsn); - ksft_exit_fail_msg(test_msg_auto_inc); + ksft_exit_fail_msg("%s", test_msg_auto_inc); } - ksft_test_result_pass(test_msg_auto_inc); + ksft_test_result_pass("%s", test_msg_auto_inc); } else { - ksft_test_result_skip(test_msg_auto_inc); + ksft_test_result_skip("%s", test_msg_auto_inc); } ret = pthread_mutex_unlock(&global_lock); @@ -257,17 +225,17 @@ int main(int argc, char *argv[]) futex_hash_slots_set_verify(2); join_max_threads(); - ksft_test_result(counter == MAX_THREADS, "Created of waited for %d of %d threads\n", + ksft_test_result(counter == MAX_THREADS, "Created and waited for %d of %d threads\n", counter, MAX_THREADS); counter = 0; - /* Once the user set something, auto reisze must be disabled */ + /* Once the user set something, auto resize must be disabled */ ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); - ksft_test_result(ret == 2, "No more auto-resize after manaul setting, got %d\n", + ksft_test_result(ret == 2, "No more auto-resize after manual setting, got %d\n", ret); futex_hash_slots_set_must_fail(1 << 29); @@ -280,7 +248,7 @@ int main(int argc, char *argv[]) ret = futex_hash_slots_set(0); ksft_test_result(ret == 0, "Global hash request\n"); if (ret != 0) - goto out; + return; futex_hash_slots_set_must_fail(4); futex_hash_slots_set_must_fail(8); @@ -289,17 +257,14 @@ int main(int argc, char *argv[]) futex_hash_slots_set_must_fail(6); ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); - if (ret != 0) { + if (ret != 0) ksft_exit_fail_msg("pthread_barrier_init failed: %m\n"); - return 1; - } + create_max_threads(thread_lock_fn); join_max_threads(); ret = futex_hash_slots_get(); ksft_test_result(ret == 0, "Continue to use global hash\n"); - -out: - ksft_finished(); - return 0; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 51485be6eb2f1b..69e2555b603991 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -7,24 +7,15 @@ #include #include -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue" #define timeout_ns 30000000 
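Editorial aside (an illustrative sketch, not part of this patch): the
futex_cmp_requeue() helper exercised by these tests wraps the raw
FUTEX_CMP_REQUEUE futex(2) operation, which wakes up to nr_wake waiters
blocked on uaddr and moves up to nr_requeue of the remaining waiters over
to uaddr2, but only if *uaddr still holds the expected value (otherwise
the call fails with EAGAIN). Assuming only <linux/futex.h> and syscall(2),
a stand-alone equivalent (wrapper name ours) looks roughly like this; the
return value is the number of waiters woken plus the number requeued:

	#include <linux/futex.h>
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static inline long
	raw_futex_cmp_requeue(uint32_t *uaddr, uint32_t expected,
			      uint32_t *uaddr2, int nr_wake, int nr_requeue)
	{
		/* The "timeout" argument slot carries nr_requeue for this op. */
		return syscall(SYS_futex, uaddr, FUTEX_CMP_REQUEUE, nr_wake,
			       nr_requeue, uaddr2, expected);
	}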
#define WAKE_WAIT_US 10000 volatile futex_t *f1; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -38,67 +29,49 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(requeue_single) { - pthread_t waiter[10]; - int res, ret = RET_PASS; - int c, i; volatile futex_t _f1 = 0; volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res; f1 = &_f1; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test futex_requeue\n", - basename(argv[0])); - /* * Requeue a waiter from f1 to f2, and wake f2. */ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Requeuing 1 futex from f1 to f2\n"); + ksft_print_dbg_msg("Requeuing 1 futex from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); - if (res != 1) { + if (res != 1) ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; - } - - info("Waking 1 futex at f2\n"); + ksft_print_dbg_msg("Waking 1 futex at f2\n"); res = futex_wake(&f2, 1, 0); if (res != 1) { ksft_test_result_fail("futex_requeue simple returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue simple succeeds\n"); } +} + +TEST(requeue_multiple) +{ + volatile futex_t _f1 = 0; + volatile futex_t f2 = 0; + pthread_t waiter[10]; + int res, i; + f1 = &_f1; /* * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. @@ -106,31 +79,28 @@ int main(int argc, char *argv[]) */ for (i = 0; i < 10; i++) { if (pthread_create(&waiter[i], NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); } usleep(WAKE_WAIT_US); - info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); + ksft_print_dbg_msg("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); if (res != 10) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } - info("Waking INT_MAX futexes at f2\n"); + ksft_print_dbg_msg("Waking INT_MAX futexes at f2\n"); res = futex_wake(&f2, INT_MAX, 0); if (res != 7) { ksft_test_result_fail("futex_requeue many returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_requeue many succeeds\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 215c6cb539b4ab..f299d75848cd43 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -26,11 +26,11 @@ #include #include #include + #include "atomic.h" #include "futextest.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-requeue-pi" #define MAX_WAKE_ITERS 1000 #define THREAD_MAX 10 #define SIGNAL_PERIOD_US 100 @@ -42,12 +42,6 @@ futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; futex_t wake_complete = FUTEX_INITIALIZER; -/* Test option defaults */ -static long timeout_ns; -static int broadcast; -static int owner; -static int locked; - struct thread_arg { long id; struct timespec *timeout; @@ -56,18 +50,73 @@ struct thread_arg { }; #define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 } -void usage(char *prog) +FIXTURE(args) { - printf("Usage: %s\n", prog); - printf(" -b Broadcast wakeup (all waiters)\n"); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -l Lock the pi futex across requeue\n"); - printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n"); - printf(" -t N Timeout in nanoseconds (default: 0)\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} +}; + +FIXTURE_SETUP(args) +{ +}; + +FIXTURE_TEARDOWN(args) +{ +}; + +FIXTURE_VARIANT(args) +{ + long timeout_ns; + bool broadcast; + bool owner; + bool locked; +}; + +/* + * For a given timeout value, this macro creates a test input with all the + * possible combinations of valid arguments + */ +#define FIXTURE_VARIANT_ADD_TIMEOUT(timeout) \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout) \ +{ \ + .timeout_ns = timeout, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_locked) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_broadcast_owner) \ +{ \ + .timeout_ns = timeout, \ + .broadcast = true, \ + .owner = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_locked) \ +{ \ + .timeout_ns = timeout, \ + .locked = true, \ +}; \ + \ +FIXTURE_VARIANT_ADD(args, t_##timeout##_owner) \ +{ \ + .timeout_ns = timeout, \ + .owner = true, \ +}; \ + +FIXTURE_VARIANT_ADD_TIMEOUT(0); +FIXTURE_VARIANT_ADD_TIMEOUT(5000); +FIXTURE_VARIANT_ADD_TIMEOUT(500000); +FIXTURE_VARIANT_ADD_TIMEOUT(2000000000); int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, int policy, int prio) @@ -81,26 +130,26 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); if (ret) { - error("pthread_attr_setinheritsched\n", ret); + ksft_exit_fail_msg("pthread_attr_setinheritsched\n"); return -1; } ret = pthread_attr_setschedpolicy(&attr, policy); if (ret) { - error("pthread_attr_setschedpolicy\n", ret); + ksft_exit_fail_msg("pthread_attr_setschedpolicy\n"); return -1; } schedp.sched_priority = prio; ret = pthread_attr_setschedparam(&attr, &schedp); if (ret) { - error("pthread_attr_setschedparam\n", ret); + 
ksft_exit_fail_msg("pthread_attr_setschedparam\n"); return -1; } ret = pthread_create(pth, &attr, func, arg); if (ret) { - error("pthread_create\n", ret); + ksft_exit_fail_msg("pthread_create\n"); return -1; } return 0; @@ -112,7 +161,7 @@ void *waiterfn(void *arg) struct thread_arg *args = (struct thread_arg *)arg; futex_t old_val; - info("Waiter %ld: running\n", args->id); + ksft_print_dbg_msg("Waiter %ld: running\n", args->id); /* Each thread sleeps for a different amount of time * This is to avoid races, because we don't lock the * external mutex here */ @@ -120,26 +169,25 @@ void *waiterfn(void *arg) old_val = f1; atomic_inc(&waiters_blocked); - info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", + ksft_print_dbg_msg("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", &f1, f1, &f2); args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout, FUTEX_PRIVATE_FLAG); - info("waiter %ld woke with %d %s\n", args->id, args->ret, + ksft_print_dbg_msg("waiter %ld woke with %d %s\n", args->id, args->ret, args->ret < 0 ? strerror(errno) : ""); atomic_inc(&waiters_woken); if (args->ret < 0) { if (args->timeout && errno == ETIMEDOUT) args->ret = 0; else { - args->ret = RET_ERROR; - error("futex_wait_requeue_pi\n", errno); + ksft_exit_fail_msg("futex_wait_requeue_pi\n"); } futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("Waiter %ld: exiting with %d\n", args->id, args->ret); + ksft_print_dbg_msg("Waiter %ld: exiting with %d\n", args->id, args->ret); pthread_exit((void *)&args->ret); } @@ -152,14 +200,14 @@ void *broadcast_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); - info("Waker: Calling broadcast\n"); + ksft_print_dbg_msg("Waker: Calling broadcast\n"); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } continue_requeue: @@ -167,16 +215,14 @@ void *broadcast_wakerfn(void *arg) args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue, FUTEX_PRIVATE_FLAG); if (args->ret < 0) { - args->ret = RET_ERROR; - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } else if (++i < MAX_WAKE_ITERS) { task_count += args->ret; if (task_count < THREAD_MAX - waiters_woken.val) goto continue_requeue; } else { - error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", - 0, MAX_WAKE_ITERS, task_count, THREAD_MAX); - args->ret = RET_ERROR; + ksft_exit_fail_msg("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", + MAX_WAKE_ITERS, task_count, THREAD_MAX); } futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); @@ -187,7 +233,7 @@ void *broadcast_wakerfn(void *arg) if (args->ret > 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); pthread_exit((void *)&args->ret); } @@ -200,20 +246,20 @@ void *signal_wakerfn(void *arg) int nr_wake = 1; int i = 0; - info("Waker: waiting for waiters to block\n"); + ksft_print_dbg_msg("Waker: waiting for waiters to block\n"); while (waiters_blocked.val < THREAD_MAX) usleep(1000); usleep(1000); while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) { - info("task_count: %d, waiters_woken: %d\n", + 
ksft_print_dbg_msg("task_count: %d, waiters_woken: %d\n", task_count, waiters_woken.val); if (args->lock) { - info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); } - info("Waker: Calling signal\n"); + ksft_print_dbg_msg("Waker: Calling signal\n"); /* cond_signal */ old_val = f1; args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, @@ -221,28 +267,23 @@ void *signal_wakerfn(void *arg) FUTEX_PRIVATE_FLAG); if (args->ret < 0) args->ret = -errno; - info("futex: %x\n", f2); + ksft_print_dbg_msg("futex: %x\n", f2); if (args->lock) { - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", - f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", + f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); } - info("futex: %x\n", f2); - if (args->ret < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - args->ret = RET_ERROR; - break; - } + ksft_print_dbg_msg("futex: %x\n", f2); + if (args->ret < 0) + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); task_count += args->ret; usleep(SIGNAL_PERIOD_US); i++; /* we have to loop at least THREAD_MAX times */ if (i > MAX_WAKE_ITERS + THREAD_MAX) { - error("max signaling iterations (%d) reached, giving up on pending waiters.\n", - 0, MAX_WAKE_ITERS + THREAD_MAX); - args->ret = RET_ERROR; - break; + ksft_exit_fail_msg("max signaling iterations (%d) reached, giving up on pending waiters.\n", + MAX_WAKE_ITERS + THREAD_MAX); } } @@ -251,8 +292,8 @@ void *signal_wakerfn(void *arg) if (args->ret >= 0) args->ret = task_count; - info("Waker: exiting with %d\n", args->ret); - info("Waker: waiters_woken: %d\n", waiters_woken.val); + ksft_print_dbg_msg("Waker: exiting with %d\n", args->ret); + ksft_print_dbg_msg("Waker: waiters_woken: %d\n", waiters_woken.val); pthread_exit((void *)&args->ret); } @@ -269,35 +310,40 @@ void *third_party_blocker(void *arg) ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); out: - if (args->ret || ret2) { - error("third_party_blocker() futex error", 0); - args->ret = RET_ERROR; - } + if (args->ret || ret2) + ksft_exit_fail_msg("third_party_blocker() futex error"); pthread_exit((void *)&args->ret); } -int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) +TEST_F(args, futex_requeue_pi) { - void *(*wakerfn)(void *) = signal_wakerfn; struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER; struct thread_arg waker_arg = THREAD_ARG_INITIALIZER; pthread_t waiter[THREAD_MAX], waker, blocker; - struct timespec ts, *tsp = NULL; + void *(*wakerfn)(void *) = signal_wakerfn; + bool third_party_owner = variant->owner; + long timeout_ns = variant->timeout_ns; + bool broadcast = variant->broadcast; struct thread_arg args[THREAD_MAX]; - int *waiter_ret; - int i, ret = RET_PASS; + struct timespec ts, *tsp = NULL; + bool lock = variant->locked; + int *waiter_ret, i, ret = 0; + + ksft_print_msg( + "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", + broadcast, lock, third_party_owner, timeout_ns); if (timeout_ns) { time_t secs; - info("timeout_ns = %ld\n", timeout_ns); + ksft_print_dbg_msg("timeout_ns = %ld\n", timeout_ns); ret = clock_gettime(CLOCK_MONOTONIC, &ts); secs = (ts.tv_nsec + timeout_ns) / 1000000000; ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000; ts.tv_sec += secs; - info("ts.tv_sec = %ld\n", ts.tv_sec); - info("ts.tv_nsec = %ld\n", ts.tv_nsec); + ksft_print_dbg_msg("ts.tv_sec = %ld\n", ts.tv_sec); + ksft_print_dbg_msg("ts.tv_nsec = %ld\n", 
ts.tv_nsec); tsp = &ts; } @@ -307,10 +353,7 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) if (third_party_owner) { if (create_rt_thread(&blocker, third_party_blocker, (void *)&blocker_arg, SCHED_FIFO, 1)) { - error("Creating third party blocker thread failed\n", - errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating third party blocker thread failed\n"); } } @@ -318,20 +361,16 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) for (i = 0; i < THREAD_MAX; i++) { args[i].id = i; args[i].timeout = tsp; - info("Starting thread %d\n", i); + ksft_print_dbg_msg("Starting thread %d\n", i); if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i], SCHED_FIFO, 1)) { - error("Creating waiting thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waiting thread failed\n"); } } waker_arg.lock = lock; if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg, SCHED_FIFO, 1)) { - error("Creating waker thread failed\n", errno); - ret = RET_ERROR; - goto out; + ksft_exit_fail_msg("Creating waker thread failed\n"); } /* Wait for threads to finish */ @@ -345,7 +384,6 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) pthread_join(blocker, NULL); pthread_join(waker, NULL); -out: if (!ret) { if (*waiter_ret) ret = *waiter_ret; @@ -355,66 +393,8 @@ int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) ret = blocker_arg.ret; } - return ret; + if (ret) + ksft_test_result_fail("fail"); } -int main(int argc, char *argv[]) -{ - char *test_name; - int c, ret; - - while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { - switch (c) { - case 'b': - broadcast = 1; - break; - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'l': - locked = 1; - break; - case 'o': - owner = 1; - locked = 0; - break; - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0])); - ksft_print_msg( - "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", - broadcast, locked, owner, timeout_ns); - - ret = asprintf(&test_name, - "%s broadcast=%d locked=%d owner=%d timeout=%ldns", - TEST_NAME, broadcast, locked, owner, timeout_ns); - if (ret < 0) { - ksft_print_msg("Failed to generate test name\n"); - test_name = TEST_NAME; - } - - /* - * FIXME: unit_test is obsolete now that we parse options and the - * various style of runs are done by run.sh - simplify the code and move - * unit_test into main() - */ - ret = unit_test(broadcast, locked, owner, timeout_ns); - - print_result(test_name, ret); - return ret; -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c index d0a4d332ea4413..77135a22a583e1 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -23,67 +23,32 @@ #include #include #include -#include "futextest.h" -#include "logging.h" -#define TEST_NAME "futex-requeue-pi-mismatched-ops" +#include "futextest.h" +#include "../../kselftest_harness.h" futex_t f1 = FUTEX_INITIALIZER; futex_t f2 = FUTEX_INITIALIZER; int child_ret = 0; -void usage(char *prog) -{ - 
printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *blocking_child(void *arg) { child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG); if (child_ret < 0) { child_ret = -errno; - error("futex_wait\n", errno); + ksft_exit_fail_msg("futex_wait\n"); } return (void *)&child_ret; } -int main(int argc, char *argv[]) +TEST(requeue_pi_mismatched_ops) { - int ret = RET_PASS; pthread_t child; - int c; + int ret; - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Detect mismatched requeue_pi operations\n", - basename(argv[0])); + if (pthread_create(&child, NULL, blocking_child, NULL)) + ksft_exit_fail_msg("pthread_create\n"); - if (pthread_create(&child, NULL, blocking_child, NULL)) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } /* Allow the child to block in the kernel. */ sleep(1); @@ -102,34 +67,27 @@ int main(int argc, char *argv[]) * FUTEX_WAKE. */ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG); - if (ret == 1) { - ret = RET_PASS; - } else if (ret < 0) { - error("futex_wake\n", errno); - ret = RET_ERROR; - } else { - error("futex_wake did not wake the child\n", 0); - ret = RET_ERROR; - } + if (ret == 1) + ret = 0; + else if (ret < 0) + ksft_exit_fail_msg("futex_wake\n"); + else + ksft_exit_fail_msg("futex_wake did not wake the child\n"); } else { - error("futex_cmp_requeue_pi\n", errno); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi\n"); } } else if (ret > 0) { - fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); - ret = RET_FAIL; + ksft_test_result_fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); } else { - error("futex_cmp_requeue_pi found no waiters\n", 0); - ret = RET_ERROR; + ksft_exit_fail_msg("futex_cmp_requeue_pi found no waiters\n"); } pthread_join(child, NULL); - if (!ret) - ret = child_ret; - - out: - /* If the kernel crashes, we shouldn't return at all. 
*/
-	print_result(TEST_NAME, ret);
-	return ret;
+	if (!ret && !child_ret)
+		ksft_test_result_pass("futex_requeue_pi_mismatched_ops passed\n");
+	else
+		ksft_test_result_fail("futex_requeue_pi_mismatched_ops failed\n");
 }
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
index c6b8f32990c875..e34ee0f9ebccdb 100644
--- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
@@ -24,11 +24,11 @@
 #include
 #include
 #include
+
 #include "atomic.h"
 #include "futextest.h"
-#include "logging.h"
+#include "../../kselftest_harness.h"
 
-#define TEST_NAME "futex-requeue-pi-signal-restart"
 #define DELAY_US 100
 
 futex_t f1 = FUTEX_INITIALIZER;
@@ -37,15 +37,6 @@ atomic_t requeued = ATOMIC_INITIALIZER;
 
 int waiter_ret = 0;
 
-void usage(char *prog)
-{
-	printf("Usage: %s\n", prog);
-	printf(" -c Use color\n");
-	printf(" -h Display this help message\n");
-	printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
-	       VQUIET, VCRITICAL, VINFO);
-}
-
 int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
 		     int policy, int prio)
 {
@@ -57,35 +48,28 @@ int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
 	memset(&schedp, 0, sizeof(schedp));
 
 	ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
-	if (ret) {
-		error("pthread_attr_setinheritsched\n", ret);
-		return -1;
-	}
+	if (ret)
+		ksft_exit_fail_msg("pthread_attr_setinheritsched\n");
 
 	ret = pthread_attr_setschedpolicy(&attr, policy);
-	if (ret) {
-		error("pthread_attr_setschedpolicy\n", ret);
-		return -1;
-	}
+	if (ret)
+		ksft_exit_fail_msg("pthread_attr_setschedpolicy\n");
 
 	schedp.sched_priority = prio;
 	ret = pthread_attr_setschedparam(&attr, &schedp);
-	if (ret) {
-		error("pthread_attr_setschedparam\n", ret);
-		return -1;
-	}
+	if (ret)
+		ksft_exit_fail_msg("pthread_attr_setschedparam\n");
 
 	ret = pthread_create(pth, &attr, func, arg);
-	if (ret) {
-		error("pthread_create\n", ret);
-		return -1;
-	}
+	if (ret)
+		ksft_exit_fail_msg("pthread_create\n");
+
 	return 0;
 }
 
 void handle_signal(int signo)
 {
-	info("signal received %s requeue\n",
+	ksft_print_dbg_msg("signal received %s requeue\n", requeued.val ?
"after" : "prior to"); } @@ -94,78 +78,46 @@ void *waiterfn(void *arg) unsigned int old_val; int res; - waiter_ret = RET_PASS; - - info("Waiter running\n"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Waiter running\n"); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); old_val = f1; res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL, FUTEX_PRIVATE_FLAG); if (!requeued.val || errno != EWOULDBLOCK) { - fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", + ksft_test_result_fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", res, strerror(errno)); - info("w2:futex: %x\n", f2); + ksft_print_dbg_msg("w2:futex: %x\n", f2); if (!res) futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - waiter_ret = RET_FAIL; } - info("Waiter exiting with %d\n", waiter_ret); pthread_exit(NULL); } -int main(int argc, char *argv[]) +TEST(futex_requeue_pi_signal_restart) { unsigned int old_val; struct sigaction sa; pthread_t waiter; - int c, res, ret = RET_PASS; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test signal handling during requeue_pi\n", - basename(argv[0])); - ksft_print_msg("\tArguments: \n"); + int res; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; - if (sigaction(SIGUSR1, &sa, NULL)) { - error("sigaction\n", errno); - exit(1); - } + if (sigaction(SIGUSR1, &sa, NULL)) + ksft_exit_fail_msg("sigaction\n"); - info("m1:f2: %x\n", f2); - info("Creating waiter\n"); + ksft_print_dbg_msg("m1:f2: %x\n", f2); + ksft_print_dbg_msg("Creating waiter\n"); res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1); - if (res) { - error("Creating waiting thread failed", res); - ret = RET_ERROR; - goto out; - } + if (res) + ksft_exit_fail_msg("Creating waiting thread failed"); - info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); - info("m2:f2: %x\n", f2); + ksft_print_dbg_msg("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("m2:f2: %x\n", f2); futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG); - info("m3:f2: %x\n", f2); + ksft_print_dbg_msg("m3:f2: %x\n", f2); while (1) { /* @@ -173,11 +125,11 @@ int main(int argc, char *argv[]) * restart futex_wait_requeue_pi() in the kernel. Wait for the * waiter to block on f1 again. */ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); usleep(DELAY_US); - info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); + ksft_print_dbg_msg("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); old_val = f1; res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0, FUTEX_PRIVATE_FLAG); @@ -191,12 +143,10 @@ int main(int argc, char *argv[]) atomic_set(&requeued, 1); break; } else if (res < 0) { - error("FUTEX_CMP_REQUEUE_PI failed\n", errno); - ret = RET_ERROR; - break; + ksft_exit_fail_msg("FUTEX_CMP_REQUEUE_PI failed\n"); } } - info("m4:f2: %x\n", f2); + ksft_print_dbg_msg("m4:f2: %x\n", f2); /* * Signal the waiter after requeue, waiter should return from @@ -204,19 +154,14 @@ int main(int argc, char *argv[]) * futex_unlock_pi() can't happen before the signal wakeup is detected * in the kernel. 
*/ - info("Issuing SIGUSR1 to waiter\n"); + ksft_print_dbg_msg("Issuing SIGUSR1 to waiter\n"); pthread_kill(waiter, SIGUSR1); - info("Waiting for waiter to return\n"); + ksft_print_dbg_msg("Waiting for waiter to return\n"); pthread_join(waiter, NULL); - info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); + ksft_print_dbg_msg("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); - info("m5:f2: %x\n", f2); - - out: - if (ret == RET_PASS && waiter_ret) - ret = waiter_ret; - - print_result(TEST_NAME, ret); - return ret; + ksft_print_dbg_msg("m5:f2: %x\n", f2); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c index 685140d9b93d2f..152ca46128866f 100644 --- a/tools/testing/selftests/futex/functional/futex_wait.c +++ b/tools/testing/selftests/futex/functional/futex_wait.c @@ -9,25 +9,16 @@ #include #include #include -#include "logging.h" + #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define timeout_ns 30000000 #define WAKE_WAIT_US 10000 #define SHM_PATH "futex_shm_file" void *futex; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - static void *waiterfn(void *arg) { struct timespec to; @@ -45,53 +36,37 @@ static void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_futex) { - int res, ret = RET_PASS, fd, c, shm_id; - u_int32_t f_private = 0, *shared_data; unsigned int flags = FUTEX_PRIVATE_FLAG; + u_int32_t f_private = 0; pthread_t waiter; - void *shm; + int res; futex = &f_private; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(3); - ksft_print_msg("%s: Test futex_wait\n", basename(argv[0])); - /* Testing a private futex */ - info("Calling private futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, (void *) &flags)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling private futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling private futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, FUTEX_PRIVATE_FLAG); if (res != 1) { ksft_test_result_fail("futex_wake private returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake private succeeds\n"); } +} + +TEST(anon_page) +{ + u_int32_t *shared_data; + pthread_t waiter; + int res, shm_id; /* Testing an anon page shared memory */ shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); @@ -105,67 +80,65 @@ int main(int argc, char *argv[]) *shared_data = 0; futex = shared_data; - info("Calling shared (page anon) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (page anon) futex_wake on futex: 
%p\n", futex); + ksft_print_dbg_msg("Calling shared (page anon) futex_wake on futex: %p\n", futex); res = futex_wake(futex, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (page anon) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (page anon) succeeds\n"); } + shmdt(shared_data); +} + +TEST(file_backed) +{ + u_int32_t f_private = 0; + pthread_t waiter; + int res, fd; + void *shm; /* Testing a file backed shared memory */ fd = open(SHM_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); - if (fd < 0) { - perror("open"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_msg("open"); - if (ftruncate(fd, sizeof(f_private))) { - perror("ftruncate"); - exit(1); - } + if (ftruncate(fd, sizeof(f_private))) + ksft_exit_fail_msg("ftruncate"); shm = mmap(NULL, sizeof(f_private), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (shm == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (shm == MAP_FAILED) + ksft_exit_fail_msg("mmap"); memcpy(shm, &f_private, sizeof(f_private)); futex = shm; - info("Calling shared (file backed) futex_wait on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wait on futex: %p\n", futex); if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); - info("Calling shared (file backed) futex_wake on futex: %p\n", futex); + ksft_print_dbg_msg("Calling shared (file backed) futex_wake on futex: %p\n", futex); res = futex_wake(shm, 1, 0); if (res != 1) { ksft_test_result_fail("futex_wake shared (file backed) returned: %d %s\n", errno, strerror(errno)); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wake shared (file backed) succeeds\n"); } - /* Freeing resources */ - shmdt(shared_data); munmap(shm, sizeof(f_private)); remove(SHM_PATH); close(fd); - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c index fb4148f23fa372..8952ebda14ab8a 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -27,10 +27,9 @@ #include #include -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-private-mapped-file" #define PAGE_SZ 4096 char pad[PAGE_SZ] = {1}; @@ -40,86 +39,44 @@ char pad2[PAGE_SZ] = {1}; #define WAKE_WAIT_US 3000000 struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *thr_futex_wait(void *arg) { int ret; - info("futex wait\n"); + ksft_print_dbg_msg("futex wait\n"); ret = futex_wait(&val, 1, &wait_timeout, 0); - if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) { - error("futex error.\n", errno); - print_result(TEST_NAME, RET_ERROR); - exit(RET_ERROR); - } + if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) + ksft_exit_fail_msg("futex error.\n"); if (ret && errno == ETIMEDOUT) - fail("waiter timedout\n"); + ksft_exit_fail_msg("waiter timedout\n"); - info("futex_wait: ret = %d, errno = %d\n", ret, errno); + ksft_print_dbg_msg("futex_wait: ret = %d, errno = %d\n", ret, errno); 
	return NULL;
 }
 
-int main(int argc, char **argv)
+TEST(wait_private_mapped_file)
 {
 	pthread_t thr;
-	int ret = RET_PASS;
 	int res;
-	int c;
-
-	while ((c = getopt(argc, argv, "chv:")) != -1) {
-		switch (c) {
-		case 'c':
-			log_color(1);
-			break;
-		case 'h':
-			usage(basename(argv[0]));
-			exit(0);
-		case 'v':
-			log_verbosity(atoi(optarg));
-			break;
-		default:
-			usage(basename(argv[0]));
-			exit(1);
-		}
-	}
-
-	ksft_print_header();
-	ksft_set_plan(1);
-	ksft_print_msg(
-		"%s: Test the futex value of private file mappings in FUTEX_WAIT\n",
-		basename(argv[0]));
-
-	ret = pthread_create(&thr, NULL, thr_futex_wait, NULL);
-	if (ret < 0) {
-		fprintf(stderr, "pthread_create error\n");
-		ret = RET_ERROR;
-		goto out;
-	}
-
-	info("wait a while\n");
+
+	res = pthread_create(&thr, NULL, thr_futex_wait, NULL);
+	if (res)
+		ksft_exit_fail_msg("pthread_create error\n");
+
+	ksft_print_dbg_msg("wait a while\n");
 	usleep(WAKE_WAIT_US);
 
 	val = 2;
 	res = futex_wake(&val, 1, 0);
-	info("futex_wake %d\n", res);
-	if (res != 1) {
-		fail("FUTEX_WAKE didn't find the waiting thread.\n");
-		ret = RET_FAIL;
-	}
+	ksft_print_dbg_msg("futex_wake %d\n", res);
+	if (res != 1)
+		ksft_exit_fail_msg("FUTEX_WAKE didn't find the waiting thread.\n");
 
-	info("join\n");
+	ksft_print_dbg_msg("join\n");
 	pthread_join(thr, NULL);
 
-out:
-	print_result(TEST_NAME, ret);
-	return ret;
+	ksft_test_result_pass("wait_private_mapped_file\n");
 }
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
index d183f878360bcd..0c8766aced2e4d 100644
--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c
+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
@@ -16,26 +16,15 @@
 *****************************************************************************/
 
 #include
+
 #include "futextest.h"
 #include "futex2test.h"
-#include "logging.h"
-
-#define TEST_NAME "futex-wait-timeout"
+#include "../../kselftest_harness.h"
 
 static long timeout_ns = 100000;	/* 100us default timeout */
 static futex_t futex_pi;
 static pthread_barrier_t barrier;
 
-void usage(char *prog)
-{
-	printf("Usage: %s\n", prog);
-	printf(" -c Use color\n");
-	printf(" -h Display this help message\n");
-	printf(" -t N Timeout in nanoseconds (default: 100,000)\n");
-	printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
-	       VQUIET, VCRITICAL, VINFO);
-}
-
 /*
  * Get a PI lock and hold it forever, so the main thread lock_pi will block
  * and we can test the timeout
 */
@@ -47,13 +36,13 @@ void *get_pi_lock(void *arg)
 {
 	ret = futex_lock_pi(&futex_pi, NULL, 0, 0);
 	if (ret != 0)
-		error("futex_lock_pi failed\n", ret);
+		ksft_exit_fail_msg("futex_lock_pi failed\n");
 
 	pthread_barrier_wait(&barrier);
 
 	/* Blocks forever */
 	ret = futex_wait(&lock, 0, NULL, 0);
-	error("futex_wait failed\n", ret);
+	ksft_exit_fail_msg("futex_wait failed\n");
 
 	return NULL;
 }
 
@@ -61,12 +50,11 @@
 /*
  * Check if the function returned the expected error
 */
-static void test_timeout(int res, int *ret, char *test_name, int err)
+static void test_timeout(int res, char *test_name, int err)
 {
 	if (!res || errno != err) {
 		ksft_test_result_fail("%s returned %d\n",
 				      test_name, res < 0 ?
errno : res); - *ret = RET_FAIL; } else { ksft_test_result_pass("%s succeeds\n", test_name); } @@ -78,10 +66,8 @@ static void test_timeout(int res, int *ret, char *test_name, int err) static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, long timeout_ns) { - if (clock_gettime(clockid, to)) { - error("clock_gettime failed\n", errno); - return errno; - } + if (clock_gettime(clockid, to)) + ksft_exit_fail_msg("clock_gettime failed\n"); to->tv_nsec += timeout_ns; @@ -93,83 +79,66 @@ static int futex_get_abs_timeout(clockid_t clockid, struct timespec *to, return 0; } -int main(int argc, char *argv[]) +TEST(wait_bitset) { futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; struct timespec to; - pthread_t thread; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1, - .flags = FUTEX_32, - .__reserved = 0 - }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 't': - timeout_ns = atoi(optarg); - break; - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(9); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); - - pthread_barrier_init(&barrier, NULL, 2); - pthread_create(&thread, NULL, get_pi_lock, NULL); + int res; /* initialize relative timeout */ to.tv_sec = 0; to.tv_nsec = timeout_ns; res = futex_wait(&f1, f1, &to, 0); - test_timeout(res, &ret, "futex_wait relative", ETIMEDOUT); + test_timeout(res, "futex_wait relative", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_bitset realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset realtime", ETIMEDOUT); /* FUTEX_WAIT_BITSET with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_bitset(&f1, f1, &to, 1, 0); - test_timeout(res, &ret, "futex_wait_bitset monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_bitset monotonic", ETIMEDOUT); +} + +TEST(requeue_pi) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; + int res; /* FUTEX_WAIT_REQUEUE_PI with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_wait_requeue_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi realtime", ETIMEDOUT); /* FUTEX_WAIT_REQUEUE_PI with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0); - test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + test_timeout(res, "futex_wait_requeue_pi monotonic", ETIMEDOUT); + +} + +TEST(lock_pi) +{ + struct timespec to; + pthread_t thread; + int res; + + /* Create a thread that will lock forever so any waiter will timeout */ + pthread_barrier_init(&barrier, NULL, 2); + pthread_create(&thread, NULL, get_pi_lock, NULL); /* Wait until the other thread calls 
futex_lock_pi() */ pthread_barrier_wait(&barrier); pthread_barrier_destroy(&barrier); + /* * FUTEX_LOCK_PI with CLOCK_REALTIME * Due to historical reasons, FUTEX_LOCK_PI supports only realtime @@ -181,26 +150,38 @@ int main(int argc, char *argv[]) * smaller than realtime and the syscall will timeout immediately. */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_lock_pi(&futex_pi, &to, 0, 0); - test_timeout(res, &ret, "futex_lock_pi realtime", ETIMEDOUT); + test_timeout(res, "futex_lock_pi realtime", ETIMEDOUT); /* Test operations that don't support FUTEX_CLOCK_REALTIME */ res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + test_timeout(res, "futex_lock_pi invalid timeout flag", ENOSYS); +} + +TEST(waitv) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1, + .flags = FUTEX_32, + .__reserved = 0, + }; + struct timespec to; + int res; /* futex_waitv with CLOCK_MONOTONIC */ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); - test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); + test_timeout(res, "futex_waitv monotonic", ETIMEDOUT); /* futex_waitv with CLOCK_REALTIME */ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) - return RET_FAIL; + ksft_test_result_error("get_time error"); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); - test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); - - ksft_print_cnts(); - return ret; + test_timeout(res, "futex_waitv realtime", ETIMEDOUT); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c index ed9cd07e31c1a9..ce2301500d839c 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -29,95 +29,55 @@ #include #include -#include "logging.h" #include "futextest.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-uninitialized-heap" #define WAIT_US 5000000 static int child_blocked = 1; -static int child_ret; +static bool child_ret; void *buf; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *wait_thread(void *arg) { int res; - child_ret = RET_PASS; + child_ret = true; res = futex_wait(buf, 1, NULL, 0); child_blocked = 0; if (res != 0 && errno != EWOULDBLOCK) { - error("futex failure\n", errno); - child_ret = RET_ERROR; + ksft_exit_fail_msg("futex failure\n"); + child_ret = false; } pthread_exit(NULL); } -int main(int argc, char **argv) +TEST(futex_wait_uninitialized_heap) { - int c, ret = RET_PASS; long page_size; pthread_t thr; - - while ((c = getopt(argc, argv, "chv:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } + int ret; page_size = sysconf(_SC_PAGESIZE); buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); - if (buf == (void 
*)-1) { - error("mmap\n", errno); - exit(1); - } - - ksft_print_header(); - ksft_set_plan(1); - ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n", - basename(argv[0])); - + if (buf == (void *)-1) + ksft_exit_fail_msg("mmap\n"); ret = pthread_create(&thr, NULL, wait_thread, NULL); - if (ret) { - error("pthread_create\n", errno); - ret = RET_ERROR; - goto out; - } + if (ret) + ksft_exit_fail_msg("pthread_create\n"); - info("waiting %dus for child to return\n", WAIT_US); + ksft_print_dbg_msg("waiting %dus for child to return\n", WAIT_US); usleep(WAIT_US); - ret = child_ret; - if (child_blocked) { - fail("child blocked in kernel\n"); - ret = RET_FAIL; - } + if (child_blocked) + ksft_test_result_fail("child blocked in kernel\n"); - out: - print_result(TEST_NAME, ret); - return ret; + if (!child_ret) + ksft_test_result_fail("child error\n"); } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 2d8230da906429..36b7a54a40851a 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -21,72 +21,44 @@ #include #include #include + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait-wouldblock" #define timeout_ns 100000 -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - -int main(int argc, char *argv[]) +TEST(futex_wait_wouldblock) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; - struct futex_waitv waitv = { - .uaddr = (uintptr_t)&f1, - .val = f1+1, - .flags = FUTEX_32, - .__reserved = 0 - }; + int res; - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_wait returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_wait\n"); } +} - if (clock_gettime(CLOCK_MONOTONIC, &to)) { - error("clock_gettime failed\n", errno); - return errno; - } +TEST(futex_waitv_wouldblock) +{ + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; + struct futex_waitv waitv = { + .uaddr = (uintptr_t)&f1, + .val = f1 + 1, + .flags = FUTEX_32, + .__reserved = 0, + }; + int res; + + if (clock_gettime(CLOCK_MONOTONIC, &to)) + ksft_exit_fail_msg("clock_gettime failed %d\n", errno); to.tv_nsec += timeout_ns; @@ -95,17 +67,15 @@ int main(int argc, char *argv[]) to.tv_nsec -= 1000000000; } - info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + ksft_print_dbg_msg("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); if (!res || errno != EWOULDBLOCK) { ksft_test_result_fail("futex_waitv returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c index a94337f677e181..c684b10eb76e29 100644 --- a/tools/testing/selftests/futex/functional/futex_waitv.c +++ b/tools/testing/selftests/futex/functional/futex_waitv.c @@ -15,25 +15,16 @@ #include #include #include + #include "futextest.h" #include "futex2test.h" -#include "logging.h" +#include "../../kselftest_harness.h" -#define TEST_NAME "futex-wait" #define WAKE_WAIT_US 10000 #define NR_FUTEXES 30 static struct futex_waitv waitv[NR_FUTEXES]; u_int32_t futexes[NR_FUTEXES] = {0}; -void usage(char *prog) -{ - printf("Usage: %s\n", prog); - printf(" -c Use color\n"); - printf(" -h Display this help message\n"); - printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", - VQUIET, VCRITICAL, VINFO); -} - void *waiterfn(void *arg) { struct timespec to; @@ -41,7 +32,7 @@ void *waiterfn(void *arg) /* setting absolute timeout for futex2 */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -57,34 +48,10 @@ void *waiterfn(void *arg) return NULL; } -int main(int argc, char *argv[]) +TEST(private_waitv) { pthread_t waiter; - int res, ret = RET_PASS; - struct timespec to; - int c, i; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { - case 'c': - log_color(1); - break; - case 'h': - usage(basename(argv[0])); - exit(0); - case 'v': - log_verbosity(atoi(optarg)); - break; - default: - usage(basename(argv[0])); - exit(1); - } - } - - ksft_print_header(); - ksft_set_plan(7); - ksft_print_msg("%s: Test FUTEX_WAITV\n", - basename(argv[0])); + int res, i; for (i = 0; i < NR_FUTEXES; i++) { waitv[i].uaddr = (uintptr_t)&futexes[i]; @@ -95,7 +62,7 @@ int main(int argc, char *argv[]) /* Private waitv */ if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -104,10 +71,15 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv private\n"); } +} + +TEST(shared_waitv) +{ + pthread_t waiter; + int res, i; /* Shared waitv */ for (i = 0; i < NR_FUTEXES; i++) { @@ -128,7 +100,7 @@ int main(int argc, char *argv[]) } if (pthread_create(&waiter, NULL, waiterfn, NULL)) - error("pthread_create failed\n", errno); + ksft_exit_fail_msg("pthread_create failed\n"); usleep(WAKE_WAIT_US); @@ -137,19 +109,24 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake shared returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv shared\n"); } for (i = 0; i < NR_FUTEXES; i++) shmdt(u64_to_ptr(waitv[i].uaddr)); +} + +TEST(invalid_flag) +{ + struct timespec to; + int res; /* Testing a waiter without FUTEX_32 flag */ waitv[0].flags = FUTEX_PRIVATE_FLAG; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -158,17 +135,22 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv without FUTEX_32\n"); } +} + +TEST(unaligned_address) +{ + struct timespec to; + int res; /* Testing a waiter with an unaligned address */ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; waitv[0].uaddr = 1; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -177,16 +159,21 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_wake private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv with an unaligned address\n"); } +} + +TEST(null_address) +{ + struct timespec to; + int res; /* Testing a NULL address for waiters.uaddr */ waitv[0].uaddr = 0x00000000; if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -195,14 +182,13 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); } /* Testing a NULL address for *waiters */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -211,14 +197,19 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); } +} + +TEST(invalid_clockid) +{ + struct timespec to; + int res; /* Testing an invalid clockid */ if (clock_gettime(CLOCK_MONOTONIC, &to)) - error("gettime64 failed\n", errno); + ksft_exit_fail_msg("gettime64 failed\n"); to.tv_sec++; @@ -227,11 +218,9 @@ int main(int argc, char *argv[]) ksft_test_result_fail("futex_waitv private returned: %d %s\n", res ? errno : res, res ? 
strerror(errno) : ""); - ret = RET_FAIL; } else { ksft_test_result_pass("futex_waitv invalid clockid\n"); } - - ksft_print_cnts(); - return ret; } + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh index 81739849f2994d..e88545c06d57a7 100755 --- a/tools/testing/selftests/futex/functional/run.sh +++ b/tools/testing/selftests/futex/functional/run.sh @@ -18,74 +18,36 @@ # ############################################################################### -# Test for a color capable console -if [ -z "$USE_COLOR" ]; then - tput setf 7 || tput setaf 7 - if [ $? -eq 0 ]; then - USE_COLOR=1 - tput sgr0 - fi -fi -if [ "$USE_COLOR" -eq 1 ]; then - COLOR="-c" -fi - - echo -# requeue pi testing -# without timeouts -./futex_requeue_pi $COLOR -./futex_requeue_pi $COLOR -b -./futex_requeue_pi $COLOR -b -l -./futex_requeue_pi $COLOR -b -o -./futex_requeue_pi $COLOR -l -./futex_requeue_pi $COLOR -o -# with timeouts -./futex_requeue_pi $COLOR -b -l -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -l -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -./futex_requeue_pi $COLOR -b -t 5000 -./futex_requeue_pi $COLOR -t 5000 -./futex_requeue_pi $COLOR -b -t 500000 -./futex_requeue_pi $COLOR -t 500000 -./futex_requeue_pi $COLOR -b -o -t 5000 -./futex_requeue_pi $COLOR -l -t 5000 -./futex_requeue_pi $COLOR -b -o -t 500000 -./futex_requeue_pi $COLOR -l -t 500000 -# with long timeout -./futex_requeue_pi $COLOR -b -l -t 2000000000 -./futex_requeue_pi $COLOR -l -t 2000000000 - +./futex_requeue_pi echo -./futex_requeue_pi_mismatched_ops $COLOR +./futex_requeue_pi_mismatched_ops echo -./futex_requeue_pi_signal_restart $COLOR +./futex_requeue_pi_signal_restart echo -./futex_wait_timeout $COLOR +./futex_wait_timeout echo -./futex_wait_wouldblock $COLOR +./futex_wait_wouldblock echo -./futex_wait_uninitialized_heap $COLOR -./futex_wait_private_mapped_file $COLOR +./futex_wait_uninitialized_heap +./futex_wait_private_mapped_file echo -./futex_wait $COLOR +./futex_wait echo -./futex_requeue $COLOR +./futex_requeue echo -./futex_waitv $COLOR +./futex_waitv echo -./futex_priv_hash $COLOR -./futex_priv_hash -g $COLOR +./futex_priv_hash echo -./futex_numa_mpol $COLOR +./futex_numa_mpol diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index 7a5fd1d5355e7e..3d48e9789d9fe6 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -58,6 +58,17 @@ typedef volatile u_int32_t futex_t; #define SYS_futex SYS_futex_time64 #endif +/* + * On 32bit systems if we use "-D_FILE_OFFSET_BITS=64 -D_TIME_BITS=64" or if + * we are using a newer compiler then the size of the timestamps will be 64bit, + * however, the SYS_futex will still point to the 32bit futex system call. 
+ */ +#if __SIZEOF_POINTER__ == 4 && defined(SYS_futex_time64) && \ + defined(_TIME_BITS) && _TIME_BITS == 64 +# undef SYS_futex +# define SYS_futex SYS_futex_time64 +#endif + /** * futex() - SYS_futex syscall wrapper * @uaddr: address of first futex diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h deleted file mode 100644 index 874c69ce5cce9e..00000000000000 --- a/tools/testing/selftests/futex/include/logging.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/****************************************************************************** - * - * Copyright © International Business Machines Corp., 2009 - * - * DESCRIPTION - * Glibc independent futex library for testing kernel functionality. - * - * AUTHOR - * Darren Hart - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart - * - *****************************************************************************/ - -#ifndef _LOGGING_H -#define _LOGGING_H - -#include -#include -#include -#include -#include "kselftest.h" - -/* - * Define PASS, ERROR, and FAIL strings with and without color escape - * sequences, default to no color. - */ -#define ESC 0x1B, '[' -#define BRIGHT '1' -#define GREEN '3', '2' -#define YELLOW '3', '3' -#define RED '3', '1' -#define ESCEND 'm' -#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND -#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND -#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND -#define RESET_COLOR ESC, '0', 'm' -static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S', - RESET_COLOR, 0}; -static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R', - RESET_COLOR, 0}; -static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L', - RESET_COLOR, 0}; -static const char INFO_NORMAL[] = " INFO"; -static const char PASS_NORMAL[] = " PASS"; -static const char ERROR_NORMAL[] = "ERROR"; -static const char FAIL_NORMAL[] = " FAIL"; -const char *INFO = INFO_NORMAL; -const char *PASS = PASS_NORMAL; -const char *ERROR = ERROR_NORMAL; -const char *FAIL = FAIL_NORMAL; - -/* Verbosity setting for INFO messages */ -#define VQUIET 0 -#define VCRITICAL 1 -#define VINFO 2 -#define VMAX VINFO -int _verbose = VCRITICAL; - -/* Functional test return codes */ -#define RET_PASS 0 -#define RET_ERROR -1 -#define RET_FAIL -2 - -/** - * log_color() - Use colored output for PASS, ERROR, and FAIL strings - * @use_color: use color (1) or not (0) - */ -void log_color(int use_color) -{ - if (use_color) { - PASS = PASS_COLOR; - ERROR = ERROR_COLOR; - FAIL = FAIL_COLOR; - } else { - PASS = PASS_NORMAL; - ERROR = ERROR_NORMAL; - FAIL = FAIL_NORMAL; - } -} - -/** - * log_verbosity() - Set verbosity of test output - * @verbose: Enable (1) verbose output or not (0) - * - * Currently setting verbose=1 will enable INFO messages and 0 will disable - * them. FAIL and ERROR messages are always displayed. - */ -void log_verbosity(int level) -{ - if (level > VMAX) - level = VMAX; - else if (level < 0) - level = 0; - _verbose = level; -} - -/** - * print_result() - Print standard PASS | ERROR | FAIL results - * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL - * - * print_result() is primarily intended for functional tests. 
- */ -void print_result(const char *test_name, int ret) -{ - switch (ret) { - case RET_PASS: - ksft_test_result_pass("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_ERROR: - ksft_test_result_error("%s\n", test_name); - ksft_print_cnts(); - return; - case RET_FAIL: - ksft_test_result_fail("%s\n", test_name); - ksft_print_cnts(); - return; - } -} - -/* log level macros */ -#define info(message, vargs...) \ -do { \ - if (_verbose >= VINFO) \ - fprintf(stderr, "\t%s: "message, INFO, ##vargs); \ -} while (0) - -#define error(message, err, args...) \ -do { \ - if (_verbose >= VCRITICAL) {\ - if (err) \ - fprintf(stderr, "\t%s: %s: "message, \ - ERROR, strerror(err), ##args); \ - else \ - fprintf(stderr, "\t%s: "message, ERROR, ##args); \ - } \ -} while (0) - -#define fail(message, args...) \ -do { \ - if (_verbose >= VCRITICAL) \ - fprintf(stderr, "\t%s: "message, FAIL, ##args); \ -} while (0) - -#endif diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 651fc9f13c0810..45c14323a6183c 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -113,7 +113,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, * necessarily mean a test failure, just that the limit has to be made * bigger. */ - ASSERT_GT(400, nth_state->iteration); + ASSERT_GT(1000, nth_state->iteration); if (nth_state->iteration != 0) { ssize_t res; ssize_t res2; diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index c3b6d2604b1e48..8deeb4b72e7338 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ struct ksft_count { static struct ksft_count ksft_cnt; static unsigned int ksft_plan; +static bool ksft_debug_enabled; static inline unsigned int ksft_test_num(void) { @@ -175,6 +177,18 @@ static inline __printf(1, 2) void ksft_print_msg(const char *msg, ...) va_end(args); } +static inline void ksft_print_dbg_msg(const char *msg, ...) 
+{ + va_list args; + + if (!ksft_debug_enabled) + return; + + va_start(args, msg); + /* + * Format the message here with vprintf(): forwarding a va_list to the + * variadic ksft_print_msg() is undefined behaviour. Keep the "# " TAP + * comment prefix that ksft_print_msg() would have emitted. + */ + printf("# "); + vprintf(msg, args); + va_end(args); +} + static inline void ksft_perror(const char *msg) { ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno); diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 8516e8434bc45c..3f66e862e83eba 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1091,7 +1091,7 @@ static int test_harness_argv_check(int argc, char **argv) { int opt; - while ((opt = getopt(argc, argv, "hlF:f:V:v:t:T:r:")) != -1) { + while ((opt = getopt(argc, argv, "dhlF:f:V:v:t:T:r:")) != -1) { switch (opt) { case 'f': case 'F': @@ -1104,12 +1104,16 @@ static int test_harness_argv_check(int argc, char **argv) case 'l': test_harness_list_tests(); return KSFT_SKIP; + case 'd': + ksft_debug_enabled = true; + break; case 'h': default: fprintf(stderr, - "Usage: %s [-h|-l] [-t|-T|-v|-V|-f|-F|-r name]\n" + "Usage: %s [-h|-l|-d] [-t|-T|-v|-V|-f|-F|-r name]\n" "\t-h print help\n" "\t-l list all tests\n" + "\t-d enable debug prints\n" "\n" "\t-t name include test\n" "\t-T name exclude test\n" @@ -1142,8 +1146,9 @@ static bool test_enabled(int argc, char **argv, int opt; optind = 1; - while ((opt = getopt(argc, argv, "F:f:V:v:t:T:r:")) != -1) { - has_positive |= islower(opt); + while ((opt = getopt(argc, argv, "dF:f:V:v:t:T:r:")) != -1) { + if (opt != 'd') + has_positive |= islower(opt); switch (tolower(opt)) { case 't': diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore new file mode 100644 index 00000000000000..ccfb40837a7333 --- /dev/null +++ b/tools/testing/selftests/namespaces/.gitignore @@ -0,0 +1,3 @@ +nsid_test +file_handle_test +init_ino_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile new file mode 100644 index 00000000000000..5fe4b3dc07d3ed --- /dev/null +++ b/tools/testing/selftests/namespaces/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) + +TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test + +include ../lib.mk + diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config new file mode 100644 index 00000000000000..d09836260262f4 --- /dev/null +++ b/tools/testing/selftests/namespaces/config @@ -0,0 +1,7 @@ +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c new file mode 100644 index 00000000000000..f1bc5773f55216 --- /dev/null +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -0,0 +1,1429 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +TEST(nsfs_net_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) +
MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT as unprivileged user */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + if (fd < 0 && errno == EPERM) { + SKIP(free(handle); close(ns_fd); + return, + "Permission denied for unprivileged user (expected)"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_uts_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open UTS namespace file descriptor */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_ipc_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open IPC namespace file descriptor */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); 
+ return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_pid_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open PID namespace file descriptor */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_mnt_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open mount namespace file descriptor */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open user namespace file descriptor */ + ns_fd = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for 
the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_cgroup_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open cgroup namespace file descriptor */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "cgroup namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_time_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open time namespace file descriptor */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "time namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, 
&st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_net_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current network namespace */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create network namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's network namespace handle from new user+net namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new network namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_uts_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; +
pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current UTS namespace */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create UTS namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's UTS namespace handle from new user+uts namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new UTS namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_ipc_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current IPC namespace */ + ns_fd = open("/proc/self/ns/ipc", 
O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create IPC namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new IPC namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_mnt_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current mount namespace */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support 
file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create mount namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's mount namespace handle from new user+mnt namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new mount namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_cgroup_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current cgroup namespace */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "cgroup namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create 
new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create cgroup namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new cgroup namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_pid_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current PID namespace */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = 
open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new PID namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create PID namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for PID namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in PID namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new PID namespace */ + /* Try to open parent's PID namespace handle from new user+pid namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new PID namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_time_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current time namespace */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "time namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) 
{ + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new time namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWTIME); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create time namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for time namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in time namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new time namespace */ + /* Try to open parent's time namespace handle from new user+time namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new time namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_open_flags) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Test invalid flags that should fail */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); + ASSERT_LT(fd, 0); + 
ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ENOTDIR); + + close(ns_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c new file mode 100644 index 00000000000000..5b6993c3740b7d --- /dev/null +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +struct ns_info { + const char *name; + const char *proc_path; + unsigned int expected_ino; +}; + +static struct ns_info namespaces[] = { + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, +}; + +TEST(init_namespace_inodes) +{ + struct stat st; + + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + int ret = stat(namespaces[i].proc_path, &st); + + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ + if (ret < 0) { + if (errno == ENOENT) { + ksft_test_result_skip("%s namespace not available\n", + namespaces[i].name); + continue; + } + ASSERT_GE(ret, 0) + TH_LOG("Failed to stat %s: %s", + namespaces[i].proc_path, strerror(errno)); + } + + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c new file mode 100644 index 00000000000000..e28accd74a57e0 --- /dev/null +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +TEST(nsid_mntns_basic) +{ + __u64 mnt_ns_id = 0; + int fd_mntns; + int ret; + + /* Open the current mount namespace */ + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_mntns, 0); + + /* Get the mount namespace ID */ + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(mnt_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 mnt_ns_id2 = 0; + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); + + close(fd_mntns); +} + +TEST(nsid_mntns_separate) +{ + __u64 parent_mnt_ns_id = 0; + __u64 child_mnt_ns_id = 0; + int fd_parent_mntns, 
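+	/*
+	 * The values read here should be the same 64-bit mount namespace
+	 * identifiers that statmount()/listmount() expose as mnt_ns_id, so
+	 * they can be correlated with the new mount-API listing calls.
+	 */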
fd_child_mntns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's mount namespace ID */ + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_parent_mntns, 0); + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_mnt_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_mntns); + SKIP(return, "No permission to create mount namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's mount namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + fd_child_mntns = open(path, O_RDONLY); + ASSERT_GE(fd_child_mntns, 0); + + /* Get child's mount namespace ID */ + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_mnt_ns_id, 0); + + /* Parent and child should have different mount namespace IDs */ + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); + + close(fd_parent_mntns); + close(fd_child_mntns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_cgroupns_basic) +{ + __u64 cgroup_ns_id = 0; + int fd_cgroupns; + int ret; + + /* Open the current cgroup namespace */ + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_cgroupns, 0); + + /* Get the cgroup namespace ID */ + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(cgroup_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 cgroup_ns_id2 = 0; + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); + + close(fd_cgroupns); +} + +TEST(nsid_cgroupns_separate) +{ + __u64 parent_cgroup_ns_id = 0; + __u64 child_cgroup_ns_id = 0; + int fd_parent_cgroupns, fd_child_cgroupns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's cgroup namespace ID */ + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_parent_cgroupns, 0); + ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_cgroup_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + 
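+		/*
+		 * One-byte pipe protocol shared by all *_separate tests in
+		 * this file: the child writes 'S' when it lacks the privilege
+		 * to unshare the namespace, and 'Y' once the new namespace
+		 * exists and is pinned by pause() until the parent is done.
+		 */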
/* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_cgroupns); + SKIP(return, "No permission to create cgroup namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's cgroup namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); + fd_child_cgroupns = open(path, O_RDONLY); + ASSERT_GE(fd_child_cgroupns, 0); + + /* Get child's cgroup namespace ID */ + ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_cgroup_ns_id, 0); + + /* Parent and child should have different cgroup namespace IDs */ + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); + + close(fd_parent_cgroupns); + close(fd_child_cgroupns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_ipcns_basic) +{ + __u64 ipc_ns_id = 0; + int fd_ipcns; + int ret; + + /* Open the current IPC namespace */ + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_ipcns, 0); + + /* Get the IPC namespace ID */ + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(ipc_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 ipc_ns_id2 = 0; + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); + + close(fd_ipcns); +} + +TEST(nsid_ipcns_separate) +{ + __u64 parent_ipc_ns_id = 0; + __u64 child_ipc_ns_id = 0; + int fd_parent_ipcns, fd_child_ipcns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's IPC namespace ID */ + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_parent_ipcns, 0); + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_ipc_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_ipcns); + SKIP(return, "No permission to create IPC namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's IPC namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); + fd_child_ipcns = open(path, O_RDONLY); + ASSERT_GE(fd_child_ipcns, 0); + + /* Get child's IPC namespace ID */ + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_ipc_ns_id, 0); + + /* Parent and child should have different IPC namespace IDs */ + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); + + close(fd_parent_ipcns); + close(fd_child_ipcns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_utsns_basic) +{ + __u64 uts_ns_id = 0; + int fd_utsns; + int ret; + + /* Open the current UTS namespace */ + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_utsns, 0); + + /* Get the UTS namespace ID */ + ret = ioctl(fd_utsns, NS_GET_ID, 
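+			/* NS_GET_ID fills in the namespace's unique 64-bit ID */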
&uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(uts_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 uts_ns_id2 = 0; + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uts_ns_id, uts_ns_id2); + + close(fd_utsns); +} + +TEST(nsid_utsns_separate) +{ + __u64 parent_uts_ns_id = 0; + __u64 child_uts_ns_id = 0; + int fd_parent_utsns, fd_child_utsns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's UTS namespace ID */ + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_parent_utsns, 0); + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_uts_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_utsns); + SKIP(return, "No permission to create UTS namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's UTS namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); + fd_child_utsns = open(path, O_RDONLY); + ASSERT_GE(fd_child_utsns, 0); + + /* Get child's UTS namespace ID */ + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_uts_ns_id, 0); + + /* Parent and child should have different UTS namespace IDs */ + ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); + + close(fd_parent_utsns); + close(fd_child_utsns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_userns_basic) +{ + __u64 user_ns_id = 0; + int fd_userns; + int ret; + + /* Open the current user namespace */ + fd_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_userns, 0); + + /* Get the user namespace ID */ + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(user_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 user_ns_id2 = 0; + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(user_ns_id, user_ns_id2); + + close(fd_userns); +} + +TEST(nsid_userns_separate) +{ + __u64 parent_user_ns_id = 0; + __u64 child_user_ns_id = 0; + int fd_parent_userns, fd_child_userns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's user namespace ID */ + fd_parent_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_parent_userns, 0); + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_user_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + 
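+				/*
+				 * _exit() rather than exit(): the forked
+				 * child must not run the parent's atexit()
+				 * handlers or flush stdio buffers inherited
+				 * across fork().
+				 */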
_exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_userns); + SKIP(return, "No permission to create user namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's user namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_child_userns = open(path, O_RDONLY); + ASSERT_GE(fd_child_userns, 0); + + /* Get child's user namespace ID */ + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_user_ns_id, 0); + + /* Parent and child should have different user namespace IDs */ + ASSERT_NE(parent_user_ns_id, child_user_ns_id); + + close(fd_parent_userns); + close(fd_child_userns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_timens_basic) +{ + __u64 time_ns_id = 0; + int fd_timens; + int ret; + + /* Open the current time namespace */ + fd_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get the time namespace ID */ + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(time_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 time_ns_id2 = 0; + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(time_ns_id, time_ns_id2); + + close(fd_timens); +} + +TEST(nsid_timens_separate) +{ + __u64 parent_time_ns_id = 0; + __u64 child_time_ns_id = 0; + int fd_parent_timens, fd_child_timens; + int ret; + pid_t pid; + int pipefd[2]; + + /* Open the current time namespace */ + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_parent_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get parent's time namespace ID */ + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_time_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new time namespace */ + ret = unshare(CLONE_NEWTIME); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES || errno == EINVAL) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_timens); + close(pipefd[0]); + SKIP(return, "Cannot create time namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t 
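+	/*
+	 * unshare(CLONE_NEWTIME) never moves the caller itself; only
+	 * children forked afterwards enter the new time namespace.  That is
+	 * why the namespace ID must be read from the grandchild's /proc
+	 * entry rather than from the child's.
+	 */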
grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's time namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); + fd_child_timens = open(path, O_RDONLY); + ASSERT_GE(fd_child_timens, 0); + + /* Get child's time namespace ID */ + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_time_ns_id, 0); + + /* Parent and child should have different time namespace IDs */ + ASSERT_NE(parent_time_ns_id, child_time_ns_id); + + close(fd_parent_timens); + close(fd_child_timens); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_pidns_basic) +{ + __u64 pid_ns_id = 0; + int fd_pidns; + int ret; + + /* Open the current PID namespace */ + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_pidns, 0); + + /* Get the PID namespace ID */ + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(pid_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 pid_ns_id2 = 0; + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(pid_ns_id, pid_ns_id2); + + close(fd_pidns); +} + +TEST(nsid_pidns_separate) +{ + __u64 parent_pid_ns_id = 0; + __u64 child_pid_ns_id = 0; + int fd_parent_pidns, fd_child_pidns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's PID namespace ID */ + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_parent_pidns, 0); + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_pid_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_pidns); + close(pipefd[0]); + SKIP(return, "No permission to create PID namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's PID namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); + fd_child_pidns = open(path, O_RDONLY); + ASSERT_GE(fd_child_pidns, 0); + + /* Get child's PID namespace ID */ + ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_pid_ns_id, 0); + + /* Parent and child should have different PID namespace IDs */ + 
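+	/*
+	 * Namespace IDs are handed out from a 64-bit sequence counter, so
+	 * two namespaces that are alive at the same time can never compare
+	 * equal here.
+	 */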
ASSERT_NE(parent_pid_ns_id, child_pid_ns_id); + + close(fd_parent_pidns); + close(fd_child_pidns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_netns_basic) +{ + __u64 net_ns_id = 0; + __u64 netns_cookie = 0; + int fd_netns; + int sock; + socklen_t optlen; + int ret; + + /* Open the current network namespace */ + fd_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_netns, 0); + + /* Get the network namespace ID via ioctl */ + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(net_ns_id, 0); + + /* Create a socket to get the SO_NETNS_COOKIE */ + sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + + /* Get the network namespace cookie via socket option */ + optlen = sizeof(netns_cookie); + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + ASSERT_EQ(optlen, sizeof(netns_cookie)); + + /* The namespace ID and cookie should be identical */ + ASSERT_EQ(net_ns_id, netns_cookie); + + /* Verify we can get the same ID again */ + __u64 net_ns_id2 = 0; + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(net_ns_id, net_ns_id2); + + close(sock); + close(fd_netns); +} + +TEST(nsid_netns_separate) +{ + __u64 parent_net_ns_id = 0; + __u64 parent_netns_cookie = 0; + __u64 child_net_ns_id = 0; + __u64 child_netns_cookie = 0; + int fd_parent_netns, fd_child_netns; + int parent_sock, child_sock; + socklen_t optlen; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's network namespace ID */ + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_parent_netns, 0); + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_net_ns_id, 0); + + /* Get parent's network namespace cookie */ + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(parent_sock, 0); + optlen = sizeof(parent_netns_cookie); + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify parent's ID and cookie match */ + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_netns); + close(parent_sock); + SKIP(return, "No permission to create network namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's network namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); + fd_child_netns = open(path, O_RDONLY); + ASSERT_GE(fd_child_netns, 0); + + /* Get child's network namespace ID */ + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_net_ns_id, 0); + + /* Create socket in child's namespace to get cookie */ + ret = setns(fd_child_netns, CLONE_NEWNET); 
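+	/*
+	 * setns() into the child's netns requires CAP_SYS_ADMIN over it,
+	 * hence the ret == 0 guard below.  A socket created inside the
+	 * namespace then reports the same value via SO_NETNS_COOKIE that
+	 * NS_GET_ID returned above, along the lines of:
+	 *
+	 *	__u64 cookie;
+	 *	socklen_t len = sizeof(cookie);
+	 *	getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &cookie, &len);
+	 */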
+ if (ret == 0) { + child_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(child_sock, 0); + + optlen = sizeof(child_netns_cookie); + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify child's ID and cookie match */ + ASSERT_EQ(child_net_ns_id, child_netns_cookie); + + close(child_sock); + + /* Return to parent namespace */ + setns(fd_parent_netns, CLONE_NEWNET); + } + + /* Parent and child should have different network namespace IDs */ + ASSERT_NE(parent_net_ns_id, child_net_ns_id); + if (child_netns_cookie != 0) { + ASSERT_NE(parent_netns_cookie, child_netns_cookie); + } + + close(fd_parent_netns); + close(fd_child_netns); + close(parent_sock); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c7e03e1d6f63bb..2b31d4a93ad7f7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -116,6 +116,7 @@ TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh +TEST_PROGS += broadcast_ether_dst.sh TEST_PROGS += broadcast_pmtu.sh TEST_PROGS += ipv6_force_forwarding.sh diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh new file mode 100755 index 00000000000000..334a7eca8a803b --- /dev/null +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Author: Brett A C Sheffield +# Author: Oscar Maes +# +# Ensure destination ethernet field is correctly set for +# broadcast packets + +source lib.sh + +CLIENT_IP4="192.168.0.1" +GW_IP4="192.168.0.2" + +setup() { + setup_ns CLIENT_NS SERVER_NS + + ip -net "${SERVER_NS}" link add link1 type veth \ + peer name link0 netns "${CLIENT_NS}" + + ip -net "${CLIENT_NS}" link set link0 up + ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0 + + ip -net "${SERVER_NS}" link set link1 up + + ip -net "${CLIENT_NS}" route add default via "${GW_IP4}" + ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 00:11:22:33:44:55 +} + +cleanup() { + rm -f "${CAPFILE}" "${OUTPUT}" + ip -net "${SERVER_NS}" link del link1 + cleanup_ns "${CLIENT_NS}" "${SERVER_NS}" +} + +test_broadcast_ether_dst() { + local rc=0 + CAPFILE=$(mktemp -u cap.XXXXXXXXXX) + OUTPUT=$(mktemp -u out.XXXXXXXXXX) + + echo "Testing ethernet broadcast destination" + + # start tcpdump listening for icmp + # tcpdump will exit after receiving a single packet + # timeout will kill tcpdump if it is still running after 2s + timeout 2s ip netns exec "${CLIENT_NS}" \ + tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + pid=$! + slowwait 1 grep -qs "listening" "${OUTPUT}" + + # send broadcast ping + ip netns exec "${CLIENT_NS}" \ + ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null + + # wait for tcpdump for exit after receiving packet + wait "${pid}" + + # compare ethernet destination field to ff:ff:ff:ff:ff:ff + ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \ + awk '{sub(/,/,"",$3); print $3}') + if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then + echo "[ OK ]" + rc="${ksft_pass}" + else + echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \ + "got ${ether_dst}" + rc="${ksft_fail}" + fi + + return "${rc}" +} + +if [ ! 
-x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit "${ksft_skip}" +fi + +trap cleanup EXIT + +setup +test_broadcast_ether_dst + +exit $? diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config new file mode 100644 index 00000000000000..188f7979667097 --- /dev/null +++ b/tools/testing/selftests/net/can/config @@ -0,0 +1,3 @@ +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VCAN=m diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b39f748c25722a..2b0a90581e2f15 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -467,8 +467,8 @@ ipv6_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" - run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1" run_cmd "$IP nexthop add id 103 group 63/64 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" @@ -494,6 +494,26 @@ ipv6_fdb_grp_fcnal() run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 70 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 71 group 70 fdb" + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 72 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 73 group 72" + run_cmd "$IP nexthop replace id 72 via 2001:db8:91::2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -547,15 +567,15 @@ ipv4_fdb_grp_fcnal() log_test $? 0 "Get Fdb nexthop group by id" # fdb nexthop group can only contain fdb nexthops - run_cmd "$IP nexthop add id 14 via 172.16.1.2" - run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1" run_cmd "$IP nexthop add id 103 group 14/15 fdb" log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" # Non fdb nexthop group can not contain fdb nexthops run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" - run_cmd "$IP nexthop add id 104 group 14/15" + run_cmd "$IP nexthop add id 104 group 16/17" log_test $? 
2 "Non-Fdb Nexthop group with fdb nexthops" # fdb nexthop cannot have blackhole @@ -574,6 +594,26 @@ ipv4_fdb_grp_fcnal() run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" log_test $? 2 "Fdb Nexthop with encap" + # Replace FDB nexthop to non-FDB and vice versa + run_cmd "$IP nexthop add id 18 via 172.16.1.2 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 0 "Replace FDB nexthop to non-FDB nexthop" + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 fdb" + log_test $? 0 "Replace non-FDB nexthop to FDB nexthop" + + # Replace FDB nexthop address while in a group + run_cmd "$IP nexthop add id 19 group 18 fdb" + run_cmd "$IP nexthop replace id 18 via 172.16.1.3 fdb" + log_test $? 0 "Replace FDB nexthop address while in a group" + + # Cannot replace FDB nexthop to non-FDB and vice versa while in a group + run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1" + log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group" + run_cmd "$IP nexthop add id 20 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 21 group 20" + run_cmd "$IP nexthop replace id 20 via 172.16.1.2 fdb" + log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group" + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" log_test $? 0 "Fdb mac add with nexthop group" @@ -582,7 +622,7 @@ ipv4_fdb_grp_fcnal() run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" log_test $? 255 "Fdb mac add with nexthop" - run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + run_cmd "$IP ro add 172.16.0.0/22 nhid 16" log_test $? 2 "Route add with fdb nexthop" run_cmd "$IP ro add 172.16.0.0/22 nhid 103" diff --git a/tools/testing/selftests/net/lib/xdp_native.bpf.c b/tools/testing/selftests/net/lib/xdp_native.bpf.c index 521ba38f2dddad..df4eea5c192b3b 100644 --- a/tools/testing/selftests/net/lib/xdp_native.bpf.c +++ b/tools/testing/selftests/net/lib/xdp_native.bpf.c @@ -14,6 +14,8 @@ #define MAX_PAYLOAD_LEN 5000 #define MAX_HDR_LEN 64 +extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym __weak; + enum { XDP_MODE = 0, XDP_PORT = 1, @@ -68,30 +70,57 @@ static void record_stats(struct xdp_md *ctx, __u32 stat_type) static struct udphdr *filter_udphdr(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return NULL; if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); + struct iphdr *iph; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*iph) + sizeof(*eth); - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + udph = data + sizeof(*iph) + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, 
sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return NULL; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) return NULL; - udph = (void *)eth + sizeof(*ipv6h) + sizeof(*eth); + udph = data + sizeof(*ipv6h) + sizeof(*eth); } else { return NULL; } @@ -145,17 +174,34 @@ static void swap_machdr(void *data) static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; struct udphdr *udph = NULL; - struct ethhdr *eth = data; + void *data, *data_end; + struct ethhdr *eth; + int err; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = eth = (void *)(long)ctx->data; if (data + sizeof(*eth) > data_end) return XDP_PASS; if (eth->h_proto == bpf_htons(ETH_P_IP)) { - struct iphdr *iph = data + sizeof(*eth); - __be32 tmp_ip = iph->saddr; + struct iphdr *iph; + __be32 tmp_ip; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*iph) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + iph = data + sizeof(*eth); if (iph + 1 > (struct iphdr *)data_end || iph->protocol != IPPROTO_UDP) @@ -169,8 +215,10 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); + tmp_ip = iph->saddr; iph->saddr = iph->daddr; iph->daddr = tmp_ip; @@ -178,9 +226,19 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_TX; - } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6h = data + sizeof(*eth); + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { struct in6_addr tmp_ipv6; + struct ipv6hdr *ipv6h; + + err = bpf_xdp_pull_data(ctx, sizeof(*eth) + sizeof(*ipv6h) + + sizeof(*udph)); + if (err) + return XDP_PASS; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + ipv6h = data + sizeof(*eth); if (ipv6h + 1 > (struct ipv6hdr *)data_end || ipv6h->nexthdr != IPPROTO_UDP) @@ -194,6 +252,7 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; record_stats(ctx, STATS_RX); + eth = data; swap_machdr((void *)eth); __builtin_memcpy(&tmp_ipv6, &ipv6h->saddr, sizeof(tmp_ipv6)); diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 7a3cb4c09e450f..d847ff1737c30c 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -28,7 +28,7 @@ flush_pids() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 4f07ac9fa207cb..b148cadb96d0b7 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1093,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1125,7 +1126,7 @@ int main_loop_s(int listensock) SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, 
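+		/*
+		 * Propagate transfer errors: a failed copyfd_io() now stops
+		 * the --cfg_repeat loop below and makes the server exit
+		 * non-zero instead of silently reporting success.
+		 */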
&winfo); } else { perror("accept"); return 1; @@ -1134,10 +1135,10 @@ int main_loop_s(int listensock) if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) @@ -1247,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1257,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. */ for (i = 0; ; i += msec_sleep) { diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 5e3c56253274a1..47ecb5b3836eb5 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -134,7 +134,7 @@ ns4="" TEST_GROUP="" # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cin_disconnect" @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 82cae37d9c2026..7fd555b123b900 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,7 @@ # ShellCheck incorrectly believes that most of the code here is unreachable # because it's invoked by variable name, see how the "tests" array is used -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 . 
"$(dirname "${0}")/mptcp_lib.sh" diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae4662..d62e653d48b0f2 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index e934dd26a59d9b..112c07c4c37a3c 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -667,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 418a903c3a4d39..f01989be6e9b3d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -95,7 +95,7 @@ init() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index ac7ec6f9402376..ec6a8758819194 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -32,7 +32,7 @@ ns1="" err=$(mktemp) # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "${err}" @@ -70,8 +70,9 @@ format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } +# This function is invoked indirectly +#shellcheck disable=SC2317,SC2329 get_endpoint() { - # shellcheck disable=SC2317 # invoked indirectly mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c151..93fea3442216c8 100644 --- 
a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 2329c2f8519b7c..1903e8e84a3151 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -35,7 +35,7 @@ usage() { } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cout" "$sout" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 333064b0b5ac03..3d45991f24ede9 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -94,7 +94,7 @@ test_fail() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { print_title "Cleanup" @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7f1..b327d3061ed53a 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" 
nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl 
df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 00000000000000..26794e7ddfd5fb --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 + +0 > S. 0:0(0) ack 11 win 65535 + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0f5640d8dc7fbf..dd093f9df6f147 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2770,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. 
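+ * Every send below is still expected to report complete success even once
+ * the record layer has dropped into copy mode.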
+ */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index 0fb759ba992ee6..330e000baeb1db 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -262,19 +262,22 @@ REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++ if (f || !p || !done) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \ printf("\nSee all results in %s\n", ARGV[1]); }' +# Execute the toplevel kernel Makefile +KBUILD_MAKE = $(MAKE) -C $(srctree) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) LLVM= + help: @echo "Supported targets under selftests/nolibc:" @echo " all call the \"run\" target below" @echo " help this help" @echo " sysroot create the nolibc sysroot here (uses \$$ARCH)" - @echo " nolibc-test build the executable (uses \$$CC and \$$CROSS_COMPILE)" + @echo " nolibc-test build the executable (uses \$$CC or \$$CROSS_COMPILE)" @echo " libc-test build an executable using the compiler's default libc instead" @echo " run-user runs the executable under QEMU (uses \$$XARCH, \$$TEST)" @echo " initramfs.cpio prepare the initramfs archive with nolibc-test" @echo " initramfs prepare the initramfs tree with nolibc-test" @echo " defconfig create a fresh new default config (uses \$$XARCH)" - @echo " kernel (re)build the kernel (uses \$$XARCH)" - @echo " kernel-standalone (re)build the kernel with the initramfs (uses \$$XARCH)" + @echo " kernel (re)build the kernel (uses \$$XARCH, \$$CROSS_COMPILE)" + @echo " kernel-standalone (re)build the kernel with the initramfs (uses \$$XARCH, \$$CROSS_COMPILE)" @echo " run runs the kernel in QEMU after building it (uses \$$XARCH, \$$TEST)" @echo " rerun runs a previously prebuilt kernel in QEMU (uses \$$XARCH, \$$TEST)" @echo " clean clean the sysroot, initramfs, build and output files" @@ -340,17 +343,17 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) + $(Q)$(KBUILD_MAKE) $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ - $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ + $(KBUILD_MAKE) olddefconfig < /dev/null; \ fi kernel: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null + $(Q)$(KBUILD_MAKE) $(IMAGE_NAME) < /dev/null kernel-standalone: initramfs - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null + $(Q)$(KBUILD_MAKE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel run: kernel initramfs.cpio diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index a297ee0d6d0754..29de21595fc953 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -196,8 +196,8 @@ int expect_zr(int expr, 
int llen) } -#define EXPECT_NZ(cond, expr, val) \ - do { if (!(cond)) result(llen, SKIPPED); else ret += expect_nz(expr, llen; } while (0) +#define EXPECT_NZ(cond, expr) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_nz(expr, llen); } while (0) static __attribute__((unused)) int expect_nz(int expr, int llen) @@ -686,7 +686,6 @@ int expect_strtox(int llen, void *func, const char *input, int base, intmax_t ex #define CASE_TEST(name) \ case __LINE__: llen += printf("%d %s", test, #name); -/* constructors validate that they are executed in definition order */ __attribute__((constructor)) static void constructor1(void) { @@ -1334,6 +1333,7 @@ int run_syscall(int min, int max) CASE_TEST(chroot_root); EXPECT_SYSZR(euid0, chroot("/")); break; CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot(argv0), -1, ENOTDIR); break; + CASE_TEST(clock_nanosleep); ts.tv_nsec = -1; EXPECT_EQ(1, EINVAL, clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL)); break; CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config index 6133524710f790..cf7cc0ce02484e 100644 --- a/tools/testing/selftests/pidfd/config +++ b/tools/testing/selftests/pidfd/config @@ -4,6 +4,5 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_TIME_NS=y -CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_CGROUPS=y CONFIG_CHECKPOINT_RESTORE=y diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h index 4efa6314bd9630..864f0c9f1afcb0 100644 --- a/tools/testing/selftests/powerpc/include/instructions.h +++ b/tools/testing/selftests/powerpc/include/instructions.h @@ -67,7 +67,7 @@ static inline int paste_last(void *i) #define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1) /* This defines the prefixed load/store instructions */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define stringify_in_c(...) __VA_ARGS__ #else # define __stringify_in_c(...) #__VA_ARGS__ diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485f5..6b78a8382d40e4 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -18,6 +18,7 @@ /proc-tid0 /proc-uptime-001 /proc-uptime-002 +/proc-pidns /read /self /setns-dcache diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a9d..be3013515aae57 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self TEST_GEN_PROGS += proc-multiple-procfs TEST_GEN_PROGS += proc-fsconfig-hidepid +TEST_GEN_PROGS += proc-pidns include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c new file mode 100644 index 00000000000000..52500597f95148 --- /dev/null +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai + * Copyright (C) 2025 SUSE LLC. 
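+ * + * Tests for the procfs "pidns" mount option: setting it at mount time via + * string options or fsconfig(2) (string path and fd), and verifying that it + * cannot be changed by remount or FSCONFIG_CMD_RECONFIGURE.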
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <signal.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +static int touch(char *path) +{ + int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644); + if (fd < 0) + return -1; + return close(fd); +} + +FIXTURE(ns) +{ + int host_mntns, host_pidns; + int dummy_pidns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); + + /* + * Create a proper tmpfs that we can use and will disappear once we + * leave this mntns. + */ + ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL)); + + /* + * Create a pidns we can use for later tests. We need to fork off a + * child so that we get a usable nsfd that we can bind-mount and open. + */ + ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755)); + ASSERT_SUCCESS(touch("/tmp/dummy/pidns")); + ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755)); + + self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_pidns); + ASSERT_SUCCESS(unshare(CLONE_NEWPID)); + + pid_t pid = fork(); + ASSERT_SUCCESS(pid); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL)); + ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL)); + exit(0); + } + + int wstatus; + ASSERT_EQ(waitpid(pid, &wstatus, 0), pid); + ASSERT_TRUE(WIFEXITED(wstatus)); + ASSERT_EQ(WEXITSTATUS(wstatus), 0); + + ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID)); + + self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->dummy_pidns); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); + + ASSERT_SUCCESS(close(self->host_pidns)); + ASSERT_SUCCESS(close(self->dummy_pidns)); +} + +TEST_F(ns, pidns_mount_string_path) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid")); + ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK)); + + ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns")); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK)); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK)); +} + +TEST_F(ns, pidns_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + 
ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_remount) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, "")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); + + ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/README b/tools/testing/selftests/riscv/README new file mode 100644 index 00000000000000..443da395da6897 --- /dev/null +++ b/tools/testing/selftests/riscv/README @@ -0,0 +1,24 @@ +KSelfTest RISC-V +================ + +- These tests are riscv-specific and so are not built or run but just skipped + entirely when the env-variable ARCH is found to be different from 'riscv'. 
+ +- Provided the above holds, RISC-V KSFT tests can be run within the + KSelfTest framework using standard Linux top-level-makefile targets: + + $ make TARGETS=riscv kselftest-clean + $ make TARGETS=riscv kselftest + + or + + $ make -C tools/testing/selftests TARGETS=riscv \ + INSTALL_PATH= install + + or, alternatively, only specific riscv/ subtargets can be picked: + + $ make -C tools/testing/selftests TARGETS=riscv RISCV_SUBTARGETS="mm vector" \ + INSTALL_PATH= install + + Further details on building and running KSFT can be found in: + Documentation/dev-tools/kselftest.rst diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 663a9cef1952f0..dcac5cbe793370 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -40,9 +40,9 @@ * Define weak versions to play nice with binaries that are statically linked * against a libc that doesn't support registering its own rseq. */ -__weak ptrdiff_t __rseq_offset; -__weak unsigned int __rseq_size; -__weak unsigned int __rseq_flags; +extern __weak ptrdiff_t __rseq_offset; +extern __weak unsigned int __rseq_size; +extern __weak unsigned int __rseq_flags; static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; static const unsigned int *libc_rseq_size_p = &__rseq_size; @@ -209,7 +209,7 @@ void rseq_init(void) * libc not having registered a restartable sequence. Try to find the * symbols if that's the case. */ - if (!*libc_rseq_size_p) { + if (!libc_rseq_size_p || !*libc_rseq_size_p) { libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaaf8..874f17763536b6 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -3547,6 +3556,10 @@ static void signal_handler(int signal) perror("write from signal"); } +static void signal_handler_nop(int signal) +{ +} + TEST(user_notification_signal) { pid_t pid; @@ -4819,6 +4832,132 @@ TEST(user_notification_wait_killable_fatal) EXPECT_EQ(SIGTERM, WTERMSIG(status)); } +/* Ensure signals arriving after the reply do not interrupt the syscall */ +TEST(user_notification_wait_killable_after_reply) +{ + int i, max_iter = 100000; + int listener, status; + int pipe_fds[2]; + pid_t pid; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) + { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + listener = user_notif_syscall( + __NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV); + ASSERT_GE(listener, 0); + + /* + * Used to count invocations. One token is transferred from the child + * to the parent per syscall invocation; the parent tries to take + * one token per successful RECV. If the syscall is restarted after + * RECV the parent will try to get two tokens while the child only + * provided one. 
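+ * A leftover token in the pipe therefore means a syscall was restarted + * after a successful RECV, which is exactly the failure mode probed here.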
+ */ + ASSERT_EQ(pipe(pipe_fds), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct sigaction new_action = { + .sa_handler = signal_handler_nop, + .sa_flags = SA_RESTART, + }; + struct itimerval timer = { + .it_value = { .tv_usec = 1000 }, + .it_interval = { .tv_usec = 1000 }, + }; + char c = 'a'; + + close(pipe_fds[0]); + + /* Set up the sigaction with SA_RESTART */ + if (sigaction(SIGALRM, &new_action, NULL)) { + perror("sigaction"); + exit(1); + } + + /* + * Kill with SIGALRM repeatedly, to try to hit the race when + * handling the syscall. + */ + if (setitimer(ITIMER_REAL, &timer, NULL) < 0) + perror("setitimer"); + + for (i = 0; i < max_iter; ++i) { + int fd; + + /* Send one token per iteration to catch repeats. */ + if (write(pipe_fds[1], &c, sizeof(c)) != 1) { + perror("write"); + exit(1); + } + + fd = syscall(__NR_dup, 0); + if (fd < 0) { + perror("dup"); + exit(1); + } + close(fd); + } + + exit(0); + } + + close(pipe_fds[1]); + + for (i = 0; i < max_iter; ++i) { + struct seccomp_notif req = {}; + struct seccomp_notif_addfd addfd = {}; + struct pollfd pfd = { + .fd = pipe_fds[0], + .events = POLLIN, + }; + char c; + + /* + * Try to receive one token. If it failed, one child syscall + * was restarted after RECV and needed to be handled twice. + */ + ASSERT_EQ(poll(&pfd, 1, 1000), 1) + kill(pid, SIGKILL); + + ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1) + kill(pid, SIGKILL); + + /* + * Get the notification, reply to it as fast as possible to test + * whether the child wrongly skips going into the non-preemptible + * (TASK_KILLABLE) state. + */ + do + ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req); + while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */ + ASSERT_EQ(ret, 0) + kill(pid, SIGKILL); + + addfd.id = req.id; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + addfd.srcfd = 0; + ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0) + kill(pid, SIGKILL); + } + + /* + * Wait for the process to exit, and make sure the process terminated + * with a zero exit code. + */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + struct tsync_vs_thread_leader_args { pthread_t leader; }; @@ -4896,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need a naked probed_uprobe function. __nocf_check is used to skip + * a possible endbr64 instruction, and -Wattributes is ignored because + * the compilation might otherwise fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. 
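+ * When uretprobe is false the probe is attached to probed_uprobe(); when + * true, a return probe is attached to probed_uretprobe() instead.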
+ */ + bool uretprobe; +}; + +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. 
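+ * The first call triggers the optimization, so the second one should run + * through the optimized probe.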
+ */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5034,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5051,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5070,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 30d5c8f0e5c7df..ba322a353aff14 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test vdso_test_abi -vdso_test_clock_getres vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 918a2caa070ebc..e361aca22a74dc 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,6 @@ include ../../../scripts/Makefile.arch TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu TEST_GEN_PROGS += vdso_test_abi -TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif @@ -29,7 +28,6 @@ CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tab $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c -$(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers $(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) diff --git a/tools/testing/selftests/vDSO/vdso_call.h b/tools/testing/selftests/vDSO/vdso_call.h index bb237d771051bd..e7205584cbdca5 100644 --- a/tools/testing/selftests/vDSO/vdso_call.h +++ b/tools/testing/selftests/vDSO/vdso_call.h @@ -44,7 +44,6 @@ register long _r6 asm ("r6"); \ register long _r7 asm ("r7"); \ register long _r8 asm ("r8"); \ - register long _rval asm ("r3"); \ \ LOADARGS_##nr(fn, args); \ \ @@ -54,13 +53,13 @@ " bns+ 1f\n" \ " neg 3, 3\n" \ "1:" \ - : "+r" (_r0), "=r" (_r3), "+r" (_r4), "+r" (_r5), \ + : "+r" (_r0), "+r" (_r3), "+r" (_r4), "+r" (_r5), \ "+r" (_r6), "+r" (_r7), "+r" (_r8) \ - : "r" (_rval) \ + : \ : "r9", 
"r10", "r11", "r12", "cr0", "cr1", "cr5", \ "cr6", "cr7", "xer", "lr", "ctr", "memory" \ ); \ - _rval; \ + _r3; \ }) #else diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index a54424e2336f45..238d609a457a28 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -26,24 +26,31 @@ static const char *version; static const char **name; +/* The same as struct __kernel_timespec */ +struct vdso_timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); typedef time_t (*vdso_time_t)(time_t *t); -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", +static const char * const vdso_clock_name[] = { + [CLOCK_REALTIME] = "CLOCK_REALTIME", + [CLOCK_MONOTONIC] = "CLOCK_MONOTONIC", + [CLOCK_PROCESS_CPUTIME_ID] = "CLOCK_PROCESS_CPUTIME_ID", + [CLOCK_THREAD_CPUTIME_ID] = "CLOCK_THREAD_CPUTIME_ID", + [CLOCK_MONOTONIC_RAW] = "CLOCK_MONOTONIC_RAW", + [CLOCK_REALTIME_COARSE] = "CLOCK_REALTIME_COARSE", + [CLOCK_MONOTONIC_COARSE] = "CLOCK_MONOTONIC_COARSE", + [CLOCK_BOOTTIME] = "CLOCK_BOOTTIME", + [CLOCK_REALTIME_ALARM] = "CLOCK_REALTIME_ALARM", + [CLOCK_BOOTTIME_ALARM] = "CLOCK_BOOTTIME_ALARM", + [10 /* CLOCK_SGI_CYCLE */] = "CLOCK_SGI_CYCLE", + [CLOCK_TAI] = "CLOCK_TAI", }; static void vdso_test_gettimeofday(void) @@ -70,6 +77,33 @@ static void vdso_test_gettimeofday(void) } } +static void vdso_test_clock_gettime64(clockid_t clk_id) +{ + /* Find clock_gettime64. */ + vdso_clock_gettime64_t vdso_clock_gettime64 = + (vdso_clock_gettime64_t)vdso_sym(version, name[5]); + + if (!vdso_clock_gettime64) { + ksft_print_msg("Couldn't find %s\n", name[5]); + ksft_test_result_skip("%s %s\n", name[5], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts; + long ret = VDSO_CALL(vdso_clock_gettime64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The time is %lld.%06lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + ksft_test_result_pass("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_fail("%s %s\n", name[5], + vdso_clock_name[clk_id]); + } +} + static void vdso_test_clock_gettime(clockid_t clk_id) { /* Find clock_gettime. 
*/ @@ -171,23 +205,23 @@ static inline void vdso_test_clock(clockid_t clock_id) ksft_print_msg("clock_id: %s\n", vdso_clock_name[clock_id]); vdso_test_clock_gettime(clock_id); + vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); } -#define VDSO_TEST_PLAN 16 +#define VDSO_TEST_PLAN 29 int main(int argc, char **argv) { unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); ksft_print_header(); - ksft_set_plan(VDSO_TEST_PLAN); - if (!sysinfo_ehdr) { - ksft_print_msg("AT_SYSINFO_EHDR is not present!\n"); - return KSFT_SKIP; - } + if (!sysinfo_ehdr) + ksft_exit_skip("AT_SYSINFO_EHDR is not present!\n"); + + ksft_set_plan(VDSO_TEST_PLAN); version = versions[VDSO_VERSION]; name = (const char **)&names[VDSO_NAMES]; @@ -198,40 +232,17 @@ int main(int argc, char **argv) vdso_test_gettimeofday(); -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif + vdso_test_clock(CLOCK_PROCESS_CPUTIME_ID); + vdso_test_clock(CLOCK_THREAD_CPUTIME_ID); vdso_test_time(); - ksft_print_cnts(); - return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; + ksft_finished(); } diff --git a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c b/tools/testing/selftests/vDSO/vdso_test_clock_getres.c deleted file mode 100644 index b5d5f59f725a70..00000000000000 --- a/tools/testing/selftests/vDSO/vdso_test_clock_getres.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * vdso_clock_getres.c: Sample code to test clock_getres. - * Copyright (c) 2019 Arm Ltd. - * - * Compile with: - * gcc -std=gnu99 vdso_clock_getres.c - * - * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit), - * Power (32-bit and 64-bit), S390x (32-bit and 64-bit). - * Might work on other architectures. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../kselftest.h" - -static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts) -{ - long ret; - - ret = syscall(SYS_clock_getres, _clkid, _ts); - - return ret; -} - -const char *vdso_clock_name[12] = { - "CLOCK_REALTIME", - "CLOCK_MONOTONIC", - "CLOCK_PROCESS_CPUTIME_ID", - "CLOCK_THREAD_CPUTIME_ID", - "CLOCK_MONOTONIC_RAW", - "CLOCK_REALTIME_COARSE", - "CLOCK_MONOTONIC_COARSE", - "CLOCK_BOOTTIME", - "CLOCK_REALTIME_ALARM", - "CLOCK_BOOTTIME_ALARM", - "CLOCK_SGI_CYCLE", - "CLOCK_TAI", -}; - -/* - * This function calls clock_getres in vdso and by system call - * with different values for clock_id. 
- * - * Example of output: - * - * clock_id: CLOCK_REALTIME [PASS] - * clock_id: CLOCK_BOOTTIME [PASS] - * clock_id: CLOCK_TAI [PASS] - * clock_id: CLOCK_REALTIME_COARSE [PASS] - * clock_id: CLOCK_MONOTONIC [PASS] - * clock_id: CLOCK_MONOTONIC_RAW [PASS] - * clock_id: CLOCK_MONOTONIC_COARSE [PASS] - */ -static inline int vdso_test_clock(unsigned int clock_id) -{ - struct timespec x, y; - - printf("clock_id: %s", vdso_clock_name[clock_id]); - clock_getres(clock_id, &x); - syscall_clock_getres(clock_id, &y); - - if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) { - printf(" [FAIL]\n"); - return KSFT_FAIL; - } - - printf(" [PASS]\n"); - return KSFT_PASS; -} - -int main(int argc, char **argv) -{ - int ret = 0; - -#if _POSIX_TIMERS > 0 - -#ifdef CLOCK_REALTIME - ret += vdso_test_clock(CLOCK_REALTIME); -#endif - -#ifdef CLOCK_BOOTTIME - ret += vdso_test_clock(CLOCK_BOOTTIME); -#endif - -#ifdef CLOCK_TAI - ret += vdso_test_clock(CLOCK_TAI); -#endif - -#ifdef CLOCK_REALTIME_COARSE - ret += vdso_test_clock(CLOCK_REALTIME_COARSE); -#endif - -#ifdef CLOCK_MONOTONIC - ret += vdso_test_clock(CLOCK_MONOTONIC); -#endif - -#ifdef CLOCK_MONOTONIC_RAW - ret += vdso_test_clock(CLOCK_MONOTONIC_RAW); -#endif - -#ifdef CLOCK_MONOTONIC_COARSE - ret += vdso_test_clock(CLOCK_MONOTONIC_COARSE); -#endif - -#endif - if (ret > 0) - return KSFT_FAIL; - - return KSFT_PASS; -} diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index aaf0808125d723..13ff1934d47c94 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -49,7 +49,7 @@ actions_destroy(struct actions *self) static struct action * actions_new(struct actions *self) { - if (self->size >= self->len) { + if (self->len >= self->size) { self->size *= 2; self->list = realloc(self->list, self->size * sizeof(struct action)); } @@ -131,7 +131,7 @@ actions_parse(struct actions *self, const char *trigger) { enum action_type type = ACTION_NONE; char *token; - char trigger_c[strlen(trigger)]; + char trigger_c[strlen(trigger) + 1]; /* For ACTION_SIGNAL */ int signal = 0, pid = 0;